diff --git a/CHANGELOG.md b/CHANGELOG.md index 06cef75..83e7085 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,11 @@ breaking changes, Y is new features or larger non-breaking changes, and Z is sma However, it is still pre-1.0 software, and does not claim to be super stable. -## [0.17.0.0] -### Changed -- Updated all the dependency packages (Transformers, OpenAI, Anthropic, Numpy, etc) to latest versions. Might be breaking for any cached values. +## [0.16.3.0] + +### Added +- support for duplicates in a openai batch prompt ## [0.16.2.1] diff --git a/README.md b/README.md index 61b2ed7..00274f4 100644 --- a/README.md +++ b/README.md @@ -114,10 +114,10 @@ print(pred.completion_text) # "2 + 6 equals 8." pred = lm.predict(LmPrompt( [ "What is 2+2?", # user turn - "4", # assistant turn - "What is 5+3?" # user turn - "8", # assistant turn - "What is 4+4?" # user turn + "4", # assistant turn + "What is 5+3?" # user turn + "8", # assistant turn + "What is 4+4?" # user turn # We use few-shot turns to encourage the answer to be our desired format. # If you don't give example turns you might get something like # "4 + 4 equals 8." instead of just "8" as desired. @@ -153,14 +153,14 @@ from lmwrapper.structs import LmPrompt lm = get_open_ai_lm(OpenAiModelNames.gpt_4o_mini) prompt = LmPrompt( - "Describe Paris in a few sentences", + "Describe Paris in one sentence", cache=True, temperature=1, - max_tokens=25, + max_tokens=10, ) first_prediction = lm.predict(prompt) print(first_prediction.completion_text) -# ... eg, "Paris is a city of romance and art, renowned for its iconic landmarks, vibrant culture, and rich history..." +# ... eg, "Paris is a city of romance and art, renowned for its iconic landmarks, vibrant culture, and rich history." # The response to this prompt is now saved to the disk. # You could rerun this script and you would load from cache near-instantly. @@ -294,7 +294,7 @@ from lmwrapper.huggingface_wrapper import get_huggingface_lm from lmwrapper.structs import LmPrompt # Download a small model for demo -lm = get_huggingface_lm("HuggingFaceTB/SmolLM2-135M") +lm = get_huggingface_lm("gpt2") # 124M parameters prediction = lm.predict(LmPrompt( "The capital of Germany is Berlin. The capital of France is", diff --git a/lmwrapper/huggingface_wrapper/predictor.py b/lmwrapper/huggingface_wrapper/predictor.py index abc0be3..7e1403b 100644 --- a/lmwrapper/huggingface_wrapper/predictor.py +++ b/lmwrapper/huggingface_wrapper/predictor.py @@ -349,7 +349,6 @@ def new_call(attention_mask, *args, **kwargs): **encoded_input, generation_config=gen_config, stopping_criteria=stopping_criteria, - return_dict_in_generate=True, return_legacy_cache=True, ) #logging.info("Generation output type:" + str(type(generation_output))) diff --git a/lmwrapper/openai_wrapper/batching.py b/lmwrapper/openai_wrapper/batching.py index 9a06228..e591360 100644 --- a/lmwrapper/openai_wrapper/batching.py +++ b/lmwrapper/openai_wrapper/batching.py @@ -54,7 +54,6 @@ def __init__( self._validate_prompts_input(prompts) self._awaiting_marker = object() self._output = [self._awaiting_marker] * len(prompts) - # TODO store the output as a hash to list[outputs]. Then can read from that for dup prompts self._num_yielded = 0 self._cache = cache self._maintain_order = maintain_order @@ -63,11 +62,11 @@ def __init__( self._index_to_hash = [ prompt_to_text_and_sample_hash(p, _cache_key) for i, p in enumerate(prompts) ] - self._prompt_hashes_to_index = {h: i for i, h in enumerate(self._index_to_hash)} - if len(self._prompt_hashes_to_index) != len(prompts): - raise NotImplementedError( - "Duplicate prompts detected. This is not currently handled", - ) + self._prompt_hashes_to_index = {} + for i, h in enumerate(self._index_to_hash): + if h not in self._prompt_hashes_to_index: + self._prompt_hashes_to_index[h] = [] + self._prompt_hashes_to_index[h].append(i) self._started = False self._lm: OpenAIPredictor = cache.lm self._batch_id_to_pbar = {} @@ -182,15 +181,15 @@ def _send_batch(self, batch: "_BatchToMonitor"): return lines = [] custom_ids = set() + # Deduplicate prompts at batch level to avoid sending identical requests for prompt in batch.prompts: custom_id = prompt_to_text_and_sample_hash( prompt, self._lm.get_model_cache_key(), ) if custom_id in custom_ids: - raise RuntimeError( - "Duplicate custom id target outputs? This should not happen?", - ) + # Skip duplicate prompts - result will be distributed to all positions + continue l = json.dumps(_prompt_to_arg_dict_for_batch(prompt, self._lm, custom_id)) custom_ids.add(custom_id) lines.append(l) @@ -226,8 +225,9 @@ def _send_batch(self, batch: "_BatchToMonitor"): ) place_holders = self._cache.put_batch_placeholders(batch_row, batch.prompts) for place_holder in place_holders: - index = self._prompt_hashes_to_index[place_holder.text_and_sample_hash] - self._output[index] = place_holder + indices = self._prompt_hashes_to_index[place_holder.text_and_sample_hash] + for index in indices: + self._output[index] = place_holder def _poll_completion(self, target: BatchPredictionPlaceholder | object): if not isinstance(target, BatchPredictionPlaceholder): @@ -335,10 +335,14 @@ def _handle_if_batch_expired( ) if phash not in self._prompt_hashes_to_index: continue - index = self._prompt_hashes_to_index[phash] - if self._output[index] is self._awaiting_marker or isinstance( - self._output[index], - BatchPredictionPlaceholder, + indices = self._prompt_hashes_to_index[phash] + # Check if any of the duplicate positions still need results + if any( + self._output[index] is self._awaiting_marker or isinstance( + self._output[index], + BatchPredictionPlaceholder, + ) + for index in indices ): needed_prompts.append(prompt) new_monitor = _BatchToMonitor( @@ -436,8 +440,9 @@ def _remove_in_progress_batch(self, batch: "_BatchToMonitor"): prompt, self._lm.get_model_cache_key(), ) - index = self._prompt_hashes_to_index[phash] - self._output[index] = self._awaiting_marker + indices = self._prompt_hashes_to_index[phash] + for index in indices: + self._output[index] = self._awaiting_marker for prompt in batch.prompts: self._cache.delete(prompt) @@ -510,8 +515,9 @@ def _update_cache_rows_from_output( "Custom id not found in a batch we started", custom_id, ) - out_index = self._prompt_hashes_to_index[custom_id] - prompt = self._prompts[out_index] + out_indices = self._prompt_hashes_to_index[custom_id] + # Use first index to get the prompt (all duplicates have same prompt) + prompt = self._prompts[out_indices[0]] body = response["body"] if self._lm.is_chat_model: body = openai.types.chat.ChatCompletion.parse_obj(body) @@ -520,7 +526,9 @@ def _update_cache_rows_from_output( pred = self._lm.prediction_from_api_response(body, prompt) assert len(pred) == 1 pred = pred[0] - self._output[out_index] = pred + # Set the same prediction for all duplicate prompt positions + for out_index in out_indices: + self._output[out_index] = pred self._cache.add_or_set(pred) def _update_cache_rows_from_errors( @@ -553,15 +561,18 @@ def _update_cache_rows_from_errors( "Custom id not found in a batch we started", custom_id, ) - out_index = self._prompt_hashes_to_index[custom_id] - prompt = self._prompts[out_index] + out_indices = self._prompt_hashes_to_index[custom_id] + # Use first index to get the prompt (all duplicates have same prompt) + prompt = self._prompts[out_indices[0]] pred = LmPrediction( completion_text=None, prompt=prompt, metad=None, error_message=json.dumps(body["error"]), ) - self._output[out_index] = pred + # Set the same error prediction for all duplicate prompt positions + for out_index in out_indices: + self._output[out_index] = pred self._cache.add_or_set(pred) def _pbar_for_targer(self, api_id: str, total: int): diff --git a/pyproject.toml b/pyproject.toml index 6cd096b..db81f3e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ include = ["lmwrapper*"] [project] name = "lmwrapper" -version = "0.17.0.0" +version = "0.16.3.0" authors = [ { name = "David Gros" }, { name = "Claudio Spiess" }, @@ -36,14 +36,14 @@ requires-python = ">=3.10" classifiers = ["Programming Language :: Python :: 3"] keywords = ["large language models", "openai"] dependencies = [ - "openai~=1.79.0", + "openai~=1.55.3", #"diskcache~=5.6.3", #"joblib~=1.3.2", - "tiktoken~=0.9.0", + "tiktoken~=0.7.0", "ratemate~=0.1", "humanize~=4.8.0", "xxhash~=3.4", - "numpy~=2.2.6", + "numpy~=1.24", "packaging>=22.0", ] @@ -55,11 +55,11 @@ dev = [ ] quant = ["bitsandbytes~=0.41.1"] hf = [ - "torch~=2.7.0", + "torch~=2.3.0", #"transformers~=4.42.4" - "transformers~=4.51.3" + "transformers~=4.46.3" ] -anthropic = ["anthropic~=0.51.0"] +anthropic = ["anthropic~=0.40.0"] docs = [ "mkdocs>=1.4.0", "mkdocstrings[python]>=0.24.0", diff --git a/test/test_caching.py b/test/test_caching.py index 3bfc471..2fcdf01 100644 --- a/test/test_caching.py +++ b/test/test_caching.py @@ -9,7 +9,6 @@ from lmwrapper.abstract_predictor import get_mock_predictor from lmwrapper.caching import set_cache_dir -from lmwrapper.compatibility import has_transformers_compatibility_issues from lmwrapper.huggingface_wrapper.wrapper import get_huggingface_lm from lmwrapper.structs import LmPrompt @@ -22,8 +21,7 @@ def test_set_cache_dir(): tmpdirname = Path(tmpdirname) assert len(list(tmpdirname.rglob("*"))) == 0 set_cache_dir(tmpdirname) - model_name = "HuggingFaceTB/SmolLM2-135M" - lm = get_huggingface_lm(model_name) + lm = get_huggingface_lm("gpt2") prompt = LmPrompt( "Write a story about fish:", max_tokens=10, @@ -34,13 +32,13 @@ def test_set_cache_dir(): prompt = dataclasses.replace(prompt, cache=True) r2 = lm.predict(prompt) assert r1.completion_text != r2.completion_text - lm2 = get_huggingface_lm(model_name) + lm2 = get_huggingface_lm("gpt2") r3 = lm2.predict(prompt) assert r2.completion_text == r3.completion_text with tempfile.TemporaryDirectory() as tmpdirname: tmpdirname = Path(tmpdirname) set_cache_dir(tmpdirname) - lm2 = get_huggingface_lm(model_name) + lm2 = get_huggingface_lm("gpt2") r4 = lm2.predict(prompt) assert r3.completion_text != r4.completion_text diff --git a/test/test_huggingface.py b/test/test_huggingface.py index 3f7bce1..ad203c7 100644 --- a/test/test_huggingface.py +++ b/test/test_huggingface.py @@ -32,8 +32,7 @@ class Models(StrEnum): CodeLLama_7B_Instruct = "codellama/CodeLlama-7b-Instruct-hf" DistilGPT2 = "distilgpt2" GPT2 = "gpt2" - SMOLLM2_135M = "HuggingFaceTB/SmolLM2-135M" - SMOLLM2_135M_INSTRUCT = "HuggingFaceTB/SmolLM2-135M-Instruct" + SMOLLM2_135M = "HuggingFaceTB/SmolLM2-135M-Instruct" Mistral_7B = "mistralai/Mistral-7B-v0.1" qwen25_500M_instruct = "Qwen/Qwen2.5-0.5B-Instruct" QwenCoder25_500M = "Qwen/Qwen2.5-Coder-0.5B" @@ -49,7 +48,7 @@ class Models(StrEnum): SMALL_GPU = True SEQ2SEQ_MODELS = {Models.CodeT5plus_220M} -CAUSAL_MODELS = {Models.SMOLLM2_135M,} +CAUSAL_MODELS = {Models.GPT2,} BIG_SEQ2SEQ_MODELS = {Models.CodeT5plus_6B, Models.InstructCodeT5plus_16B} BIG_CAUSAL_MODELS = {Models.CodeGen2_1B, Models.CodeGen2_3_7B, Models.Mistral_7B} BIG_MODELS = BIG_SEQ2SEQ_MODELS | BIG_CAUSAL_MODELS @@ -1096,7 +1095,7 @@ def test_chat_qwen(): def test_chat_smol(): - model = Models.SMOLLM2_135M_INSTRUCT + model = Models.SMOLLM2_135M lm = get_huggingface_lm( model, ) @@ -1123,7 +1122,7 @@ def test_chat_smol(): # so mac works 4.42.2 but not continue_final_message... def test_smol_continue_chat(): - model = Models.SMOLLM2_135M_INSTRUCT + model = Models.SMOLLM2_135M lm = get_huggingface_lm( model, ) diff --git a/test/test_huggingface_internals.py b/test/test_huggingface_internals.py index f6c75e4..6b2963b 100644 --- a/test/test_huggingface_internals.py +++ b/test/test_huggingface_internals.py @@ -1,7 +1,5 @@ import dataclasses import os - -from lmwrapper.compatibility import has_transformers_compatibility_issues from test.test_huggingface import BIG_MODELS, Models import numpy as np @@ -16,20 +14,15 @@ HAS_CUDA = torch.cuda.is_available() -models = [ - (Models.CodeGen_350M, 20, 64 * 16), - (Models.CodeGen2_1B, 16, 2048), - # ^ Important to run since it doesn't use the same attentions value -] -if not has_transformers_compatibility_issues(): - models = [ - (Models.GPT2, 12, 768), - ] + models - @pytest.mark.parametrize( "model_name_layers_hidden", - models, + [ + (Models.GPT2, 12, 768), + (Models.CodeGen_350M, 20, 64 * 16), + (Models.CodeGen2_1B, 16, 2048), + # ^ Important to run since it doesn't use the same attentions value + ], ) def test_get_internals_hidden_states(pytestconfig, model_name_layers_hidden): model_name, num_layers, hidden_size = model_name_layers_hidden diff --git a/test/test_models_common.py b/test/test_models_common.py index ec55dc3..ea6f759 100644 --- a/test/test_models_common.py +++ b/test/test_models_common.py @@ -74,8 +74,7 @@ def wrapper(*args, **kwargs): ECHOABLE_MODELS = [ # get_open_ai_lm(OpenAiModelNames.gpt_3_5_turbo_instruct), # Won't work with now that echo disabled - #get_huggingface_lm("gpt2"), - MODEL_NAMES["small_hug"], + get_huggingface_lm("gpt2"), ] random_prompt = ("Give a random base-64 guid (answer with only the guid). " @@ -333,7 +332,6 @@ def test_unconditional_gen(lm): "The capital of Germany is the city Berlin. " "The capital of Spain is the city Madrid. " "The capital of UK is the city London. " - "The capital of Japan is the city Tokyo. " "The capital of France" ) diff --git a/test/test_openai_batching.py b/test/test_openai_batching.py index ab543c8..8e7cc28 100644 --- a/test/test_openai_batching.py +++ b/test/test_openai_batching.py @@ -134,9 +134,9 @@ def mock_batches_create(**kwargs): assert calls == 3 -@pytest.mark.skip("duplicate prompts not working") def test_batch_dup_prompts(): - """A test with duplicate values. Should only submit one of them""" + """Test that duplicate prompts are deduplicated at API level while maintaining order for results. + 4 prompts with 3 unique values should only send 3 requests to OpenAI API.""" clear_cache_dir() cache = SqlBackedCache(lm=get_open_ai_lm()) orig_api = cache._lm._api @@ -155,8 +155,17 @@ def mock_files_create(**kwargs): assert kwargs["purpose"] == "batch" assert kwargs["file"] is not None file_text = kwargs["file"].getvalue().decode() - file_lines = file_text.split("\n") - assert len(file_lines) == 1, "unexpected number of lines" + file_lines = [line for line in file_text.split("\n") if line.strip()] + assert len(file_lines) == 3, "should have 3 unique prompts: hello, Goodbye, Yo" + + # Verify that the JSONL contains the expected unique prompts + import json + custom_ids = set() + for line in file_lines: + data = json.loads(line) + custom_ids.add(data["custom_id"]) + # Should have 3 unique custom_ids corresponding to our 3 unique prompts + assert len(custom_ids) == 3, f"Expected 3 unique custom_ids, got {len(custom_ids)}" return openai.types.FileObject.model_validate(sample_file_resp) def mock_batches_create(**kwargs): @@ -169,7 +178,12 @@ def mock_batches_create(**kwargs): mock_api.batches.create = mock_batches_create batch_manager = OpenAiBatchManager( - [LmPrompt("hello", cache=True) for i in range(2)], + [ + LmPrompt("hello", cache=True), + LmPrompt("Goodbye", cache=True), + LmPrompt("hello", cache=True), + LmPrompt("Yo", cache=True), + ], cache=cache, ) batch_manager.start_batch() diff --git a/test/test_params.py b/test/test_params.py index 7215aed..c476953 100644 --- a/test/test_params.py +++ b/test/test_params.py @@ -1,3 +1,2 @@ #DEFAULT_SMALL = "HuggingFaceTB/SmolLM-135M-Instruct" -#DEFAULT_SMALL = "gpt2" -DEFAULT_SMALL = "HuggingFaceTB/SmolLM2-135M" +DEFAULT_SMALL = "gpt2" diff --git a/test/test_xbatching_actual_run.py b/test/test_xbatching_actual_run.py index d4c06ac..39c4240 100644 --- a/test/test_xbatching_actual_run.py +++ b/test/test_xbatching_actual_run.py @@ -9,10 +9,10 @@ from lmwrapper.structs import LmPrompt IS_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" - +SKIP_ALL = True @pytest.mark.skipif( - IS_GITHUB_ACTIONS or True, # taking too long right now + IS_GITHUB_ACTIONS or SKIP_ALL, # taking too long right now reason="Batches could take a long time so skip in CI", ) def test_split_up_prompt_with_arithmetic(): @@ -54,12 +54,12 @@ def prompt_text_for_num(num): @pytest.mark.skipif( - IS_GITHUB_ACTIONS or True, # taking too long right now + IS_GITHUB_ACTIONS or SKIP_ALL, # taking too long right now reason="Batches could take a long time so skip in CI", ) def test_failed_prompt(): clear_cache_dir() - model_name = OpenAiModelNames.gpt_4_1_nano + model_name = "gpt-4.1-nano-2025-04-14" lm = get_open_ai_lm(model_name) cache = SqlBackedCache(lm=lm) batching_manager = OpenAiBatchManager( @@ -67,13 +67,13 @@ def test_failed_prompt(): LmPrompt( "a", cache=True, - max_tokens=30_000, # output too big + max_tokens=60_000, # output too big temperature=0, ), LmPrompt( "a", cache=True, - max_tokens=30_000, # output too big + max_tokens=3, temperature=1000, # Bad temp ), LmPrompt( # A good prompt @@ -94,10 +94,6 @@ def test_failed_prompt(): assert results[2].completion_text.strip() == "e" -def test_why_fail(): - assert True - - if __name__ == "__main__": if not IS_GITHUB_ACTIONS: # simple()