DaiseyCode · DNGros · May 23, 2025 · May 22, 2025 · May 22, 2025 · May 23, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,10 +7,11 @@ breaking changes, Y is new features or larger non-breaking changes, and Z is sma
 However, it is still pre-1.0 software, and does not claim to
 be super stable.
 
-## [0.17.0.0]
 
-### Changed
-- Updated all the dependency packages (Transformers, OpenAI, Anthropic, Numpy, etc) to latest versions. Might be breaking for any cached values.
+## [0.16.3.0]
+
+### Added
+- support for duplicates in a openai batch prompt
 
 ## [0.16.2.1]
 

diff --git a/README.md b/README.md
@@ -114,10 +114,10 @@ print(pred.completion_text)  # "2 + 6 equals 8."
 pred = lm.predict(LmPrompt(
     [
         "What is 2+2?",  # user turn
-        "4",             # assistant turn
-        "What is 5+3?"   # user turn
-        "8",             # assistant turn
-        "What is 4+4?"   # user turn
+        "4",  # assistant turn
+        "What is 5+3?"  # user turn
+        "8",  # assistant turn
+        "What is 4+4?"  # user turn
         # We use few-shot turns to encourage the answer to be our desired format.
         #   If you don't give example turns you might get something like
         #   "4 + 4 equals 8." instead of just "8" as desired.
@@ -153,14 +153,14 @@ from lmwrapper.structs import LmPrompt
 lm = get_open_ai_lm(OpenAiModelNames.gpt_4o_mini)
 
 prompt = LmPrompt(
-  "Describe Paris in a few sentences", 
+  "Describe Paris in one sentence", 
   cache=True,
   temperature=1,
-  max_tokens=25,
+  max_tokens=10,
 )
 first_prediction = lm.predict(prompt)
 print(first_prediction.completion_text) 
-# ... eg, "Paris is a city of romance and art, renowned for its iconic landmarks, vibrant culture, and rich history..."
+# ... eg, "Paris is a city of romance and art, renowned for its iconic landmarks, vibrant culture, and rich history."
 
 # The response to this prompt is now saved to the disk.
 # You could rerun this script and you would load from cache near-instantly.
@@ -294,7 +294,7 @@ from lmwrapper.huggingface_wrapper import get_huggingface_lm
 from lmwrapper.structs import LmPrompt
 
 # Download a small model for demo
-lm = get_huggingface_lm("HuggingFaceTB/SmolLM2-135M")
+lm = get_huggingface_lm("gpt2") # 124M parameters
 
 prediction = lm.predict(LmPrompt(
     "The capital of Germany is Berlin. The capital of France is",

diff --git a/lmwrapper/huggingface_wrapper/predictor.py b/lmwrapper/huggingface_wrapper/predictor.py
@@ -349,7 +349,6 @@ def new_call(attention_mask, *args, **kwargs):
                 **encoded_input,
                 generation_config=gen_config,
                 stopping_criteria=stopping_criteria,
-                return_dict_in_generate=True,
                 return_legacy_cache=True,
             )
         #logging.info("Generation output type:" + str(type(generation_output)))

diff --git a/lmwrapper/openai_wrapper/batching.py b/lmwrapper/openai_wrapper/batching.py
@@ -54,7 +54,6 @@ def __init__(
         self._validate_prompts_input(prompts)
         self._awaiting_marker = object()
         self._output = [self._awaiting_marker] * len(prompts)
-        # TODO store the output as a hash to list[outputs]. Then can read from that for dup prompts
         self._num_yielded = 0
         self._cache = cache
         self._maintain_order = maintain_order
@@ -63,11 +62,11 @@ def __init__(
         self._index_to_hash = [
             prompt_to_text_and_sample_hash(p, _cache_key) for i, p in enumerate(prompts)
         ]
-        self._prompt_hashes_to_index = {h: i for i, h in enumerate(self._index_to_hash)}
-        if len(self._prompt_hashes_to_index) != len(prompts):
-            raise NotImplementedError(
-                "Duplicate prompts detected. This is not currently handled",
-            )
+        self._prompt_hashes_to_index = {}
+        for i, h in enumerate(self._index_to_hash):
+            if h not in self._prompt_hashes_to_index:
+                self._prompt_hashes_to_index[h] = []
+            self._prompt_hashes_to_index[h].append(i)
         self._started = False
         self._lm: OpenAIPredictor = cache.lm
         self._batch_id_to_pbar = {}
@@ -182,15 +181,15 @@ def _send_batch(self, batch: "_BatchToMonitor"):
             return
         lines = []
         custom_ids = set()
+        # Deduplicate prompts at batch level to avoid sending identical requests
         for prompt in batch.prompts:
             custom_id = prompt_to_text_and_sample_hash(
                 prompt,
                 self._lm.get_model_cache_key(),
             )
             if custom_id in custom_ids:
-                raise RuntimeError(
-                    "Duplicate custom id target outputs? This should not happen?",
-                )
+                # Skip duplicate prompts - result will be distributed to all positions
+                continue
             l = json.dumps(_prompt_to_arg_dict_for_batch(prompt, self._lm, custom_id))
             custom_ids.add(custom_id)
             lines.append(l)
@@ -226,8 +225,9 @@ def _send_batch(self, batch: "_BatchToMonitor"):
         )
         place_holders = self._cache.put_batch_placeholders(batch_row, batch.prompts)
         for place_holder in place_holders:
-            index = self._prompt_hashes_to_index[place_holder.text_and_sample_hash]
-            self._output[index] = place_holder
+            indices = self._prompt_hashes_to_index[place_holder.text_and_sample_hash]
+            for index in indices:
+                self._output[index] = place_holder
 
     def _poll_completion(self, target: BatchPredictionPlaceholder | object):
         if not isinstance(target, BatchPredictionPlaceholder):
@@ -335,10 +335,14 @@ def _handle_if_batch_expired(
             )
             if phash not in self._prompt_hashes_to_index:
                 continue
-            index = self._prompt_hashes_to_index[phash]
-            if self._output[index] is self._awaiting_marker or isinstance(
-                self._output[index],
-                BatchPredictionPlaceholder,
+            indices = self._prompt_hashes_to_index[phash]
+            # Check if any of the duplicate positions still need results
+            if any(
+                self._output[index] is self._awaiting_marker or isinstance(
+                    self._output[index],
+                    BatchPredictionPlaceholder,
+                )
+                for index in indices
             ):
                 needed_prompts.append(prompt)
         new_monitor = _BatchToMonitor(
@@ -436,8 +440,9 @@ def _remove_in_progress_batch(self, batch: "_BatchToMonitor"):
                 prompt,
                 self._lm.get_model_cache_key(),
             )
-            index = self._prompt_hashes_to_index[phash]
-            self._output[index] = self._awaiting_marker
+            indices = self._prompt_hashes_to_index[phash]
+            for index in indices:
+                self._output[index] = self._awaiting_marker
         for prompt in batch.prompts:
             self._cache.delete(prompt)
 
@@ -510,8 +515,9 @@ def _update_cache_rows_from_output(
                     "Custom id not found in a batch we started",
                     custom_id,
                 )
-            out_index = self._prompt_hashes_to_index[custom_id]
-            prompt = self._prompts[out_index]
+            out_indices = self._prompt_hashes_to_index[custom_id]
+            # Use first index to get the prompt (all duplicates have same prompt)
+            prompt = self._prompts[out_indices[0]]
             body = response["body"]
             if self._lm.is_chat_model:
                 body = openai.types.chat.ChatCompletion.parse_obj(body)
@@ -520,7 +526,9 @@ def _update_cache_rows_from_output(
             pred = self._lm.prediction_from_api_response(body, prompt)
             assert len(pred) == 1
             pred = pred[0]
-            self._output[out_index] = pred
+            # Set the same prediction for all duplicate prompt positions
+            for out_index in out_indices:
+                self._output[out_index] = pred
             self._cache.add_or_set(pred)
 
     def _update_cache_rows_from_errors(
@@ -553,15 +561,18 @@ def _update_cache_rows_from_errors(
                     "Custom id not found in a batch we started",
                     custom_id,
                 )
-            out_index = self._prompt_hashes_to_index[custom_id]
-            prompt = self._prompts[out_index]
+            out_indices = self._prompt_hashes_to_index[custom_id]
+            # Use first index to get the prompt (all duplicates have same prompt)
+            prompt = self._prompts[out_indices[0]]
             pred = LmPrediction(
                 completion_text=None,
                 prompt=prompt,
                 metad=None,
                 error_message=json.dumps(body["error"]),
             )
-            self._output[out_index] = pred
+            # Set the same error prediction for all duplicate prompt positions
+            for out_index in out_indices:
+                self._output[out_index] = pred
             self._cache.add_or_set(pred)
 
     def _pbar_for_targer(self, api_id: str, total: int):

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ include = ["lmwrapper*"]
 
 [project]
 name = "lmwrapper"
-version = "0.17.0.0"
+version = "0.16.3.0"
 authors = [
     { name = "David Gros" },
     { name = "Claudio Spiess" },
@@ -36,14 +36,14 @@ requires-python = ">=3.10"
 classifiers = ["Programming Language :: Python :: 3"]
 keywords = ["large language models", "openai"]
 dependencies = [
-    "openai~=1.79.0",
+    "openai~=1.55.3",
     #"diskcache~=5.6.3",
     #"joblib~=1.3.2",
-    "tiktoken~=0.9.0",
+    "tiktoken~=0.7.0",
     "ratemate~=0.1",
     "humanize~=4.8.0",
     "xxhash~=3.4",
-    "numpy~=2.2.6",
+    "numpy~=1.24",
     "packaging>=22.0",
 ]
 
@@ -55,11 +55,11 @@ dev = [
 ]
 quant = ["bitsandbytes~=0.41.1"]
 hf = [
-    "torch~=2.7.0",
+    "torch~=2.3.0",
     #"transformers~=4.42.4"
-    "transformers~=4.51.3"
+    "transformers~=4.46.3"
 ]
-anthropic = ["anthropic~=0.51.0"]
+anthropic = ["anthropic~=0.40.0"]
 docs = [
     "mkdocs>=1.4.0",
     "mkdocstrings[python]>=0.24.0",

diff --git a/test/test_caching.py b/test/test_caching.py
@@ -9,7 +9,6 @@
 
 from lmwrapper.abstract_predictor import get_mock_predictor
 from lmwrapper.caching import set_cache_dir
-from lmwrapper.compatibility import has_transformers_compatibility_issues
 from lmwrapper.huggingface_wrapper.wrapper import get_huggingface_lm
 from lmwrapper.structs import LmPrompt
 
@@ -22,8 +21,7 @@ def test_set_cache_dir():
         tmpdirname = Path(tmpdirname)
         assert len(list(tmpdirname.rglob("*"))) == 0
         set_cache_dir(tmpdirname)
-        model_name = "HuggingFaceTB/SmolLM2-135M"
-        lm = get_huggingface_lm(model_name)
+        lm = get_huggingface_lm("gpt2")
         prompt = LmPrompt(
             "Write a story about fish:",
             max_tokens=10,
@@ -34,13 +32,13 @@ def test_set_cache_dir():
         prompt = dataclasses.replace(prompt, cache=True)
         r2 = lm.predict(prompt)
         assert r1.completion_text != r2.completion_text
-        lm2 = get_huggingface_lm(model_name)
+        lm2 = get_huggingface_lm("gpt2")
         r3 = lm2.predict(prompt)
         assert r2.completion_text == r3.completion_text
     with tempfile.TemporaryDirectory() as tmpdirname:
         tmpdirname = Path(tmpdirname)
         set_cache_dir(tmpdirname)
-        lm2 = get_huggingface_lm(model_name)
+        lm2 = get_huggingface_lm("gpt2")
         r4 = lm2.predict(prompt)
         assert r3.completion_text != r4.completion_text
 

diff --git a/test/test_huggingface.py b/test/test_huggingface.py
@@ -32,8 +32,7 @@ class Models(StrEnum):
     CodeLLama_7B_Instruct = "codellama/CodeLlama-7b-Instruct-hf"
     DistilGPT2 = "distilgpt2"
     GPT2 = "gpt2"
-    SMOLLM2_135M = "HuggingFaceTB/SmolLM2-135M"
-    SMOLLM2_135M_INSTRUCT = "HuggingFaceTB/SmolLM2-135M-Instruct"
+    SMOLLM2_135M = "HuggingFaceTB/SmolLM2-135M-Instruct"
     Mistral_7B = "mistralai/Mistral-7B-v0.1"
     qwen25_500M_instruct = "Qwen/Qwen2.5-0.5B-Instruct"
     QwenCoder25_500M = "Qwen/Qwen2.5-Coder-0.5B"
@@ -49,7 +48,7 @@ class Models(StrEnum):
     SMALL_GPU = True
 
 SEQ2SEQ_MODELS = {Models.CodeT5plus_220M}
-CAUSAL_MODELS = {Models.SMOLLM2_135M,}
+CAUSAL_MODELS = {Models.GPT2,}
 BIG_SEQ2SEQ_MODELS = {Models.CodeT5plus_6B, Models.InstructCodeT5plus_16B}
 BIG_CAUSAL_MODELS = {Models.CodeGen2_1B, Models.CodeGen2_3_7B, Models.Mistral_7B}
 BIG_MODELS = BIG_SEQ2SEQ_MODELS | BIG_CAUSAL_MODELS
@@ -1096,7 +1095,7 @@ def test_chat_qwen():
 
 
 def test_chat_smol():
-    model = Models.SMOLLM2_135M_INSTRUCT
+    model = Models.SMOLLM2_135M
     lm = get_huggingface_lm(
         model,
     )
@@ -1123,7 +1122,7 @@ def test_chat_smol():
 
 # so mac works 4.42.2 but not continue_final_message...
 def test_smol_continue_chat():
-    model = Models.SMOLLM2_135M_INSTRUCT
+    model = Models.SMOLLM2_135M
     lm = get_huggingface_lm(
         model,
     )

diff --git a/test/test_huggingface_internals.py b/test/test_huggingface_internals.py
@@ -1,7 +1,5 @@
 import dataclasses
 import os
-
-from lmwrapper.compatibility import has_transformers_compatibility_issues
 from test.test_huggingface import BIG_MODELS, Models
 
 import numpy as np
@@ -16,20 +14,15 @@
 
 HAS_CUDA = torch.cuda.is_available()
 
-models = [
-    (Models.CodeGen_350M, 20, 64 * 16),
-    (Models.CodeGen2_1B, 16, 2048),
-    # ^ Important to run since it doesn't use the same attentions value
-]
-if not has_transformers_compatibility_issues():
-    models = [
-        (Models.GPT2, 12, 768),
-    ] + models
-
 
 @pytest.mark.parametrize(
     "model_name_layers_hidden",
-    models,
+    [
+        (Models.GPT2, 12, 768),
+        (Models.CodeGen_350M, 20, 64 * 16),
+        (Models.CodeGen2_1B, 16, 2048),
+        # ^ Important to run since it doesn't use the same attentions value
+    ],
 )
 def test_get_internals_hidden_states(pytestconfig, model_name_layers_hidden):
     model_name, num_layers, hidden_size = model_name_layers_hidden

diff --git a/test/test_models_common.py b/test/test_models_common.py
@@ -74,8 +74,7 @@ def wrapper(*args, **kwargs):
 ECHOABLE_MODELS = [
     # get_open_ai_lm(OpenAiModelNames.gpt_3_5_turbo_instruct),
     # Won't work with now that echo disabled
-    #get_huggingface_lm("gpt2"),
-    MODEL_NAMES["small_hug"],
+    get_huggingface_lm("gpt2"),
 ]
 
 random_prompt = ("Give a random base-64 guid (answer with only the guid). "
@@ -333,7 +332,6 @@ def test_unconditional_gen(lm):
     "The capital of Germany is the city Berlin. "
     "The capital of Spain is the city Madrid. "
     "The capital of UK is the city London. "
-    "The capital of Japan is the city Tokyo. "
     "The capital of France"
 )