Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ breaking changes, Y is new features or larger non-breaking changes, and Z is sma
However, it is still pre-1.0 software, and does not claim to
be super stable.

## [0.17.0.0]

### Changed
- Updated all the dependency packages (Transformers, OpenAI, Anthropic, NumPy, etc.) to their latest versions. This might be a breaking change for any cached values.
## [0.16.3.0]

### Added
- Support for duplicate prompts in an OpenAI batch.

## [0.16.2.1]

Expand Down
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,10 @@ print(pred.completion_text) # "2 + 6 equals 8."
pred = lm.predict(LmPrompt(
[
"What is 2+2?", # user turn
"4", # assistant turn
"What is 5+3?", # user turn
"8", # assistant turn
"What is 4+4?" # user turn
"4", # assistant turn
"What is 5+3?", # user turn
"8", # assistant turn
"What is 4+4?" # user turn
# We use few-shot turns to encourage the answer to be our desired format.
# If you don't give example turns you might get something like
# "4 + 4 equals 8." instead of just "8" as desired.
Expand Down Expand Up @@ -153,14 +153,14 @@ from lmwrapper.structs import LmPrompt
lm = get_open_ai_lm(OpenAiModelNames.gpt_4o_mini)

prompt = LmPrompt(
"Describe Paris in a few sentences",
"Describe Paris in one sentence",
cache=True,
temperature=1,
max_tokens=25,
max_tokens=10,
)
first_prediction = lm.predict(prompt)
print(first_prediction.completion_text)
# ... eg, "Paris is a city of romance and art, renowned for its iconic landmarks, vibrant culture, and rich history..."
# ... eg, "Paris is a city of romance and art, renowned for its iconic landmarks, vibrant culture, and rich history."

# The response to this prompt is now saved to the disk.
# You could rerun this script and you would load from cache near-instantly.
Expand Down Expand Up @@ -294,7 +294,7 @@ from lmwrapper.huggingface_wrapper import get_huggingface_lm
from lmwrapper.structs import LmPrompt

# Download a small model for demo
lm = get_huggingface_lm("HuggingFaceTB/SmolLM2-135M")
lm = get_huggingface_lm("gpt2") # 124M parameters

prediction = lm.predict(LmPrompt(
"The capital of Germany is Berlin. The capital of France is",
Expand Down
1 change: 0 additions & 1 deletion lmwrapper/huggingface_wrapper/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,6 @@ def new_call(attention_mask, *args, **kwargs):
**encoded_input,
generation_config=gen_config,
stopping_criteria=stopping_criteria,
return_dict_in_generate=True,
return_legacy_cache=True,
)
#logging.info("Generation output type:" + str(type(generation_output)))
Expand Down
57 changes: 34 additions & 23 deletions lmwrapper/openai_wrapper/batching.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def __init__(
self._validate_prompts_input(prompts)
self._awaiting_marker = object()
self._output = [self._awaiting_marker] * len(prompts)
# TODO store the output as a hash to list[outputs]. Then can read from that for dup prompts
self._num_yielded = 0
self._cache = cache
self._maintain_order = maintain_order
Expand All @@ -63,11 +62,11 @@ def __init__(
self._index_to_hash = [
prompt_to_text_and_sample_hash(p, _cache_key) for i, p in enumerate(prompts)
]
self._prompt_hashes_to_index = {h: i for i, h in enumerate(self._index_to_hash)}
if len(self._prompt_hashes_to_index) != len(prompts):
raise NotImplementedError(
"Duplicate prompts detected. This is not currently handled",
)
self._prompt_hashes_to_index = {}
for i, h in enumerate(self._index_to_hash):
if h not in self._prompt_hashes_to_index:
self._prompt_hashes_to_index[h] = []
self._prompt_hashes_to_index[h].append(i)
self._started = False
self._lm: OpenAIPredictor = cache.lm
self._batch_id_to_pbar = {}
Expand Down Expand Up @@ -182,15 +181,15 @@ def _send_batch(self, batch: "_BatchToMonitor"):
return
lines = []
custom_ids = set()
# Deduplicate prompts at batch level to avoid sending identical requests
for prompt in batch.prompts:
custom_id = prompt_to_text_and_sample_hash(
prompt,
self._lm.get_model_cache_key(),
)
if custom_id in custom_ids:
raise RuntimeError(
"Duplicate custom id target outputs? This should not happen?",
)
# Skip duplicate prompts - result will be distributed to all positions
continue
l = json.dumps(_prompt_to_arg_dict_for_batch(prompt, self._lm, custom_id))
custom_ids.add(custom_id)
lines.append(l)
Expand Down Expand Up @@ -226,8 +225,9 @@ def _send_batch(self, batch: "_BatchToMonitor"):
)
place_holders = self._cache.put_batch_placeholders(batch_row, batch.prompts)
for place_holder in place_holders:
index = self._prompt_hashes_to_index[place_holder.text_and_sample_hash]
self._output[index] = place_holder
indices = self._prompt_hashes_to_index[place_holder.text_and_sample_hash]
for index in indices:
self._output[index] = place_holder

def _poll_completion(self, target: BatchPredictionPlaceholder | object):
if not isinstance(target, BatchPredictionPlaceholder):
Expand Down Expand Up @@ -335,10 +335,14 @@ def _handle_if_batch_expired(
)
if phash not in self._prompt_hashes_to_index:
continue
index = self._prompt_hashes_to_index[phash]
if self._output[index] is self._awaiting_marker or isinstance(
self._output[index],
BatchPredictionPlaceholder,
indices = self._prompt_hashes_to_index[phash]
# Check if any of the duplicate positions still need results
if any(
self._output[index] is self._awaiting_marker or isinstance(
self._output[index],
BatchPredictionPlaceholder,
)
for index in indices
):
needed_prompts.append(prompt)
new_monitor = _BatchToMonitor(
Expand Down Expand Up @@ -436,8 +440,9 @@ def _remove_in_progress_batch(self, batch: "_BatchToMonitor"):
prompt,
self._lm.get_model_cache_key(),
)
index = self._prompt_hashes_to_index[phash]
self._output[index] = self._awaiting_marker
indices = self._prompt_hashes_to_index[phash]
for index in indices:
self._output[index] = self._awaiting_marker
for prompt in batch.prompts:
self._cache.delete(prompt)

Expand Down Expand Up @@ -510,8 +515,9 @@ def _update_cache_rows_from_output(
"Custom id not found in a batch we started",
custom_id,
)
out_index = self._prompt_hashes_to_index[custom_id]
prompt = self._prompts[out_index]
out_indices = self._prompt_hashes_to_index[custom_id]
# Use first index to get the prompt (all duplicates have same prompt)
prompt = self._prompts[out_indices[0]]
body = response["body"]
if self._lm.is_chat_model:
body = openai.types.chat.ChatCompletion.parse_obj(body)
Expand All @@ -520,7 +526,9 @@ def _update_cache_rows_from_output(
pred = self._lm.prediction_from_api_response(body, prompt)
assert len(pred) == 1
pred = pred[0]
self._output[out_index] = pred
# Set the same prediction for all duplicate prompt positions
for out_index in out_indices:
self._output[out_index] = pred
self._cache.add_or_set(pred)

def _update_cache_rows_from_errors(
Expand Down Expand Up @@ -553,15 +561,18 @@ def _update_cache_rows_from_errors(
"Custom id not found in a batch we started",
custom_id,
)
out_index = self._prompt_hashes_to_index[custom_id]
prompt = self._prompts[out_index]
out_indices = self._prompt_hashes_to_index[custom_id]
# Use first index to get the prompt (all duplicates have same prompt)
prompt = self._prompts[out_indices[0]]
pred = LmPrediction(
completion_text=None,
prompt=prompt,
metad=None,
error_message=json.dumps(body["error"]),
)
self._output[out_index] = pred
# Set the same error prediction for all duplicate prompt positions
for out_index in out_indices:
self._output[out_index] = pred
self._cache.add_or_set(pred)

def _pbar_for_targer(self, api_id: str, total: int):
Expand Down
14 changes: 7 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ include = ["lmwrapper*"]

[project]
name = "lmwrapper"
version = "0.17.0.0"
version = "0.16.3.0"
authors = [
{ name = "David Gros" },
{ name = "Claudio Spiess" },
Expand All @@ -36,14 +36,14 @@ requires-python = ">=3.10"
classifiers = ["Programming Language :: Python :: 3"]
keywords = ["large language models", "openai"]
dependencies = [
"openai~=1.79.0",
"openai~=1.55.3",
Copy link

Copilot AI May 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Consider adding a comment that explains the rationale behind downgrading the dependency versions, so that maintainers understand any compatibility or legacy reasons behind these changes.

Copilot uses AI. Check for mistakes.
#"diskcache~=5.6.3",
#"joblib~=1.3.2",
"tiktoken~=0.9.0",
"tiktoken~=0.7.0",
"ratemate~=0.1",
"humanize~=4.8.0",
"xxhash~=3.4",
"numpy~=2.2.6",
"numpy~=1.24",
"packaging>=22.0",
]

Expand All @@ -55,11 +55,11 @@ dev = [
]
quant = ["bitsandbytes~=0.41.1"]
hf = [
"torch~=2.7.0",
"torch~=2.3.0",
#"transformers~=4.42.4"
"transformers~=4.51.3"
"transformers~=4.46.3"
]
anthropic = ["anthropic~=0.51.0"]
anthropic = ["anthropic~=0.40.0"]
docs = [
"mkdocs>=1.4.0",
"mkdocstrings[python]>=0.24.0",
Expand Down
8 changes: 3 additions & 5 deletions test/test_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from lmwrapper.abstract_predictor import get_mock_predictor
from lmwrapper.caching import set_cache_dir
from lmwrapper.compatibility import has_transformers_compatibility_issues
from lmwrapper.huggingface_wrapper.wrapper import get_huggingface_lm
from lmwrapper.structs import LmPrompt

Expand All @@ -22,8 +21,7 @@ def test_set_cache_dir():
tmpdirname = Path(tmpdirname)
assert len(list(tmpdirname.rglob("*"))) == 0
set_cache_dir(tmpdirname)
model_name = "HuggingFaceTB/SmolLM2-135M"
lm = get_huggingface_lm(model_name)
lm = get_huggingface_lm("gpt2")
prompt = LmPrompt(
"Write a story about fish:",
max_tokens=10,
Expand All @@ -34,13 +32,13 @@ def test_set_cache_dir():
prompt = dataclasses.replace(prompt, cache=True)
r2 = lm.predict(prompt)
assert r1.completion_text != r2.completion_text
lm2 = get_huggingface_lm(model_name)
lm2 = get_huggingface_lm("gpt2")
r3 = lm2.predict(prompt)
assert r2.completion_text == r3.completion_text
with tempfile.TemporaryDirectory() as tmpdirname:
tmpdirname = Path(tmpdirname)
set_cache_dir(tmpdirname)
lm2 = get_huggingface_lm(model_name)
lm2 = get_huggingface_lm("gpt2")
r4 = lm2.predict(prompt)
assert r3.completion_text != r4.completion_text

Expand Down
9 changes: 4 additions & 5 deletions test/test_huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ class Models(StrEnum):
CodeLLama_7B_Instruct = "codellama/CodeLlama-7b-Instruct-hf"
DistilGPT2 = "distilgpt2"
GPT2 = "gpt2"
SMOLLM2_135M = "HuggingFaceTB/SmolLM2-135M"
SMOLLM2_135M_INSTRUCT = "HuggingFaceTB/SmolLM2-135M-Instruct"
SMOLLM2_135M = "HuggingFaceTB/SmolLM2-135M-Instruct"
Mistral_7B = "mistralai/Mistral-7B-v0.1"
qwen25_500M_instruct = "Qwen/Qwen2.5-0.5B-Instruct"
QwenCoder25_500M = "Qwen/Qwen2.5-Coder-0.5B"
Expand All @@ -49,7 +48,7 @@ class Models(StrEnum):
SMALL_GPU = True

SEQ2SEQ_MODELS = {Models.CodeT5plus_220M}
CAUSAL_MODELS = {Models.SMOLLM2_135M,}
CAUSAL_MODELS = {Models.GPT2,}
BIG_SEQ2SEQ_MODELS = {Models.CodeT5plus_6B, Models.InstructCodeT5plus_16B}
BIG_CAUSAL_MODELS = {Models.CodeGen2_1B, Models.CodeGen2_3_7B, Models.Mistral_7B}
BIG_MODELS = BIG_SEQ2SEQ_MODELS | BIG_CAUSAL_MODELS
Expand Down Expand Up @@ -1096,7 +1095,7 @@ def test_chat_qwen():


def test_chat_smol():
model = Models.SMOLLM2_135M_INSTRUCT
model = Models.SMOLLM2_135M
lm = get_huggingface_lm(
model,
)
Expand All @@ -1123,7 +1122,7 @@ def test_chat_smol():

# so mac works 4.42.2 but not continue_final_message...
def test_smol_continue_chat():
model = Models.SMOLLM2_135M_INSTRUCT
model = Models.SMOLLM2_135M
lm = get_huggingface_lm(
model,
)
Expand Down
19 changes: 6 additions & 13 deletions test/test_huggingface_internals.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import dataclasses
import os

from lmwrapper.compatibility import has_transformers_compatibility_issues
from test.test_huggingface import BIG_MODELS, Models

import numpy as np
Expand All @@ -16,20 +14,15 @@

HAS_CUDA = torch.cuda.is_available()

models = [
(Models.CodeGen_350M, 20, 64 * 16),
(Models.CodeGen2_1B, 16, 2048),
# ^ Important to run since it doesn't use the same attentions value
]
if not has_transformers_compatibility_issues():
models = [
(Models.GPT2, 12, 768),
] + models


@pytest.mark.parametrize(
"model_name_layers_hidden",
models,
[
(Models.GPT2, 12, 768),
(Models.CodeGen_350M, 20, 64 * 16),
(Models.CodeGen2_1B, 16, 2048),
# ^ Important to run since it doesn't use the same attentions value
],
)
def test_get_internals_hidden_states(pytestconfig, model_name_layers_hidden):
model_name, num_layers, hidden_size = model_name_layers_hidden
Expand Down
4 changes: 1 addition & 3 deletions test/test_models_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ def wrapper(*args, **kwargs):
ECHOABLE_MODELS = [
# get_open_ai_lm(OpenAiModelNames.gpt_3_5_turbo_instruct),
# Won't work with now that echo disabled
#get_huggingface_lm("gpt2"),
MODEL_NAMES["small_hug"],
get_huggingface_lm("gpt2"),
]

random_prompt = ("Give a random base-64 guid (answer with only the guid). "
Expand Down Expand Up @@ -333,7 +332,6 @@ def test_unconditional_gen(lm):
"The capital of Germany is the city Berlin. "
"The capital of Spain is the city Madrid. "
"The capital of UK is the city London. "
"The capital of Japan is the city Tokyo. "
"The capital of France"
)

Expand Down
Loading