From 22b55e6b3cb93ce6d243c2e4b76649708a1ecebf Mon Sep 17 00:00:00 2001 From: Couiz <11710548+Couiz@users.noreply.github.com> Date: Sun, 31 May 2026 01:49:49 +0000 Subject: [PATCH 1/5] fix(llm): use adaptive thinking for Opus 4.7+ Anthropic models Claude Opus 4.7 and 4.8 removed the legacy extended-thinking format. Sending `thinking: {"type": "enabled", "budget_tokens": N}` to these models now fails with HTTP 400: "thinking.type.enabled" is not supported for this model. Use "thinking.type.adaptive" and "output_config.effort" to control thinking behavior. The Anthropic backend only ever emitted the legacy shape, so every dream/dialectic call on Opus 4.8 failed (reproduced on a self-hosted v3.0.7 deployment: deriver dream.deduction, claude-opus-4-8, thinking_budget_tokens=16000 -> 400 on all retries). Detect models that require the new API by version (Opus >= 4.7, and any future Opus) and emit `thinking: {"type": "adaptive"}` plus `output_config: {"effort": ...}`, mapping the existing `thinking_budget_tokens` to a sensible effort level (16000 -> high, 32000 -> xhigh) or honoring an explicit `thinking_effort`. Models that still accept the legacy format (Opus 4.6, Sonnet 4.6, Sonnet 4.5, Haiku, etc.) keep the `enabled` + `budget_tokens` shape unchanged. Applied to both the complete and stream paths; the assistant-prefill gate now keys off whether thinking is enabled rather than the raw budget. 
Verified the adaptive shape against the Anthropic docs: https://platform.claude.com/docs/en/build-with-claude/adaptive-thinking Co-Authored-By: Claude Opus 4.8 --- src/llm/backends/anthropic.py | 132 ++++++++++++++-- tests/llm/test_backends/test_anthropic.py | 180 ++++++++++++++++++++++ 2 files changed, 298 insertions(+), 14 deletions(-) diff --git a/src/llm/backends/anthropic.py b/src/llm/backends/anthropic.py index 17138583c..6a8b44203 100644 --- a/src/llm/backends/anthropic.py +++ b/src/llm/backends/anthropic.py @@ -2,6 +2,7 @@ import copy import json +import re from collections.abc import AsyncIterator from typing import Any @@ -11,6 +12,110 @@ from src.llm.backend import CompletionResult, StreamChunk, ToolCallResult from src.llm.structured_output import repair_response_model_json +# Effort levels accepted by ``output_config.effort`` (ordered low -> high). +# Honcho's ThinkingEffortLevel additionally allows "none"/"minimal", which the +# Messages API does not accept, so those are mapped onto supported values below. +_ANTHROPIC_EFFORTS: frozenset[str] = frozenset( + {"low", "medium", "high", "xhigh", "max"} +) +_EFFORT_ALIASES: dict[str, str] = {"minimal": "low"} + +# claude-<tier>-<major>-<minor>[-<date>]; only the version prefix matters +# for picking the thinking format. +_MODEL_VERSION_RE = re.compile(r"^claude-(opus|sonnet|haiku)-(\d+)-(\d+)") + +# Per-tier (major, minor) at/above which Anthropic removed the legacy +# ``thinking: {"type": "enabled", "budget_tokens": N}`` format and *requires* +# adaptive thinking. Sending the legacy shape to one of these models returns +# HTTP 400: +# "thinking.type.enabled" is not supported for this model. Use +# "thinking.type.adaptive" and "output_config.effort" to control thinking ...
+# Verified against the Anthropic docs (Opus 4.7 and Opus 4.8 are adaptive-only): +# https://platform.claude.com/docs/en/build-with-claude/adaptive-thinking +# Opus 4.6 / Sonnet 4.6 still accept the legacy format (deprecated but +# functional), so they intentionally stay on the legacy path. +_ADAPTIVE_THINKING_MIN_VERSION: dict[str, tuple[int, int]] = {"opus": (4, 7)} + + +def _requires_adaptive_thinking(model: str) -> bool: + """Return True if ``model`` rejects legacy budget-based thinking (HTTP 400). + + Such models require ``thinking: {"type": "adaptive"}`` with + ``output_config.effort`` instead of ``{"type": "enabled", "budget_tokens"}``. + """ + match = _MODEL_VERSION_RE.match(model) + if match is None: + return False + minimum = _ADAPTIVE_THINKING_MIN_VERSION.get(match.group(1)) + if minimum is None: + return False + return (int(match.group(2)), int(match.group(3))) >= minimum + + +def _budget_to_effort(thinking_budget_tokens: int) -> str: + """Bucket a legacy thinking-token budget into an adaptive effort level. + + There is no exact mapping from ``budget_tokens`` to ``effort``; these + buckets keep budget-configured models at a comparable thinking depth (e.g. + the deriver's 16000-token dream budget maps to ``high``, 32000 to ``xhigh``). + """ + if thinking_budget_tokens < 4096: + return "low" + if thinking_budget_tokens < 16000: + return "medium" + if thinking_budget_tokens < 32000: + return "high" + return "xhigh" + + +def _adaptive_effort( + thinking_effort: str | None, thinking_budget_tokens: int | None +) -> str | None: + """Resolve ``output_config.effort`` for an adaptive-thinking request. + + An explicit ``thinking_effort`` wins; otherwise the legacy + ``thinking_budget_tokens`` is bucketed so existing budget-based configs keep + a comparable thinking depth. Returns None to fall back to the API default + (``high``). 
+ """ + if thinking_effort and thinking_effort != "none": + normalized = _EFFORT_ALIASES.get(thinking_effort, thinking_effort) + if normalized in _ANTHROPIC_EFFORTS: + return normalized + if thinking_budget_tokens: + return _budget_to_effort(thinking_budget_tokens) + return None + + +def _build_thinking_params( + model: str, + thinking_budget_tokens: int | None, + thinking_effort: str | None, +) -> dict[str, Any]: + """Build the ``thinking`` (+ ``output_config``) request params for a model. + + Opus 4.7+ reject the legacy ``{"type": "enabled", "budget_tokens": N}`` shape + with HTTP 400 and require adaptive thinking; older models keep the legacy + shape unchanged. Returns an empty dict when thinking is not requested. + """ + if not _requires_adaptive_thinking(model): + if thinking_budget_tokens: + return { + "thinking": { + "type": "enabled", + "budget_tokens": thinking_budget_tokens, + } + } + return {} + + effort = _adaptive_effort(thinking_effort, thinking_budget_tokens) + if not thinking_budget_tokens and effort is None: + return {} + params: dict[str, Any] = {"thinking": {"type": "adaptive"}} + if effort is not None: + params["output_config"] = {"effort": effort} + return params + class AnthropicBackend: """Provider backend wrapping the native Anthropic SDK.""" @@ -34,7 +139,7 @@ async def complete( max_output_tokens: int | None = None, extra_params: dict[str, Any] | None = None, ) -> CompletionResult: - del max_output_tokens, thinking_effort + del max_output_tokens request_messages, system_messages = self._extract_system(messages) params: dict[str, Any] = { @@ -43,6 +148,11 @@ async def complete( "messages": request_messages, } + thinking_params = _build_thinking_params( + model, thinking_budget_tokens, thinking_effort + ) + params.update(thinking_params) + if temperature is not None: params["temperature"] = temperature if stop: @@ -60,11 +170,6 @@ async def complete( converted_tool_choice = self._convert_tool_choice(tool_choice) if converted_tool_choice is 
not None: params["tool_choice"] = converted_tool_choice - if thinking_budget_tokens: - params["thinking"] = { - "type": "enabled", - "budget_tokens": thinking_budget_tokens, - } if extra_params: for key in ("top_p", "top_k"): if key in extra_params: @@ -72,7 +177,7 @@ async def complete( use_json_prefill = ( bool(response_format or self._json_mode(extra_params)) - and not thinking_budget_tokens + and "thinking" not in thinking_params and self._supports_assistant_prefill(model) ) if use_json_prefill and params["messages"]: @@ -119,7 +224,7 @@ async def stream( extra_params: dict[str, Any] | None = None, ) -> AsyncIterator[StreamChunk]: is_json_mode = self._json_mode(extra_params) - del max_output_tokens, thinking_effort + del max_output_tokens request_messages, system_messages = self._extract_system(messages) params: dict[str, Any] = { @@ -127,6 +232,10 @@ async def stream( "max_tokens": max_tokens, "messages": request_messages, } + thinking_params = _build_thinking_params( + model, thinking_budget_tokens, thinking_effort + ) + params.update(thinking_params) if temperature is not None: params["temperature"] = temperature if stop: @@ -150,7 +259,7 @@ async def stream( params[key] = extra_params[key] use_json_prefill = ( bool(response_format or is_json_mode) - and not thinking_budget_tokens + and "thinking" not in thinking_params and self._supports_assistant_prefill(model) ) if use_json_prefill and params["messages"]: @@ -169,11 +278,6 @@ async def stream( params["messages"], f"\n\nRespond with valid JSON matching this schema:\n{schema_json}", ) - if thinking_budget_tokens: - params["thinking"] = { - "type": "enabled", - "budget_tokens": thinking_budget_tokens, - } async with self._client.messages.stream(**params) as stream: async for chunk in stream: diff --git a/tests/llm/test_backends/test_anthropic.py b/tests/llm/test_backends/test_anthropic.py index 52de0fa2a..d3cb1332e 100644 --- a/tests/llm/test_backends/test_anthropic.py +++ 
b/tests/llm/test_backends/test_anthropic.py @@ -1,4 +1,5 @@ from types import SimpleNamespace +from typing import Any from unittest.mock import AsyncMock, Mock import pytest @@ -152,3 +153,182 @@ async def test_anthropic_backend_ignores_thinking_effort() -> None: call = await_args.kwargs assert "thinking" not in call assert "reasoning_effort" not in call + + +def _text_response_client() -> Mock: + """Mock Anthropic client returning a minimal text completion.""" + client = Mock() + client.messages.create = AsyncMock( + return_value=SimpleNamespace( + content=[TextBlock(type="text", text="ok")], + usage=SimpleNamespace( + input_tokens=10, + output_tokens=5, + cache_creation_input_tokens=0, + cache_read_input_tokens=0, + ), + stop_reason="end_turn", + ) + ) + return client + + +def _create_call_kwargs(client: Mock) -> dict[str, Any]: + await_args = client.messages.create.await_args + if await_args is None: + raise AssertionError("Expected Anthropic client call") + return await_args.kwargs + + +@pytest.mark.parametrize( + "model", + [ + "claude-opus-4-8", + "claude-opus-4-7", + "claude-opus-5-0", # future Opus keeps using adaptive thinking + "claude-opus-4-8-20260120", # date-suffixed variant + ], +) +@pytest.mark.asyncio +async def test_anthropic_backend_uses_adaptive_thinking(model: str) -> None: + # These models reject the legacy {"type": "enabled", "budget_tokens": N} + # shape with HTTP 400; the backend must send adaptive thinking instead. 
+ client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model=model, + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=16000, + ) + + call = _create_call_kwargs(client) + assert call["thinking"] == {"type": "adaptive"} + assert call["output_config"] == {"effort": "high"} + + +@pytest.mark.parametrize( + "model", + [ + "claude-opus-4-6", + "claude-opus-4-5", + "claude-sonnet-4-6", + "claude-sonnet-4-5", + "claude-haiku-4-5", + ], +) +@pytest.mark.asyncio +async def test_anthropic_backend_keeps_legacy_thinking(model: str) -> None: + # Models that still accept the legacy budget format must be unchanged. + client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model=model, + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=16000, + ) + + call = _create_call_kwargs(client) + assert call["thinking"] == {"type": "enabled", "budget_tokens": 16000} + assert "output_config" not in call + + +@pytest.mark.asyncio +async def test_anthropic_backend_adaptive_thinking_respects_explicit_effort() -> None: + # An explicit thinking_effort overrides the budget-derived effort bucket. 
+ client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model="claude-opus-4-8", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=16000, + thinking_effort="low", + ) + + call = _create_call_kwargs(client) + assert call["thinking"] == {"type": "adaptive"} + assert call["output_config"] == {"effort": "low"} + + +class _FakeAnthropicStream: + """Minimal async-context-manager stand-in for client.messages.stream().""" + + def __init__(self, final_message: Any) -> None: + self._final_message: Any = final_message + + async def __aenter__(self) -> "_FakeAnthropicStream": + return self + + async def __aexit__(self, *exc_info: object) -> bool: + del exc_info + return False + + def __aiter__(self) -> "_FakeAnthropicStream": + return self + + async def __anext__(self) -> Any: + raise StopAsyncIteration + + async def get_final_message(self) -> Any: + return self._final_message + + +@pytest.mark.asyncio +async def test_anthropic_backend_stream_uses_adaptive_thinking_for_opus_4_8() -> None: + # The streaming path must apply the same adaptive thinking format. 
+ final_message = SimpleNamespace( + usage=SimpleNamespace(output_tokens=5), + stop_reason="end_turn", + ) + client = Mock() + client.messages.stream = Mock(return_value=_FakeAnthropicStream(final_message)) + + backend = AnthropicBackend(client) + chunks = [ + chunk + async for chunk in backend.stream( + model="claude-opus-4-8", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=16000, + ) + ] + + assert chunks[-1].is_done is True + stream_call = client.messages.stream.call_args + if stream_call is None: + raise AssertionError("Expected Anthropic stream call") + call = stream_call.kwargs + assert call["thinking"] == {"type": "adaptive"} + assert call["output_config"] == {"effort": "high"} + + +@pytest.mark.parametrize( + ("thinking_budget_tokens", "expected_effort"), + [ + (2048, "low"), + (8000, "medium"), + (16000, "high"), + (32000, "xhigh"), + ], +) +@pytest.mark.asyncio +async def test_anthropic_backend_maps_budget_to_effort( + thinking_budget_tokens: int, expected_effort: str +) -> None: + # With no explicit effort, the legacy budget is bucketed into an effort + # level so existing budget-based configs keep a comparable thinking depth. 
+ client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model="claude-opus-4-8", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=64000, + thinking_budget_tokens=thinking_budget_tokens, + ) + + call = _create_call_kwargs(client) + assert call["thinking"] == {"type": "adaptive"} + assert call["output_config"] == {"effort": expected_effort} From ae0c840b77cef09a593196732d79fa55ce2039a9 Mon Sep 17 00:00:00 2001 From: Couiz <11710548+Couiz@users.noreply.github.com> Date: Sun, 31 May 2026 02:08:15 +0000 Subject: [PATCH 2/5] fix(llm): validate thinking inputs in the Anthropic backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address CodeRabbit review on PR #752: reject invalid thinking inputs instead of silently coercing them, using the existing src.exceptions.ValidationException. - `_adaptive_effort` now raises on an unrecognized `thinking_effort` (previously fell back to the budget-derived effort / API default). - `_adaptive_effort` and `_build_thinking_params` now raise on a negative `thinking_budget_tokens` (previously bucketed to "low" on adaptive models and forwarded unchanged to legacy models). A budget of 0 stays a valid "disable thinking" sentinel — config.py explicitly permits 0 for Anthropic ("0 to disable") and the original backend treated it as no-thinking. So the guard rejects only `< 0` (matching CodeRabbit's proposed diff, `must be >= 0`), not `<= 0`, which would regress the disable path. None and valid positive budgets are unchanged. 
Co-Authored-By: Claude Opus 4.8 --- src/llm/backends/anthropic.py | 23 +++++++--- tests/llm/test_backends/test_anthropic.py | 54 +++++++++++++++++++++++ 2 files changed, 72 insertions(+), 5 deletions(-) diff --git a/src/llm/backends/anthropic.py b/src/llm/backends/anthropic.py index 6a8b44203..5a5ce3759 100644 --- a/src/llm/backends/anthropic.py +++ b/src/llm/backends/anthropic.py @@ -9,6 +9,7 @@ from anthropic.types import TextBlock, ThinkingBlock, ToolUseBlock from pydantic import BaseModel, ValidationError +from src.exceptions import ValidationException from src.llm.backend import CompletionResult, StreamChunk, ToolCallResult from src.llm.structured_output import repair_response_model_json @@ -76,12 +77,19 @@ def _adaptive_effort( An explicit ``thinking_effort`` wins; otherwise the legacy ``thinking_budget_tokens`` is bucketed so existing budget-based configs keep a comparable thinking depth. Returns None to fall back to the API default - (``high``). + (``high``). A budget of 0 (or None) means "no effort hint"; a negative + budget or an unrecognized effort is a caller error and raises + ``ValidationException`` rather than being silently coerced. 
""" - if thinking_effort and thinking_effort != "none": + if thinking_budget_tokens is not None and thinking_budget_tokens < 0: + raise ValidationException("thinking_budget_tokens must be >= 0") + if thinking_effort is not None and thinking_effort != "none": normalized = _EFFORT_ALIASES.get(thinking_effort, thinking_effort) - if normalized in _ANTHROPIC_EFFORTS: - return normalized + if normalized not in _ANTHROPIC_EFFORTS: + raise ValidationException( + f"Unsupported thinking_effort: {thinking_effort!r}" + ) + return normalized if thinking_budget_tokens: return _budget_to_effort(thinking_budget_tokens) return None @@ -96,8 +104,13 @@ def _build_thinking_params( Opus 4.7+ reject the legacy ``{"type": "enabled", "budget_tokens": N}`` shape with HTTP 400 and require adaptive thinking; older models keep the legacy - shape unchanged. Returns an empty dict when thinking is not requested. + shape unchanged. A budget of 0 (or None) means "no thinking"; a negative + budget raises ``ValidationException`` rather than being forwarded. Returns + an empty dict when thinking is not requested. 
""" + if thinking_budget_tokens is not None and thinking_budget_tokens < 0: + raise ValidationException("thinking_budget_tokens must be >= 0") + if not _requires_adaptive_thinking(model): if thinking_budget_tokens: return { diff --git a/tests/llm/test_backends/test_anthropic.py b/tests/llm/test_backends/test_anthropic.py index d3cb1332e..34c7ea64b 100644 --- a/tests/llm/test_backends/test_anthropic.py +++ b/tests/llm/test_backends/test_anthropic.py @@ -6,6 +6,7 @@ from anthropic.types import TextBlock, ThinkingBlock, ToolUseBlock from pydantic import BaseModel +from src.exceptions import ValidationException from src.llm.backends.anthropic import AnthropicBackend @@ -332,3 +333,56 @@ async def test_anthropic_backend_maps_budget_to_effort( call = _create_call_kwargs(client) assert call["thinking"] == {"type": "adaptive"} assert call["output_config"] == {"effort": expected_effort} + + +@pytest.mark.asyncio +async def test_anthropic_backend_rejects_unknown_thinking_effort() -> None: + # An unrecognized effort is a caller error, not a silent fallback. + backend = AnthropicBackend(_text_response_client()) + with pytest.raises(ValidationException): + await backend.complete( + model="claude-opus-4-8", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=16000, + thinking_effort="turbo", + ) + + +@pytest.mark.parametrize("model", ["claude-opus-4-8", "claude-opus-4-6"]) +@pytest.mark.asyncio +async def test_anthropic_backend_rejects_negative_budget(model: str) -> None: + # Negative budgets must not be bucketed (adaptive) or forwarded (legacy). 
+ backend = AnthropicBackend(_text_response_client()) + with pytest.raises(ValidationException): + await backend.complete( + model=model, + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=-1, + ) + + +@pytest.mark.parametrize( + "model", + ["claude-opus-4-8", "claude-opus-4-6", "claude-haiku-4-5"], +) +@pytest.mark.parametrize("thinking_budget_tokens", [None, 0]) +@pytest.mark.asyncio +async def test_anthropic_backend_no_thinking_for_zero_or_none_budget( + model: str, thinking_budget_tokens: int | None +) -> None: + # 0 is a valid "disable thinking" sentinel (config permits it); neither 0 + # nor None raises, and no thinking params are sent. + client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model=model, + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=thinking_budget_tokens, + ) + + call = _create_call_kwargs(client) + assert "thinking" not in call + assert "output_config" not in call From 567e2c463a7b3626f2aad1e998b7a28b43daecf7 Mon Sep 17 00:00:00 2001 From: Couiz <11710548+Couiz@users.noreply.github.com> Date: Sun, 31 May 2026 03:58:27 +0000 Subject: [PATCH 3/5] fix(llm): fail fast on deterministic thinking-input errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deterministic HonchoExceptions (e.g. invalid thinking budget/effort raised by the Anthropic backend) were caught by the tenacity retry wrappers in api.py and tool_loop.py, retried retry_attempts times, re-wrapped as a RetryError, and surfaced as a generic 500 — losing the intended 422 status and burning a fallback hop. Add retry=retry_if_not_exception_type(HonchoException) to every retry wrapper so these errors fail fast and propagate with their own status_code. Also enable adaptive thinking when an explicit thinking_effort is set, regardless of budget. 
Adds regression tests covering the tool-less (api.py prompt + message-list) and tool-loop (Dialectic/Dreamer) paths. --- src/llm/api.py | 15 +- src/llm/backends/anthropic.py | 8 +- src/llm/tool_loop.py | 14 +- tests/llm/test_backends/test_anthropic.py | 116 +++++++++++- .../test_thinking_validation_not_retried.py | 168 ++++++++++++++++++ 5 files changed, 313 insertions(+), 8 deletions(-) create mode 100644 tests/llm/test_thinking_validation_not_retried.py diff --git a/src/llm/api.py b/src/llm/api.py index 9c9a628d4..71500a2db 100644 --- a/src/llm/api.py +++ b/src/llm/api.py @@ -17,10 +17,15 @@ from pydantic import BaseModel from sentry_sdk.ai.monitoring import ai_track -from tenacity import retry, stop_after_attempt, wait_exponential +from tenacity import ( + retry, + retry_if_not_exception_type, + stop_after_attempt, + wait_exponential, +) from src.config import ConfiguredModelSettings, ModelConfig -from src.exceptions import ValidationException +from src.exceptions import HonchoException, ValidationException from src.telemetry.logging import conditional_observe from src.telemetry.reasoning_traces import log_reasoning_trace @@ -290,6 +295,11 @@ def before_retry_callback(retry_state: Any) -> None: decorated = retry( stop=stop_after_attempt(retry_attempts), wait=wait_exponential(multiplier=1, min=4, max=10), + # HonchoExceptions are deterministic input/config errors (e.g. an + # invalid thinking budget/effort raised by a backend): fail fast so + # they propagate with their own status_code instead of being + # retried, wrapped in RetryError, and surfaced as a generic 500. 
+ retry=retry_if_not_exception_type(HonchoException), before_sleep=before_retry_callback, )(decorated) @@ -403,6 +413,7 @@ async def _toolless_call() -> ( wrapped = retry( stop=stop_after_attempt(retry_attempts), wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_not_exception_type(HonchoException), before_sleep=before_retry_callback, )(wrapped) result: ( diff --git a/src/llm/backends/anthropic.py b/src/llm/backends/anthropic.py index 5a5ce3759..c22b27136 100644 --- a/src/llm/backends/anthropic.py +++ b/src/llm/backends/anthropic.py @@ -104,9 +104,11 @@ def _build_thinking_params( Opus 4.7+ reject the legacy ``{"type": "enabled", "budget_tokens": N}`` shape with HTTP 400 and require adaptive thinking; older models keep the legacy - shape unchanged. A budget of 0 (or None) means "no thinking"; a negative - budget raises ``ValidationException`` rather than being forwarded. Returns - an empty dict when thinking is not requested. + shape unchanged. A budget of 0 (or None) with no explicit effort means no + thinking; an explicit ``thinking_effort`` enables adaptive thinking + regardless of budget. A negative budget raises ``ValidationException`` + rather than being forwarded. Returns an empty dict when thinking is not + requested. 
""" if thinking_budget_tokens is not None and thinking_budget_tokens < 0: raise ValidationException("thinking_budget_tokens must be >= 0") diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py index 734af8b3c..978b613df 100644 --- a/src/llm/tool_loop.py +++ b/src/llm/tool_loop.py @@ -18,10 +18,15 @@ from typing import Any, ParamSpec, TypeVar from pydantic import BaseModel -from tenacity import retry, stop_after_attempt, wait_exponential +from tenacity import ( + retry, + retry_if_not_exception_type, + stop_after_attempt, + wait_exponential, +) from src.config import ModelTransport -from src.exceptions import ValidationException +from src.exceptions import HonchoException, ValidationException from src.utils.types import ( get_last_tool_metadata, iteration_scope, @@ -259,6 +264,9 @@ async def _setup_stream() -> AsyncIterator[HonchoLLMCallStreamChunk]: wrapped = retry( stop=stop_after_attempt(retry_attempts), wait=wait_exponential(multiplier=1, min=4, max=10), + # Deterministic input/config errors (HonchoException) fail fast + # rather than being retried and re-wrapped as RetryError → 500. 
+ retry=retry_if_not_exception_type(HonchoException), before_sleep=before_retry_callback, )(_setup_stream) stream = await wrapped() @@ -380,6 +388,7 @@ async def _call_with_messages( call_func = retry( stop=stop_after_attempt(retry_attempts), wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_not_exception_type(HonchoException), before_sleep=before_retry_callback, )(_call_with_messages) else: @@ -634,6 +643,7 @@ async def _final_call() -> HonchoLLMCallResponse[Any]: final_call_func = retry( stop=stop_after_attempt(retry_attempts), wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_not_exception_type(HonchoException), before_sleep=before_retry_callback, )(_final_call) else: diff --git a/tests/llm/test_backends/test_anthropic.py b/tests/llm/test_backends/test_anthropic.py index 34c7ea64b..0d63aeef3 100644 --- a/tests/llm/test_backends/test_anthropic.py +++ b/tests/llm/test_backends/test_anthropic.py @@ -125,7 +125,7 @@ async def test_anthropic_backend_skips_assistant_prefill_for_claude_4_models() - @pytest.mark.asyncio -async def test_anthropic_backend_ignores_thinking_effort() -> None: +async def test_legacy_model_ignores_thinking_effort() -> None: client = Mock() client.messages.create = AsyncMock( return_value=SimpleNamespace( @@ -386,3 +386,117 @@ async def test_anthropic_backend_no_thinking_for_zero_or_none_budget( call = _create_call_kwargs(client) assert "thinking" not in call assert "output_config" not in call + + +@pytest.mark.asyncio +async def test_anthropic_backend_adaptive_aliases_minimal_to_low() -> None: + # Honcho's ThinkingEffortLevel allows "minimal", which the Messages API + # does not accept; it is aliased onto the supported "low" effort. 
+ client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model="claude-opus-4-8", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_effort="minimal", + ) + + call = _create_call_kwargs(client) + assert call["thinking"] == {"type": "adaptive"} + assert call["output_config"] == {"effort": "low"} + + +@pytest.mark.asyncio +async def test_anthropic_backend_effort_none_falls_through_to_budget_bucket() -> None: + # "none" means "no explicit effort hint", not "disable thinking": with a + # positive budget it falls through to the budget-derived effort bucket + # (16000 -> high). Thinking is disabled via a 0/None budget, not "none". + client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model="claude-opus-4-8", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=16000, + thinking_effort="none", + ) + + call = _create_call_kwargs(client) + assert call["thinking"] == {"type": "adaptive"} + assert call["output_config"] == {"effort": "high"} + + +@pytest.mark.asyncio +async def test_anthropic_backend_explicit_effort_with_zero_budget_enables_adaptive() -> ( + None +): + # A 0 budget alone disables thinking, but an explicit effort still enables + # adaptive thinking on 4.7+ models — the effort is the authoritative hint. 
+ client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model="claude-opus-4-8", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=0, + thinking_effort="high", + ) + + call = _create_call_kwargs(client) + assert call["thinking"] == {"type": "adaptive"} + assert call["output_config"] == {"effort": "high"} + + +@pytest.mark.asyncio +async def test_anthropic_backend_stream_keeps_legacy_thinking_for_sonnet_4_6() -> None: + # The streaming path must keep the legacy budget shape for models that + # still accept it (no adaptive output_config). + final_message = SimpleNamespace( + usage=SimpleNamespace(output_tokens=5), + stop_reason="end_turn", + ) + client = Mock() + client.messages.stream = Mock(return_value=_FakeAnthropicStream(final_message)) + + backend = AnthropicBackend(client) + chunks = [ + chunk + async for chunk in backend.stream( + model="claude-sonnet-4-6", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=32000, + thinking_budget_tokens=16000, + ) + ] + + assert chunks[-1].is_done is True + stream_call = client.messages.stream.call_args + if stream_call is None: + raise AssertionError("Expected Anthropic stream call") + call = stream_call.kwargs + assert call["thinking"] == {"type": "enabled", "budget_tokens": 16000} + assert "output_config" not in call + + +@pytest.mark.parametrize( + ("thinking_budget_tokens", "expected_effort"), + [(4095, "low"), (4096, "medium")], +) +@pytest.mark.asyncio +async def test_anthropic_backend_budget_low_medium_boundary( + thinking_budget_tokens: int, expected_effort: str +) -> None: + # The <4096 bucket boundary: 4095 -> low, 4096 -> medium. (16000 -> high + # and 32000 -> xhigh are pinned by test_anthropic_backend_maps_budget_to_effort.) 
+ client = _text_response_client() + backend = AnthropicBackend(client) + await backend.complete( + model="claude-opus-4-8", + messages=[{"role": "user", "content": "Hello"}], + max_tokens=64000, + thinking_budget_tokens=thinking_budget_tokens, + ) + + call = _create_call_kwargs(client) + assert call["thinking"] == {"type": "adaptive"} + assert call["output_config"] == {"effort": expected_effort} diff --git a/tests/llm/test_thinking_validation_not_retried.py b/tests/llm/test_thinking_validation_not_retried.py new file mode 100644 index 000000000..6b3dcf68a --- /dev/null +++ b/tests/llm/test_thinking_validation_not_retried.py @@ -0,0 +1,168 @@ +"""Regression tests for B1: deterministic thinking-input errors must surface +as HTTP 422 and must not be retried. + +The Anthropic backend rejects invalid thinking inputs (a negative budget or an +unrecognized effort) by raising ``ValidationException`` (status_code 422) from +inside ``_build_thinking_params``. That raise happens inside the closures the +tenacity ``retry`` wrappers decorate. Without a retry predicate, a deterministic +``HonchoException`` would be retried ``retry_attempts`` times, re-wrapped as a +``tenacity.RetryError`` (which is not a ``HonchoException``), and surface as a +generic 500 — losing the intended 422 and wasting a fallback hop. + +The fix adds ``retry=retry_if_not_exception_type(HonchoException)`` to every +retry wrapper in ``src/llm/api.py`` and ``src/llm/tool_loop.py`` so these +deterministic errors fail fast and propagate with their own status code. 
+""" + +from __future__ import annotations + +from typing import Any, cast +from unittest.mock import AsyncMock, Mock, patch + +import pytest + +from src.config import ModelConfig +from src.exceptions import ValidationException +from src.llm import registry, tool_loop +from src.llm.api import honcho_llm_call +from src.llm.runtime import AttemptPlan +from src.llm.tool_loop import execute_tool_loop +from src.llm.types import ProviderClient + +_THINKING_ERROR = "thinking_budget_tokens must be >= 0" + + +@pytest.mark.asyncio +async def test_negative_thinking_budget_surfaces_as_422_not_retried() -> None: + """End-to-end through ``honcho_llm_call`` with the real Anthropic backend. + + A negative ``thinking_budget_tokens`` is config-reachable: ``ModelConfig`` + only rejects ``0 < budget < 1024`` for Anthropic, so a negative budget + passes config load and reaches the backend at call time. The backend + rejects it, and the retry wrapper must let that ``ValidationException`` + propagate as a 422 rather than retrying it into a ``RetryError`` (a 500). + """ + model_config = ModelConfig( + model="claude-opus-4-8", + transport="anthropic", + thinking_budget_tokens=-1, + ) + + client = Mock() + client.messages.create = AsyncMock() + + with ( + patch.dict(registry.CLIENTS, {"anthropic": client}), + pytest.raises(ValidationException) as exc_info, + ): + await honcho_llm_call( + model_config=model_config, + prompt="hi", + max_tokens=100, + enable_retry=True, + retry_attempts=3, + ) + + assert exc_info.value.status_code == 422 + # Validation fails before any network call, and the wrapper must not retry: + # the old behavior raised a RetryError after exhausting all attempts. 
+ assert client.messages.create.await_count == 0 + + +@pytest.mark.parametrize( + "messages", + [None, [{"role": "user", "content": "hi"}]], +) +@pytest.mark.asyncio +async def test_toolless_validation_exception_not_retried( + messages: list[dict[str, Any]] | None, +) -> None: + """The tool-less retry wrappers (``api.py``) call the inner backend exactly + once for a deterministic ``ValidationException`` — not ``retry_attempts`` + times. ``messages=None`` exercises the prompt-only wrapper; a supplied + ``messages`` list exercises the message-list wrapper. + """ + model_config = ModelConfig(model="claude-opus-4-8", transport="anthropic") + + inner = AsyncMock(side_effect=ValidationException(_THINKING_ERROR)) + + with ( + patch.dict(registry.CLIENTS, {"anthropic": Mock()}), + patch("src.llm.api.honcho_llm_call_inner", inner), + pytest.raises(ValidationException) as exc_info, + ): + await honcho_llm_call( + model_config=model_config, + prompt="hi", + max_tokens=100, + messages=messages, + enable_retry=True, + retry_attempts=3, + ) + + assert exc_info.value.status_code == 422 + assert inner.await_count == 1 + + +def _make_plan() -> AttemptPlan: + return AttemptPlan( + provider="anthropic", + model="claude-opus-4-8", + client=cast(ProviderClient, object()), + thinking_budget_tokens=None, + reasoning_effort=None, + selected_config=ModelConfig(model="claude-opus-4-8", transport="anthropic"), + attempt=1, + retry_attempts=3, + is_fallback=False, + ) + + +def _unused_tool_executor(_name: str, _tool_input: dict[str, Any]) -> str: + # The inner call raises before any tool runs; this is never invoked. 
+ return "" + + +def _noop_before_retry(_retry_state: Any) -> None: + return None + + +@pytest.mark.asyncio +async def test_tool_loop_validation_exception_not_retried() -> None: + """The per-iteration retry wrapper in ``execute_tool_loop`` (the Dialectic + and Dreamer path) must also fail fast on a deterministic + ``ValidationException`` — exactly one inner call, propagated as 422. + """ + inner = AsyncMock(side_effect=ValidationException(_THINKING_ERROR)) + tools: list[dict[str, Any]] = [ + {"name": "noop", "description": "no-op", "input_schema": {"type": "object"}} + ] + + with ( + patch.object(tool_loop, "honcho_llm_call_inner", inner), + pytest.raises(ValidationException) as exc_info, + ): + await execute_tool_loop( + prompt="hi", + max_tokens=100, + messages=[{"role": "user", "content": "hi"}], + tools=tools, + tool_choice="auto", + tool_executor=_unused_tool_executor, + max_tool_iterations=5, + response_model=None, + json_mode=False, + temperature=None, + stop_seqs=None, + verbosity=None, + enable_retry=True, + retry_attempts=3, + max_input_tokens=None, + get_attempt_plan=_make_plan, + before_retry_callback=_noop_before_retry, + stream_final=False, + telemetry=None, + ) + + assert exc_info.value.status_code == 422 + assert inner.await_count == 1 From 5277b63ad90213ce44b679b97a85334a61fbe948 Mon Sep 17 00:00:00 2001 From: Couiz <11710548+Couiz@users.noreply.github.com> Date: Sun, 31 May 2026 04:19:33 +0000 Subject: [PATCH 4/5] refactor(llm): simplify thinking retry-predicate plumbing Extract the repeated tenacity retry configuration (exponential backoff + fail-fast on HonchoException) into a single with_llm_retry() helper in runtime.py, the module that already owns retry-attempt tracking. Replaces five identical inline retry(...) blocks across api.py and tool_loop.py, centralizing the backoff policy, the fail-fast predicate, and its rationale in one place. Behavior is unchanged: the helper is a pure pass-through to the same retry(...) 
call and preserves each call site's exact callable type. Co-Authored-By: Claude Opus 4.8 --- src/llm/api.py | 35 ++++++++++++----------------------- src/llm/runtime.py | 35 ++++++++++++++++++++++++++++++++++- src/llm/tool_loop.py | 44 +++++++++++++++++--------------------------- 3 files changed, 63 insertions(+), 51 deletions(-) diff --git a/src/llm/api.py b/src/llm/api.py index 71500a2db..0cf338556 100644 --- a/src/llm/api.py +++ b/src/llm/api.py @@ -17,15 +17,9 @@ from pydantic import BaseModel from sentry_sdk.ai.monitoring import ai_track -from tenacity import ( - retry, - retry_if_not_exception_type, - stop_after_attempt, - wait_exponential, -) from src.config import ConfiguredModelSettings, ModelConfig -from src.exceptions import HonchoException, ValidationException +from src.exceptions import ValidationException from src.telemetry.logging import conditional_observe from src.telemetry.reasoning_traces import log_reasoning_trace @@ -37,6 +31,7 @@ plan_attempt, resolve_runtime_model_config, update_current_langfuse_observation, + with_llm_retry, ) from .tool_loop import execute_tool_loop from .types import ( @@ -292,16 +287,11 @@ def before_retry_callback(retry_state: Any) -> None: logger.info(f"Will retry with attempt {next_attempt}/{retry_attempts}") if enable_retry: - decorated = retry( - stop=stop_after_attempt(retry_attempts), - wait=wait_exponential(multiplier=1, min=4, max=10), - # HonchoExceptions are deterministic input/config errors (e.g. an - # invalid thinking budget/effort raised by a backend): fail fast so - # they propagate with their own status_code instead of being - # retried, wrapped in RetryError, and surfaced as a generic 500. 
- retry=retry_if_not_exception_type(HonchoException), - before_sleep=before_retry_callback, - )(decorated) + decorated = with_llm_retry( + decorated, + retry_attempts=retry_attempts, + before_retry_callback=before_retry_callback, + ) def _trace_thinking_budget() -> int | None: # Trace log should reflect what got applied, so fall back to the @@ -410,12 +400,11 @@ async def _toolless_call() -> ( if track_name: wrapped = ai_track(track_name)(wrapped) if enable_retry: - wrapped = retry( - stop=stop_after_attempt(retry_attempts), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_not_exception_type(HonchoException), - before_sleep=before_retry_callback, - )(wrapped) + wrapped = with_llm_retry( + wrapped, + retry_attempts=retry_attempts, + before_retry_callback=before_retry_callback, + ) result: ( HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk] ) = await wrapped() diff --git a/src/llm/runtime.py b/src/llm/runtime.py index ae551378c..44ba8e87d 100644 --- a/src/llm/runtime.py +++ b/src/llm/runtime.py @@ -12,9 +12,17 @@ from __future__ import annotations import logging +from collections.abc import Callable from contextvars import ContextVar from dataclasses import dataclass -from typing import Any +from typing import Any, TypeVar + +from tenacity import ( + retry, + retry_if_not_exception_type, + stop_after_attempt, + wait_exponential, +) from src.config import ( ConfiguredModelSettings, @@ -23,16 +31,41 @@ resolve_model_config, settings, ) +from src.exceptions import HonchoException from .registry import backend_for_provider, client_for_model_config from .types import ProviderClient, ReasoningEffortType +_WrappedFn = TypeVar("_WrappedFn", bound=Callable[..., Any]) + logger = logging.getLogger(__name__) # ContextVar tracking the current retry attempt for provider switching. 
current_attempt: ContextVar[int] = ContextVar("current_attempt", default=0) +def with_llm_retry( + func: _WrappedFn, + *, + retry_attempts: int, + before_retry_callback: Callable[[Any], None], +) -> _WrappedFn: + """Wrap an LLM-call closure with Honcho's standard retry policy. + + Transient failures retry with exponential backoff, but ``HonchoException`` + fails fast: these are deterministic input/config errors (e.g. an invalid + thinking budget/effort raised by a backend) and should propagate with their + own ``status_code`` instead of being retried, re-wrapped in tenacity's + ``RetryError``, and surfaced as a generic 500. + """ + return retry( + stop=stop_after_attempt(retry_attempts), + wait=wait_exponential(multiplier=1, min=4, max=10), + retry=retry_if_not_exception_type(HonchoException), + before_sleep=before_retry_callback, + )(func) + + def update_current_langfuse_observation( provider: ModelTransport, model: str, diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py index 978b613df..69f2c4a95 100644 --- a/src/llm/tool_loop.py +++ b/src/llm/tool_loop.py @@ -18,15 +18,9 @@ from typing import Any, ParamSpec, TypeVar from pydantic import BaseModel -from tenacity import ( - retry, - retry_if_not_exception_type, - stop_after_attempt, - wait_exponential, -) from src.config import ModelTransport -from src.exceptions import HonchoException, ValidationException +from src.exceptions import ValidationException from src.utils.types import ( get_last_tool_metadata, iteration_scope, @@ -41,6 +35,7 @@ AttemptPlan, current_attempt, effective_temperature, + with_llm_retry, ) from .types import ( HonchoLLMCallResponse, @@ -261,14 +256,11 @@ async def _setup_stream() -> AsyncIterator[HonchoLLMCallStreamChunk]: ) if enable_retry: - wrapped = retry( - stop=stop_after_attempt(retry_attempts), - wait=wait_exponential(multiplier=1, min=4, max=10), - # Deterministic input/config errors (HonchoException) fail fast - # rather than being retried and re-wrapped as RetryError → 500. 
- retry=retry_if_not_exception_type(HonchoException), - before_sleep=before_retry_callback, - )(_setup_stream) + wrapped = with_llm_retry( + _setup_stream, + retry_attempts=retry_attempts, + before_retry_callback=before_retry_callback, + ) stream = await wrapped() else: stream = await _setup_stream() @@ -385,12 +377,11 @@ async def _call_with_messages( ) if enable_retry: - call_func = retry( - stop=stop_after_attempt(retry_attempts), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_not_exception_type(HonchoException), - before_sleep=before_retry_callback, - )(_call_with_messages) + call_func = with_llm_retry( + _call_with_messages, + retry_attempts=retry_attempts, + before_retry_callback=before_retry_callback, + ) else: call_func = _call_with_messages @@ -640,12 +631,11 @@ async def _final_call() -> HonchoLLMCallResponse[Any]: ) if enable_retry: - final_call_func = retry( - stop=stop_after_attempt(retry_attempts), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_not_exception_type(HonchoException), - before_sleep=before_retry_callback, - )(_final_call) + final_call_func = with_llm_retry( + _final_call, + retry_attempts=retry_attempts, + before_retry_callback=before_retry_callback, + ) else: final_call_func = _final_call From 0061bc0d9df8b5d6262c47c8574f6229a8c3d4bf Mon Sep 17 00:00:00 2001 From: Couiz <11710548+Couiz@users.noreply.github.com> Date: Sun, 31 May 2026 05:15:05 +0000 Subject: [PATCH 5/5] test(llm): fix live Anthropic thinking assertion for adaptive models The live thinking+tool-replay test hard-coded the legacy {"type": "enabled", "budget_tokens": N} shape, but Opus 4.7+ are adaptive-only and now correctly receive {"type": "adaptive"} with the budget bucketed into output_config.effort. 
Branch the expected shape on _requires_adaptive_thinking so the assertion stays correct across the whole claude_4_5_plus family (Sonnet 4.5/4.6 keep the legacy shape) and additionally assert the resolved effort for adaptive models. Verified live against claude-opus-4-8: both live Anthropic tests pass. Co-Authored-By: Claude Opus 4.8 --- tests/live_llm/test_live_anthropic.py | 42 ++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/tests/live_llm/test_live_anthropic.py b/tests/live_llm/test_live_anthropic.py index e10d2b103..b613ba254 100644 --- a/tests/live_llm/test_live_anthropic.py +++ b/tests/live_llm/test_live_anthropic.py @@ -3,6 +3,10 @@ import pytest from src.llm.backend import CompletionResult +from src.llm.backends.anthropic import ( + _budget_to_effort, # pyright: ignore[reportPrivateUsage] + _requires_adaptive_thinking, # pyright: ignore[reportPrivateUsage] +) from src.llm.history_adapters import AnthropicHistoryAdapter from src.llm.request_builder import execute_completion @@ -20,6 +24,20 @@ pytestmark = [pytest.mark.live_llm, pytest.mark.requires_anthropic] +def _expected_thinking_kwargs(model: str, budget_tokens: int) -> dict[str, object]: + """The ``thinking`` request param honcho should send for ``model``. + + Opus 4.7+ are adaptive-only: the backend sends ``{"type": "adaptive"}`` and + moves the depth hint to ``output_config.effort`` (a bucketed ``budget_tokens``). + Older models (e.g. Sonnet 4.5/4.6) keep the legacy budget shape. Mirroring + the backend's own branch here keeps this live assertion correct across the + whole ``claude_4_5_plus`` family instead of hard-coding one shape. 
+ """ + if _requires_adaptive_thinking(model): + return {"type": "adaptive"} + return {"type": "enabled", "budget_tokens": budget_tokens} + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_spec", @@ -115,10 +133,14 @@ async def test_live_anthropic_thinking_and_tool_replay( tools=tools, ) - assert create_calls[0]["kwargs"]["thinking"] == { - "type": "enabled", - "budget_tokens": 1024, - } + assert ( + create_calls[0]["kwargs"]["thinking"] + == _expected_thinking_kwargs(model_spec.model, 1024) + ) + if _requires_adaptive_thinking(model_spec.model): + assert create_calls[0]["kwargs"]["output_config"] == { + "effort": _budget_to_effort(1024) + } assert first.tool_calls, "Anthropic should issue a tool call in the first turn" assert first.thinking_blocks, "Anthropic thinking blocks should be preserved" @@ -145,10 +167,14 @@ async def test_live_anthropic_thinking_and_tool_replay( tools=tools, ) - assert create_calls[1]["kwargs"]["thinking"] == { - "type": "enabled", - "budget_tokens": 1024, - } + assert ( + create_calls[1]["kwargs"]["thinking"] + == _expected_thinking_kwargs(model_spec.model, 1024) + ) + if _requires_adaptive_thinking(model_spec.model): + assert create_calls[1]["kwargs"]["output_config"] == { + "effort": _budget_to_effort(1024) + } assert isinstance(second.content, str) assert "13" in second.content assert "prime" in second.content.lower()