From 22b55e6b3cb93ce6d243c2e4b76649708a1ecebf Mon Sep 17 00:00:00 2001
From: Couiz <11710548+Couiz@users.noreply.github.com>
Date: Sun, 31 May 2026 01:49:49 +0000
Subject: [PATCH 1/5] fix(llm): use adaptive thinking for Opus 4.7+ Anthropic
 models

Claude Opus 4.7 and 4.8 removed the legacy extended-thinking format.
Sending `thinking: {"type": "enabled", "budget_tokens": N}` to these
models now fails with HTTP 400:

    "thinking.type.enabled" is not supported for this model.
    Use "thinking.type.adaptive" and "output_config.effort" to control
    thinking behavior.

The Anthropic backend only ever emitted the legacy shape, so every
dream/dialectic call on Opus 4.8 failed (reproduced on a self-hosted
v3.0.7 deployment: deriver dream.deduction, claude-opus-4-8,
thinking_budget_tokens=16000 -> 400 on all retries).

Detect models that require the new API by version (Opus >= 4.7, and any
future Opus) and emit `thinking: {"type": "adaptive"}` plus
`output_config: {"effort": ...}`, mapping the existing
`thinking_budget_tokens` to a sensible effort level (16000 -> high,
32000 -> xhigh) or honoring an explicit `thinking_effort`. Models that
still accept the legacy format (Opus 4.6, Sonnet 4.6, Sonnet 4.5,
Haiku, etc.) keep the `enabled` + `budget_tokens` shape unchanged.

Applied to both the complete and stream paths; the assistant-prefill
gate now keys off whether thinking is enabled rather than the raw
budget. Verified the adaptive shape against the Anthropic docs:
https://platform.claude.com/docs/en/build-with-claude/adaptive-thinking

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/llm/backends/anthropic.py             | 132 ++++++++++++++--
 tests/llm/test_backends/test_anthropic.py | 180 ++++++++++++++++++++++
 2 files changed, 298 insertions(+), 14 deletions(-)
diff --git a/src/llm/backends/anthropic.py b/src/llm/backends/anthropic.py
index 17138583c..6a8b44203 100644
--- a/src/llm/backends/anthropic.py
+++ b/src/llm/backends/anthropic.py
@@ -2,6 +2,7 @@
 
 import copy
 import json
+import re
 from collections.abc import AsyncIterator
 from typing import Any
 
@@ -11,6 +12,110 @@
 from src.llm.backend import CompletionResult, StreamChunk, ToolCallResult
 from src.llm.structured_output import repair_response_model_json
 
+# Effort levels accepted by ``output_config.effort`` (ordered low -> high).
+# Honcho's ThinkingEffortLevel additionally allows "none"/"minimal", which the
+# Messages API does not accept, so those are mapped onto supported values below.
+_ANTHROPIC_EFFORTS: frozenset[str] = frozenset(
+    {"low", "medium", "high", "xhigh", "max"}
+)
+_EFFORT_ALIASES: dict[str, str] = {"minimal": "low"}
+
+# claude-<tier>-<major>-<minor>[-<date/suffix>]; only the version prefix matters
+# for picking the thinking format.
+_MODEL_VERSION_RE = re.compile(r"^claude-(opus|sonnet|haiku)-(\d+)-(\d+)")
+
+# Per-tier (major, minor) at/above which Anthropic removed the legacy
+# ``thinking: {"type": "enabled", "budget_tokens": N}`` format and *requires*
+# adaptive thinking. Sending the legacy shape to one of these models returns
+# HTTP 400:
+#   "thinking.type.enabled" is not supported for this model. Use
+#   "thinking.type.adaptive" and "output_config.effort" to control thinking ...
+# Verified against the Anthropic docs (Opus 4.7 and Opus 4.8 are adaptive-only):
+# https://platform.claude.com/docs/en/build-with-claude/adaptive-thinking
+# Opus 4.6 / Sonnet 4.6 still accept the legacy format (deprecated but
+# functional), so they intentionally stay on the legacy path.
+_ADAPTIVE_THINKING_MIN_VERSION: dict[str, tuple[int, int]] = {"opus": (4, 7)}
+
+
+def _requires_adaptive_thinking(model: str) -> bool:
+    """Return True if ``model`` rejects legacy budget-based thinking (HTTP 400).
+
+    Such models require ``thinking: {"type": "adaptive"}`` with
+    ``output_config.effort`` instead of ``{"type": "enabled", "budget_tokens"}``.
+    """
+    match = _MODEL_VERSION_RE.match(model)
+    if match is None:
+        return False
+    minimum = _ADAPTIVE_THINKING_MIN_VERSION.get(match.group(1))
+    if minimum is None:
+        return False
+    return (int(match.group(2)), int(match.group(3))) >= minimum
+
+
+def _budget_to_effort(thinking_budget_tokens: int) -> str:
+    """Bucket a legacy thinking-token budget into an adaptive effort level.
+
+    There is no exact mapping from ``budget_tokens`` to ``effort``; these
+    buckets keep budget-configured models at a comparable thinking depth (e.g.
+    the deriver's 16000-token dream budget maps to ``high``, 32000 to ``xhigh``).
+    """
+    if thinking_budget_tokens < 4096:
+        return "low"
+    if thinking_budget_tokens < 16000:
+        return "medium"
+    if thinking_budget_tokens < 32000:
+        return "high"
+    return "xhigh"
+
+
+def _adaptive_effort(
+    thinking_effort: str | None, thinking_budget_tokens: int | None
+) -> str | None:
+    """Resolve ``output_config.effort`` for an adaptive-thinking request.
+
+    An explicit ``thinking_effort`` wins; otherwise the legacy
+    ``thinking_budget_tokens`` is bucketed so existing budget-based configs keep
+    a comparable thinking depth. Returns None to fall back to the API default
+    (``high``).
+    """
+    if thinking_effort and thinking_effort != "none":
+        normalized = _EFFORT_ALIASES.get(thinking_effort, thinking_effort)
+        if normalized in _ANTHROPIC_EFFORTS:
+            return normalized
+    if thinking_budget_tokens:
+        return _budget_to_effort(thinking_budget_tokens)
+    return None
+
+
+def _build_thinking_params(
+    model: str,
+    thinking_budget_tokens: int | None,
+    thinking_effort: str | None,
+) -> dict[str, Any]:
+    """Build the ``thinking`` (+ ``output_config``) request params for a model.
+
+    Opus 4.7+ reject the legacy ``{"type": "enabled", "budget_tokens": N}`` shape
+    with HTTP 400 and require adaptive thinking; older models keep the legacy
+    shape unchanged. Returns an empty dict when thinking is not requested.
+    """
+    if not _requires_adaptive_thinking(model):
+        if thinking_budget_tokens:
+            return {
+                "thinking": {
+                    "type": "enabled",
+                    "budget_tokens": thinking_budget_tokens,
+                }
+            }
+        return {}
+
+    effort = _adaptive_effort(thinking_effort, thinking_budget_tokens)
+    if not thinking_budget_tokens and effort is None:
+        return {}
+    params: dict[str, Any] = {"thinking": {"type": "adaptive"}}
+    if effort is not None:
+        params["output_config"] = {"effort": effort}
+    return params
+
 
 class AnthropicBackend:
     """Provider backend wrapping the native Anthropic SDK."""
@@ -34,7 +139,7 @@ async def complete(
         max_output_tokens: int | None = None,
         extra_params: dict[str, Any] | None = None,
     ) -> CompletionResult:
-        del max_output_tokens, thinking_effort
+        del max_output_tokens
 
         request_messages, system_messages = self._extract_system(messages)
         params: dict[str, Any] = {
@@ -43,6 +148,11 @@ async def complete(
             "messages": request_messages,
         }
 
+        thinking_params = _build_thinking_params(
+            model, thinking_budget_tokens, thinking_effort
+        )
+        params.update(thinking_params)
+
         if temperature is not None:
             params["temperature"] = temperature
         if stop:
@@ -60,11 +170,6 @@ async def complete(
             converted_tool_choice = self._convert_tool_choice(tool_choice)
             if converted_tool_choice is not None:
                 params["tool_choice"] = converted_tool_choice
-        if thinking_budget_tokens:
-            params["thinking"] = {
-                "type": "enabled",
-                "budget_tokens": thinking_budget_tokens,
-            }
         if extra_params:
             for key in ("top_p", "top_k"):
                 if key in extra_params:
@@ -72,7 +177,7 @@ async def complete(
 
         use_json_prefill = (
             bool(response_format or self._json_mode(extra_params))
-            and not thinking_budget_tokens
+            and "thinking" not in thinking_params
             and self._supports_assistant_prefill(model)
         )
         if use_json_prefill and params["messages"]:
@@ -119,7 +224,7 @@ async def stream(
         extra_params: dict[str, Any] | None = None,
     ) -> AsyncIterator[StreamChunk]:
         is_json_mode = self._json_mode(extra_params)
-        del max_output_tokens, thinking_effort
+        del max_output_tokens
 
         request_messages, system_messages = self._extract_system(messages)
         params: dict[str, Any] = {
@@ -127,6 +232,10 @@ async def stream(
             "max_tokens": max_tokens,
             "messages": request_messages,
         }
+        thinking_params = _build_thinking_params(
+            model, thinking_budget_tokens, thinking_effort
+        )
+        params.update(thinking_params)
         if temperature is not None:
             params["temperature"] = temperature
         if stop:
@@ -150,7 +259,7 @@ async def stream(
                     params[key] = extra_params[key]
         use_json_prefill = (
             bool(response_format or is_json_mode)
-            and not thinking_budget_tokens
+            and "thinking" not in thinking_params
             and self._supports_assistant_prefill(model)
         )
         if use_json_prefill and params["messages"]:
@@ -169,11 +278,6 @@ async def stream(
                 params["messages"],
                 f"\n\nRespond with valid JSON matching this schema:\n{schema_json}",
             )
-        if thinking_budget_tokens:
-            params["thinking"] = {
-                "type": "enabled",
-                "budget_tokens": thinking_budget_tokens,
-            }
 
         async with self._client.messages.stream(**params) as stream:
             async for chunk in stream:
diff --git a/tests/llm/test_backends/test_anthropic.py b/tests/llm/test_backends/test_anthropic.py
index 52de0fa2a..d3cb1332e 100644
--- a/tests/llm/test_backends/test_anthropic.py
+++ b/tests/llm/test_backends/test_anthropic.py
@@ -1,4 +1,5 @@
 from types import SimpleNamespace
+from typing import Any
 from unittest.mock import AsyncMock, Mock
 
 import pytest
@@ -152,3 +153,182 @@ async def test_anthropic_backend_ignores_thinking_effort() -> None:
     call = await_args.kwargs
     assert "thinking" not in call
     assert "reasoning_effort" not in call
+
+
+def _text_response_client() -> Mock:
+    """Mock Anthropic client returning a minimal text completion."""
+    client = Mock()
+    client.messages.create = AsyncMock(
+        return_value=SimpleNamespace(
+            content=[TextBlock(type="text", text="ok")],
+            usage=SimpleNamespace(
+                input_tokens=10,
+                output_tokens=5,
+                cache_creation_input_tokens=0,
+                cache_read_input_tokens=0,
+            ),
+            stop_reason="end_turn",
+        )
+    )
+    return client
+
+
+def _create_call_kwargs(client: Mock) -> dict[str, Any]:
+    await_args = client.messages.create.await_args
+    if await_args is None:
+        raise AssertionError("Expected Anthropic client call")
+    return await_args.kwargs
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "claude-opus-4-8",
+        "claude-opus-4-7",
+        "claude-opus-5-0",  # future Opus keeps using adaptive thinking
+        "claude-opus-4-8-20260120",  # date-suffixed variant
+    ],
+)
+@pytest.mark.asyncio
+async def test_anthropic_backend_uses_adaptive_thinking(model: str) -> None:
+    # These models reject the legacy {"type": "enabled", "budget_tokens": N}
+    # shape with HTTP 400; the backend must send adaptive thinking instead.
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model=model,
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=32000,
+        thinking_budget_tokens=16000,
+    )
+
+    call = _create_call_kwargs(client)
+    assert call["thinking"] == {"type": "adaptive"}
+    assert call["output_config"] == {"effort": "high"}
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "claude-opus-4-6",
+        "claude-opus-4-5",
+        "claude-sonnet-4-6",
+        "claude-sonnet-4-5",
+        "claude-haiku-4-5",
+    ],
+)
+@pytest.mark.asyncio
+async def test_anthropic_backend_keeps_legacy_thinking(model: str) -> None:
+    # Models that still accept the legacy budget format must be unchanged.
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model=model,
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=32000,
+        thinking_budget_tokens=16000,
+    )
+
+    call = _create_call_kwargs(client)
+    assert call["thinking"] == {"type": "enabled", "budget_tokens": 16000}
+    assert "output_config" not in call
+
+
+@pytest.mark.asyncio
+async def test_anthropic_backend_adaptive_thinking_respects_explicit_effort() -> None:
+    # An explicit thinking_effort overrides the budget-derived effort bucket.
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model="claude-opus-4-8",
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=32000,
+        thinking_budget_tokens=16000,
+        thinking_effort="low",
+    )
+
+    call = _create_call_kwargs(client)
+    assert call["thinking"] == {"type": "adaptive"}
+    assert call["output_config"] == {"effort": "low"}
+
+
+class _FakeAnthropicStream:
+    """Minimal async-context-manager stand-in for client.messages.stream()."""
+
+    def __init__(self, final_message: Any) -> None:
+        self._final_message: Any = final_message
+
+    async def __aenter__(self) -> "_FakeAnthropicStream":
+        return self
+
+    async def __aexit__(self, *exc_info: object) -> bool:
+        del exc_info
+        return False
+
+    def __aiter__(self) -> "_FakeAnthropicStream":
+        return self
+
+    async def __anext__(self) -> Any:
+        raise StopAsyncIteration
+
+    async def get_final_message(self) -> Any:
+        return self._final_message
+
+
+@pytest.mark.asyncio
+async def test_anthropic_backend_stream_uses_adaptive_thinking_for_opus_4_8() -> None:
+    # The streaming path must apply the same adaptive thinking format.
+    final_message = SimpleNamespace(
+        usage=SimpleNamespace(output_tokens=5),
+        stop_reason="end_turn",
+    )
+    client = Mock()
+    client.messages.stream = Mock(return_value=_FakeAnthropicStream(final_message))
+
+    backend = AnthropicBackend(client)
+    chunks = [
+        chunk
+        async for chunk in backend.stream(
+            model="claude-opus-4-8",
+            messages=[{"role": "user", "content": "Hello"}],
+            max_tokens=32000,
+            thinking_budget_tokens=16000,
+        )
+    ]
+
+    assert chunks[-1].is_done is True
+    stream_call = client.messages.stream.call_args
+    if stream_call is None:
+        raise AssertionError("Expected Anthropic stream call")
+    call = stream_call.kwargs
+    assert call["thinking"] == {"type": "adaptive"}
+    assert call["output_config"] == {"effort": "high"}
+
+
+@pytest.mark.parametrize(
+    ("thinking_budget_tokens", "expected_effort"),
+    [
+        (2048, "low"),
+        (8000, "medium"),
+        (16000, "high"),
+        (32000, "xhigh"),
+    ],
+)
+@pytest.mark.asyncio
+async def test_anthropic_backend_maps_budget_to_effort(
+    thinking_budget_tokens: int, expected_effort: str
+) -> None:
+    # With no explicit effort, the legacy budget is bucketed into an effort
+    # level so existing budget-based configs keep a comparable thinking depth.
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model="claude-opus-4-8",
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=64000,
+        thinking_budget_tokens=thinking_budget_tokens,
+    )
+
+    call = _create_call_kwargs(client)
+    assert call["thinking"] == {"type": "adaptive"}
+    assert call["output_config"] == {"effort": expected_effort}

From ae0c840b77cef09a593196732d79fa55ce2039a9 Mon Sep 17 00:00:00 2001
From: Couiz <11710548+Couiz@users.noreply.github.com>
Date: Sun, 31 May 2026 02:08:15 +0000
Subject: [PATCH 2/5] fix(llm): validate thinking inputs in the Anthropic
 backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address CodeRabbit review on PR #752: reject invalid thinking inputs
instead of silently coercing them, using the existing
src.exceptions.ValidationException.

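- `_adaptive_effort` now raises on an unrecognized `thinking_effort`
  (previously fell back to the budget-derived effort / API default).
  This check runs only on the adaptive path (Opus 4.7+): legacy models
  return from `_build_thinking_params` before `_adaptive_effort` is
  called, so they continue to ignore `thinking_effort` entirely.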
- `_adaptive_effort` and `_build_thinking_params` now raise on a
  negative `thinking_budget_tokens` (previously bucketed to "low" on
  adaptive models and forwarded unchanged to legacy models).

A budget of 0 stays a valid "disable thinking" sentinel — config.py
explicitly permits 0 for Anthropic ("0 to disable") and the original
backend treated it as no-thinking. So the guard rejects only `< 0`
(matching CodeRabbit's proposed diff, `must be >= 0`), not `<= 0`,
which would regress the disable path. None and valid positive budgets
are unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/llm/backends/anthropic.py             | 23 +++++++---
 tests/llm/test_backends/test_anthropic.py | 54 +++++++++++++++++++++++
 2 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/src/llm/backends/anthropic.py b/src/llm/backends/anthropic.py
index 6a8b44203..5a5ce3759 100644
--- a/src/llm/backends/anthropic.py
+++ b/src/llm/backends/anthropic.py
@@ -9,6 +9,7 @@
 from anthropic.types import TextBlock, ThinkingBlock, ToolUseBlock
 from pydantic import BaseModel, ValidationError
 
+from src.exceptions import ValidationException
 from src.llm.backend import CompletionResult, StreamChunk, ToolCallResult
 from src.llm.structured_output import repair_response_model_json
 
@@ -76,12 +77,19 @@ def _adaptive_effort(
     An explicit ``thinking_effort`` wins; otherwise the legacy
     ``thinking_budget_tokens`` is bucketed so existing budget-based configs keep
     a comparable thinking depth. Returns None to fall back to the API default
-    (``high``).
+    (``high``). A budget of 0 (or None) means "no effort hint"; a negative
+    budget or an unrecognized effort is a caller error and raises
+    ``ValidationException`` rather than being silently coerced.
     """
-    if thinking_effort and thinking_effort != "none":
+    if thinking_budget_tokens is not None and thinking_budget_tokens < 0:
+        raise ValidationException("thinking_budget_tokens must be >= 0")
+    if thinking_effort is not None and thinking_effort != "none":
         normalized = _EFFORT_ALIASES.get(thinking_effort, thinking_effort)
-        if normalized in _ANTHROPIC_EFFORTS:
-            return normalized
+        if normalized not in _ANTHROPIC_EFFORTS:
+            raise ValidationException(
+                f"Unsupported thinking_effort: {thinking_effort!r}"
+            )
+        return normalized
     if thinking_budget_tokens:
         return _budget_to_effort(thinking_budget_tokens)
     return None
@@ -96,8 +104,13 @@ def _build_thinking_params(
 
     Opus 4.7+ reject the legacy ``{"type": "enabled", "budget_tokens": N}`` shape
     with HTTP 400 and require adaptive thinking; older models keep the legacy
-    shape unchanged. Returns an empty dict when thinking is not requested.
+    shape unchanged. A budget of 0 (or None) means "no thinking"; a negative
+    budget raises ``ValidationException`` rather than being forwarded. Returns
+    an empty dict when thinking is not requested.
     """
+    if thinking_budget_tokens is not None and thinking_budget_tokens < 0:
+        raise ValidationException("thinking_budget_tokens must be >= 0")
+
     if not _requires_adaptive_thinking(model):
         if thinking_budget_tokens:
             return {
diff --git a/tests/llm/test_backends/test_anthropic.py b/tests/llm/test_backends/test_anthropic.py
index d3cb1332e..34c7ea64b 100644
--- a/tests/llm/test_backends/test_anthropic.py
+++ b/tests/llm/test_backends/test_anthropic.py
@@ -6,6 +6,7 @@
 from anthropic.types import TextBlock, ThinkingBlock, ToolUseBlock
 from pydantic import BaseModel
 
+from src.exceptions import ValidationException
 from src.llm.backends.anthropic import AnthropicBackend
 
 
@@ -332,3 +333,56 @@ async def test_anthropic_backend_maps_budget_to_effort(
     call = _create_call_kwargs(client)
     assert call["thinking"] == {"type": "adaptive"}
     assert call["output_config"] == {"effort": expected_effort}
+
+
+@pytest.mark.asyncio
+async def test_anthropic_backend_rejects_unknown_thinking_effort() -> None:
+    # An unrecognized effort is a caller error, not a silent fallback.
+    backend = AnthropicBackend(_text_response_client())
+    with pytest.raises(ValidationException):
+        await backend.complete(
+            model="claude-opus-4-8",
+            messages=[{"role": "user", "content": "Hello"}],
+            max_tokens=32000,
+            thinking_budget_tokens=16000,
+            thinking_effort="turbo",
+        )
+
+
+@pytest.mark.parametrize("model", ["claude-opus-4-8", "claude-opus-4-6"])
+@pytest.mark.asyncio
+async def test_anthropic_backend_rejects_negative_budget(model: str) -> None:
+    # Negative budgets must not be bucketed (adaptive) or forwarded (legacy).
+    backend = AnthropicBackend(_text_response_client())
+    with pytest.raises(ValidationException):
+        await backend.complete(
+            model=model,
+            messages=[{"role": "user", "content": "Hello"}],
+            max_tokens=32000,
+            thinking_budget_tokens=-1,
+        )
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["claude-opus-4-8", "claude-opus-4-6", "claude-haiku-4-5"],
+)
+@pytest.mark.parametrize("thinking_budget_tokens", [None, 0])
+@pytest.mark.asyncio
+async def test_anthropic_backend_no_thinking_for_zero_or_none_budget(
+    model: str, thinking_budget_tokens: int | None
+) -> None:
+    # 0 is a valid "disable thinking" sentinel (config permits it); neither 0
+    # nor None raises, and no thinking params are sent.
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model=model,
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=32000,
+        thinking_budget_tokens=thinking_budget_tokens,
+    )
+
+    call = _create_call_kwargs(client)
+    assert "thinking" not in call
+    assert "output_config" not in call

From 567e2c463a7b3626f2aad1e998b7a28b43daecf7 Mon Sep 17 00:00:00 2001
From: Couiz <11710548+Couiz@users.noreply.github.com>
Date: Sun, 31 May 2026 03:58:27 +0000
Subject: [PATCH 3/5] fix(llm): fail fast on deterministic thinking-input
 errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Deterministic HonchoExceptions (e.g. invalid thinking budget/effort raised
by the Anthropic backend) were caught by the tenacity retry wrappers in
api.py and tool_loop.py, retried retry_attempts times, re-wrapped as a
RetryError, and surfaced as a generic 500 — losing the intended 422 status
and burning a fallback hop.

Add retry=retry_if_not_exception_type(HonchoException) to every retry
wrapper so these errors fail fast and propagate with their own status_code.
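Also document that an explicit thinking_effort enables adaptive thinking
on Opus 4.7+ regardless of budget. The backend already behaved this way,
so the anthropic.py change in this patch is docstring-only; the behavior
is now pinned by new tests.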

Adds regression tests covering the tool-less (api.py prompt + message-list)
and tool-loop (Dialectic/Dreamer) paths.
---
 src/llm/api.py                                |  15 +-
 src/llm/backends/anthropic.py                 |   8 +-
 src/llm/tool_loop.py                          |  14 +-
 tests/llm/test_backends/test_anthropic.py     | 116 +++++++++++-
 .../test_thinking_validation_not_retried.py   | 168 ++++++++++++++++++
 5 files changed, 313 insertions(+), 8 deletions(-)
 create mode 100644 tests/llm/test_thinking_validation_not_retried.py

diff --git a/src/llm/api.py b/src/llm/api.py
index 9c9a628d4..71500a2db 100644
--- a/src/llm/api.py
+++ b/src/llm/api.py
@@ -17,10 +17,15 @@
 
 from pydantic import BaseModel
 from sentry_sdk.ai.monitoring import ai_track
-from tenacity import retry, stop_after_attempt, wait_exponential
+from tenacity import (
+    retry,
+    retry_if_not_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
 
 from src.config import ConfiguredModelSettings, ModelConfig
-from src.exceptions import ValidationException
+from src.exceptions import HonchoException, ValidationException
 from src.telemetry.logging import conditional_observe
 from src.telemetry.reasoning_traces import log_reasoning_trace
 
@@ -290,6 +295,11 @@ def before_retry_callback(retry_state: Any) -> None:
         decorated = retry(
             stop=stop_after_attempt(retry_attempts),
             wait=wait_exponential(multiplier=1, min=4, max=10),
+            # HonchoExceptions are deterministic input/config errors (e.g. an
+            # invalid thinking budget/effort raised by a backend): fail fast so
+            # they propagate with their own status_code instead of being
+            # retried, wrapped in RetryError, and surfaced as a generic 500.
+            retry=retry_if_not_exception_type(HonchoException),
             before_sleep=before_retry_callback,
         )(decorated)
 
@@ -403,6 +413,7 @@ async def _toolless_call() -> (
                 wrapped = retry(
                     stop=stop_after_attempt(retry_attempts),
                     wait=wait_exponential(multiplier=1, min=4, max=10),
+                    retry=retry_if_not_exception_type(HonchoException),
                     before_sleep=before_retry_callback,
                 )(wrapped)
             result: (
diff --git a/src/llm/backends/anthropic.py b/src/llm/backends/anthropic.py
index 5a5ce3759..c22b27136 100644
--- a/src/llm/backends/anthropic.py
+++ b/src/llm/backends/anthropic.py
@@ -104,9 +104,11 @@ def _build_thinking_params(
 
     Opus 4.7+ reject the legacy ``{"type": "enabled", "budget_tokens": N}`` shape
     with HTTP 400 and require adaptive thinking; older models keep the legacy
-    shape unchanged. A budget of 0 (or None) means "no thinking"; a negative
-    budget raises ``ValidationException`` rather than being forwarded. Returns
-    an empty dict when thinking is not requested.
+    shape unchanged. A budget of 0 (or None) with no explicit effort means no
+    thinking; an explicit ``thinking_effort`` enables adaptive thinking
+    regardless of budget. A negative budget raises ``ValidationException``
+    rather than being forwarded. Returns an empty dict when thinking is not
+    requested.
     """
     if thinking_budget_tokens is not None and thinking_budget_tokens < 0:
         raise ValidationException("thinking_budget_tokens must be >= 0")
diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py
index 734af8b3c..978b613df 100644
--- a/src/llm/tool_loop.py
+++ b/src/llm/tool_loop.py
@@ -18,10 +18,15 @@
 from typing import Any, ParamSpec, TypeVar
 
 from pydantic import BaseModel
-from tenacity import retry, stop_after_attempt, wait_exponential
+from tenacity import (
+    retry,
+    retry_if_not_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
 
 from src.config import ModelTransport
-from src.exceptions import ValidationException
+from src.exceptions import HonchoException, ValidationException
 from src.utils.types import (
     get_last_tool_metadata,
     iteration_scope,
@@ -259,6 +264,9 @@ async def _setup_stream() -> AsyncIterator[HonchoLLMCallStreamChunk]:
         wrapped = retry(
             stop=stop_after_attempt(retry_attempts),
             wait=wait_exponential(multiplier=1, min=4, max=10),
+            # Deterministic input/config errors (HonchoException) fail fast
+            # rather than being retried and re-wrapped as RetryError → 500.
+            retry=retry_if_not_exception_type(HonchoException),
             before_sleep=before_retry_callback,
         )(_setup_stream)
         stream = await wrapped()
@@ -380,6 +388,7 @@ async def _call_with_messages(
             call_func = retry(
                 stop=stop_after_attempt(retry_attempts),
                 wait=wait_exponential(multiplier=1, min=4, max=10),
+                retry=retry_if_not_exception_type(HonchoException),
                 before_sleep=before_retry_callback,
             )(_call_with_messages)
         else:
@@ -634,6 +643,7 @@ async def _final_call() -> HonchoLLMCallResponse[Any]:
         final_call_func = retry(
             stop=stop_after_attempt(retry_attempts),
             wait=wait_exponential(multiplier=1, min=4, max=10),
+            retry=retry_if_not_exception_type(HonchoException),
             before_sleep=before_retry_callback,
         )(_final_call)
     else:
diff --git a/tests/llm/test_backends/test_anthropic.py b/tests/llm/test_backends/test_anthropic.py
index 34c7ea64b..0d63aeef3 100644
--- a/tests/llm/test_backends/test_anthropic.py
+++ b/tests/llm/test_backends/test_anthropic.py
@@ -125,7 +125,7 @@ async def test_anthropic_backend_skips_assistant_prefill_for_claude_4_models() -
 
 
 @pytest.mark.asyncio
-async def test_anthropic_backend_ignores_thinking_effort() -> None:
+async def test_legacy_model_ignores_thinking_effort() -> None:
     client = Mock()
     client.messages.create = AsyncMock(
         return_value=SimpleNamespace(
@@ -386,3 +386,117 @@ async def test_anthropic_backend_no_thinking_for_zero_or_none_budget(
     call = _create_call_kwargs(client)
     assert "thinking" not in call
     assert "output_config" not in call
+
+
+@pytest.mark.asyncio
+async def test_anthropic_backend_adaptive_aliases_minimal_to_low() -> None:
+    # Honcho's ThinkingEffortLevel allows "minimal", which the Messages API
+    # does not accept; it is aliased onto the supported "low" effort.
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model="claude-opus-4-8",
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=32000,
+        thinking_effort="minimal",
+    )
+
+    call = _create_call_kwargs(client)
+    assert call["thinking"] == {"type": "adaptive"}
+    assert call["output_config"] == {"effort": "low"}
+
+
+@pytest.mark.asyncio
+async def test_anthropic_backend_effort_none_falls_through_to_budget_bucket() -> None:
+    # "none" means "no explicit effort hint", not "disable thinking": with a
+    # positive budget it falls through to the budget-derived effort bucket
+    # (16000 -> high). Thinking is disabled via a 0/None budget, not "none".
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model="claude-opus-4-8",
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=32000,
+        thinking_budget_tokens=16000,
+        thinking_effort="none",
+    )
+
+    call = _create_call_kwargs(client)
+    assert call["thinking"] == {"type": "adaptive"}
+    assert call["output_config"] == {"effort": "high"}
+
+
+@pytest.mark.asyncio
+async def test_anthropic_backend_explicit_effort_with_zero_budget_enables_adaptive() -> (
+    None
+):
+    # A 0 budget alone disables thinking, but an explicit effort still enables
+    # adaptive thinking on 4.7+ models — the effort is the authoritative hint.
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model="claude-opus-4-8",
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=32000,
+        thinking_budget_tokens=0,
+        thinking_effort="high",
+    )
+
+    call = _create_call_kwargs(client)
+    assert call["thinking"] == {"type": "adaptive"}
+    assert call["output_config"] == {"effort": "high"}
+
+
+@pytest.mark.asyncio
+async def test_anthropic_backend_stream_keeps_legacy_thinking_for_sonnet_4_6() -> None:
+    # The streaming path must keep the legacy budget shape for models that
+    # still accept it (no adaptive output_config).
+    final_message = SimpleNamespace(
+        usage=SimpleNamespace(output_tokens=5),
+        stop_reason="end_turn",
+    )
+    client = Mock()
+    client.messages.stream = Mock(return_value=_FakeAnthropicStream(final_message))
+
+    backend = AnthropicBackend(client)
+    chunks = [
+        chunk
+        async for chunk in backend.stream(
+            model="claude-sonnet-4-6",
+            messages=[{"role": "user", "content": "Hello"}],
+            max_tokens=32000,
+            thinking_budget_tokens=16000,
+        )
+    ]
+
+    assert chunks[-1].is_done is True
+    stream_call = client.messages.stream.call_args
+    if stream_call is None:
+        raise AssertionError("Expected Anthropic stream call")
+    call = stream_call.kwargs
+    assert call["thinking"] == {"type": "enabled", "budget_tokens": 16000}
+    assert "output_config" not in call
+
+
+@pytest.mark.parametrize(
+    ("thinking_budget_tokens", "expected_effort"),
+    [(4095, "low"), (4096, "medium")],
+)
+@pytest.mark.asyncio
+async def test_anthropic_backend_budget_low_medium_boundary(
+    thinking_budget_tokens: int, expected_effort: str
+) -> None:
+    # The <4096 bucket boundary: 4095 -> low, 4096 -> medium. (16000 -> high
+    # and 32000 -> xhigh are pinned by test_anthropic_backend_maps_budget_to_effort.)
+    client = _text_response_client()
+    backend = AnthropicBackend(client)
+    await backend.complete(
+        model="claude-opus-4-8",
+        messages=[{"role": "user", "content": "Hello"}],
+        max_tokens=64000,
+        thinking_budget_tokens=thinking_budget_tokens,
+    )
+
+    call = _create_call_kwargs(client)
+    assert call["thinking"] == {"type": "adaptive"}
+    assert call["output_config"] == {"effort": expected_effort}
diff --git a/tests/llm/test_thinking_validation_not_retried.py b/tests/llm/test_thinking_validation_not_retried.py
new file mode 100644
index 000000000..6b3dcf68a
--- /dev/null
+++ b/tests/llm/test_thinking_validation_not_retried.py
@@ -0,0 +1,168 @@
+"""Regression tests for B1: deterministic thinking-input errors must surface
+as HTTP 422 and must not be retried.
+
+The Anthropic backend rejects invalid thinking inputs (a negative budget or an
+unrecognized effort) by raising ``ValidationException`` (status_code 422) from
+inside ``_build_thinking_params``. That raise happens inside the closures the
+tenacity ``retry`` wrappers decorate. Without a retry predicate, a deterministic
+``HonchoException`` would be retried ``retry_attempts`` times, re-wrapped as a
+``tenacity.RetryError`` (which is not a ``HonchoException``), and surface as a
+generic 500 — losing the intended 422 and wasting a fallback hop.
+
+The fix applies a ``retry_if_not_exception_type(HonchoException)`` predicate to
+every retry wrapper used by ``src/llm/api.py`` and ``src/llm/tool_loop.py`` so
+these deterministic errors fail fast and propagate with their own status code.
+"""
+
+from __future__ import annotations
+
+from typing import Any, cast
+from unittest.mock import AsyncMock, Mock, patch
+
+import pytest
+
+from src.config import ModelConfig
+from src.exceptions import ValidationException
+from src.llm import registry, tool_loop
+from src.llm.api import honcho_llm_call
+from src.llm.runtime import AttemptPlan
+from src.llm.tool_loop import execute_tool_loop
+from src.llm.types import ProviderClient
+
+_THINKING_ERROR = "thinking_budget_tokens must be >= 0"
+
+
+@pytest.mark.asyncio
+async def test_negative_thinking_budget_surfaces_as_422_not_retried() -> None:
+    """End-to-end through ``honcho_llm_call`` with the real Anthropic backend.
+
+    A negative ``thinking_budget_tokens`` is config-reachable: ``ModelConfig``
+    only rejects ``0 < budget < 1024`` for Anthropic, so a negative budget
+    passes config load and reaches the backend at call time. The backend
+    rejects it, and the retry wrapper must let that ``ValidationException``
+    propagate as a 422 rather than retrying it into a ``RetryError`` (a 500).
+    """
+    model_config = ModelConfig(
+        model="claude-opus-4-8",
+        transport="anthropic",
+        thinking_budget_tokens=-1,
+    )
+
+    client = Mock()
+    client.messages.create = AsyncMock()
+
+    with (
+        patch.dict(registry.CLIENTS, {"anthropic": client}),
+        pytest.raises(ValidationException) as exc_info,
+    ):
+        await honcho_llm_call(
+            model_config=model_config,
+            prompt="hi",
+            max_tokens=100,
+            enable_retry=True,
+            retry_attempts=3,
+        )
+
+    assert exc_info.value.status_code == 422
+    # Validation fails before any network call, and the wrapper must not retry:
+    # the old behavior raised a RetryError after exhausting all attempts.
+    assert client.messages.create.await_count == 0
+
+
+@pytest.mark.parametrize(
+    "messages",
+    [None, [{"role": "user", "content": "hi"}]],
+)
+@pytest.mark.asyncio
+async def test_toolless_validation_exception_not_retried(
+    messages: list[dict[str, Any]] | None,
+) -> None:
+    """The tool-less retry wrappers (``api.py``) call the inner backend exactly
+    once for a deterministic ``ValidationException`` — not ``retry_attempts``
+    times. ``messages=None`` exercises the prompt-only wrapper; a supplied
+    ``messages`` list exercises the message-list wrapper.
+    """
+    model_config = ModelConfig(model="claude-opus-4-8", transport="anthropic")
+
+    inner = AsyncMock(side_effect=ValidationException(_THINKING_ERROR))
+
+    with (
+        patch.dict(registry.CLIENTS, {"anthropic": Mock()}),
+        patch("src.llm.api.honcho_llm_call_inner", inner),
+        pytest.raises(ValidationException) as exc_info,
+    ):
+        await honcho_llm_call(
+            model_config=model_config,
+            prompt="hi",
+            max_tokens=100,
+            messages=messages,
+            enable_retry=True,
+            retry_attempts=3,
+        )
+
+    assert exc_info.value.status_code == 422
+    assert inner.await_count == 1
+
+
+def _make_plan() -> AttemptPlan:
+    return AttemptPlan(
+        provider="anthropic",
+        model="claude-opus-4-8",
+        client=cast(ProviderClient, object()),
+        thinking_budget_tokens=None,
+        reasoning_effort=None,
+        selected_config=ModelConfig(model="claude-opus-4-8", transport="anthropic"),
+        attempt=1,
+        retry_attempts=3,
+        is_fallback=False,
+    )
+
+
+def _unused_tool_executor(_name: str, _tool_input: dict[str, Any]) -> str:
+    # The inner call raises before any tool runs; this is never invoked.
+    return ""
+
+
+def _noop_before_retry(_retry_state: Any) -> None:
+    return None
+
+
+@pytest.mark.asyncio
+async def test_tool_loop_validation_exception_not_retried() -> None:
+    """The per-iteration retry wrapper in ``execute_tool_loop`` (the Dialectic
+    and Dreamer path) must also fail fast on a deterministic
+    ``ValidationException`` — exactly one inner call, propagated as 422.
+    """
+    inner = AsyncMock(side_effect=ValidationException(_THINKING_ERROR))
+    tools: list[dict[str, Any]] = [
+        {"name": "noop", "description": "no-op", "input_schema": {"type": "object"}}
+    ]
+
+    with (
+        patch.object(tool_loop, "honcho_llm_call_inner", inner),
+        pytest.raises(ValidationException) as exc_info,
+    ):
+        await execute_tool_loop(
+            prompt="hi",
+            max_tokens=100,
+            messages=[{"role": "user", "content": "hi"}],
+            tools=tools,
+            tool_choice="auto",
+            tool_executor=_unused_tool_executor,
+            max_tool_iterations=5,
+            response_model=None,
+            json_mode=False,
+            temperature=None,
+            stop_seqs=None,
+            verbosity=None,
+            enable_retry=True,
+            retry_attempts=3,
+            max_input_tokens=None,
+            get_attempt_plan=_make_plan,
+            before_retry_callback=_noop_before_retry,
+            stream_final=False,
+            telemetry=None,
+        )
+
+    assert exc_info.value.status_code == 422
+    assert inner.await_count == 1

From 5277b63ad90213ce44b679b97a85334a61fbe948 Mon Sep 17 00:00:00 2001
From: Couiz <11710548+Couiz@users.noreply.github.com>
Date: Sun, 31 May 2026 04:19:33 +0000
Subject: [PATCH 4/5] refactor(llm): simplify thinking retry-predicate plumbing

Extract the repeated tenacity retry configuration (exponential backoff +
fail-fast on HonchoException) into a single with_llm_retry() helper in
runtime.py, the module that already owns retry-attempt tracking. Replaces
five identical inline retry(...) blocks across api.py and tool_loop.py,
centralizing the backoff policy, the fail-fast predicate, and its rationale
in one place. Behavior is unchanged: the helper is a pure pass-through to the
same retry(...) call and preserves each call site's exact callable type.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/llm/api.py       | 35 ++++++++++++-----------------------
 src/llm/runtime.py   | 35 ++++++++++++++++++++++++++++++++++-
 src/llm/tool_loop.py | 44 +++++++++++++++++---------------------------
 3 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/src/llm/api.py b/src/llm/api.py
index 71500a2db..0cf338556 100644
--- a/src/llm/api.py
+++ b/src/llm/api.py
@@ -17,15 +17,9 @@
 
 from pydantic import BaseModel
 from sentry_sdk.ai.monitoring import ai_track
-from tenacity import (
-    retry,
-    retry_if_not_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
 
 from src.config import ConfiguredModelSettings, ModelConfig
-from src.exceptions import HonchoException, ValidationException
+from src.exceptions import ValidationException
 from src.telemetry.logging import conditional_observe
 from src.telemetry.reasoning_traces import log_reasoning_trace
 
@@ -37,6 +31,7 @@
     plan_attempt,
     resolve_runtime_model_config,
     update_current_langfuse_observation,
+    with_llm_retry,
 )
 from .tool_loop import execute_tool_loop
 from .types import (
@@ -292,16 +287,11 @@ def before_retry_callback(retry_state: Any) -> None:
             logger.info(f"Will retry with attempt {next_attempt}/{retry_attempts}")
 
     if enable_retry:
-        decorated = retry(
-            stop=stop_after_attempt(retry_attempts),
-            wait=wait_exponential(multiplier=1, min=4, max=10),
-            # HonchoExceptions are deterministic input/config errors (e.g. an
-            # invalid thinking budget/effort raised by a backend): fail fast so
-            # they propagate with their own status_code instead of being
-            # retried, wrapped in RetryError, and surfaced as a generic 500.
-            retry=retry_if_not_exception_type(HonchoException),
-            before_sleep=before_retry_callback,
-        )(decorated)
+        decorated = with_llm_retry(
+            decorated,
+            retry_attempts=retry_attempts,
+            before_retry_callback=before_retry_callback,
+        )
 
     def _trace_thinking_budget() -> int | None:
         # Trace log should reflect what got applied, so fall back to the
@@ -410,12 +400,11 @@ async def _toolless_call() -> (
             if track_name:
                 wrapped = ai_track(track_name)(wrapped)
             if enable_retry:
-                wrapped = retry(
-                    stop=stop_after_attempt(retry_attempts),
-                    wait=wait_exponential(multiplier=1, min=4, max=10),
-                    retry=retry_if_not_exception_type(HonchoException),
-                    before_sleep=before_retry_callback,
-                )(wrapped)
+                wrapped = with_llm_retry(
+                    wrapped,
+                    retry_attempts=retry_attempts,
+                    before_retry_callback=before_retry_callback,
+                )
             result: (
                 HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
             ) = await wrapped()
diff --git a/src/llm/runtime.py b/src/llm/runtime.py
index ae551378c..44ba8e87d 100644
--- a/src/llm/runtime.py
+++ b/src/llm/runtime.py
@@ -12,9 +12,17 @@
 from __future__ import annotations
 
 import logging
+from collections.abc import Callable
 from contextvars import ContextVar
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, TypeVar
+
+from tenacity import (
+    retry,
+    retry_if_not_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
 
 from src.config import (
     ConfiguredModelSettings,
@@ -23,16 +31,41 @@
     resolve_model_config,
     settings,
 )
+from src.exceptions import HonchoException
 
 from .registry import backend_for_provider, client_for_model_config
 from .types import ProviderClient, ReasoningEffortType
 
+_WrappedFn = TypeVar("_WrappedFn", bound=Callable[..., Any])
+
 logger = logging.getLogger(__name__)
 
 # ContextVar tracking the current retry attempt for provider switching.
 current_attempt: ContextVar[int] = ContextVar("current_attempt", default=0)
 
 
+def with_llm_retry(
+    func: _WrappedFn,
+    *,
+    retry_attempts: int,
+    before_retry_callback: Callable[[Any], None],
+) -> _WrappedFn:
+    """Wrap an LLM-call closure with Honcho's standard retry policy.
+
+    Transient failures retry with exponential backoff, but ``HonchoException``
+    fails fast: these are deterministic input/config errors (e.g. an invalid
+    thinking budget/effort raised by a backend) and should propagate with their
+    own ``status_code`` instead of being retried, re-wrapped in tenacity's
+    ``RetryError``, and surfaced as a generic 500.
+    """
+    return retry(
+        stop=stop_after_attempt(retry_attempts),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_not_exception_type(HonchoException),
+        before_sleep=before_retry_callback,
+    )(func)
+
+
 def update_current_langfuse_observation(
     provider: ModelTransport,
     model: str,
diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py
index 978b613df..69f2c4a95 100644
--- a/src/llm/tool_loop.py
+++ b/src/llm/tool_loop.py
@@ -18,15 +18,9 @@
 from typing import Any, ParamSpec, TypeVar
 
 from pydantic import BaseModel
-from tenacity import (
-    retry,
-    retry_if_not_exception_type,
-    stop_after_attempt,
-    wait_exponential,
-)
 
 from src.config import ModelTransport
-from src.exceptions import HonchoException, ValidationException
+from src.exceptions import ValidationException
 from src.utils.types import (
     get_last_tool_metadata,
     iteration_scope,
@@ -41,6 +35,7 @@
     AttemptPlan,
     current_attempt,
     effective_temperature,
+    with_llm_retry,
 )
 from .types import (
     HonchoLLMCallResponse,
@@ -261,14 +256,11 @@ async def _setup_stream() -> AsyncIterator[HonchoLLMCallStreamChunk]:
         )
 
     if enable_retry:
-        wrapped = retry(
-            stop=stop_after_attempt(retry_attempts),
-            wait=wait_exponential(multiplier=1, min=4, max=10),
-            # Deterministic input/config errors (HonchoException) fail fast
-            # rather than being retried and re-wrapped as RetryError → 500.
-            retry=retry_if_not_exception_type(HonchoException),
-            before_sleep=before_retry_callback,
-        )(_setup_stream)
+        wrapped = with_llm_retry(
+            _setup_stream,
+            retry_attempts=retry_attempts,
+            before_retry_callback=before_retry_callback,
+        )
         stream = await wrapped()
     else:
         stream = await _setup_stream()
@@ -385,12 +377,11 @@ async def _call_with_messages(
             )
 
         if enable_retry:
-            call_func = retry(
-                stop=stop_after_attempt(retry_attempts),
-                wait=wait_exponential(multiplier=1, min=4, max=10),
-                retry=retry_if_not_exception_type(HonchoException),
-                before_sleep=before_retry_callback,
-            )(_call_with_messages)
+            call_func = with_llm_retry(
+                _call_with_messages,
+                retry_attempts=retry_attempts,
+                before_retry_callback=before_retry_callback,
+            )
         else:
             call_func = _call_with_messages
 
@@ -640,12 +631,11 @@ async def _final_call() -> HonchoLLMCallResponse[Any]:
         )
 
     if enable_retry:
-        final_call_func = retry(
-            stop=stop_after_attempt(retry_attempts),
-            wait=wait_exponential(multiplier=1, min=4, max=10),
-            retry=retry_if_not_exception_type(HonchoException),
-            before_sleep=before_retry_callback,
-        )(_final_call)
+        final_call_func = with_llm_retry(
+            _final_call,
+            retry_attempts=retry_attempts,
+            before_retry_callback=before_retry_callback,
+        )
     else:
         final_call_func = _final_call
 

From 0061bc0d9df8b5d6262c47c8574f6229a8c3d4bf Mon Sep 17 00:00:00 2001
From: Couiz <11710548+Couiz@users.noreply.github.com>
Date: Sun, 31 May 2026 05:15:05 +0000
Subject: [PATCH 5/5] test(llm): fix live Anthropic thinking assertion for
 adaptive models

The live thinking+tool-replay test hard-coded the legacy
{"type": "enabled", "budget_tokens": N} shape, but Opus 4.7+ are
adaptive-only and now correctly receive {"type": "adaptive"} with the
budget bucketed into output_config.effort. Branch the expected shape on
_requires_adaptive_thinking so the assertion stays correct across the
whole claude_4_5_plus family (Sonnet 4.5/4.6 keep the legacy shape) and
additionally assert the resolved effort for adaptive models.

Verified live against claude-opus-4-8: both live Anthropic tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tests/live_llm/test_live_anthropic.py | 42 ++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/tests/live_llm/test_live_anthropic.py b/tests/live_llm/test_live_anthropic.py
index e10d2b103..b613ba254 100644
--- a/tests/live_llm/test_live_anthropic.py
+++ b/tests/live_llm/test_live_anthropic.py
@@ -3,6 +3,10 @@
 import pytest
 
 from src.llm.backend import CompletionResult
+from src.llm.backends.anthropic import (
+    _budget_to_effort,  # pyright: ignore[reportPrivateUsage]
+    _requires_adaptive_thinking,  # pyright: ignore[reportPrivateUsage]
+)
 from src.llm.history_adapters import AnthropicHistoryAdapter
 from src.llm.request_builder import execute_completion
 
@@ -20,6 +24,20 @@
 pytestmark = [pytest.mark.live_llm, pytest.mark.requires_anthropic]
 
 
+def _expected_thinking_kwargs(model: str, budget_tokens: int) -> dict[str, object]:
+    """The ``thinking`` request param honcho should send for ``model``.
+
+    Opus 4.7+ are adaptive-only: the backend sends ``{"type": "adaptive"}`` and
+    moves the depth hint to ``output_config.effort`` (a bucketed ``budget_tokens``).
+    Older models (e.g. Sonnet 4.5/4.6) keep the legacy budget shape. Mirroring
+    the backend's own branch here keeps this live assertion correct across the
+    whole ``claude_4_5_plus`` family instead of hard-coding one shape.
+    """
+    if _requires_adaptive_thinking(model):
+        return {"type": "adaptive"}
+    return {"type": "enabled", "budget_tokens": budget_tokens}
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_spec",
@@ -115,10 +133,14 @@ async def test_live_anthropic_thinking_and_tool_replay(
         tools=tools,
     )
 
-    assert create_calls[0]["kwargs"]["thinking"] == {
-        "type": "enabled",
-        "budget_tokens": 1024,
-    }
+    assert (
+        create_calls[0]["kwargs"]["thinking"]
+        == _expected_thinking_kwargs(model_spec.model, 1024)
+    )
+    if _requires_adaptive_thinking(model_spec.model):
+        assert create_calls[0]["kwargs"]["output_config"] == {
+            "effort": _budget_to_effort(1024)
+        }
     assert first.tool_calls, "Anthropic should issue a tool call in the first turn"
     assert first.thinking_blocks, "Anthropic thinking blocks should be preserved"
 
@@ -145,10 +167,14 @@ async def test_live_anthropic_thinking_and_tool_replay(
         tools=tools,
     )
 
-    assert create_calls[1]["kwargs"]["thinking"] == {
-        "type": "enabled",
-        "budget_tokens": 1024,
-    }
+    assert (
+        create_calls[1]["kwargs"]["thinking"]
+        == _expected_thinking_kwargs(model_spec.model, 1024)
+    )
+    if _requires_adaptive_thinking(model_spec.model):
+        assert create_calls[1]["kwargs"]["output_config"] == {
+            "effort": _budget_to_effort(1024)
+        }
     assert isinstance(second.content, str)
     assert "13" in second.content
     assert "prime" in second.content.lower()