Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions src/llm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

from pydantic import BaseModel
from sentry_sdk.ai.monitoring import ai_track
from tenacity import retry, stop_after_attempt, wait_exponential

from src.config import ConfiguredModelSettings, ModelConfig
from src.exceptions import ValidationException
Expand All @@ -32,6 +31,7 @@
plan_attempt,
resolve_runtime_model_config,
update_current_langfuse_observation,
with_llm_retry,
)
from .tool_loop import execute_tool_loop
from .types import (
Expand Down Expand Up @@ -287,11 +287,11 @@ def before_retry_callback(retry_state: Any) -> None:
logger.info(f"Will retry with attempt {next_attempt}/{retry_attempts}")

if enable_retry:
decorated = retry(
stop=stop_after_attempt(retry_attempts),
wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=before_retry_callback,
)(decorated)
decorated = with_llm_retry(
decorated,
retry_attempts=retry_attempts,
before_retry_callback=before_retry_callback,
)

def _trace_thinking_budget() -> int | None:
# Trace log should reflect what got applied, so fall back to the
Expand Down Expand Up @@ -400,11 +400,11 @@ async def _toolless_call() -> (
if track_name:
wrapped = ai_track(track_name)(wrapped)
if enable_retry:
wrapped = retry(
stop=stop_after_attempt(retry_attempts),
wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=before_retry_callback,
)(wrapped)
wrapped = with_llm_retry(
wrapped,
retry_attempts=retry_attempts,
before_retry_callback=before_retry_callback,
)
result: (
HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
) = await wrapped()
Expand Down
147 changes: 133 additions & 14 deletions src/llm/backends/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,135 @@

import copy
import json
import re
from collections.abc import AsyncIterator
from typing import Any

from anthropic.types import TextBlock, ThinkingBlock, ToolUseBlock
from pydantic import BaseModel, ValidationError

from src.exceptions import ValidationException
from src.llm.backend import CompletionResult, StreamChunk, ToolCallResult
from src.llm.structured_output import repair_response_model_json

# Effort levels accepted by ``output_config.effort`` (ordered low -> high).
# Honcho's ThinkingEffortLevel additionally allows "none"/"minimal", which the
# Messages API does not accept, so those are mapped onto supported values below.
_ANTHROPIC_EFFORTS: frozenset[str] = frozenset(
{"low", "medium", "high", "xhigh", "max"}
)
_EFFORT_ALIASES: dict[str, str] = {"minimal": "low"}

# claude-<tier>-<major>-<minor>[-<date/suffix>]; only the version prefix matters
# for picking the thinking format.
_MODEL_VERSION_RE = re.compile(r"^claude-(opus|sonnet|haiku)-(\d+)-(\d+)")

# Per-tier (major, minor) at/above which Anthropic removed the legacy
# ``thinking: {"type": "enabled", "budget_tokens": N}`` format and *requires*
# adaptive thinking. Sending the legacy shape to one of these models returns
# HTTP 400:
# "thinking.type.enabled" is not supported for this model. Use
# "thinking.type.adaptive" and "output_config.effort" to control thinking ...
# Verified against the Anthropic docs (Opus 4.7 and Opus 4.8 are adaptive-only):
# https://platform.claude.com/docs/en/build-with-claude/adaptive-thinking
# Opus 4.6 / Sonnet 4.6 still accept the legacy format (deprecated but
# functional), so they intentionally stay on the legacy path.
_ADAPTIVE_THINKING_MIN_VERSION: dict[str, tuple[int, int]] = {"opus": (4, 7)}


def _requires_adaptive_thinking(model: str) -> bool:
"""Return True if ``model`` rejects legacy budget-based thinking (HTTP 400).

Such models require ``thinking: {"type": "adaptive"}`` with
``output_config.effort`` instead of ``{"type": "enabled", "budget_tokens"}``.
"""
match = _MODEL_VERSION_RE.match(model)
if match is None:
return False
minimum = _ADAPTIVE_THINKING_MIN_VERSION.get(match.group(1))
if minimum is None:
return False
return (int(match.group(2)), int(match.group(3))) >= minimum


def _budget_to_effort(thinking_budget_tokens: int) -> str:
"""Bucket a legacy thinking-token budget into an adaptive effort level.

There is no exact mapping from ``budget_tokens`` to ``effort``; these
buckets keep budget-configured models at a comparable thinking depth (e.g.
the deriver's 16000-token dream budget maps to ``high``, 32000 to ``xhigh``).
"""
if thinking_budget_tokens < 4096:
return "low"
if thinking_budget_tokens < 16000:
return "medium"
if thinking_budget_tokens < 32000:
return "high"
return "xhigh"


def _adaptive_effort(
thinking_effort: str | None, thinking_budget_tokens: int | None
) -> str | None:
"""Resolve ``output_config.effort`` for an adaptive-thinking request.

An explicit ``thinking_effort`` wins; otherwise the legacy
``thinking_budget_tokens`` is bucketed so existing budget-based configs keep
a comparable thinking depth. Returns None to fall back to the API default
(``high``). A budget of 0 (or None) means "no effort hint"; a negative
budget or an unrecognized effort is a caller error and raises
``ValidationException`` rather than being silently coerced.
"""
if thinking_budget_tokens is not None and thinking_budget_tokens < 0:
raise ValidationException("thinking_budget_tokens must be >= 0")
if thinking_effort is not None and thinking_effort != "none":
normalized = _EFFORT_ALIASES.get(thinking_effort, thinking_effort)
if normalized not in _ANTHROPIC_EFFORTS:
raise ValidationException(
f"Unsupported thinking_effort: {thinking_effort!r}"
)
return normalized
if thinking_budget_tokens:
return _budget_to_effort(thinking_budget_tokens)
return None


def _build_thinking_params(
model: str,
thinking_budget_tokens: int | None,
thinking_effort: str | None,
) -> dict[str, Any]:
"""Build the ``thinking`` (+ ``output_config``) request params for a model.

Opus 4.7+ reject the legacy ``{"type": "enabled", "budget_tokens": N}`` shape
with HTTP 400 and require adaptive thinking; older models keep the legacy
shape unchanged. A budget of 0 (or None) with no explicit effort means no
thinking; an explicit ``thinking_effort`` enables adaptive thinking
regardless of budget. A negative budget raises ``ValidationException``
rather than being forwarded. Returns an empty dict when thinking is not
requested.
"""
if thinking_budget_tokens is not None and thinking_budget_tokens < 0:
raise ValidationException("thinking_budget_tokens must be >= 0")

if not _requires_adaptive_thinking(model):
if thinking_budget_tokens:
return {
"thinking": {
"type": "enabled",
"budget_tokens": thinking_budget_tokens,
}
}
return {}

effort = _adaptive_effort(thinking_effort, thinking_budget_tokens)
if not thinking_budget_tokens and effort is None:
return {}
params: dict[str, Any] = {"thinking": {"type": "adaptive"}}
if effort is not None:
params["output_config"] = {"effort": effort}
return params
Comment thread
coderabbitai[bot] marked this conversation as resolved.


class AnthropicBackend:
"""Provider backend wrapping the native Anthropic SDK."""
Expand All @@ -34,7 +154,7 @@ async def complete(
max_output_tokens: int | None = None,
extra_params: dict[str, Any] | None = None,
) -> CompletionResult:
del max_output_tokens, thinking_effort
del max_output_tokens

request_messages, system_messages = self._extract_system(messages)
params: dict[str, Any] = {
Expand All @@ -43,6 +163,11 @@ async def complete(
"messages": request_messages,
}

thinking_params = _build_thinking_params(
model, thinking_budget_tokens, thinking_effort
)
params.update(thinking_params)

if temperature is not None:
params["temperature"] = temperature
if stop:
Expand All @@ -60,19 +185,14 @@ async def complete(
converted_tool_choice = self._convert_tool_choice(tool_choice)
if converted_tool_choice is not None:
params["tool_choice"] = converted_tool_choice
if thinking_budget_tokens:
params["thinking"] = {
"type": "enabled",
"budget_tokens": thinking_budget_tokens,
}
if extra_params:
for key in ("top_p", "top_k"):
if key in extra_params:
params[key] = extra_params[key]

use_json_prefill = (
bool(response_format or self._json_mode(extra_params))
and not thinking_budget_tokens
and "thinking" not in thinking_params
and self._supports_assistant_prefill(model)
)
if use_json_prefill and params["messages"]:
Expand Down Expand Up @@ -119,14 +239,18 @@ async def stream(
extra_params: dict[str, Any] | None = None,
) -> AsyncIterator[StreamChunk]:
is_json_mode = self._json_mode(extra_params)
del max_output_tokens, thinking_effort
del max_output_tokens

request_messages, system_messages = self._extract_system(messages)
params: dict[str, Any] = {
"model": model,
"max_tokens": max_tokens,
"messages": request_messages,
}
thinking_params = _build_thinking_params(
model, thinking_budget_tokens, thinking_effort
)
params.update(thinking_params)
if temperature is not None:
params["temperature"] = temperature
if stop:
Expand All @@ -150,7 +274,7 @@ async def stream(
params[key] = extra_params[key]
use_json_prefill = (
bool(response_format or is_json_mode)
and not thinking_budget_tokens
and "thinking" not in thinking_params
and self._supports_assistant_prefill(model)
)
if use_json_prefill and params["messages"]:
Expand All @@ -169,11 +293,6 @@ async def stream(
params["messages"],
f"\n\nRespond with valid JSON matching this schema:\n{schema_json}",
)
if thinking_budget_tokens:
params["thinking"] = {
"type": "enabled",
"budget_tokens": thinking_budget_tokens,
}

async with self._client.messages.stream(**params) as stream:
async for chunk in stream:
Expand Down
35 changes: 34 additions & 1 deletion src/llm/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,17 @@
from __future__ import annotations

import logging
from collections.abc import Callable
from contextvars import ContextVar
from dataclasses import dataclass
from typing import Any
from typing import Any, TypeVar

from tenacity import (
retry,
retry_if_not_exception_type,
stop_after_attempt,
wait_exponential,
)

from src.config import (
ConfiguredModelSettings,
Expand All @@ -23,16 +31,41 @@
resolve_model_config,
settings,
)
from src.exceptions import HonchoException

from .registry import backend_for_provider, client_for_model_config
from .types import ProviderClient, ReasoningEffortType

_WrappedFn = TypeVar("_WrappedFn", bound=Callable[..., Any])

logger = logging.getLogger(__name__)

# ContextVar tracking the current retry attempt for provider switching.
current_attempt: ContextVar[int] = ContextVar("current_attempt", default=0)


def with_llm_retry(
func: _WrappedFn,
*,
retry_attempts: int,
before_retry_callback: Callable[[Any], None],
) -> _WrappedFn:
"""Wrap an LLM-call closure with Honcho's standard retry policy.

Transient failures retry with exponential backoff, but ``HonchoException``
fails fast: these are deterministic input/config errors (e.g. an invalid
thinking budget/effort raised by a backend) and should propagate with their
own ``status_code`` instead of being retried, re-wrapped in tenacity's
``RetryError``, and surfaced as a generic 500.
"""
return retry(
stop=stop_after_attempt(retry_attempts),
wait=wait_exponential(multiplier=1, min=4, max=10),
retry=retry_if_not_exception_type(HonchoException),
before_sleep=before_retry_callback,
)(func)


def update_current_langfuse_observation(
provider: ModelTransport,
model: str,
Expand Down
32 changes: 16 additions & 16 deletions src/llm/tool_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from typing import Any, ParamSpec, TypeVar

from pydantic import BaseModel
from tenacity import retry, stop_after_attempt, wait_exponential

from src.config import ModelTransport
from src.exceptions import ValidationException
Expand All @@ -36,6 +35,7 @@
AttemptPlan,
current_attempt,
effective_temperature,
with_llm_retry,
)
from .types import (
HonchoLLMCallResponse,
Expand Down Expand Up @@ -256,11 +256,11 @@ async def _setup_stream() -> AsyncIterator[HonchoLLMCallStreamChunk]:
)

if enable_retry:
wrapped = retry(
stop=stop_after_attempt(retry_attempts),
wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=before_retry_callback,
)(_setup_stream)
wrapped = with_llm_retry(
_setup_stream,
retry_attempts=retry_attempts,
before_retry_callback=before_retry_callback,
)
stream = await wrapped()
else:
stream = await _setup_stream()
Expand Down Expand Up @@ -377,11 +377,11 @@ async def _call_with_messages(
)

if enable_retry:
call_func = retry(
stop=stop_after_attempt(retry_attempts),
wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=before_retry_callback,
)(_call_with_messages)
call_func = with_llm_retry(
_call_with_messages,
retry_attempts=retry_attempts,
before_retry_callback=before_retry_callback,
)
else:
call_func = _call_with_messages

Expand Down Expand Up @@ -631,11 +631,11 @@ async def _final_call() -> HonchoLLMCallResponse[Any]:
)

if enable_retry:
final_call_func = retry(
stop=stop_after_attempt(retry_attempts),
wait=wait_exponential(multiplier=1, min=4, max=10),
before_sleep=before_retry_callback,
)(_final_call)
final_call_func = with_llm_retry(
_final_call,
retry_attempts=retry_attempts,
before_retry_callback=before_retry_callback,
)
else:
final_call_func = _final_call

Expand Down
Loading