Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 81 additions & 40 deletions hud/agents/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,59 +487,100 @@ async def _run_context(
self.console.debug(f"Messages: {messages}")

step_count = 0
answer_submitted = False

while max_steps == -1 or step_count < max_steps:
step_count += 1
if max_steps == -1:
self.console.debug(f"Step {step_count} (unlimited)")
else:
self.console.debug(f"Step {step_count}/{max_steps}")

# If we've reached max_steps without submitting an answer, force the
# next model call to use the `answer` tool only. Provider-specific
# agents (Claude/OpenAI/Gemini) implement this by honoring the
# `_force_answer_only` attribute via their native tool-choice APIs.
if max_steps != -1 and step_count >= max_steps and not answer_submitted:
try:
tools = self.get_available_tools()
except RuntimeError:
tools = self._available_tools or []

if tools and any(getattr(t, "name", None) == "answer" for t in tools):
# Set a flag checked by provider agents to restrict tools
setattr(self, "_force_answer_only", True)
self.console.warning_log(
f"Reached max_steps ({max_steps}) without submitting answer. "
"Prompting model to submit answer now."
)
# Also inject a clear instruction so the model knows what to do
messages.extend(
await self.format_message(
(
"You have reached the maximum number of steps allowed. "
"You MUST use the `answer` tool NOW to submit your final "
"answer based on all the information you have gathered so far. "
"Do not use any other tools. Submit your answer immediately."
)
)
)

try:
# 1. Get model response
response = await self.get_response(messages)

self.console.debug(f"Agent:\n{response}")

# Check if we should stop
if response.done or not response.tool_calls:
# Use auto_respond to decide whether to stop
decision: Literal["STOP", "CONTINUE"] = "STOP"
if self.auto_respond and response.content:
try:
from hud.agents.misc import ResponseAgent

response_agent = ResponseAgent()
decision = await response_agent.determine_response(response.content)
except Exception as e:
self.console.warning_log(f"Auto-respond failed: {e}")
if decision == "STOP":
self.console.debug("Stopping execution")
final_response = response
break
else:
self.console.debug("Continuing execution")
messages.extend(await self.format_message(decision))
continue

# 2. Execute tools
tool_calls = response.tool_calls
tool_results = await self.call_tools(tool_calls)

# 3. Format tool results and add to messages
tool_messages = await self.format_tool_results(tool_calls, tool_results)
messages.extend(tool_messages)

# Compact step completion display
step_info = f"\n[bold]Step {step_count}"
if max_steps != -1:
step_info += f"/{max_steps}"
step_info += "[/bold]"

# Show tool calls and results in compact format
for call, result in zip(tool_calls, tool_results, strict=False):
step_info += f"\n{call}\n{result}"

self.console.info_log(step_info)
# Track whether an answer was submitted in this step
if response.tool_calls:
for tool_call in response.tool_calls:
if getattr(tool_call, "name", "") == "answer":
answer_submitted = True
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Failed answer calls disable final-step forcing

Medium Severity

answer_submitted is set as soon as an answer tool call is emitted, before call_tools confirms success. If that answer call fails, later max-step logic sees answer_submitted=True and skips forced-answer behavior, allowing the run to finish without a valid final submission.

Additional Locations (1)

Fix in Cursor Fix in Web

# Clear forced-answer flag once we've seen an answer
if hasattr(self, "_force_answer_only"):
setattr(self, "_force_answer_only", False)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Forced-answer flag leaks across runs

Medium Severity

_run_context sets _force_answer_only but only clears it when an answer tool call is observed. If a run exits without that call, the flag stays set on the agent instance. Later run() calls on the same instance are unintentionally forced into answer-only mode from the first step.

Additional Locations (2)

Fix in Cursor Fix in Web

break

# Check if we should stop
if response.done or not response.tool_calls:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stop condition skipped without tool calls

High Severity

_run_context now gates stop handling behind if response.tool_calls, so responses with no tool calls never hit the response.done or not response.tool_calls exit path. The loop can continue until max_steps exhaustion (or forever with max_steps=-1), and Trace.content may end up None instead of the model’s final response.

Fix in Cursor Fix in Web

# Use auto_respond to decide whether to stop
decision: Literal["STOP", "CONTINUE"] = "STOP"
if self.auto_respond and response.content:
try:
from hud.agents.misc import ResponseAgent

response_agent = ResponseAgent()
decision = await response_agent.determine_response(response.content)
except Exception as e:
self.console.warning_log(f"Auto-respond failed: {e}")
if decision == "STOP":
self.console.debug("Stopping execution")
final_response = response
break
else:
self.console.debug("Continuing execution")
messages.extend(await self.format_message(decision))
continue

# 2. Execute tools
tool_calls = response.tool_calls
tool_results = await self.call_tools(tool_calls)

# 3. Format tool results and add to messages
tool_messages = await self.format_tool_results(tool_calls, tool_results)
messages.extend(tool_messages)

# Compact step completion display
step_info = f"\n[bold]Step {step_count}"
if max_steps != -1:
step_info += f"/{max_steps}"
step_info += "[/bold]"

# Show tool calls and results in compact format
for call, result in zip(tool_calls, tool_results, strict=False):
step_info += f"\n{call}\n{result}"

self.console.info_log(step_info)

except Exception as e:
self.console.error_log(f"Step failed: {e}")
Expand Down
31 changes: 27 additions & 4 deletions hud/agents/claude.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,29 @@ async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
"""Get response from Claude including any tool calls."""
messages_cached = self._add_prompt_caching(messages)

# Support forced-answer behavior from MCPAgent by restricting tools and
# tool_choice when _force_answer_only is set. This mirrors sandbox's
# per-message forceToolOptions behavior.
force_answer_only = getattr(self, "_force_answer_only", False)

tools_for_call: list[BetaToolUnionParam] = self.claude_tools
tool_choice: dict[str, Any] = {
"type": "auto",
"disable_parallel_tool_use": True,
}

if force_answer_only:
answer_tools = [
t for t in self.claude_tools if getattr(t, "name", None) == "answer"
]
if answer_tools:
tools_for_call = answer_tools
tool_choice = {
"type": "tool",
"name": "answer",
"disable_parallel_tool_use": True,
}

# betas to use - collected during tool conversion based on native specs
# Only pass betas when non-empty; an empty list can produce an empty
# anthropic-beta header which the API rejects.
Expand All @@ -212,8 +235,8 @@ async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
system=self.system_prompt if self.system_prompt is not None else Omit(),
max_tokens=self.max_tokens,
messages=messages_cached,
tools=self.claude_tools,
tool_choice={"type": "auto", "disable_parallel_tool_use": True},
tools=tools_for_call,
tool_choice=tool_choice,
betas=betas,
)
messages.append(BetaMessageParam(role="assistant", content=response.content))
Expand All @@ -228,8 +251,8 @@ async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
system=self.system_prompt if self.system_prompt is not None else Omit(),
max_tokens=self.max_tokens,
messages=messages_cached,
tools=self.claude_tools,
tool_choice={"type": "auto", "disable_parallel_tool_use": True},
tools=tools_for_call,
tool_choice=tool_choice,
betas=betas,
) as stream:
# allow backend to accumulate message content
Expand Down
37 changes: 29 additions & 8 deletions hud/agents/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,35 @@ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[genai_ty
async def get_response(self, messages: list[genai_types.Content]) -> AgentResponse:
"""Get response from Gemini including any tool calls."""
# Build generate content config
generate_config = genai_types.GenerateContentConfig(
temperature=self.temperature,
top_p=self.top_p,
top_k=self.top_k,
max_output_tokens=self.max_output_tokens,
tools=self.gemini_tools,
system_instruction=self.system_prompt,
)
force_answer_only = getattr(self, "_force_answer_only", False)

tool_config: genai_types.ToolConfig | None = None
if force_answer_only:
# Restrict Gemini's function calling to the `answer` function only,
# matching sandbox's per-message forceToolOptions behavior.
try:
tool_config = genai_types.ToolConfig(
function_calling_config=genai_types.FunctionCallingConfig(
mode="ANY",
allowed_function_names=["answer"],
)
)
except Exception as exc: # pragma: no cover - older SDKs
logger.debug(f"Could not construct FunctionCallingConfig: {exc}")
tool_config = None

generate_kwargs: dict[str, Any] = {
"temperature": self.temperature,
"top_p": self.top_p,
"top_k": self.top_k,
"max_output_tokens": self.max_output_tokens,
"tools": self.gemini_tools,
"system_instruction": self.system_prompt,
}
if tool_config is not None:
generate_kwargs["tool_config"] = tool_config

generate_config = genai_types.GenerateContentConfig(**generate_kwargs)

# Use async API to avoid blocking the event loop
response = await self.gemini_client.aio.models.generate_content(
Expand Down
9 changes: 8 additions & 1 deletion hud/agents/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,14 @@ async def get_response(self, messages: ResponseInputParam) -> AgentResponse:
instructions=self.system_prompt,
max_output_tokens=self.max_output_tokens,
temperature=self.temperature,
tool_choice=self.tool_choice if self.tool_choice is not None else Omit(),
# Per-step forced-answer behavior: when the base agent sets
# `_force_answer_only`, override tool_choice so the model MUST call
# the `answer` function on this request.
tool_choice=(
ToolChoice(type="function", function={"name": "answer"})
if getattr(self, "_force_answer_only", False)
else (self.tool_choice if self.tool_choice is not None else Omit())
),
parallel_tool_calls=self.parallel_tool_calls,
reasoning=self.reasoning if self.reasoning is not None else Omit(),
tools=self._openai_tools if self._openai_tools else Omit(),
Expand Down