Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions benchmark/llm_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,20 @@
logger = logging.getLogger(__name__)


def get_text_content(content) -> str:
    """Extract plain text from LLM response content.

    Handles both plain strings and lists returned by models with
    thinking/reasoning blocks (e.g. Claude extended thinking on Bedrock
    returns a list of content blocks).

    Args:
        content: The ``content`` of an LLM response. Either a string, a list
            whose items are plain strings and/or dict content blocks, or
            ``None``.

    Returns:
        For a string, the string itself. For a list, the concatenation (in
        order) of every plain-string item and of the ``"text"`` value of
        every dict block whose ``"type"`` is ``"text"``; all other blocks
        (thinking, tool use, ...) are skipped. ``""`` when there is no text.
    """
    if isinstance(content, str):
        return content

    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, str):
                # Content lists may mix bare strings with dict blocks.
                parts.append(block)
            elif isinstance(block, dict) and block.get("type") == "text":
                text = block.get("text")
                # Skip a missing/None "text" so the result is always a str.
                if isinstance(text, str):
                    parts.append(text)
        # Join every text block: a response may split its answer across
        # several blocks, and returning only the first would truncate it.
        return "".join(parts)

    # None (or any other falsy value) means "no content".
    return content or ""


class LLMClient:
"""
Unified LLM client supporting multiple providers.
Expand Down Expand Up @@ -52,15 +66,18 @@ def _initialize_llm(self):
max_tokens=self.max_tokens,
)
elif self.provider == "aws_bedrock":
from langchain_aws import ChatBedrock
from langchain_aws import ChatBedrockConverse

thinking_params = {}
if self.reasoning:
thinking_params = {"thinking": self.reasoning}

self.llm = ChatBedrock(
model_id=self.model,
self.llm = ChatBedrockConverse(
model=self.model,
region_name=self.region or "us-west-2",
model_kwargs={
"temperature": self.temperature,
"max_tokens": self.max_tokens,
},
temperature=self.temperature,
max_tokens=self.max_tokens,
additional_model_request_fields=thinking_params
)
elif self.provider == "openai":
from langchain_openai import ChatOpenAI
Expand Down
4 changes: 2 additions & 2 deletions benchmark/verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from langchain_core.messages import HumanMessage, SystemMessage

from benchmark.llm_client import LLMClient
from benchmark.llm_client import LLMClient, get_text_content
from benchmark.mcp_client import MCPClient
from benchmark.models import VerifierConfig

Expand Down Expand Up @@ -452,7 +452,7 @@ async def _compare_with_llm(

try:
response = await self.llm_client.llm.ainvoke(messages)
response_text = response.content
response_text = get_text_content(response.content)

# Parse JSON response
# Try to extract JSON from markdown code blocks
Expand Down
15 changes: 8 additions & 7 deletions orchestrators/decomposing_planner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage

from benchmark.llm_client import LLMClient
from benchmark.llm_client import LLMClient, get_text_content
from .base import AgentOrchestrator

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -312,6 +312,7 @@ async def generate_plan_and_subtasks(
prompt = self.construct_prompt(system_prompt, user_prompt, tools)
logger.info("🧠 Meta Agent: Generating plan and decomposing into subtasks...")

response = None
for attempt in range(max_retries):
try:
response = await self.planner_llm.ainvoke([HumanMessage(content=prompt)])
Expand All @@ -323,7 +324,7 @@ async def generate_plan_and_subtasks(
f"{usage['input_tokens'] + usage['output_tokens']} tokens"
)

content = extract_json_from_llm_response(response.content)
content = extract_json_from_llm_response(get_text_content(response.content))
logger.debug(f"Extracted JSON content (attempt {attempt + 1}):\n{content[:500]}")

parsed = json.loads(content)
Expand Down Expand Up @@ -398,7 +399,7 @@ async def generate_plan_and_subtasks(
await asyncio.sleep(1)
else:
logger.error(
f"Response content: {response.content if 'response' in locals() else 'N/A'}"
f"Response content: {get_text_content(response.content) if response is not None else 'N/A'}"
)
raise ValueError(
f"Failed to parse structured plan after {max_retries} attempts: {e}"
Expand Down Expand Up @@ -573,14 +574,14 @@ async def execute(self, max_iterations: int = 25) -> SubTaskResult:

conversation_flow.append({
"type": "ai_message",
"content": response.content,
"content": get_text_content(response.content),
"tool_calls": [
{"name": tc["name"], "args": tc["args"]}
for tc in (response.tool_calls or [])
],
})

logger.info(f" LLM: {response.content[:200]}...")
logger.info(f" LLM: {get_text_content(response.content)[:200]}...")

messages.append(response)

Expand Down Expand Up @@ -642,7 +643,7 @@ async def execute(self, max_iterations: int = 25) -> SubTaskResult:
usage=total_usage,
)

final_response = messages[-1].content if messages else ""
final_response = get_text_content(messages[-1].content) if messages else ""
summary = final_response or (
f"Executed {len([c for c in conversation_flow if c['type'] == 'tool_result'])} tool calls"
)
Expand Down Expand Up @@ -746,7 +747,7 @@ async def _extract_memory_updates(
response = await self.llm_client.llm.ainvoke(extraction_messages)

extraction_usage = extract_usage_from_response(response)
content = extract_json_from_llm_response(response.content)
content = extract_json_from_llm_response(get_text_content(response.content))
memory_updates = json.loads(content)

if not isinstance(memory_updates, dict):
Expand Down
10 changes: 5 additions & 5 deletions orchestrators/planner_react.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage

from benchmark.llm_client import LLMClient
from benchmark.llm_client import LLMClient, get_text_content
from .base import AgentOrchestrator

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -101,7 +101,7 @@ async def generate_plan(

logger.info("🧠 Meta Agent: Generating execution plan...")
response = await self.planner_llm.ainvoke([HumanMessage(content=prompt)])
plan = response.content
plan = get_text_content(response.content)

logger.info(f"✅ Plan generated ({len(plan)} characters)")
logger.debug(f"Generated plan:\n{plan}")
Expand Down Expand Up @@ -230,7 +230,7 @@ async def execute(self) -> Dict[str, Any]:
conversation_flow.append(
{
"type": "ai_message",
"content": response.content,
"content": get_text_content(response.content),
"usage_metadata": usage_metadata,
"response_metadata": response_metadata,
"tool_calls": [
Expand All @@ -240,7 +240,7 @@ async def execute(self) -> Dict[str, Any]:
}
)

logger.info(f"LLM Response: {response.content}")
logger.info(f"LLM Response: {get_text_content(response.content)}")

messages.append(response)

Expand Down Expand Up @@ -289,7 +289,7 @@ async def execute(self) -> Dict[str, Any]:
)

return {
"final_response": messages[-1].content if messages else "",
"final_response": get_text_content(messages[-1].content) if messages else "",
"conversation_flow": conversation_flow,
"tools_used": tools_used,
"tool_results": tool_results,
Expand Down
8 changes: 4 additions & 4 deletions orchestrators/react.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Any, Dict, List, TYPE_CHECKING

from benchmark.mcp_client import MCPClient
from benchmark.llm_client import LLMClient
from benchmark.llm_client import LLMClient, get_text_content
from benchmark.models import BenchmarkConfig
from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage

Expand Down Expand Up @@ -58,7 +58,7 @@ async def execute(self) -> Dict[str, Any]:
conversation_flow.append(
{
"type": "ai_message",
"content": response.content,
"content": get_text_content(response.content),
"usage_metadata": usage_metadata,
"response_metadata": response_metadata,
"tool_calls": [
Expand All @@ -68,7 +68,7 @@ async def execute(self) -> Dict[str, Any]:
}
)

logger.info(f"LLM Response: {response.content}")
logger.info(f"LLM Response: {get_text_content(response.content)}")

# Terminate if the LLM decided no further tool calls are needed
if not response.tool_calls or len(response.tool_calls) == 0:
Expand Down Expand Up @@ -117,7 +117,7 @@ async def execute(self) -> Dict[str, Any]:
)

return {
"final_response": messages[-1].content if messages else "",
"final_response": get_text_content(messages[-1].content) if messages else "",
"conversation_flow": conversation_flow,
"tools_used": tools_used,
"tool_results": tool_results,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies = [
anthropic = [
"langchain-anthropic>=0.1.0",
"anthropic>=0.40.0",
"langchain-aws>=0.1.0",
"langchain-aws>=1.4.1",
"boto3>=1.28.0",
]
openai = [
Expand Down