ServiceNow · varunpandey23 · May 8, 2026
diff --git a/benchmark/llm_client.py b/benchmark/llm_client.py
@@ -4,6 +4,20 @@
 logger = logging.getLogger(__name__)
 
 
+def get_text_content(content) -> str:
+    """Extract plain text from LLM response content.
+
+    Accepts a plain string, or a list of strings and content-block dicts (e.g. Claude
+    extended thinking on Bedrock). Every string item and "text" block is joined in
+    order; other block types are skipped. Falsy non-list content yields "".
+    """
+    if not isinstance(content, list):
+        return content or ""
+    items = (b if isinstance(b, str) else b.get("text") for b in content
+             if isinstance(b, str) or (isinstance(b, dict) and b.get("type") == "text"))
+    return "".join(t for t in items if isinstance(t, str))
+
+
 class LLMClient:
     """
     Unified LLM client supporting multiple providers.
@@ -52,15 +66,18 @@ def _initialize_llm(self):
                     max_tokens=self.max_tokens,
                 )
             elif self.provider == "aws_bedrock":
-                from langchain_aws import ChatBedrock
+                from langchain_aws import ChatBedrockConverse
+
+                thinking_params = {}
+                if self.reasoning:
+                    thinking_params = {"thinking": self.reasoning}
 
-                self.llm = ChatBedrock(
-                    model_id=self.model,
+                self.llm = ChatBedrockConverse(
+                    model=self.model,
                     region_name=self.region or "us-west-2",
-                    model_kwargs={
-                        "temperature": self.temperature,
-                        "max_tokens": self.max_tokens,
-                    },
+                    temperature=self.temperature,
+                    max_tokens=self.max_tokens,
+                    additional_model_request_fields=thinking_params
                 )
             elif self.provider == "openai":
                 from langchain_openai import ChatOpenAI

diff --git a/benchmark/verifier.py b/benchmark/verifier.py
@@ -5,7 +5,7 @@
 
 from langchain_core.messages import HumanMessage, SystemMessage
 
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from benchmark.mcp_client import MCPClient
 from benchmark.models import VerifierConfig
 
@@ -452,7 +452,7 @@ async def _compare_with_llm(
 
         try:
             response = await self.llm_client.llm.ainvoke(messages)
-            response_text = response.content
+            response_text = get_text_content(response.content)
 
             # Parse JSON response
             # Try to extract JSON from markdown code blocks

diff --git a/orchestrators/decomposing_planner.py b/orchestrators/decomposing_planner.py
@@ -17,7 +17,7 @@
 
 from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
 
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from .base import AgentOrchestrator
 
 logger = logging.getLogger(__name__)
@@ -312,6 +312,7 @@ async def generate_plan_and_subtasks(
         prompt = self.construct_prompt(system_prompt, user_prompt, tools)
         logger.info("🧠 Meta Agent: Generating plan and decomposing into subtasks...")
 
+        response = None
         for attempt in range(max_retries):
             try:
                 response = await self.planner_llm.ainvoke([HumanMessage(content=prompt)])
@@ -323,7 +324,7 @@ async def generate_plan_and_subtasks(
                     f"{usage['input_tokens'] + usage['output_tokens']} tokens"
                 )
 
-                content = extract_json_from_llm_response(response.content)
+                content = extract_json_from_llm_response(get_text_content(response.content))
                 logger.debug(f"Extracted JSON content (attempt {attempt + 1}):\n{content[:500]}")
 
                 parsed = json.loads(content)
@@ -398,7 +399,7 @@ async def generate_plan_and_subtasks(
                     await asyncio.sleep(1)
                 else:
                     logger.error(
-                        f"Response content: {response.content if 'response' in locals() else 'N/A'}"
+                        f"Response content: {get_text_content(response.content) if response is not None else 'N/A'}"
                     )
                     raise ValueError(
                         f"Failed to parse structured plan after {max_retries} attempts: {e}"
@@ -573,14 +574,14 @@ async def execute(self, max_iterations: int = 25) -> SubTaskResult:
 
                 conversation_flow.append({
                     "type": "ai_message",
-                    "content": response.content,
+                    "content": get_text_content(response.content),
                     "tool_calls": [
                         {"name": tc["name"], "args": tc["args"]}
                         for tc in (response.tool_calls or [])
                     ],
                 })
 
-                logger.info(f"  LLM: {response.content[:200]}...")
+                logger.info(f"  LLM: {get_text_content(response.content)[:200]}...")
 
                 messages.append(response)
 
@@ -642,7 +643,7 @@ async def execute(self, max_iterations: int = 25) -> SubTaskResult:
                     usage=total_usage,
                 )
 
-        final_response = messages[-1].content if messages else ""
+        final_response = get_text_content(messages[-1].content) if messages else ""
         summary = final_response or (
             f"Executed {len([c for c in conversation_flow if c['type'] == 'tool_result'])} tool calls"
         )
@@ -746,7 +747,7 @@ async def _extract_memory_updates(
             response = await self.llm_client.llm.ainvoke(extraction_messages)
 
             extraction_usage = extract_usage_from_response(response)
-            content = extract_json_from_llm_response(response.content)
+            content = extract_json_from_llm_response(get_text_content(response.content))
             memory_updates = json.loads(content)
 
             if not isinstance(memory_updates, dict):

diff --git a/orchestrators/planner_react.py b/orchestrators/planner_react.py
@@ -11,7 +11,7 @@
 
 from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage
 
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from .base import AgentOrchestrator
 
 logger = logging.getLogger(__name__)
@@ -101,7 +101,7 @@ async def generate_plan(
 
         logger.info("🧠 Meta Agent: Generating execution plan...")
         response = await self.planner_llm.ainvoke([HumanMessage(content=prompt)])
-        plan = response.content
+        plan = get_text_content(response.content)
 
         logger.info(f"✅ Plan generated ({len(plan)} characters)")
         logger.debug(f"Generated plan:\n{plan}")
@@ -230,7 +230,7 @@ async def execute(self) -> Dict[str, Any]:
             conversation_flow.append(
                 {
                     "type": "ai_message",
-                    "content": response.content,
+                    "content": get_text_content(response.content),
                     "usage_metadata": usage_metadata,
                     "response_metadata": response_metadata,
                     "tool_calls": [
@@ -240,7 +240,7 @@ async def execute(self) -> Dict[str, Any]:
                 }
             )
 
-            logger.info(f"LLM Response: {response.content}")
+            logger.info(f"LLM Response: {get_text_content(response.content)}")
 
             messages.append(response)
 
@@ -289,7 +289,7 @@ async def execute(self) -> Dict[str, Any]:
                 )
 
         return {
-            "final_response": messages[-1].content if messages else "",
+            "final_response": get_text_content(messages[-1].content) if messages else "",
             "conversation_flow": conversation_flow,
             "tools_used": tools_used,
             "tool_results": tool_results,

diff --git a/orchestrators/react.py b/orchestrators/react.py
@@ -5,7 +5,7 @@
 from typing import Any, Dict, List, TYPE_CHECKING
 
 from benchmark.mcp_client import MCPClient
-from benchmark.llm_client import LLMClient
+from benchmark.llm_client import LLMClient, get_text_content
 from benchmark.models import BenchmarkConfig
 from langchain_core.messages import SystemMessage, HumanMessage, ToolMessage
 
@@ -58,7 +58,7 @@ async def execute(self) -> Dict[str, Any]:
             conversation_flow.append(
                 {
                     "type": "ai_message",
-                    "content": response.content,
+                    "content": get_text_content(response.content),
                     "usage_metadata": usage_metadata,
                     "response_metadata": response_metadata,
                     "tool_calls": [
@@ -68,7 +68,7 @@ async def execute(self) -> Dict[str, Any]:
                 }
             )
 
-            logger.info(f"LLM Response: {response.content}")
+            logger.info(f"LLM Response: {get_text_content(response.content)}")
 
             # Terminate if the LLM decided no further tool calls are needed
             if not response.tool_calls or len(response.tool_calls) == 0:
@@ -117,7 +117,7 @@ async def execute(self) -> Dict[str, Any]:
                 )
 
         return {
-            "final_response": messages[-1].content if messages else "",
+            "final_response": get_text_content(messages[-1].content) if messages else "",
             "conversation_flow": conversation_flow,
             "tools_used": tools_used,
             "tool_results": tool_results,

diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,7 @@ dependencies = [
 anthropic = [
     "langchain-anthropic>=0.1.0",
     "anthropic>=0.40.0",
-    "langchain-aws>=0.1.0",   
+    "langchain-aws>=1.4.1",
     "boto3>=1.28.0",
 ]
 openai = [