Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 161 additions & 14 deletions bugbug/tools/build_repair/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@

from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query
from pydantic import BaseModel, Field
from tenacity import (
retry,
retry_if_exception,
retry_if_exception_message,
stop_after_attempt,
wait_exponential_jitter,
)

from bugbug.tools.base import GenerativeModelTool
from bugbug.tools.build_repair.config import (
Expand All @@ -20,11 +27,14 @@
FIREFOX_MCP_URL,
FIX_MODEL,
SANDBOX_CONFIG,
VERIFY_ALLOWED_TOOLS,
VERIFY_MODEL,
)
from bugbug.tools.build_repair.prompts import (
ANALYSIS_TEMPLATE,
EVAL_PROMPT,
FIX_TEMPLATE,
VERIFY_TEMPLATE,
)

logger = getLogger(__name__)
Expand All @@ -44,7 +54,16 @@ class BuildFailure(BaseModel):
)


class AgentResponse(BaseModel):
class UsageStats(BaseModel):
    """Cost/usage accounting shared by agent-run and verification responses."""

    # Total API cost in US dollars accumulated from ResultMessage objects.
    cost_usd: float = Field(default=0.0)
    # Number of agent turns taken during the query.
    num_turns: int = Field(default=0)
    input_tokens: int = Field(default=0)
    output_tokens: int = Field(default=0)
    # Prompt-cache accounting: tokens read from / written into the cache.
    cache_read_input_tokens: int = Field(default=0)
    cache_creation_input_tokens: int = Field(default=0)


class AgentResponse(UsageStats):
"""Output from a build repair run, including analysis, diff, cost, and build results."""

summary: str = Field(default="")
Expand All @@ -67,6 +86,28 @@ class AgentResponse(BaseModel):
stage2_transcript: list[dict] = Field(default_factory=list)


class GroundTruth(BaseModel):
    """Reference data identifying the real (human-landed) fix for a failure."""

    gh_fix_commits: list[str] = Field(
        description="Git commit hashes of the ground truth fix."
    )


class Judgment(BaseModel):
    """Structured verdict emitted by the verification (judge) agent.

    This model's JSON schema is passed to the agent via ``output_format`` in
    ``BuildRepairTool.verify``, so field names and types define the contract
    the judge model must fill in.
    """

    # Whether the agent identified the correct root cause of the failure.
    analysis_correct: bool
    analysis_quality: float
    analysis_explanation: str
    # Whether the agent's fix is semantically equivalent to the landed fix.
    fix_matches_ground_truth: bool
    fix_quality: float
    fix_explanation: str
    # Judge's estimate that the fix would be accepted in code review as-is.
    fix_acceptance_probability: float
    fix_acceptance_explanation: str


class VerifyResponse(UsageStats):
    """Result of the verification stage: parsed judgment plus transcript."""

    # Parsed structured output from the judge. ``verify`` raises rather than
    # returning None here, but the default keeps the model constructible.
    judgment: Judgment | None = Field(default=None)
    # Serialized messages exchanged during the verification query.
    verification_transcript: list[dict] = Field(default_factory=list)


class BuildRepairTool(GenerativeModelTool):
"""Two-stage build repair agent using Claude Agent SDK.

Expand All @@ -82,12 +123,14 @@ def __init__(
eval_mode: bool = False,
analysis_model: str = ANALYSIS_MODEL,
fix_model: str = FIX_MODEL,
verify_model: str = VERIFY_MODEL,
) -> None:
self.eval_mode = eval_mode
self.target_software = target_software
self.analysis_only = analysis_only
self.analysis_model = analysis_model
self.fix_model = fix_model
self.verify_model = verify_model

@classmethod
def create(cls, **kwargs):
Expand Down Expand Up @@ -128,28 +171,49 @@ async def _run_stage(
result_data: dict = {}
usage: dict = {}

if on_message:
on_message(
stage_name,
{
"type": "stage_start",
"prompt": prompt,
"model": model,
},
)
try:
@retry(
retry=(
retry_if_exception_message(match="Control request timeout")
| retry_if_exception_message(match="overloaded")
| retry_if_exception_message(match="529")
| retry_if_exception_message(match="exit code")
| retry_if_exception(
lambda e: isinstance(e, (TimeoutError, ConnectionError, OSError))
)
),
stop=stop_after_attempt(5),
wait=wait_exponential_jitter(initial=2, max=60, jitter=5),
before_sleep=lambda rs: logger.warning(
f"Bug {bug_id}: {stage_name} transient error "
f"(attempt {rs.attempt_number}/5), retrying: {rs.outcome.exception()}"
),
reraise=True,
)
async def _query():
nonlocal cost, turns, usage, result_data
async for message in query(prompt=prompt, options=options):
serialized = self._serialize_message(message)
transcript.append(serialized)
logger.info(f"Bug {bug_id}: {stage_name} [{serialized['type']}]")
logger.debug(f"Bug {bug_id}: {stage_name} detail: {serialized}")
logger.debug(f"Bug {bug_id}: {stage_name} [{serialized['type']}]")
if on_message:
on_message(stage_name, serialized)
if isinstance(message, ResultMessage):
cost += message.total_cost_usd or 0
turns += message.num_turns or 0
usage = getattr(message, "usage", {}) or {}
result_data = serialized

if on_message:
on_message(
stage_name,
{
"type": "stage_start",
"prompt": prompt,
"model": model,
},
)
try:
await _query()
finally:
if on_message:
on_message(
Expand Down Expand Up @@ -233,6 +297,7 @@ async def run(
analysis_prompt = ANALYSIS_TEMPLATE.format(
bug_id=failure.bug_id,
target_software=self.target_software,
worktree_path=worktree_path,
eval=EVAL_PROMPT if self.eval_mode else "",
)
try:
Expand Down Expand Up @@ -305,7 +370,10 @@ async def run(
mcp_servers=mcp_servers,
)
fix_prompt = FIX_TEMPLATE.format(
bug_id=failure.bug_id, eval=EVAL_PROMPT if self.eval_mode else ""
target_software=self.target_software,
bug_id=failure.bug_id,
worktree_path=worktree_path,
eval=EVAL_PROMPT if self.eval_mode else "",
)
try:
(
Expand Down Expand Up @@ -410,3 +478,82 @@ async def run(
stage1_transcript=stage1_transcript,
stage2_transcript=stage2_transcript,
)

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential_jitter(initial=2, max=30, jitter=5),
    before_sleep=lambda rs: logger.warning(
        f"Verification failed (attempt {rs.attempt_number}/3), "
        f"retrying: {rs.outcome.exception()}"
    ),
    reraise=True,
)
async def verify(
    self,
    failure: BuildFailure,
    agent_diff: str,
    ground_truth: GroundTruth,
    worktree_path: Path,
    on_message: Callable[[str, dict], None] | None = None,
) -> VerifyResponse:
    """Judge an agent-produced fix against the ground-truth commits.

    Writes ``agent_diff`` to ``repair_agent/out/<bug_id>/agent_fix.diff``
    inside the worktree, runs the verify model with a read-only tool
    allowlist and a structured ``Judgment`` output schema, then parses the
    last structured output found in the transcript.

    The whole method is retried up to 3 times (any exception) with
    exponential jitter backoff.

    Args:
        failure: The build failure under evaluation.
        agent_diff: Unified diff produced by the repair agent (may be empty).
        ground_truth: Commit hashes of the real fix that landed.
        worktree_path: Checkout the judge runs in (also the agent cwd).
        on_message: Optional callback receiving (stage_name, message) events.

    Returns:
        VerifyResponse with the parsed judgment, cost/turn/usage stats, and
        the full verification transcript.

    Raises:
        RuntimeError: If the verification run produced no structured output.
    """
    # Persist the candidate diff where the judge's prompt expects to find it.
    out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "agent_fix.diff").write_text(agent_diff, encoding="utf-8")

    gt_commits = " ".join(ground_truth.gh_fix_commits)
    prompt = VERIFY_TEMPLATE.format(
        target_software=self.target_software,
        bug_id=failure.bug_id,
        failure_commit=failure.git_commit,
        ground_truth_commits=gt_commits,
        worktree_path=worktree_path,
    )

    options = ClaudeAgentOptions(
        model=self.verify_model,
        cwd=str(worktree_path),
        # Read-only tools only; the judge must not modify the worktree.
        allowed_tools=VERIFY_ALLOWED_TOOLS,
        disallowed_tools=["AskUserQuestion", "Task"],
        sandbox=SANDBOX_CONFIG,
        permission_mode="acceptEdits",
        effort="high",
        # Force the final answer into the Judgment JSON schema.
        output_format={
            "type": "json_schema",
            "schema": Judgment.model_json_schema(),
        },
    )

    logger.info(
        f"Bug {failure.bug_id}: starting verification stage "
        f"(model={self.verify_model}, ground_truth={gt_commits})"
    )

    transcript, cost, turns, usage = await self._run_stage(
        "verification",
        prompt,
        self.verify_model,
        options,
        failure.bug_id,
        on_message,
    )

    # Take the most recent structured output in the transcript as the verdict.
    judgment: Judgment | None = None
    for msg in reversed(transcript):
        if msg.get("structured_output"):
            judgment = Judgment.model_validate(msg["structured_output"])
            break

    if judgment is None:
        result_msgs = [m for m in transcript if m.get("type") == "ResultMessage"]
        raise RuntimeError(
            f"Bug {failure.bug_id}: verification produced no structured output. "
            f"Result messages: {result_msgs}"
        )

    return VerifyResponse(
        judgment=judgment,
        cost_usd=cost,
        num_turns=turns,
        verification_transcript=transcript,
        **self._usage_fields(usage),
    )
12 changes: 12 additions & 0 deletions bugbug/tools/build_repair/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

ANALYSIS_MODEL = "claude-opus-4-6"
FIX_MODEL = "claude-opus-4-6"
VERIFY_MODEL = "claude-opus-4-6"
DEFAULT_MAX_TURNS = 80
WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees"
TRY_PUSH_TIMEOUT_SECONDS = 7200
Expand All @@ -32,6 +33,17 @@
"claude-opus-4-20250514": date(2025, 3, 1),
}

# Tool allowlist for the verification (judging) stage: read-only inspection of
# the worktree and git history, plus web fetches restricted to Mozilla
# documentation domains. No editing or arbitrary shell commands.
VERIFY_ALLOWED_TOOLS = [
    "Read",
    "Bash(git show:*)",
    "Bash(git log:*)",
    "Bash(git diff:*)",
    "Bash(find:*)",
    "Bash(grep:*)",
    "WebFetch(domain:firefox-source-docs.mozilla.org)",
    "WebFetch(domain:searchfox.org)",
]

ALLOWED_TOOLS = [
"Edit(~/.mozbuild)",
"Edit(~/.cache/uv)",
Expand Down
51 changes: 43 additions & 8 deletions bugbug/tools/build_repair/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,24 @@
1. Git diff for the last commit
2. Bugzilla bug description
3. Taskcluster build failure logs
The files with bug description and logs are located at @repair_agent/in/{bug_id}
The files with bug description and logs are located at {worktree_path}/repair_agent/in/{bug_id}

Create three separate documents:
1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. repair_agent/out/{bug_id}/planning.md with a fixing plan
3. repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction
1. {worktree_path}/repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. {worktree_path}/repair_agent/out/{bug_id}/planning.md with a fixing plan
3. {worktree_path}/repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction

Do not prompt to edit those documents.
{eval}

Do not write any code yet. Work fully autonomously, do not ask any questions. Think hard.
Do not write any code yet. Work fully autonomously, do not ask any questions.
"""

FIX_TEMPLATE = """Read the following files and implement a fix of the failure:
1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. repair_agent/out/{bug_id}/planning.md with a fixing plan
FIX_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure.

Read the following files and implement a fix of the failure:
1. {worktree_path}/repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. {worktree_path}/repair_agent/out/{bug_id}/planning.md with a fixing plan
{eval}

Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting.
Expand All @@ -40,3 +42,36 @@
Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description.
Do not look at git commits other than the specified last commit.
"""

# Prompt for the verification stage. Placeholders filled by
# BuildRepairTool.verify(): {target_software}, {bug_id}, {failure_commit},
# {ground_truth_commits}, {worktree_path}.
VERIFY_TEMPLATE = """You are an expert {target_software} code reviewer evaluating an automated build repair agent's work.

Examine the relevant commits using git:
- Failure commit (broke the build): {failure_commit}
- Ground truth fix commit(s) (the real fix that was landed): {ground_truth_commits}

Inspect each commit's changes and read the repair agent's input/output files:
- {worktree_path}/repair_agent/in/{bug_id}/bug_description.md
- {worktree_path}/repair_agent/in/{bug_id}/build_failure_logs.md
- {worktree_path}/repair_agent/out/{bug_id}/analysis.md
- {worktree_path}/repair_agent/out/{bug_id}/summary.md
- {worktree_path}/repair_agent/out/{bug_id}/agent_fix.diff (may be empty if no fix was produced)

Evaluate the agent's work on two dimensions:

ANALYSIS:
- Did the agent correctly identify the root cause of the build failure?
- How thorough and accurate is the analysis?

FIX:
- Does the agent's fix address the same files/functions as the ground truth?
- Is the fix semantically equivalent or close to the ground truth?
- Would the fix be acceptable in code review as-is?

Guidelines:
- If agent_fix.diff is empty, set fix_matches_ground_truth=false, fix_quality=0.0, fix_acceptance_probability=0.0
- A fix can be correct even if it differs syntactically from ground truth -- focus on semantic equivalence
- analysis_correct should be true if the agent found the right root cause, even if the explanation is imperfect
- Be calibrated: 0.5 means genuinely uncertain, not a default score

Work autonomously, do not ask questions.
"""
Loading