Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 161 additions & 14 deletions bugbug/tools/build_repair/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@

from claude_agent_sdk import ClaudeAgentOptions, ResultMessage, query
from pydantic import BaseModel, Field
from tenacity import (
retry,
retry_if_exception,
retry_if_exception_message,
stop_after_attempt,
wait_exponential_jitter,
)

from bugbug.tools.base import GenerativeModelTool
from bugbug.tools.build_repair.config import (
Expand All @@ -20,11 +27,14 @@
FIREFOX_MCP_URL,
FIX_MODEL,
SANDBOX_CONFIG,
VERIFY_ALLOWED_TOOLS,
VERIFY_MODEL,
)
from bugbug.tools.build_repair.prompts import (
ANALYSIS_TEMPLATE,
EVAL_PROMPT,
FIX_TEMPLATE,
VERIFY_TEMPLATE,
)

logger = getLogger(__name__)
Expand All @@ -44,7 +54,16 @@ class BuildFailure(BaseModel):
)


class AgentResponse(BaseModel):
class UsageStats(BaseModel):
    """Cost/usage accounting shared by agent-run and verification responses."""

    # Total API cost in US dollars accumulated from ResultMessage objects.
    cost_usd: float = Field(default=0.0)
    # Number of agent turns taken during the query.
    num_turns: int = Field(default=0)
    input_tokens: int = Field(default=0)
    output_tokens: int = Field(default=0)
    # Prompt-cache accounting: tokens read from / written into the cache.
    cache_read_input_tokens: int = Field(default=0)
    cache_creation_input_tokens: int = Field(default=0)


class AgentResponse(UsageStats):
"""Output from a build repair run, including analysis, diff, cost, and build results."""

summary: str = Field(default="")
Expand All @@ -67,6 +86,28 @@ class AgentResponse(BaseModel):
stage2_transcript: list[dict] = Field(default_factory=list)


class GroundTruth(BaseModel):
    """Reference data identifying the real (human-landed) fix for a failure."""

    gh_fix_commits: list[str] = Field(
        description="Git commit hashes of the ground truth fix."
    )


class Judgment(BaseModel):
    """Structured verdict emitted by the verification (judge) agent.

    This model's JSON schema is passed to the agent via ``output_format`` in
    ``BuildRepairTool.verify``, so field names and types define the contract
    the judge model must fill in.
    """

    # Whether the agent identified the correct root cause of the failure.
    analysis_correct: bool
    analysis_quality: float
    analysis_explanation: str
    # Whether the agent's fix is semantically equivalent to the landed fix.
    fix_matches_ground_truth: bool
    fix_quality: float
    fix_explanation: str
    # Judge's estimate that the fix would be accepted in code review as-is.
    fix_acceptance_probability: float
    fix_acceptance_explanation: str


class VerifyResponse(UsageStats):
    """Result of the verification stage: parsed judgment plus transcript."""

    # Parsed structured output from the judge. ``verify`` raises rather than
    # returning None here, but the default keeps the model constructible.
    judgment: Judgment | None = Field(default=None)
    # Serialized messages exchanged during the verification query.
    verification_transcript: list[dict] = Field(default_factory=list)


class BuildRepairTool(GenerativeModelTool):
"""Two-stage build repair agent using Claude Agent SDK.

Expand All @@ -82,12 +123,14 @@ def __init__(
eval_mode: bool = False,
analysis_model: str = ANALYSIS_MODEL,
fix_model: str = FIX_MODEL,
verify_model: str = VERIFY_MODEL,
) -> None:
self.eval_mode = eval_mode
self.target_software = target_software
self.analysis_only = analysis_only
self.analysis_model = analysis_model
self.fix_model = fix_model
self.verify_model = verify_model

@classmethod
def create(cls, **kwargs):
Expand Down Expand Up @@ -128,28 +171,49 @@ async def _run_stage(
result_data: dict = {}
usage: dict = {}

if on_message:
on_message(
stage_name,
{
"type": "stage_start",
"prompt": prompt,
"model": model,
},
)
try:
@retry(
retry=(
retry_if_exception_message(match="Control request timeout")
| retry_if_exception_message(match="overloaded")
| retry_if_exception_message(match="529")
| retry_if_exception_message(match="exit code")
| retry_if_exception(
lambda e: isinstance(e, (TimeoutError, ConnectionError, OSError))
)
),
stop=stop_after_attempt(5),
wait=wait_exponential_jitter(initial=2, max=60, jitter=5),
before_sleep=lambda rs: logger.warning(
f"Bug {bug_id}: {stage_name} transient error "
f"(attempt {rs.attempt_number}/5), retrying: {rs.outcome.exception()}"
),
reraise=True,
)
async def _query():
nonlocal cost, turns, usage, result_data
async for message in query(prompt=prompt, options=options):
serialized = self._serialize_message(message)
transcript.append(serialized)
logger.info(f"Bug {bug_id}: {stage_name} [{serialized['type']}]")
logger.debug(f"Bug {bug_id}: {stage_name} detail: {serialized}")
logger.debug(f"Bug {bug_id}: {stage_name} [{serialized['type']}]")
if on_message:
on_message(stage_name, serialized)
if isinstance(message, ResultMessage):
cost += message.total_cost_usd or 0
turns += message.num_turns or 0
usage = getattr(message, "usage", {}) or {}
result_data = serialized

if on_message:
on_message(
stage_name,
{
"type": "stage_start",
"prompt": prompt,
"model": model,
},
)
try:
await _query()
finally:
if on_message:
on_message(
Expand Down Expand Up @@ -233,6 +297,7 @@ async def run(
analysis_prompt = ANALYSIS_TEMPLATE.format(
bug_id=failure.bug_id,
target_software=self.target_software,
worktree_path=worktree_path,
eval=EVAL_PROMPT if self.eval_mode else "",
)
try:
Expand Down Expand Up @@ -305,7 +370,10 @@ async def run(
mcp_servers=mcp_servers,
)
fix_prompt = FIX_TEMPLATE.format(
bug_id=failure.bug_id, eval=EVAL_PROMPT if self.eval_mode else ""
target_software=self.target_software,
bug_id=failure.bug_id,
worktree_path=worktree_path,
eval=EVAL_PROMPT if self.eval_mode else "",
)
try:
(
Expand Down Expand Up @@ -410,3 +478,82 @@ async def run(
stage1_transcript=stage1_transcript,
stage2_transcript=stage2_transcript,
)

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential_jitter(initial=2, max=30, jitter=5),
    before_sleep=lambda rs: logger.warning(
        f"Verification failed (attempt {rs.attempt_number}/3), "
        f"retrying: {rs.outcome.exception()}"
    ),
    reraise=True,
)
async def verify(
    self,
    failure: BuildFailure,
    agent_diff: str,
    ground_truth: GroundTruth,
    worktree_path: Path,
    on_message: Callable[[str, dict], None] | None = None,
) -> VerifyResponse:
    """Judge an agent-produced fix against the ground-truth commits.

    Writes ``agent_diff`` to ``repair_agent/out/<bug_id>/agent_fix.diff``
    inside the worktree, runs the verify model with a read-only tool
    allowlist and a structured ``Judgment`` output schema, then parses the
    last structured output found in the transcript.

    The whole method is retried up to 3 times (any exception) with
    exponential jitter backoff.

    Args:
        failure: The build failure under evaluation.
        agent_diff: Unified diff produced by the repair agent (may be empty).
        ground_truth: Commit hashes of the real fix that landed.
        worktree_path: Checkout the judge runs in (also the agent cwd).
        on_message: Optional callback receiving (stage_name, message) events.

    Returns:
        VerifyResponse with the parsed judgment, cost/turn/usage stats, and
        the full verification transcript.

    Raises:
        RuntimeError: If the verification run produced no structured output.
    """
    # Persist the candidate diff where the judge's prompt expects to find it.
    out_dir = worktree_path / "repair_agent" / "out" / str(failure.bug_id)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "agent_fix.diff").write_text(agent_diff, encoding="utf-8")

    gt_commits = " ".join(ground_truth.gh_fix_commits)
    prompt = VERIFY_TEMPLATE.format(
        target_software=self.target_software,
        bug_id=failure.bug_id,
        failure_commit=failure.git_commit,
        ground_truth_commits=gt_commits,
        worktree_path=worktree_path,
    )

    options = ClaudeAgentOptions(
        model=self.verify_model,
        cwd=str(worktree_path),
        # Read-only tools only; the judge must not modify the worktree.
        allowed_tools=VERIFY_ALLOWED_TOOLS,
        disallowed_tools=["AskUserQuestion", "Task"],
        sandbox=SANDBOX_CONFIG,
        permission_mode="acceptEdits",
        effort="high",
        # Force the final answer into the Judgment JSON schema.
        output_format={
            "type": "json_schema",
            "schema": Judgment.model_json_schema(),
        },
    )

    logger.info(
        f"Bug {failure.bug_id}: starting verification stage "
        f"(model={self.verify_model}, ground_truth={gt_commits})"
    )

    transcript, cost, turns, usage = await self._run_stage(
        "verification",
        prompt,
        self.verify_model,
        options,
        failure.bug_id,
        on_message,
    )

    # Take the most recent structured output in the transcript as the verdict.
    judgment: Judgment | None = None
    for msg in reversed(transcript):
        if msg.get("structured_output"):
            judgment = Judgment.model_validate(msg["structured_output"])
            break

    if judgment is None:
        result_msgs = [m for m in transcript if m.get("type") == "ResultMessage"]
        raise RuntimeError(
            f"Bug {failure.bug_id}: verification produced no structured output. "
            f"Result messages: {result_msgs}"
        )

    return VerifyResponse(
        judgment=judgment,
        cost_usd=cost,
        num_turns=turns,
        verification_transcript=transcript,
        **self._usage_fields(usage),
    )
12 changes: 12 additions & 0 deletions bugbug/tools/build_repair/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

ANALYSIS_MODEL = "claude-opus-4-6"
FIX_MODEL = "claude-opus-4-6"
VERIFY_MODEL = "claude-opus-4-6"
DEFAULT_MAX_TURNS = 80
WORKTREE_BASE_DIR = "/tmp/build_repair_worktrees"
TRY_PUSH_TIMEOUT_SECONDS = 7200
Expand All @@ -32,6 +33,17 @@
"claude-opus-4-20250514": date(2025, 3, 1),
}

# Tool allowlist for the verification (judging) stage: read-only inspection of
# the worktree and git history, plus web fetches restricted to Mozilla
# documentation domains. No editing or arbitrary shell commands.
VERIFY_ALLOWED_TOOLS = [
    "Read",
    "Bash(git show:*)",
    "Bash(git log:*)",
    "Bash(git diff:*)",
    "Bash(find:*)",
    "Bash(grep:*)",
    "WebFetch(domain:firefox-source-docs.mozilla.org)",
    "WebFetch(domain:searchfox.org)",
]

ALLOWED_TOOLS = [
"Edit(~/.mozbuild)",
"Edit(~/.cache/uv)",
Expand Down
51 changes: 43 additions & 8 deletions bugbug/tools/build_repair/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,24 @@
1. Git diff for the last commit
2. Bugzilla bug description
3. Taskcluster build failure logs
The files with bug description and logs are located at @repair_agent/in/{bug_id}
The files with bug description and logs are located at {worktree_path}/repair_agent/in/{bug_id}

Create three separate documents:
1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. repair_agent/out/{bug_id}/planning.md with a fixing plan
3. repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction
1. {worktree_path}/repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. {worktree_path}/repair_agent/out/{bug_id}/planning.md with a fixing plan
3. {worktree_path}/repair_agent/out/{bug_id}/summary.md with a brief one paragraph summary of analysis and planning that can point a developer in the right direction

Do not prompt to edit those documents.
{eval}

Do not write any code yet. Work fully autonomously, do not ask any questions. Think hard.
Do not write any code yet. Work fully autonomously, do not ask any questions.
"""

FIX_TEMPLATE = """Read the following files and implement a fix of the failure:
1. repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. repair_agent/out/{bug_id}/planning.md with a fixing plan
FIX_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing and fixing a build failure.

Read the following files and implement a fix of the failure:
1. {worktree_path}/repair_agent/out/{bug_id}/analysis.md with your detailed analysis on what caused the issues
2. {worktree_path}/repair_agent/out/{bug_id}/planning.md with a fixing plan
{eval}

Do not prompt to edit files. Work fully autonomously, do not ask any questions. Use all allowed tools without prompting.
Expand All @@ -40,3 +42,36 @@
Do not request bug info from Bugzilla or Phabricator. Use only the provided file with bug description.
Do not look at git commits other than the specified last commit.
"""

# Prompt for the verification stage. Placeholders filled by
# BuildRepairTool.verify(): {target_software}, {bug_id}, {failure_commit},
# {ground_truth_commits}, {worktree_path}.
VERIFY_TEMPLATE = """You are an expert {target_software} code reviewer evaluating an automated build repair agent's work.

Examine the relevant commits using git:
- Failure commit (broke the build): {failure_commit}
- Ground truth fix commit(s) (the real fix that was landed): {ground_truth_commits}

Inspect each commit's changes and read the repair agent's input/output files:
- {worktree_path}/repair_agent/in/{bug_id}/bug_description.md
- {worktree_path}/repair_agent/in/{bug_id}/build_failure_logs.md
- {worktree_path}/repair_agent/out/{bug_id}/analysis.md
- {worktree_path}/repair_agent/out/{bug_id}/summary.md
- {worktree_path}/repair_agent/out/{bug_id}/agent_fix.diff (may be empty if no fix was produced)

Evaluate the agent's work on two dimensions:

ANALYSIS:
- Did the agent correctly identify the root cause of the build failure?
- How thorough and accurate is the analysis?

FIX:
- Does the agent's fix address the same files/functions as the ground truth?
- Is the fix semantically equivalent or close to the ground truth?
- Would the fix be acceptable in code review as-is?

Guidelines:
- If agent_fix.diff is empty, set fix_matches_ground_truth=false, fix_quality=0.0, fix_acceptance_probability=0.0
- A fix can be correct even if it differs syntactically from ground truth -- focus on semantic equivalence
- analysis_correct should be true if the agent found the right root cause, even if the explanation is imperfect
- Be calibrated: 0.5 means genuinely uncertain, not a default score

Work autonomously, do not ask questions.
"""
Loading