Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 45 additions & 3 deletions evalbench/evaluator/agentevaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
import logging
import os
import shutil
import tempfile
import threading


from dataset.evalgeminicliinput import EvalGeminiCliRequest
from generators.models import get_generator
from generators.models.agent_cli import AgentCliGenerator
Expand Down Expand Up @@ -119,12 +121,12 @@ def process_scenario(
last_result = None

resolved_work_dir = scenario.get("resolved_work_dir")
if resolved_work_dir:
os.makedirs(resolved_work_dir, exist_ok=True)
execution_cwd, temp_sandbox_dir = self._setup_sandbox(resolved_work_dir)

# Copy declared env_files to fake_home
fake_home = getattr(self.generator, "fake_home", None)
if fake_home:
self._register_trusted_folders(fake_home, execution_cwd, resolved_work_dir)
session_dir = os.path.dirname(fake_home)
env_files = scenario.get("env_files", [])
for env_file in env_files:
Expand Down Expand Up @@ -152,7 +154,7 @@ def process_scenario(
env=env,
resume=(turn > 0),
session_id=session_id,
cwd=resolved_work_dir,
cwd=execution_cwd,
)
try:
result = self.generator.safe_generate(cli_cmd)
Expand Down Expand Up @@ -205,6 +207,8 @@ def process_scenario(
else:
break

self._cleanup_sandbox(resolved_work_dir, temp_sandbox_dir)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This cleanup call isn't reached on exception. The turn-loop body has its own try/except for safe_generate / generate, but any exception that bypasses those (e.g. an uncaught error in extract_tools / extract_skills, simulated_user.get_next_response, KeyboardInterrupt, or a bug somewhere else in the loop body) skips this line and leaves the tempdir on disk. On a long-running eval server (or a flaky scenario that fails repeatedly), this leaks evalbench-sandbox-* directories under /tmp indefinitely.

Wrap the post-setup block in try/finally:

resolved_work_dir = scenario.get("resolved_work_dir")
execution_cwd, temp_sandbox_dir = self._setup_sandbox(resolved_work_dir)
try:
    # fake_home prep + turn loop + _finalize_scenario all live here
    ...
finally:
    self._cleanup_sandbox(resolved_work_dir, temp_sandbox_dir)

Worth a unit test alongside the change: patch self.generator.safe_generate to raise, call process_scenario, then assert no evalbench-sandbox-* directories remain under tempfile.gettempdir(). That locks the contract in.


if last_result:
self._finalize_scenario(
scenario,
Expand Down Expand Up @@ -267,3 +271,41 @@ def _finalize_scenario(
score_work.run()

eval_result.agent_results.append(eval_output_data)

def _setup_sandbox(self, resolved_work_dir: str | None) -> tuple[str | None, str | None]:
"""Creates a temporary isolated directory and copies resolved_work_dir into it."""
if not resolved_work_dir:
return None, None
os.makedirs(resolved_work_dir, exist_ok=True)
temp_sandbox_dir = tempfile.mkdtemp(prefix="evalbench-sandbox-")
shutil.copytree(resolved_work_dir, temp_sandbox_dir, dirs_exist_ok=True)
return temp_sandbox_dir, temp_sandbox_dir

def _register_trusted_folders(self, fake_home: str, execution_cwd: str | None, resolved_work_dir: str | None) -> None:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to only work for gemini, we should support others and restrict this one to gemini.

"""Writes trusted folder configuration mappings to settings json to prevent CLI trust hangs."""
if not execution_cwd:
return
trusted_folders_path = os.path.join(fake_home, ".gemini", "trustedFolders.json")
os.makedirs(os.path.dirname(trusted_folders_path), exist_ok=True)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worth looking at mktemp ?


trusted_folders = {}
if os.path.exists(trusted_folders_path):
try:
with open(trusted_folders_path, "r") as f:
trusted_folders = json.load(f)
except json.JSONDecodeError:
pass

trusted_folders[execution_cwd] = "TRUST_FOLDER"
if resolved_work_dir:
trusted_folders[resolved_work_dir] = "TRUST_FOLDER"

with open(trusted_folders_path, "w") as f:
json.dump(trusted_folders, f, indent=2)
logging.info("Registered sandbox folder trust in fake_home settings.")

def _cleanup_sandbox(self, resolved_work_dir: str | None, temp_sandbox_dir: str | None) -> None:
"""Copies sandboxed modifications back to the original directory and deletes the temp folder."""
if resolved_work_dir and temp_sandbox_dir:
shutil.copytree(temp_sandbox_dir, resolved_work_dir, dirs_exist_ok=True)
shutil.rmtree(temp_sandbox_dir)
85 changes: 85 additions & 0 deletions evalbench/test/agentevaluator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,5 +89,90 @@ def test_process_scenario_copies_env_files(self, mock_get_generator):
self.assertEqual(content, "print('mock sleep')")


class TestAgentEvaluatorSandbox(unittest.TestCase):
"""Tests for sandbox setup, trusted folder registration, and cleanup."""

def setUp(self):
self.test_dir = tempfile.mkdtemp()
self.workspace_dir = os.path.join(self.test_dir, "workspace")
os.makedirs(self.workspace_dir)
with open(os.path.join(self.workspace_dir, "test.txt"), "w") as f:
f.write("original content")

self.fake_home = os.path.join(self.test_dir, "fake_home")
os.makedirs(self.fake_home)

config = {
"model_config": "dummy.yaml",
"runners": {"agent_runners": 1}
}
with patch("evaluator.agentevaluator.get_generator") as mock_get_generator:
mock_generator = MagicMock(spec=AgentCliGenerator)
mock_generator.fake_home = self.fake_home
mock_generator.version = "1.0.0"
mock_get_generator.return_value = mock_generator
self.evaluator = AgentEvaluator(config)

def tearDown(self):
shutil.rmtree(self.test_dir)

def test_setup_sandbox_creates_temp_dir_and_copies_contents(self):
execution_cwd, temp_sandbox_dir = self.evaluator._setup_sandbox(self.workspace_dir)
self.assertIsNotNone(temp_sandbox_dir)
self.assertTrue(os.path.exists(temp_sandbox_dir))
self.assertTrue(temp_sandbox_dir.startswith(tempfile.gettempdir()))

# Check content copy
copied_file = os.path.join(temp_sandbox_dir, "test.txt")
self.assertTrue(os.path.exists(copied_file))
with open(copied_file, "r") as f:
self.assertEqual(f.read(), "original content")

# Cleanup sandbox directory created inside test
shutil.rmtree(temp_sandbox_dir)

def test_register_trusted_folders_creates_valid_json(self):
sandbox_dir = os.path.join(self.test_dir, "sandbox")
os.makedirs(sandbox_dir)

self.evaluator._register_trusted_folders(self.fake_home, sandbox_dir, self.workspace_dir)

trusted_folders_path = os.path.join(self.fake_home, ".gemini", "trustedFolders.json")
self.assertTrue(os.path.exists(trusted_folders_path))

import json
with open(trusted_folders_path, "r") as f:
data = json.load(f)

self.assertIn(sandbox_dir, data)
self.assertEqual(data[sandbox_dir], "TRUST_FOLDER")
self.assertIn(self.workspace_dir, data)
self.assertEqual(data[self.workspace_dir], "TRUST_FOLDER")

def test_cleanup_sandbox_copies_back_and_removes_temp_dir(self):
# Setup sandbox manually
temp_sandbox_dir = tempfile.mkdtemp()
shutil.copytree(self.workspace_dir, temp_sandbox_dir, dirs_exist_ok=True)

# Modify file inside sandbox
with open(os.path.join(temp_sandbox_dir, "test.txt"), "w") as f:
f.write("modified content")
# Add new file inside sandbox
with open(os.path.join(temp_sandbox_dir, "new.txt"), "w") as f:
f.write("new file")

# Run cleanup
self.evaluator._cleanup_sandbox(self.workspace_dir, temp_sandbox_dir)

# Verify temp sandbox is removed
self.assertFalse(os.path.exists(temp_sandbox_dir))

# Verify changes were copied back to original workspace
with open(os.path.join(self.workspace_dir, "test.txt"), "r") as f:
self.assertEqual(f.read(), "modified content")
with open(os.path.join(self.workspace_dir, "new.txt"), "r") as f:
self.assertEqual(f.read(), "new file")


if __name__ == "__main__":
unittest.main()
Loading