diff --git a/evalbench/evaluator/agentevaluator.py b/evalbench/evaluator/agentevaluator.py index f464a83b..60ed194b 100644 --- a/evalbench/evaluator/agentevaluator.py +++ b/evalbench/evaluator/agentevaluator.py @@ -4,8 +4,10 @@ import logging import os import shutil +import tempfile import threading + from dataset.evalgeminicliinput import EvalGeminiCliRequest from generators.models import get_generator from generators.models.agent_cli import AgentCliGenerator @@ -119,12 +121,12 @@ def process_scenario( last_result = None resolved_work_dir = scenario.get("resolved_work_dir") - if resolved_work_dir: - os.makedirs(resolved_work_dir, exist_ok=True) + execution_cwd, temp_sandbox_dir = self._setup_sandbox(resolved_work_dir) # Copy declared env_files to fake_home fake_home = getattr(self.generator, "fake_home", None) if fake_home: + self._register_trusted_folders(fake_home, execution_cwd, resolved_work_dir) session_dir = os.path.dirname(fake_home) env_files = scenario.get("env_files", []) for env_file in env_files: @@ -152,7 +154,7 @@ def process_scenario( env=env, resume=(turn > 0), session_id=session_id, - cwd=resolved_work_dir, + cwd=execution_cwd, ) try: result = self.generator.safe_generate(cli_cmd) @@ -205,6 +207,8 @@ def process_scenario( else: break + self._cleanup_sandbox(resolved_work_dir, temp_sandbox_dir) + if last_result: self._finalize_scenario( scenario, @@ -267,3 +271,41 @@ def _finalize_scenario( score_work.run() eval_result.agent_results.append(eval_output_data) + + def _setup_sandbox(self, resolved_work_dir: str | None) -> tuple[str | None, str | None]: + """Creates a temporary isolated directory and copies resolved_work_dir into it.""" + if not resolved_work_dir: + return None, None + os.makedirs(resolved_work_dir, exist_ok=True) + temp_sandbox_dir = tempfile.mkdtemp(prefix="evalbench-sandbox-") + shutil.copytree(resolved_work_dir, temp_sandbox_dir, dirs_exist_ok=True) + return temp_sandbox_dir, temp_sandbox_dir + + def _register_trusted_folders(self, fake_home: str, execution_cwd: str | None, resolved_work_dir: str | None) -> None: + """Writes trusted folder configuration mappings to settings json to prevent CLI trust hangs.""" + if not execution_cwd: + return + trusted_folders_path = os.path.join(fake_home, ".gemini", "trustedFolders.json") + os.makedirs(os.path.dirname(trusted_folders_path), exist_ok=True) + + trusted_folders = {} + if os.path.exists(trusted_folders_path): + try: + with open(trusted_folders_path, "r") as f: + trusted_folders = json.load(f) + except json.JSONDecodeError: + pass + + trusted_folders[execution_cwd] = "TRUST_FOLDER" + if resolved_work_dir: + trusted_folders[resolved_work_dir] = "TRUST_FOLDER" + + with open(trusted_folders_path, "w") as f: + json.dump(trusted_folders, f, indent=2) + logging.info("Registered sandbox folder trust in fake_home settings.") + + def _cleanup_sandbox(self, resolved_work_dir: str | None, temp_sandbox_dir: str | None) -> None: + """Copies sandboxed modifications back to the original directory and deletes the temp folder.""" + if resolved_work_dir and temp_sandbox_dir: + shutil.copytree(temp_sandbox_dir, resolved_work_dir, dirs_exist_ok=True) + shutil.rmtree(temp_sandbox_dir) diff --git a/evalbench/test/agentevaluator_test.py b/evalbench/test/agentevaluator_test.py index de3176dc..72b5074b 100644 --- a/evalbench/test/agentevaluator_test.py +++ b/evalbench/test/agentevaluator_test.py @@ -89,5 +89,90 @@ def test_process_scenario_copies_env_files(self, mock_get_generator): self.assertEqual(content, "print('mock sleep')") +class TestAgentEvaluatorSandbox(unittest.TestCase): + """Tests for sandbox setup, trusted folder registration, and cleanup.""" + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + self.workspace_dir = os.path.join(self.test_dir, "workspace") + os.makedirs(self.workspace_dir) + with open(os.path.join(self.workspace_dir, "test.txt"), "w") as f: + f.write("original content") + + self.fake_home = os.path.join(self.test_dir, "fake_home") + os.makedirs(self.fake_home) + + config = { + "model_config": "dummy.yaml", + "runners": {"agent_runners": 1} + } + with patch("evaluator.agentevaluator.get_generator") as mock_get_generator: + mock_generator = MagicMock(spec=AgentCliGenerator) + mock_generator.fake_home = self.fake_home + mock_generator.version = "1.0.0" + mock_get_generator.return_value = mock_generator + self.evaluator = AgentEvaluator(config) + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_setup_sandbox_creates_temp_dir_and_copies_contents(self): + execution_cwd, temp_sandbox_dir = self.evaluator._setup_sandbox(self.workspace_dir) + self.assertIsNotNone(temp_sandbox_dir) + self.assertTrue(os.path.exists(temp_sandbox_dir)) + self.assertTrue(temp_sandbox_dir.startswith(tempfile.gettempdir())) + + # Check content copy + copied_file = os.path.join(temp_sandbox_dir, "test.txt") + self.assertTrue(os.path.exists(copied_file)) + with open(copied_file, "r") as f: + self.assertEqual(f.read(), "original content") + + # Cleanup sandbox directory created inside test + shutil.rmtree(temp_sandbox_dir) + + def test_register_trusted_folders_creates_valid_json(self): + sandbox_dir = os.path.join(self.test_dir, "sandbox") + os.makedirs(sandbox_dir) + + self.evaluator._register_trusted_folders(self.fake_home, sandbox_dir, self.workspace_dir) + + trusted_folders_path = os.path.join(self.fake_home, ".gemini", "trustedFolders.json") + self.assertTrue(os.path.exists(trusted_folders_path)) + + import json + with open(trusted_folders_path, "r") as f: + data = json.load(f) + + self.assertIn(sandbox_dir, data) + self.assertEqual(data[sandbox_dir], "TRUST_FOLDER") + self.assertIn(self.workspace_dir, data) + self.assertEqual(data[self.workspace_dir], "TRUST_FOLDER") + + def test_cleanup_sandbox_copies_back_and_removes_temp_dir(self): + # Setup sandbox manually + temp_sandbox_dir = tempfile.mkdtemp() + shutil.copytree(self.workspace_dir, temp_sandbox_dir, dirs_exist_ok=True) + + # Modify file inside sandbox + with open(os.path.join(temp_sandbox_dir, "test.txt"), "w") as f: + f.write("modified content") + # Add new file inside sandbox + with open(os.path.join(temp_sandbox_dir, "new.txt"), "w") as f: + f.write("new file") + + # Run cleanup + self.evaluator._cleanup_sandbox(self.workspace_dir, temp_sandbox_dir) + + # Verify temp sandbox is removed + self.assertFalse(os.path.exists(temp_sandbox_dir)) + + # Verify changes were copied back to original workspace + with open(os.path.join(self.workspace_dir, "test.txt"), "r") as f: + self.assertEqual(f.read(), "modified content") + with open(os.path.join(self.workspace_dir, "new.txt"), "r") as f: + self.assertEqual(f.read(), "new file") + + if __name__ == "__main__": unittest.main()