From 228cdf6fc9000d8c27548581e01a8e6f1fc57e3b Mon Sep 17 00:00:00 2001 From: g-lynnzee <139825992+g-lynnzee@users.noreply.github.com> Date: Sun, 21 Jun 2026 21:12:53 -0700 Subject: [PATCH 1/7] Feat: Add ability to filter to specific json scenario/examples Usage example: ./evalbench/evalbench.py \ --experiment_config=core-cujs/run_gemini_cli.yaml \ --scenarios=autoctx-expert-eval-confirm --- docs/configs/run-config.md | 4 +- evalbench/dataset/dataset.py | 51 ++++++++-- evalbench/evalbench.py | 25 +++++ evalbench/test/dataset_filter_test.py | 134 ++++++++++++++++++++++++++ 4 files changed, 206 insertions(+), 8 deletions(-) create mode 100644 evalbench/test/dataset_filter_test.py diff --git a/docs/configs/run-config.md b/docs/configs/run-config.md index 84a2060f..6ec14141 100644 --- a/docs/configs/run-config.md +++ b/docs/configs/run-config.md @@ -14,7 +14,9 @@ This section defines the primary resources used during evaluation, including the | `databases` | Optional | Specifies the databases (e.g., `db_blog`, `california_schools`, etc.). This filters the dataset to the provided list of databases and ignores all other evals. If not provided, all databases found in the dataset_config json file will be tried. | | `query_types` | Optional | Specifies the query_types (`dql`, `dml`, `dd`). This filters the dataset to the list of evals that are of the query_types provided. If not provided, all eval types (dql, dml and ddl) found in the dataset_config json file will be tried. | | `dataset_format` | Conditional (if needed) | Defines the dataset format, with `evalbench-standard-format` as the default. For BIRD datasets, it must be set to `bird-standard-format`.| -| `num_trials` | Optional | Number of trials to run for each prompt. +| `num_trials` | Optional | Number of trials to run for each prompt. | +| `scenarios` | Optional | A list of specific scenario IDs to run (only applies to scenario-based agentic datasets like `gemini-cli-format` or `cortado-format`). 
Defaults to empty (runs all scenarios). | +| `scenario_pattern` | Optional | A glob pattern of scenario IDs to run (only applies to scenario-based agentic datasets). Defaults to None (runs all scenarios). | --- ## 2. Prompt and Generation Modules diff --git a/evalbench/dataset/dataset.py b/evalbench/dataset/dataset.py index 2f399400..6551a543 100644 --- a/evalbench/dataset/dataset.py +++ b/evalbench/dataset/dataset.py @@ -3,6 +3,7 @@ from typing import Any, Optional import json import logging +import fnmatch from collections.abc import Sequence from dataset.evalinput import EvalInputRequest from dataset.evalinteractinput import EvalInteractInputRequest @@ -136,7 +137,37 @@ def load_dea_json(json_file_path): return all_items -def load_cortado_json(json_file_path): +def _filter_scenarios(scenarios: list[dict], config: dict) -> list[dict]: + """Filters a list of scenarios based on explicit IDs or glob pattern in config.""" + scenarios_to_run = config.get("scenarios", []) + scenario_pattern = config.get("scenario_pattern", None) + + # If no filters are specified, return the original list (run all) + if not scenarios_to_run and not scenario_pattern: + return scenarios + + filtered_scenarios = [] + for scenario in scenarios: + scenario_id = scenario.get("id") + if not scenario_id: + # Drop scenarios without IDs per user feedback + continue + + # Match explicit list of IDs + if scenarios_to_run and scenario_id not in scenarios_to_run: + continue + + # Match glob pattern + if scenario_pattern: + if not fnmatch.fnmatch(scenario_id, scenario_pattern): + continue + + filtered_scenarios.append(scenario) + + return filtered_scenarios + + +def load_cortado_json(json_file_path, config): all_items: dict[str, list[EvalCortadoRequest]] = { "cortado-format": [], } @@ -144,7 +175,8 @@ def load_cortado_json(json_file_path): data = json.load(json_file) scenarios = data.get("scenarios", []) - for scenario in scenarios: + filtered_scenarios = _filter_scenarios(scenarios, config) + for 
scenario in filtered_scenarios: eval_input = EvalCortadoRequest( raw_dict=scenario ) @@ -153,7 +185,7 @@ def load_cortado_json(json_file_path): return all_items -def load_gemini_cli_json(json_file_path): +def load_gemini_cli_json(json_file_path, config): all_items: dict[str, list[EvalGeminiCliRequest]] = { "gemini-cli-format": [], } @@ -162,10 +194,13 @@ def load_gemini_cli_json(json_file_path): json_item = _expand_env_placeholders(json_item, json_file_path) item = json.loads(json_item) - # Resolve work_dir for scenarios + # Filter scenarios scenarios = item.get("scenarios", []) + item["scenarios"] = _filter_scenarios(scenarios, config) + + # Resolve work_dir for scenarios dataset_dir = os.path.dirname(json_file_path) - for scenario in scenarios: + for scenario in item["scenarios"]: if "work_dir" in scenario: work_dir = scenario["work_dir"] # Resolve relative to dataset file @@ -197,9 +232,9 @@ def load_dataset_from_json(json_file_path, config): if dataset_format == "bird-interact-format": all_items = load_bird_interact_dataset(json_file_path, config) elif dataset_format in ("gemini-cli-format", "agent-format"): - all_items = load_gemini_cli_json(json_file_path) + all_items = load_gemini_cli_json(json_file_path, config) elif dataset_format == "cortado-format": - all_items = load_cortado_json(json_file_path) + all_items = load_cortado_json(json_file_path, config) elif dataset_format == "dea-format": all_items = load_dea_json(json_file_path) else: @@ -218,6 +253,7 @@ def load_dataset_from_json(json_file_path, config): elif dataset_format == "cortado-format": if "orchestrator" not in config: config["orchestrator"] = "cortado" + input_items = all_items elif dataset_format == "dea-format": if "orchestrator" not in config: config["orchestrator"] = "dea" @@ -236,6 +272,7 @@ def load_dataset_from_json(json_file_path, config): return input_items + def load_dataset_from_bird_format(dataset: Sequence[dict], config): input_items: dict[str, list[EvalInputRequest]] = { "dql": 
[], "dml": [], "ddl": []} diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index c80910f3..611c768a 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -29,6 +29,16 @@ logging.getLogger().setLevel(logging.INFO) +_SCENARIOS = flags.DEFINE_list( + "scenarios", + [], + "List of scenario IDs to run. Defaults to empty (runs all scenarios).", +) +_SCENARIO_PATTERN = flags.DEFINE_string( + "scenario_pattern", + None, + "Glob pattern of scenario IDs to run. Defaults to None (runs all scenarios).", +) _SUITE_CONFIG = flags.DEFINE_string( "suite_config", None, @@ -57,6 +67,21 @@ def eval(experiment_config: str): logging.error(f"No Eval Config Found for '{display_config}'.") return + # 1. Merge Environment Variables (overrides YAML) + env_scenarios = os.environ.get("EVAL_SCENARIOS") + if env_scenarios: + parsed_config["scenarios"] = [s.strip() for s in env_scenarios.split(",") if s.strip()] + + env_pattern = os.environ.get("EVAL_SCENARIO_PATTERN") + if env_pattern: + parsed_config["scenario_pattern"] = env_pattern + + # 2. Merge CLI Flags (overrides Environment Variables and YAML) + if _SCENARIOS.value: + parsed_config["scenarios"] = _SCENARIOS.value + if _SCENARIO_PATTERN.value: + parsed_config["scenario_pattern"] = _SCENARIO_PATTERN.value + set_session_configs(session, parsed_config) # Load the configs config, db_configs, model_config, setup_config = load_session_configs(session) diff --git a/evalbench/test/dataset_filter_test.py b/evalbench/test/dataset_filter_test.py new file mode 100644 index 00000000..9b153801 --- /dev/null +++ b/evalbench/test/dataset_filter_test.py @@ -0,0 +1,134 @@ +import json +import os +import shutil +import sys +import tempfile +import unittest + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dataset.dataset import load_dataset_from_json + + +class TestDatasetFilter(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + # 1. 
Create a dummy gemini-cli dataset + self.gemini_cli_data = { + "id": "gemini-cli-test", + "scenarios": [ + {"id": "csql-create-01", "starting_prompt": "create database"}, + {"id": "csql-delete-01", "starting_prompt": "delete database"}, + {"id": "spanner-list-01", "starting_prompt": "list instances"}, + {"id": "spanner-create-01", "starting_prompt": "create spanner"} + ] + } + self.gemini_cli_path = os.path.join(self.test_dir, "gemini_cli_dataset.json") + with open(self.gemini_cli_path, "w") as f: + json.dump(self.gemini_cli_data, f) + + # 2. Create a dummy cortado dataset + self.cortado_data = { + "scenarios": [ + {"id": "cortado-csql-01", "starting_prompt": "csql"}, + {"id": "cortado-spanner-01", "starting_prompt": "spanner"}, + {"id": "cortado-bigquery-01", "starting_prompt": "bq"} + ] + } + self.cortado_path = os.path.join(self.test_dir, "cortado_dataset.json") + with open(self.cortado_path, "w") as f: + json.dump(self.cortado_data, f) + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_gemini_cli_no_filter(self): + config = {"dataset_format": "gemini-cli-format"} + result = load_dataset_from_json(self.gemini_cli_path, config) + self.assertIn("gemini-cli-format", result) + self.assertEqual(len(result["gemini-cli-format"]), 1) + + payload = json.loads(result["gemini-cli-format"][0].payload) + scenarios = payload["scenarios"] + self.assertEqual(len(scenarios), 4) + self.assertEqual(scenarios[0]["id"], "csql-create-01") + + def test_gemini_cli_filter_by_id_list(self): + config = { + "dataset_format": "gemini-cli-format", + "scenarios": ["csql-create-01", "spanner-list-01"] + } + result = load_dataset_from_json(self.gemini_cli_path, config) + payload = json.loads(result["gemini-cli-format"][0].payload) + scenarios = payload["scenarios"] + self.assertEqual(len(scenarios), 2) + self.assertEqual(scenarios[0]["id"], "csql-create-01") + self.assertEqual(scenarios[1]["id"], "spanner-list-01") + + def test_gemini_cli_filter_by_pattern(self): + config = { + 
"dataset_format": "gemini-cli-format", + "scenario_pattern": "csql-*" + } + result = load_dataset_from_json(self.gemini_cli_path, config) + payload = json.loads(result["gemini-cli-format"][0].payload) + scenarios = payload["scenarios"] + self.assertEqual(len(scenarios), 2) + self.assertEqual(scenarios[0]["id"], "csql-create-01") + self.assertEqual(scenarios[1]["id"], "csql-delete-01") + + def test_gemini_cli_filter_combined(self): + # Combined should do intersection (or check both conditions) + config = { + "dataset_format": "gemini-cli-format", + "scenarios": ["csql-create-01", "spanner-list-01"], + "scenario_pattern": "*list*" + } + result = load_dataset_from_json(self.gemini_cli_path, config) + payload = json.loads(result["gemini-cli-format"][0].payload) + scenarios = payload["scenarios"] + self.assertEqual(len(scenarios), 1) + self.assertEqual(scenarios[0]["id"], "spanner-list-01") + + def test_gemini_cli_filter_no_match(self): + config = { + "dataset_format": "gemini-cli-format", + "scenarios": ["non-existent-id"] + } + result = load_dataset_from_json(self.gemini_cli_path, config) + payload = json.loads(result["gemini-cli-format"][0].payload) + scenarios = payload["scenarios"] + self.assertEqual(len(scenarios), 0) + + def test_cortado_no_filter(self): + config = {"dataset_format": "cortado-format"} + result = load_dataset_from_json(self.cortado_path, config) + self.assertIn("cortado-format", result) + self.assertEqual(len(result["cortado-format"]), 3) + self.assertEqual(result["cortado-format"][0].id, "cortado-csql-01") + + def test_cortado_filter_by_id_list(self): + config = { + "dataset_format": "cortado-format", + "scenarios": ["cortado-csql-01", "cortado-bigquery-01"] + } + result = load_dataset_from_json(self.cortado_path, config) + items = result["cortado-format"] + self.assertEqual(len(items), 2) + self.assertEqual(items[0].id, "cortado-csql-01") + self.assertEqual(items[1].id, "cortado-bigquery-01") + + def test_cortado_filter_by_pattern(self): + 
config = { + "dataset_format": "cortado-format", + "scenario_pattern": "*-spanner-*" + } + result = load_dataset_from_json(self.cortado_path, config) + items = result["cortado-format"] + self.assertEqual(len(items), 1) + self.assertEqual(items[0].id, "cortado-spanner-01") + + +if __name__ == "__main__": + unittest.main() From dc1a38dd6ee535a9edb9439813f1716822c2b4f3 Mon Sep 17 00:00:00 2001 From: g-lynnzee <139825992+g-lynnzee@users.noreply.github.com> Date: Mon, 22 Jun 2026 10:56:23 -0700 Subject: [PATCH 2/7] Update filtering logig --- evalbench/dataset/dataset.py | 24 ++++++++-- evalbench/evalbench.py | 9 ++-- evalbench/test/dataset_filter_test.py | 69 +++++++++++++++++++++++++-- 3 files changed, 90 insertions(+), 12 deletions(-) diff --git a/evalbench/dataset/dataset.py b/evalbench/dataset/dataset.py index 6551a543..12cf6de9 100644 --- a/evalbench/dataset/dataset.py +++ b/evalbench/dataset/dataset.py @@ -121,7 +121,7 @@ def load_bird_interact_dataset(json_file_path, config): return input_items -def load_dea_json(json_file_path): +def load_dea_json(json_file_path, config): all_items: dict[str, list[EvalDeaRequest]] = { "dea-format": [], } @@ -129,6 +129,15 @@ def load_dea_json(json_file_path): content = json_file.read() data = json.loads(content) + # Filter scenarios + scenarios = data.get("scenarios", []) + filtered_scenarios = _filter_scenarios(scenarios, config) + if scenarios and not filtered_scenarios: + return all_items + + if "scenarios" in data: + data["scenarios"] = filtered_scenarios + eval_input = EvalDeaRequest( raw_dict=data ) @@ -140,6 +149,9 @@ def load_dea_json(json_file_path): def _filter_scenarios(scenarios: list[dict], config: dict) -> list[dict]: """Filters a list of scenarios based on explicit IDs or glob pattern in config.""" scenarios_to_run = config.get("scenarios", []) + if isinstance(scenarios_to_run, str): + scenarios_to_run = [s.strip() for s in scenarios_to_run.split(",") if s.strip()] + scenario_pattern = 
config.get("scenario_pattern", None) # If no filters are specified, return the original list (run all) @@ -150,7 +162,6 @@ def _filter_scenarios(scenarios: list[dict], config: dict) -> list[dict]: for scenario in scenarios: scenario_id = scenario.get("id") if not scenario_id: - # Drop scenarios without IDs per user feedback continue # Match explicit list of IDs @@ -196,7 +207,11 @@ def load_gemini_cli_json(json_file_path, config): # Filter scenarios scenarios = item.get("scenarios", []) - item["scenarios"] = _filter_scenarios(scenarios, config) + filtered_scenarios = _filter_scenarios(scenarios, config) + if scenarios and not filtered_scenarios: + return all_items + + item["scenarios"] = filtered_scenarios # Resolve work_dir for scenarios dataset_dir = os.path.dirname(json_file_path) @@ -236,7 +251,7 @@ def load_dataset_from_json(json_file_path, config): elif dataset_format == "cortado-format": all_items = load_cortado_json(json_file_path, config) elif dataset_format == "dea-format": - all_items = load_dea_json(json_file_path) + all_items = load_dea_json(json_file_path, config) else: all_items = load_json(json_file_path) @@ -272,7 +287,6 @@ def load_dataset_from_json(json_file_path, config): return input_items - def load_dataset_from_bird_format(dataset: Sequence[dict], config): input_items: dict[str, list[EvalInputRequest]] = { "dql": [], "dml": [], "ddl": []} diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index 611c768a..604fde52 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -77,10 +77,11 @@ def eval(experiment_config: str): parsed_config["scenario_pattern"] = env_pattern # 2. 
Merge CLI Flags (overrides Environment Variables and YAML) - if _SCENARIOS.value: - parsed_config["scenarios"] = _SCENARIOS.value - if _SCENARIO_PATTERN.value: - parsed_config["scenario_pattern"] = _SCENARIO_PATTERN.value + if flags.FLAGS.is_parsed(): + if _SCENARIOS.value: + parsed_config["scenarios"] = _SCENARIOS.value + if _SCENARIO_PATTERN.value: + parsed_config["scenario_pattern"] = _SCENARIO_PATTERN.value set_session_configs(session, parsed_config) # Load the configs diff --git a/evalbench/test/dataset_filter_test.py b/evalbench/test/dataset_filter_test.py index 9b153801..d46fb365 100644 --- a/evalbench/test/dataset_filter_test.py +++ b/evalbench/test/dataset_filter_test.py @@ -40,6 +40,18 @@ def setUp(self): with open(self.cortado_path, "w") as f: json.dump(self.cortado_data, f) + # 3. Create a dummy dea dataset + self.dea_data = { + "scenarios": [ + {"id": "dea-csql-01", "starting_prompt": "csql"}, + {"id": "dea-spanner-01", "starting_prompt": "spanner"}, + {"id": "dea-bigquery-01", "starting_prompt": "bq"} + ] + } + self.dea_path = os.path.join(self.test_dir, "dea_dataset.json") + with open(self.dea_path, "w") as f: + json.dump(self.dea_data, f) + def tearDown(self): shutil.rmtree(self.test_dir) @@ -97,9 +109,8 @@ def test_gemini_cli_filter_no_match(self): "scenarios": ["non-existent-id"] } result = load_dataset_from_json(self.gemini_cli_path, config) - payload = json.loads(result["gemini-cli-format"][0].payload) - scenarios = payload["scenarios"] - self.assertEqual(len(scenarios), 0) + self.assertIn("gemini-cli-format", result) + self.assertEqual(len(result["gemini-cli-format"]), 0) def test_cortado_no_filter(self): config = {"dataset_format": "cortado-format"} @@ -129,6 +140,58 @@ def test_cortado_filter_by_pattern(self): self.assertEqual(len(items), 1) self.assertEqual(items[0].id, "cortado-spanner-01") + def test_scenarios_normalization_string(self): + # scenarios provided as a comma-separated string + config = { + "dataset_format": 
"gemini-cli-format", + "scenarios": "csql-create-01, spanner-list-01" + } + result = load_dataset_from_json(self.gemini_cli_path, config) + payload = json.loads(result["gemini-cli-format"][0].payload) + scenarios = payload["scenarios"] + self.assertEqual(len(scenarios), 2) + self.assertEqual(scenarios[0]["id"], "csql-create-01") + self.assertEqual(scenarios[1]["id"], "spanner-list-01") + + def test_dea_no_filter(self): + config = {"dataset_format": "dea-format"} + result = load_dataset_from_json(self.dea_path, config) + self.assertIn("dea-format", result) + self.assertEqual(len(result["dea-format"]), 1) + scenarios = result["dea-format"][0].scenario["scenarios"] + self.assertEqual(len(scenarios), 3) + self.assertEqual(scenarios[0]["id"], "dea-csql-01") + + def test_dea_filter_by_id_list(self): + config = { + "dataset_format": "dea-format", + "scenarios": ["dea-csql-01", "dea-bigquery-01"] + } + result = load_dataset_from_json(self.dea_path, config) + scenarios = result["dea-format"][0].scenario["scenarios"] + self.assertEqual(len(scenarios), 2) + self.assertEqual(scenarios[0]["id"], "dea-csql-01") + self.assertEqual(scenarios[1]["id"], "dea-bigquery-01") + + def test_dea_filter_by_pattern(self): + config = { + "dataset_format": "dea-format", + "scenario_pattern": "*-spanner-*" + } + result = load_dataset_from_json(self.dea_path, config) + scenarios = result["dea-format"][0].scenario["scenarios"] + self.assertEqual(len(scenarios), 1) + self.assertEqual(scenarios[0]["id"], "dea-spanner-01") + + def test_dea_filter_no_match_returns_empty(self): + config = { + "dataset_format": "dea-format", + "scenarios": ["non-existent-id"] + } + result = load_dataset_from_json(self.dea_path, config) + self.assertIn("dea-format", result) + self.assertEqual(len(result["dea-format"]), 0) + if __name__ == "__main__": unittest.main() From d2b0b230c1e6b01b3bfa1a3335104b59459918e7 Mon Sep 17 00:00:00 2001 From: g-lynnzee <139825992+g-lynnzee@users.noreply.github.com> Date: Thu, 25 Jun 2026 
20:50:30 -0700 Subject: [PATCH 3/7] feat: Improve isolation of CUJs by copying and using only the working_dir I ran into this for https://github.com/GoogleCloudPlatform/db-context-enrichment/pull/168 where the agent will read working_dirs for other CUJs. The test case relies on the agent trying and failing to find a dataset file, and acting a certain way. However, sometimes the agent will read other working_dirs and act based on dataset files there. --- evalbench/evaluator/agentevaluator.py | 50 +++++++++++++++- evalbench/test/agentevaluator_test.py | 86 +++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 3 deletions(-) diff --git a/evalbench/evaluator/agentevaluator.py b/evalbench/evaluator/agentevaluator.py index f464a83b..6666f93c 100644 --- a/evalbench/evaluator/agentevaluator.py +++ b/evalbench/evaluator/agentevaluator.py @@ -4,8 +4,10 @@ import logging import os import shutil +import tempfile import threading + from dataset.evalgeminicliinput import EvalGeminiCliRequest from generators.models import get_generator from generators.models.agent_cli import AgentCliGenerator @@ -119,12 +121,13 @@ def process_scenario( last_result = None resolved_work_dir = scenario.get("resolved_work_dir") - if resolved_work_dir: - os.makedirs(resolved_work_dir, exist_ok=True) + execution_cwd, temp_sandbox_dir = self._setup_sandbox(resolved_work_dir) + # Copy declared env_files to fake_home fake_home = getattr(self.generator, "fake_home", None) if fake_home: + self._register_trusted_folders(fake_home, execution_cwd, resolved_work_dir) session_dir = os.path.dirname(fake_home) env_files = scenario.get("env_files", []) for env_file in env_files: @@ -152,7 +155,7 @@ def process_scenario( env=env, resume=(turn > 0), session_id=session_id, - cwd=resolved_work_dir, + cwd=execution_cwd, ) try: result = self.generator.safe_generate(cli_cmd) @@ -205,6 +208,8 @@ def process_scenario( else: break + self._cleanup_sandbox(resolved_work_dir, temp_sandbox_dir) + if last_result:
self._finalize_scenario( scenario, @@ -267,3 +272,42 @@ def _finalize_scenario( score_work.run() eval_result.agent_results.append(eval_output_data) + + def _setup_sandbox(self, resolved_work_dir: str | None) -> tuple[str | None, str | None]: + """Creates a temporary isolated directory and copies resolved_work_dir into it.""" + if not resolved_work_dir: + return None, None + os.makedirs(resolved_work_dir, exist_ok=True) + temp_sandbox_dir = tempfile.mkdtemp(prefix="evalbench-sandbox-") + shutil.copytree(resolved_work_dir, temp_sandbox_dir, dirs_exist_ok=True) + return temp_sandbox_dir, temp_sandbox_dir + + def _register_trusted_folders(self, fake_home: str, execution_cwd: str | None, resolved_work_dir: str | None) -> None: + """Writes trusted folder configuration mappings to settings json to prevent CLI trust hangs.""" + if not execution_cwd: + return + trusted_folders_path = os.path.join(fake_home, ".gemini", "trustedFolders.json") + os.makedirs(os.path.dirname(trusted_folders_path), exist_ok=True) + + trusted_folders = {} + if os.path.exists(trusted_folders_path): + try: + with open(trusted_folders_path, "r") as f: + trusted_folders = json.load(f) + except json.JSONDecodeError: + pass + + trusted_folders[execution_cwd] = "TRUST_FOLDER" + if resolved_work_dir: + trusted_folders[resolved_work_dir] = "TRUST_FOLDER" + + with open(trusted_folders_path, "w") as f: + json.dump(trusted_folders, f, indent=2) + logging.info("Registered sandbox folder trust in fake_home settings.") + + def _cleanup_sandbox(self, resolved_work_dir: str | None, temp_sandbox_dir: str | None) -> None: + """Copies sandboxed modifications back to the original directory and deletes the temp folder.""" + if resolved_work_dir and temp_sandbox_dir: + shutil.copytree(temp_sandbox_dir, resolved_work_dir, dirs_exist_ok=True) + shutil.rmtree(temp_sandbox_dir) + diff --git a/evalbench/test/agentevaluator_test.py b/evalbench/test/agentevaluator_test.py index de3176dc..6e1b56ac 100644 --- 
a/evalbench/test/agentevaluator_test.py +++ b/evalbench/test/agentevaluator_test.py @@ -89,5 +89,91 @@ def test_process_scenario_copies_env_files(self, mock_get_generator): self.assertEqual(content, "print('mock sleep')") +class TestAgentEvaluatorSandbox(unittest.TestCase): + """Tests for sandbox setup, trusted folder registration, and cleanup.""" + + def setUp(self): + self.test_dir = tempfile.mkdtemp() + self.workspace_dir = os.path.join(self.test_dir, "workspace") + os.makedirs(self.workspace_dir) + with open(os.path.join(self.workspace_dir, "test.txt"), "w") as f: + f.write("original content") + + self.fake_home = os.path.join(self.test_dir, "fake_home") + os.makedirs(self.fake_home) + + config = { + "model_config": "dummy.yaml", + "runners": {"agent_runners": 1} + } + with patch("evaluator.agentevaluator.get_generator") as mock_get_generator: + mock_generator = MagicMock(spec=AgentCliGenerator) + mock_generator.fake_home = self.fake_home + mock_generator.version = "1.0.0" + mock_get_generator.return_value = mock_generator + self.evaluator = AgentEvaluator(config) + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_setup_sandbox_creates_temp_dir_and_copies_contents(self): + execution_cwd, temp_sandbox_dir = self.evaluator._setup_sandbox(self.workspace_dir) + self.assertIsNotNone(temp_sandbox_dir) + self.assertTrue(os.path.exists(temp_sandbox_dir)) + self.assertTrue(temp_sandbox_dir.startswith(tempfile.gettempdir())) + + # Check content copy + copied_file = os.path.join(temp_sandbox_dir, "test.txt") + self.assertTrue(os.path.exists(copied_file)) + with open(copied_file, "r") as f: + self.assertEqual(f.read(), "original content") + + # Cleanup sandbox directory created inside test + shutil.rmtree(temp_sandbox_dir) + + def test_register_trusted_folders_creates_valid_json(self): + sandbox_dir = os.path.join(self.test_dir, "sandbox") + os.makedirs(sandbox_dir) + + self.evaluator._register_trusted_folders(self.fake_home, sandbox_dir, 
self.workspace_dir) + + trusted_folders_path = os.path.join(self.fake_home, ".gemini", "trustedFolders.json") + self.assertTrue(os.path.exists(trusted_folders_path)) + + import json + with open(trusted_folders_path, "r") as f: + data = json.load(f) + + self.assertIn(sandbox_dir, data) + self.assertEqual(data[sandbox_dir], "TRUST_FOLDER") + self.assertIn(self.workspace_dir, data) + self.assertEqual(data[self.workspace_dir], "TRUST_FOLDER") + + def test_cleanup_sandbox_copies_back_and_removes_temp_dir(self): + # Setup sandbox manually + temp_sandbox_dir = tempfile.mkdtemp() + shutil.copytree(self.workspace_dir, temp_sandbox_dir, dirs_exist_ok=True) + + # Modify file inside sandbox + with open(os.path.join(temp_sandbox_dir, "test.txt"), "w") as f: + f.write("modified content") + # Add new file inside sandbox + with open(os.path.join(temp_sandbox_dir, "new.txt"), "w") as f: + f.write("new file") + + # Run cleanup + self.evaluator._cleanup_sandbox(self.workspace_dir, temp_sandbox_dir) + + # Verify temp sandbox is removed + self.assertFalse(os.path.exists(temp_sandbox_dir)) + + # Verify changes were copied back to original workspace + with open(os.path.join(self.workspace_dir, "test.txt"), "r") as f: + self.assertEqual(f.read(), "modified content") + with open(os.path.join(self.workspace_dir, "new.txt"), "r") as f: + self.assertEqual(f.read(), "new file") + + if __name__ == "__main__": unittest.main() + From d58531507841d7e1af9aa414952693efdd7a3745 Mon Sep 17 00:00:00 2001 From: g-lynnzee <139825992+g-lynnzee@users.noreply.github.com> Date: Thu, 25 Jun 2026 20:53:12 -0700 Subject: [PATCH 4/7] Revert "Update filtering logig" This reverts commit dc1a38dd6ee535a9edb9439813f1716822c2b4f3. 
--- evalbench/dataset/dataset.py | 24 ++-------- evalbench/evalbench.py | 9 ++-- evalbench/test/dataset_filter_test.py | 69 ++------------------------- 3 files changed, 12 insertions(+), 90 deletions(-) diff --git a/evalbench/dataset/dataset.py b/evalbench/dataset/dataset.py index 12cf6de9..6551a543 100644 --- a/evalbench/dataset/dataset.py +++ b/evalbench/dataset/dataset.py @@ -121,7 +121,7 @@ def load_bird_interact_dataset(json_file_path, config): return input_items -def load_dea_json(json_file_path, config): +def load_dea_json(json_file_path): all_items: dict[str, list[EvalDeaRequest]] = { "dea-format": [], } @@ -129,15 +129,6 @@ def load_dea_json(json_file_path, config): content = json_file.read() data = json.loads(content) - # Filter scenarios - scenarios = data.get("scenarios", []) - filtered_scenarios = _filter_scenarios(scenarios, config) - if scenarios and not filtered_scenarios: - return all_items - - if "scenarios" in data: - data["scenarios"] = filtered_scenarios - eval_input = EvalDeaRequest( raw_dict=data ) @@ -149,9 +140,6 @@ def load_dea_json(json_file_path, config): def _filter_scenarios(scenarios: list[dict], config: dict) -> list[dict]: """Filters a list of scenarios based on explicit IDs or glob pattern in config.""" scenarios_to_run = config.get("scenarios", []) - if isinstance(scenarios_to_run, str): - scenarios_to_run = [s.strip() for s in scenarios_to_run.split(",") if s.strip()] - scenario_pattern = config.get("scenario_pattern", None) # If no filters are specified, return the original list (run all) @@ -162,6 +150,7 @@ def _filter_scenarios(scenarios: list[dict], config: dict) -> list[dict]: for scenario in scenarios: scenario_id = scenario.get("id") if not scenario_id: + # Drop scenarios without IDs per user feedback continue # Match explicit list of IDs @@ -207,11 +196,7 @@ def load_gemini_cli_json(json_file_path, config): # Filter scenarios scenarios = item.get("scenarios", []) - filtered_scenarios = _filter_scenarios(scenarios, config) 
- if scenarios and not filtered_scenarios: - return all_items - - item["scenarios"] = filtered_scenarios + item["scenarios"] = _filter_scenarios(scenarios, config) # Resolve work_dir for scenarios dataset_dir = os.path.dirname(json_file_path) @@ -251,7 +236,7 @@ def load_dataset_from_json(json_file_path, config): elif dataset_format == "cortado-format": all_items = load_cortado_json(json_file_path, config) elif dataset_format == "dea-format": - all_items = load_dea_json(json_file_path, config) + all_items = load_dea_json(json_file_path) else: all_items = load_json(json_file_path) @@ -287,6 +272,7 @@ def load_dataset_from_json(json_file_path, config): return input_items + def load_dataset_from_bird_format(dataset: Sequence[dict], config): input_items: dict[str, list[EvalInputRequest]] = { "dql": [], "dml": [], "ddl": []} diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index 604fde52..611c768a 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -77,11 +77,10 @@ def eval(experiment_config: str): parsed_config["scenario_pattern"] = env_pattern # 2. Merge CLI Flags (overrides Environment Variables and YAML) - if flags.FLAGS.is_parsed(): - if _SCENARIOS.value: - parsed_config["scenarios"] = _SCENARIOS.value - if _SCENARIO_PATTERN.value: - parsed_config["scenario_pattern"] = _SCENARIO_PATTERN.value + if _SCENARIOS.value: + parsed_config["scenarios"] = _SCENARIOS.value + if _SCENARIO_PATTERN.value: + parsed_config["scenario_pattern"] = _SCENARIO_PATTERN.value set_session_configs(session, parsed_config) # Load the configs diff --git a/evalbench/test/dataset_filter_test.py b/evalbench/test/dataset_filter_test.py index d46fb365..9b153801 100644 --- a/evalbench/test/dataset_filter_test.py +++ b/evalbench/test/dataset_filter_test.py @@ -40,18 +40,6 @@ def setUp(self): with open(self.cortado_path, "w") as f: json.dump(self.cortado_data, f) - # 3. 
Create a dummy dea dataset - self.dea_data = { - "scenarios": [ - {"id": "dea-csql-01", "starting_prompt": "csql"}, - {"id": "dea-spanner-01", "starting_prompt": "spanner"}, - {"id": "dea-bigquery-01", "starting_prompt": "bq"} - ] - } - self.dea_path = os.path.join(self.test_dir, "dea_dataset.json") - with open(self.dea_path, "w") as f: - json.dump(self.dea_data, f) - def tearDown(self): shutil.rmtree(self.test_dir) @@ -109,8 +97,9 @@ def test_gemini_cli_filter_no_match(self): "scenarios": ["non-existent-id"] } result = load_dataset_from_json(self.gemini_cli_path, config) - self.assertIn("gemini-cli-format", result) - self.assertEqual(len(result["gemini-cli-format"]), 0) + payload = json.loads(result["gemini-cli-format"][0].payload) + scenarios = payload["scenarios"] + self.assertEqual(len(scenarios), 0) def test_cortado_no_filter(self): config = {"dataset_format": "cortado-format"} @@ -140,58 +129,6 @@ def test_cortado_filter_by_pattern(self): self.assertEqual(len(items), 1) self.assertEqual(items[0].id, "cortado-spanner-01") - def test_scenarios_normalization_string(self): - # scenarios provided as a comma-separated string - config = { - "dataset_format": "gemini-cli-format", - "scenarios": "csql-create-01, spanner-list-01" - } - result = load_dataset_from_json(self.gemini_cli_path, config) - payload = json.loads(result["gemini-cli-format"][0].payload) - scenarios = payload["scenarios"] - self.assertEqual(len(scenarios), 2) - self.assertEqual(scenarios[0]["id"], "csql-create-01") - self.assertEqual(scenarios[1]["id"], "spanner-list-01") - - def test_dea_no_filter(self): - config = {"dataset_format": "dea-format"} - result = load_dataset_from_json(self.dea_path, config) - self.assertIn("dea-format", result) - self.assertEqual(len(result["dea-format"]), 1) - scenarios = result["dea-format"][0].scenario["scenarios"] - self.assertEqual(len(scenarios), 3) - self.assertEqual(scenarios[0]["id"], "dea-csql-01") - - def test_dea_filter_by_id_list(self): - config = { - 
"dataset_format": "dea-format", - "scenarios": ["dea-csql-01", "dea-bigquery-01"] - } - result = load_dataset_from_json(self.dea_path, config) - scenarios = result["dea-format"][0].scenario["scenarios"] - self.assertEqual(len(scenarios), 2) - self.assertEqual(scenarios[0]["id"], "dea-csql-01") - self.assertEqual(scenarios[1]["id"], "dea-bigquery-01") - - def test_dea_filter_by_pattern(self): - config = { - "dataset_format": "dea-format", - "scenario_pattern": "*-spanner-*" - } - result = load_dataset_from_json(self.dea_path, config) - scenarios = result["dea-format"][0].scenario["scenarios"] - self.assertEqual(len(scenarios), 1) - self.assertEqual(scenarios[0]["id"], "dea-spanner-01") - - def test_dea_filter_no_match_returns_empty(self): - config = { - "dataset_format": "dea-format", - "scenarios": ["non-existent-id"] - } - result = load_dataset_from_json(self.dea_path, config) - self.assertIn("dea-format", result) - self.assertEqual(len(result["dea-format"]), 0) - if __name__ == "__main__": unittest.main() From 5aaabffdca0545d2922e5a1da0c98ad1c838c4ef Mon Sep 17 00:00:00 2001 From: g-lynnzee <139825992+g-lynnzee@users.noreply.github.com> Date: Thu, 25 Jun 2026 20:53:36 -0700 Subject: [PATCH 5/7] Revert "Feat: Add ability to filter to specific json scenario/examples" This reverts commit 228cdf6fc9000d8c27548581e01a8e6f1fc57e3b. 
--- docs/configs/run-config.md | 4 +- evalbench/dataset/dataset.py | 51 ++-------- evalbench/evalbench.py | 25 ----- evalbench/test/dataset_filter_test.py | 134 -------------------------- 4 files changed, 8 insertions(+), 206 deletions(-) delete mode 100644 evalbench/test/dataset_filter_test.py diff --git a/docs/configs/run-config.md b/docs/configs/run-config.md index 6ec14141..84a2060f 100644 --- a/docs/configs/run-config.md +++ b/docs/configs/run-config.md @@ -14,9 +14,7 @@ This section defines the primary resources used during evaluation, including the | `databases` | Optional | Specifies the databases (e.g., `db_blog`, `california_schools`, etc.). This filters the dataset to the provided list of databases and ignores all other evals. If not provided, all databases found in the dataset_config json file will be tried. | | `query_types` | Optional | Specifies the query_types (`dql`, `dml`, `dd`). This filters the dataset to the list of evals that are of the query_types provided. If not provided, all eval types (dql, dml and ddl) found in the dataset_config json file will be tried. | | `dataset_format` | Conditional (if needed) | Defines the dataset format, with `evalbench-standard-format` as the default. For BIRD datasets, it must be set to `bird-standard-format`.| -| `num_trials` | Optional | Number of trials to run for each prompt. | -| `scenarios` | Optional | A list of specific scenario IDs to run (only applies to scenario-based agentic datasets like `gemini-cli-format` or `cortado-format`). Defaults to empty (runs all scenarios). | -| `scenario_pattern` | Optional | A glob pattern of scenario IDs to run (only applies to scenario-based agentic datasets). Defaults to None (runs all scenarios). | +| `num_trials` | Optional | Number of trials to run for each prompt. --- ## 2. 
Prompt and Generation Modules diff --git a/evalbench/dataset/dataset.py b/evalbench/dataset/dataset.py index 6551a543..2f399400 100644 --- a/evalbench/dataset/dataset.py +++ b/evalbench/dataset/dataset.py @@ -3,7 +3,6 @@ from typing import Any, Optional import json import logging -import fnmatch from collections.abc import Sequence from dataset.evalinput import EvalInputRequest from dataset.evalinteractinput import EvalInteractInputRequest @@ -137,37 +136,7 @@ def load_dea_json(json_file_path): return all_items -def _filter_scenarios(scenarios: list[dict], config: dict) -> list[dict]: - """Filters a list of scenarios based on explicit IDs or glob pattern in config.""" - scenarios_to_run = config.get("scenarios", []) - scenario_pattern = config.get("scenario_pattern", None) - - # If no filters are specified, return the original list (run all) - if not scenarios_to_run and not scenario_pattern: - return scenarios - - filtered_scenarios = [] - for scenario in scenarios: - scenario_id = scenario.get("id") - if not scenario_id: - # Drop scenarios without IDs per user feedback - continue - - # Match explicit list of IDs - if scenarios_to_run and scenario_id not in scenarios_to_run: - continue - - # Match glob pattern - if scenario_pattern: - if not fnmatch.fnmatch(scenario_id, scenario_pattern): - continue - - filtered_scenarios.append(scenario) - - return filtered_scenarios - - -def load_cortado_json(json_file_path, config): +def load_cortado_json(json_file_path): all_items: dict[str, list[EvalCortadoRequest]] = { "cortado-format": [], } @@ -175,8 +144,7 @@ def load_cortado_json(json_file_path, config): data = json.load(json_file) scenarios = data.get("scenarios", []) - filtered_scenarios = _filter_scenarios(scenarios, config) - for scenario in filtered_scenarios: + for scenario in scenarios: eval_input = EvalCortadoRequest( raw_dict=scenario ) @@ -185,7 +153,7 @@ def load_cortado_json(json_file_path, config): return all_items -def load_gemini_cli_json(json_file_path, 
config): +def load_gemini_cli_json(json_file_path): all_items: dict[str, list[EvalGeminiCliRequest]] = { "gemini-cli-format": [], } @@ -194,13 +162,10 @@ def load_gemini_cli_json(json_file_path, config): json_item = _expand_env_placeholders(json_item, json_file_path) item = json.loads(json_item) - # Filter scenarios - scenarios = item.get("scenarios", []) - item["scenarios"] = _filter_scenarios(scenarios, config) - # Resolve work_dir for scenarios + scenarios = item.get("scenarios", []) dataset_dir = os.path.dirname(json_file_path) - for scenario in item["scenarios"]: + for scenario in scenarios: if "work_dir" in scenario: work_dir = scenario["work_dir"] # Resolve relative to dataset file @@ -232,9 +197,9 @@ def load_dataset_from_json(json_file_path, config): if dataset_format == "bird-interact-format": all_items = load_bird_interact_dataset(json_file_path, config) elif dataset_format in ("gemini-cli-format", "agent-format"): - all_items = load_gemini_cli_json(json_file_path, config) + all_items = load_gemini_cli_json(json_file_path) elif dataset_format == "cortado-format": - all_items = load_cortado_json(json_file_path, config) + all_items = load_cortado_json(json_file_path) elif dataset_format == "dea-format": all_items = load_dea_json(json_file_path) else: @@ -253,7 +218,6 @@ def load_dataset_from_json(json_file_path, config): elif dataset_format == "cortado-format": if "orchestrator" not in config: config["orchestrator"] = "cortado" - input_items = all_items elif dataset_format == "dea-format": if "orchestrator" not in config: config["orchestrator"] = "dea" @@ -272,7 +236,6 @@ def load_dataset_from_json(json_file_path, config): return input_items - def load_dataset_from_bird_format(dataset: Sequence[dict], config): input_items: dict[str, list[EvalInputRequest]] = { "dql": [], "dml": [], "ddl": []} diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index 611c768a..c80910f3 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -29,16 +29,6 
@@ logging.getLogger().setLevel(logging.INFO) -_SCENARIOS = flags.DEFINE_list( - "scenarios", - [], - "List of scenario IDs to run. Defaults to empty (runs all scenarios).", -) -_SCENARIO_PATTERN = flags.DEFINE_string( - "scenario_pattern", - None, - "Glob pattern of scenario IDs to run. Defaults to None (runs all scenarios).", -) _SUITE_CONFIG = flags.DEFINE_string( "suite_config", None, @@ -67,21 +57,6 @@ def eval(experiment_config: str): logging.error(f"No Eval Config Found for '{display_config}'.") return - # 1. Merge Environment Variables (overrides YAML) - env_scenarios = os.environ.get("EVAL_SCENARIOS") - if env_scenarios: - parsed_config["scenarios"] = [s.strip() for s in env_scenarios.split(",") if s.strip()] - - env_pattern = os.environ.get("EVAL_SCENARIO_PATTERN") - if env_pattern: - parsed_config["scenario_pattern"] = env_pattern - - # 2. Merge CLI Flags (overrides Environment Variables and YAML) - if _SCENARIOS.value: - parsed_config["scenarios"] = _SCENARIOS.value - if _SCENARIO_PATTERN.value: - parsed_config["scenario_pattern"] = _SCENARIO_PATTERN.value - set_session_configs(session, parsed_config) # Load the configs config, db_configs, model_config, setup_config = load_session_configs(session) diff --git a/evalbench/test/dataset_filter_test.py b/evalbench/test/dataset_filter_test.py deleted file mode 100644 index 9b153801..00000000 --- a/evalbench/test/dataset_filter_test.py +++ /dev/null @@ -1,134 +0,0 @@ -import json -import os -import shutil -import sys -import tempfile -import unittest - -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from dataset.dataset import load_dataset_from_json - - -class TestDatasetFilter(unittest.TestCase): - def setUp(self): - self.test_dir = tempfile.mkdtemp() - - # 1. 
Create a dummy gemini-cli dataset - self.gemini_cli_data = { - "id": "gemini-cli-test", - "scenarios": [ - {"id": "csql-create-01", "starting_prompt": "create database"}, - {"id": "csql-delete-01", "starting_prompt": "delete database"}, - {"id": "spanner-list-01", "starting_prompt": "list instances"}, - {"id": "spanner-create-01", "starting_prompt": "create spanner"} - ] - } - self.gemini_cli_path = os.path.join(self.test_dir, "gemini_cli_dataset.json") - with open(self.gemini_cli_path, "w") as f: - json.dump(self.gemini_cli_data, f) - - # 2. Create a dummy cortado dataset - self.cortado_data = { - "scenarios": [ - {"id": "cortado-csql-01", "starting_prompt": "csql"}, - {"id": "cortado-spanner-01", "starting_prompt": "spanner"}, - {"id": "cortado-bigquery-01", "starting_prompt": "bq"} - ] - } - self.cortado_path = os.path.join(self.test_dir, "cortado_dataset.json") - with open(self.cortado_path, "w") as f: - json.dump(self.cortado_data, f) - - def tearDown(self): - shutil.rmtree(self.test_dir) - - def test_gemini_cli_no_filter(self): - config = {"dataset_format": "gemini-cli-format"} - result = load_dataset_from_json(self.gemini_cli_path, config) - self.assertIn("gemini-cli-format", result) - self.assertEqual(len(result["gemini-cli-format"]), 1) - - payload = json.loads(result["gemini-cli-format"][0].payload) - scenarios = payload["scenarios"] - self.assertEqual(len(scenarios), 4) - self.assertEqual(scenarios[0]["id"], "csql-create-01") - - def test_gemini_cli_filter_by_id_list(self): - config = { - "dataset_format": "gemini-cli-format", - "scenarios": ["csql-create-01", "spanner-list-01"] - } - result = load_dataset_from_json(self.gemini_cli_path, config) - payload = json.loads(result["gemini-cli-format"][0].payload) - scenarios = payload["scenarios"] - self.assertEqual(len(scenarios), 2) - self.assertEqual(scenarios[0]["id"], "csql-create-01") - self.assertEqual(scenarios[1]["id"], "spanner-list-01") - - def test_gemini_cli_filter_by_pattern(self): - config = { - 
"dataset_format": "gemini-cli-format", - "scenario_pattern": "csql-*" - } - result = load_dataset_from_json(self.gemini_cli_path, config) - payload = json.loads(result["gemini-cli-format"][0].payload) - scenarios = payload["scenarios"] - self.assertEqual(len(scenarios), 2) - self.assertEqual(scenarios[0]["id"], "csql-create-01") - self.assertEqual(scenarios[1]["id"], "csql-delete-01") - - def test_gemini_cli_filter_combined(self): - # Combined should do intersection (or check both conditions) - config = { - "dataset_format": "gemini-cli-format", - "scenarios": ["csql-create-01", "spanner-list-01"], - "scenario_pattern": "*list*" - } - result = load_dataset_from_json(self.gemini_cli_path, config) - payload = json.loads(result["gemini-cli-format"][0].payload) - scenarios = payload["scenarios"] - self.assertEqual(len(scenarios), 1) - self.assertEqual(scenarios[0]["id"], "spanner-list-01") - - def test_gemini_cli_filter_no_match(self): - config = { - "dataset_format": "gemini-cli-format", - "scenarios": ["non-existent-id"] - } - result = load_dataset_from_json(self.gemini_cli_path, config) - payload = json.loads(result["gemini-cli-format"][0].payload) - scenarios = payload["scenarios"] - self.assertEqual(len(scenarios), 0) - - def test_cortado_no_filter(self): - config = {"dataset_format": "cortado-format"} - result = load_dataset_from_json(self.cortado_path, config) - self.assertIn("cortado-format", result) - self.assertEqual(len(result["cortado-format"]), 3) - self.assertEqual(result["cortado-format"][0].id, "cortado-csql-01") - - def test_cortado_filter_by_id_list(self): - config = { - "dataset_format": "cortado-format", - "scenarios": ["cortado-csql-01", "cortado-bigquery-01"] - } - result = load_dataset_from_json(self.cortado_path, config) - items = result["cortado-format"] - self.assertEqual(len(items), 2) - self.assertEqual(items[0].id, "cortado-csql-01") - self.assertEqual(items[1].id, "cortado-bigquery-01") - - def test_cortado_filter_by_pattern(self): - 
config = { - "dataset_format": "cortado-format", - "scenario_pattern": "*-spanner-*" - } - result = load_dataset_from_json(self.cortado_path, config) - items = result["cortado-format"] - self.assertEqual(len(items), 1) - self.assertEqual(items[0].id, "cortado-spanner-01") - - -if __name__ == "__main__": - unittest.main() From 54845755798b709873ebdc9e815461a20876755b Mon Sep 17 00:00:00 2001 From: g-lynnzee <139825992+g-lynnzee@users.noreply.github.com> Date: Thu, 25 Jun 2026 21:03:11 -0700 Subject: [PATCH 6/7] Remove newlines --- evalbench/evaluator/agentevaluator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/evalbench/evaluator/agentevaluator.py b/evalbench/evaluator/agentevaluator.py index 6666f93c..f03f275c 100644 --- a/evalbench/evaluator/agentevaluator.py +++ b/evalbench/evaluator/agentevaluator.py @@ -309,5 +309,4 @@ def _cleanup_sandbox(self, resolved_work_dir: str | None, temp_sandbox_dir: str """Copies sandboxed modifications back to the original directory and deletes the temp folder.""" if resolved_work_dir and temp_sandbox_dir: shutil.copytree(temp_sandbox_dir, resolved_work_dir, dirs_exist_ok=True) - shutil.rmtree(temp_sandbox_dir) - + shutil.rmtree(temp_sandbox_dir) \ No newline at end of file From 57edee5d42ceb3580f170a6cf25c53017832b78e Mon Sep 17 00:00:00 2001 From: g-lynnzee <139825992+g-lynnzee@users.noreply.github.com> Date: Thu, 25 Jun 2026 21:18:37 -0700 Subject: [PATCH 7/7] style guide fixes --- evalbench/evaluator/agentevaluator.py | 9 ++++----- evalbench/test/agentevaluator_test.py | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/evalbench/evaluator/agentevaluator.py b/evalbench/evaluator/agentevaluator.py index f03f275c..60ed194b 100644 --- a/evalbench/evaluator/agentevaluator.py +++ b/evalbench/evaluator/agentevaluator.py @@ -123,7 +123,6 @@ def process_scenario( resolved_work_dir = scenario.get("resolved_work_dir") execution_cwd, temp_sandbox_dir = self._setup_sandbox(resolved_work_dir) - # Copy 
declared env_files to fake_home fake_home = getattr(self.generator, "fake_home", None) if fake_home: @@ -288,7 +287,7 @@ def _register_trusted_folders(self, fake_home: str, execution_cwd: str | None, r return trusted_folders_path = os.path.join(fake_home, ".gemini", "trustedFolders.json") os.makedirs(os.path.dirname(trusted_folders_path), exist_ok=True) - + trusted_folders = {} if os.path.exists(trusted_folders_path): try: @@ -296,11 +295,11 @@ def _register_trusted_folders(self, fake_home: str, execution_cwd: str | None, r trusted_folders = json.load(f) except json.JSONDecodeError: pass - + trusted_folders[execution_cwd] = "TRUST_FOLDER" if resolved_work_dir: trusted_folders[resolved_work_dir] = "TRUST_FOLDER" - + with open(trusted_folders_path, "w") as f: json.dump(trusted_folders, f, indent=2) logging.info("Registered sandbox folder trust in fake_home settings.") @@ -309,4 +308,4 @@ def _cleanup_sandbox(self, resolved_work_dir: str | None, temp_sandbox_dir: str """Copies sandboxed modifications back to the original directory and deletes the temp folder.""" if resolved_work_dir and temp_sandbox_dir: shutil.copytree(temp_sandbox_dir, resolved_work_dir, dirs_exist_ok=True) - shutil.rmtree(temp_sandbox_dir) \ No newline at end of file + shutil.rmtree(temp_sandbox_dir) diff --git a/evalbench/test/agentevaluator_test.py b/evalbench/test/agentevaluator_test.py index 6e1b56ac..72b5074b 100644 --- a/evalbench/test/agentevaluator_test.py +++ b/evalbench/test/agentevaluator_test.py @@ -176,4 +176,3 @@ def test_cleanup_sandbox_copies_back_and_removes_temp_dir(self): if __name__ == "__main__": unittest.main() -