Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,11 @@ viewer/version.txt
.jetskicli/project.json

# Local testing datasets/configs (do not commit)
datasets/bqml/Bullseye_PCA.json
datasets/bqml/Bullseye_PCA_config.yaml
datasets/bqml/
datasets/model_configs/gemini_cli_test_model.yaml
datasets/model_configs/gemini_2.5_pro_test_model.yaml
datasets/model_configs/gemini_3_flash_test_model.yaml
datasets/model_configs/codex_cli_dak_model.yaml



85 changes: 77 additions & 8 deletions evalbench/generators/models/codex_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ def __init__(self, querygenerator_config):

self.setup_config = querygenerator_config.get("setup", {})
self.config_path = os.path.join(self.codex_config_dir, "config.toml")
self.inline_mcp_servers = {}
self.enabled_plugins = {}
self._setup()

@staticmethod
Expand Down Expand Up @@ -243,9 +245,10 @@ def _setup(self):
either be a repo with a `skills/` child or a single skill folder.
"""
mcp_servers_config = self.setup_config.get("mcp_servers", {})
extra_config = dict(self._DEFAULT_TOP_LEVEL_CONFIG)
extra_config.update(self.setup_config.get("config", {}))
self._write_config_toml(mcp_servers_config, extra_config)
if isinstance(mcp_servers_config, list):
self._install_mcp_servers_from_repo(mcp_servers_config)
elif isinstance(mcp_servers_config, dict):
self.inline_mcp_servers.update(mcp_servers_config)

skills_config = self.setup_config.get("skills", [])
if skills_config:
Expand All @@ -255,6 +258,8 @@ def _setup(self):
if skills_dir_path:
self._setup_skills_from_dir(skills_dir_path)

self._write_config_toml()

def _setup_skills(self, skills: list):
"""Installs Codex skills from repo or local path configs."""
setup_env = os.environ.copy()
Expand All @@ -279,6 +284,12 @@ def _setup_skills(self, skills: list):
repo_dir = self._clone_extension_repo(url, plugins_dir, setup_env)
if repo_dir:
self._register_codex_plugin(repo_dir, skill_config)
plugin_name = skill_config.get("plugin_name") or skill_config.get("plugin") or self._read_codex_plugin_name(repo_dir)
if not plugin_name:
plugin_name = os.path.basename(os.path.abspath(repo_dir))
marketplace_name = skill_config.get("marketplace_name", "evalbench-local-marketplace")
plugin_id = f"{plugin_name}@{marketplace_name}"
self.enabled_plugins.setdefault(plugin_id, {}).update(skill_config.get("config", {}) or {})
self._install_skills_from_source(repo_dir, skill_config)
elif action in ("copy", "link", "install") and path:
# Materialize the skill instead of symlinking so Codex sees a
Expand Down Expand Up @@ -341,7 +352,7 @@ def _clone_extension_repo(

def _register_codex_plugin(self, repo_dir: str, skill_config: dict):
"""Registers a local Codex plugin marketplace entry for cloned repos."""
plugin_name = skill_config.get("plugin_name") or self._read_codex_plugin_name(repo_dir)
plugin_name = skill_config.get("plugin_name") or skill_config.get("plugin") or self._read_codex_plugin_name(repo_dir)
if not plugin_name:
plugin_name = os.path.basename(os.path.abspath(repo_dir))

Expand All @@ -350,7 +361,7 @@ def _register_codex_plugin(self, repo_dir: str, skill_config: dict):
display_name = skill_config.get(
"marketplace_display_name", "EvalBench Local Skills")

plugins_dir = os.path.join(self.codex_config_dir, "plugins")
plugins_dir = os.path.join(self.fake_home, ".agents", "plugins")
os.makedirs(plugins_dir, exist_ok=True)
marketplace_path = os.path.join(plugins_dir, "marketplace.json")

Expand All @@ -370,11 +381,16 @@ def _register_codex_plugin(self, repo_dir: str, skill_config: dict):
logging.warning(
f"Failed to read Codex marketplace at {marketplace_path}: {e}")

# Compute path relative to fake_home so that it works as a local marketplace source
rel_path = os.path.relpath(os.path.abspath(repo_dir), self.fake_home)
if not (rel_path.startswith("./") or rel_path.startswith("../") or rel_path.startswith("/")):
rel_path = f"./{rel_path}"

entry = {
"name": plugin_name,
"source": {
"source": "local",
"path": os.path.abspath(repo_dir),
"path": rel_path,
},
"policy": {
"installation": "AVAILABLE",
Expand All @@ -395,6 +411,17 @@ def _register_codex_plugin(self, repo_dir: str, skill_config: dict):
logging.info(
f"Registered Codex plugin '{plugin_name}' in {marketplace_path}")

# Install the plugin via Codex CLI so it changes status from 'not installed' to 'installed, enabled'
cmd = ["npm", "exec", "--yes", self.codex_cli_version, "--", "plugin", "add", f"{plugin_name}@{marketplace_name}"]
try:
result = subprocess.run(cmd, env=self.env, check=False, capture_output=True, text=True)
if result.returncode == 0:
logging.info(f"Successfully installed Codex plugin '{plugin_name}@{marketplace_name}'")
else:
logging.error(f"Failed to install Codex plugin '{plugin_name}@{marketplace_name}': {result.stderr.strip()}")
except Exception as e:
logging.error(f"Error executing plugin installation command: {e}")

@staticmethod
def _read_codex_plugin_name(repo_dir: str) -> str:
plugin_json_path = os.path.join(repo_dir, ".codex-plugin", "plugin.json")
Expand Down Expand Up @@ -463,7 +490,39 @@ def _find_skill_dirs(source_dir: str) -> list[str]:
if os.path.exists(os.path.join(source_dir, entry, "SKILL.md"))
]

def _write_config_toml(self, mcp_servers_config: dict, extra_config: dict):
def _install_mcp_servers_from_repo(self, mcp_servers: list):
    """Clones plugin repositories that bundle MCP servers and enables them.

    Every entry in ``mcp_servers`` must be a dict that uses
    ``action: install_from_repo`` together with a ``url``; any other
    entry is logged as a warning and skipped. When
    ``_clone_extension_repo`` returns a repo directory, the plugin is
    registered, its skills are installed, and the id
    ``<plugin>@<marketplace>`` is recorded in ``self.enabled_plugins``
    along with the entry's optional ``config`` mapping.

    Args:
        mcp_servers: List of MCP server install configs.
    """
    setup_env = os.environ.copy()
    setup_env.update(self.env)

    plugins_dir = os.path.join(self.codex_config_dir, "plugins")
    os.makedirs(plugins_dir, exist_ok=True)

    for mcp_config in mcp_servers:
        if not isinstance(mcp_config, dict):
            logging.warning(f"Unsupported MCP server config: {mcp_config}")
            continue

        action = mcp_config.get("action")
        url = mcp_config.get("url")
        if action != "install_from_repo" or not url:
            logging.warning(
                f"Unsupported MCP server config: {mcp_config}. When "
                "'mcp_servers' is a list, each entry must use "
                "'action: install_from_repo' with 'url'.")
            continue

        repo_dir = self._clone_extension_repo(url, plugins_dir, setup_env)
        if not repo_dir:
            continue

        # Resolve the name with the same precedence the registration step
        # uses ("plugin_name", then "plugin", then the plugin manifest,
        # then the directory name). Otherwise an entry that sets
        # "plugin_name" would be registered under one name and enabled
        # under a different one.
        plugin_name = (
            mcp_config.get("plugin_name")
            or mcp_config.get("plugin")
            or self._read_codex_plugin_name(repo_dir)
            or os.path.basename(os.path.abspath(repo_dir))
        )
        self._register_codex_plugin(repo_dir, mcp_config)
        self._install_skills_from_source(repo_dir, mcp_config)

        marketplace_name = mcp_config.get(
            "marketplace_name", "evalbench-local-marketplace")
        plugin_id = f"{plugin_name}@{marketplace_name}"
        # Merge rather than overwrite so repeated entries for the same
        # plugin accumulate their options.
        self.enabled_plugins.setdefault(plugin_id, {}).update(
            mcp_config.get("config", {}) or {})

def _write_config_toml(self):
"""Writes Codex CLI's `config.toml` with MCP server declarations.

Accepts the same Gemini-style MCP shape the rest of evalbench uses
Expand All @@ -481,18 +540,28 @@ def _write_config_toml(self, mcp_servers_config: dict, extra_config: dict):
"""
lines: list[str] = []

extra_config = dict(self._DEFAULT_TOP_LEVEL_CONFIG)
extra_config.update(self.setup_config.get("config", {}))

for key, value in extra_config.items():
lines.append(f"{key} = {self._toml_value(value)}")
if extra_config:
lines.append("")

for server_name, config in mcp_servers_config.items():
for server_name, config in self.inline_mcp_servers.items():
translated = self._translate_mcp_config(server_name, dict(config))
lines.append(f"[mcp_servers.{self._toml_key(server_name)}]")
for key, value in translated.items():
lines.append(f"{key} = {self._toml_value(value)}")
lines.append("")

for plugin_id, options in self.enabled_plugins.items():
lines.append(f'[plugins."{plugin_id}"]')
lines.append("enabled = true")
if options:
lines.append(f"options = {self._toml_value(options)}")
lines.append("")

with open(self.config_path, "w") as f:
f.write("\n".join(lines).rstrip() + "\n")

Expand Down
Loading