diff --git a/packages/browseros-agent/apps/eval/configs/browseros-clado-001760-webbench-30-visible.json b/packages/browseros-agent/apps/eval/configs/browseros-clado-001760-webbench-30-visible.json new file mode 100644 index 000000000..12d0790f8 --- /dev/null +++ b/packages/browseros-agent/apps/eval/configs/browseros-clado-001760-webbench-30-visible.json @@ -0,0 +1,30 @@ +{ + "agent": { + "type": "clado-action", + "provider": "clado-action", + "model": "qwen3-vl-30b-a3b-instruct", + "apiKey": "", + "baseUrl": "https://clado-ai--clado-browseros-action-001760-actionmodel-generate.modal.run", + "temperature": 0.7 + }, + "dataset": "../data/local/webbench-2of4-30-visible.jsonl", + "output_dir": "../results", + "num_workers": 5, + "restart_server_per_task": true, + "browseros": { + "server_url": "http://127.0.0.1:9110", + "base_cdp_port": 9010, + "base_server_port": 9110, + "base_extension_port": 9310, + "load_extensions": true, + "headless": true + }, + "captcha": { + "api_key_env": "NOPECHA_API_KEY" + }, + "graders": ["performance_grader"], + "grader_api_key_env": "OPENROUTER_API_KEY", + "grader_base_url": "https://openrouter.ai/api/v1", + "grader_model": "openai/gpt-4.1", + "timeout_ms": 1800000 +} diff --git a/packages/browseros-agent/apps/eval/data/local/webbench-2of4-30-visible.jsonl b/packages/browseros-agent/apps/eval/data/local/webbench-2of4-30-visible.jsonl new file mode 100644 index 000000000..d1d059d4d --- /dev/null +++ b/packages/browseros-agent/apps/eval/data/local/webbench-2of4-30-visible.jsonl @@ -0,0 +1,30 @@ +{"query_id":"wb-2634","dataset":"webbench","query":"Use the search bar to find an article about the \"500 Greatest Albums\" list; then list the title and URL of the article. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.rollingstone.com","metadata":{"original_task_id":"wb-2634","website":"rollingstone.com","category":"READ","additional":{"webbench_id":2634,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Use the search bar to find an article about the \"500 Greatest Albums\" list; then list the title and URL of the article.","action_only_visible_task":true}} +{"query_id":"wb-2097","dataset":"webbench","query":"Run a SPARQL query that retrieves the population of all countries in Europe. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.wikidata.org/wiki/Wikidata:Main_Page","metadata":{"original_task_id":"wb-2097","website":"wikidata.org","category":"READ","additional":{"webbench_id":2097,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"PASS","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Run a SPARQL query that retrieves the population of all countries in Europe.","action_only_visible_task":true}} +{"query_id":"wb-2675","dataset":"webbench","query":"Navigate to the “Events & News” section and extract the headline and summary of the latest press release regarding UN peacekeeping operations. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.un.org/en","metadata":{"original_task_id":"wb-2675","website":"un.org","category":"READ","additional":{"webbench_id":2675,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"PASS","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Navigate to the “Events & News” section and extract the headline and summary of the latest press release regarding UN peacekeeping operations.","action_only_visible_task":true}} +{"query_id":"wb-24","dataset":"webbench","query":"Search for beachfront properties in Miami with nightly rates under $300, and list the top three property names along with their prices. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.airbnb.com","metadata":{"original_task_id":"wb-24","website":"airbnb.com","category":"READ","additional":{"webbench_id":24,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Search for beachfront properties in Miami with nightly rates under $300, and list the top three property names along with their prices.","action_only_visible_task":true}} +{"query_id":"wb-2124","dataset":"webbench","query":"Go to the events or webinars section and list the upcoming academic events, including dates and topics. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.wiley.com/en-us","metadata":{"original_task_id":"wb-2124","website":"wiley.com","category":"READ","additional":{"webbench_id":2124,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Go to the events or webinars section and list the upcoming academic events, including dates and topics.","action_only_visible_task":true}} +{"query_id":"wb-2287","dataset":"webbench","query":"Locate a Medium article from the publication \"The Startup\" and list its URL, title, and a brief summary of its content. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://medium.com/explore-topics","metadata":{"original_task_id":"wb-2287","website":"medium.com","category":"READ","additional":{"webbench_id":2287,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"PASS","skyvern_bb":"FAIL","openai_cua":"FAIL"}},"original_query":"Locate a Medium article from the publication \"The Startup\" and list its URL, title, and a brief summary of its content.","action_only_visible_task":true}} +{"query_id":"wb-1183","dataset":"webbench","query":"Check the trading summary for the NYSE and record its current value. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.nasdaq.com","metadata":{"original_task_id":"wb-1183","website":"nasdaq.com","category":"READ","additional":{"webbench_id":1183,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Check the trading summary for the NYSE and record its current value.","action_only_visible_task":true}} +{"query_id":"wb-2329","dataset":"webbench","query":"Use the advanced search filters to find listings near Times Square with a doorman and list the first three property names along with their prices. Verify they have a doorman or concierge. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://streeteasy.com","metadata":{"original_task_id":"wb-2329","website":"streeteasy.com","category":"READ","additional":{"webbench_id":2329,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Use the advanced search filters to find listings near Times Square with a doorman and list the first three property names along with their prices. Verify they have a doorman or concierge.","action_only_visible_task":true}} +{"query_id":"wb-2391","dataset":"webbench","query":"Use Bing Maps to find walking directions to Central Park in New York and copy the step-by-step route details provided. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.bing.com","metadata":{"original_task_id":"wb-2391","website":"bing.com","category":"READ","additional":{"webbench_id":2391,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Use Bing Maps to find walking directions to Central Park in New York and copy the step-by-step route details provided.","action_only_visible_task":true}} +{"query_id":"wb-2673","dataset":"webbench","query":"Use the search feature to find nearby restaurants offering vegan options and output the names and ratings of the top 5 results. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.ubereats.com","metadata":{"original_task_id":"wb-2673","website":"ubereats.com","category":"READ","additional":{"webbench_id":2673,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"PASS","skyvern_bb":"FAIL","openai_cua":"FAIL"}},"original_query":"Use the search feature to find nearby restaurants offering vegan options and output the names and ratings of the top 5 results.","action_only_visible_task":true}} +{"query_id":"wb-219","dataset":"webbench","query":"Locate a section or article on \"upcoming releases\" and list the titles and release dates of the movies or comics mentioned. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.cbr.com","metadata":{"original_task_id":"wb-219","website":"cbr.com","category":"READ","additional":{"webbench_id":219,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"FAIL","skyvern_bb":"PASS","openai_cua":"PASS"}},"original_query":"Locate a section or article on \"upcoming releases\" and list the titles and release dates of the movies or comics mentioned.","action_only_visible_task":true}} +{"query_id":"wb-814","dataset":"webbench","query":"Use the HubPages search function to find articles on travel and summarize the main idea of the first result. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://discover.hubpages.com/","metadata":{"original_task_id":"wb-814","website":"discover.hubpages.com","category":"READ","additional":{"webbench_id":814,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"PASS","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Use the HubPages search function to find articles on travel and summarize the main idea of the first result.","action_only_visible_task":true}} +{"query_id":"wb-1152","dataset":"webbench","query":"Search for articles mentioning \"Brexit\" and list the titles of the first five results. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://metro.co.uk","metadata":{"original_task_id":"wb-1152","website":"metro.co.uk","category":"READ","additional":{"webbench_id":1152,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Search for articles mentioning \"Brexit\" and list the titles of the first five results.","action_only_visible_task":true}} +{"query_id":"wb-1325","dataset":"webbench","query":"Access the digital magazine archive and identify the issue that covers hardware benchmarks; provide its publication month. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.pcgamer.com","metadata":{"original_task_id":"wb-1325","website":"pcgamer.com","category":"READ","additional":{"webbench_id":1325,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Access the digital magazine archive and identify the issue that covers hardware benchmarks; provide its publication month.","action_only_visible_task":true}} +{"query_id":"wb-1862","dataset":"webbench","query":"Access the Opinion section, search for commentary on “climate change,” and list the titles of the three most recent pieces. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.today.com","metadata":{"original_task_id":"wb-1862","website":"today.com","category":"READ","additional":{"webbench_id":1862,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Access the Opinion section, search for commentary on “climate change,” and list the titles of the three most recent pieces.","action_only_visible_task":true}} +{"query_id":"wb-2576","dataset":"webbench","query":"Use MDPI’s journal search filters to identify journals that offer an ultra-rapid publication process, then list the names and scopes of the first five journals displayed. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.mdpi.com","metadata":{"original_task_id":"wb-2576","website":"mdpi.com","category":"READ","additional":{"webbench_id":2576,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Use MDPI’s journal search filters to identify journals that offer an ultra-rapid publication process, then list the names and scopes of the first five journals displayed.","action_only_visible_task":true}} +{"query_id":"wb-102","dataset":"webbench","query":"Search for the app \"Spotify\" on APKPure and list the latest version number along with its release date as shown on the version history section. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://apkpure.com","metadata":{"original_task_id":"wb-102","website":"apkpure.com","category":"READ","additional":{"webbench_id":102,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"PASS","skyvern_bb":"PASS","openai_cua":"FAIL"}},"original_query":"Search for the app \"Spotify\" on APKPure and list the latest version number along with its release date as shown on the version history section.","action_only_visible_task":true}} +{"query_id":"wb-984","dataset":"webbench","query":"Use the historical trends tool to list the top three most amended clauses over the past year by frequency. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.lawinsider.com","metadata":{"original_task_id":"wb-984","website":"lawinsider.com","category":"READ","additional":{"webbench_id":984,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"FAIL","skyvern_bb":"PASS","openai_cua":"PASS"}},"original_query":"Use the historical trends tool to list the top three most amended clauses over the past year by frequency.","action_only_visible_task":true}} +{"query_id":"wb-390","dataset":"webbench","query":"Visit the Podcast section, select the latest episode, and provide its title along with a brief description. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.dw.com","metadata":{"original_task_id":"wb-390","website":"dw.com","category":"READ","additional":{"webbench_id":390,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"PASS","openai_cua":"FAIL"}},"original_query":"Visit the Podcast section, select the latest episode, and provide its title along with a brief description.","action_only_visible_task":true}} +{"query_id":"wb-2071","dataset":"webbench","query":"Filter the furniture category by \"sectional sofas\" and extract the dimensions, available colors, and material details of the first product listed. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.wayfair.com","metadata":{"original_task_id":"wb-2071","website":"wayfair.com","category":"READ","additional":{"webbench_id":2071,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"PASS","skyvern_bb":"PASS","openai_cua":"FAIL"}},"original_query":"Filter the furniture category by \"sectional sofas\" and extract the dimensions, available colors, and material details of the first product listed.","action_only_visible_task":true}} +{"query_id":"wb-2302","dataset":"webbench","query":"Use the search function to find tracks containing the keyword “ambient” and output a list of the top 10 track titles along with their durations. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://soundcloud.com","metadata":{"original_task_id":"wb-2302","website":"soundcloud.com","category":"READ","additional":{"webbench_id":2302,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"FAIL","skyvern_bb":"PASS","openai_cua":"PASS"}},"original_query":"Use the search function to find tracks containing the keyword “ambient” and output a list of the top 10 track titles along with their durations.","action_only_visible_task":true}} +{"query_id":"wb-1895","dataset":"webbench","query":"Filter hotel search results in Paris by selecting properties that offer free breakfast; then extract and list the names and average review scores of the first 5 hotels. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.trivago.com","metadata":{"original_task_id":"wb-1895","website":"trivago.com","category":"READ","additional":{"webbench_id":1895,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Filter hotel search results in Paris by selecting properties that offer free breakfast; then extract and list the names and average review scores of the first 5 hotels.","action_only_visible_task":true}} +{"query_id":"wb-605","dataset":"webbench","query":"Use the website’s filtering tools to display \"Retro Gaming\" articles and extract the titles of the top three most recent posts. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://gamerant.com","metadata":{"original_task_id":"wb-605","website":"gamerant.com","category":"READ","additional":{"webbench_id":605,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Use the website’s filtering tools to display \"Retro Gaming\" articles and extract the titles of the top three most recent posts.","action_only_visible_task":true}} +{"query_id":"wb-2159","dataset":"webbench","query":"In a forum discussion on regional variations in Spanish, summarize the key differences mentioned by the community members. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.wordreference.com","metadata":{"original_task_id":"wb-2159","website":"wordreference.com","category":"READ","additional":{"webbench_id":2159,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"PASS","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"In a forum discussion on regional variations in Spanish, summarize the key differences mentioned by the community members.","action_only_visible_task":true}} +{"query_id":"wb-1158","dataset":"webbench","query":"Access the Michigan Department of Health page via Michigan.gov and list the steps provided for scheduling a vaccination appointment. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.michigan.gov/som","metadata":{"original_task_id":"wb-1158","website":"michigan.gov","category":"READ","additional":{"webbench_id":1158,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"FAIL","skyvern_bb":"PASS","openai_cua":"PASS"}},"original_query":"Access the Michigan Department of Health page via Michigan.gov and list the steps provided for scheduling a vaccination appointment.","action_only_visible_task":true}} +{"query_id":"wb-2054","dataset":"webbench","query":"Search for COVID-19 vaccinations on Walgreens and list the available appointment options, eligibility criteria, and any cost details provided. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.walgreens.com","metadata":{"original_task_id":"wb-2054","website":"walgreens.com","category":"READ","additional":{"webbench_id":2054,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"PASS","openai_cua":"FAIL"}},"original_query":"Search for COVID-19 vaccinations on Walgreens and list the available appointment options, eligibility criteria, and any cost details provided.","action_only_visible_task":true}} +{"query_id":"wb-635","dataset":"webbench","query":"What email should I contact if I'm interesting in working for Genius as a journalist? Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://genius.com","metadata":{"original_task_id":"wb-635","website":"genius.com","category":"READ","additional":{"webbench_id":635,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"What email should I contact if I'm interesting in working for Genius as a journalist?","action_only_visible_task":true}} +{"query_id":"wb-1145","dataset":"webbench","query":"Browse the TV shows category and list the titles, metascores, and number of critic reviews for shows scoring below 60 with at least 10 critic reviews. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.metacritic.com","metadata":{"original_task_id":"wb-1145","website":"metacritic.com","category":"READ","additional":{"webbench_id":1145,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"PASS","skyvern_bb":"PASS","openai_cua":"FAIL"}},"original_query":"Browse the TV shows category and list the titles, metascores, and number of critic reviews for shows scoring below 60 with at least 10 critic reviews.","action_only_visible_task":true}} +{"query_id":"wb-1161","dataset":"webbench","query":"Access the MLB.TV subscription page and extract the available pricing options and plan durations offered. Complete the task by navigating until the requested information is clearly visible on the page. 
Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://www.mlb.com","metadata":{"original_task_id":"wb-1161","website":"mlb.com","category":"READ","additional":{"webbench_id":1161,"difficulty":"easy","pass_count_4":2,"agent_results":{"anthropic_cua":"FAIL","skyvern_2":"PASS","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Access the MLB.TV subscription page and extract the available pricing options and plan durations offered.","action_only_visible_task":true}} +{"query_id":"wb-2289","dataset":"webbench","query":"Search for the latest article about holiday recipes on Parade.com and summarize the key steps or ingredients mentioned in the recipe. Complete the task by navigating until the requested information is clearly visible on the page. Do not prepare a written response; stop when the relevant evidence is visible.","start_url":"https://parade.com","metadata":{"original_task_id":"wb-2289","website":"parade.com","category":"READ","additional":{"webbench_id":2289,"difficulty":"hard","pass_count_4":2,"agent_results":{"anthropic_cua":"PASS","skyvern_2":"FAIL","skyvern_bb":"FAIL","openai_cua":"PASS"}},"original_query":"Search for the latest article about holiday recipes on Parade.com and summarize the key steps or ingredients mentioned in the recipe.","action_only_visible_task":true}} diff --git a/packages/browseros-agent/apps/eval/src/agents/clado-action/index.ts b/packages/browseros-agent/apps/eval/src/agents/clado-action/index.ts new file mode 100644 index 000000000..2beb95f5a --- /dev/null +++ b/packages/browseros-agent/apps/eval/src/agents/clado-action/index.ts @@ -0,0 +1,199 @@ +/** + * Direct Clado Action evaluator. + * + * Runs the visual action model directly against the full task instruction, + * without an LLM orchestrator in front of it. 
+ */
+
+import { Browser } from '@browseros/server/browser'
+import { CdpBackend } from '@browseros/server/browser/backends/cdp'
+import { CaptchaWaiter } from '../../capture/captcha-waiter'
+import { DEFAULT_TIMEOUT_MS } from '../../constants'
+import type { CladoActionConfig, EvalConfig, TaskMetadata } from '../../types'
+import type { UIMessageStreamEvent } from '../../types/message'
+import { resolveEnvValue } from '../../utils/resolve-env'
+import { withEvalTimeout } from '../../utils/with-eval-timeout'
+import { CladoActionExecutor } from '../orchestrator-executor/clado-action-executor'
+import type { ExecutorCallbacks } from '../orchestrator-executor/executor'
+import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
+
+/**
+ * Derives the CDP port for this worker from the configured server URL.
+ *
+ * The offset between the port found in `browseros.server_url` and
+ * `browseros.base_server_port` is added to `browseros.base_cdp_port`.
+ * When the URL carries no explicit port, `base_cdp_port` is returned as is.
+ *
+ * @param config - Eval config providing the `browseros` port settings.
+ * @returns The CDP port to connect to.
+ */
+function extractCdpPort(config: EvalConfig): number {
+  const serverUrl = config.browseros.server_url
+  // Accept an optional trailing slash/path after the port so a URL such as
+  // "http://127.0.0.1:9111/" does not silently fall back to the base port.
+  const match = serverUrl.match(/:(\d+)(?:\/.*)?$/)
+  if (!match) return config.browseros.base_cdp_port
+  const serverPort = Number.parseInt(match[1], 10)
+  const workerOffset = serverPort - config.browseros.base_server_port
+  return config.browseros.base_cdp_port + workerOffset
+}
+
+/**
+ * Evaluator that hands the task query straight to a `CladoActionExecutor`
+ * and records the resulting tool calls, screenshots and metadata through
+ * the capture context.
+ */
+export class CladoActionEvaluator implements AgentEvaluator {
+  constructor(private ctx: AgentContext) {}
+
+  /**
+   * Runs the task once and saves its trajectory metadata.
+   *
+   * @returns The saved metadata, the captured messages and the final answer
+   *   (the executor's last observation, or `null` if none was produced).
+   * @throws Error when `config.agent.type` is not `'clado-action'`.
+   */
+  async execute(): Promise<AgentResult> {
+    const { config, task, capture } = this.ctx
+    const startTime = Date.now()
+    const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
+
+    await capture.messageLogger.logUser(task.query)
+
+    if (config.agent.type !== 'clado-action') {
+      throw new Error('CladoActionEvaluator requires clado-action config')
+    }
+
+    const agentConfig = config.agent as CladoActionConfig
+    const cdpPort = extractCdpPort(config)
+    const cdp = new CdpBackend({ port: cdpPort })
+    await cdp.connect()
+
+    // Everything after a successful connect runs inside try/finally so the
+    // CDP connection (and the executor, once created) is always released,
+    // including when setup itself throws.
+    let executor: CladoActionExecutor | null = null
+
+    try {
+      const browser = new Browser(cdp)
+      capture.screenshot.setBrowser(browser)
+
+      const captchaWaiter = config.captcha
+        ? new CaptchaWaiter({
+            waitTimeoutMs: config.captcha.wait_timeout_ms,
+            pollIntervalMs: config.captcha.poll_interval_ms,
+          })
+        : null
+
+      const callbacks: ExecutorCallbacks = {
+        // Track which page the tool call targets so later screenshots are
+        // taken from that page.
+        onToolCallStart: ({ input }) => {
+          const args = input as Record<string, unknown> | undefined
+          if (args && typeof args.page === 'number') {
+            capture.setActivePageId(args.page)
+          }
+        },
+        onToolCallFinish: async () => {
+          try {
+            if (captchaWaiter) {
+              await captchaWaiter.waitIfCaptchaPresent(
+                browser,
+                capture.getActivePageId(),
+              )
+            }
+            const screenshotNum = await capture.screenshot.capture(
+              capture.getActivePageId(),
+            )
+            capture.emitEvent(task.query_id, {
+              type: 'screenshot-captured',
+              screenshot: screenshotNum,
+            })
+          } catch {
+            // Screenshot failures are non-fatal.
+          }
+        },
+        // Mirror each step into the message log and the live event stream.
+        onStepFinish: async ({ toolCalls, toolResults, text }) => {
+          if (toolCalls) {
+            for (const tc of toolCalls) {
+              const inputEvent: UIMessageStreamEvent = {
+                type: 'tool-input-available',
+                toolCallId: tc.toolCallId,
+                toolName: tc.toolName,
+                input: tc.input,
+              }
+              await capture.messageLogger.logStreamEvent(inputEvent)
+              capture.emitEvent(task.query_id, inputEvent)
+            }
+          }
+          if (toolResults) {
+            for (const tr of toolResults) {
+              const outputEvent: UIMessageStreamEvent = {
+                type: 'tool-output-available',
+                toolCallId: tr.toolCallId,
+                output: tr.output,
+              }
+              await capture.messageLogger.logStreamEvent(outputEvent)
+              capture.emitEvent(task.query_id, outputEvent)
+            }
+          }
+          if (text) {
+            const textId = crypto.randomUUID()
+            const startEvent: UIMessageStreamEvent = {
+              type: 'text-start',
+              id: textId,
+            }
+            const deltaEvent: UIMessageStreamEvent = {
+              type: 'text-delta',
+              id: textId,
+              delta: text,
+            }
+            const endEvent: UIMessageStreamEvent = {
+              type: 'text-end',
+              id: textId,
+            }
+            // The full start/delta/end triple is logged; only the delta is
+            // emitted as a live event.
+            await capture.messageLogger.logStreamEvent(startEvent)
+            await capture.messageLogger.logStreamEvent(deltaEvent)
+            await capture.messageLogger.logStreamEvent(endEvent)
+            capture.emitEvent(task.query_id, deltaEvent)
+          }
+        },
+      }
+
+      const runner = new CladoActionExecutor(
+        {
+          provider: agentConfig.provider,
+          model: agentConfig.model,
+          apiKey: resolveEnvValue(agentConfig.apiKey) ?? '',
+          baseUrl: agentConfig.baseUrl,
+          temperature: agentConfig.temperature,
+        },
+        config.browseros.server_url,
+        undefined,
+        undefined,
+        this.ctx.initialPageId,
+      )
+      // Publish to the outer binding right away so `finally` can close it.
+      executor = runner
+      runner.setCallbacks(callbacks)
+
+      let finalAnswer: string | null = null
+      let totalSteps = 0
+
+      const { terminationReason, result } = await withEvalTimeout(
+        timeoutMs,
+        capture,
+        async (signal) => {
+          const execution = await runner.execute(task.query, signal)
+          finalAnswer = execution.observation
+          totalSteps = execution.actionsPerformed
+
+          // Any status other than 'done' or 'timeout' is recorded as an
+          // agent execution error.
+          if (execution.status !== 'done' && execution.status !== 'timeout') {
+            capture.addError('agent_execution', execution.observation)
+          }
+
+          return execution
+        },
+      )
+
+      const endTime = Date.now()
+      const metadata: TaskMetadata = {
+        query_id: task.query_id,
+        dataset: task.dataset,
+        query: task.query,
+        started_at: new Date(startTime).toISOString(),
+        completed_at: new Date(endTime).toISOString(),
+        total_duration_ms: endTime - startTime,
+        total_steps: result?.actionsPerformed ?? totalSteps,
+        termination_reason: terminationReason,
+        final_answer: finalAnswer,
+        errors: capture.getErrors(),
+        warnings: capture.getWarnings(),
+        device_pixel_ratio: capture.screenshot.getDevicePixelRatio(),
+        agent_config: {
+          type: 'clado-action',
+          model: agentConfig.model,
+          temperature: agentConfig.temperature,
+        },
+        grader_results: {},
+      }
+
+      await capture.trajectorySaver.saveMetadata(metadata)
+
+      return {
+        metadata,
+        messages: capture.getMessages(),
+        finalAnswer,
+      }
+    } finally {
+      // Best-effort teardown: failures here must not mask the task outcome.
+      await executor?.close().catch(() => {})
+      await cdp.disconnect().catch(() => {})
+    }
+  }
+}
diff --git a/packages/browseros-agent/apps/eval/src/agents/index.ts b/packages/browseros-agent/apps/eval/src/agents/index.ts
index 1ace1f82a..c863baa2c 100644
--- a/packages/browseros-agent/apps/eval/src/agents/index.ts
+++ b/packages/browseros-agent/apps/eval/src/agents/index.ts
@@ -1,3 +1,4 @@
+import { CladoActionEvaluator } from './clado-action'
 import { GeminiComputerUseEvaluator } from './gemini-computer-use'
 import { OrchestratorExecutorEvaluator } from './orchestrator-executor'
 import { registerAgent } from './registry'
@@ -10,6 +11,7 @@ registerAgent(
   'orchestrator-executor',
   (ctx) => new OrchestratorExecutorEvaluator(ctx),
 )
+registerAgent('clado-action', (ctx) => new CladoActionEvaluator(ctx))
 registerAgent(
   'gemini-computer-use',
   (ctx) => new GeminiComputerUseEvaluator(ctx),
diff --git a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/clado-action-executor.ts b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/clado-action-executor.ts
index 3108f8f8c..2408c8fa2 100644
--- a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/clado-action-executor.ts
+++ b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/clado-action-executor.ts
@@ -371,6 +371,9 @@ export class CladoActionExecutor {
           instruction,
           image_base64: imageBase64,
           history: this.formatHistory(actionHistory),
+          ...(typeof
this.config.temperature === 'number' + ? { temperature: this.config.temperature } + : {}), }), signal: requestController.signal, }) diff --git a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/executor.ts b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/executor.ts index 4103ca8e3..87c80e888 100644 --- a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/executor.ts +++ b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/executor.ts @@ -64,7 +64,7 @@ export class Executor { private cladoExecutor: CladoActionExecutor | null = null private stepsUsed = 0 private currentUrl = '' - private configTemplate: ResolvedAgentConfig + private configTemplate: ResolvedAgentConfig & { temperature?: number } private isCladoAction: boolean private browser: Browser | null private serverUrl: string @@ -74,7 +74,7 @@ export class Executor { private callbacks: ExecutorCallbacks constructor( - configTemplate: ResolvedAgentConfig, + configTemplate: ResolvedAgentConfig & { temperature?: number }, browser: Browser | null, serverUrl: string, options?: { @@ -107,6 +107,7 @@ export class Executor { model: this.configTemplate.model, apiKey: this.configTemplate.apiKey ?? 
'', baseUrl: this.configTemplate.baseUrl, + temperature: this.configTemplate.temperature, }, this.serverUrl, this.windowId, diff --git a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/index.ts b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/index.ts index 6ebaefbda..17cc8d3ea 100644 --- a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/index.ts +++ b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/index.ts @@ -41,7 +41,7 @@ function extractCdpPort(config: EvalConfig): number { interface ResolvedConfigs { orchestratorConfig: ResolvedAgentConfig & { maxTurns?: number } - executorConfig: ResolvedAgentConfig + executorConfig: ResolvedAgentConfig & { temperature?: number } isCladoAction: boolean } @@ -87,7 +87,7 @@ async function resolveAgentConfig( const isCladoAction = config.executor.provider === 'clado-action' - let executorConfig: ResolvedAgentConfig + let executorConfig: ResolvedAgentConfig & { temperature?: number } if (isCladoAction) { executorConfig = { conversationId: crypto.randomUUID(), @@ -95,6 +95,7 @@ async function resolveAgentConfig( model: executorModel, apiKey: resolveEnvValue(config.executor.apiKey), baseUrl: config.executor.baseUrl, + temperature: config.executor.temperature, workingDir: `/tmp/browseros-eval-executor-${crypto.randomUUID()}`, } } else { diff --git a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/types.ts b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/types.ts index 3a758e227..cf8d6301a 100644 --- a/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/types.ts +++ b/packages/browseros-agent/apps/eval/src/agents/orchestrator-executor/types.ts @@ -15,6 +15,7 @@ export interface ExecutorConfig { model: string apiKey: string baseUrl?: string + temperature?: number } export const ORCHESTRATOR_DEFAULTS = { diff --git a/packages/browseros-agent/apps/eval/src/constants.ts 
b/packages/browseros-agent/apps/eval/src/constants.ts index 1db4d1d1b..abc61e675 100644 --- a/packages/browseros-agent/apps/eval/src/constants.ts +++ b/packages/browseros-agent/apps/eval/src/constants.ts @@ -4,5 +4,5 @@ export const DEFAULT_TIMEOUT_MS = 30 * 60 * 1000 // 30 minutes export const SCREENSHOT_TIMEOUT_MS = 65_000 // 65s — ensures we get extension's error (60s) -export const MAX_ACTIONS_PER_DELEGATION = 15 +export const MAX_ACTIONS_PER_DELEGATION = 50 export const CLADO_REQUEST_TIMEOUT_MS = 120_000 diff --git a/packages/browseros-agent/apps/eval/src/graders/performance/axes.ts b/packages/browseros-agent/apps/eval/src/graders/performance/axes.ts index ea32042dc..53c3fec48 100644 --- a/packages/browseros-agent/apps/eval/src/graders/performance/axes.ts +++ b/packages/browseros-agent/apps/eval/src/graders/performance/axes.ts @@ -39,6 +39,10 @@ export const DEFAULT_AXES: AxisDefinition[] = [ }, ] +export interface BuildUserPromptOptions { + stateOnlyMode?: boolean +} + export const PERFORMANCE_SYSTEM_PROMPT = `You are a performance evaluator for a browser automation agent. You will score how well the agent executed a web task across multiple axes. ## Data Files @@ -102,6 +106,8 @@ When the agent's final answer contains specific data (prices, names, dates, coun - Task asks "extract the email address" → grep for the email pattern This is the most reliable way to verify whether the agent actually found the data it claims, since screenshots may be blurry, truncated, or missing the relevant section. +For action-only agents, the final answer may be absent or may contain executor status text instead of a user-facing response. In that case, judge task_completion from the final browser state, screenshots, action sequence, and DOM evidence. Do not penalize the agent solely for lacking a textual final answer when the task contract asks it to leave the requested evidence visible on screen. + ## How to View Screenshots You have {screenshot_count} screenshots. 
View 3-5 strategically: @@ -169,6 +175,7 @@ export function buildUserPrompt( metrics: PreComputedMetrics, axes: AxisDefinition[], expectedAnswer?: string | null, + options: BuildUserPromptOptions = {}, ): string { const axesBlock = axes .map((a) => `- **${a.name}** (weight: ${a.weight}): ${a.description}`) @@ -180,11 +187,16 @@ export function buildUserPrompt( ? `\n## Expected Answer (Ground Truth)\n${expectedAnswer}\n\nWhen scoring task_completion, compare the agent's final answer against this ground truth. Consider semantic equivalence, partial correctness, and completeness. Award partial credit where the agent got some but not all parts right.` : '' + const stateOnlyBlock = options.stateOnlyMode + ? `\n## Action-Only Browser-State Evaluation\nThis run used an action-only Clado executor. The model can click, type, scroll, wait, and emit end(), but it has no supported channel for a separate natural-language final answer. Treat the browser state as the output. Score task_completion by verifying whether the requested information or destination is visible or otherwise evidenced in screenshots/messages.jsonl. Ignore executor status text such as max-budget or end-observation as a substantive answer. Still penalize repeated loops, unnecessary actions, CAPTCHA blockage, unsupported claims in reasoning, and failure to reach visible evidence.\n` + : '' + return `## Task ${taskQuery} +${stateOnlyBlock} ## Agent's Final Answer -${finalAnswer || '[No answer provided]'} +${options.stateOnlyMode ? 
'[Action-only run: judge browser state, not this field]' : finalAnswer || '[No answer provided]'} ${expectedAnswerBlock} ## Pre-Computed Metrics ${metricsBlock} diff --git a/packages/browseros-agent/apps/eval/src/graders/performance/performance-grader.ts b/packages/browseros-agent/apps/eval/src/graders/performance/performance-grader.ts index 127bafd7c..c6eca18b1 100644 --- a/packages/browseros-agent/apps/eval/src/graders/performance/performance-grader.ts +++ b/packages/browseros-agent/apps/eval/src/graders/performance/performance-grader.ts @@ -47,6 +47,7 @@ export class PerformanceGrader implements Grader { try { // Read termination reason from metadata.json let terminationReason = 'unknown' + let stateOnlyMode = false try { const metadataRaw = await readFile( join(input.outputDir, 'metadata.json'), @@ -54,6 +55,7 @@ export class PerformanceGrader implements Grader { ) const metadata = JSON.parse(metadataRaw) terminationReason = metadata.termination_reason || 'unknown' + stateOnlyMode = metadata.agent_config?.type === 'clado-action' } catch { // metadata.json may not exist } @@ -75,6 +77,7 @@ export class PerformanceGrader implements Grader { metrics, this.axes, input.expectedAnswer, + { stateOnlyMode }, ) const response = await this.runAgent( diff --git a/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts b/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts index 7566b5897..794f38f3c 100644 --- a/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts +++ b/packages/browseros-agent/apps/eval/src/runner/browseros-app-manager.ts @@ -14,6 +14,7 @@ */ import { + cpSync, existsSync, mkdtempSync, readFileSync, @@ -28,7 +29,7 @@ import { sleep } from '../utils/sleep' const MAX_RESTART_ATTEMPTS = 3 const CDP_WAIT_TIMEOUT_MS = 30_000 -const SERVER_HEALTH_TIMEOUT_MS = 30_000 +const SERVER_HEALTH_TIMEOUT_MS = 120_000 const MONOREPO_ROOT = join( dirname(fileURLToPath(import.meta.url)), @@ -39,10 +40,14 @@ const 
BROWSEROS_BINARY = process.env.BROWSEROS_BINARY || '/Applications/BrowserOS.app/Contents/MacOS/BrowserOS' -const CAPTCHA_EXT_DIR = join( - dirname(fileURLToPath(import.meta.url)), - '../../extensions/nopecha', -) +const CAPTCHA_EXT_DIR = + process.env.NOPECHA_EXTENSION_DIR || + join(dirname(fileURLToPath(import.meta.url)), '../../extensions/nopecha') +let patchedCaptchaExtDir: string | null = null + +function getCaptchaExtDir(): string { + return patchedCaptchaExtDir || CAPTCHA_EXT_DIR +} export class BrowserOSAppManager { private ports: EvalPorts @@ -127,12 +132,20 @@ export class BrowserOSAppManager { ` [W${this.workerIndex}] Ports: CDP=${cdp} Server=${server} Extension=${extension}${this.headless ? ' (headless)' : ''}`, ) console.log(` [W${this.workerIndex}] Profile: ${this.tempDir}`) + if (!existsSync(BROWSEROS_BINARY)) { + throw new Error(`BrowserOS binary not found: ${BROWSEROS_BINARY}`) + } + console.log( + ` [W${this.workerIndex}] BrowserOS binary: ${BROWSEROS_BINARY}`, + ) // --- Chrome Launch (matches start.ts startManualBrowser) --- const chromeArgs = [ '--no-first-run', '--no-default-browser-check', '--use-mock-keychain', + '--show-component-extension-options', + '--no-sandbox', '--disable-browseros-server', '--disable-browseros-extensions', ...(this.headless ? 
['--headless=new'] : []), @@ -144,11 +157,20 @@ export class BrowserOSAppManager { ] const extensions: string[] = [] - if (this.loadExtensions && existsSync(CAPTCHA_EXT_DIR)) { - extensions.push(CAPTCHA_EXT_DIR) + if (this.loadExtensions) { + const captchaExtDir = getCaptchaExtDir() + const manifestPath = join(captchaExtDir, 'manifest.json') + if (!existsSync(manifestPath)) { + throw new Error(`NopeCHA extension not found: ${captchaExtDir}`) + } + extensions.push(captchaExtDir) } if (extensions.length > 0) { + chromeArgs.push(`--disable-extensions-except=${extensions.join(',')}`) chromeArgs.push(`--load-extension=${extensions.join(',')}`) + console.log( + ` [W${this.workerIndex}] Loading extensions: ${extensions.join(',')}`, + ) } chromeArgs.push('about:blank') @@ -159,7 +181,7 @@ export class BrowserOSAppManager { stderr: 'ignore', }) console.log( - ` [W${this.workerIndex}] Chrome started (PID: ${this.chromeProc.pid})`, + ` [W${this.workerIndex}] BrowserOS started (PID: ${this.chromeProc.pid})`, ) // --- Wait for CDP --- @@ -307,16 +329,32 @@ export class BrowserOSAppManager { * Call once before launching any workers — the extension directory is shared. 
*/ static patchNopechaApiKey(apiKey: string): void { - const manifestPath = join(CAPTCHA_EXT_DIR, 'manifest.json') - if (!existsSync(manifestPath)) { - console.log( - '[BROWSEROS] NopeCHA extension not found, skipping API key patch', - ) - return + const trimmedApiKey = apiKey.trim() + if (!trimmedApiKey) { + throw new Error('NopeCHA API key is empty') } + const sourceManifestPath = join(CAPTCHA_EXT_DIR, 'manifest.json') + if (!existsSync(sourceManifestPath)) { + throw new Error(`NopeCHA extension not found: ${CAPTCHA_EXT_DIR}`) + } + if (patchedCaptchaExtDir?.startsWith('/tmp/browseros-nopecha-')) { + rmSync(patchedCaptchaExtDir, { recursive: true, force: true }) + } + patchedCaptchaExtDir = mkdtempSync('/tmp/browseros-nopecha-') + cpSync(CAPTCHA_EXT_DIR, patchedCaptchaExtDir, { recursive: true }) + const manifestPath = join(patchedCaptchaExtDir, 'manifest.json') const manifest = JSON.parse(readFileSync(manifestPath, 'utf-8')) - manifest.nopecha = { ...manifest.nopecha, key: apiKey } + manifest.nopecha = { + ...manifest.nopecha, + key: trimmedApiKey, + keys: [trimmedApiKey], + enabled: true, + recaptcha_auto_solve: true, + hcaptcha_auto_solve: true, + funcaptcha_auto_solve: true, + turnstile_auto_solve: true, + } writeFileSync(manifestPath, JSON.stringify(manifest, null, 2)) - console.log('[BROWSEROS] NopeCHA API key patched') + console.log(`[BROWSEROS] NopeCHA API key patched: ${manifestPath}`) } } diff --git a/packages/browseros-agent/apps/eval/src/runner/parallel-executor.ts b/packages/browseros-agent/apps/eval/src/runner/parallel-executor.ts index 4766b1bc4..c3b3831b4 100644 --- a/packages/browseros-agent/apps/eval/src/runner/parallel-executor.ts +++ b/packages/browseros-agent/apps/eval/src/runner/parallel-executor.ts @@ -89,13 +89,20 @@ export class ParallelExecutor { const loadExtensions = this.config.config.browseros.load_extensions ?? 
false - // Patch NopeCHA API key before launching any workers + // Patch NopeCHA API key before launching any workers when extensions are enabled. const captchaConfig = this.config.config.captcha - if (captchaConfig) { + if (loadExtensions && captchaConfig) { const apiKey = process.env[captchaConfig.api_key_env] - if (apiKey) { - BrowserOSAppManager.patchNopechaApiKey(apiKey) + if (!apiKey) { + throw new Error( + `${captchaConfig.api_key_env} is required when BrowserOS extensions are enabled`, + ) } + BrowserOSAppManager.patchNopechaApiKey(apiKey) + } else if (loadExtensions) { + console.warn( + '[BROWSEROS] Extensions enabled, but no captcha config was provided; NopeCHA key was not patched', + ) } this.queue = new TaskQueue(tasks) diff --git a/packages/browseros-agent/apps/eval/src/types/config.ts b/packages/browseros-agent/apps/eval/src/types/config.ts index bf181535b..ed0d86a74 100644 --- a/packages/browseros-agent/apps/eval/src/types/config.ts +++ b/packages/browseros-agent/apps/eval/src/types/config.ts @@ -16,9 +16,19 @@ export const OrchestratorExecutorConfigSchema = z.object({ }), executor: LLMConfigSchema.extend({ provider: z.union([LLMProviderSchema, z.literal('clado-action')]), + temperature: z.number().min(0).max(2).optional(), }), }) +export const CladoActionConfigSchema = z.object({ + type: z.literal('clado-action'), + provider: z.literal('clado-action'), + model: z.string().min(1), + apiKey: z.string().optional().default(''), + baseUrl: z.string().url(), + temperature: z.number().min(0).max(2).optional(), +}) + export const GeminiComputerUseConfigSchema = z.object({ type: z.literal('gemini-computer-use'), apiKey: z @@ -48,6 +58,7 @@ export const YutoriNavigatorConfigSchema = z.object({ export const AgentConfigSchema = z.discriminatedUnion('type', [ SingleAgentConfigSchema, OrchestratorExecutorConfigSchema, + CladoActionConfigSchema, GeminiComputerUseConfigSchema, YutoriNavigatorConfigSchema, ]) @@ -84,6 +95,7 @@ export type SingleAgentConfig = z.infer 
export type OrchestratorExecutorConfig = z.infer< typeof OrchestratorExecutorConfigSchema > +export type CladoActionConfig = z.infer<typeof CladoActionConfigSchema> export type GeminiComputerUseConfig = z.infer< typeof GeminiComputerUseConfigSchema > diff --git a/packages/browseros-agent/apps/eval/src/types/index.ts b/packages/browseros-agent/apps/eval/src/types/index.ts index e55f73a71..594f66988 100644 --- a/packages/browseros-agent/apps/eval/src/types/index.ts +++ b/packages/browseros-agent/apps/eval/src/types/index.ts @@ -2,6 +2,8 @@ export { type AgentConfig, AgentConfigSchema, + type CladoActionConfig, + CladoActionConfigSchema, type EvalConfig, EvalConfigSchema, type GeminiComputerUseConfig, diff --git a/packages/browseros-agent/apps/eval/src/types/result.ts b/packages/browseros-agent/apps/eval/src/types/result.ts index 29fa3f5c9..5d656fdc5 100644 --- a/packages/browseros-agent/apps/eval/src/types/result.ts +++ b/packages/browseros-agent/apps/eval/src/types/result.ts @@ -16,6 +16,7 @@ const AgentConfigMetaSchema = z type: z.enum([ 'single', 'orchestrator-executor', + 'clado-action', 'gemini-computer-use', 'yutori-navigator', ]), diff --git a/packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts b/packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts index 071375d84..93514eb74 100644 --- a/packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts +++ b/packages/browseros-agent/apps/eval/tests/e2e/captcha-e2e.ts @@ -46,7 +46,7 @@ const EVAL_CONFIG = { base_cdp_port: 9010, base_server_port: 9110, base_extension_port: 9310, - load_extensions: false, + load_extensions: true, headless: false, }, captcha: { api_key_env: 'NOPECHA_API_KEY' },