Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"agent": {
"type": "clado-action",
"provider": "clado-action",
"model": "qwen3-vl-30b-a3b-instruct",
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-001760-actionmodel-generate.modal.run",
"temperature": 0.7
},
"dataset": "../data/local/webbench-2of4-30-visible.jsonl",
"output_dir": "../results",
"num_workers": 5,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": true,
"headless": true
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

Large diffs are not rendered by default.

199 changes: 199 additions & 0 deletions packages/browseros-agent/apps/eval/src/agents/clado-action/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/**
* Direct Clado Action evaluator.
*
* Runs the visual action model directly against the full task instruction,
* without an LLM orchestrator in front of it.
*/

import { Browser } from '@browseros/server/browser'
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { CaptchaWaiter } from '../../capture/captcha-waiter'
import { DEFAULT_TIMEOUT_MS } from '../../constants'
import type { CladoActionConfig, EvalConfig, TaskMetadata } from '../../types'
import type { UIMessageStreamEvent } from '../../types/message'
import { resolveEnvValue } from '../../utils/resolve-env'
import { withEvalTimeout } from '../../utils/with-eval-timeout'
import { CladoActionExecutor } from '../orchestrator-executor/clado-action-executor'
import type { ExecutorCallbacks } from '../orchestrator-executor/executor'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'

/**
 * Derives the CDP port for this worker from the eval config.
 *
 * Each worker's server listens on base_server_port + workerOffset and its
 * CDP endpoint on base_cdp_port + the same offset, so the offset is
 * recovered from the port embedded in `browseros.server_url`.
 *
 * @param config - Full eval config; only `config.browseros` is read.
 * @returns The CDP port for this worker, or `base_cdp_port` when the
 *   server URL carries no explicit port or cannot be parsed at all.
 */
function extractCdpPort(config: EvalConfig): number {
  const { server_url, base_cdp_port, base_server_port } = config.browseros
  try {
    // URL parsing is more robust than the previous `/:(\d+)$/` regex:
    // it also handles URLs with a trailing slash or path component
    // (e.g. "http://127.0.0.1:9112/"), which the regex silently missed,
    // sending every such worker to the base CDP port.
    const port = new URL(server_url).port
    if (!port) return base_cdp_port
    const workerOffset = Number.parseInt(port, 10) - base_server_port
    return base_cdp_port + workerOffset
  } catch {
    // Malformed URL — fall back to the base port rather than crash.
    return base_cdp_port
  }
}

/**
 * Evaluator that runs the Clado visual action model directly against the
 * task instruction, with no LLM orchestrator in front of it (see file
 * header). Implements the `AgentEvaluator` contract: a single `execute()`
 * call that drives the task to completion and returns an `AgentResult`.
 */
export class CladoActionEvaluator implements AgentEvaluator {
  constructor(private ctx: AgentContext) {}

  /**
   * Runs one task end-to-end: connects to the worker's CDP endpoint,
   * wires capture/captcha callbacks into a `CladoActionExecutor`, executes
   * the task query under the configured timeout, and persists metadata.
   *
   * @returns Task metadata, the captured message log, and the final answer
   *   (the executor's last observation, which may be `null`).
   * @throws Error when `config.agent.type` is not `'clado-action'`.
   */
  async execute(): Promise<AgentResult> {
    const { config, task, capture } = this.ctx
    const startTime = Date.now()
    const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS

    // Record the user query first so it appears even if setup below fails.
    await capture.messageLogger.logUser(task.query)

    if (config.agent.type !== 'clado-action') {
      throw new Error('CladoActionEvaluator requires clado-action config')
    }

    // Safe after the type-tag check above; gives access to provider/model/
    // apiKey/baseUrl/temperature fields specific to clado-action.
    const agentConfig = config.agent as CladoActionConfig
    const cdpPort = extractCdpPort(config)
    const cdp = new CdpBackend({ port: cdpPort })
    await cdp.connect()
    const browser = new Browser(cdp)
    // Screenshot capture drives the same browser connection.
    capture.screenshot.setBrowser(browser)

    // Captcha handling is optional: only built when the config enables it.
    const captchaWaiter = config.captcha
      ? new CaptchaWaiter({
          waitTimeoutMs: config.captcha.wait_timeout_ms,
          pollIntervalMs: config.captcha.poll_interval_ms,
        })
      : null

    const callbacks: ExecutorCallbacks = {
      // Track which page the executor is acting on so screenshots and
      // captcha checks target the right tab.
      onToolCallStart: ({ input }) => {
        const args = input as Record<string, unknown> | undefined
        if (args && typeof args.page === 'number') {
          capture.setActivePageId(args.page)
        }
      },
      // After each tool call: optionally wait out a captcha, then capture
      // a screenshot and announce it on the event stream.
      onToolCallFinish: async () => {
        try {
          if (captchaWaiter) {
            await captchaWaiter.waitIfCaptchaPresent(
              browser,
              capture.getActivePageId(),
            )
          }
          const screenshotNum = await capture.screenshot.capture(
            capture.getActivePageId(),
          )
          capture.emitEvent(task.query_id, {
            type: 'screenshot-captured',
            screenshot: screenshotNum,
          })
        } catch {
          // Screenshot failures are non-fatal.
        }
      },
      // After each executor step: mirror tool inputs, tool outputs, and
      // any text into the UI message stream (logged and emitted).
      onStepFinish: async ({ toolCalls, toolResults, text }) => {
        if (toolCalls) {
          for (const tc of toolCalls) {
            const inputEvent: UIMessageStreamEvent = {
              type: 'tool-input-available',
              toolCallId: tc.toolCallId,
              toolName: tc.toolName,
              input: tc.input,
            }
            await capture.messageLogger.logStreamEvent(inputEvent)
            capture.emitEvent(task.query_id, inputEvent)
          }
        }
        if (toolResults) {
          for (const tr of toolResults) {
            const outputEvent: UIMessageStreamEvent = {
              type: 'tool-output-available',
              toolCallId: tr.toolCallId,
              output: tr.output,
            }
            await capture.messageLogger.logStreamEvent(outputEvent)
            capture.emitEvent(task.query_id, outputEvent)
          }
        }
        if (text) {
          // Text is logged as a start/delta/end triple sharing one id,
          // but only the delta is emitted to the live event stream.
          const textId = crypto.randomUUID()
          const startEvent: UIMessageStreamEvent = {
            type: 'text-start',
            id: textId,
          }
          const deltaEvent: UIMessageStreamEvent = {
            type: 'text-delta',
            id: textId,
            delta: text,
          }
          const endEvent: UIMessageStreamEvent = {
            type: 'text-end',
            id: textId,
          }
          await capture.messageLogger.logStreamEvent(startEvent)
          await capture.messageLogger.logStreamEvent(deltaEvent)
          await capture.messageLogger.logStreamEvent(endEvent)
          capture.emitEvent(task.query_id, deltaEvent)
        }
      },
    }

    // The two `undefined` positional args are optional executor settings
    // left at their defaults here — NOTE(review): confirm against the
    // CladoActionExecutor constructor signature.
    const executor = new CladoActionExecutor(
      {
        provider: agentConfig.provider,
        model: agentConfig.model,
        apiKey: resolveEnvValue(agentConfig.apiKey) ?? '',
        baseUrl: agentConfig.baseUrl,
        temperature: agentConfig.temperature,
      },
      config.browseros.server_url,
      undefined,
      undefined,
      this.ctx.initialPageId,
    )
    executor.setCallbacks(callbacks)

    try {
      let finalAnswer: string | null = null
      let totalSteps = 0

      // withEvalTimeout aborts via the provided signal when timeoutMs
      // elapses; `result` may be undefined in that case, hence the
      // `result?.actionsPerformed ?? totalSteps` fallback below.
      const { terminationReason, result } = await withEvalTimeout(
        timeoutMs,
        capture,
        async (signal) => {
          const execution = await executor.execute(task.query, signal)
          finalAnswer = execution.observation
          totalSteps = execution.actionsPerformed

          // 'done' and 'timeout' are expected terminal states; anything
          // else is recorded as an execution error.
          if (execution.status !== 'done' && execution.status !== 'timeout') {
            capture.addError('agent_execution', execution.observation)
          }

          return execution
        },
      )

      const endTime = Date.now()
      const metadata: TaskMetadata = {
        query_id: task.query_id,
        dataset: task.dataset,
        query: task.query,
        started_at: new Date(startTime).toISOString(),
        completed_at: new Date(endTime).toISOString(),
        total_duration_ms: endTime - startTime,
        total_steps: result?.actionsPerformed ?? totalSteps,
        termination_reason: terminationReason,
        // The executor's final observation doubles as the answer; for this
        // action-only agent it may be status text rather than a user-facing
        // response (the performance grader accounts for that).
        final_answer: finalAnswer,
        errors: capture.getErrors(),
        warnings: capture.getWarnings(),
        device_pixel_ratio: capture.screenshot.getDevicePixelRatio(),
        agent_config: {
          type: 'clado-action',
          model: agentConfig.model,
          temperature: agentConfig.temperature,
        },
        grader_results: {},
      }

      await capture.trajectorySaver.saveMetadata(metadata)

      return {
        metadata,
        messages: capture.getMessages(),
        finalAnswer,
      }
    } finally {
      // Best-effort teardown: never mask the primary result/error with a
      // cleanup failure.
      await executor.close().catch(() => {})
      await cdp.disconnect().catch(() => {})
    }
  }
}
2 changes: 2 additions & 0 deletions packages/browseros-agent/apps/eval/src/agents/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { CladoActionEvaluator } from './clado-action'
import { GeminiComputerUseEvaluator } from './gemini-computer-use'
import { OrchestratorExecutorEvaluator } from './orchestrator-executor'
import { registerAgent } from './registry'
Expand All @@ -10,6 +11,7 @@ registerAgent(
'orchestrator-executor',
(ctx) => new OrchestratorExecutorEvaluator(ctx),
)
registerAgent('clado-action', (ctx) => new CladoActionEvaluator(ctx))
registerAgent(
'gemini-computer-use',
(ctx) => new GeminiComputerUseEvaluator(ctx),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,9 @@ export class CladoActionExecutor {
instruction,
image_base64: imageBase64,
history: this.formatHistory(actionHistory),
...(typeof this.config.temperature === 'number'
? { temperature: this.config.temperature }
: {}),
}),
signal: requestController.signal,
})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ export class Executor {
private cladoExecutor: CladoActionExecutor | null = null
private stepsUsed = 0
private currentUrl = ''
private configTemplate: ResolvedAgentConfig
private configTemplate: ResolvedAgentConfig & { temperature?: number }
private isCladoAction: boolean
private browser: Browser | null
private serverUrl: string
Expand All @@ -74,7 +74,7 @@ export class Executor {
private callbacks: ExecutorCallbacks

constructor(
configTemplate: ResolvedAgentConfig,
configTemplate: ResolvedAgentConfig & { temperature?: number },
browser: Browser | null,
serverUrl: string,
options?: {
Expand Down Expand Up @@ -107,6 +107,7 @@ export class Executor {
model: this.configTemplate.model,
apiKey: this.configTemplate.apiKey ?? '',
baseUrl: this.configTemplate.baseUrl,
temperature: this.configTemplate.temperature,
},
this.serverUrl,
this.windowId,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ function extractCdpPort(config: EvalConfig): number {

interface ResolvedConfigs {
orchestratorConfig: ResolvedAgentConfig & { maxTurns?: number }
executorConfig: ResolvedAgentConfig
executorConfig: ResolvedAgentConfig & { temperature?: number }
isCladoAction: boolean
}

Expand Down Expand Up @@ -87,14 +87,15 @@ async function resolveAgentConfig(

const isCladoAction = config.executor.provider === 'clado-action'

let executorConfig: ResolvedAgentConfig
let executorConfig: ResolvedAgentConfig & { temperature?: number }
if (isCladoAction) {
executorConfig = {
conversationId: crypto.randomUUID(),
provider: config.executor.provider as ResolvedAgentConfig['provider'],
model: executorModel,
apiKey: resolveEnvValue(config.executor.apiKey),
baseUrl: config.executor.baseUrl,
temperature: config.executor.temperature,
workingDir: `/tmp/browseros-eval-executor-${crypto.randomUUID()}`,
}
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ export interface ExecutorConfig {
model: string
apiKey: string
baseUrl?: string
temperature?: number
}

export const ORCHESTRATOR_DEFAULTS = {
Expand Down
2 changes: 1 addition & 1 deletion packages/browseros-agent/apps/eval/src/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@

export const DEFAULT_TIMEOUT_MS = 30 * 60 * 1000 // 30 minutes
export const SCREENSHOT_TIMEOUT_MS = 65_000 // 65s — ensures we get extension's error (60s)
export const MAX_ACTIONS_PER_DELEGATION = 15
export const MAX_ACTIONS_PER_DELEGATION = 50
export const CLADO_REQUEST_TIMEOUT_MS = 120_000
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ export const DEFAULT_AXES: AxisDefinition[] = [
},
]

export interface BuildUserPromptOptions {
stateOnlyMode?: boolean
}

export const PERFORMANCE_SYSTEM_PROMPT = `You are a performance evaluator for a browser automation agent. You will score how well the agent executed a web task across multiple axes.

## Data Files
Expand Down Expand Up @@ -102,6 +106,8 @@ When the agent's final answer contains specific data (prices, names, dates, coun
- Task asks "extract the email address" → grep for the email pattern
This is the most reliable way to verify whether the agent actually found the data it claims, since screenshots may be blurry, truncated, or missing the relevant section.

For action-only agents, the final answer may be absent or may contain executor status text instead of a user-facing response. In that case, judge task_completion from the final browser state, screenshots, action sequence, and DOM evidence. Do not penalize the agent solely for lacking a textual final answer when the task contract asks it to leave the requested evidence visible on screen.

## How to View Screenshots

You have {screenshot_count} screenshots. View 3-5 strategically:
Expand Down Expand Up @@ -169,6 +175,7 @@ export function buildUserPrompt(
metrics: PreComputedMetrics,
axes: AxisDefinition[],
expectedAnswer?: string | null,
options: BuildUserPromptOptions = {},
): string {
const axesBlock = axes
.map((a) => `- **${a.name}** (weight: ${a.weight}): ${a.description}`)
Expand All @@ -180,11 +187,16 @@ export function buildUserPrompt(
? `\n## Expected Answer (Ground Truth)\n${expectedAnswer}\n\nWhen scoring task_completion, compare the agent's final answer against this ground truth. Consider semantic equivalence, partial correctness, and completeness. Award partial credit where the agent got some but not all parts right.`
: ''

const stateOnlyBlock = options.stateOnlyMode
? `\n## Action-Only Browser-State Evaluation\nThis run used an action-only Clado executor. The model can click, type, scroll, wait, and emit end(), but it has no supported channel for a separate natural-language final answer. Treat the browser state as the output. Score task_completion by verifying whether the requested information or destination is visible or otherwise evidenced in screenshots/messages.jsonl. Ignore executor status text such as max-budget or end-observation as a substantive answer. Still penalize repeated loops, unnecessary actions, CAPTCHA blockage, unsupported claims in reasoning, and failure to reach visible evidence.\n`
: ''

return `## Task
${taskQuery}
${stateOnlyBlock}

## Agent's Final Answer
${finalAnswer || '[No answer provided]'}
${options.stateOnlyMode ? '[Action-only run: judge browser state, not this field]' : finalAnswer || '[No answer provided]'}
${expectedAnswerBlock}
## Pre-Computed Metrics
${metricsBlock}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,15 @@ export class PerformanceGrader implements Grader {
try {
// Read termination reason from metadata.json
let terminationReason = 'unknown'
let stateOnlyMode = false
try {
const metadataRaw = await readFile(
join(input.outputDir, 'metadata.json'),
'utf-8',
)
const metadata = JSON.parse(metadataRaw)
terminationReason = metadata.termination_reason || 'unknown'
stateOnlyMode = metadata.agent_config?.type === 'clado-action'
} catch {
// metadata.json may not exist
}
Expand All @@ -75,6 +77,7 @@ export class PerformanceGrader implements Grader {
metrics,
this.axes,
input.expectedAnswer,
{ stateOnlyMode },
)

const response = await this.runAgent(
Expand Down
Loading
Loading