From 8945f5cf6f767bb3b6d8288bb5344b6b90af51fc Mon Sep 17 00:00:00 2001 From: ozymandiashh <234437643+ozymandiashh@users.noreply.github.com> Date: Tue, 19 May 2026 02:17:01 +0300 Subject: [PATCH] Add MCP skill reliability optimizer --- CHANGELOG.md | 4 + src/optimize.ts | 249 +++++++++++++++++++++++++++++++++++++++++ tests/optimize.test.ts | 151 +++++++++++++++++++++++++ 3 files changed, 404 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b1f1dc..2442871 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ ## Unreleased ### Added (CLI) +- **MCP and skill reliability report.** `codeburn optimize` now detects MCP + servers and skills whose edit turns are disproportionately retry-heavy, + using turn-level MCP/Skill call evidence and a shared-turn token estimate so + one retry-heavy turn is not double-counted across multiple capabilities. - **Agent and subagent tracking coverage.** Gemini sessions now emit one provider call per assistant message with token usage instead of one aggregate call per session, preserving per-message tools, bash commands, timestamps, diff --git a/src/optimize.ts b/src/optimize.ts index c672bac..45cbcc5 100644 --- a/src/optimize.ts +++ b/src/optimize.ts @@ -99,6 +99,15 @@ const WORTH_IT_LOW_MAX_CANDIDATES = 2 const WORTH_IT_LOW_MAX_TOTAL_COST_USD = 10 const WORTH_IT_HIGH_MIN_CANDIDATES = 10 const WORTH_IT_HIGH_TOTAL_COST_USD = 50 +const CAPABILITY_RELIABILITY_MIN_EDIT_TURNS = 5 +const CAPABILITY_RELIABILITY_MIN_RETRY_TURNS = 3 +const CAPABILITY_RELIABILITY_MIN_RETRY_RATE = 0.50 +const CAPABILITY_RELIABILITY_RECOVERY_FRACTION = 0.50 +const CAPABILITY_RELIABILITY_PREVIEW = 5 +const CAPABILITY_RELIABILITY_LOW_MAX_CANDIDATES = 1 +const CAPABILITY_RELIABILITY_LOW_MAX_TOKENS = 50_000 +const CAPABILITY_RELIABILITY_HIGH_MIN_CANDIDATES = 5 +const CAPABILITY_RELIABILITY_HIGH_IMPACT_TOKENS = 200_000 // ============================================================================ // Scoring constants @@ -895,6 +904,245 @@ export 
function detectMcpToolCoverage( } } +type CapabilityKind = 'mcp' | 'skill' + +type CapabilityRef = { + kind: CapabilityKind + name: string +} + +type CapabilityReliabilityAccumulator = CapabilityRef & { + editTurns: number + retryTurns: number + oneShotTurns: number + retries: number + tokensTouched: number + projects: Set<string> + retryTurnSavings: Map<string, number> +} + +export type CapabilityReliabilityCandidate = { + kind: CapabilityKind + name: string + editTurns: number + retryTurns: number + oneShotTurns: number + retries: number + retryRate: number + tokensTouched: number + tokensSaved: number + projects: string[] +} + +function capabilityKey(ref: CapabilityRef): string { + return `${ref.kind}:${ref.name}` +} + +function formatCapabilityKind(kind: CapabilityKind): string { + return kind === 'mcp' ? 'MCP server' : 'skill' +} + +function mcpServerFromToolName(fqn: string): string | null { + const parts = fqn.split('__') + if (parts.length < 3 || parts[0] !== 'mcp') return null + return parts[1] || null +} + +function collectReliabilityCapabilities(turn: ProjectSummary['sessions'][number]['turns'][number]): Map<string, CapabilityRef> { + const capabilities = new Map<string, CapabilityRef>() + + for (const call of turn.assistantCalls) { + for (const fqn of call.mcpTools) { + const server = mcpServerFromToolName(fqn) + if (!server) continue + const ref: CapabilityRef = { kind: 'mcp', name: server } + capabilities.set(capabilityKey(ref), ref) + } + for (const rawSkill of call.skills ??
[]) { + const skill = rawSkill.trim() + if (!skill) continue + const ref: CapabilityRef = { kind: 'skill', name: skill } + capabilities.set(capabilityKey(ref), ref) + } + } + + return capabilities +} + +function turnEffectiveTokenTotal(turn: ProjectSummary['sessions'][number]['turns'][number]): number { + return Math.round(turn.assistantCalls.reduce((sum, call) => + sum + + call.usage.inputTokens + + call.usage.outputTokens + + call.usage.cacheCreationInputTokens * CACHE_WRITE_MULTIPLIER + + call.usage.cacheReadInputTokens * CACHE_READ_DISCOUNT, + 0)) +} + +function reliabilityTurnKey( + project: ProjectSummary, + session: ProjectSummary['sessions'][number], + turn: ProjectSummary['sessions'][number]['turns'][number], + turnIndex: number, +): string { + return `${project.projectPath || project.project}:${session.sessionId}:${turn.timestamp}:${turnIndex}` +} + +function getReliabilityAccumulator( + stats: Map<string, CapabilityReliabilityAccumulator>, + ref: CapabilityRef, +): CapabilityReliabilityAccumulator { + const key = capabilityKey(ref) + let acc = stats.get(key) + if (!acc) { + acc = { + ...ref, + editTurns: 0, + retryTurns: 0, + oneShotTurns: 0, + retries: 0, + tokensTouched: 0, + projects: new Set(), + retryTurnSavings: new Map(), + } + stats.set(key, acc) + } + return acc +} + +function findCapabilityReliabilityCandidates(projects: ProjectSummary[]): CapabilityReliabilityCandidate[] { + const stats = new Map<string, CapabilityReliabilityAccumulator>() + + for (const project of projects) { + for (const session of project.sessions) { + for (let turnIndex = 0; turnIndex < session.turns.length; turnIndex++) { + const turn = session.turns[turnIndex]! + if (!turn.hasEdits) continue + + const capabilities = collectReliabilityCapabilities(turn) + if (capabilities.size === 0) continue + + const turnTokens = turnEffectiveTokenTotal(turn) + const turnKey = reliabilityTurnKey(project, session, turn, turnIndex) + const recoverableTokens = turn.retries > 0 + ?
Math.round(turnTokens * CAPABILITY_RELIABILITY_RECOVERY_FRACTION) + : 0 + + for (const ref of capabilities.values()) { + const acc = getReliabilityAccumulator(stats, ref) + acc.editTurns++ + acc.tokensTouched += turnTokens + acc.projects.add(project.project) + if (turn.retries > 0) { + acc.retryTurns++ + acc.retries += turn.retries + acc.retryTurnSavings.set(turnKey, recoverableTokens) + } else { + acc.oneShotTurns++ + } + } + } + } + } + + const candidates: CapabilityReliabilityCandidate[] = [] + for (const acc of stats.values()) { + if (acc.editTurns < CAPABILITY_RELIABILITY_MIN_EDIT_TURNS) continue + if (acc.retryTurns < CAPABILITY_RELIABILITY_MIN_RETRY_TURNS) continue + const retryRate = acc.retryTurns / acc.editTurns + if (retryRate < CAPABILITY_RELIABILITY_MIN_RETRY_RATE) continue + + candidates.push({ + kind: acc.kind, + name: acc.name, + editTurns: acc.editTurns, + retryTurns: acc.retryTurns, + oneShotTurns: acc.oneShotTurns, + retries: acc.retries, + retryRate, + tokensTouched: acc.tokensTouched, + tokensSaved: Array.from(acc.retryTurnSavings.values()).reduce((sum, tokens) => sum + tokens, 0), + projects: Array.from(acc.projects).sort(), + }) + } + + candidates.sort((a, b) => + b.retryRate - a.retryRate + || b.retries - a.retries + || b.tokensSaved - a.tokensSaved + || a.kind.localeCompare(b.kind) + || a.name.localeCompare(b.name) + ) + return candidates +} + +export function detectCapabilityReliability(projects: ProjectSummary[]): WasteFinding | null { + const candidates = findCapabilityReliabilityCandidates(projects) + if (candidates.length === 0) return null + + const candidateKeys = new Set(candidates.map(c => capabilityKey(c))) + const uniqueRetryTurnSavings = new Map<string, number>() + for (const project of projects) { + for (const session of project.sessions) { + for (let turnIndex = 0; turnIndex < session.turns.length; turnIndex++) { + const turn = session.turns[turnIndex]!
+ if (!turn.hasEdits || turn.retries <= 0) continue + const capabilities = collectReliabilityCapabilities(turn) + if (capabilities.size === 0) continue + + const hasFlaggedCapability = Array.from(capabilities.keys()).some(key => candidateKeys.has(key)) + if (!hasFlaggedCapability) continue + + const key = reliabilityTurnKey(project, session, turn, turnIndex) + const tokens = Math.round(turnEffectiveTokenTotal(turn) * CAPABILITY_RELIABILITY_RECOVERY_FRACTION) + uniqueRetryTurnSavings.set(key, Math.max(uniqueRetryTurnSavings.get(key) ?? 0, tokens)) + } + } + } + + const tokensSaved = Array.from(uniqueRetryTurnSavings.values()).reduce((sum, tokens) => sum + tokens, 0) + const preview = candidates.slice(0, CAPABILITY_RELIABILITY_PREVIEW) + const list = preview.map(c => { + const percent = Math.round(c.retryRate * 100) + const projects = c.projects.length > 1 ? ` across ${c.projects.length} projects` : ` in ${c.projects[0] ?? 'one project'}` + return `${formatCapabilityKind(c.kind)} ${c.name}: ${c.retryTurns}/${c.editTurns} edit turns retried (${percent}%), ${c.retries} retries${projects}` + }).join('; ') + const extra = candidates.length > preview.length ? `; +${candidates.length - preview.length} more` : '' + + const names = preview + .map(c => `${formatCapabilityKind(c.kind)} ${c.name}`) + .join(', ') + + let impact: Impact + if (candidates.length >= CAPABILITY_RELIABILITY_HIGH_MIN_CANDIDATES || tokensSaved >= CAPABILITY_RELIABILITY_HIGH_IMPACT_TOKENS) { + impact = 'high' + } else if (candidates.length <= CAPABILITY_RELIABILITY_LOW_MAX_CANDIDATES && tokensSaved < CAPABILITY_RELIABILITY_LOW_MAX_TOKENS) { + impact = 'low' + } else { + impact = 'medium' + } + + const kindSet = new Set(candidates.map(c => c.kind)) + const noun = kindSet.size === 1 + ? (kindSet.has('mcp') ? 'MCP server' : 'skill') + : 'MCP/skill capability' + const pluralNoun = noun === 'MCP/skill capability' ? 'MCP/skill capabilities' : `${noun}s` + const verb = candidates.length === 1 ? 
'correlates' : 'correlate' + + return { + title: `${candidates.length} ${candidates.length === 1 ? noun : pluralNoun} ${verb} with retry-heavy edits`, + explanation: `Edit turns using these capabilities are retry-heavy: ${list}${extra}. This is a correlation report, not proof of causation; compare the retry-heavy turns with one-shot turns before changing MCP scope or skill instructions.`, + impact, + tokensSaved, + fix: { + type: 'paste', + destination: 'prompt', + label: 'Ask Claude to audit the retry-heavy capability before changing config:', + text: `Investigate these retry-correlated capabilities: ${names}. Compare edit turns with retries against one-shot edit turns, identify whether the MCP server or skill actually caused rework, then propose a scoped MCP config or skill-instruction change with session evidence. Do not remove a capability solely because it appears in this report.`, + }, + } +} + export function detectUnusedMcp( calls: ToolCall[], projects: ProjectSummary[], @@ -1800,6 +2048,7 @@ export async function scanAndDetect( () => detectDuplicateReads(toolCalls, dateRange), () => detectUnusedMcp(toolCalls, projects, projectCwds, mcpCoverage), () => detectMcpToolCoverage(projects, mcpCoverage), + () => detectCapabilityReliability(projects), () => detectLowWorthSessions(projects), () => detectContextBloat(projects, lowWorthSessionIds), () => detectSessionOutliers(projects, outlierExclusions), diff --git a/tests/optimize.test.ts b/tests/optimize.test.ts index 52643f9..8af8639 100644 --- a/tests/optimize.test.ts +++ b/tests/optimize.test.ts @@ -7,6 +7,7 @@ import { detectCacheBloat, detectBloatedClaudeMd, detectContextBloat, + detectCapabilityReliability, detectLowWorthSessions, detectSessionOutliers, computeHealth, @@ -775,6 +776,156 @@ describe('detectLowWorthSessions', () => { }) }) +type ReliabilityCall = LowWorthTurn['assistantCalls'][number] + +function reliabilityCall(overrides: Partial<ReliabilityCall> = {}): ReliabilityCall { + return { + provider: 'claude',
model: 'claude-sonnet-4-5', + usage: { + inputTokens: 1000, + outputTokens: 0, + cacheCreationInputTokens: 0, + cacheReadInputTokens: 0, + cachedInputTokens: 0, + reasoningTokens: 0, + webSearchRequests: 0, + }, + costUSD: 0.01, + tools: ['Edit'], + mcpTools: [], + skills: [], + hasAgentSpawn: false, + hasPlanMode: false, + speed: 'standard', + timestamp: '2026-05-01T10:00:00Z', + bashCommands: [], + deduplicationKey: 'call', + ...overrides, + } +} + +function reliabilityTurn( + i: number, + overrides: Partial<LowWorthTurn> & { call?: Partial<ReliabilityCall> } = {}, +): LowWorthTurn { + const { call: callOverrides, ...turnOverrides } = overrides + return lowWorthTurn({ + userMessage: `turn ${i}`, + assistantCalls: [reliabilityCall({ + timestamp: `2026-05-01T10:${String(i).padStart(2, '0')}:00Z`, + deduplicationKey: `call-${i}`, + ...callOverrides, + })], + timestamp: `2026-05-01T10:${String(i).padStart(2, '0')}:00Z`, + sessionId: 's1', + hasEdits: true, + retries: 0, + ...turnOverrides, + }) +} + +function projectWithReliabilityTurns(turns: LowWorthTurn[], project = 'app'): ProjectSummary { + return projectWithLowWorthSessions([ + lowWorthSession(1, 0, { + turns, + totalInputTokens: turns.length * 1000, + totalOutputTokens: 0, + totalCacheReadTokens: 0, + totalCacheWriteTokens: 0, + apiCalls: turns.length, + }, project), + ], project) +} + +describe('detectCapabilityReliability', () => { + it('flags retry-heavy skills from actual Skill call metadata', () => { + const turns = Array.from({ length: 5 }, (_, i) => reliabilityTurn(i, { + retries: i < 3 ?
1 : 0, + call: { tools: ['Edit', 'Skill'], skills: ['reviewer'] }, + })) + + const finding = detectCapabilityReliability([projectWithReliabilityTurns(turns)]) + + expect(finding).not.toBeNull() + expect(finding!.title).toContain('skill') + expect(finding!.explanation).toContain('skill reviewer') + expect(finding!.explanation).toContain('3/5 edit turns retried (60%)') + expect(finding!.explanation).toContain('correlation report') + expect(finding!.tokensSaved).toBe(1500) + expect(finding!.fix.type).toBe('paste') + if (finding!.fix.type === 'paste') expect(finding!.fix.destination).toBe('prompt') + }) + + it('flags retry-heavy MCP servers from invoked MCP tools', () => { + const turns = Array.from({ length: 5 }, (_, i) => reliabilityTurn(i, { + retries: i < 3 ? 1 : 0, + call: { + tools: ['Edit', 'mcp__ci__run'], + mcpTools: ['mcp__ci__run'], + }, + })) + + const finding = detectCapabilityReliability([projectWithReliabilityTurns(turns)]) + + expect(finding).not.toBeNull() + expect(finding!.title).toContain('MCP server') + expect(finding!.explanation).toContain('MCP server ci') + expect(finding!.explanation).toContain('3 retries') + }) + + it('does not flag healthy capabilities with mostly one-shot edit turns', () => { + const turns = Array.from({ length: 5 }, (_, i) => reliabilityTurn(i, { + retries: i === 0 ? 
1 : 0, + call: { tools: ['Edit', 'Skill'], skills: ['healthy'] }, + })) + + expect(detectCapabilityReliability([projectWithReliabilityTurns(turns)])).toBeNull() + }) + + it('does not treat subCategory alone as skill evidence', () => { + const turns = Array.from({ length: 5 }, (_, i) => reliabilityTurn(i, { + retries: 1, + subCategory: 'legacy-skill-label', + call: { tools: ['Edit'], skills: [] }, + })) + + expect(detectCapabilityReliability([projectWithReliabilityTurns(turns)])).toBeNull() + }) + + it('does not double-count the same retry-heavy turn across MCP and skill candidates', () => { + const turns = Array.from({ length: 5 }, (_, i) => reliabilityTurn(i, { + retries: i < 3 ? 1 : 0, + call: { + tools: ['Edit', 'Skill', 'mcp__ci__run'], + mcpTools: ['mcp__ci__run'], + skills: ['reviewer'], + }, + })) + + const finding = detectCapabilityReliability([projectWithReliabilityTurns(turns)]) + + expect(finding).not.toBeNull() + expect(finding!.title).toContain('2 MCP/skill capabilities') + expect(finding!.explanation).toContain('MCP server ci') + expect(finding!.explanation).toContain('skill reviewer') + // Three retry-heavy turns at 1K effective tokens each, counted once at + // the 50% recoverable ceiling even though two flagged capabilities share + // every turn. + expect(finding!.tokensSaved).toBe(1500) + }) + + it('ignores read-only retry turns for capability reliability', () => { + const turns = Array.from({ length: 5 }, (_, i) => reliabilityTurn(i, { + hasEdits: false, + retries: 1, + call: { tools: ['Read', 'Skill'], skills: ['reader'] }, + })) + + expect(detectCapabilityReliability([projectWithReliabilityTurns(turns)])).toBeNull() + }) +}) + describe('detectSessionOutliers', () => { it('returns null when there are too few sessions for a project baseline', () => { expect(detectSessionOutliers([projectWithSessions([0.5, 4])])).toBeNull()