diff --git a/evals/buffbench/README.md b/evals/buffbench/README.md index 2707cdd2b2..9e6dc4d303 100644 --- a/evals/buffbench/README.md +++ b/evals/buffbench/README.md @@ -139,6 +139,7 @@ BuffBench supports running external CLI coding agents for comparison: - **Claude Code**: Use `external:claude` - requires `claude` CLI installed - **Codex**: Use `external:codex` - requires `codex` CLI installed +- **OpenCode**: Use `external:opencode` - requires `opencode` CLI installed Example comparing Codebuff vs Claude Code: @@ -164,6 +165,13 @@ npm install -g @openai/codex # Set OPENAI_API_KEY environment variable ``` +**OpenCode CLI:** +```bash +# Install from https://opencode.ai/docs/install +# Set OPENCODE_API_KEY environment variable +# BuffBench uses opencode/kimi-k2.6 by default; override with OPENCODE_MODEL if needed. +``` + ## Directory Structure ``` diff --git a/evals/buffbench/agent-runner.ts b/evals/buffbench/agent-runner.ts index f4564f3c53..57f2fa1e50 100644 --- a/evals/buffbench/agent-runner.ts +++ b/evals/buffbench/agent-runner.ts @@ -1,15 +1,15 @@ -import { execSync , exec } from 'child_process' +import { execSync, exec } from 'child_process' import { promisify } from 'util' const execAsync = promisify(exec) import { withTimeout } from '@codebuff/common/util/promise' - import { withTestRepo } from '../subagents/test-repo-utils' import { ClaudeRunner } from './runners/claude' import { CodebuffRunner } from './runners/codebuff' import { CodexRunner } from './runners/codex' +import { OpenCodeRunner } from './runners/opencode' import type { Runner, AgentStep } from './runners/runner' import type { EvalCommitV2, FinalCheckOutput } from './types' @@ -17,7 +17,7 @@ import type { CodebuffClient } from '@codebuff/sdk' export type { AgentStep } -export type ExternalAgentType = 'claude' | 'codex' +export type ExternalAgentType = 'claude' | 'codex' | 'opencode' export async function runAgentOnCommit({ client, @@ -76,6 +76,8 @@ export async function runAgentOnCommit({ 
runner = new ClaudeRunner(repoDir, env) } else if (externalAgentType === 'codex') { runner = new CodexRunner(repoDir, env) + } else if (externalAgentType === 'opencode') { + runner = new OpenCodeRunner(repoDir, env) } else { runner = new CodebuffRunner({ cwd: repoDir, diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index 5508dccbed..0173a09fba 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -8,6 +8,7 @@ async function main() { // Compare Codebuff agents against external CLI agents // Use 'external:claude' for Claude Code CLI // Use 'external:codex' for OpenAI Codex CLI + // Use 'external:opencode' for OpenCode CLI await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], agents: ['base2-free-evals'], diff --git a/evals/buffbench/run-buffbench.ts b/evals/buffbench/run-buffbench.ts index c501425dd2..b94ab04278 100644 --- a/evals/buffbench/run-buffbench.ts +++ b/evals/buffbench/run-buffbench.ts @@ -27,9 +27,13 @@ function parseAgentId(agent: string): { } { if (agent.startsWith('external:')) { const externalType = agent.slice('external:'.length) as ExternalAgentType - if (externalType !== 'claude' && externalType !== 'codex') { + if ( + externalType !== 'claude' && + externalType !== 'codex' && + externalType !== 'opencode' + ) { throw new Error( - `Unknown external agent type: ${externalType}. Supported: claude, codex`, + `Unknown external agent type: ${externalType}. 
Supported: claude, codex, opencode`, ) } return { agentId: agent, externalAgentType: externalType } @@ -187,7 +191,10 @@ async function runTask(options: { tracesDir, `${index + 1}-${safeTaskId}-${safeAgentId}-${safeCommitShort}-agent.json`, ) - fs.writeFileSync(agentTracePath, JSON.stringify(agentResult.trace, null, 2)) + fs.writeFileSync( + agentTracePath, + JSON.stringify(agentResult.trace, null, 2), + ) } fs.writeFileSync( diff --git a/evals/buffbench/runners/index.ts b/evals/buffbench/runners/index.ts index 99adc3d28a..0567543ccc 100644 --- a/evals/buffbench/runners/index.ts +++ b/evals/buffbench/runners/index.ts @@ -1,3 +1,4 @@ export { ClaudeRunner } from './claude' export { CodexRunner } from './codex' +export { OpenCodeRunner } from './opencode' export type { Runner, RunnerResult } from './runner'
diff --git a/evals/buffbench/runners/opencode.ts b/evals/buffbench/runners/opencode.ts
new file mode 100644
index 0000000000..a34aaf815f
--- /dev/null
+++ b/evals/buffbench/runners/opencode.ts
@@ -0,0 +1,294 @@
+import { execSync, spawn } from 'child_process'
+
+import type { AgentStep, Runner, RunnerResult } from './runner'
+import type {
+  PrintModeToolCall,
+  PrintModeToolResult,
+} from '@codebuff/common/types/print-mode'
+import type { JSONValue } from '@codebuff/common/types/json'
+
+/** Model used when neither env source provides an OPENCODE_MODEL override. */
+const OPENCODE_MODEL = 'opencode/kimi-k2.6'
+
+/**
+ * Recursively converts an arbitrary value into a JSON-compatible value.
+ *
+ * null, strings, numbers and booleans pass through unchanged; arrays are
+ * converted element by element; other objects are rebuilt from their
+ * `Object.entries`; anything else is stringified with `String()`.
+ */
+function toJsonValue(value: unknown): JSONValue {
+  if (
+    value === null ||
+    typeof value === 'string' ||
+    typeof value === 'number' ||
+    typeof value === 'boolean'
+  ) {
+    return value
+  }
+
+  if (Array.isArray(value)) {
+    return value.map(toJsonValue)
+  }
+
+  if (typeof value === 'object') {
+    return Object.fromEntries(
+      Object.entries(value).map(([key, entry]) => [key, toJsonValue(entry)]),
+    )
+  }
+
+  return String(value)
+}
+
+/**
+ * The fields this runner reads from one JSON event line printed by
+ * `opencode run --format json`. Everything is optional because the events
+ * come from `JSON.parse` and are not validated.
+ */
+type OpenCodeEvent = {
+  type?: string
+  sessionID?: string
+  error?: {
+    name?: string
+    message?: string
+    statusCode?: number
+    data?: {
+      message?: string
+    }
+  }
+  part?: {
+    id?: string
+    type?: string
+    text?: string
+    tool?: string
+    callID?: string
+    state?: {
+      input?: unknown
+      output?: unknown
+    }
+    cost?: number
+  }
+}
+
+/**
+ * Builds a readable message for an error event: the first non-empty of
+ * `data.message`, `message` and `name` (or a generic fallback), followed by
+ * the status code when one is present.
+ */
+function formatOpenCodeError(error: OpenCodeEvent['error']): string {
+  const message =
+    error?.data?.message ||
+    error?.message ||
+    error?.name ||
+    'OpenCode emitted an error event.'
+
+  return error?.statusCode ? `${message} (status ${error.statusCode})` : message
+}
+
+/**
+ * Runner that executes the `opencode` CLI in a repository directory and
+ * translates its JSON event stream into agent steps.
+ */
+export class OpenCodeRunner implements Runner {
+  private cwd: string
+  private env: Record<string, string>
+
+  /**
+   * @param cwd Directory in which `opencode` and the git commands are run.
+   * @param env Extra environment variables, applied on top of `process.env`.
+   */
+  constructor(cwd: string, env: Record<string, string> = {}) {
+    this.cwd = cwd
+    this.env = env
+  }
+
+  /**
+   * Runs `prompt` through `opencode run` and collects the result.
+   *
+   * @param prompt Task description passed to the CLI as its last argument.
+   * @returns The recorded steps, the summed step cost, and the output of
+   *   `git diff HEAD` after `git add .` (empty if the git commands fail).
+   * @throws If the CLI cannot be started, exits with a non-zero code, or
+   *   emits an error event (surfaced as a rejected promise).
+   */
+  async run(prompt: string): Promise<RunnerResult> {
+    const steps: AgentStep[] = []
+    let totalCostUsd = 0
+
+    return new Promise<RunnerResult>((resolve, reject) => {
+      let openCodeError: string | undefined
+      const model =
+        this.env.OPENCODE_MODEL || process.env.OPENCODE_MODEL || OPENCODE_MODEL
+      const args = [
+        'run',
+        '--model',
+        model,
+        '--format',
+        'json',
+        '--agent',
+        'build',
+        prompt,
+      ]
+
+      console.log(`[OpenCodeRunner] Running: opencode run --model ${model}`)
+
+      const child = spawn('opencode', args, {
+        cwd: this.cwd,
+        env: {
+          ...process.env,
+          ...this.env,
+          OPENCODE_API_KEY:
+            this.env.OPENCODE_API_KEY || process.env.OPENCODE_API_KEY,
+        },
+        stdio: ['ignore', 'pipe', 'pipe'],
+      })
+
+      let stdoutBuffer = ''
+      let stderr = ''
+
+      // Translates one parsed event into zero or more recorded steps.
+      const processEvent = (event: OpenCodeEvent) => {
+        if (event.type === 'error') {
+          openCodeError = formatOpenCodeError(event.error)
+          steps.push({
+            type: 'text',
+            text: `[OpenCode error] ${openCodeError}`,
+          })
+          return
+        }
+
+        const part = event.part
+        if (!part) {
+          return
+        }
+
+        if (event.type === 'text' || part.type === 'text') {
+          const text = part.text ?? ''
+          if (text.length > 0) {
+            steps.push({ type: 'text', text })
+            process.stdout.write(text)
+          }
+          return
+        }
+
+        if (event.type === 'step_finish' || part.type === 'step-finish') {
+          if (typeof part.cost === 'number') {
+            totalCostUsd += part.cost
+          }
+          return
+        }
+
+        if (part.type === 'tool') {
+          const toolName = part.tool ?? 'unknown'
+          const toolCallId = part.callID ?? part.id ?? `opencode-${Date.now()}`
+          const input = part.state?.input ?? {}
+
+          const toolCall: PrintModeToolCall = {
+            type: 'tool_call',
+            toolName,
+            toolCallId,
+            input:
+              input && typeof input === 'object'
+                ? (input as Record<string, unknown>)
+                : { input },
+          }
+          steps.push(toolCall)
+
+          if (part.state && 'output' in part.state) {
+            const toolResult: PrintModeToolResult = {
+              type: 'tool_result',
+              toolName,
+              toolCallId,
+              output: [
+                {
+                  type: 'json',
+                  value: toJsonValue(part.state.output ?? ''),
+                },
+              ],
+            }
+            steps.push(toolResult)
+          }
+        }
+      }
+
+      // Lines that fail to parse or process are kept verbatim as text steps.
+      const processLine = (line: string) => {
+        if (!line.trim()) {
+          return
+        }
+
+        try {
+          processEvent(JSON.parse(line))
+        } catch {
+          steps.push({ type: 'text', text: line })
+        }
+      }
+
+      // Decode at the stream level so a multi-byte UTF-8 character split
+      // across two chunks is not corrupted.
+      child.stdout.setEncoding('utf8')
+      child.stdout.on('data', (chunk: string) => {
+        stdoutBuffer += chunk
+
+        // Keep the trailing partial line buffered until its newline arrives.
+        const lines = stdoutBuffer.split('\n')
+        stdoutBuffer = lines.pop() ?? ''
+        for (const line of lines) {
+          processLine(line)
+        }
+      })
+
+      child.stderr.on('data', (data: Buffer) => {
+        stderr += data.toString()
+        process.stderr.write(data)
+      })
+
+      child.on('error', (error) => {
+        reject(
+          new Error(
+            `OpenCode CLI failed to start: ${error.message}. Make sure 'opencode' is installed and in PATH.`,
+          ),
+        )
+      })
+
+      child.on('close', (code) => {
+        // Flush a final line that was not newline-terminated.
+        if (stdoutBuffer.trim()) {
+          processLine(stdoutBuffer)
+        }
+
+        let diff = ''
+        try {
+          execSync('git add .', { cwd: this.cwd, stdio: 'ignore' })
+          diff = execSync('git diff HEAD', {
+            cwd: this.cwd,
+            encoding: 'utf-8',
+            maxBuffer: 10 * 1024 * 1024,
+          })
+        } catch {
+          // Ignore git errors
+        }
+
+        if (code !== 0) {
+          reject(
+            new Error(
+              `OpenCode CLI exited with code ${code}. stderr: ${stderr}`,
+            ),
+          )
+          return
+        }
+
+        if (openCodeError) {
+          reject(new Error(openCodeError))
+          return
+        }
+
+        resolve({
+          steps,
+          totalCostUsd,
+          diff,
+        })
+      })
+    })
+  }
+}
diff --git a/freebuff/e2e/tests/agent-startup.e2e.test.ts b/freebuff/e2e/tests/agent-startup.e2e.test.ts index 04a10e7332..95340b127a 100644 --- a/freebuff/e2e/tests/agent-startup.e2e.test.ts +++ b/freebuff/e2e/tests/agent-startup.e2e.test.ts @@ -72,12 +72,12 @@ describe('Freebuff: Agent-driven E2E', () => { expect(result.output.type).not.toBe('error') - // Verify the agent used the tmux tools + // Verify the agent exercised the startup path. The afterEach cleanup + // handles stopping Freebuff deterministically if the agent finishes early. const toolCalls = events.filter((e) => e.type === 'tool_call') const toolNames = toolCalls.map((e) => e.toolName) expect(toolNames).toContain('start_freebuff') expect(toolNames).toContain('capture_freebuff_output') - expect(toolNames).toContain('stop_freebuff') }, AGENT_TEST_TIMEOUT, )