Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions evals/buffbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ BuffBench supports running external CLI coding agents for comparison:

- **Claude Code**: Use `external:claude` - requires `claude` CLI installed
- **Codex**: Use `external:codex` - requires `codex` CLI installed
- **OpenCode**: Use `external:opencode` - requires `opencode` CLI installed

Example comparing Codebuff vs Claude Code:

Expand All @@ -164,6 +165,13 @@ npm install -g @openai/codex
# Set OPENAI_API_KEY environment variable
```

**OpenCode CLI:**
```bash
# Install from https://opencode.ai/docs/install
# Set OPENCODE_API_KEY environment variable
# BuffBench uses opencode/kimi-k2.6 by default; override with OPENCODE_MODEL if needed.
```

## Directory Structure

```
Expand Down
8 changes: 5 additions & 3 deletions evals/buffbench/agent-runner.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
import { execSync , exec } from 'child_process'
import { execSync, exec } from 'child_process'
import { promisify } from 'util'

const execAsync = promisify(exec)

import { withTimeout } from '@codebuff/common/util/promise'


import { withTestRepo } from '../subagents/test-repo-utils'
import { ClaudeRunner } from './runners/claude'
import { CodebuffRunner } from './runners/codebuff'
import { CodexRunner } from './runners/codex'
import { OpenCodeRunner } from './runners/opencode'

import type { Runner, AgentStep } from './runners/runner'
import type { EvalCommitV2, FinalCheckOutput } from './types'
import type { CodebuffClient } from '@codebuff/sdk'

export type { AgentStep }

export type ExternalAgentType = 'claude' | 'codex'
export type ExternalAgentType = 'claude' | 'codex' | 'opencode'

export async function runAgentOnCommit({
client,
Expand Down Expand Up @@ -76,6 +76,8 @@ export async function runAgentOnCommit({
runner = new ClaudeRunner(repoDir, env)
} else if (externalAgentType === 'codex') {
runner = new CodexRunner(repoDir, env)
} else if (externalAgentType === 'opencode') {
runner = new OpenCodeRunner(repoDir, env)
} else {
runner = new CodebuffRunner({
cwd: repoDir,
Expand Down
1 change: 1 addition & 0 deletions evals/buffbench/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ async function main() {
// Compare Codebuff agents against external CLI agents
// Use 'external:claude' for Claude Code CLI
// Use 'external:codex' for OpenAI Codex CLI
// Use 'external:opencode' for OpenCode CLI
await runBuffBench({
evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
agents: ['base2-free-evals'],
Expand Down
13 changes: 10 additions & 3 deletions evals/buffbench/run-buffbench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,13 @@ function parseAgentId(agent: string): {
} {
if (agent.startsWith('external:')) {
const externalType = agent.slice('external:'.length) as ExternalAgentType
if (externalType !== 'claude' && externalType !== 'codex') {
if (
externalType !== 'claude' &&
externalType !== 'codex' &&
externalType !== 'opencode'
) {
throw new Error(
`Unknown external agent type: ${externalType}. Supported: claude, codex`,
`Unknown external agent type: ${externalType}. Supported: claude, codex, opencode`,
)
}
return { agentId: agent, externalAgentType: externalType }
Expand Down Expand Up @@ -187,7 +191,10 @@ async function runTask(options: {
tracesDir,
`${index + 1}-${safeTaskId}-${safeAgentId}-${safeCommitShort}-agent.json`,
)
fs.writeFileSync(agentTracePath, JSON.stringify(agentResult.trace, null, 2))
fs.writeFileSync(
agentTracePath,
JSON.stringify(agentResult.trace, null, 2),
)
}

fs.writeFileSync(
Expand Down
1 change: 1 addition & 0 deletions evals/buffbench/runners/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
export { ClaudeRunner } from './claude'
export { CodexRunner } from './codex'
export { OpenCodeRunner } from './opencode'
export type { Runner, RunnerResult } from './runner'
252 changes: 252 additions & 0 deletions evals/buffbench/runners/opencode.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
import { execSync, spawn } from 'child_process'

import type { AgentStep, Runner, RunnerResult } from './runner'
import type {
PrintModeToolCall,
PrintModeToolResult,
} from '@codebuff/common/types/print-mode'
import type { JSONValue } from '@codebuff/common/types/json'

// Default model ID passed to `opencode run --model`. Used only when
// OPENCODE_MODEL is set in neither the runner's env nor process.env.
const OPENCODE_MODEL = 'opencode/kimi-k2.6'

/**
 * Coerces an arbitrary value into a JSON-compatible structure.
 *
 * - Arrays are converted element by element.
 * - Other non-null objects are rebuilt from their own enumerable
 *   string-keyed entries, converting each entry recursively.
 * - `null`, strings, numbers and booleans are returned unchanged.
 * - Anything else (e.g. `undefined`, functions, bigints, symbols) is
 *   turned into a string with `String()`.
 */
function toJsonValue(value: unknown): JSONValue {
  if (Array.isArray(value)) {
    return value.map((item) => toJsonValue(item))
  }

  if (value !== null && typeof value === 'object') {
    const pairs = Object.entries(value).map(
      ([key, entry]): [string, JSONValue] => [key, toJsonValue(entry)],
    )
    return Object.fromEntries(pairs)
  }

  if (value === null) {
    return value
  }

  if (
    typeof value === 'string' ||
    typeof value === 'number' ||
    typeof value === 'boolean'
  ) {
    return value
  }

  return String(value)
}

/**
 * Shape of one JSON event parsed from a line of `opencode run --format json`
 * output, limited to the fields this file reads. Every field is optional, so
 * consumers must tolerate missing data.
 */
type OpenCodeEvent = {
// Event kind; this runner checks for 'error', 'text' and 'step_finish'.
type?: string
// Declared for completeness; not read anywhere in this file.
sessionID?: string
// Read when `type` is 'error'; turned into a message by formatOpenCodeError.
error?: {
// Last-resort label when no message is available.
name?: string
// Used when `data.message` is absent or empty.
message?: string
// Appended to the formatted message as "(status N)" when truthy.
statusCode?: number
data?: {
// Preferred message source when present and non-empty.
message?: string
}
}
// Event payload; events without a part are ignored (except error events).
part?: {
// Fallback tool-call ID when `callID` is missing.
id?: string
// Part kind; this runner checks for 'text', 'step-finish' and 'tool'.
type?: string
// Text content recorded as a text step when non-empty.
text?: string
// Tool name for tool parts ('unknown' is substituted when missing).
tool?: string
// Preferred tool-call ID for tool parts.
callID?: string
state?: {
// Tool input; non-object inputs are wrapped as `{ input }` by the runner.
input?: unknown
// Tool output; when the key is present a tool_result step is recorded.
output?: unknown
}
// Cost added to the run's total when the part is a finished step.
cost?: number
}
}

/**
 * Builds a human-readable message from an OpenCode error payload.
 *
 * The first non-empty value among `error.data.message`, `error.message` and
 * `error.name` is used; if none is available a generic message is returned.
 * A truthy `statusCode` is appended as ` (status N)`.
 */
function formatOpenCodeError(error: OpenCodeEvent['error']): string {
  const candidates = [error?.data?.message, error?.message, error?.name]
  const message =
    candidates.find((candidate) => Boolean(candidate)) ??
    'OpenCode emitted an error event.'

  if (!error?.statusCode) {
    return message
  }

  return `${message} (status ${error.statusCode})`
}

/**
 * Runner that executes a prompt through the external `opencode` CLI and
 * converts its line-delimited JSON event stream into agent steps.
 */
export class OpenCodeRunner implements Runner {
  private readonly cwd: string
  private readonly env: Record<string, string>

  /**
   * @param cwd Directory the CLI is spawned in; the git diff is taken here too.
   * @param env Extra environment variables layered on top of `process.env`.
   */
  constructor(cwd: string, env: Record<string, string> = {}) {
    this.cwd = cwd
    this.env = env
  }

  /**
   * Runs `opencode run --format json --agent build` with the given prompt.
   *
   * @param prompt Passed to the CLI as the final positional argument.
   * @returns The collected steps, the sum of per-step costs, and the output of
   *   `git diff HEAD` taken after staging all changes in `cwd`.
   * @throws Rejects when the CLI cannot be spawned, exits with a non-zero code
   *   or is terminated by a signal, or emits an `error` event.
   */
  async run(prompt: string): Promise<RunnerResult> {
    const steps: AgentStep[] = []
    let totalCostUsd = 0

    return new Promise((resolve, reject) => {
      let openCodeError: string | undefined
      // Node emits `close` after a failed spawn's `error` event; this flag
      // keeps the close handler from running git and settling twice.
      let spawnFailed = false

      // Model precedence: runner env, then process env, then built-in default.
      const model =
        this.env.OPENCODE_MODEL || process.env.OPENCODE_MODEL || OPENCODE_MODEL
      const args = [
        'run',
        '--model',
        model,
        '--format',
        'json',
        '--agent',
        'build',
        prompt,
      ]

      console.log(`[OpenCodeRunner] Running: opencode run --model ${model}`)

      const child = spawn('opencode', args, {
        cwd: this.cwd,
        env: {
          ...process.env,
          ...this.env,
          // An empty key in the runner env falls back to the process env.
          OPENCODE_API_KEY:
            this.env.OPENCODE_API_KEY || process.env.OPENCODE_API_KEY,
        },
        stdio: ['ignore', 'pipe', 'pipe'],
      })

      // Decode in the stream so a multi-byte UTF-8 character that straddles
      // two chunks is not corrupted by per-chunk Buffer#toString().
      child.stdout.setEncoding('utf8')
      child.stderr.setEncoding('utf8')

      let stdoutBuffer = ''
      let stderr = ''

      // Translates one parsed event into zero or more steps / cost updates.
      const processEvent = (event: OpenCodeEvent) => {
        if (event.type === 'error') {
          // Remember the error so the run is rejected once the process closes.
          openCodeError = formatOpenCodeError(event.error)
          steps.push({
            type: 'text',
            text: `[OpenCode error] ${openCodeError}`,
          })
          return
        }

        const part = event.part
        if (!part) {
          return
        }

        if (event.type === 'text' || part.type === 'text') {
          const text = part.text ?? ''
          if (text.length > 0) {
            steps.push({ type: 'text', text })
            process.stdout.write(text)
          }
          return
        }

        if (event.type === 'step_finish' || part.type === 'step-finish') {
          if (typeof part.cost === 'number') {
            totalCostUsd += part.cost
          }
          return
        }

        if (part.type === 'tool') {
          const toolName = part.tool ?? 'unknown'
          const toolCallId = part.callID ?? part.id ?? `opencode-${Date.now()}`
          const input = part.state?.input ?? {}

          const toolCall: PrintModeToolCall = {
            type: 'tool_call',
            toolName,
            toolCallId,
            // Non-object inputs are wrapped so `input` is always a record.
            input:
              input && typeof input === 'object'
                ? (input as Record<string, unknown>)
                : { input },
          }
          steps.push(toolCall)

          // Only emit a result when the event actually carries an output key.
          if (part.state && 'output' in part.state) {
            const toolResult: PrintModeToolResult = {
              type: 'tool_result',
              toolName,
              toolCallId,
              output: [
                {
                  type: 'json',
                  value: toJsonValue(part.state.output ?? ''),
                },
              ],
            }
            steps.push(toolResult)
          }
        }
      }

      // Parses one stdout line; lines that fail to parse or process are kept
      // verbatim as text steps so no output is lost.
      const processLine = (line: string) => {
        if (!line.trim()) {
          return
        }

        try {
          processEvent(JSON.parse(line))
        } catch {
          steps.push({ type: 'text', text: line })
        }
      }

      child.stdout.on('data', (chunk: string) => {
        stdoutBuffer += chunk

        // Everything before the last newline is complete; keep the remainder
        // buffered until more data (or `close`) arrives.
        const lines = stdoutBuffer.split('\n')
        stdoutBuffer = lines.pop() ?? ''
        for (const line of lines) {
          processLine(line)
        }
      })

      child.stderr.on('data', (chunk: string) => {
        stderr += chunk
        process.stderr.write(chunk)
      })

      child.on('error', (error) => {
        spawnFailed = true
        reject(
          new Error(
            `OpenCode CLI failed to start: ${error.message}. Make sure 'opencode' is installed and in PATH.`,
          ),
        )
      })

      child.on('close', (code, signal) => {
        if (spawnFailed) {
          // Already rejected by the `error` handler; nothing ran, so there is
          // no output to flush and no diff to collect.
          return
        }

        // Flush a final line that was not newline-terminated.
        if (stdoutBuffer.trim()) {
          processLine(stdoutBuffer)
        }

        let diff = ''
        try {
          // Stage everything first so newly created files appear in the diff.
          execSync('git add .', { cwd: this.cwd, stdio: 'ignore' })
          diff = execSync('git diff HEAD', {
            cwd: this.cwd,
            encoding: 'utf-8',
            maxBuffer: 10 * 1024 * 1024,
          })
        } catch {
          // Ignore git errors
        }

        if (code === null) {
          // A null exit code means the process was terminated by a signal.
          reject(
            new Error(
              `OpenCode CLI was terminated by signal ${signal}. stderr: ${stderr}`,
            ),
          )
          return
        }

        if (code !== 0) {
          reject(
            new Error(
              `OpenCode CLI exited with code ${code}. stderr: ${stderr}`,
            ),
          )
          return
        }

        if (openCodeError) {
          reject(new Error(openCodeError))
          return
        }

        resolve({
          steps,
          totalCostUsd,
          diff,
        })
      })
    })
  }
}
4 changes: 2 additions & 2 deletions freebuff/e2e/tests/agent-startup.e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ describe('Freebuff: Agent-driven E2E', () => {

expect(result.output.type).not.toBe('error')

// Verify the agent used the tmux tools
// Verify the agent exercised the startup path. The afterEach cleanup
// handles stopping Freebuff deterministically if the agent finishes early.
const toolCalls = events.filter((e) => e.type === 'tool_call')
const toolNames = toolCalls.map((e) => e.toolName)
expect(toolNames).toContain('start_freebuff')
expect(toolNames).toContain('capture_freebuff_output')
expect(toolNames).toContain('stop_freebuff')
},
AGENT_TEST_TIMEOUT,
)
Expand Down
Loading