Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion apps/cli/src/commands/results/eval-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,15 @@
* All handlers accept a `cwd` (project root) to resolve paths against.
* The module spawns `bun apps/cli/src/cli.ts eval run ...` and tracks
* process state in memory.
*
* Stdout/stderr are also persisted to `<outputDir>/console.log` so that
* RunDetail can show the full captured log after the in-memory buffers are
* pruned. The static log file is served by the run-log routes registered in
* `serve.ts`, which also uses `getActiveRunStatus`/`getActiveRunTarget` to
* cross-reference tracked runs by their index.jsonl path.
*/

import { type ChildProcess, execFileSync, spawn } from 'node:child_process';
import { existsSync } from 'node:fs';
import { type WriteStream, createWriteStream, existsSync, mkdirSync } from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { listTargetNames, readTargetDefinitions } from '@agentv/core';
Expand Down Expand Up @@ -80,6 +85,21 @@ export function getActiveRunTarget(indexJsonlPath: string): string | undefined {
return undefined;
}

/**
 * Report the in-memory status of a Studio-launched run, identified by the
 * path of its `index.jsonl` file.
 *
 * A tracked run matches when joining its `outputDir` with `index.jsonl`
 * yields exactly `indexJsonlPath`; runs without an `outputDir` never match.
 * handleRuns uses the result to render a spinner for active runs in the
 * RunList instead of a misleading red ✗ derived from a 0 pass-rate.
 *
 * @param indexJsonlPath Path of the run's index.jsonl, compared by string equality.
 * @returns 'starting' | 'running' | 'finished' | 'failed' for the first
 *   matching tracked run, or undefined when no tracked run matches.
 */
export function getActiveRunStatus(indexJsonlPath: string): StudioRun['status'] | undefined {
  const tracked = Array.from(activeRuns.values()).find((candidate) => {
    const dir = candidate.outputDir;
    if (!dir) return false;
    return path.join(dir, 'index.jsonl') === indexJsonlPath;
  });
  return tracked?.status;
}

// ── Discover targets file from project root ──────────────────────────────

async function discoverTargetsInProject(cwd: string): Promise<readonly string[]> {
Expand Down Expand Up @@ -259,6 +279,33 @@ function isCommandAvailable(cmd: string): boolean {
}
}

/**
 * Create `<outputDir>/console.log` (truncating any existing file) and return
 * a write stream for persisting the spawned eval process's combined
 * stdout/stderr.
 *
 * The directory is created recursively first. If directory creation or stream
 * construction throws, `undefined` is returned and callers fall back to the
 * in-memory buffers only.
 *
 * The file is what RunDetail's "Console Log" section shows once the run has
 * completed; the `stdout`/`stderr` buffers on `StudioRun` stay capped and are
 * used for live status polling.
 *
 * @param outputDir Directory that should contain the console.log file.
 * @returns The open stream, or undefined when setup failed synchronously.
 */
function openConsoleLogStream(outputDir: string): WriteStream | undefined {
  // Capture is best-effort: errors emitted later by the stream (for example
  // the output dir being removed by a test teardown) are deliberately dropped
  // so they never surface as unhandled errors and fail unrelated tests.
  const discardStreamError = (): void => {};

  try {
    mkdirSync(outputDir, { recursive: true });
    const logFile = path.join(outputDir, 'console.log');
    const sink = createWriteStream(logFile, { flags: 'w' });
    sink.on('error', discardStreamError);
    return sink;
  } catch {
    return undefined;
  }
}

// ── Route registration ───────────────────────────────────────────────────

// biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route
Expand Down Expand Up @@ -366,7 +413,10 @@ export function registerEvalRoutes(
run.process = child;
run.status = 'running';

const logStream = openConsoleLogStream(outputDir);

child.stdout?.on('data', (chunk: Buffer) => {
logStream?.write(chunk);
run.stdout += chunk.toString();
// Cap buffer at 100KB
if (run.stdout.length > 100_000) {
Expand All @@ -375,6 +425,7 @@ export function registerEvalRoutes(
});

child.stderr?.on('data', (chunk: Buffer) => {
logStream?.write(chunk);
run.stderr += chunk.toString();
if (run.stderr.length > 100_000) {
run.stderr = run.stderr.slice(-80_000);
Expand All @@ -386,6 +437,7 @@ export function registerEvalRoutes(
run.status = code === 0 ? 'finished' : 'failed';
run.finishedAt = new Date().toISOString();
run.process = undefined;
logStream?.end();
pruneFinishedRuns();
});

Expand All @@ -394,6 +446,8 @@ export function registerEvalRoutes(
run.stderr += `\nProcess error: ${err.message}`;
run.finishedAt = new Date().toISOString();
run.process = undefined;
logStream?.write(`\nProcess error: ${err.message}\n`);
logStream?.end();
});

return c.json(
Expand Down Expand Up @@ -574,11 +628,15 @@ export function registerEvalRoutes(
run.process = child;
run.status = 'running';

const logStream = openConsoleLogStream(outputDir);

child.stdout?.on('data', (chunk: Buffer) => {
logStream?.write(chunk);
run.stdout += chunk.toString();
if (run.stdout.length > 100_000) run.stdout = run.stdout.slice(-80_000);
});
child.stderr?.on('data', (chunk: Buffer) => {
logStream?.write(chunk);
run.stderr += chunk.toString();
if (run.stderr.length > 100_000) run.stderr = run.stderr.slice(-80_000);
});
Expand All @@ -587,13 +645,16 @@ export function registerEvalRoutes(
run.status = code === 0 ? 'finished' : 'failed';
run.finishedAt = new Date().toISOString();
run.process = undefined;
logStream?.end();
pruneFinishedRuns();
});
child.on('error', (err) => {
run.status = 'failed';
run.stderr += `\nProcess error: ${err.message}`;
run.finishedAt = new Date().toISOString();
run.process = undefined;
logStream?.write(`\nProcess error: ${err.message}\n`);
logStream?.end();
});

return c.json({ id: runId, status: run.status, command }, 202);
Expand Down
29 changes: 28 additions & 1 deletion apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* - GET / — Studio SPA (React app)
* - GET /api/runs — list available run workspaces with metadata
* - GET /api/runs/:filename — load results from a specific run workspace
* - GET /api/runs/:filename/log — stream the captured console.log for a run
* - GET /api/feedback — read feedback reviews
* - POST /api/feedback — write feedback reviews
* - GET /api/benchmarks — list registered benchmarks
Expand Down Expand Up @@ -55,7 +56,7 @@ import { resolveRunManifestPath } from '../eval/result-layout.js';
import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js';
import { findRepoRoot } from '../eval/shared.js';
import { listResultFiles } from '../inspect/utils.js';
import { getActiveRunTarget, registerEvalRoutes } from './eval-runner.js';
import { getActiveRunStatus, getActiveRunTarget, registerEvalRoutes } from './eval-runner.js';
import {
loadLightweightResults,
loadManifestResults,
Expand Down Expand Up @@ -299,6 +300,10 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
} catch {
// ignore enrichment errors
}
// Surface live status for Studio-launched runs that are still starting
// or running so the RunList can render a spinner instead of the
// pass/fail dot derived from a 0% pass rate.
const liveStatus = getActiveRunStatus(m.path);
const tagsEntry = readRunTags(m.path);
return {
filename: m.filename,
Expand All @@ -313,11 +318,31 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
...(target && { target }),
...(experiment && { experiment }),
...(tagsEntry && { tags: tagsEntry.tags }),
...(liveStatus && { status: liveStatus }),
};
}),
});
}

/**
 * GET handler returning the captured console log of a run.
 *
 * The run is resolved with `findRunById`; the log is expected in a
 * `console.log` file located in the same directory as the run's `meta.path`.
 *
 * Responses:
 * - 404 JSON when the run is unknown, is a remote run, or has no console.log
 * - 500 JSON when reading the file (or building the text response) throws
 * - the whole log as a text response otherwise
 */
async function handleRunLog(c: C, { searchDir }: DataContext) {
  const requested = c.req.param('filename') ?? '';
  const meta = await findRunById(searchDir, requested);

  if (!meta) {
    return c.json({ error: 'Run not found' }, 404);
  }
  if (meta.source === 'remote') {
    return c.json({ error: 'Console log is not available for remote runs' }, 404);
  }

  // The log sits next to the run's own file.
  const runDir = path.dirname(meta.path);
  const logPath = path.join(runDir, 'console.log');
  const logPresent = existsSync(logPath);
  if (!logPresent) {
    return c.json({ error: 'Console log not found for this run' }, 404);
  }

  try {
    return c.text(readFileSync(logPath, 'utf8'));
  } catch {
    return c.json({ error: 'Failed to read console log' }, 500);
  }
}

async function handleRunDetail(c: C, { searchDir }: DataContext) {
const filename = c.req.param('filename') ?? '';
const meta = await findRunById(searchDir, filename);
Expand Down Expand Up @@ -1169,6 +1194,7 @@ export function createApp(
return handleRunTagsDelete(c, defaultCtx);
});
app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx));
app.get('/api/runs/:filename/log', (c) => handleRunLog(c, defaultCtx));
app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx));
app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx));
app.get('/api/runs/:filename/categories/:category/suites', (c) =>
Expand Down Expand Up @@ -1293,6 +1319,7 @@ export function createApp(
return withBenchmark(c, handleRunTagsDelete);
});
app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail));
app.get('/api/benchmarks/:benchmarkId/runs/:filename/log', (c) => withBenchmark(c, handleRunLog));
app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) =>
withBenchmark(c, handleRunSuites),
);
Expand Down
53 changes: 52 additions & 1 deletion apps/studio/src/components/RunDetail.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@
* Groups results by category, then by suite within each category.
* Category Breakdown is shown as a clean table with coloured pass-rate pills.
* The All Evals table shows ERR badge instead of 0% for execution errors.
*
* Also renders a collapsible "Console Log" section sourced from the run's
* captured `console.log` file (served by `/api/runs/:filename/log`). Hidden when no
* log is available — e.g. for remote runs or local runs that completed before
* the console-log capture feature shipped.
*/

import { useState } from 'react';

import { Link } from '@tanstack/react-router';

import type { EvalResult } from '~/lib/types';

import { isPassing, useStudioConfig } from '~/lib/api';
import { isPassing, useRunLog, useStudioConfig } from '~/lib/api';

import { PassRatePill } from './PassRatePill';
import { StatsCards } from './StatsCards';
Expand Down Expand Up @@ -234,6 +241,50 @@ export function RunDetail({ results, runId, benchmarkId }: RunDetailProps) {
</table>
</div>
</div>

<ConsoleLogSection runId={runId} benchmarkId={benchmarkId} />
</div>
);
}

/**
 * Collapsible "Console Log" panel rendered at the bottom of RunDetail.
 *
 * Loads the log through `useRunLog`. Renders nothing once the query has
 * settled without an error and without any log data; otherwise renders a
 * toggle button with a short status summary and, while expanded, either the
 * error message or the log text inside a scrollable <pre>.
 */
function ConsoleLogSection({ runId, benchmarkId }: { runId: string; benchmarkId?: string }) {
  const [expanded, setExpanded] = useState(false);
  const logQuery = useRunLog(runId, benchmarkId);
  const { isLoading, error } = logQuery;
  const logText = logQuery.data;

  // No log was captured (remote runs, or local runs that predate log
  // capture): fetchText maps a 404 to `null`, so hide the section entirely.
  const settledWithoutLog = !isLoading && !error && logText == null;
  if (settledWithoutLog) return null;

  // Short summary shown on the right-hand side of the toggle button.
  let summary = '';
  if (isLoading) {
    summary = 'Loading…';
  } else if (error) {
    summary = 'Failed to load';
  } else if (logText) {
    summary = `${logText.length} chars`;
  }

  const toggle = () => setExpanded((previous) => !previous);

  // Only invoked while the panel is expanded.
  const renderPanel = () => {
    if (error) {
      return (
        <div className="p-4 text-sm text-red-400">
          Failed to load console log: {(error as Error).message}
        </div>
      );
    }
    return (
      <pre className="max-h-[480px] overflow-auto whitespace-pre-wrap break-words p-4 font-mono text-xs leading-relaxed text-gray-200">
        {logText ?? ''}
      </pre>
    );
  };

  return (
    <div>
      <button
        type="button"
        onClick={toggle}
        className="flex w-full items-center justify-between rounded-lg border border-gray-800 bg-gray-900/50 px-4 py-2 text-left text-sm font-medium text-gray-300 transition-colors hover:bg-gray-900"
        aria-expanded={expanded}
      >
        <span className="flex items-center gap-2">
          <span aria-hidden="true">{expanded ? '▾' : '▸'}</span>
          Console Log
        </span>
        <span className="text-xs text-gray-500">{summary}</span>
      </button>
      {expanded && (
        <div className="mt-2 overflow-hidden rounded-lg border border-gray-800 bg-black">
          {renderPanel()}
        </div>
      )}
    </div>
  );
}
27 changes: 21 additions & 6 deletions apps/studio/src/components/RunList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
* Displays all available runs with a pass/fail status dot, human-readable name,
* source badge, date, test count, and coloured pass-rate pill.
* Clicking a row navigates to the run detail view.
*
* In-progress runs (status `starting` / `running`, surfaced by the backend
* via the RunMeta `status` field while a Studio-launched run is still
* tracked in-memory) render a pulsing cyan dot instead of the pass/fail
* dot — otherwise a 0% pass rate during the warm-up window would show as
* a misleading red ✗.
*/

import type React from 'react';
Expand Down Expand Up @@ -82,18 +88,27 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) {
{runs.map((run) => {
const ts = formatDate(run.timestamp);
const passing = run.pass_rate >= passThreshold;
const isActive = run.status === 'starting' || run.status === 'running';
const label = formatRunLabel(run);
const passedCount = Math.round(run.pass_rate * run.test_count);
const failedCount = run.test_count - passedCount;
return (
<tr key={run.filename} className="transition-colors hover:bg-gray-900/30">
{/* Status dot */}
{/* Status dot — spinner for active runs, otherwise pass/fail */}
<td className="px-4 py-3 text-center">
<span
className={`text-base font-bold ${passing ? 'text-emerald-400' : 'text-red-400'}`}
>
{passing ? '✓' : '✗'}
</span>
{isActive ? (
<span
className="inline-block h-2 w-2 animate-pulse rounded-full bg-cyan-400"
title={run.status === 'starting' ? 'Starting…' : 'Running…'}
aria-label={run.status === 'starting' ? 'Starting' : 'Running'}
/>
) : (
<span
className={`text-base font-bold ${passing ? 'text-emerald-400' : 'text-red-400'}`}
>
{passing ? '✓' : '✗'}
</span>
)}
</td>

{/* Run name */}
Expand Down
32 changes: 32 additions & 0 deletions apps/studio/src/lib/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,21 @@ async function fetchJson<T>(url: string): Promise<T> {
return res.json() as Promise<T>;
}

/**
 * GET `url` and return the response body as text.
 *
 * A 404 is reported as `null` rather than an error, so callers can represent
 * "no log captured" without exception handling. The RunDetail console log
 * viewer relies on this for remote runs and for runs that have no console.log
 * on disk.
 *
 * @param url Endpoint to request.
 * @returns The body text, or `null` when the server answers 404.
 * @throws Error with message `API error: <status> <statusText>` for any other
 *   non-OK response.
 */
async function fetchText(url: string): Promise<string | null> {
  const response = await fetch(url);
  const { status } = response;

  if (status === 404) {
    return null;
  }
  if (response.ok) {
    return response.text();
  }
  throw new Error(`API error: ${status} ${response.statusText}`);
}

// ── Query option factories ──────────────────────────────────────────────

export const runListOptions = queryOptions({
Expand All @@ -58,6 +73,23 @@ export function runDetailOptions(filename: string) {
});
}

/**
 * Build the query options for a run's console log.
 *
 * The request goes to the benchmark-scoped `/runs/<filename>/log` endpoint
 * when `benchmarkId` is given and to `/api/runs/<filename>/log` otherwise;
 * `filename` is URI-encoded in both cases. The query is disabled for an empty
 * `filename`.
 *
 * @param filename Run identifier used in the URL and the query key.
 * @param benchmarkId Optional benchmark scope; also part of the query key
 *   ('' when absent).
 */
export function runLogOptions(filename: string, benchmarkId?: string) {
  const runsBase = benchmarkId ? `${benchmarkApiBase(benchmarkId)}/runs` : '/api/runs';
  const logUrl = `${runsBase}/${encodeURIComponent(filename)}/log`;
  const hasFilename = !!filename;

  return queryOptions({
    queryKey: ['runs', filename, 'log', benchmarkId ?? ''],
    queryFn: () => fetchText(logUrl),
    enabled: hasFilename,
    // Fixed 3 s poll so output from a run that is still capturing streams
    // into the viewer; the interval does not depend on the run's state.
    refetchInterval: 3_000,
  });
}

/**
 * React hook that subscribes to a run's console log using the options built
 * by `runLogOptions`.
 *
 * @param filename Run identifier.
 * @param benchmarkId Optional benchmark scope.
 */
export function useRunLog(filename: string, benchmarkId?: string) {
  const options = runLogOptions(filename, benchmarkId);
  return useQuery(options);
}

export function runSuitesOptions(runId: string) {
return queryOptions({
queryKey: ['runs', runId, 'suites'],
Expand Down
7 changes: 7 additions & 0 deletions apps/studio/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ export interface RunMeta {
benchmark_name?: string;
/** Optional user-assigned tags from the run's sidecar tags.json. */
tags?: string[];
/**
* Live execution status. Only present for Studio-launched runs that are
* still being tracked in-memory — used to render a spinner in RunList
* instead of the pass/fail dot when pass_rate is 0 simply because no
* results have been written yet.
*/
status?: 'starting' | 'running' | 'finished' | 'failed';
}

export interface RunListResponse {
Expand Down
Loading