EntityProcess · christso · May 14, 2026 · May 14, 2026
diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts
@@ -12,10 +12,15 @@
  * All handlers accept a `cwd` (project root) to resolve paths against.
  * The module spawns `bun apps/cli/src/cli.ts eval run ...` and tracks
  * process state in memory.
+ *
+ * Stdout/stderr are also persisted to `<outputDir>/console.log` so that
+ * RunDetail can show the full captured log after the in-memory buffers are
+ * pruned. The log file is served by the run-log routes registered in
+ * `serve.ts`, which read it from the directory holding the run's `index.jsonl`.
  */
 
 import { type ChildProcess, execFileSync, spawn } from 'node:child_process';
-import { existsSync } from 'node:fs';
+import { type WriteStream, createWriteStream, existsSync, mkdirSync } from 'node:fs';
 import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import { listTargetNames, readTargetDefinitions } from '@agentv/core';
@@ -80,6 +85,21 @@ export function getActiveRunTarget(indexJsonlPath: string): string | undefined {
   return undefined;
 }
 
+/**
+ * Report the in-memory status of a Studio-launched run, keyed by the path of
+ * its index.jsonl. Yields 'starting' | 'running' | 'finished' | 'failed' for a
+ * tracked run and undefined otherwise. handleRuns attaches the value to run
+ * list entries so RunList can show active runs as in progress, not as failed.
+ */
+export function getActiveRunStatus(indexJsonlPath: string): StudioRun['status'] | undefined {
+  for (const tracked of activeRuns.values()) {
+    if (!tracked.outputDir) continue;
+    const trackedIndexPath = path.join(tracked.outputDir, 'index.jsonl');
+    if (trackedIndexPath === indexJsonlPath) return tracked.status;
+  }
+  return undefined;
+}
+
 // ── Discover targets file from project root ──────────────────────────────
 
 async function discoverTargetsInProject(cwd: string): Promise<readonly string[]> {
@@ -259,6 +279,33 @@ function isCommandAvailable(cmd: string): boolean {
   }
 }
 
+/**
+ * Create `<outputDir>/console.log` (truncating any previous file) and return a
+ * write stream that receives the spawned eval process's stdout and stderr.
+ * Yields `undefined` if `mkdirSync` or `createWriteStream` throws, in which
+ * case callers keep only the in-memory buffers.
+ *
+ * RunDetail's "Console Log" section reads this file through the run-log
+ * route, while the `stdout`/`stderr` strings on `StudioRun` stay capped for
+ * live status polling.
+ *
+ * An `error` listener that does nothing is attached so later stream failures
+ * (e.g. the output dir being removed by a test teardown) never become
+ * unhandled `error` events that fail unrelated tests.
+ */
+function openConsoleLogStream(outputDir: string): WriteStream | undefined {
+  // Best-effort capture: filesystem errors on the stream are deliberately ignored.
+  const ignoreStreamError = (): void => undefined;
+  try {
+    mkdirSync(outputDir, { recursive: true });
+    const logFile = createWriteStream(path.join(outputDir, 'console.log'), { flags: 'w' });
+    logFile.on('error', ignoreStreamError);
+    return logFile;
+  } catch {
+    return undefined;
+  }
+}
+
 // ── Route registration ───────────────────────────────────────────────────
 
 // biome-ignore lint/suspicious/noExplicitAny: Hono Context generic varies by route
@@ -366,7 +413,10 @@ export function registerEvalRoutes(
       run.process = child;
       run.status = 'running';
 
+      const logStream = openConsoleLogStream(outputDir);
+
       child.stdout?.on('data', (chunk: Buffer) => {
+        logStream?.write(chunk);
         run.stdout += chunk.toString();
         // Cap buffer at 100KB
         if (run.stdout.length > 100_000) {
@@ -375,6 +425,7 @@ export function registerEvalRoutes(
       });
 
       child.stderr?.on('data', (chunk: Buffer) => {
+        logStream?.write(chunk);
         run.stderr += chunk.toString();
         if (run.stderr.length > 100_000) {
           run.stderr = run.stderr.slice(-80_000);
@@ -386,6 +437,7 @@ export function registerEvalRoutes(
         run.status = code === 0 ? 'finished' : 'failed';
         run.finishedAt = new Date().toISOString();
         run.process = undefined;
+        logStream?.end();
         pruneFinishedRuns();
       });
 
@@ -394,6 +446,8 @@ export function registerEvalRoutes(
         run.stderr += `\nProcess error: ${err.message}`;
         run.finishedAt = new Date().toISOString();
         run.process = undefined;
+        logStream?.write(`\nProcess error: ${err.message}\n`);
+        logStream?.end();
       });
 
       return c.json(
@@ -574,11 +628,15 @@ export function registerEvalRoutes(
       run.process = child;
       run.status = 'running';
 
+      const logStream = openConsoleLogStream(outputDir);
+
       child.stdout?.on('data', (chunk: Buffer) => {
+        logStream?.write(chunk);
         run.stdout += chunk.toString();
         if (run.stdout.length > 100_000) run.stdout = run.stdout.slice(-80_000);
       });
       child.stderr?.on('data', (chunk: Buffer) => {
+        logStream?.write(chunk);
         run.stderr += chunk.toString();
         if (run.stderr.length > 100_000) run.stderr = run.stderr.slice(-80_000);
       });
@@ -587,13 +645,16 @@ export function registerEvalRoutes(
         run.status = code === 0 ? 'finished' : 'failed';
         run.finishedAt = new Date().toISOString();
         run.process = undefined;
+        logStream?.end();
         pruneFinishedRuns();
       });
       child.on('error', (err) => {
         run.status = 'failed';
         run.stderr += `\nProcess error: ${err.message}`;
         run.finishedAt = new Date().toISOString();
         run.process = undefined;
+        logStream?.write(`\nProcess error: ${err.message}\n`);
+        logStream?.end();
       });
 
       return c.json({ id: runId, status: run.status, command }, 202);

diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
@@ -9,6 +9,7 @@
  *   - GET /           — Studio SPA (React app)
  *   - GET /api/runs   — list available run workspaces with metadata
  *   - GET /api/runs/:filename — load results from a specific run workspace
+ *   - GET /api/runs/:filename/log — return the captured console.log for a run as text
  *   - GET /api/feedback  — read feedback reviews
  *   - POST /api/feedback — write feedback reviews
  *   - GET /api/benchmarks  — list registered benchmarks
@@ -55,7 +56,7 @@ import { resolveRunManifestPath } from '../eval/result-layout.js';
 import { loadRunCache, resolveRunCacheFile } from '../eval/run-cache.js';
 import { findRepoRoot } from '../eval/shared.js';
 import { listResultFiles } from '../inspect/utils.js';
-import { getActiveRunTarget, registerEvalRoutes } from './eval-runner.js';
+import { getActiveRunStatus, getActiveRunTarget, registerEvalRoutes } from './eval-runner.js';
 import {
   loadLightweightResults,
   loadManifestResults,
@@ -299,6 +300,10 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
       } catch {
         // ignore enrichment errors
       }
+      // Surface live status for Studio-launched runs that are still starting
+      // or running so the RunList can render a spinner instead of the
+      // pass/fail dot derived from a 0% pass rate.
+      const liveStatus = getActiveRunStatus(m.path);
       const tagsEntry = readRunTags(m.path);
       return {
         filename: m.filename,
@@ -313,11 +318,31 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
         ...(target && { target }),
         ...(experiment && { experiment }),
         ...(tagsEntry && { tags: tagsEntry.tags }),
+        ...(liveStatus && { status: liveStatus }),
       };
     }),
   });
 }
 
+/**
+ * Serve a local run's captured `console.log` as text; 404 if unavailable, 500 on read error.
+ */
+async function handleRunLog(c: C, { searchDir }: DataContext) {
+  const runId = c.req.param('filename') ?? '';
+  const meta = await findRunById(searchDir, runId);
+  if (!meta) return c.json({ error: 'Run not found' }, 404);
+  if (meta.source === 'remote') {
+    return c.json({ error: 'Console log is not available for remote runs' }, 404);
+  }
+  const logPath = path.join(path.dirname(meta.path), 'console.log');
+  if (!existsSync(logPath)) return c.json({ error: 'Console log not found for this run' }, 404);
+  try {
+    return c.text(readFileSync(logPath, 'utf8'));
+  } catch {
+    return c.json({ error: 'Failed to read console log' }, 500);
+  }
+}
+
 async function handleRunDetail(c: C, { searchDir }: DataContext) {
   const filename = c.req.param('filename') ?? '';
   const meta = await findRunById(searchDir, filename);
@@ -1169,6 +1194,7 @@ export function createApp(
     return handleRunTagsDelete(c, defaultCtx);
   });
   app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx));
+  app.get('/api/runs/:filename/log', (c) => handleRunLog(c, defaultCtx));
   app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx));
   app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx));
   app.get('/api/runs/:filename/categories/:category/suites', (c) =>
@@ -1293,6 +1319,7 @@ export function createApp(
     return withBenchmark(c, handleRunTagsDelete);
   });
   app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail));
+  app.get('/api/benchmarks/:benchmarkId/runs/:filename/log', (c) => withBenchmark(c, handleRunLog));
   app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) =>
     withBenchmark(c, handleRunSuites),
   );

diff --git a/apps/studio/src/components/RunDetail.tsx b/apps/studio/src/components/RunDetail.tsx
@@ -4,13 +4,20 @@
  * Groups results by category, then by suite within each category.
  * Category Breakdown is shown as a clean table with coloured pass-rate pills.
  * The All Evals table shows ERR badge instead of 0% for execution errors.
+ *
+ * Also renders a collapsible "Console Log" section sourced from the run's
+ * captured `console.log` file (served by `/api/runs/:id/log`). Hidden when no
+ * log is available — e.g. for remote runs or local runs that completed before
+ * the console-log capture feature shipped.
  */
 
+import { useState } from 'react';
+
 import { Link } from '@tanstack/react-router';
 
 import type { EvalResult } from '~/lib/types';
 
-import { isPassing, useStudioConfig } from '~/lib/api';
+import { isPassing, useRunLog, useStudioConfig } from '~/lib/api';
 
 import { PassRatePill } from './PassRatePill';
 import { StatsCards } from './StatsCards';
@@ -234,6 +241,50 @@ export function RunDetail({ results, runId, benchmarkId }: RunDetailProps) {
           </table>
         </div>
       </div>
+
+      <ConsoleLogSection runId={runId} benchmarkId={benchmarkId} />
+    </div>
+  );
+}
+
+function ConsoleLogSection({ runId, benchmarkId }: { runId: string; benchmarkId?: string }) {
+  const [open, setOpen] = useState(false);
+  const { data: log, isLoading, error } = useRunLog(runId, benchmarkId);
+
+  // Render nothing when no log was captured (remote runs, or local runs from
+  // before this feature shipped): fetchText maps a 404 to `null`. `== null`
+  // also matches `undefined`, so a settled query without data hides it too.
+  if (!isLoading && !error && log == null) return null;
+
+  return (
+    <div>
+      <button
+        type="button"
+        onClick={() => setOpen((v) => !v)}
+        className="flex w-full items-center justify-between rounded-lg border border-gray-800 bg-gray-900/50 px-4 py-2 text-left text-sm font-medium text-gray-300 transition-colors hover:bg-gray-900"
+        aria-expanded={open}
+      >
+        <span className="flex items-center gap-2">
+          <span aria-hidden="true">{open ? '▾' : '▸'}</span>
+          Console Log
+        </span>
+        <span className="text-xs text-gray-500">
+          {isLoading ? 'Loading…' : error ? 'Failed to load' : log ? `${log.length} chars` : ''}
+        </span>
+      </button>
+      {open && (
+        <div className="mt-2 overflow-hidden rounded-lg border border-gray-800 bg-black">
+          {error ? (
+            <div className="p-4 text-sm text-red-400">
+              Failed to load console log: {(error as Error).message}
+            </div>
+          ) : (
+            <pre className="max-h-[480px] overflow-auto whitespace-pre-wrap break-words p-4 font-mono text-xs leading-relaxed text-gray-200">
+              {log ?? ''}
+            </pre>
+          )}
+        </div>
+      )}
     </div>
   );
 }
diff --git a/apps/studio/src/components/RunList.tsx b/apps/studio/src/components/RunList.tsx
@@ -4,6 +4,12 @@
  * Displays all available runs with a pass/fail status dot, human-readable name,
  * source badge, date, test count, and coloured pass-rate pill.
  * Clicking a row navigates to the run detail view.
+ *
+ * In-progress runs (status `starting` / `running`, surfaced by the backend
+ * via the RunMeta `status` field while a Studio-launched run is still
+ * tracked in-memory) render a pulsing cyan dot instead of the pass/fail
+ * dot — otherwise a 0% pass rate during the warm-up window would show as
+ * a misleading red ✗.
  */
 
 import type React from 'react';
@@ -82,18 +88,27 @@ export function RunList({ runs, benchmarkId, emptyMessage }: RunListProps) {
           {runs.map((run) => {
             const ts = formatDate(run.timestamp);
             const passing = run.pass_rate >= passThreshold;
+            const isActive = run.status === 'starting' || run.status === 'running';
             const label = formatRunLabel(run);
             const passedCount = Math.round(run.pass_rate * run.test_count);
             const failedCount = run.test_count - passedCount;
             return (
               <tr key={run.filename} className="transition-colors hover:bg-gray-900/30">
-                {/* Status dot */}
+                {/* Status dot — spinner for active runs, otherwise pass/fail */}
                 <td className="px-4 py-3 text-center">
-                  <span
-                    className={`text-base font-bold ${passing ? 'text-emerald-400' : 'text-red-400'}`}
-                  >
-                    {passing ? '✓' : '✗'}
-                  </span>
+                  {isActive ? (
+                    <span
+                      className="inline-block h-2 w-2 animate-pulse rounded-full bg-cyan-400"
+                      title={run.status === 'starting' ? 'Starting…' : 'Running…'}
+                      aria-label={run.status === 'starting' ? 'Starting' : 'Running'}
+                    />
+                  ) : (
+                    <span
+                      className={`text-base font-bold ${passing ? 'text-emerald-400' : 'text-red-400'}`}
+                    >
+                      {passing ? '✓' : '✗'}
+                    </span>
+                  )}
                 </td>
 
                 {/* Run name */}

diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts
@@ -42,6 +42,21 @@ async function fetchJson<T>(url: string): Promise<T> {
   return res.json() as Promise<T>;
 }
 
+/**
+ * GET a text/plain endpoint and resolve with its body.
+ *
+ * A 404 resolves to `null` instead of throwing so the RunDetail console log
+ * viewer can model "no log captured": runs that finished before this feature
+ * shipped (no console.log on disk) and remote runs. Any other non-OK status
+ * rejects with an `API error: …` Error.
+ */
+async function fetchText(url: string): Promise<string | null> {
+  const response = await fetch(url);
+  if (response.status === 404) return null;
+  if (response.ok) return response.text();
+  throw new Error(`API error: ${response.status} ${response.statusText}`);
+}
+
 // ── Query option factories ──────────────────────────────────────────────
 
 export const runListOptions = queryOptions({
@@ -58,6 +73,23 @@ export function runDetailOptions(filename: string) {
   });
 }
 
+export function runLogOptions(filename: string, benchmarkId?: string) {
+  const runsBase = benchmarkId ? `${benchmarkApiBase(benchmarkId)}/runs` : '/api/runs';
+  const encodedName = encodeURIComponent(filename);
+  return queryOptions({
+    queryKey: ['runs', filename, 'log', benchmarkId ?? ''],
+    queryFn: () => fetchText(`${runsBase}/${encodedName}/log`),
+    enabled: !!filename,
+    // Poll every 3s so output from a still-running eval keeps streaming in.
+    // NOTE(review): the interval is unconditional, so finished runs poll too.
+    refetchInterval: 3_000,
+  });
+}
+/** React Query hook for a run's captured console log, configured by `runLogOptions`. */
+export function useRunLog(filename: string, benchmarkId?: string) {
+  return useQuery(runLogOptions(filename, benchmarkId));
+}
+
 export function runSuitesOptions(runId: string) {
   return queryOptions({
     queryKey: ['runs', runId, 'suites'],

diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts
@@ -21,6 +21,13 @@ export interface RunMeta {
   benchmark_name?: string;
   /** Optional user-assigned tags from the run's sidecar tags.json. */
   tags?: string[];
+  /**
+   * Live execution status. Only present for Studio-launched runs that are
+   * still being tracked in-memory — used to render a spinner in RunList
+   * instead of the pass/fail dot when pass_rate is 0 simply because no
+   * results have been written yet.
+   */
+  status?: 'starting' | 'running' | 'finished' | 'failed';
 }
 
 export interface RunListResponse {