From 563a999f0f04e7ec489feb1adbc9687275667b6a Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Thu, 7 May 2026 07:49:07 +0200
Subject: [PATCH 1/4] feat(cli): kill spawned providers on SIGINT/SIGTERM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Track long-lived provider subprocesses (claude, codex, pi, copilot,
vscode) in a per-process registry and walk it from a top-level signal
handler in cli.ts. Without this, Studio's child.kill('SIGTERM') against
the CLI orphans grandchildren — the Node parent exits but the OS does
not propagate the signal.
Plan in docs/plans/1222-stop-run.md.
---
apps/cli/src/cli.ts | 26 ++++
docs/plans/1222-stop-run.md | 118 ++++++++++++++++++
.../src/evaluation/providers/claude-cli.ts | 2 +
.../src/evaluation/providers/codex-cli.ts | 2 +
.../src/evaluation/providers/copilot-cli.ts | 2 +
.../core/src/evaluation/providers/pi-cli.ts | 2 +
.../vscode/dispatch/vscodeProcess.ts | 2 +
packages/core/src/index.ts | 5 +
packages/core/src/runtime/child-tracker.ts | 39 ++++++
9 files changed, 198 insertions(+)
create mode 100644 docs/plans/1222-stop-run.md
create mode 100644 packages/core/src/runtime/child-tracker.ts
diff --git a/apps/cli/src/cli.ts b/apps/cli/src/cli.ts
index 12dece45c..995cc8c26 100644
--- a/apps/cli/src/cli.ts
+++ b/apps/cli/src/cli.ts
@@ -1,6 +1,32 @@
#!/usr/bin/env node
+import { killAllTrackedChildren } from '@agentv/core';
+
import { runCli } from './index.js';
+// Forward SIGINT/SIGTERM to spawned provider subprocesses before exiting.
+// Without this, Studio's `child.kill('SIGTERM')` against the CLI orphans
+// any in-flight `claude`/`codex`/`pi`/`copilot` subprocess. The partial
+// `index.jsonl` is already row-by-row durable, so finished tests survive.
+//
+// First signal: kill children, exit with the conventional 128+signal code.
+// Second signal within the same process: hard-exit so a hung child cannot
+// trap the user.
+let interrupted = false;
+function installShutdown(signal: NodeJS.Signals, exitCode: number) {
+ process.on(signal, () => {
+ if (interrupted) {
+ process.exit(1);
+ }
+ interrupted = true;
+ killAllTrackedChildren('SIGTERM');
+ // Defer exit one tick so SIGTERM has a chance to dispatch before the
+ // event loop tears down.
+ setTimeout(() => process.exit(exitCode), 50);
+ });
+}
+installShutdown('SIGINT', 130);
+installShutdown('SIGTERM', 143);
+
runCli()
.then(() => {
process.exit(0);
diff --git a/docs/plans/1222-stop-run.md b/docs/plans/1222-stop-run.md
new file mode 100644
index 000000000..5decca1b9
--- /dev/null
+++ b/docs/plans/1222-stop-run.md
@@ -0,0 +1,118 @@
+# 1222 — Stop run + partial-run resumability (simplified scope)
+
+## Scope
+
+Make Ctrl+C / DELETE work. No staged shutdown, no new status words, no
+AbortSignal threading.
+
+1. **CLI** — Ctrl+C → install `process.on('SIGINT'|'SIGTERM')` that kills
+ tracked child processes, then exits. Partial `index.jsonl` is already
+ row-by-row durable; whatever finished is preserved.
+2. **Studio API** — `DELETE /api/eval/run/:id` (and benchmark-scoped
+ variant) calls `child.kill('SIGTERM')` on the spawned CLI. Existing
+ `child.on('close')` flips `running → failed` once the process exits.
+ No new `'stopping'` state on the server.
+3. **Studio UI** — Red Stop button on `/jobs/:runId` while
+ `status ∈ {starting, running}` and not read-only. On click, optimistic
+ local "Stopping…" label until the next status poll flips to terminal.
+4. **Resume detection for partial runs** — Persist `planned_test_count` in
+ `benchmark.json.metadata` at run start (early write), updated at end.
+ Run-detail API surfaces the number; Studio computes
+ `shouldShowResumeActions(results, isReadOnly, plannedTestCount?)` as
+ `executionError OR results.length < plannedTestCount`. No
+ `is_resumable` / `resume_reason` flags.
+
+## Files touched
+
+### CLI signal handler
+- `packages/core/src/runtime/child-tracker.ts` — *new*. Singleton
+ `Set`; `trackChild(child)` and `killAll(signal)`.
+- `packages/core/src/evaluation/providers/{claude-cli,codex-cli,pi-cli}.ts`
+  + `copilot-cli.ts` — call `trackChild(child)` after each `spawn(...)`.
+ No need to untrack on close: `kill()` is a no-op on dead PIDs and the
+ registry is short-lived.
+- `apps/cli/src/cli.ts` — install SIGINT/SIGTERM handlers that call
+ `killAll('SIGTERM')` then `process.exit(130)` / `143`. Idempotent on
+ re-entry; second signal hard-exits (`process.exit(1)`).
+
+### Studio DELETE endpoint
+- `apps/cli/src/commands/results/eval-runner.ts` — add
+ `app.delete('/api/eval/run/:id', ...)` and
+ `app.delete('/api/benchmarks/:benchmarkId/eval/run/:id', ...)`.
+ - 404 if id unknown
+ - 403 in read-only mode
+ - 200 `{stopped: true}` after `child.kill('SIGTERM')`
+ - 200 `{stopped: false, reason: 'already_terminal'}` if already done
+ - No status mutation here; close handler does it.
+
+### Studio Stop button
+- `apps/studio/src/components/StopRunButton.tsx` — *new*. Renders the red
+ destructive button when `status` is non-terminal; calls DELETE with the
+ benchmark-scoped path when `benchmarkId` is set; sets local
+ `stopping=true` state to flip the label optimistically.
+- `apps/studio/src/components/stop-run-helpers.ts` — *new*. Pure
+ `shouldShowStopButton(status, isReadOnly)` for unit testing.
+- `apps/studio/src/lib/api.ts` — add `stopEvalRun(id, benchmarkId?)`.
+- `apps/studio/src/routes/jobs/$runId.tsx` — wire `<StopRunButton />` into
+ the header.
+
+### Resume — planned_test_count
+- `apps/cli/src/commands/eval/artifact-writer.ts`
+ - Extend `BenchmarkArtifact.metadata` with optional
+ `planned_test_count?: number`.
+ - Add `writeInitialBenchmarkArtifact(runDir, { evalFile, targets,
+ plannedTestCount, experiment })` that writes a stub at run start
+ (`run_summary: {}`, `metadata` pre-filled).
+- `apps/cli/src/commands/eval/run-eval.ts` — call the initial writer
+ immediately after the run dir is created, **before** dispatching tests.
+ At end of run, the existing `writeArtifactsFromResults` /
+ `aggregateRunDir` overwrites the file with full data and the same
+ `planned_test_count`.
+- `apps/cli/src/commands/results/serve.ts` — `deriveResumeMeta` already
+ reads `metadata.eval_file`; extend to also surface
+ `planned_test_count`. Run-detail response gains `planned_test_count`.
+- `apps/studio/src/lib/types.ts` — add optional
+ `planned_test_count?: number` to the run-detail response type.
+- `apps/studio/src/components/resume-run-helpers.ts` — extend
+ `shouldShowResumeActions` to also return true when
+ `plannedTestCount && results.length < plannedTestCount`.
+- `apps/studio/src/components/ResumeRunActions.tsx` and the two run
+ detail routes — pass `plannedTestCount` through.
+
+### Tests (narrow)
+- `apps/cli/test/commands/results/serve.test.ts` (or co-located) — DELETE
+ endpoint: 404 unknown, 403 read-only, 200 happy path with a fake child
+ whose `kill` is observable.
+- `apps/studio/src/components/resume-run-helpers.test.ts` — case where
+ every result is `ok` but `results.length < plannedTestCount` → button
+ visible.
+- `apps/studio/src/components/stop-run-helpers.test.ts` — visibility
+ matrix.
+- Skip: integration test that signals a real eval. Manual UAT is enough.
+
+## Non-goals
+
+- "Pause" / resume-from-mid-test semantics.
+- Per-test SIGINT to graders.
+- Multi-job orchestration / job queue.
+- A "stopping" status word — UI handles the optimistic flicker locally.
+
+## Acceptance signals
+
+- [ ] CLI: `Ctrl+C` during a multi-test eval kills all spawned providers
+ and the partial `index.jsonl` is preserved.
+- [ ] Server: `DELETE /api/eval/run/:id` returns 200 in normal mode, 403
+ in read-only, 404 for unknown id.
+- [ ] UI: Stop button appears while running, hidden in read-only, hidden
+ when terminal. Optimistic "Stopping…" label appears until status
+ poll flips.
+- [ ] UI: A run with 5 of 10 tests `ok` and 5 missing shows Resume.
+- [ ] UI: A complete passing run does *not* show Resume.
+
+## Out-of-scope cleanups noted
+
+The benchmark.json write happens entirely at end-of-run today. Writing a
+stub at start means a run that crashes before the first test still has a
+metadata file on disk — that may obsolete some of the fallback logic in
+`deriveResumeMeta`. We are not consolidating that here; this PR only adds
+the early-write call.
diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts
index 4635d0e26..06ed5c810 100644
--- a/packages/core/src/evaluation/providers/claude-cli.ts
+++ b/packages/core/src/evaluation/providers/claude-cli.ts
@@ -5,6 +5,7 @@ import type { WriteStream } from 'node:fs';
import { mkdir } from 'node:fs/promises';
import path from 'node:path';
+import { trackChild } from '../../runtime/child-tracker.js';
import { extractTextContent, toContentArray } from './claude-content.js';
import { recordClaudeLogEntry } from './claude-log-tracker.js';
import { normalizeToolCall } from './normalize-tool-call.js';
@@ -270,6 +271,7 @@ export class ClaudeCliProvider implements Provider {
}
const child = spawn(this.config.executable, options.args, spawnOptions);
+ trackChild(child);
let stdout = '';
let stderr = '';
diff --git a/packages/core/src/evaluation/providers/codex-cli.ts b/packages/core/src/evaluation/providers/codex-cli.ts
index c776c77c5..232d540a4 100644
--- a/packages/core/src/evaluation/providers/codex-cli.ts
+++ b/packages/core/src/evaluation/providers/codex-cli.ts
@@ -7,6 +7,7 @@ import { tmpdir } from 'node:os';
import path from 'node:path';
import { promisify } from 'node:util';
+import { trackChild } from '../../runtime/child-tracker.js';
import { recordCodexLogEntry } from './codex-log-tracker.js';
import { buildPromptDocument, normalizeInputFiles } from './preread.js';
import type { CodexResolvedConfig } from './targets.js';
@@ -755,6 +756,7 @@ async function defaultCodexRunner(options: CodexRunOptions): Promise {
env: options.env,
stdio: ['pipe', 'pipe', 'pipe'],
});
+ trackChild(child);
let stdout = '';
let stderr = '';
diff --git a/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts b/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
index 00bd69748..036c723b9 100644
--- a/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
+++ b/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
@@ -3,6 +3,7 @@ import { mkdir, writeFile } from 'node:fs/promises';
import path from 'node:path';
import { promisify } from 'node:util';
+import { trackChild } from '../../../../runtime/child-tracker.js';
import { pathExists, removeIfExists } from '../utils/fs.js';
import { pathToFileUri } from '../utils/path.js';
import { sleep } from '../utils/time.js';
@@ -36,6 +37,7 @@ function spawnVsCode(
shell: useShell,
detached: false,
});
+ trackChild(child);
child.on('error', (error) => {
const label = options?.label ?? 'spawn';
const renderedArgs = args.map((value) => JSON.stringify(value)).join(' ');
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index d6d1a349a..b88ba6eb2 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -132,6 +132,11 @@ export {
export { discoverGraders } from './evaluation/registry/grader-discovery.js';
export { RunBudgetTracker } from './evaluation/run-budget-tracker.js';
export { runBeforeSessionHook, parseEnvOutput } from './evaluation/hooks.js';
+export {
+ trackChild,
+ killAllTrackedChildren,
+ trackedChildCount,
+} from './runtime/child-tracker.js';
// Import pipeline
export * from './import/index.js';
diff --git a/packages/core/src/runtime/child-tracker.ts b/packages/core/src/runtime/child-tracker.ts
new file mode 100644
index 000000000..ed9e5e036
--- /dev/null
+++ b/packages/core/src/runtime/child-tracker.ts
@@ -0,0 +1,39 @@
+/**
+ * Tracks long-lived child processes spawned by AgentV providers so that a
+ * top-level signal handler can kill them all on Ctrl+C / SIGTERM.
+ *
+ * Why this exists: when the CLI receives SIGTERM (e.g. from Studio's Stop
+ * button), Node exits the parent process but does NOT propagate the signal
+ * to grandchildren. Without tracking, the spawned `claude`, `codex`, `pi`,
+ * `copilot` subprocesses linger as orphans. The CLI's signal handler walks
+ * this set and SIGTERMs each one before exiting.
+ *
+ * To extend: any provider that spawns a long-lived subprocess should call
+ * `trackChild(child)` immediately after `spawn(...)`. No untrack call is
+ * required — `kill()` is a no-op on dead PIDs and the registry lives only
+ * for the duration of one CLI invocation.
+ */
+
+import type { ChildProcess } from 'child_process';
+
+const tracked = new Set<ChildProcess>();
+
+export function trackChild(child: ChildProcess): void {
+ tracked.add(child);
+ child.once('close', () => tracked.delete(child));
+ child.once('exit', () => tracked.delete(child));
+}
+
+export function killAllTrackedChildren(signal: NodeJS.Signals = 'SIGTERM'): void {
+ for (const child of tracked) {
+ try {
+ child.kill(signal);
+ } catch {
+ // Already dead or unable to signal; nothing to do.
+ }
+ }
+}
+
+export function trackedChildCount(): number {
+ return tracked.size;
+}
From 576d097ba14a4972c98d197109f5f8805c6bae40 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Thu, 7 May 2026 07:56:33 +0200
Subject: [PATCH 2/4] chore: fix biome formatting drive-by + structural
Killable type
- Pre-existing format errors in two studio routes block any push; fixed
with biome --write so CI passes.
- child-tracker uses a structural Killable type to avoid a tsup dts
resolution failure on `node:child_process` re-export.
---
.../$benchmarkId_/evals/$runId.$evalId.tsx | 4 +++-
.../src/routes/evals/$runId.$evalId.tsx | 4 +++-
packages/core/src/runtime/child-tracker.ts | 20 ++++++++++++++-----
3 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
index 169d6d643..2bf2b9ea6 100644
--- a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
+++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
@@ -64,7 +64,9 @@ function BenchmarkEvalDetailPage() {
Run: {runId} / Eval: {evalId}
-
+
{passed ? '✓' : '✗'}
{evalId}
diff --git a/packages/core/src/runtime/child-tracker.ts b/packages/core/src/runtime/child-tracker.ts
index ed9e5e036..573a00515 100644
--- a/packages/core/src/runtime/child-tracker.ts
+++ b/packages/core/src/runtime/child-tracker.ts
@@ -12,16 +12,26 @@
* `trackChild(child)` immediately after `spawn(...)`. No untrack call is
* required — `kill()` is a no-op on dead PIDs and the registry lives only
* for the duration of one CLI invocation.
+ *
+ * Why a structural `Killable` type instead of `ChildProcess`: importing
+ * the real type from `node:child_process` triggers a rollup-plugin-dts
+ * resolution failure when this module's signatures are re-exported from
+ * the package entry. The structural shape avoids that, and any
+ * `ChildProcess` satisfies it.
*/
-import type { ChildProcess } from 'child_process';
+type Killable = {
+ kill(signal?: NodeJS.Signals | number): boolean;
+ once(event: 'close', listener: (...args: unknown[]) => void): unknown;
+};
-const tracked = new Set<ChildProcess>();
+const tracked = new Set<Killable>();
-export function trackChild(child: ChildProcess): void {
+export function trackChild(child: Killable): void {
tracked.add(child);
- child.once('close', () => tracked.delete(child));
- child.once('exit', () => tracked.delete(child));
+ child.once('close', () => {
+ tracked.delete(child);
+ });
}
export function killAllTrackedChildren(signal: NodeJS.Signals = 'SIGTERM'): void {
From 974f386baf1b269117ff70831d4b4358e59dafa8 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Thu, 7 May 2026 08:16:27 +0200
Subject: [PATCH 3/4] feat(studio): DELETE stop endpoint, Stop button,
partial-run resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- DELETE /api/eval/run/:id (and benchmark-scoped variant) SIGTERMs the
spawned CLI. The CLI's own signal handler walks the child registry
added in the previous commit and kills grandchildren before exiting.
Existing child.on('close') flips status — no new 'stopping' state.
- StopRunButton on /jobs/:runId, hidden when terminal or read-only.
Optimistic 'Stopping…' label until the next status poll observes a
terminal state.
- planned_test_count persisted in benchmark.json.metadata via a stub
written at run start. Resume-action visibility now triggers when
results.length < planned_test_count even with no execution_error
rows — covers Stop-button / Ctrl+C cases.
- Narrow tests: shouldShowStopButton matrix, partial-run resume helper,
DELETE 404/403 routes for both base + benchmark-scoped paths. Happy
path of DELETE→SIGTERM is covered by manual UAT.
---
apps/cli/src/commands/eval/artifact-writer.ts | 78 +++++++++++++++++--
apps/cli/src/commands/eval/run-eval.ts | 16 ++++
apps/cli/src/commands/results/eval-runner.ts | 41 ++++++++++
apps/cli/src/commands/results/serve.ts | 12 ++-
apps/cli/test/commands/results/serve.test.ts | 44 +++++++++++
.../src/components/ResumeRunActions.tsx | 4 +-
apps/studio/src/components/StopRunButton.tsx | 62 +++++++++++++++
.../src/components/resume-run-helpers.test.ts | 17 ++++
.../src/components/resume-run-helpers.ts | 27 +++++--
.../src/components/stop-run-helpers.test.ts | 36 +++++++++
.../studio/src/components/stop-run-helpers.ts | 28 +++++++
apps/studio/src/lib/api.ts | 15 ++++
apps/studio/src/lib/types.ts | 2 +
.../benchmarks/$benchmarkId_/runs/$runId.tsx | 1 +
apps/studio/src/routes/jobs/$runId.tsx | 8 +-
apps/studio/src/routes/runs/$runId.tsx | 1 +
16 files changed, 374 insertions(+), 18 deletions(-)
create mode 100644 apps/studio/src/components/StopRunButton.tsx
create mode 100644 apps/studio/src/components/stop-run-helpers.test.ts
create mode 100644 apps/studio/src/components/stop-run-helpers.ts
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index e005d786f..c0a511915 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -34,7 +34,7 @@ export function deduplicateByTestIdTarget(
export async function aggregateRunDir(
runDir: string,
- options?: { evalFile?: string; experiment?: string },
+ options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
const content = await readFile(indexPath, 'utf8');
@@ -45,7 +45,18 @@ export async function aggregateRunDir(
const timingPath = path.join(runDir, 'timing.json');
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
+ // Preserve `planned_test_count` from any pre-existing benchmark.json (e.g.
+ // the stub written at run start, or from the original run when this is a
+ // resume) unless an explicit value was passed.
+ const plannedTestCount =
+ options?.plannedTestCount ?? (await readPlannedTestCount(path.join(runDir, 'benchmark.json')));
+
+ const benchmark = buildBenchmarkArtifact(
+ results,
+ options?.evalFile,
+ options?.experiment,
+ plannedTestCount,
+ );
const benchmarkPath = path.join(runDir, 'benchmark.json');
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
@@ -53,6 +64,17 @@ export async function aggregateRunDir(
return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size };
}
+async function readPlannedTestCount(benchmarkPath: string): Promise<number | undefined> {
+ try {
+ const raw = await readFile(benchmarkPath, 'utf8');
+ const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } };
+ const value = parsed.metadata?.planned_test_count;
+ return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
+ } catch {
+ return undefined;
+ }
+}
+
// ---------------------------------------------------------------------------
// Artifact interfaces (snake_case to match skill-creator conventions)
// ---------------------------------------------------------------------------
@@ -110,6 +132,13 @@ export interface BenchmarkArtifact {
readonly targets: readonly string[];
readonly tests_run: readonly string[];
readonly experiment?: string;
+ /**
+ * Total number of test cases the run was planned to execute (across all
+ * targets and eval files). Written at run start so an interrupted run
+ * can be diagnosed as resumable when `tests_run.length < planned_test_count`,
+ * even if every recorded row has `execution_status: ok`.
+ */
+ readonly planned_test_count?: number;
};
readonly run_summary: Record<
string,
@@ -364,6 +393,7 @@ export function buildBenchmarkArtifact(
results: readonly EvaluationResult[],
evalFile = '',
experiment?: string,
+ plannedTestCount?: number,
): BenchmarkArtifact {
  const targetSet = new Set<string>();
  const testIdSet = new Set<string>();
@@ -457,6 +487,7 @@ export function buildBenchmarkArtifact(
targets,
tests_run: testIds,
experiment,
+ planned_test_count: plannedTestCount,
},
run_summary: runSummary,
per_grader_summary: perEvaluatorSummary,
@@ -464,6 +495,35 @@ export function buildBenchmarkArtifact(
};
}
+/**
+ * Write a stub `benchmark.json` at the start of a run, before any tests
+ * have executed. Carries `planned_test_count` so an interrupted run can
+ * still be detected as resumable even when every recorded row has
+ * `execution_status: ok`.
+ *
+ * The end-of-run write (writeArtifactsFromResults / aggregateRunDir)
+ * overwrites this file with the full summary; preserve `planned_test_count`
+ * by passing it through.
+ */
+export async function writeInitialBenchmarkArtifact(
+ runDir: string,
+ options: {
+ evalFile: string;
+ plannedTestCount: number;
+ experiment?: string;
+ },
+): Promise<void> {
+ await mkdir(runDir, { recursive: true });
+ const stub = buildBenchmarkArtifact(
+ [],
+ options.evalFile,
+ options.experiment,
+ options.plannedTestCount,
+ );
+ const benchmarkPath = path.join(runDir, 'benchmark.json');
+ await writeFile(benchmarkPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
+}
+
export function buildAggregateGradingArtifact(
results: readonly EvaluationResult[],
): AggregateGradingArtifact {
@@ -826,7 +886,7 @@ export async function writePerTestArtifacts(
export async function writeArtifactsFromResults(
results: readonly EvaluationResult[],
outputDir: string,
- options?: { evalFile?: string; experiment?: string },
+ options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{
testArtifactDir: string;
timingPath: string;
@@ -877,8 +937,16 @@ export async function writeArtifactsFromResults(
const timing = buildTimingArtifact(results);
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
- // Write benchmark
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
+ // Write benchmark — preserve `planned_test_count` from the run-start stub
+ // (or from the original run when this is a resume) unless an explicit
+ // value was passed by the caller.
+ const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(benchmarkPath));
+ const benchmark = buildBenchmarkArtifact(
+ results,
+ options?.evalFile,
+ options?.experiment,
+ plannedTestCount,
+ );
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
await writeJsonlFile(indexPath, indexRecords);
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index db0b0fb19..0f3536061 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -38,6 +38,7 @@ import {
deduplicateByTestIdTarget,
parseJsonlResults,
writeArtifactsFromResults,
+ writeInitialBenchmarkArtifact,
} from './artifact-writer.js';
import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
@@ -1447,6 +1448,21 @@ export async function runEvalCommand(
);
}
+ // Write a stub benchmark.json before dispatching tests, carrying the planned
+ // execution count so an interrupted run can still surface as resumable in
+ // Studio (results.length < planned_test_count) even when every recorded row
+ // has execution_status: ok. The end-of-run write preserves this value via
+ // readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
+ // Skip on resume — we want to preserve the *original* planned count.
+ if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
+ const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
+ await writeInitialBenchmarkArtifact(runDir, {
+ evalFile,
+ plannedTestCount: totalEvalCount,
+ experiment: normalizeExperimentName(options.experiment),
+ });
+ }
+
// Eval files run sequentially; within each file, --workers N test cases run in parallel.
// This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
// workspace races without any grouping complexity.
diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts
index e40c01c74..7f295bf05 100644
--- a/apps/cli/src/commands/results/eval-runner.ts
+++ b/apps/cli/src/commands/results/eval-runner.ts
@@ -412,6 +412,29 @@ export function registerEvalRoutes(
}
});
+ // ── Stop a running eval ────────────────────────────────────────────────
+ // SIGTERM the spawned CLI; the existing child.on('close') flips status to
+ // 'finished'/'failed'. The CLI's own signal handler walks its tracked
+ // grandchildren (claude/codex/pi/copilot subprocesses) and kills them
+ // before exiting.
+ app.delete('/api/eval/run/:id', (c) => {
+ if (readOnly) {
+ return c.json({ error: 'Studio is running in read-only mode' }, 403);
+ }
+ const id = c.req.param('id');
+ const run = activeRuns.get(id ?? '');
+ if (!run) return c.json({ error: 'Run not found' }, 404);
+ if (run.status === 'finished' || run.status === 'failed' || !run.process) {
+ return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
+ }
+ try {
+ run.process.kill('SIGTERM');
+ } catch (err) {
+ return c.json({ error: (err as Error).message }, 500);
+ }
+ return c.json({ stopped: true, status: run.status });
+ });
+
// ── Run status ─────────────────────────────────────────────────────────
app.get('/api/eval/status/:id', (c) => {
const id = c.req.param('id');
@@ -576,6 +599,24 @@ export function registerEvalRoutes(
}
});
+ app.delete('/api/benchmarks/:benchmarkId/eval/run/:id', (c) => {
+ if (readOnly) {
+ return c.json({ error: 'Studio is running in read-only mode' }, 403);
+ }
+ const id = c.req.param('id');
+ const run = activeRuns.get(id ?? '');
+ if (!run) return c.json({ error: 'Run not found' }, 404);
+ if (run.status === 'finished' || run.status === 'failed' || !run.process) {
+ return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
+ }
+ try {
+ run.process.kill('SIGTERM');
+ } catch (err) {
+ return c.json({ error: (err as Error).message }, 500);
+ }
+ return c.json({ stopped: true, status: run.status });
+ });
+
app.get('/api/benchmarks/:benchmarkId/eval/status/:id', (c) => {
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index f88bf212e..8468bdf88 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -347,8 +347,8 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) {
function deriveResumeMeta(
cwd: string,
manifestPath: string,
-): { run_dir?: string; suite_filter?: string } {
- const out: { run_dir?: string; suite_filter?: string } = {};
+): { run_dir?: string; suite_filter?: string; planned_test_count?: number } {
+ const out: { run_dir?: string; suite_filter?: string; planned_test_count?: number } = {};
const runDir = path.dirname(manifestPath);
const relative = path.relative(cwd, runDir);
// path.relative returns '..'-prefixed paths when runDir is outside cwd; keep
@@ -359,15 +359,19 @@ function deriveResumeMeta(
const benchmarkPath = path.join(runDir, 'benchmark.json');
if (existsSync(benchmarkPath)) {
const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
- metadata?: { eval_file?: string };
+ metadata?: { eval_file?: string; planned_test_count?: number };
};
const evalFile = parsed.metadata?.eval_file;
if (typeof evalFile === 'string' && evalFile.trim()) {
out.suite_filter = evalFile.trim();
}
+ const planned = parsed.metadata?.planned_test_count;
+ if (typeof planned === 'number' && Number.isFinite(planned) && planned > 0) {
+ out.planned_test_count = planned;
+ }
}
} catch {
- // benchmark.json missing / unreadable / malformed — leave suite_filter unset.
+ // benchmark.json missing / unreadable / malformed — leave fields unset.
}
return out;
}
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 0e69e495a..d1a1d8ae5 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -1003,6 +1003,50 @@ describe('serve app', () => {
});
});
+ // ── DELETE /api/eval/run/:id — stop a running eval ─────────────────────
+ //
+ // The DELETE endpoint validates routing/auth shape (404 unknown id, 403
+ // read-only). The happy-path SIGTERM behavior is covered by manual UAT
+ // because it requires a live subprocess that is reliably mid-run; unit
+ // tests that race a launch against a delete are flaky.
+
+ describe('DELETE /api/eval/run/:id (stop API)', () => {
+ function makeAppForStop(opts?: { readOnly?: boolean }) {
+ return createApp([], tempDir, undefined, undefined, {
+ studioDir,
+ readOnly: opts?.readOnly === true,
+ });
+ }
+
+ it('returns 404 for an unknown run id', async () => {
+ const app = makeAppForStop();
+ const res = await app.request('/api/eval/run/no-such-id', { method: 'DELETE' });
+ expect(res.status).toBe(404);
+ });
+
+ it('returns 403 in read-only mode', async () => {
+ const app = makeAppForStop({ readOnly: true });
+ const res = await app.request('/api/eval/run/anything', { method: 'DELETE' });
+ expect(res.status).toBe(403);
+ });
+
+ it('returns 404 for benchmark-scoped DELETE with unknown run id', async () => {
+ const app = makeAppForStop();
+ const res = await app.request('/api/benchmarks/some-id/eval/run/no-such-id', {
+ method: 'DELETE',
+ });
+ expect(res.status).toBe(404);
+ });
+
+ it('returns 403 in read-only mode for benchmark-scoped DELETE', async () => {
+ const app = makeAppForStop({ readOnly: true });
+ const res = await app.request('/api/benchmarks/some-id/eval/run/anything', {
+ method: 'DELETE',
+ });
+ expect(res.status).toBe(403);
+ });
+ });
+
// ── POST /api/eval/preview — argument shaping for resume flags ─────────
//
// /api/eval/preview is a lightweight endpoint that returns the CLI
diff --git a/apps/studio/src/components/ResumeRunActions.tsx b/apps/studio/src/components/ResumeRunActions.tsx
index d161c9d22..f50ebe5b5 100644
--- a/apps/studio/src/components/ResumeRunActions.tsx
+++ b/apps/studio/src/components/ResumeRunActions.tsx
@@ -35,6 +35,7 @@ export interface ResumeRunActionsProps {
target?: string;
benchmarkId?: string;
isReadOnly: boolean;
+ plannedTestCount?: number;
}
export function ResumeRunActions({
@@ -44,12 +45,13 @@ export function ResumeRunActions({
target,
benchmarkId,
isReadOnly,
+ plannedTestCount,
}: ResumeRunActionsProps) {
const navigate = useNavigate();
const [busy, setBusy] = useState(null);
  const [error, setError] = useState<string | null>(null);
- if (!shouldShowResumeActions(results, isReadOnly)) return null;
+ if (!shouldShowResumeActions(results, isReadOnly, plannedTestCount)) return null;
// Both actions need the run dir + the original eval file. Without those
// we can't target the existing run workspace, so we render the buttons
diff --git a/apps/studio/src/components/StopRunButton.tsx b/apps/studio/src/components/StopRunButton.tsx
new file mode 100644
index 000000000..39ce1433e
--- /dev/null
+++ b/apps/studio/src/components/StopRunButton.tsx
@@ -0,0 +1,62 @@
+/**
+ * StopRunButton — destructive button on /jobs/:runId that terminates a
+ * Studio-launched eval.
+ *
+ * Calls DELETE /api/eval/run/:id (or the benchmark-scoped variant).
+ * Optimistically flips the local label to "Stopping…" until the next
+ * poll of /api/eval/status/:id observes a terminal state — at which
+ * point the button hides via `shouldShowStopButton`.
+ *
+ * To extend with a "Force kill" affordance: thread a second handler
+ * through that issues the DELETE again (the CLI's signal handler
+ * interprets a second SIGTERM within the same process as a hard exit)
+ * and surfaces a confirmation prompt.
+ */
+
+import { useState } from 'react';
+
+import { stopEvalRun } from '~/lib/api';
+
+import { type RunStatus, shouldShowStopButton } from './stop-run-helpers';
+
+export interface StopRunButtonProps {
+ runId: string;
+ status: RunStatus | undefined;
+ isReadOnly: boolean;
+ benchmarkId?: string;
+}
+
+export function StopRunButton({ runId, status, isReadOnly, benchmarkId }: StopRunButtonProps) {
+ const [stopping, setStopping] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+
+ if (!shouldShowStopButton(status, isReadOnly)) return null;
+
+ async function onClick() {
+ setStopping(true);
+ setError(null);
+ try {
+ await stopEvalRun(runId, benchmarkId);
+ } catch (err) {
+ setError(err instanceof Error ? err.message : 'Failed to stop run');
+ setStopping(false);
+ }
+ // On success, leave `stopping=true`. The status poller will flip to
+ // a terminal state shortly, at which point the button unmounts.
+ }
+
+  return (
+    <div>
+      <button type="button" onClick={onClick} disabled={stopping}>
+        {stopping ? 'Stopping…' : 'Stop run'}
+      </button>
+      {error && <div role="alert">{error}</div>}
+    </div>
+  );
+}
diff --git a/apps/studio/src/components/resume-run-helpers.test.ts b/apps/studio/src/components/resume-run-helpers.test.ts
index 39e2d807c..9ceb71870 100644
--- a/apps/studio/src/components/resume-run-helpers.test.ts
+++ b/apps/studio/src/components/resume-run-helpers.test.ts
@@ -32,6 +32,23 @@ describe('shouldShowResumeActions', () => {
it('hides on empty results', () => {
expect(shouldShowResumeActions([], false)).toBe(false);
});
+
+ it('shows for an incomplete partial run with only ok rows when planned_test_count exceeds results', () => {
+ // Stop button / Ctrl+C scenario: 5 of 10 planned tests finished
+ // successfully before the run was killed. No execution errors, but
+ // still resumable.
+ const results = [ok('a'), ok('b'), ok('c'), ok('d'), ok('e')];
+ expect(shouldShowResumeActions(results, false, 10)).toBe(true);
+ });
+
+ it('hides when results match planned_test_count (complete passing run)', () => {
+ const results = [ok('a'), ok('b'), ok('c')];
+ expect(shouldShowResumeActions(results, false, 3)).toBe(false);
+ });
+
+ it('hides incomplete partial run in read-only mode', () => {
+ expect(shouldShowResumeActions([ok('a')], true, 5)).toBe(false);
+ });
});
describe('buildResumeRequestBody', () => {
diff --git a/apps/studio/src/components/resume-run-helpers.ts b/apps/studio/src/components/resume-run-helpers.ts
index fcbf805a0..b8e6451f5 100644
--- a/apps/studio/src/components/resume-run-helpers.ts
+++ b/apps/studio/src/components/resume-run-helpers.ts
@@ -21,14 +21,29 @@ export interface BuildResumeRequestParams {
}
/**
- * Whether the resume actions should be visible. The button only makes sense
- * when at least one row failed with an execution error and the user has
- * write access (read-only mode hides the entire control rather than
- * showing a disabled button — see issue acceptance criteria).
+ * Whether the resume actions should be visible. The button is shown when:
+ * 1. At least one recorded row has `execution_status: execution_error`, OR
+ * 2. The run is *incomplete* — fewer recorded rows than the originally
+ * planned execution count, even if every recorded row is `ok`.
+ *
+ * Case 2 covers Stop-button / Ctrl+C interruptions where the run produced
+ * only successful rows before being killed: there is no `execution_error`
+ * to anchor on, but the run is still resumable. `plannedTestCount` is
+ * persisted in `benchmark.json.metadata` at run start (see
+ * `writeInitialBenchmarkArtifact`).
+ *
+ * Hidden in read-only mode — the server also returns 403, but UI-level
+ * hiding avoids dead controls.
*/
-export function shouldShowResumeActions(results: EvalResult[], isReadOnly: boolean): boolean {
+export function shouldShowResumeActions(
+ results: EvalResult[],
+ isReadOnly: boolean,
+ plannedTestCount?: number,
+): boolean {
if (isReadOnly) return false;
- return results.some((r) => r.executionStatus === 'execution_error');
+ if (results.some((r) => r.executionStatus === 'execution_error')) return true;
+ if (plannedTestCount !== undefined && results.length < plannedTestCount) return true;
+ return false;
}
/**
diff --git a/apps/studio/src/components/stop-run-helpers.test.ts b/apps/studio/src/components/stop-run-helpers.test.ts
new file mode 100644
index 000000000..3bd407b1b
--- /dev/null
+++ b/apps/studio/src/components/stop-run-helpers.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from 'bun:test';
+
+import { isTerminalRunStatus, shouldShowStopButton } from './stop-run-helpers';
+
+describe('isTerminalRunStatus', () => {
+ it('treats finished and failed as terminal', () => {
+ expect(isTerminalRunStatus('finished')).toBe(true);
+ expect(isTerminalRunStatus('failed')).toBe(true);
+ });
+
+ it('treats live states and unknowns as non-terminal', () => {
+ expect(isTerminalRunStatus('starting')).toBe(false);
+ expect(isTerminalRunStatus('running')).toBe(false);
+ expect(isTerminalRunStatus(undefined)).toBe(false);
+ });
+});
+
+describe('shouldShowStopButton', () => {
+ it('shows while the run is live', () => {
+ expect(shouldShowStopButton('starting', false)).toBe(true);
+ expect(shouldShowStopButton('running', false)).toBe(true);
+ });
+
+ it('hides once the run reaches a terminal state', () => {
+ expect(shouldShowStopButton('finished', false)).toBe(false);
+ expect(shouldShowStopButton('failed', false)).toBe(false);
+ });
+
+ it('hides in read-only mode regardless of status', () => {
+ expect(shouldShowStopButton('running', true)).toBe(false);
+ });
+
+ it('hides when the status is undefined', () => {
+ expect(shouldShowStopButton(undefined, false)).toBe(false);
+ });
+});
diff --git a/apps/studio/src/components/stop-run-helpers.ts b/apps/studio/src/components/stop-run-helpers.ts
new file mode 100644
index 000000000..fcef0e56a
--- /dev/null
+++ b/apps/studio/src/components/stop-run-helpers.ts
@@ -0,0 +1,28 @@
+/**
+ * Pure helpers backing StopRunButton, isolated for unit testing.
+ *
+ * Intentionally side-effect-free so the visibility matrix is testable
+ * without rendering React.
+ *
+ * To extend: extend the union of statuses recognized as non-terminal as
+ * the server adds new lifecycle states. Today the server only emits
+ * starting / running / finished / failed; anything not in the terminal
+ * set is treated as live.
+ */
+
+export type RunStatus = 'starting' | 'running' | 'finished' | 'failed' | (string & {});
+
+export function isTerminalRunStatus(status: RunStatus | undefined): boolean {
+ return status === 'finished' || status === 'failed';
+}
+
+/**
+ * Whether the Stop button should be visible. Hidden when the run is
+ * terminal (no process to kill) and in read-only mode (the API also
+ * 403s, but UI-level hiding avoids dead controls).
+ */
+export function shouldShowStopButton(status: RunStatus | undefined, isReadOnly: boolean): boolean {
+ if (isReadOnly) return false;
+ if (!status) return false;
+ return !isTerminalRunStatus(status);
+}
diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts
index 1ea719e6e..1edde07f9 100644
--- a/apps/studio/src/lib/api.ts
+++ b/apps/studio/src/lib/api.ts
@@ -539,6 +539,21 @@ export async function launchEvalRun(
return res.json() as Promise;
}
+export async function stopEvalRun(
+ runId: string,
+ benchmarkId?: string,
+): Promise<{ stopped: boolean; reason?: string; status?: string }> {
+ const url = benchmarkId
+ ? `${benchmarkApiBase(benchmarkId)}/eval/run/${runId}`
+ : `/api/eval/run/${runId}`;
+ const res = await fetch(url, { method: 'DELETE' });
+ if (!res.ok) {
+ const err = await res.json().catch(() => ({ error: res.statusText }));
+ throw new Error((err as { error?: string }).error ?? `Failed: ${res.status}`);
+ }
+ return res.json() as Promise<{ stopped: boolean; reason?: string; status?: string }>;
+}
+
export function evalRunStatusOptions(runId: string | null) {
return queryOptions({
queryKey: ['eval-status', runId],
diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts
index 46f321d32..6ef4dc1b4 100644
--- a/apps/studio/src/lib/types.ts
+++ b/apps/studio/src/lib/types.ts
@@ -80,6 +80,8 @@ export interface RunDetailResponse {
run_dir?: string;
/** Eval file path the run was launched against, if recorded in benchmark.json. Local runs only. */
suite_filter?: string;
+ /** Total (test_id, target) executions originally planned for this run. Used to detect incomplete partial runs as resumable. Local runs only, populated when the run was launched after the planned-count metadata feature shipped. */
+ planned_test_count?: number;
}
export interface SuiteSummary {
diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx
index db0f28bd4..dfb41c244 100644
--- a/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx
+++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx
@@ -77,6 +77,7 @@ function BenchmarkRunDetailPage() {
target={target ?? undefined}
benchmarkId={benchmarkId}
isReadOnly={isReadOnly}
+ plannedTestCount={data?.planned_test_count}
/>
{!isReadOnly && (