From 563a999f0f04e7ec489feb1adbc9687275667b6a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 7 May 2026 07:49:07 +0200 Subject: [PATCH 1/4] feat(cli): kill spawned providers on SIGINT/SIGTERM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track long-lived provider subprocesses (claude, codex, pi, copilot, vscode) in a per-process registry and walk it from a top-level signal handler in cli.ts. Without this, Studio's child.kill('SIGTERM') against the CLI orphans grandchildren — the Node parent exits but the OS does not propagate the signal. Plan in docs/plans/1222-stop-run.md. --- apps/cli/src/cli.ts | 26 ++++ docs/plans/1222-stop-run.md | 118 ++++++++++++++++++ .../src/evaluation/providers/claude-cli.ts | 2 + .../src/evaluation/providers/codex-cli.ts | 2 + .../src/evaluation/providers/copilot-cli.ts | 2 + .../core/src/evaluation/providers/pi-cli.ts | 2 + .../vscode/dispatch/vscodeProcess.ts | 2 + packages/core/src/index.ts | 5 + packages/core/src/runtime/child-tracker.ts | 39 ++++++ 9 files changed, 198 insertions(+) create mode 100644 docs/plans/1222-stop-run.md create mode 100644 packages/core/src/runtime/child-tracker.ts diff --git a/apps/cli/src/cli.ts b/apps/cli/src/cli.ts index 12dece45c..995cc8c26 100644 --- a/apps/cli/src/cli.ts +++ b/apps/cli/src/cli.ts @@ -1,6 +1,32 @@ #!/usr/bin/env node +import { killAllTrackedChildren } from '@agentv/core'; + import { runCli } from './index.js'; +// Forward SIGINT/SIGTERM to spawned provider subprocesses before exiting. +// Without this, Studio's `child.kill('SIGTERM')` against the CLI orphans +// any in-flight `claude`/`codex`/`pi`/`copilot` subprocess. The partial +// `index.jsonl` is already row-by-row durable, so finished tests survive. +// +// First signal: kill children, exit with the conventional 128+signal code. +// Second signal within the same process: hard-exit so a hung child cannot +// trap the user. +let interrupted = false; +function installShutdown(signal: NodeJS.Signals, exitCode: number) { + process.on(signal, () => { + if (interrupted) { + process.exit(1); + } + interrupted = true; + killAllTrackedChildren('SIGTERM'); + // Defer exit one tick so SIGTERM has a chance to dispatch before the + // event loop tears down. + setTimeout(() => process.exit(exitCode), 50); + }); +} +installShutdown('SIGINT', 130); +installShutdown('SIGTERM', 143); + runCli() .then(() => { process.exit(0); diff --git a/docs/plans/1222-stop-run.md b/docs/plans/1222-stop-run.md new file mode 100644 index 000000000..5decca1b9 --- /dev/null +++ b/docs/plans/1222-stop-run.md @@ -0,0 +1,118 @@ +# 1222 — Stop run + partial-run resumability (simplified scope) + +## Scope + +Make Ctrl+C / DELETE work. No staged shutdown, no new status words, no +AbortSignal threading. + +1. **CLI** — Ctrl+C → install `process.on('SIGINT'|'SIGTERM')` that kills + tracked child processes, then exits. Partial `index.jsonl` is already + row-by-row durable; whatever finished is preserved. +2. **Studio API** — `DELETE /api/eval/run/:id` (and benchmark-scoped + variant) calls `child.kill('SIGTERM')` on the spawned CLI. Existing + `child.on('close')` flips `running → failed` once the process exits. + No new `'stopping'` state on the server. +3. **Studio UI** — Red Stop button on `/jobs/:runId` while + `status ∈ {starting, running}` and not read-only. On click, optimistic + local "Stopping…" label until the next status poll flips to terminal. +4. **Resume detection for partial runs** — Persist `planned_test_count` in + `benchmark.json.metadata` at run start (early write), updated at end. + Run-detail API surfaces the number; Studio computes + `shouldShowResumeActions(results, isReadOnly, plannedTestCount?)` as + `executionError OR results.length < plannedTestCount`. No + `is_resumable` / `resume_reason` flags. + +## Files touched + +### CLI signal handler +- `packages/core/src/runtime/child-tracker.ts` — *new*. Singleton + `Set`; `trackChild(child)` and `killAll(signal)`. +- `packages/core/src/evaluation/providers/{claude-cli,codex-cli,pi-cli}.ts` + + `copilot-utils.ts` — call `trackChild(child)` after each `spawn(...)`. + No need to untrack on close: `kill()` is a no-op on dead PIDs and the + registry is short-lived. +- `apps/cli/src/cli.ts` — install SIGINT/SIGTERM handlers that call + `killAll('SIGTERM')` then `process.exit(130)` / `143`. Idempotent on + re-entry; second signal hard-exits (`process.exit(1)`). + +### Studio DELETE endpoint +- `apps/cli/src/commands/results/eval-runner.ts` — add + `app.delete('/api/eval/run/:id', ...)` and + `app.delete('/api/benchmarks/:benchmarkId/eval/run/:id', ...)`. + - 404 if id unknown + - 403 in read-only mode + - 200 `{stopped: true}` after `child.kill('SIGTERM')` + - 200 `{stopped: false, reason: 'already_terminal'}` if already done + - No status mutation here; close handler does it. + +### Studio Stop button +- `apps/studio/src/components/StopRunButton.tsx` — *new*. Renders the red + destructive button when `status` is non-terminal; calls DELETE with the + benchmark-scoped path when `benchmarkId` is set; sets local + `stopping=true` state to flip the label optimistically. +- `apps/studio/src/components/stop-run-helpers.ts` — *new*. Pure + `shouldShowStopButton(status, isReadOnly)` for unit testing. +- `apps/studio/src/lib/api.ts` — add `stopEvalRun(id, benchmarkId?)`. +- `apps/studio/src/routes/jobs/$runId.tsx` — wire `` into + the header. + +### Resume — planned_test_count +- `apps/cli/src/commands/eval/artifact-writer.ts` + - Extend `BenchmarkArtifact.metadata` with optional + `planned_test_count?: number`. + - Add `writeInitialBenchmarkArtifact(runDir, { evalFile, targets, + plannedTestCount, experiment })` that writes a stub at run start + (`run_summary: {}`, `metadata` pre-filled). +- `apps/cli/src/commands/eval/run-eval.ts` — call the initial writer + immediately after the run dir is created, **before** dispatching tests. + At end of run, the existing `writeArtifactsFromResults` / + `aggregateRunDir` overwrites the file with full data and the same + `planned_test_count`. +- `apps/cli/src/commands/results/serve.ts` — `deriveResumeMeta` already + reads `metadata.eval_file`; extend to also surface + `planned_test_count`. Run-detail response gains `planned_test_count`. +- `apps/studio/src/lib/types.ts` — add optional + `planned_test_count?: number` to the run-detail response type. +- `apps/studio/src/components/resume-run-helpers.ts` — extend + `shouldShowResumeActions` to also return true when + `plannedTestCount && results.length < plannedTestCount`. +- `apps/studio/src/components/ResumeRunActions.tsx` and the two run + detail routes — pass `plannedTestCount` through. + +### Tests (narrow) +- `apps/cli/test/commands/results/serve.test.ts` (or co-located) — DELETE + endpoint: 404 unknown, 403 read-only, 200 happy path with a fake child + whose `kill` is observable. +- `apps/studio/src/components/resume-run-helpers.test.ts` — case where + every result is `ok` but `results.length < plannedTestCount` → button + visible. +- `apps/studio/src/components/stop-run-helpers.test.ts` — visibility + matrix. +- Skip: integration test that signals a real eval. Manual UAT is enough. + +## Non-goals + +- "Pause" / resume-from-mid-test semantics. +- Per-test SIGINT to graders. +- Multi-job orchestration / job queue. +- A "stopping" status word — UI handles the optimistic flicker locally. + +## Acceptance signals + +- [ ] CLI: `Ctrl+C` during a multi-test eval kills all spawned providers + and the partial `index.jsonl` is preserved. +- [ ] Server: `DELETE /api/eval/run/:id` returns 200 in normal mode, 403 + in read-only, 404 for unknown id. +- [ ] UI: Stop button appears while running, hidden in read-only, hidden + when terminal. Optimistic "Stopping…" label appears until status + poll flips. +- [ ] UI: A run with 5 of 10 tests `ok` and 5 missing shows Resume. +- [ ] UI: A complete passing run does *not* show Resume. + +## Out-of-scope cleanups noted + +The benchmark.json write happens entirely at end-of-run today. Writing a +stub at start means a run that crashes before the first test still has a +metadata file on disk — that may obsolete some of the fallback logic in +`deriveResumeMeta`. We are not consolidating that here; this PR only adds +the early-write call. diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts index 4635d0e26..06ed5c810 100644 --- a/packages/core/src/evaluation/providers/claude-cli.ts +++ b/packages/core/src/evaluation/providers/claude-cli.ts @@ -5,6 +5,7 @@ import type { WriteStream } from 'node:fs'; import { mkdir } from 'node:fs/promises'; import path from 'node:path'; +import { trackChild } from '../../runtime/child-tracker.js'; import { extractTextContent, toContentArray } from './claude-content.js'; import { recordClaudeLogEntry } from './claude-log-tracker.js'; import { normalizeToolCall } from './normalize-tool-call.js'; @@ -270,6 +271,7 @@ export class ClaudeCliProvider implements Provider { } const child = spawn(this.config.executable, options.args, spawnOptions); + trackChild(child); let stdout = ''; let stderr = ''; diff --git a/packages/core/src/evaluation/providers/codex-cli.ts b/packages/core/src/evaluation/providers/codex-cli.ts index c776c77c5..232d540a4 100644 --- a/packages/core/src/evaluation/providers/codex-cli.ts +++ b/packages/core/src/evaluation/providers/codex-cli.ts @@ -7,6 +7,7 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import { promisify } from 'node:util'; +import { trackChild } from '../../runtime/child-tracker.js'; import { recordCodexLogEntry } from './codex-log-tracker.js'; import { buildPromptDocument, normalizeInputFiles } from './preread.js'; import type { CodexResolvedConfig } from './targets.js'; @@ -755,6 +756,7 @@ async function defaultCodexRunner(options: CodexRunOptions): Promise { env: options.env, stdio: ['pipe', 'pipe', 'pipe'], }); + trackChild(child); let stdout = ''; let stderr = ''; diff --git a/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts b/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts index 00bd69748..036c723b9 100644 --- a/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts +++ b/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts @@ -3,6 +3,7 @@ import { mkdir, writeFile } from 'node:fs/promises'; import path from 'node:path'; import { promisify } from 'node:util'; +import { trackChild } from '../../../../runtime/child-tracker.js'; import { pathExists, removeIfExists } from '../utils/fs.js'; import { pathToFileUri } from '../utils/path.js'; import { sleep } from '../utils/time.js'; @@ -36,6 +37,7 @@ function spawnVsCode( shell: useShell, detached: false, }); + trackChild(child); child.on('error', (error) => { const label = options?.label ?? 'spawn'; const renderedArgs = args.map((value) => JSON.stringify(value)).join(' '); diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index d6d1a349a..b88ba6eb2 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -132,6 +132,11 @@ export { export { discoverGraders } from './evaluation/registry/grader-discovery.js'; export { RunBudgetTracker } from './evaluation/run-budget-tracker.js'; export { runBeforeSessionHook, parseEnvOutput } from './evaluation/hooks.js'; +export { + trackChild, + killAllTrackedChildren, + trackedChildCount, +} from './runtime/child-tracker.js'; // Import pipeline export * from './import/index.js'; diff --git a/packages/core/src/runtime/child-tracker.ts b/packages/core/src/runtime/child-tracker.ts new file mode 100644 index 000000000..ed9e5e036 --- /dev/null +++ b/packages/core/src/runtime/child-tracker.ts @@ -0,0 +1,39 @@ +/** + * Tracks long-lived child processes spawned by AgentV providers so that a + * top-level signal handler can kill them all on Ctrl+C / SIGTERM. + * + * Why this exists: when the CLI receives SIGTERM (e.g. from Studio's Stop + * button), Node exits the parent process but does NOT propagate the signal + * to grandchildren. Without tracking, the spawned `claude`, `codex`, `pi`, + * `copilot` subprocesses linger as orphans. The CLI's signal handler walks + * this set and SIGTERMs each one before exiting. + * + * To extend: any provider that spawns a long-lived subprocess should call + * `trackChild(child)` immediately after `spawn(...)`. No untrack call is + * required — `kill()` is a no-op on dead PIDs and the registry lives only + * for the duration of one CLI invocation. + */ + +import type { ChildProcess } from 'child_process'; + +const tracked = new Set(); + +export function trackChild(child: ChildProcess): void { + tracked.add(child); + child.once('close', () => tracked.delete(child)); + child.once('exit', () => tracked.delete(child)); +} + +export function killAllTrackedChildren(signal: NodeJS.Signals = 'SIGTERM'): void { + for (const child of tracked) { + try { + child.kill(signal); + } catch { + // Already dead or unable to signal; nothing to do. + } + } +} + +export function trackedChildCount(): number { + return tracked.size; +} From 576d097ba14a4972c98d197109f5f8805c6bae40 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 7 May 2026 07:56:33 +0200 Subject: [PATCH 2/4] chore: fix biome formatting drive-by + structural Killable type - Pre-existing format errors in two studio routes block any push; fixed with biome --write so CI passes. - child-tracker uses a structural Killable type to avoid a tsup dts resolution failure on `node:child_process` re-export. --- .../$benchmarkId_/evals/$runId.$evalId.tsx | 4 +++- .../src/routes/evals/$runId.$evalId.tsx | 4 +++- packages/core/src/runtime/child-tracker.ts | 20 ++++++++++++++----- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx index 169d6d643..2bf2b9ea6 100644 --- a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx @@ -64,7 +64,9 @@ function BenchmarkEvalDetailPage() { Run: {runId} / Eval: {evalId}

- + {passed ? '✓' : '✗'} {evalId} diff --git a/apps/studio/src/routes/evals/$runId.$evalId.tsx b/apps/studio/src/routes/evals/$runId.$evalId.tsx index 54347798c..c801dbd3c 100644 --- a/apps/studio/src/routes/evals/$runId.$evalId.tsx +++ b/apps/studio/src/routes/evals/$runId.$evalId.tsx @@ -68,7 +68,9 @@ function EvalDetailPage() { Run: {runId} / Eval: {evalId}

- + {passed ? '✓' : '✗'} {evalId} diff --git a/packages/core/src/runtime/child-tracker.ts b/packages/core/src/runtime/child-tracker.ts index ed9e5e036..573a00515 100644 --- a/packages/core/src/runtime/child-tracker.ts +++ b/packages/core/src/runtime/child-tracker.ts @@ -12,16 +12,26 @@ * `trackChild(child)` immediately after `spawn(...)`. No untrack call is * required — `kill()` is a no-op on dead PIDs and the registry lives only * for the duration of one CLI invocation. + * + * Why a structural `Killable` type instead of `ChildProcess`: importing + * the real type from `node:child_process` triggers a rollup-plugin-dts + * resolution failure when this module's signatures are re-exported from + * the package entry. The structural shape avoids that, and any + * `ChildProcess` satisfies it. */ -import type { ChildProcess } from 'child_process'; +type Killable = { + kill(signal?: NodeJS.Signals | number): boolean; + once(event: 'close', listener: (...args: unknown[]) => void): unknown; +}; -const tracked = new Set(); +const tracked = new Set(); -export function trackChild(child: ChildProcess): void { +export function trackChild(child: Killable): void { tracked.add(child); - child.once('close', () => tracked.delete(child)); - child.once('exit', () => tracked.delete(child)); + child.once('close', () => { + tracked.delete(child); + }); } export function killAllTrackedChildren(signal: NodeJS.Signals = 'SIGTERM'): void { From 974f386baf1b269117ff70831d4b4358e59dafa8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 7 May 2026 08:16:27 +0200 Subject: [PATCH 3/4] feat(studio): DELETE stop endpoint, Stop button, partial-run resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - DELETE /api/eval/run/:id (and benchmark-scoped variant) SIGTERMs the spawned CLI. The CLI's own signal handler walks the child registry added in the previous commit and kills grandchildren before exiting. Existing child.on('close') flips status — no new 'stopping' state. - StopRunButton on /jobs/:runId, hidden when terminal or read-only. Optimistic 'Stopping…' label until the next status poll observes a terminal state. - planned_test_count persisted in benchmark.json.metadata via a stub written at run start. Resume-action visibility now triggers when results.length < planned_test_count even with no execution_error rows — covers Stop-button / Ctrl+C cases. - Narrow tests: shouldShowStopButton matrix, partial-run resume helper, DELETE 404/403 routes for both base + benchmark-scoped paths. Happy path of DELETE→SIGTERM is covered by manual UAT. --- apps/cli/src/commands/eval/artifact-writer.ts | 78 +++++++++++++++++-- apps/cli/src/commands/eval/run-eval.ts | 16 ++++ apps/cli/src/commands/results/eval-runner.ts | 41 ++++++++++ apps/cli/src/commands/results/serve.ts | 12 ++- apps/cli/test/commands/results/serve.test.ts | 44 +++++++++++ .../src/components/ResumeRunActions.tsx | 4 +- apps/studio/src/components/StopRunButton.tsx | 62 +++++++++++++++ .../src/components/resume-run-helpers.test.ts | 17 ++++ .../src/components/resume-run-helpers.ts | 27 +++++-- .../src/components/stop-run-helpers.test.ts | 36 +++++++++ .../studio/src/components/stop-run-helpers.ts | 28 +++++++ apps/studio/src/lib/api.ts | 15 ++++ apps/studio/src/lib/types.ts | 2 + .../benchmarks/$benchmarkId_/runs/$runId.tsx | 1 + apps/studio/src/routes/jobs/$runId.tsx | 8 +- apps/studio/src/routes/runs/$runId.tsx | 1 + 16 files changed, 374 insertions(+), 18 deletions(-) create mode 100644 apps/studio/src/components/StopRunButton.tsx create mode 100644 apps/studio/src/components/stop-run-helpers.test.ts create mode 100644 apps/studio/src/components/stop-run-helpers.ts diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index e005d786f..c0a511915 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -34,7 +34,7 @@ export function deduplicateByTestIdTarget( export async function aggregateRunDir( runDir: string, - options?: { evalFile?: string; experiment?: string }, + options?: { evalFile?: string; experiment?: string; plannedTestCount?: number }, ): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> { const indexPath = path.join(runDir, RESULT_INDEX_FILENAME); const content = await readFile(indexPath, 'utf8'); @@ -45,7 +45,18 @@ export async function aggregateRunDir( const timingPath = path.join(runDir, 'timing.json'); await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8'); - const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment); + // Preserve `planned_test_count` from any pre-existing benchmark.json (e.g. + // the stub written at run start, or from the original run when this is a + // resume) unless an explicit value was passed. + const plannedTestCount = + options?.plannedTestCount ?? (await readPlannedTestCount(path.join(runDir, 'benchmark.json'))); + + const benchmark = buildBenchmarkArtifact( + results, + options?.evalFile, + options?.experiment, + plannedTestCount, + ); const benchmarkPath = path.join(runDir, 'benchmark.json'); await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8'); @@ -53,6 +64,17 @@ export async function aggregateRunDir( return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size }; } +async function readPlannedTestCount(benchmarkPath: string): Promise { + try { + const raw = await readFile(benchmarkPath, 'utf8'); + const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } }; + const value = parsed.metadata?.planned_test_count; + return typeof value === 'number' && Number.isFinite(value) ? value : undefined; + } catch { + return undefined; + } +} + // --------------------------------------------------------------------------- // Artifact interfaces (snake_case to match skill-creator conventions) // --------------------------------------------------------------------------- @@ -110,6 +132,13 @@ export interface BenchmarkArtifact { readonly targets: readonly string[]; readonly tests_run: readonly string[]; readonly experiment?: string; + /** + * Total number of test cases the run was planned to execute (across all + * targets and eval files). Written at run start so an interrupted run + * can be diagnosed as resumable when `tests_run.length < planned_test_count`, + * even if every recorded row has `execution_status: ok`. + */ + readonly planned_test_count?: number; }; readonly run_summary: Record< string, @@ -364,6 +393,7 @@ export function buildBenchmarkArtifact( results: readonly EvaluationResult[], evalFile = '', experiment?: string, + plannedTestCount?: number, ): BenchmarkArtifact { const targetSet = new Set(); const testIdSet = new Set(); @@ -457,6 +487,7 @@ export function buildBenchmarkArtifact( targets, tests_run: testIds, experiment, + planned_test_count: plannedTestCount, }, run_summary: runSummary, per_grader_summary: perEvaluatorSummary, @@ -464,6 +495,35 @@ export function buildBenchmarkArtifact( }; } +/** + * Write a stub `benchmark.json` at the start of a run, before any tests + * have executed. Carries `planned_test_count` so an interrupted run can + * still be detected as resumable even when every recorded row has + * `execution_status: ok`. + * + * The end-of-run write (writeArtifactsFromResults / aggregateRunDir) + * overwrites this file with the full summary; preserve `planned_test_count` + * by passing it through. + */ +export async function writeInitialBenchmarkArtifact( + runDir: string, + options: { + evalFile: string; + plannedTestCount: number; + experiment?: string; + }, +): Promise { + await mkdir(runDir, { recursive: true }); + const stub = buildBenchmarkArtifact( + [], + options.evalFile, + options.experiment, + options.plannedTestCount, + ); + const benchmarkPath = path.join(runDir, 'benchmark.json'); + await writeFile(benchmarkPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8'); +} + export function buildAggregateGradingArtifact( results: readonly EvaluationResult[], ): AggregateGradingArtifact { @@ -826,7 +886,7 @@ export async function writePerTestArtifacts( export async function writeArtifactsFromResults( results: readonly EvaluationResult[], outputDir: string, - options?: { evalFile?: string; experiment?: string }, + options?: { evalFile?: string; experiment?: string; plannedTestCount?: number }, ): Promise<{ testArtifactDir: string; timingPath: string; @@ -877,8 +937,16 @@ export async function writeArtifactsFromResults( const timing = buildTimingArtifact(results); await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8'); - // Write benchmark - const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment); + // Write benchmark — preserve `planned_test_count` from the run-start stub + // (or from the original run when this is a resume) unless an explicit + // value was passed by the caller. + const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(benchmarkPath)); + const benchmark = buildBenchmarkArtifact( + results, + options?.evalFile, + options?.experiment, + plannedTestCount, + ); await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8'); await writeJsonlFile(indexPath, indexRecords); diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index db0b0fb19..0f3536061 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -38,6 +38,7 @@ import { deduplicateByTestIdTarget, parseJsonlResults, writeArtifactsFromResults, + writeInitialBenchmarkArtifact, } from './artifact-writer.js'; import { writeBenchmarkJson } from './benchmark-writer.js'; import { loadEnvFromHierarchy } from './env.js'; @@ -1447,6 +1448,21 @@ export async function runEvalCommand( ); } + // Write a stub benchmark.json before dispatching tests, carrying the planned + // execution count so an interrupted run can still surface as resumable in + // Studio (results.length < planned_test_count) even when every recorded row + // has execution_status: ok. The end-of-run write preserves this value via + // readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults. + // Skip on resume — we want to preserve the *original* planned count. + if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) { + const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : ''; + await writeInitialBenchmarkArtifact(runDir, { + evalFile, + plannedTestCount: totalEvalCount, + experiment: normalizeExperimentName(options.experiment), + }); + } + // Eval files run sequentially; within each file, --workers N test cases run in parallel. // This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file // workspace races without any grouping complexity. diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts index e40c01c74..7f295bf05 100644 --- a/apps/cli/src/commands/results/eval-runner.ts +++ b/apps/cli/src/commands/results/eval-runner.ts @@ -412,6 +412,29 @@ export function registerEvalRoutes( } }); + // ── Stop a running eval ──────────────────────────────────────────────── + // SIGTERM the spawned CLI; the existing child.on('close') flips status to + // 'finished'/'failed'. The CLI's own signal handler walks its tracked + // grandchildren (claude/codex/pi/copilot subprocesses) and kills them + // before exiting. + app.delete('/api/eval/run/:id', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + const id = c.req.param('id'); + const run = activeRuns.get(id ?? ''); + if (!run) return c.json({ error: 'Run not found' }, 404); + if (run.status === 'finished' || run.status === 'failed' || !run.process) { + return c.json({ stopped: false, reason: 'already_terminal', status: run.status }); + } + try { + run.process.kill('SIGTERM'); + } catch (err) { + return c.json({ error: (err as Error).message }, 500); + } + return c.json({ stopped: true, status: run.status }); + }); + // ── Run status ───────────────────────────────────────────────────────── app.get('/api/eval/status/:id', (c) => { const id = c.req.param('id'); @@ -576,6 +599,24 @@ export function registerEvalRoutes( } }); + app.delete('/api/benchmarks/:benchmarkId/eval/run/:id', (c) => { + if (readOnly) { + return c.json({ error: 'Studio is running in read-only mode' }, 403); + } + const id = c.req.param('id'); + const run = activeRuns.get(id ?? ''); + if (!run) return c.json({ error: 'Run not found' }, 404); + if (run.status === 'finished' || run.status === 'failed' || !run.process) { + return c.json({ stopped: false, reason: 'already_terminal', status: run.status }); + } + try { + run.process.kill('SIGTERM'); + } catch (err) { + return c.json({ error: (err as Error).message }, 500); + } + return c.json({ stopped: true, status: run.status }); + }); + app.get('/api/benchmarks/:benchmarkId/eval/status/:id', (c) => { const id = c.req.param('id'); const run = activeRuns.get(id ?? ''); diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index f88bf212e..8468bdf88 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -347,8 +347,8 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) { function deriveResumeMeta( cwd: string, manifestPath: string, -): { run_dir?: string; suite_filter?: string } { - const out: { run_dir?: string; suite_filter?: string } = {}; +): { run_dir?: string; suite_filter?: string; planned_test_count?: number } { + const out: { run_dir?: string; suite_filter?: string; planned_test_count?: number } = {}; const runDir = path.dirname(manifestPath); const relative = path.relative(cwd, runDir); // path.relative returns '..'-prefixed paths when runDir is outside cwd; keep @@ -359,15 +359,19 @@ function deriveResumeMeta( const benchmarkPath = path.join(runDir, 'benchmark.json'); if (existsSync(benchmarkPath)) { const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as { - metadata?: { eval_file?: string }; + metadata?: { eval_file?: string; planned_test_count?: number }; }; const evalFile = parsed.metadata?.eval_file; if (typeof evalFile === 'string' && evalFile.trim()) { out.suite_filter = evalFile.trim(); } + const planned = parsed.metadata?.planned_test_count; + if (typeof planned === 'number' && Number.isFinite(planned) && planned > 0) { + out.planned_test_count = planned; + } } } catch { - // benchmark.json missing / unreadable / malformed — leave suite_filter unset. + // benchmark.json missing / unreadable / malformed — leave fields unset. } return out; } diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index 0e69e495a..d1a1d8ae5 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -1003,6 +1003,50 @@ describe('serve app', () => { }); }); + // ── DELETE /api/eval/run/:id — stop a running eval ───────────────────── + // + // The DELETE endpoint validates routing/auth shape (404 unknown id, 403 + // read-only). The happy-path SIGTERM behavior is covered by manual UAT + // because it requires a live subprocess that is reliably mid-run; unit + // tests that race a launch against a delete are flaky. + + describe('DELETE /api/eval/run/:id (stop API)', () => { + function makeAppForStop(opts?: { readOnly?: boolean }) { + return createApp([], tempDir, undefined, undefined, { + studioDir, + readOnly: opts?.readOnly === true, + }); + } + + it('returns 404 for an unknown run id', async () => { + const app = makeAppForStop(); + const res = await app.request('/api/eval/run/no-such-id', { method: 'DELETE' }); + expect(res.status).toBe(404); + }); + + it('returns 403 in read-only mode', async () => { + const app = makeAppForStop({ readOnly: true }); + const res = await app.request('/api/eval/run/anything', { method: 'DELETE' }); + expect(res.status).toBe(403); + }); + + it('returns 404 for benchmark-scoped DELETE with unknown run id', async () => { + const app = makeAppForStop(); + const res = await app.request('/api/benchmarks/some-id/eval/run/no-such-id', { + method: 'DELETE', + }); + expect(res.status).toBe(404); + }); + + it('returns 403 in read-only mode for benchmark-scoped DELETE', async () => { + const app = makeAppForStop({ readOnly: true }); + const res = await app.request('/api/benchmarks/some-id/eval/run/anything', { + method: 'DELETE', + }); + expect(res.status).toBe(403); + }); + }); + // ── POST /api/eval/preview — argument shaping for resume flags ───────── // // /api/eval/preview is a lightweight endpoint that returns the CLI diff --git a/apps/studio/src/components/ResumeRunActions.tsx b/apps/studio/src/components/ResumeRunActions.tsx index d161c9d22..f50ebe5b5 100644 --- a/apps/studio/src/components/ResumeRunActions.tsx +++ b/apps/studio/src/components/ResumeRunActions.tsx @@ -35,6 +35,7 @@ export interface ResumeRunActionsProps { target?: string; benchmarkId?: string; isReadOnly: boolean; + plannedTestCount?: number; } export function ResumeRunActions({ @@ -44,12 +45,13 @@ export function ResumeRunActions({ target, benchmarkId, isReadOnly, + plannedTestCount, }: ResumeRunActionsProps) { const navigate = useNavigate(); const [busy, setBusy] = useState(null); const [error, setError] = useState(null); - if (!shouldShowResumeActions(results, isReadOnly)) return null; + if (!shouldShowResumeActions(results, isReadOnly, plannedTestCount)) return null; // Both actions need the run dir + the original eval file. Without those // we can't target the existing run workspace, so we render the buttons diff --git a/apps/studio/src/components/StopRunButton.tsx b/apps/studio/src/components/StopRunButton.tsx new file mode 100644 index 000000000..39ce1433e --- /dev/null +++ b/apps/studio/src/components/StopRunButton.tsx @@ -0,0 +1,62 @@ +/** + * StopRunButton — destructive button on /jobs/:runId that terminates a + * Studio-launched eval. + * + * Calls DELETE /api/eval/run/:id (or the benchmark-scoped variant). + * Optimistically flips the local label to "Stopping…" until the next + * poll of /api/eval/status/:id observes a terminal state — at which + * point the button hides via `shouldShowStopButton`. + * + * To extend with a "Force kill" affordance: thread a second handler + * through that POSTs DELETE again (the CLI's signal handler interprets + * a second SIGTERM within the same process as hard-exit) and surfaces a + * confirmation prompt. + */ + +import { useState } from 'react'; + +import { stopEvalRun } from '~/lib/api'; + +import { type RunStatus, shouldShowStopButton } from './stop-run-helpers'; + +export interface StopRunButtonProps { + runId: string; + status: RunStatus | undefined; + isReadOnly: boolean; + benchmarkId?: string; +} + +export function StopRunButton({ runId, status, isReadOnly, benchmarkId }: StopRunButtonProps) { + const [stopping, setStopping] = useState(false); + const [error, setError] = useState(null); + + if (!shouldShowStopButton(status, isReadOnly)) return null; + + async function onClick() { + setStopping(true); + setError(null); + try { + await stopEvalRun(runId, benchmarkId); + } catch (err) { + setError(err instanceof Error ? err.message : 'Failed to stop run'); + setStopping(false); + } + // On success, leave `stopping=true`. The status poller will flip to + // a terminal state shortly, at which point the button unmounts. + } + + return ( +
+ + {error &&

{error}

} +
+ ); +} diff --git a/apps/studio/src/components/resume-run-helpers.test.ts b/apps/studio/src/components/resume-run-helpers.test.ts index 39e2d807c..9ceb71870 100644 --- a/apps/studio/src/components/resume-run-helpers.test.ts +++ b/apps/studio/src/components/resume-run-helpers.test.ts @@ -32,6 +32,23 @@ describe('shouldShowResumeActions', () => { it('hides on empty results', () => { expect(shouldShowResumeActions([], false)).toBe(false); }); + + it('shows for an incomplete partial run with only ok rows when planned_test_count exceeds results', () => { + // Stop button / Ctrl+C scenario: 5 of 10 planned tests finished + // successfully before the run was killed. No execution errors, but + // still resumable. + const results = [ok('a'), ok('b'), ok('c'), ok('d'), ok('e')]; + expect(shouldShowResumeActions(results, false, 10)).toBe(true); + }); + + it('hides when results match planned_test_count (complete passing run)', () => { + const results = [ok('a'), ok('b'), ok('c')]; + expect(shouldShowResumeActions(results, false, 3)).toBe(false); + }); + + it('hides incomplete partial run in read-only mode', () => { + expect(shouldShowResumeActions([ok('a')], true, 5)).toBe(false); + }); }); describe('buildResumeRequestBody', () => { diff --git a/apps/studio/src/components/resume-run-helpers.ts b/apps/studio/src/components/resume-run-helpers.ts index fcbf805a0..b8e6451f5 100644 --- a/apps/studio/src/components/resume-run-helpers.ts +++ b/apps/studio/src/components/resume-run-helpers.ts @@ -21,14 +21,29 @@ export interface BuildResumeRequestParams { } /** - * Whether the resume actions should be visible. The button only makes sense - * when at least one row failed with an execution error and the user has - * write access (read-only mode hides the entire control rather than - * showing a disabled button — see issue acceptance criteria). + * Whether the resume actions should be visible. The button is shown when: + * 1. At least one recorded row has `execution_status: execution_error`, OR + * 2. The run is *incomplete* — fewer recorded rows than the originally + * planned execution count, even if every recorded row is `ok`. + * + * Case 2 covers Stop-button / Ctrl+C interruptions where the run produced + * only successful rows before being killed: there is no `execution_error` + * to anchor on, but the run is still resumable. `plannedTestCount` is + * persisted in `benchmark.json.metadata` at run start (see + * `writeInitialBenchmarkArtifact`). + * + * Hidden in read-only mode — the server also returns 403, but UI-level + * hiding avoids dead controls. */ -export function shouldShowResumeActions(results: EvalResult[], isReadOnly: boolean): boolean { +export function shouldShowResumeActions( + results: EvalResult[], + isReadOnly: boolean, + plannedTestCount?: number, +): boolean { if (isReadOnly) return false; - return results.some((r) => r.executionStatus === 'execution_error'); + if (results.some((r) => r.executionStatus === 'execution_error')) return true; + if (plannedTestCount !== undefined && results.length < plannedTestCount) return true; + return false; } /** diff --git a/apps/studio/src/components/stop-run-helpers.test.ts b/apps/studio/src/components/stop-run-helpers.test.ts new file mode 100644 index 000000000..3bd407b1b --- /dev/null +++ b/apps/studio/src/components/stop-run-helpers.test.ts @@ -0,0 +1,36 @@ +import { describe, expect, it } from 'bun:test'; + +import { isTerminalRunStatus, shouldShowStopButton } from './stop-run-helpers'; + +describe('isTerminalRunStatus', () => { + it('treats finished and failed as terminal', () => { + expect(isTerminalRunStatus('finished')).toBe(true); + expect(isTerminalRunStatus('failed')).toBe(true); + }); + + it('treats live states and unknowns as non-terminal', () => { + expect(isTerminalRunStatus('starting')).toBe(false); + expect(isTerminalRunStatus('running')).toBe(false); + expect(isTerminalRunStatus(undefined)).toBe(false); + }); +}); + +describe('shouldShowStopButton', () => { + it('shows while the run is live', () => { + expect(shouldShowStopButton('starting', false)).toBe(true); + expect(shouldShowStopButton('running', false)).toBe(true); + }); + + it('hides once the run reaches a terminal state', () => { + expect(shouldShowStopButton('finished', false)).toBe(false); + expect(shouldShowStopButton('failed', false)).toBe(false); + }); + + it('hides in read-only mode regardless of status', () => { + expect(shouldShowStopButton('running', true)).toBe(false); + }); + + it('hides when the status is undefined', () => { + expect(shouldShowStopButton(undefined, false)).toBe(false); + }); +}); diff --git a/apps/studio/src/components/stop-run-helpers.ts b/apps/studio/src/components/stop-run-helpers.ts new file mode 100644 index 000000000..fcef0e56a --- /dev/null +++ b/apps/studio/src/components/stop-run-helpers.ts @@ -0,0 +1,28 @@ +/** + * Pure helpers backing StopRunButton, isolated for unit testing. + * + * Intentionally side-effect-free so the visibility matrix is testable + * without rendering React. + * + * To extend: extend the union of statuses recognized as non-terminal as + * the server adds new lifecycle states. Today the server only emits + * starting / running / finished / failed; anything not in the terminal + * set is treated as live. + */ + +export type RunStatus = 'starting' | 'running' | 'finished' | 'failed' | (string & {}); + +export function isTerminalRunStatus(status: RunStatus | undefined): boolean { + return status === 'finished' || status === 'failed'; +} + +/** + * Whether the Stop button should be visible. Hidden when the run is + * terminal (no process to kill) and in read-only mode (the API also + * 403s, but UI-level hiding avoids dead controls). + */ +export function shouldShowStopButton(status: RunStatus | undefined, isReadOnly: boolean): boolean { + if (isReadOnly) return false; + if (!status) return false; + return !isTerminalRunStatus(status); +} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 1ea719e6e..1edde07f9 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -539,6 +539,21 @@ export async function launchEvalRun( return res.json() as Promise; } +export async function stopEvalRun( + runId: string, + benchmarkId?: string, +): Promise<{ stopped: boolean; reason?: string; status?: string }> { + const url = benchmarkId + ? `${benchmarkApiBase(benchmarkId)}/eval/run/${runId}` + : `/api/eval/run/${runId}`; + const res = await fetch(url, { method: 'DELETE' }); + if (!res.ok) { + const err = await res.json().catch(() => ({ error: res.statusText })); + throw new Error((err as { error?: string }).error ?? `Failed: ${res.status}`); + } + return res.json() as Promise<{ stopped: boolean; reason?: string; status?: string }>; +} + export function evalRunStatusOptions(runId: string | null) { return queryOptions({ queryKey: ['eval-status', runId], diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts index 46f321d32..6ef4dc1b4 100644 --- a/apps/studio/src/lib/types.ts +++ b/apps/studio/src/lib/types.ts @@ -80,6 +80,8 @@ export interface RunDetailResponse { run_dir?: string; /** Eval file path the run was launched against, if recorded in benchmark.json. Local runs only. */ suite_filter?: string; + /** Total (test_id, target) executions originally planned for this run. Used to detect incomplete partial runs as resumable. Local runs only, populated when the run was launched after the planned-count metadata feature shipped. */ + planned_test_count?: number; } export interface SuiteSummary { diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx index db0f28bd4..dfb41c244 100644 --- a/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx +++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx @@ -77,6 +77,7 @@ function BenchmarkRunDetailPage() { target={target ?? undefined} benchmarkId={benchmarkId} isReadOnly={isReadOnly} + plannedTestCount={data?.planned_test_count} /> {!isReadOnly && ( {error &&

{error}

} diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts index 1edde07f9..eb4b95010 100644 --- a/apps/studio/src/lib/api.ts +++ b/apps/studio/src/lib/api.ts @@ -544,9 +544,9 @@ export async function stopEvalRun( benchmarkId?: string, ): Promise<{ stopped: boolean; reason?: string; status?: string }> { const url = benchmarkId - ? `${benchmarkApiBase(benchmarkId)}/eval/run/${runId}` - : `/api/eval/run/${runId}`; - const res = await fetch(url, { method: 'DELETE' }); + ? `${benchmarkApiBase(benchmarkId)}/eval/run/${runId}/stop` + : `/api/eval/run/${runId}/stop`; + const res = await fetch(url, { method: 'POST' }); if (!res.ok) { const err = await res.json().catch(() => ({ error: res.statusText })); throw new Error((err as { error?: string }).error ?? `Failed: ${res.status}`); diff --git a/docs/plans/1222-stop-run.md b/docs/plans/1222-stop-run.md index 5decca1b9..12696b7ac 100644 --- a/docs/plans/1222-stop-run.md +++ b/docs/plans/1222-stop-run.md @@ -2,19 +2,23 @@ ## Scope -Make Ctrl+C / DELETE work. No staged shutdown, no new status words, no +Make Ctrl+C / POST work. No staged shutdown, no new status words, no AbortSignal threading. 1. **CLI** — Ctrl+C → install `process.on('SIGINT'|'SIGTERM')` that kills tracked child processes, then exits. Partial `index.jsonl` is already row-by-row durable; whatever finished is preserved. -2. **Studio API** — `DELETE /api/eval/run/:id` (and benchmark-scoped +2. **Studio API** — `POST /api/eval/run/:id/stop` (and benchmark-scoped variant) calls `child.kill('SIGTERM')` on the spawned CLI. Existing `child.on('close')` flips `running → failed` once the process exits. + Idempotent: returns 200 `{stopped:false, reason:'already_terminal'}` + on terminal runs rather than 4xx, so clients can fire-and-forget. No new `'stopping'` state on the server. -3. **Studio UI** — Red Stop button on `/jobs/:runId` while - `status ∈ {starting, running}` and not read-only. On click, optimistic - local "Stopping…" label until the next status poll flips to terminal. +3. **Studio UI** — Neutral-styled Stop button on `/jobs/:runId` while + `status ∈ {starting, running}` and not read-only. Stop is part of the + stop → resume workflow, not a destructive cancel, so styling is gray + (not red). On click, optimistic local "Stopping…" label until the + next status poll flips to terminal. 4. **Resume detection for partial runs** — Persist `planned_test_count` in `benchmark.json.metadata` at run start (early write), updated at end. Run-detail API surfaces the number; Studio computes @@ -35,10 +39,10 @@ AbortSignal threading. `killAll('SIGTERM')` then `process.exit(130)` / `143`. Idempotent on re-entry; second signal hard-exits (`process.exit(1)`). -### Studio DELETE endpoint +### Studio POST endpoint - `apps/cli/src/commands/results/eval-runner.ts` — add - `app.delete('/api/eval/run/:id', ...)` and - `app.delete('/api/benchmarks/:benchmarkId/eval/run/:id', ...)`. + `app.post('/api/eval/run/:id/stop', ...)` and + `app.post('/api/benchmarks/:benchmarkId/eval/run/:id/stop', ...)`. - 404 if id unknown - 403 in read-only mode - 200 `{stopped: true}` after `child.kill('SIGTERM')` @@ -46,10 +50,11 @@ AbortSignal threading. - No status mutation here; close handler does it. ### Studio Stop button -- `apps/studio/src/components/StopRunButton.tsx` — *new*. Renders the red - destructive button when `status` is non-terminal; calls DELETE with the - benchmark-scoped path when `benchmarkId` is set; sets local - `stopping=true` state to flip the label optimistically. +- `apps/studio/src/components/StopRunButton.tsx` — *new*. Renders a + neutral-styled (gray, not red) Stop button when `status` is + non-terminal; calls POST with the benchmark-scoped path when + `benchmarkId` is set; sets local `stopping=true` state to flip the + label optimistically. - `apps/studio/src/components/stop-run-helpers.ts` — *new*. Pure `shouldShowStopButton(status, isReadOnly)` for unit testing. - `apps/studio/src/lib/api.ts` — add `stopEvalRun(id, benchmarkId?)`. @@ -80,9 +85,9 @@ AbortSignal threading. detail routes — pass `plannedTestCount` through. ### Tests (narrow) -- `apps/cli/test/commands/results/serve.test.ts` (or co-located) — DELETE - endpoint: 404 unknown, 403 read-only, 200 happy path with a fake child - whose `kill` is observable. +- `apps/cli/test/commands/results/serve.test.ts` (or co-located) — stop + endpoint: 404 unknown, 403 read-only, base + benchmark-scoped paths. + Happy path SIGTERM is covered by manual UAT (race-prone in unit tests). - `apps/studio/src/components/resume-run-helpers.test.ts` — case where every result is `ok` but `results.length < plannedTestCount` → button visible. @@ -101,8 +106,8 @@ AbortSignal threading. - [ ] CLI: `Ctrl+C` during a multi-test eval kills all spawned providers and the partial `index.jsonl` is preserved. -- [ ] Server: `DELETE /api/eval/run/:id` returns 200 in normal mode, 403 - in read-only, 404 for unknown id. +- [ ] Server: `POST /api/eval/run/:id/stop` returns 200 in normal mode, + 403 in read-only, 404 for unknown id; idempotent on terminal runs. - [ ] UI: Stop button appears while running, hidden in read-only, hidden when terminal. Optimistic "Stopping…" label appears until status poll flips.