From 563a999f0f04e7ec489feb1adbc9687275667b6a Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Thu, 7 May 2026 07:49:07 +0200
Subject: [PATCH 1/4] feat(cli): kill spawned providers on SIGINT/SIGTERM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Track long-lived provider subprocesses (claude, codex, pi, copilot,
vscode) in a per-process registry and walk it from a top-level signal
handler in cli.ts. Without this, Studio's child.kill('SIGTERM') against
the CLI orphans grandchildren — the Node parent exits but the OS does
not propagate the signal.
Plan in docs/plans/1222-stop-run.md.
---
apps/cli/src/cli.ts | 26 ++++
docs/plans/1222-stop-run.md | 118 ++++++++++++++++++
.../src/evaluation/providers/claude-cli.ts | 2 +
.../src/evaluation/providers/codex-cli.ts | 2 +
.../src/evaluation/providers/copilot-cli.ts | 2 +
.../core/src/evaluation/providers/pi-cli.ts | 2 +
.../vscode/dispatch/vscodeProcess.ts | 2 +
packages/core/src/index.ts | 5 +
packages/core/src/runtime/child-tracker.ts | 39 ++++++
9 files changed, 198 insertions(+)
create mode 100644 docs/plans/1222-stop-run.md
create mode 100644 packages/core/src/runtime/child-tracker.ts
diff --git a/apps/cli/src/cli.ts b/apps/cli/src/cli.ts
index 12dece45c..995cc8c26 100644
--- a/apps/cli/src/cli.ts
+++ b/apps/cli/src/cli.ts
@@ -1,6 +1,32 @@
#!/usr/bin/env node
+import { killAllTrackedChildren } from '@agentv/core';
+
import { runCli } from './index.js';
+// Forward SIGINT/SIGTERM to spawned provider subprocesses before exiting.
+// Without this, Studio's `child.kill('SIGTERM')` against the CLI orphans
+// any in-flight `claude`/`codex`/`pi`/`copilot` subprocess. The partial
+// `index.jsonl` is already row-by-row durable, so finished tests survive.
+//
+// First signal: kill children, exit with the conventional 128+signal code.
+// Second signal within the same process: hard-exit so a hung child cannot
+// trap the user.
+let interrupted = false;
+function installShutdown(signal: NodeJS.Signals, exitCode: number) {
+ process.on(signal, () => {
+ if (interrupted) {
+ process.exit(1);
+ }
+ interrupted = true;
+ killAllTrackedChildren('SIGTERM');
+ // Defer exit one tick so SIGTERM has a chance to dispatch before the
+ // event loop tears down.
+ setTimeout(() => process.exit(exitCode), 50);
+ });
+}
+installShutdown('SIGINT', 130);
+installShutdown('SIGTERM', 143);
+
runCli()
.then(() => {
process.exit(0);
diff --git a/docs/plans/1222-stop-run.md b/docs/plans/1222-stop-run.md
new file mode 100644
index 000000000..5decca1b9
--- /dev/null
+++ b/docs/plans/1222-stop-run.md
@@ -0,0 +1,118 @@
+# 1222 — Stop run + partial-run resumability (simplified scope)
+
+## Scope
+
+Make Ctrl+C / DELETE work. No staged shutdown, no new status words, no
+AbortSignal threading.
+
+1. **CLI** — Ctrl+C → install `process.on('SIGINT'|'SIGTERM')` that kills
+ tracked child processes, then exits. Partial `index.jsonl` is already
+ row-by-row durable; whatever finished is preserved.
+2. **Studio API** — `DELETE /api/eval/run/:id` (and benchmark-scoped
+ variant) calls `child.kill('SIGTERM')` on the spawned CLI. Existing
+ `child.on('close')` flips `running → failed` once the process exits.
+ No new `'stopping'` state on the server.
+3. **Studio UI** — Red Stop button on `/jobs/:runId` while
+ `status ∈ {starting, running}` and not read-only. On click, optimistic
+ local "Stopping…" label until the next status poll flips to terminal.
+4. **Resume detection for partial runs** — Persist `planned_test_count` in
+ `benchmark.json.metadata` at run start (early write), updated at end.
+ Run-detail API surfaces the number; Studio computes
+ `shouldShowResumeActions(results, isReadOnly, plannedTestCount?)` as
+ `executionError OR results.length < plannedTestCount`. No
+ `is_resumable` / `resume_reason` flags.
+
+## Files touched
+
+### CLI signal handler
+- `packages/core/src/runtime/child-tracker.ts` — *new*. Singleton
+ `Set`; `trackChild(child)` and `killAll(signal)`.
+- `packages/core/src/evaluation/providers/{claude-cli,codex-cli,pi-cli}.ts`
+  + `copilot-cli.ts` — call `trackChild(child)` after each `spawn(...)`.
+ No need to untrack on close: `kill()` is a no-op on dead PIDs and the
+ registry is short-lived.
+- `apps/cli/src/cli.ts` — install SIGINT/SIGTERM handlers that call
+ `killAll('SIGTERM')` then `process.exit(130)` / `143`. Idempotent on
+ re-entry; second signal hard-exits (`process.exit(1)`).
+
+### Studio DELETE endpoint
+- `apps/cli/src/commands/results/eval-runner.ts` — add
+ `app.delete('/api/eval/run/:id', ...)` and
+ `app.delete('/api/benchmarks/:benchmarkId/eval/run/:id', ...)`.
+ - 404 if id unknown
+ - 403 in read-only mode
+ - 200 `{stopped: true}` after `child.kill('SIGTERM')`
+ - 200 `{stopped: false, reason: 'already_terminal'}` if already done
+ - No status mutation here; close handler does it.
+
+### Studio Stop button
+- `apps/studio/src/components/StopRunButton.tsx` — *new*. Renders the red
+ destructive button when `status` is non-terminal; calls DELETE with the
+ benchmark-scoped path when `benchmarkId` is set; sets local
+ `stopping=true` state to flip the label optimistically.
+- `apps/studio/src/components/stop-run-helpers.ts` — *new*. Pure
+ `shouldShowStopButton(status, isReadOnly)` for unit testing.
+- `apps/studio/src/lib/api.ts` — add `stopEvalRun(id, benchmarkId?)`.
+- `apps/studio/src/routes/jobs/$runId.tsx` — wire `<StopRunButton />` into
+ the header.
+
+### Resume — planned_test_count
+- `apps/cli/src/commands/eval/artifact-writer.ts`
+ - Extend `BenchmarkArtifact.metadata` with optional
+ `planned_test_count?: number`.
+ - Add `writeInitialBenchmarkArtifact(runDir, { evalFile, targets,
+ plannedTestCount, experiment })` that writes a stub at run start
+ (`run_summary: {}`, `metadata` pre-filled).
+- `apps/cli/src/commands/eval/run-eval.ts` — call the initial writer
+ immediately after the run dir is created, **before** dispatching tests.
+ At end of run, the existing `writeArtifactsFromResults` /
+ `aggregateRunDir` overwrites the file with full data and the same
+ `planned_test_count`.
+- `apps/cli/src/commands/results/serve.ts` — `deriveResumeMeta` already
+ reads `metadata.eval_file`; extend to also surface
+ `planned_test_count`. Run-detail response gains `planned_test_count`.
+- `apps/studio/src/lib/types.ts` — add optional
+ `planned_test_count?: number` to the run-detail response type.
+- `apps/studio/src/components/resume-run-helpers.ts` — extend
+ `shouldShowResumeActions` to also return true when
+ `plannedTestCount && results.length < plannedTestCount`.
+- `apps/studio/src/components/ResumeRunActions.tsx` and the two run
+ detail routes — pass `plannedTestCount` through.
+
+### Tests (narrow)
+- `apps/cli/test/commands/results/serve.test.ts` (or co-located) — DELETE
+ endpoint: 404 unknown, 403 read-only, 200 happy path with a fake child
+ whose `kill` is observable.
+- `apps/studio/src/components/resume-run-helpers.test.ts` — case where
+ every result is `ok` but `results.length < plannedTestCount` → button
+ visible.
+- `apps/studio/src/components/stop-run-helpers.test.ts` — visibility
+ matrix.
+- Skip: integration test that signals a real eval. Manual UAT is enough.
+
+## Non-goals
+
+- "Pause" / resume-from-mid-test semantics.
+- Per-test SIGINT to graders.
+- Multi-job orchestration / job queue.
+- A "stopping" status word — UI handles the optimistic flicker locally.
+
+## Acceptance signals
+
+- [ ] CLI: `Ctrl+C` during a multi-test eval kills all spawned providers
+ and the partial `index.jsonl` is preserved.
+- [ ] Server: `DELETE /api/eval/run/:id` returns 200 in normal mode, 403
+ in read-only, 404 for unknown id.
+- [ ] UI: Stop button appears while running, hidden in read-only, hidden
+ when terminal. Optimistic "Stopping…" label appears until status
+ poll flips.
+- [ ] UI: A run with 5 of 10 tests `ok` and 5 missing shows Resume.
+- [ ] UI: A complete passing run does *not* show Resume.
+
+## Out-of-scope cleanups noted
+
+The benchmark.json write happens entirely at end-of-run today. Writing a
+stub at start means a run that crashes before the first test still has a
+metadata file on disk — that may obsolete some of the fallback logic in
+`deriveResumeMeta`. We are not consolidating that here; this PR only adds
+the early-write call.
diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts
index 4635d0e26..06ed5c810 100644
--- a/packages/core/src/evaluation/providers/claude-cli.ts
+++ b/packages/core/src/evaluation/providers/claude-cli.ts
@@ -5,6 +5,7 @@ import type { WriteStream } from 'node:fs';
import { mkdir } from 'node:fs/promises';
import path from 'node:path';
+import { trackChild } from '../../runtime/child-tracker.js';
import { extractTextContent, toContentArray } from './claude-content.js';
import { recordClaudeLogEntry } from './claude-log-tracker.js';
import { normalizeToolCall } from './normalize-tool-call.js';
@@ -270,6 +271,7 @@ export class ClaudeCliProvider implements Provider {
}
const child = spawn(this.config.executable, options.args, spawnOptions);
+ trackChild(child);
let stdout = '';
let stderr = '';
diff --git a/packages/core/src/evaluation/providers/codex-cli.ts b/packages/core/src/evaluation/providers/codex-cli.ts
index c776c77c5..232d540a4 100644
--- a/packages/core/src/evaluation/providers/codex-cli.ts
+++ b/packages/core/src/evaluation/providers/codex-cli.ts
@@ -7,6 +7,7 @@ import { tmpdir } from 'node:os';
import path from 'node:path';
import { promisify } from 'node:util';
+import { trackChild } from '../../runtime/child-tracker.js';
import { recordCodexLogEntry } from './codex-log-tracker.js';
import { buildPromptDocument, normalizeInputFiles } from './preread.js';
import type { CodexResolvedConfig } from './targets.js';
@@ -755,6 +756,7 @@ async function defaultCodexRunner(options: CodexRunOptions): Promise {
env: options.env,
stdio: ['pipe', 'pipe', 'pipe'],
});
+ trackChild(child);
let stdout = '';
let stderr = '';
diff --git a/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts b/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
index 00bd69748..036c723b9 100644
--- a/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
+++ b/packages/core/src/evaluation/providers/vscode/dispatch/vscodeProcess.ts
@@ -3,6 +3,7 @@ import { mkdir, writeFile } from 'node:fs/promises';
import path from 'node:path';
import { promisify } from 'node:util';
+import { trackChild } from '../../../../runtime/child-tracker.js';
import { pathExists, removeIfExists } from '../utils/fs.js';
import { pathToFileUri } from '../utils/path.js';
import { sleep } from '../utils/time.js';
@@ -36,6 +37,7 @@ function spawnVsCode(
shell: useShell,
detached: false,
});
+ trackChild(child);
child.on('error', (error) => {
const label = options?.label ?? 'spawn';
const renderedArgs = args.map((value) => JSON.stringify(value)).join(' ');
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index d6d1a349a..b88ba6eb2 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -132,6 +132,11 @@ export {
export { discoverGraders } from './evaluation/registry/grader-discovery.js';
export { RunBudgetTracker } from './evaluation/run-budget-tracker.js';
export { runBeforeSessionHook, parseEnvOutput } from './evaluation/hooks.js';
+export {
+ trackChild,
+ killAllTrackedChildren,
+ trackedChildCount,
+} from './runtime/child-tracker.js';
// Import pipeline
export * from './import/index.js';
diff --git a/packages/core/src/runtime/child-tracker.ts b/packages/core/src/runtime/child-tracker.ts
new file mode 100644
index 000000000..ed9e5e036
--- /dev/null
+++ b/packages/core/src/runtime/child-tracker.ts
@@ -0,0 +1,39 @@
+/**
+ * Tracks long-lived child processes spawned by AgentV providers so that a
+ * top-level signal handler can kill them all on Ctrl+C / SIGTERM.
+ *
+ * Why this exists: when the CLI receives SIGTERM (e.g. from Studio's Stop
+ * button), Node exits the parent process but does NOT propagate the signal
+ * to grandchildren. Without tracking, the spawned `claude`, `codex`, `pi`,
+ * `copilot` subprocesses linger as orphans. The CLI's signal handler walks
+ * this set and SIGTERMs each one before exiting.
+ *
+ * To extend: any provider that spawns a long-lived subprocess should call
+ * `trackChild(child)` immediately after `spawn(...)`. No untrack call is
+ * required — `kill()` is a no-op on dead PIDs and the registry lives only
+ * for the duration of one CLI invocation.
+ */
+
+import type { ChildProcess } from 'child_process';
+
+const tracked = new Set<ChildProcess>();
+
+export function trackChild(child: ChildProcess): void {
+ tracked.add(child);
+ child.once('close', () => tracked.delete(child));
+ child.once('exit', () => tracked.delete(child));
+}
+
+export function killAllTrackedChildren(signal: NodeJS.Signals = 'SIGTERM'): void {
+ for (const child of tracked) {
+ try {
+ child.kill(signal);
+ } catch {
+ // Already dead or unable to signal; nothing to do.
+ }
+ }
+}
+
+export function trackedChildCount(): number {
+ return tracked.size;
+}
From 576d097ba14a4972c98d197109f5f8805c6bae40 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Thu, 7 May 2026 07:56:33 +0200
Subject: [PATCH 2/4] chore: fix biome formatting drive-by + structural
Killable type
- Pre-existing format errors in two studio routes block any push; fixed
with biome --write so CI passes.
- child-tracker uses a structural Killable type to avoid a tsup dts
resolution failure on `node:child_process` re-export.
---
.../$benchmarkId_/evals/$runId.$evalId.tsx | 4 +++-
.../src/routes/evals/$runId.$evalId.tsx | 4 +++-
packages/core/src/runtime/child-tracker.ts | 20 ++++++++++++++-----
3 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
index 169d6d643..2bf2b9ea6 100644
--- a/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
+++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/evals/$runId.$evalId.tsx
@@ -64,7 +64,9 @@ function BenchmarkEvalDetailPage() {
Run: {runId} / Eval: {evalId}
-
+
{passed ? '✓' : '✗'}
{evalId}
diff --git a/packages/core/src/runtime/child-tracker.ts b/packages/core/src/runtime/child-tracker.ts
index ed9e5e036..573a00515 100644
--- a/packages/core/src/runtime/child-tracker.ts
+++ b/packages/core/src/runtime/child-tracker.ts
@@ -12,16 +12,26 @@
* `trackChild(child)` immediately after `spawn(...)`. No untrack call is
* required — `kill()` is a no-op on dead PIDs and the registry lives only
* for the duration of one CLI invocation.
+ *
+ * Why a structural `Killable` type instead of `ChildProcess`: importing
+ * the real type from `node:child_process` triggers a rollup-plugin-dts
+ * resolution failure when this module's signatures are re-exported from
+ * the package entry. The structural shape avoids that, and any
+ * `ChildProcess` satisfies it.
*/
-import type { ChildProcess } from 'child_process';
+type Killable = {
+ kill(signal?: NodeJS.Signals | number): boolean;
+ once(event: 'close', listener: (...args: unknown[]) => void): unknown;
+};
-const tracked = new Set<ChildProcess>();
+const tracked = new Set<Killable>();
-export function trackChild(child: ChildProcess): void {
+export function trackChild(child: Killable): void {
tracked.add(child);
- child.once('close', () => tracked.delete(child));
- child.once('exit', () => tracked.delete(child));
+ child.once('close', () => {
+ tracked.delete(child);
+ });
}
export function killAllTrackedChildren(signal: NodeJS.Signals = 'SIGTERM'): void {
From 974f386baf1b269117ff70831d4b4358e59dafa8 Mon Sep 17 00:00:00 2001
From: Christopher Tso
Date: Thu, 7 May 2026 08:16:27 +0200
Subject: [PATCH 3/4] feat(studio): DELETE stop endpoint, Stop button,
partial-run resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- DELETE /api/eval/run/:id (and benchmark-scoped variant) SIGTERMs the
spawned CLI. The CLI's own signal handler walks the child registry
added in the previous commit and kills grandchildren before exiting.
Existing child.on('close') flips status — no new 'stopping' state.
- StopRunButton on /jobs/:runId, hidden when terminal or read-only.
Optimistic 'Stopping…' label until the next status poll observes a
terminal state.
- planned_test_count persisted in benchmark.json.metadata via a stub
written at run start. Resume-action visibility now triggers when
results.length < planned_test_count even with no execution_error
rows — covers Stop-button / Ctrl+C cases.
- Narrow tests: shouldShowStopButton matrix, partial-run resume helper,
DELETE 404/403 routes for both base + benchmark-scoped paths. Happy
path of DELETE→SIGTERM is covered by manual UAT.
---
apps/cli/src/commands/eval/artifact-writer.ts | 78 +++++++++++++++++--
apps/cli/src/commands/eval/run-eval.ts | 16 ++++
apps/cli/src/commands/results/eval-runner.ts | 41 ++++++++++
apps/cli/src/commands/results/serve.ts | 12 ++-
apps/cli/test/commands/results/serve.test.ts | 44 +++++++++++
.../src/components/ResumeRunActions.tsx | 4 +-
apps/studio/src/components/StopRunButton.tsx | 62 +++++++++++++++
.../src/components/resume-run-helpers.test.ts | 17 ++++
.../src/components/resume-run-helpers.ts | 27 +++++--
.../src/components/stop-run-helpers.test.ts | 36 +++++++++
.../studio/src/components/stop-run-helpers.ts | 28 +++++++
apps/studio/src/lib/api.ts | 15 ++++
apps/studio/src/lib/types.ts | 2 +
.../benchmarks/$benchmarkId_/runs/$runId.tsx | 1 +
apps/studio/src/routes/jobs/$runId.tsx | 8 +-
apps/studio/src/routes/runs/$runId.tsx | 1 +
16 files changed, 374 insertions(+), 18 deletions(-)
create mode 100644 apps/studio/src/components/StopRunButton.tsx
create mode 100644 apps/studio/src/components/stop-run-helpers.test.ts
create mode 100644 apps/studio/src/components/stop-run-helpers.ts
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index e005d786f..c0a511915 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -34,7 +34,7 @@ export function deduplicateByTestIdTarget(
export async function aggregateRunDir(
runDir: string,
- options?: { evalFile?: string; experiment?: string },
+ options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{ benchmarkPath: string; timingPath: string; testCount: number; targetCount: number }> {
const indexPath = path.join(runDir, RESULT_INDEX_FILENAME);
const content = await readFile(indexPath, 'utf8');
@@ -45,7 +45,18 @@ export async function aggregateRunDir(
const timingPath = path.join(runDir, 'timing.json');
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
+ // Preserve `planned_test_count` from any pre-existing benchmark.json (e.g.
+ // the stub written at run start, or from the original run when this is a
+ // resume) unless an explicit value was passed.
+ const plannedTestCount =
+ options?.plannedTestCount ?? (await readPlannedTestCount(path.join(runDir, 'benchmark.json')));
+
+ const benchmark = buildBenchmarkArtifact(
+ results,
+ options?.evalFile,
+ options?.experiment,
+ plannedTestCount,
+ );
const benchmarkPath = path.join(runDir, 'benchmark.json');
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
@@ -53,6 +64,17 @@ export async function aggregateRunDir(
return { benchmarkPath, timingPath, testCount: results.length, targetCount: targetSet.size };
}
+async function readPlannedTestCount(benchmarkPath: string): Promise<number | undefined> {
+ try {
+ const raw = await readFile(benchmarkPath, 'utf8');
+ const parsed = JSON.parse(raw) as { metadata?: { planned_test_count?: number } };
+ const value = parsed.metadata?.planned_test_count;
+ return typeof value === 'number' && Number.isFinite(value) ? value : undefined;
+ } catch {
+ return undefined;
+ }
+}
+
// ---------------------------------------------------------------------------
// Artifact interfaces (snake_case to match skill-creator conventions)
// ---------------------------------------------------------------------------
@@ -110,6 +132,13 @@ export interface BenchmarkArtifact {
readonly targets: readonly string[];
readonly tests_run: readonly string[];
readonly experiment?: string;
+ /**
+ * Total number of test cases the run was planned to execute (across all
+ * targets and eval files). Written at run start so an interrupted run
+ * can be diagnosed as resumable when `tests_run.length < planned_test_count`,
+ * even if every recorded row has `execution_status: ok`.
+ */
+ readonly planned_test_count?: number;
};
readonly run_summary: Record<
string,
@@ -364,6 +393,7 @@ export function buildBenchmarkArtifact(
results: readonly EvaluationResult[],
evalFile = '',
experiment?: string,
+ plannedTestCount?: number,
): BenchmarkArtifact {
  const targetSet = new Set<string>();
  const testIdSet = new Set<string>();
@@ -457,6 +487,7 @@ export function buildBenchmarkArtifact(
targets,
tests_run: testIds,
experiment,
+ planned_test_count: plannedTestCount,
},
run_summary: runSummary,
per_grader_summary: perEvaluatorSummary,
@@ -464,6 +495,35 @@ export function buildBenchmarkArtifact(
};
}
+/**
+ * Write a stub `benchmark.json` at the start of a run, before any tests
+ * have executed. Carries `planned_test_count` so an interrupted run can
+ * still be detected as resumable even when every recorded row has
+ * `execution_status: ok`.
+ *
+ * The end-of-run write (writeArtifactsFromResults / aggregateRunDir)
+ * overwrites this file with the full summary; preserve `planned_test_count`
+ * by passing it through.
+ */
+export async function writeInitialBenchmarkArtifact(
+ runDir: string,
+ options: {
+ evalFile: string;
+ plannedTestCount: number;
+ experiment?: string;
+ },
+): Promise<void> {
+ await mkdir(runDir, { recursive: true });
+ const stub = buildBenchmarkArtifact(
+ [],
+ options.evalFile,
+ options.experiment,
+ options.plannedTestCount,
+ );
+ const benchmarkPath = path.join(runDir, 'benchmark.json');
+ await writeFile(benchmarkPath, `${JSON.stringify(stub, null, 2)}\n`, 'utf8');
+}
+
export function buildAggregateGradingArtifact(
results: readonly EvaluationResult[],
): AggregateGradingArtifact {
@@ -826,7 +886,7 @@ export async function writePerTestArtifacts(
export async function writeArtifactsFromResults(
results: readonly EvaluationResult[],
outputDir: string,
- options?: { evalFile?: string; experiment?: string },
+ options?: { evalFile?: string; experiment?: string; plannedTestCount?: number },
): Promise<{
testArtifactDir: string;
timingPath: string;
@@ -877,8 +937,16 @@ export async function writeArtifactsFromResults(
const timing = buildTimingArtifact(results);
await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8');
- // Write benchmark
- const benchmark = buildBenchmarkArtifact(results, options?.evalFile, options?.experiment);
+ // Write benchmark — preserve `planned_test_count` from the run-start stub
+ // (or from the original run when this is a resume) unless an explicit
+ // value was passed by the caller.
+ const plannedTestCount = options?.plannedTestCount ?? (await readPlannedTestCount(benchmarkPath));
+ const benchmark = buildBenchmarkArtifact(
+ results,
+ options?.evalFile,
+ options?.experiment,
+ plannedTestCount,
+ );
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
await writeJsonlFile(indexPath, indexRecords);
diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
index db0b0fb19..0f3536061 100644
--- a/apps/cli/src/commands/eval/run-eval.ts
+++ b/apps/cli/src/commands/eval/run-eval.ts
@@ -38,6 +38,7 @@ import {
deduplicateByTestIdTarget,
parseJsonlResults,
writeArtifactsFromResults,
+ writeInitialBenchmarkArtifact,
} from './artifact-writer.js';
import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
@@ -1447,6 +1448,21 @@ export async function runEvalCommand(
);
}
+ // Write a stub benchmark.json before dispatching tests, carrying the planned
+ // execution count so an interrupted run can still surface as resumable in
+ // Studio (results.length < planned_test_count) even when every recorded row
+ // has execution_status: ok. The end-of-run write preserves this value via
+ // readPlannedTestCount inside aggregateRunDir / writeArtifactsFromResults.
+ // Skip on resume — we want to preserve the *original* planned count.
+ if (!isResumeAppend && usesDefaultArtifactWorkspace && totalEvalCount > 0) {
+ const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
+ await writeInitialBenchmarkArtifact(runDir, {
+ evalFile,
+ plannedTestCount: totalEvalCount,
+ experiment: normalizeExperimentName(options.experiment),
+ });
+ }
+
// Eval files run sequentially; within each file, --workers N test cases run in parallel.
// This matches industry practice (promptfoo, deepeval, OpenAI Evals) and avoids cross-file
// workspace races without any grouping complexity.
diff --git a/apps/cli/src/commands/results/eval-runner.ts b/apps/cli/src/commands/results/eval-runner.ts
index e40c01c74..7f295bf05 100644
--- a/apps/cli/src/commands/results/eval-runner.ts
+++ b/apps/cli/src/commands/results/eval-runner.ts
@@ -412,6 +412,29 @@ export function registerEvalRoutes(
}
});
+ // ── Stop a running eval ────────────────────────────────────────────────
+ // SIGTERM the spawned CLI; the existing child.on('close') flips status to
+ // 'finished'/'failed'. The CLI's own signal handler walks its tracked
+ // grandchildren (claude/codex/pi/copilot subprocesses) and kills them
+ // before exiting.
+ app.delete('/api/eval/run/:id', (c) => {
+ if (readOnly) {
+ return c.json({ error: 'Studio is running in read-only mode' }, 403);
+ }
+ const id = c.req.param('id');
+ const run = activeRuns.get(id ?? '');
+ if (!run) return c.json({ error: 'Run not found' }, 404);
+ if (run.status === 'finished' || run.status === 'failed' || !run.process) {
+ return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
+ }
+ try {
+ run.process.kill('SIGTERM');
+ } catch (err) {
+ return c.json({ error: (err as Error).message }, 500);
+ }
+ return c.json({ stopped: true, status: run.status });
+ });
+
// ── Run status ─────────────────────────────────────────────────────────
app.get('/api/eval/status/:id', (c) => {
const id = c.req.param('id');
@@ -576,6 +599,24 @@ export function registerEvalRoutes(
}
});
+ app.delete('/api/benchmarks/:benchmarkId/eval/run/:id', (c) => {
+ if (readOnly) {
+ return c.json({ error: 'Studio is running in read-only mode' }, 403);
+ }
+ const id = c.req.param('id');
+ const run = activeRuns.get(id ?? '');
+ if (!run) return c.json({ error: 'Run not found' }, 404);
+ if (run.status === 'finished' || run.status === 'failed' || !run.process) {
+ return c.json({ stopped: false, reason: 'already_terminal', status: run.status });
+ }
+ try {
+ run.process.kill('SIGTERM');
+ } catch (err) {
+ return c.json({ error: (err as Error).message }, 500);
+ }
+ return c.json({ stopped: true, status: run.status });
+ });
+
app.get('/api/benchmarks/:benchmarkId/eval/status/:id', (c) => {
const id = c.req.param('id');
const run = activeRuns.get(id ?? '');
diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts
index f88bf212e..8468bdf88 100644
--- a/apps/cli/src/commands/results/serve.ts
+++ b/apps/cli/src/commands/results/serve.ts
@@ -347,8 +347,8 @@ async function handleRunDetail(c: C, { searchDir }: DataContext) {
function deriveResumeMeta(
cwd: string,
manifestPath: string,
-): { run_dir?: string; suite_filter?: string } {
- const out: { run_dir?: string; suite_filter?: string } = {};
+): { run_dir?: string; suite_filter?: string; planned_test_count?: number } {
+ const out: { run_dir?: string; suite_filter?: string; planned_test_count?: number } = {};
const runDir = path.dirname(manifestPath);
const relative = path.relative(cwd, runDir);
// path.relative returns '..'-prefixed paths when runDir is outside cwd; keep
@@ -359,15 +359,19 @@ function deriveResumeMeta(
const benchmarkPath = path.join(runDir, 'benchmark.json');
if (existsSync(benchmarkPath)) {
const parsed = JSON.parse(readFileSync(benchmarkPath, 'utf8')) as {
- metadata?: { eval_file?: string };
+ metadata?: { eval_file?: string; planned_test_count?: number };
};
const evalFile = parsed.metadata?.eval_file;
if (typeof evalFile === 'string' && evalFile.trim()) {
out.suite_filter = evalFile.trim();
}
+ const planned = parsed.metadata?.planned_test_count;
+ if (typeof planned === 'number' && Number.isFinite(planned) && planned > 0) {
+ out.planned_test_count = planned;
+ }
}
} catch {
- // benchmark.json missing / unreadable / malformed — leave suite_filter unset.
+ // benchmark.json missing / unreadable / malformed — leave fields unset.
}
return out;
}
diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts
index 0e69e495a..d1a1d8ae5 100644
--- a/apps/cli/test/commands/results/serve.test.ts
+++ b/apps/cli/test/commands/results/serve.test.ts
@@ -1003,6 +1003,50 @@ describe('serve app', () => {
});
});
+ // ── DELETE /api/eval/run/:id — stop a running eval ─────────────────────
+ //
+ // The DELETE endpoint validates routing/auth shape (404 unknown id, 403
+ // read-only). The happy-path SIGTERM behavior is covered by manual UAT
+ // because it requires a live subprocess that is reliably mid-run; unit
+ // tests that race a launch against a delete are flaky.
+
+ describe('DELETE /api/eval/run/:id (stop API)', () => {
+ function makeAppForStop(opts?: { readOnly?: boolean }) {
+ return createApp([], tempDir, undefined, undefined, {
+ studioDir,
+ readOnly: opts?.readOnly === true,
+ });
+ }
+
+ it('returns 404 for an unknown run id', async () => {
+ const app = makeAppForStop();
+ const res = await app.request('/api/eval/run/no-such-id', { method: 'DELETE' });
+ expect(res.status).toBe(404);
+ });
+
+ it('returns 403 in read-only mode', async () => {
+ const app = makeAppForStop({ readOnly: true });
+ const res = await app.request('/api/eval/run/anything', { method: 'DELETE' });
+ expect(res.status).toBe(403);
+ });
+
+ it('returns 404 for benchmark-scoped DELETE with unknown run id', async () => {
+ const app = makeAppForStop();
+ const res = await app.request('/api/benchmarks/some-id/eval/run/no-such-id', {
+ method: 'DELETE',
+ });
+ expect(res.status).toBe(404);
+ });
+
+ it('returns 403 in read-only mode for benchmark-scoped DELETE', async () => {
+ const app = makeAppForStop({ readOnly: true });
+ const res = await app.request('/api/benchmarks/some-id/eval/run/anything', {
+ method: 'DELETE',
+ });
+ expect(res.status).toBe(403);
+ });
+ });
+
// ── POST /api/eval/preview — argument shaping for resume flags ─────────
//
// /api/eval/preview is a lightweight endpoint that returns the CLI
diff --git a/apps/studio/src/components/ResumeRunActions.tsx b/apps/studio/src/components/ResumeRunActions.tsx
index d161c9d22..f50ebe5b5 100644
--- a/apps/studio/src/components/ResumeRunActions.tsx
+++ b/apps/studio/src/components/ResumeRunActions.tsx
@@ -35,6 +35,7 @@ export interface ResumeRunActionsProps {
target?: string;
benchmarkId?: string;
isReadOnly: boolean;
+ plannedTestCount?: number;
}
export function ResumeRunActions({
@@ -44,12 +45,13 @@ export function ResumeRunActions({
target,
benchmarkId,
isReadOnly,
+ plannedTestCount,
}: ResumeRunActionsProps) {
const navigate = useNavigate();
const [busy, setBusy] = useState(null);
  const [error, setError] = useState<string | null>(null);
- if (!shouldShowResumeActions(results, isReadOnly)) return null;
+ if (!shouldShowResumeActions(results, isReadOnly, plannedTestCount)) return null;
// Both actions need the run dir + the original eval file. Without those
// we can't target the existing run workspace, so we render the buttons
diff --git a/apps/studio/src/components/StopRunButton.tsx b/apps/studio/src/components/StopRunButton.tsx
new file mode 100644
index 000000000..39ce1433e
--- /dev/null
+++ b/apps/studio/src/components/StopRunButton.tsx
@@ -0,0 +1,62 @@
+/**
+ * StopRunButton — destructive button on /jobs/:runId that terminates a
+ * Studio-launched eval.
+ *
+ * Calls DELETE /api/eval/run/:id (or the benchmark-scoped variant).
+ * Optimistically flips the local label to "Stopping…" until the next
+ * poll of /api/eval/status/:id observes a terminal state — at which
+ * point the button hides via `shouldShowStopButton`.
+ *
+ * To extend with a "Force kill" affordance: thread a second handler
+ * through that issues the DELETE again (the CLI's signal handler
+ * interprets a second SIGTERM within the same process as a hard exit)
+ * and surfaces a confirmation prompt.
+ */
+
+import { useState } from 'react';
+
+import { stopEvalRun } from '~/lib/api';
+
+import { type RunStatus, shouldShowStopButton } from './stop-run-helpers';
+
+export interface StopRunButtonProps {
+ runId: string;
+ status: RunStatus | undefined;
+ isReadOnly: boolean;
+ benchmarkId?: string;
+}
+
+export function StopRunButton({ runId, status, isReadOnly, benchmarkId }: StopRunButtonProps) {
+ const [stopping, setStopping] = useState(false);
+  const [error, setError] = useState<string | null>(null);
+
+ if (!shouldShowStopButton(status, isReadOnly)) return null;
+
+ async function onClick() {
+ setStopping(true);
+ setError(null);
+ try {
+ await stopEvalRun(runId, benchmarkId);
+ } catch (err) {
+ setError(err instanceof Error ? err.message : 'Failed to stop run');
+ setStopping(false);
+ }
+ // On success, leave `stopping=true`. The status poller will flip to
+ // a terminal state shortly, at which point the button unmounts.
+ }
+
+  return (
+    <div>
+      <button type="button" onClick={onClick} disabled={stopping}>
+        {stopping ? 'Stopping…' : 'Stop run'}
+      </button>
+      {error && <div role="alert">{error}</div>}
+    </div>
+  );
+}
diff --git a/apps/studio/src/components/resume-run-helpers.test.ts b/apps/studio/src/components/resume-run-helpers.test.ts
index 39e2d807c..9ceb71870 100644
--- a/apps/studio/src/components/resume-run-helpers.test.ts
+++ b/apps/studio/src/components/resume-run-helpers.test.ts
@@ -32,6 +32,23 @@ describe('shouldShowResumeActions', () => {
it('hides on empty results', () => {
expect(shouldShowResumeActions([], false)).toBe(false);
});
+
+ it('shows for an incomplete partial run with only ok rows when planned_test_count exceeds results', () => {
+ // Stop button / Ctrl+C scenario: 5 of 10 planned tests finished
+ // successfully before the run was killed. No execution errors, but
+ // still resumable.
+ const results = [ok('a'), ok('b'), ok('c'), ok('d'), ok('e')];
+ expect(shouldShowResumeActions(results, false, 10)).toBe(true);
+ });
+
+ it('hides when results match planned_test_count (complete passing run)', () => {
+ const results = [ok('a'), ok('b'), ok('c')];
+ expect(shouldShowResumeActions(results, false, 3)).toBe(false);
+ });
+
+ it('hides incomplete partial run in read-only mode', () => {
+ expect(shouldShowResumeActions([ok('a')], true, 5)).toBe(false);
+ });
});
describe('buildResumeRequestBody', () => {
diff --git a/apps/studio/src/components/resume-run-helpers.ts b/apps/studio/src/components/resume-run-helpers.ts
index fcbf805a0..b8e6451f5 100644
--- a/apps/studio/src/components/resume-run-helpers.ts
+++ b/apps/studio/src/components/resume-run-helpers.ts
@@ -21,14 +21,29 @@ export interface BuildResumeRequestParams {
}
/**
- * Whether the resume actions should be visible. The button only makes sense
- * when at least one row failed with an execution error and the user has
- * write access (read-only mode hides the entire control rather than
- * showing a disabled button — see issue acceptance criteria).
+ * Whether the resume actions should be visible. The button is shown when:
+ * 1. At least one recorded row has `execution_status: execution_error`, OR
+ * 2. The run is *incomplete* — fewer recorded rows than the originally
+ * planned execution count, even if every recorded row is `ok`.
+ *
+ * Case 2 covers Stop-button / Ctrl+C interruptions where the run produced
+ * only successful rows before being killed: there is no `execution_error`
+ * to anchor on, but the run is still resumable. `plannedTestCount` is
+ * persisted in `benchmark.json.metadata` at run start (see
+ * `writeInitialBenchmarkArtifact`).
+ *
+ * Hidden in read-only mode — the server also returns 403, but UI-level
+ * hiding avoids dead controls.
*/
-export function shouldShowResumeActions(results: EvalResult[], isReadOnly: boolean): boolean {
+export function shouldShowResumeActions(
+ results: EvalResult[],
+ isReadOnly: boolean,
+ plannedTestCount?: number,
+): boolean {
if (isReadOnly) return false;
- return results.some((r) => r.executionStatus === 'execution_error');
+ if (results.some((r) => r.executionStatus === 'execution_error')) return true;
+ if (plannedTestCount !== undefined && results.length < plannedTestCount) return true;
+ return false;
}
/**
diff --git a/apps/studio/src/components/stop-run-helpers.test.ts b/apps/studio/src/components/stop-run-helpers.test.ts
new file mode 100644
index 000000000..3bd407b1b
--- /dev/null
+++ b/apps/studio/src/components/stop-run-helpers.test.ts
@@ -0,0 +1,36 @@
+import { describe, expect, it } from 'bun:test';
+
+import { isTerminalRunStatus, shouldShowStopButton } from './stop-run-helpers';
+
+describe('isTerminalRunStatus', () => {
+ it('treats finished and failed as terminal', () => {
+ expect(isTerminalRunStatus('finished')).toBe(true);
+ expect(isTerminalRunStatus('failed')).toBe(true);
+ });
+
+ it('treats live states and unknowns as non-terminal', () => {
+ expect(isTerminalRunStatus('starting')).toBe(false);
+ expect(isTerminalRunStatus('running')).toBe(false);
+ expect(isTerminalRunStatus(undefined)).toBe(false);
+ });
+});
+
+describe('shouldShowStopButton', () => {
+ it('shows while the run is live', () => {
+ expect(shouldShowStopButton('starting', false)).toBe(true);
+ expect(shouldShowStopButton('running', false)).toBe(true);
+ });
+
+ it('hides once the run reaches a terminal state', () => {
+ expect(shouldShowStopButton('finished', false)).toBe(false);
+ expect(shouldShowStopButton('failed', false)).toBe(false);
+ });
+
+ it('hides in read-only mode regardless of status', () => {
+ expect(shouldShowStopButton('running', true)).toBe(false);
+ });
+
+ it('hides when the status is undefined', () => {
+ expect(shouldShowStopButton(undefined, false)).toBe(false);
+ });
+});
diff --git a/apps/studio/src/components/stop-run-helpers.ts b/apps/studio/src/components/stop-run-helpers.ts
new file mode 100644
index 000000000..fcef0e56a
--- /dev/null
+++ b/apps/studio/src/components/stop-run-helpers.ts
@@ -0,0 +1,28 @@
+/**
+ * Pure helpers backing StopRunButton, isolated for unit testing.
+ *
+ * Intentionally side-effect-free so the visibility matrix is testable
+ * without rendering React.
+ *
+ * To extend: extend the union of statuses recognized as non-terminal as
+ * the server adds new lifecycle states. Today the server only emits
+ * starting / running / finished / failed; anything not in the terminal
+ * set is treated as live.
+ */
+
+export type RunStatus = 'starting' | 'running' | 'finished' | 'failed' | (string & {});
+
+export function isTerminalRunStatus(status: RunStatus | undefined): boolean {
+ return status === 'finished' || status === 'failed';
+}
+
+/**
+ * Whether the Stop button should be visible. Hidden when the run is
+ * terminal (no process to kill) and in read-only mode (the API also
+ * 403s, but UI-level hiding avoids dead controls).
+ */
+export function shouldShowStopButton(status: RunStatus | undefined, isReadOnly: boolean): boolean {
+ if (isReadOnly) return false;
+ if (!status) return false;
+ return !isTerminalRunStatus(status);
+}
diff --git a/apps/studio/src/lib/api.ts b/apps/studio/src/lib/api.ts
index 1ea719e6e..1edde07f9 100644
--- a/apps/studio/src/lib/api.ts
+++ b/apps/studio/src/lib/api.ts
@@ -539,6 +539,21 @@ export async function launchEvalRun(
return res.json() as Promise;
}
+export async function stopEvalRun(
+ runId: string,
+ benchmarkId?: string,
+): Promise<{ stopped: boolean; reason?: string; status?: string }> {
+ const url = benchmarkId
+ ? `${benchmarkApiBase(benchmarkId)}/eval/run/${runId}`
+ : `/api/eval/run/${runId}`;
+ const res = await fetch(url, { method: 'DELETE' });
+ if (!res.ok) {
+ const err = await res.json().catch(() => ({ error: res.statusText }));
+ throw new Error((err as { error?: string }).error ?? `Failed: ${res.status}`);
+ }
+ return res.json() as Promise<{ stopped: boolean; reason?: string; status?: string }>;
+}
+
export function evalRunStatusOptions(runId: string | null) {
return queryOptions({
queryKey: ['eval-status', runId],
diff --git a/apps/studio/src/lib/types.ts b/apps/studio/src/lib/types.ts
index 46f321d32..6ef4dc1b4 100644
--- a/apps/studio/src/lib/types.ts
+++ b/apps/studio/src/lib/types.ts
@@ -80,6 +80,8 @@ export interface RunDetailResponse {
run_dir?: string;
/** Eval file path the run was launched against, if recorded in benchmark.json. Local runs only. */
suite_filter?: string;
+ /** Total (test_id, target) executions originally planned for this run. Used to detect incomplete partial runs as resumable. Local runs only, populated when the run was launched after the planned-count metadata feature shipped. */
+ planned_test_count?: number;
}
export interface SuiteSummary {
diff --git a/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx b/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx
index db0f28bd4..dfb41c244 100644
--- a/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx
+++ b/apps/studio/src/routes/benchmarks/$benchmarkId_/runs/$runId.tsx
@@ -77,6 +77,7 @@ function BenchmarkRunDetailPage() {
target={target ?? undefined}
benchmarkId={benchmarkId}
isReadOnly={isReadOnly}
+ plannedTestCount={data?.planned_test_count}
/>
{!isReadOnly && (