From da03e03364c7aabf73bd42e3cb54b2b63de1420c Mon Sep 17 00:00:00 2001 From: msfstef Date: Thu, 4 Jun 2026 20:55:49 +0300 Subject: [PATCH] fix(agents): reclaim zombie sandbox containers and create them lazily Users opening the desktop app found 15+ electric-sbx-* containers running that they never asked for. Several compounding bugs: - The boot sweep only removed *exited ephemeral* leftovers, but crash/ quit leftovers are RUNNING (PID 1 is an infinite sleep loop), so it never reclaimed anything real. Containers now carry an owner-pid label; the sweep reclaims running orphans whose owner is dead (remove ephemeral / stop persistent), consults an in-container adoption marker so a live process that reattached a dead creator's container is never swept, and is awaited at boot so it can't race reattaches. - The debounced idle teardowns are unref'd timers that died with the process: every graceful quit leaked the recently-active containers as running zombies. Runtime shutdown now flushes pending teardowns (BuiltinAgentsServer.stop), leaving live-leased entries to their own dispose or the next boot's sweep. - A failed post-start init (mkdir exec) left a started container that was never registered - invisible to dispose and, while running, to the sweep. Creation now verifies the init exit code and removes the container on any failure. Fixing this exposed that isNameConflict() treated any HTTP 409 as a lost create race (exec on an exited container is also 409), silently "reattaching" to a removed container; it now matches the daemon's name-conflict message. - Sandbox creation was eager on every claimed wake, so a reconnect backlog of trivial wakes (cron ticks, bookkeeping) stampeded the daemon with containers. The docker profile now returns a lazySandbox wrapper that defers the provider factory to first actual use. Terminal reclaim without use goes through reclaimDockerSandboxByKey (no create-to-delete), spawn-inherit force-materializes the owner's workspace before the child can attach, and concurrent creations are capped at 4 to smooth real bursts. - All sandbox containers now carry compose project/service labels (com.docker.compose.project=electric-sandboxes) so Docker Desktop groups them under one entry and they can be stopped/deleted together (docker compose -p electric-sandboxes down). Co-Authored-By: Claude Opus 4.8 (1M context) --- .changeset/zombie-sandbox-containers-fix.md | 24 ++ packages/agents-runtime/src/process-wake.ts | 11 +- packages/agents-runtime/src/sandbox-docker.ts | 2 + packages/agents-runtime/src/sandbox.ts | 2 + packages/agents-runtime/src/sandbox/docker.ts | 340 +++++++++++++++--- packages/agents-runtime/src/sandbox/lazy.ts | 195 ++++++++++ .../test/sandbox-docker.test.ts | 212 ++++++++++- .../agents-runtime/test/sandbox-lazy.test.ts | 197 ++++++++++ packages/agents/src/bootstrap.ts | 124 ++++--- packages/agents/src/server.ts | 14 + 10 files changed, 1015 insertions(+), 106 deletions(-) create mode 100644 .changeset/zombie-sandbox-containers-fix.md create mode 100644 packages/agents-runtime/src/sandbox/lazy.ts create mode 100644 packages/agents-runtime/test/sandbox-lazy.test.ts diff --git a/.changeset/zombie-sandbox-containers-fix.md b/.changeset/zombie-sandbox-containers-fix.md new file mode 100644 index 0000000000..d05322fd4b --- /dev/null +++ b/.changeset/zombie-sandbox-containers-fix.md @@ -0,0 +1,24 @@ +--- +'@electric-ax/agents-runtime': patch +'@electric-ax/agents': patch +--- + +Fix Docker sandbox containers (`electric-sbx-*`) accumulating as zombies, and +stop creating containers for wakes that never use their sandbox: + +- The boot sweep now reclaims RUNNING orphans whose owning process died + (owner-pid label + in-container adoption marker), instead of only exited + ephemeral leftovers — previously crash/quit leftovers were never cleaned up. +- Runtime shutdown flushes the debounced idle teardowns (stop persistent / + remove ephemeral) instead of letting the unref'd timers die with the + process, which leaked a running container on every quit. +- A failed post-start init no longer leaves a running, untracked container + behind, and a 409 from inside creation is no longer misread as a name + conflict (which "reattached" to a removed container). +- Sandbox creation is now lazy: the container is only created/started when a + wake actually uses its sandbox, so backlog bursts of trivial wakes (cron + ticks, bookkeeping) on runner reconnect no longer spin up containers. + Terminal reclaim and spawn-`inherit` still work for never-used sandboxes, + and concurrent container creations are capped to smooth real bursts. +- All sandbox containers carry `com.docker.compose.project=electric-sandboxes` + so Docker GUIs group them and they can be stopped/removed together. diff --git a/packages/agents-runtime/src/process-wake.ts b/packages/agents-runtime/src/process-wake.ts index 63b0c3abb7..e54cbe3fbb 100644 --- a/packages/agents-runtime/src/process-wake.ts +++ b/packages/agents-runtime/src/process-wake.ts @@ -10,6 +10,7 @@ import { createSetupContext } from './setup-context' import { createEntityLogPrefix, runtimeLog } from './log' import { createRuntimeServerClient } from './runtime-server-client' import { unrestrictedSandbox } from './sandbox/unrestricted' +import { ensureSandboxMaterialized } from './sandbox/lazy' import { resolveSandboxIdentity } from './sandbox/identity' import { appendPathToUrl } from './url' import { manifestChildKey } from './manifest-helpers' @@ -1302,7 +1303,7 @@ export async function processWake( const requestedInherit = opts?.sandbox === `inherit` || (typeof opts?.sandbox === `object` && opts.sandbox.inherit === true) - const sandbox = requestedInherit + const childSandbox = requestedInherit ? resolvedSandboxSelection ? { profile: resolvedSandboxSelection.profile, @@ -1316,6 +1317,12 @@ export async function processWake( : opts?.sandbox === `inherit` ? undefined : opts?.sandbox + // An inheriting child only ever ATTACHES by key — make sure the + // owner's (lazily-created) container/workspace actually exists before + // the child can wake, even if this wake never ran a tool itself. + if (requestedInherit && resolvedSandboxSelection && sandbox) { + await ensureSandboxMaterialized(sandbox) + } return serverClient.spawnEntity({ type: childType, id: childId, @@ -1323,7 +1330,7 @@ export async function processWake( parentUrl, initialMessage: opts?.initialMessage, tags: opts?.tags, - sandbox, + sandbox: childSandbox, wake: wakeOpt, }) }, diff --git a/packages/agents-runtime/src/sandbox-docker.ts b/packages/agents-runtime/src/sandbox-docker.ts index c1c39dc6cd..ffb8e7c4bb 100644 --- a/packages/agents-runtime/src/sandbox-docker.ts +++ b/packages/agents-runtime/src/sandbox-docker.ts @@ -9,6 +9,8 @@ export { dockerSandbox, + reclaimDockerSandboxByKey, + shutdownAllDockerSandboxes, sweepOrphanedDockerSandboxes, __resetPersistentRegistryForTests, } from './sandbox/docker' diff --git a/packages/agents-runtime/src/sandbox.ts b/packages/agents-runtime/src/sandbox.ts index 9e1bc7ffe4..29b5839d2a 100644 --- a/packages/agents-runtime/src/sandbox.ts +++ b/packages/agents-runtime/src/sandbox.ts @@ -8,6 +8,8 @@ export type { RemoteProvider, RemoteSandboxOpts } from './sandbox/remote' export type { RemoteSandboxClient } from './sandbox/remote/types' export { isE2BAvailable } from './sandbox/remote/e2b' export { chooseDefaultSandbox } from './sandbox/default' +export { ensureSandboxMaterialized, lazySandbox } from './sandbox/lazy' +export type { LazySandboxOpts } from './sandbox/lazy' export { SandboxError } from './sandbox/types' export type { Sandbox, diff --git a/packages/agents-runtime/src/sandbox/docker.ts b/packages/agents-runtime/src/sandbox/docker.ts index f30bb1c53f..336f9dbc27 100644 --- a/packages/agents-runtime/src/sandbox/docker.ts +++ b/packages/agents-runtime/src/sandbox/docker.ts @@ -156,10 +156,35 @@ const SANDBOX_ENTITY_LABEL = `com.electric.sandbox.entity` * key) and only reclaims ephemeral leftovers. */ const SANDBOX_PERSISTENT_LABEL = `com.electric.sandbox.persistent` +/** + * PID of the process that created the container. The boot sweep probes it + * (same host as the daemon's clients) to tell a live sibling's container from + * a crash/quit leftover — the keepalive loop never exits on its own, so a dead + * owner's container stays RUNNING forever without this. + */ +const SANDBOX_OWNER_PID_LABEL = `com.electric.sandbox.owner-pid` +/** + * Compose-style grouping labels. Docker Desktop (and other GUI clients) group + * containers by compose project, so every sandbox shows up under one + * collapsible `electric-sandboxes` entry and can be stopped / deleted together + * — also via `docker compose -p electric-sandboxes down`. + */ +const COMPOSE_PROJECT_LABEL = `com.docker.compose.project` +const COMPOSE_SERVICE_LABEL = `com.docker.compose.service` +const COMPOSE_PROJECT_NAME = `electric-sandboxes` /** Common prefix for every container name this module assigns. */ const NAME_PREFIX = `electric-sbx` +/** + * In-container record of the process that most recently ADOPTED the container + * (reattached by key after its creator exited). Labels are immutable, so the + * creation-time owner pid goes stale on reattach; the boot sweep probes this + * marker before reclaiming a running container whose labelled owner is dead. + * Lives on the /tmp tmpfs, so a stop wipes it along with any stale adoption. + */ +const OWNER_MARKER_PATH = `/tmp/.electric-sbx-owner-pid` + /** Default warm window before an idle container is torn down (stop/remove). */ const DEFAULT_IDLE_GRACE_MS = 2 * 60 * 1000 @@ -209,6 +234,33 @@ function withKeyLock(key: string, fn: () => Promise): Promise { return run } +/** + * Bound on concurrent container CREATIONS (image resolve + create + start + + * init), so a burst of wakes materializing sandboxes at once — e.g. a backlog + * replay when a runner reconnects — queues against the daemon instead of + * stampeding it. Reattaches and execs are not limited; total creations are + * unbounded, only their concurrency. + */ +const MAX_CONCURRENT_CREATIONS = 4 +let creationSlots = MAX_CONCURRENT_CREATIONS +const creationQueue: Array<() => void> = [] +async function withCreationSlot(fn: () => Promise): Promise { + if (creationSlots > 0) { + creationSlots -= 1 + } else { + await new Promise((release) => creationQueue.push(release)) + } + try { + return await fn() + } finally { + // Hand the slot directly to the next waiter (no decrement on their side), + // so a new caller can't race in between release and re-acquire. + const next = creationQueue.shift() + if (next) next() + else creationSlots += 1 + } +} + /** Lowercase, DNS-safe slug from an arbitrary identity string (≤24 chars). */ function slugify(s: string): string { const slug = s @@ -421,6 +473,9 @@ export async function dockerSandbox( [SANDBOX_LABEL]: `1`, [SANDBOX_KEY_LABEL]: sandboxKey, [SANDBOX_PERSISTENT_LABEL]: persistent ? `true` : `false`, + [SANDBOX_OWNER_PID_LABEL]: String(process.pid), + [COMPOSE_PROJECT_LABEL]: COMPOSE_PROJECT_NAME, + [COMPOSE_SERVICE_LABEL]: opts.entityType ?? `sandbox`, ...(opts.entityType ? { [SANDBOX_ENTITY_TYPE_LABEL]: opts.entityType } : {}), @@ -429,39 +484,59 @@ export async function dockerSandbox( } // Create-and-start a fresh container. Image is pulled here only (skipped - // entirely when reattaching to an existing container). - const createStarted = async (): Promise => { - const image = resolveImage(opts) - await ensureImage(docker, image, opts) - const c = await docker.createContainer({ - // Spread (rather than a literal `name:`) so the `name` query param — - // which dockerode accepts but doesn't declare on its create-opts type — - // doesn't trip excess-property checking. - ...{ name: containerName }, - Image: image, - Cmd: [`sh`, `-c`, `while true; do sleep 3600; done`], - WorkingDir: containerCwd, - Env: Object.entries(baseEnv).map(([k, v]) => `${k}=${v}`), - Labels: labels, - ExposedPorts: exposedPorts, - HostConfig, + // entirely when reattaching to an existing container). Creation concurrency + // is bounded process-wide (see withCreationSlot). + const createStarted = (): Promise => + withCreationSlot(async (): Promise => { + const image = resolveImage(opts) + await ensureImage(docker, image, opts) + const c = await docker.createContainer({ + // Spread (rather than a literal `name:`) so the `name` query param — + // which dockerode accepts but doesn't declare on its create-opts type — + // doesn't trip excess-property checking. + ...{ name: containerName }, + Image: image, + Cmd: [`sh`, `-c`, `while true; do sleep 3600; done`], + WorkingDir: containerCwd, + Env: Object.entries(baseEnv).map(([k, v]) => `${k}=${v}`), + Labels: labels, + ExposedPorts: exposedPorts, + HostConfig, + }) + try { + await c.start() + } catch (err) { + await c.remove({ force: true, v: true }).catch(() => {}) + throw new SandboxError( + `runtime`, + `dockerSandbox: container start failed: ${ + err instanceof Error ? err.message : String(err) + }` + ) + } + // Tmpfs on `/work` is empty at start; ensure caller-supplied workingDir + // exists with sensible perms. If this post-start init fails — the exec + // call throws, or the mkdir can't run / exits non-zero (a null exit code + // means the exec never produced one, e.g. it couldn't spawn at all) — + // remove the already-started container before rethrowing: it isn't + // registered yet, so no dispose (and, while running, no sweep) would + // ever reclaim it. + try { + const init = await runOneOff(c, [`mkdir`, `-p`, containerCwd]) + if (init.exitCode !== 0) { + throw new SandboxError( + `runtime`, + `dockerSandbox: post-start init failed: \`mkdir -p ${containerCwd}\` exited ${init.exitCode ?? `without a code`}${ + init.stderr.length > 0 ? `: ${init.stderr.toString().trim()}` : `` + }` + ) + } + } catch (err) { + await c.remove({ force: true, v: true }).catch(() => {}) + throw err + } + return c }) - try { - await c.start() - } catch (err) { - await c.remove({ force: true, v: true }).catch(() => {}) - throw new SandboxError( - `runtime`, - `dockerSandbox: container start failed: ${ - err instanceof Error ? err.message : String(err) - }` - ) - } - // Tmpfs on `/work` is empty at start; ensure caller-supplied workingDir - // exists with sensible perms. - await runOneOff(c, [`mkdir`, `-p`, containerCwd]) - return c - } // Serialize reattach + registration against the debounced idle teardown // (which holds the same per-key lock), so a re-acquire can't race a teardown @@ -496,6 +571,25 @@ export async function dockerSandbox( }) } +/** + * Best-effort: record this process as the container's current owner in the + * in-container marker (see {@link OWNER_MARKER_PATH}). Called on the reattach + * paths — where the immutable creation-time owner-pid label may belong to an + * exited process — so the boot sweep can tell an adopted container from a + * crash leftover. + */ +async function writeOwnerMarker(c: DockerodeContainer): Promise { + try { + await runOneOff(c, [ + `sh`, + `-c`, + `echo ${process.pid} > ${OWNER_MARKER_PATH}`, + ]) + } catch { + /* best-effort */ + } +} + /** * Resolve a container by name: reattach to an existing one (starting it if a * persistent container had been stopped) or create it fresh. Handles the race @@ -520,6 +614,7 @@ async function reattachOrCreate( // and exec/fs round-trips work again. await existing.start().catch(() => {}) } + await writeOwnerMarker(existing) return existing } try { @@ -560,34 +655,72 @@ async function reattachExisting( ) } if (!running) await existing.start().catch(() => {}) + await writeOwnerMarker(existing) return existing } function isNameConflict(err: unknown): boolean { - const status = (err as { statusCode?: number; status?: number })?.statusCode - if (status === 409) return true - return /already in use|Conflict/i.test( + // Match the daemon's name-conflict message, not bare HTTP 409 — the daemon + // also answers 409 for unrelated conflicts that can surface from inside + // `createStarted` (e.g. exec on a container that just exited), and treating + // those as a lost create race would "reattach" to a container that is gone. + return /already in use/i.test( err instanceof Error ? err.message : String(err) ) } /** - * One-shot startup cleanup of *ephemeral* sandbox leftovers from a previous - * process (a crash or restart before disposes ran). Call once at runner boot. + * Same-host liveness probe for the pid recorded on a container at creation. + * `kill(pid, 0)` delivers no signal; EPERM means the pid exists but belongs to + * another user — still alive. A reused pid reads as alive (the orphan is then + * left for a later sweep), which errs on the safe side. + */ +function isPidAlive(pid: number): boolean { + try { + process.kill(pid, 0) + return true + } catch (err) { + return (err as NodeJS.ErrnoException).code === `EPERM` + } +} + +/** + * Read the adoption marker (see {@link OWNER_MARKER_PATH}) from a RUNNING + * container. `null` when absent or unreadable — the sweep then falls back to + * the creation-time owner-pid label alone. + */ +async function readOwnerMarker(c: DockerodeContainer): Promise { + try { + const r = await runOneOff(c, [`cat`, OWNER_MARKER_PATH]) + if (r.exitCode !== 0) return null + const pid = Number(r.stdout.toString().trim()) + return Number.isInteger(pid) && pid > 0 ? pid : null + } catch { + return null + } +} + +/** + * One-shot startup cleanup of sandbox leftovers from a previous process (a + * crash, force-quit, or shutdown before disposes ran). Call once at runner + * boot. * - * Two containers are deliberately left untouched: - * - RUNNING containers — they may belong to a live sibling runner (or a - * concurrent test run) sharing this Docker daemon; force-removing those - * would wipe another process's in-use sandbox. Reboot/crash leftovers are - * `Exited` once the daemon restarts, so the common case is still reclaimed; - * a still-running ephemeral orphan is left for a manual labelled prune - * rather than risk a live peer. - * - PERSISTENT containers — `persistent: true` exists precisely so a restarted - * process can reattach to the warm workspace by key, so they must survive a - * boot (only ephemeral leftovers are reclaimed here; a manual labelled prune - * reclaims truly-abandoned persistent ones). + * The keepalive loop never exits on its own, so a leftover is usually still + * RUNNING — but so is a live sibling runner's container on the same daemon. + * The owner-pid label disambiguates (the sweep runs on the same host as the + * daemon's clients): owner alive ⇒ leave it; owner dead ⇒ check the + * in-container adoption marker (a live process that reattached after the + * creator exited) and, if no adopter is alive either, reclaim — REMOVE an + * ephemeral orphan, STOP a persistent one (its writable layer must survive so + * the next wake can reattach by key). A running container without the label + * (created before it existed) can't be probed and is left for a manual + * labelled prune. * - * Returns the names removed. + * Non-running containers: ephemeral leftovers are removed; persistent ones are + * preserved — `persistent: true` exists precisely so a restarted process can + * reattach to the warm workspace by key. + * + * Returns the names reclaimed (removed or stopped). */ export async function sweepOrphanedDockerSandboxes(opts?: { dockerSocket?: string @@ -612,21 +745,116 @@ export async function sweepOrphanedDockerSandboxes(opts?: { return [] } - const removed: Array = [] + const reclaimed: Array = [] for (const c of listed) { const name = c.Names?.[0]?.replace(/^\//, ``) ?? c.Id - // Never touch a running container (possibly a live peer) or a persistent - // one (meant to be reattached by key). See the doc comment above. - if (c.State === `running`) continue - if (c.Labels?.[SANDBOX_PERSISTENT_LABEL] === `true`) continue + // Held by this very process right now — its lifecycle is already managed. + if (sandboxContainers.has(name)) continue + const persistent = c.Labels?.[SANDBOX_PERSISTENT_LABEL] === `true` try { + if (c.State === `running`) { + const ownerPid = Number(c.Labels?.[SANDBOX_OWNER_PID_LABEL]) + // No (or unparsable) owner label, or the owner is alive ⇒ possibly a + // live peer's in-use sandbox — never touch it. + if (!Number.isInteger(ownerPid) || isPidAlive(ownerPid)) continue + // The creator is gone, but a live sibling process may have since + // ADOPTED the container (reattached by key) — labels are immutable, so + // adoption is recorded in an in-container marker. Probe it before + // reclaiming. + const adopter = await readOwnerMarker(docker.getContainer(c.Id)) + if (adopter !== null && isPidAlive(adopter)) continue + if (persistent) { + // `t: 0` → straight to SIGKILL (PID 1 ignores SIGTERM); the writable + // layer survives for a later reattach by key. + await docker.getContainer(c.Id).stop({ t: 0 }) + } else { + await docker.getContainer(c.Id).remove({ force: true, v: true }) + } + reclaimed.push(name) + continue + } + if (persistent) continue await docker.getContainer(c.Id).remove({ force: true, v: true }) - removed.push(name) + reclaimed.push(name) } catch { /* already gone */ } } - return removed + return reclaimed +} + +/** + * Wipe the container for a resolved sandbox key WITHOUT creating one — the + * reclaim path for a lazy sandbox that was never materialized: a terminal + * entity's persistent workspace from an EARLIER wake must not survive just + * because the final wake never used its sandbox. If sibling leases currently + * hold the container, it is only MARKED reclaimed — the last lease draining + * wipes it (matching an owner's `dispose({ reclaim: true })`). A container + * that exists with no registry entry (left from a previous process) is removed + * directly; absent containers are a no-op. + */ +export async function reclaimDockerSandboxByKey( + sandboxKey: string, + opts?: { dockerSocket?: string } +): Promise { + const name = containerNameForKey(sandboxKey) + const Docker = await loadDockerode() + const docker: Dockerode = opts?.dockerSocket + ? new Docker({ socketPath: opts.dockerSocket }) + : new Docker() + await withKeyLock(name, async () => { + const entry = sandboxContainers.get(name) + if (entry && entry.refs > 0) { + entry.reclaim = true + return + } + if (entry?.idleTimer) clearTimeout(entry.idleTimer) + sandboxContainers.delete(name) + try { + await docker.getContainer(name).remove({ force: true, v: true }) + } catch { + /* nothing to reclaim */ + } + }) +} + +/** + * Immediate, lock-serialized teardown of every IDLE container this process + * tracks: pending debounced timers are cancelled and each entry's teardown + * action runs NOW — REMOVE for ephemeral/reclaimed containers, STOP for + * persistent ones (the workspace survives for the next process to reattach by + * key). + * + * Call on runtime shutdown. The debounced idle timers are unref'd and die with + * the process; without this flush every recently-used container is left + * RUNNING for the boot sweep of some future process to find. + * + * Entries with live leases (refs > 0) are left alone: the registry is shared + * process-wide, so they may belong to a sibling runtime that is not shutting + * down (or to a wake whose drain timed out). Their own dispose tears them + * down; if the process exits first, the boot sweep reclaims them by dead + * owner pid. + */ +export async function shutdownAllDockerSandboxes(): Promise { + await Promise.all( + [...sandboxContainers.keys()].map((name) => + withKeyLock(name, async () => { + const entry = sandboxContainers.get(name) + if (!entry || entry.refs > 0) return + if (entry.idleTimer) clearTimeout(entry.idleTimer) + try { + if (sandboxWipesOnDispose(entry.reclaim ?? false, entry.persistent)) { + await entry.container.remove({ force: true, v: true }) + } else { + await entry.container.stop({ t: 0 }) + } + } catch { + /* already stopped / removed / gone */ + } + sandboxContainers.delete(name) + }) + ) + ) } /** Test-only: drop the in-process container registry bookkeeping. */ diff --git a/packages/agents-runtime/src/sandbox/lazy.ts b/packages/agents-runtime/src/sandbox/lazy.ts new file mode 100644 index 0000000000..06b6d7c561 --- /dev/null +++ b/packages/agents-runtime/src/sandbox/lazy.ts @@ -0,0 +1,195 @@ +/** + * Lazy sandbox wrapper — defers the (potentially expensive) provider factory + * until the sandbox is actually USED. A wake that never runs a tool — a cron + * tick deciding there is nothing to do, child-status bookkeeping, a drained + * inbox — then never pays for, or leaks, a provider-side sandbox (e.g. a + * Docker container). Profile authors wrap their factory: + * + * factory: async (params) => + * lazySandbox({ + * workingDirectory: `/work`, + * factory: () => dockerSandbox({ ... }), + * reclaim: params.owner + * ? () => reclaimDockerSandboxByKey(params.sandboxKey) + * : undefined, + * }) + * + * Materialization is single-flight and retried on failure. Disposing a + * never-used wrapper is free — except `dispose({ reclaim: true })`, which runs + * the provider-supplied `reclaim` callback so a terminal entity's persistent + * workspace from an EARLIER wake is still wiped without creating a fresh + * sandbox just to remove it. + */ + +import { SandboxError } from './types' +import type { + DirEntry, + FileStat, + Sandbox, + SandboxExecOpts, + SandboxExecResult, +} from './types' + +/** + * Cross-module-instance brand for {@link ensureSandboxMaterialized}: a global + * symbol survives duplicate copies of this module in a bundle, where an + * `instanceof` check would not. + */ +const MATERIALIZE = Symbol.for(`electric.sandbox.lazy.materialize`) + +export interface LazySandboxOpts { + /** Reported as `Sandbox.name` until the inner sandbox materializes. */ + name?: string + /** + * The inner sandbox's primary writable root. MUST match what `factory` + * produces: tools resolve relative paths against it synchronously, before + * the first (materializing) sandbox call could run. + */ + workingDirectory: string + /** Builds the inner sandbox on first use. */ + factory: () => Promise + /** + * Wipe the provider-side sandbox WITHOUT materializing. Invoked when this + * lease is disposed with `reclaim: true` before the sandbox was ever used — + * an earlier wake's persistent workspace may still exist under the same key + * and must not outlive its terminal entity. Only pass this for OWNER leases: + * the wrapper itself cannot tell an owner from an attacher. + */ + reclaim?: () => Promise +} + +class LazySandbox implements Sandbox { + private readonly opts: LazySandboxOpts + private inner: Sandbox | null = null + private materializing: Promise | null = null + private disposed = false + + constructor(opts: LazySandboxOpts) { + this.opts = opts + } + + get name(): string { + return this.inner?.name ?? this.opts.name ?? `lazy` + } + + get workingDirectory(): string { + return this.opts.workingDirectory + } + + /** Single-flight; a failed factory clears so the next use retries. */ + private materialize(): Promise { + if (this.inner) return Promise.resolve(this.inner) + this.materializing ??= this.opts.factory().then( + (sandbox) => { + this.inner = sandbox + this.materializing = null + return sandbox + }, + (err: unknown) => { + this.materializing = null + throw err + } + ) + return this.materializing + } + + /** @internal see {@link ensureSandboxMaterialized} */ + [MATERIALIZE](): Promise { + this.assertLive() + return this.materialize() + } + + private assertLive(): void { + if (this.disposed) { + throw new SandboxError( + `runtime`, + `lazySandbox: this sandbox lease has been disposed.` + ) + } + } + + async exec(opts: SandboxExecOpts): Promise { + this.assertLive() + return (await this.materialize()).exec(opts) + } + + async readFile(path: string): Promise { + this.assertLive() + return (await this.materialize()).readFile(path) + } + + async writeFile(path: string, content: Buffer | string): Promise { + this.assertLive() + return (await this.materialize()).writeFile(path, content) + } + + async mkdir(path: string, opts?: { recursive?: boolean }): Promise { + this.assertLive() + return (await this.materialize()).mkdir(path, opts) + } + + async readdir(path: string): Promise> { + this.assertLive() + return (await this.materialize()).readdir(path) + } + + async exists(path: string): Promise { + this.assertLive() + return (await this.materialize()).exists(path) + } + + async remove(path: string, opts?: { recursive?: boolean }): Promise { + this.assertLive() + return (await this.materialize()).remove(path, opts) + } + + async stat(path: string): Promise { + this.assertLive() + return (await this.materialize()).stat(path) + } + + async fetch(input: string | URL, init?: RequestInit): Promise { + this.assertLive() + return (await this.materialize()).fetch(input, init) + } + + async dispose(opts?: { reclaim?: boolean }): Promise { + if (this.disposed) return + this.disposed = true + if (this.inner || this.materializing) { + let inner: Sandbox | null = null + try { + inner = await this.materialize() + } catch { + // The in-flight factory failed — nothing live to dispose. Fall + // through: a requested reclaim still wipes the provider-side state. + } + if (inner) { + // The inner dispose owns the reclaim semantics from here. + await inner.dispose(opts) + return + } + } + if (opts?.reclaim && this.opts.reclaim) await this.opts.reclaim() + } +} + +/** Wrap a provider factory so the sandbox materializes on first use. */ +export function lazySandbox(opts: LazySandboxOpts): Sandbox { + return new LazySandbox(opts) +} + +/** + * Force a lazy sandbox to materialize (no-op for any other sandbox). Used by + * the spawn-`inherit` path: a child ATTACHES to this wake's sandbox by key and + * never creates, so the owner's container/workspace must actually exist before + * the child can wake — even if the owner itself never ran a tool. + */ +export async function ensureSandboxMaterialized( + sandbox: Sandbox +): Promise { + const materialize = ( + sandbox as Sandbox & { [MATERIALIZE]?: () => Promise } + )[MATERIALIZE] + if (typeof materialize === `function`) await materialize.call(sandbox) +} diff --git a/packages/agents-runtime/test/sandbox-docker.test.ts b/packages/agents-runtime/test/sandbox-docker.test.ts index df69688a20..830811e42e 100644 --- a/packages/agents-runtime/test/sandbox-docker.test.ts +++ b/packages/agents-runtime/test/sandbox-docker.test.ts @@ -2,11 +2,15 @@ import { afterAll, afterEach, beforeAll, describe, expect, it } from 'vitest' import { mkdtemp, rm, symlink, writeFile } from 'node:fs/promises' import { tmpdir } from 'node:os' import { join } from 'node:path' +import { spawnSync } from 'node:child_process' import { __resetPersistentRegistryForTests, dockerSandbox, + reclaimDockerSandboxByKey, + shutdownAllDockerSandboxes, sweepOrphanedDockerSandboxes, } from '../src/sandbox/docker' +import { lazySandbox } from '../src/sandbox/lazy' import { loadDockerode } from '../src/sandbox/docker/loader' import { SandboxError } from '../src/sandbox/types' import { dockerAvailable, TEST_IMAGE, TEST_LABEL } from './helpers/docker-probe' @@ -469,8 +473,14 @@ d(`dockerSandbox keyed lifecycle`, () => { filters: { label: [`com.electric.sandbox.key=${sandboxKey}`] }, }) if (list.length === 0) return `absent` - const info = await docker.getContainer(list[0].Id).inspect() - return info.State.Running ? `running` : `stopped` + try { + const info = await docker.getContainer(list[0].Id).inspect() + return info.State.Running ? `running` : `stopped` + } catch (err) { + // Removed between list and inspect (e.g. a reclaim in flight). + if ((err as { statusCode?: number }).statusCode === 404) return `absent` + throw err + } } const waitForKeyState = async ( @@ -530,6 +540,12 @@ d(`dockerSandbox keyed lifecycle`, () => { // Per-entity identity lives in labels, since the shared name can't. expect(labels[`com.electric.sandbox.entity-type`]).toBe(`horton`) expect(labels[`com.electric.sandbox.entity`]).toBe(`/horton/abc123/main`) + // Owning process, for the boot sweep's same-host liveness probe. + expect(labels[`com.electric.sandbox.owner-pid`]).toBe(String(process.pid)) + // Compose-style grouping: docker GUIs collapse all sandboxes under one + // project, and `docker compose -p electric-sandboxes down` clears them. + expect(labels[`com.docker.compose.project`]).toBe(`electric-sandboxes`) + expect(labels[`com.docker.compose.service`]).toBe(`horton`) } finally { await sandbox.dispose() } @@ -743,9 +759,10 @@ d(`dockerSandbox keyed lifecycle`, () => { expect(await keyState(key)).toBe(`absent`) }, 90_000) - it(`sweepOrphanedDockerSandboxes leaves a *running* container untouched`, async () => { - // A running container may belong to a live sibling process sharing this - // daemon — force-removing it would wipe a peer's in-use sandbox. + it(`sweepOrphanedDockerSandboxes leaves a *running* container with a live owner untouched`, async () => { + // A running container whose owner-pid process (this test process) is alive + // belongs to a live sibling — force-removing it would wipe an in-use + // sandbox. const key = `${KEY}-orphan-running` await make(key, 60_000, /* persistent */ false) __resetPersistentRegistryForTests() @@ -767,4 +784,189 @@ d(`dockerSandbox keyed lifecycle`, () => { await sweepOrphanedDockerSandboxes() expect(await keyState(key)).toBe(`stopped`) }, 90_000) + + // A pid that is guaranteed dead: spawn a no-op child and reuse its pid after + // it has exited (synchronously, so it is reaped before we return). + const deadPid = (): number => { + const child = spawnSync(process.execPath, [`-e`, ``]) + if (child.pid === undefined) throw new Error(`spawnSync returned no pid`) + return child.pid + } + + // Create a container whose owner-pid label points at a dead process — the + // shape a crashed/quit runner leaves behind (its keepalive loop never exits + // on its own, so the container is still RUNNING). + const makeCrashOrphan = (sandboxKey: string, persistent: boolean) => + dockerSandbox({ + image: TEST_IMAGE, + labels: { + [TEST_LABEL]: `1`, + 'com.electric.sandbox.owner-pid': String(deadPid()), + }, + sandboxKey, + persistent, + sharedIdleGraceMs: 600_000, + }) + + it(`sweepOrphanedDockerSandboxes REMOVES a running *ephemeral* orphan whose owner died`, async () => { + const key = `${KEY}-crash-eph` + await makeCrashOrphan(key, /* persistent */ false) + __resetPersistentRegistryForTests() // the owning process is gone + expect(await keyState(key)).toBe(`running`) + + const reclaimed = await sweepOrphanedDockerSandboxes() + expect(reclaimed.length).toBeGreaterThanOrEqual(1) + expect(await keyState(key)).toBe(`absent`) + }, 90_000) + + it(`sweepOrphanedDockerSandboxes STOPS a running *persistent* orphan whose owner died`, async () => { + // Persistent workspaces survive (the next wake reattaches by key), but a + // dead owner can't stop the keepalive — the sweep does it instead. + const key = `${KEY}-crash-persist` + await makeCrashOrphan(key, /* persistent */ true) + __resetPersistentRegistryForTests() + expect(await keyState(key)).toBe(`running`) + + await sweepOrphanedDockerSandboxes() + expect(await keyState(key)).toBe(`stopped`) + }, 90_000) + + it(`sweepOrphanedDockerSandboxes spares a running orphan ADOPTED by a live process`, async () => { + // The owner-pid label is immutable, so it goes stale when a live process + // reattaches a dead creator's container — adoption is recorded in an + // in-container marker that the sweep probes before reclaiming. + const key = `${KEY}-adopted` + await makeCrashOrphan(key, /* persistent */ true) + __resetPersistentRegistryForTests() // the creator is gone… + await make(key, 600_000, /* persistent */ true) // …a live process adopts + __resetPersistentRegistryForTests() // bypass the in-registry skip + expect(await keyState(key)).toBe(`running`) + + await sweepOrphanedDockerSandboxes() + expect(await keyState(key)).toBe(`running`) + }, 90_000) + + it(`sweepOrphanedDockerSandboxes leaves a running container with no owner-pid label untouched`, async () => { + // Containers created before the owner-pid label existed can't be liveness- + // probed — fail safe and leave them for a manual labelled prune. + const Docker = await loadDockerode() + const docker = new Docker() + const c = await docker.createContainer({ + ...{ name: `electric-sbx-legacy-nolabel-test` }, + Image: TEST_IMAGE, + Cmd: [`sh`, `-c`, `while true; do sleep 3600; done`], + Labels: { + 'com.electric.sandbox': `1`, + 'com.electric.sandbox.persistent': `false`, + [TEST_LABEL]: `1`, + }, + HostConfig: {}, + }) + await c.start() + await sweepOrphanedDockerSandboxes() + const info = await c.inspect() + expect(info.State.Running).toBe(true) + }, 90_000) + + it(`removes the container when post-start init fails (no unregistered zombie)`, async () => { + // pidsLimit:1 — PID 1's keepalive holds the only slot, so the post-start + // `mkdir -p` exec cannot run and creation fails AFTER the container + // started. The failed create must remove it, not leave it behind running + // and unregistered (invisible to dispose and, while running, to the sweep). + const key = `${KEY}-init-fail` + await expect( + dockerSandbox({ + image: TEST_IMAGE, + labels: { [TEST_LABEL]: `1` }, + sandboxKey: key, + resources: { pidsLimit: 1 }, + }) + ).rejects.toThrow() + expect(await keyState(key)).toBe(`absent`) + }, 90_000) + + it(`shutdownAllDockerSandboxes flushes pending debounced teardowns immediately`, async () => { + // The debounced idle timers are unref'd and die with the process — on + // shutdown the pending action must run NOW: STOP persistent, REMOVE + // ephemeral. + const ephKey = `${KEY}-shutdown-eph` + const perKey = `${KEY}-shutdown-per` + const eph = await make(ephKey, 600_000, /* persistent */ false) + const per = await make(perKey, 600_000, /* persistent */ true) + await eph.dispose() + await per.dispose() + // Teardowns are debounced 10 minutes out — both still running. + expect(await keyState(ephKey)).toBe(`running`) + expect(await keyState(perKey)).toBe(`running`) + + await shutdownAllDockerSandboxes() + expect(await keyState(ephKey)).toBe(`absent`) + expect(await keyState(perKey)).toBe(`stopped`) + }, 90_000) + + it(`lazy docker sandbox creates no container until first use`, async () => { + // The shape bootstrap wires: profile.factory returns a lazy wrapper, so a + // trivial wake (no tool use) never starts a container. + const key = `${KEY}-lazy` + const sb = lazySandbox({ + name: `docker`, + workingDirectory: `/work`, + factory: () => make(key, 60_000), + }) + expect(await keyState(key)).toBe(`absent`) + const r = await sb.exec({ command: `echo materialized` }) + expect(r.stdout.toString().trim()).toBe(`materialized`) + expect(await keyState(key)).toBe(`running`) + await sb.dispose() + }, 90_000) + + it(`lazy docker sandbox disposed without use leaves nothing behind`, async () => { + const key = `${KEY}-lazy-unused` + const sb = lazySandbox({ + name: `docker`, + workingDirectory: `/work`, + factory: () => make(key, 60_000), + }) + await sb.dispose() + expect(await keyState(key)).toBe(`absent`) + }, 60_000) + + it(`reclaimDockerSandboxByKey removes an earlier wake's persistent container without creating one`, async () => { + // The lazy wrapper's reclaim path: a terminal wake that never used its + // sandbox must still wipe the persistent workspace earlier wakes created. + const key = `${KEY}-reclaim-by-key` + const earlier = await make(key, 600_000, /* persistent */ true) + await earlier.dispose() // idle: container stays (debounced stop pending) + expect(await keyState(key)).toBe(`running`) + + await reclaimDockerSandboxByKey(key) + expect(await keyState(key)).toBe(`absent`) + }, 90_000) + + it(`reclaimDockerSandboxByKey is a no-op when nothing exists under the key`, async () => { + await reclaimDockerSandboxByKey(`${KEY}-reclaim-absent`) + expect(await keyState(`${KEY}-reclaim-absent`)).toBe(`absent`) + }, 60_000) + + it(`reclaimDockerSandboxByKey defers to live leases — wiped once the last drains`, async () => { + const key = `${KEY}-reclaim-live` + const lease = await make(key, 600_000, /* persistent */ true) + await reclaimDockerSandboxByKey(key) + // The sibling lease still owns it — not torn down underneath it. + expect(await keyState(key)).toBe(`running`) + await lease.dispose() + // Marked reclaimed: the last lease draining wipes it immediately. + await waitForKeyState(key, `absent`, 5_000) + }, 90_000) + + it(`shutdownAllDockerSandboxes leaves containers with live leases alone`, async () => { + // The registry is process-wide: a still-open lease may belong to a sibling + // runtime that is NOT shutting down. Its own dispose tears it down — and if + // the process exits first, the boot sweep reclaims it by dead owner pid. + const key = `${KEY}-shutdown-live` + const live = await make(key, 600_000, /* persistent */ false) + await shutdownAllDockerSandboxes() + expect(await keyState(key)).toBe(`running`) + await live.dispose() + }, 90_000) }) diff --git a/packages/agents-runtime/test/sandbox-lazy.test.ts b/packages/agents-runtime/test/sandbox-lazy.test.ts new file mode 100644 index 0000000000..b529ba4744 --- /dev/null +++ b/packages/agents-runtime/test/sandbox-lazy.test.ts @@ -0,0 +1,197 @@ +import { describe, expect, it } from 'vitest' +import { ensureSandboxMaterialized, lazySandbox } from '../src/sandbox/lazy' +import type { Sandbox } from '../src/sandbox/types' + +/** + * lazySandbox defers the provider factory until first use, so trivial wakes + * (cron ticks, bookkeeping) never pay for — or leak — a provider-side sandbox + * (e.g. a Docker container). + */ + +interface FakeCalls { + exec: number + dispose: Array<{ reclaim?: boolean } | undefined> +} + +const makeFake = (calls: FakeCalls): Sandbox => ({ + name: `fake:inner`, + workingDirectory: `/work`, + exec: async () => { + calls.exec += 1 + return { + exitCode: 0, + signal: null, + stdout: Buffer.from(`ok`), + stderr: Buffer.alloc(0), + timedOut: false, + aborted: false, + outputTruncated: false, + } + }, + readFile: async () => Buffer.from(`content`), + writeFile: async () => {}, + mkdir: async () => {}, + readdir: async () => [], + exists: async () => true, + remove: async () => {}, + stat: async () => ({ type: `file`, size: 0, mtimeMs: 0 }), + fetch: async () => new Response(`ok`), + dispose: async (opts) => { + calls.dispose.push(opts) + }, +}) + +const harness = () => { + const calls: FakeCalls = { exec: 0, dispose: [] } + let factoryCalls = 0 + const reclaims: Array = [] + const sandbox = lazySandbox({ + name: `fake`, + workingDirectory: `/work`, + factory: async () => { + factoryCalls += 1 + return makeFake(calls) + }, + reclaim: async () => { + reclaims.push(true) + }, + }) + return { + sandbox, + calls, + reclaims, + factoryCalls: () => factoryCalls, + } +} + +describe(`lazySandbox`, () => { + it(`does not invoke the factory on construction or property access`, () => { + const h = harness() + expect(h.sandbox.name).toBe(`fake`) + expect(h.sandbox.workingDirectory).toBe(`/work`) + expect(h.factoryCalls()).toBe(0) + }) + + it(`materializes on first use and reuses the inner sandbox after`, async () => { + const h = harness() + await h.sandbox.exec({ command: `echo hi` }) + await h.sandbox.exec({ command: `echo again` }) + await h.sandbox.readFile(`/work/x`) + expect(h.factoryCalls()).toBe(1) + expect(h.calls.exec).toBe(2) + // After materialization the inner provider's name shows through. + expect(h.sandbox.name).toBe(`fake:inner`) + }) + + it(`single-flights concurrent first calls`, async () => { + let resolveFactory!: (sb: Sandbox) => void + const calls: FakeCalls = { exec: 0, dispose: [] } + let factoryCalls = 0 + const sandbox = lazySandbox({ + workingDirectory: `/work`, + factory: () => { + factoryCalls += 1 + return new Promise((r) => { + resolveFactory = r + }) + }, + }) + const a = sandbox.exec({ command: `a` }) + const b = sandbox.exec({ command: `b` }) + resolveFactory(makeFake(calls)) + await Promise.all([a, b]) + expect(factoryCalls).toBe(1) + expect(calls.exec).toBe(2) + }) + + it(`a failed factory rejects the call and is retried on the next use`, async () => { + let attempts = 0 + const calls: FakeCalls = { exec: 0, dispose: [] } + const sandbox = lazySandbox({ + workingDirectory: `/work`, + factory: async () => { + attempts += 1 + if (attempts === 1) throw new Error(`daemon hiccup`) + return makeFake(calls) + }, + }) + await expect(sandbox.exec({ command: `x` })).rejects.toThrow( + `daemon hiccup` + ) + await sandbox.exec({ command: `x` }) + expect(attempts).toBe(2) + expect(calls.exec).toBe(1) + }) + + it(`dispose without use never invokes the factory`, async () => { + const h = harness() + await h.sandbox.dispose() + expect(h.factoryCalls()).toBe(0) + expect(h.calls.dispose).toEqual([]) + expect(h.reclaims).toEqual([]) + }) + + it(`dispose({reclaim}) without use runs the reclaim callback instead`, async () => { + // A terminal entity's persistent workspace from an EARLIER wake must not + // survive just because the final wake never touched the sandbox. + const h = harness() + await h.sandbox.dispose({ reclaim: true }) + expect(h.factoryCalls()).toBe(0) + expect(h.reclaims).toEqual([true]) + }) + + it(`dispose({reclaim}) without use is a no-op when no reclaim callback exists`, async () => { + let factoryCalls = 0 + const calls: FakeCalls = { exec: 0, dispose: [] } + const sandbox = lazySandbox({ + workingDirectory: `/work`, + factory: async () => { + factoryCalls += 1 + return makeFake(calls) + }, + }) + await sandbox.dispose({ reclaim: true }) + expect(factoryCalls).toBe(0) + }) + + it(`dispose after use forwards to the inner sandbox (reclaim included)`, async () => { + const h = harness() + await h.sandbox.exec({ command: `x` }) + await h.sandbox.dispose({ reclaim: true }) + expect(h.calls.dispose).toEqual([{ reclaim: true }]) + // The inner dispose owns reclaim semantics — the callback must not ALSO run. + expect(h.reclaims).toEqual([]) + }) + + it(`operations after dispose are rejected with a runtime SandboxError`, async () => { + const h = harness() + await h.sandbox.dispose() + await expect(h.sandbox.exec({ command: `x` })).rejects.toMatchObject({ + kind: `runtime`, + }) + expect(h.factoryCalls()).toBe(0) + }) + + it(`repeated dispose is a no-op`, async () => { + const h = harness() + await h.sandbox.exec({ command: `x` }) + await h.sandbox.dispose() + await h.sandbox.dispose() + expect(h.calls.dispose).toHaveLength(1) + }) +}) + +describe(`ensureSandboxMaterialized`, () => { + it(`materializes a lazy sandbox without running any operation`, async () => { + const h = harness() + await ensureSandboxMaterialized(h.sandbox) + expect(h.factoryCalls()).toBe(1) + expect(h.calls.exec).toBe(0) + }) + + it(`is a no-op for a non-lazy sandbox`, async () => { + const calls: FakeCalls = { exec: 0, dispose: [] } + await ensureSandboxMaterialized(makeFake(calls)) + expect(calls.exec).toBe(0) + }) +}) diff --git a/packages/agents/src/bootstrap.ts b/packages/agents/src/bootstrap.ts index 597597e66b..c1202ca86c 100644 --- a/packages/agents/src/bootstrap.ts +++ b/packages/agents/src/bootstrap.ts @@ -12,6 +12,7 @@ import { createEventSourceTools } from '@electric-ax/agents-runtime/tools' import { chooseDefaultSandbox, isE2BAvailable, + lazySandbox, remoteSandbox, type SandboxProfile, } from '@electric-ax/agents-runtime/sandbox' @@ -40,6 +41,13 @@ export interface AgentHandlerResult { registry: EntityRegistry typeNames: Array skillsRegistry: SkillsRegistry | null + /** + * Immediately tears down the idle sandboxes this process created (set when + * a provider with host-local state — docker — is registered). MUST be called + * on shutdown, after wakes drain: the providers' debounced idle teardowns + * die with the process, which would leave containers running. + */ + shutdownSandboxes: (() => Promise) | null } export type BuiltinElectricToolsFactory = NonNullable< @@ -154,7 +162,8 @@ export async function createBuiltinAgentHandler( registerWorker(registry, { workingDirectory: cwd, streamFn, modelCatalog }) typeNames.push(`worker`) - const sandboxProfiles = await buildBuiltinSandboxProfiles(cwd) + const { profiles: sandboxProfiles, shutdownSandboxes } = + await buildBuiltinSandboxProfiles(cwd) const runtime = createRuntimeHandler({ baseUrl: agentServerUrl, @@ -176,6 +185,7 @@ export async function createBuiltinAgentHandler( registry, typeNames, skillsRegistry, + shutdownSandboxes, } } @@ -211,7 +221,7 @@ export const registerAgentTypes = registerBuiltinAgentTypes * Guard so repeated `buildBuiltinSandboxProfiles` calls in one process don't * re-run the boot sweep. */ -let dockerSweptOnBoot = false +let dockerBootSweep: Promise | null = null type SweepOrphanedDockerSandboxes = // eslint-disable-next-line quotes -- type-position import() requires a string literal @@ -219,17 +229,15 @@ type SweepOrphanedDockerSandboxes = function sweepOrphanedDockerSandboxesOnce( sweep: SweepOrphanedDockerSandboxes -): void { - if (dockerSweptOnBoot) return - dockerSweptOnBoot = true - // One-shot, at boot: reclaim any sandbox containers a previous (crashed) - // process left behind. Fire-and-forget — nothing is acquired yet, and within - // a live process shared containers are stopped (not removed) when idle. - sweep() - .then((removed) => { - if (removed.length > 0) { +): Promise { + // One-shot, at boot: reclaim any sandbox containers a previous (crashed or + // quit) process left behind. Awaited by the caller — the sweep can stop + // running orphans, so it must finish before any wake reattaches by key. + dockerBootSweep ??= sweep() + .then((reclaimed) => { + if (reclaimed.length > 0) { serverLog.info( - `[builtin-agents] docker sandbox boot sweep removed ${removed.length} leftover container(s)` + `[builtin-agents] docker sandbox boot sweep reclaimed ${reclaimed.length} leftover container(s)` ) } }) @@ -238,16 +246,23 @@ function sweepOrphanedDockerSandboxesOnce( `[builtin-agents] docker sandbox boot sweep error: ${err instanceof Error ? err.message : String(err)}` ) ) + return dockerBootSweep } /** * Built-in sandbox profiles. `local` is always available. `docker` is * gated on Docker being reachable so a user without Docker installed * sees only what works — the UI never offers a non-functional choice. + * + * Also returns `shutdownSandboxes` when a host-local provider registered: an + * immediate teardown of this process's live containers that the embedding + * server must run on shutdown (the providers' debounced idle teardowns die + * with the process). */ -async function buildBuiltinSandboxProfiles( - workingDirectory: string -): Promise> { +async function buildBuiltinSandboxProfiles(workingDirectory: string): Promise<{ + profiles: Array + shutdownSandboxes: (() => Promise) | null +}> { const profiles: Array = [ { name: `local`, @@ -257,25 +272,32 @@ async function buildBuiltinSandboxProfiles( chooseDefaultSandbox(resolveCwd(args, workingDirectory)), }, ] + let shutdownSandboxes: (() => Promise) | null = null try { const { isDockerAvailable } = await import( `@electric-ax/agents-runtime/sandbox/docker` ) if (await isDockerAvailable()) { - const { dockerSandbox, sweepOrphanedDockerSandboxes } = await import( - `@electric-ax/agents-runtime/sandbox/docker` - ) - // Reclaim containers a previous crashed process may have left running. - // No periodic reaper: within a live process, shared containers stop - // themselves a short while after their last lease disposes (debounced - // idle-stop), and ephemeral ones are killed on dispose. - sweepOrphanedDockerSandboxesOnce(sweepOrphanedDockerSandboxes) + const { + dockerSandbox, + reclaimDockerSandboxByKey, + shutdownAllDockerSandboxes, + sweepOrphanedDockerSandboxes, + } = await import(`@electric-ax/agents-runtime/sandbox/docker`) + // Reclaim containers a previous process (crash, force-quit, or a + // shutdown that raced the debounced teardowns) left behind. No periodic + // reaper: within a live process, shared containers stop themselves a + // short while after their last lease disposes (debounced idle-stop), + // ephemeral ones are killed on dispose, and the rest are flushed by + // `shutdownSandboxes` on clean shutdown. + await sweepOrphanedDockerSandboxesOnce(sweepOrphanedDockerSandboxes) + shutdownSandboxes = shutdownAllDockerSandboxes profiles.push({ name: `docker`, label: `Docker`, description: `Runs in a hardened Docker container: dropped capabilities, no privilege escalation, and CPU/memory/process limits. The chosen working directory is mounted read-write and, by default, network egress is unrestricted (allow-all).`, - factory: ({ + factory: async ({ args, sandboxKey, persistent, @@ -284,25 +306,41 @@ async function buildBuiltinSandboxProfiles( entityUrl, }) => { const cwd = readWorkingDirectoryArg(args) - return dockerSandbox({ - // Default to open egress for local development. Network policy is a - // capability of this profile, not a separate profile: like the - // working directory above, it can be made a per-spawn arg later so a - // single `docker` profile spans permissive → fully-isolated rather - // than splitting into `docker-permissive` / `docker-isolated`. - initialNetworkPolicy: { mode: `allow-all` }, - extraMounts: cwd - ? [{ hostPath: cwd, containerPath: `/work`, readOnly: false }] + // Lazy: the container is only created/started when the wake actually + // USES the sandbox, so trivial wakes (cron ticks, bookkeeping) don't + // spin one up. `/work` is the container cwd dockerSandbox defaults to. + return lazySandbox({ + name: `docker`, + workingDirectory: `/work`, + factory: () => + dockerSandbox({ + // Default to open egress for local development. Network policy + // is a capability of this profile, not a separate profile: like + // the working directory above, it can be made a per-spawn arg + // later so a single `docker` profile spans permissive → + // fully-isolated rather than splitting into `docker-permissive` + // / `docker-isolated`. + initialNetworkPolicy: { mode: `allow-all` }, + extraMounts: cwd + ? [{ hostPath: cwd, containerPath: `/work`, readOnly: false }] + : undefined, + // The container is always named-by-key and reattachable; + // `persistent` chooses idle teardown (stop vs remove) and + // `owner` gates creation (an attacher reattaches only). All + // resolved upstream from config. + sandboxKey, + persistent, + owner, + // Observability: tag the container/labels with who spawned it. + entityType, + entityUrl, + }), + // A terminal entity whose final wake never used the sandbox must + // still wipe the persistent workspace earlier wakes created. Owner + // leases only — an attacher can never reclaim the owner's sandbox. + reclaim: owner + ? () => reclaimDockerSandboxByKey(sandboxKey) : undefined, - // The container is always named-by-key and reattachable; `persistent` - // chooses idle teardown (stop vs remove) and `owner` gates creation - // (an attacher reattaches only). All resolved upstream from config. - sandboxKey, - persistent, - owner, - // Observability: tag the container/labels with who spawned it. - entityType, - entityUrl, }) }, }) @@ -354,7 +392,7 @@ async function buildBuiltinSandboxProfiles( console.log( `[builtin-agents] sandbox profiles advertised: ${profiles.map((p) => p.name).join(`, `)}` ) - return profiles + return { profiles, shutdownSandboxes } } function readWorkingDirectoryArg( diff --git a/packages/agents/src/server.ts b/packages/agents/src/server.ts index 3e7786763a..7ee61e8479 100644 --- a/packages/agents/src/server.ts +++ b/packages/agents/src/server.ts @@ -355,6 +355,20 @@ export class BuiltinAgentsServer { }), new Promise((resolve) => setTimeout(resolve, 5_000)), ]) + // Tear down this process's live sandbox containers NOW — their debounced + // idle teardowns die with the process and would leave them running. + // Bounded like the drain above so a hung daemon can't block quit. + if (this.bootstrap.shutdownSandboxes) { + await Promise.race([ + this.bootstrap.shutdownSandboxes().catch((err) => { + serverLog.error( + `[builtin-agents] sandbox shutdown failed during shutdown:`, + err + ) + }), + new Promise((resolve) => setTimeout(resolve, 5_000)), + ]) + } this.bootstrap = null }