diff --git a/.changeset/zombie-sandbox-containers-fix.md b/.changeset/zombie-sandbox-containers-fix.md new file mode 100644 index 0000000000..d05322fd4b --- /dev/null +++ b/.changeset/zombie-sandbox-containers-fix.md @@ -0,0 +1,24 @@ +--- +'@electric-ax/agents-runtime': patch +'@electric-ax/agents': patch +--- + +Fix Docker sandbox containers (`electric-sbx-*`) accumulating as zombies, and +stop creating containers for wakes that never use their sandbox: + +- The boot sweep now reclaims RUNNING orphans whose owning process died + (owner-pid label + in-container adoption marker), instead of only exited + ephemeral leftovers — previously crash/quit leftovers were never cleaned up. +- Runtime shutdown flushes the debounced idle teardowns (stop persistent / + remove ephemeral) instead of letting the unref'd timers die with the + process, which leaked a running container on every quit. +- A failed post-start init no longer leaves a running, untracked container + behind, and a 409 from inside creation is no longer misread as a name + conflict (which "reattached" to a removed container). +- Sandbox creation is now lazy: the container is only created/started when a + wake actually uses its sandbox, so backlog bursts of trivial wakes (cron + ticks, bookkeeping) on runner reconnect no longer spin up containers. + Terminal reclaim and spawn-`inherit` still work for never-used sandboxes, + and concurrent container creations are capped to smooth real bursts. +- All sandbox containers carry `com.docker.compose.project=electric-sandboxes` + so Docker GUIs group them and they can be stopped/removed together. diff --git a/packages/agents-runtime/src/process-wake.ts b/packages/agents-runtime/src/process-wake.ts index 63b0c3abb7..e54cbe3fbb 100644 --- a/packages/agents-runtime/src/process-wake.ts +++ b/packages/agents-runtime/src/process-wake.ts @@ -10,6 +10,7 @@ import { createSetupContext } from './setup-context' import { createEntityLogPrefix, runtimeLog } from './log' import { createRuntimeServerClient } from './runtime-server-client' import { unrestrictedSandbox } from './sandbox/unrestricted' +import { ensureSandboxMaterialized } from './sandbox/lazy' import { resolveSandboxIdentity } from './sandbox/identity' import { appendPathToUrl } from './url' import { manifestChildKey } from './manifest-helpers' @@ -1302,7 +1303,7 @@ export async function processWake( const requestedInherit = opts?.sandbox === `inherit` || (typeof opts?.sandbox === `object` && opts.sandbox.inherit === true) - const sandbox = requestedInherit + const childSandbox = requestedInherit ? resolvedSandboxSelection ? { profile: resolvedSandboxSelection.profile, @@ -1316,6 +1317,12 @@ export async function processWake( : opts?.sandbox === `inherit` ? undefined : opts?.sandbox + // An inheriting child only ever ATTACHES by key — make sure the + // owner's (lazily-created) container/workspace actually exists before + // the child can wake, even if this wake never ran a tool itself. + if (requestedInherit && resolvedSandboxSelection && sandbox) { + await ensureSandboxMaterialized(sandbox) + } return serverClient.spawnEntity({ type: childType, id: childId, @@ -1323,7 +1330,7 @@ export async function processWake( parentUrl, initialMessage: opts?.initialMessage, tags: opts?.tags, - sandbox, + sandbox: childSandbox, wake: wakeOpt, }) }, diff --git a/packages/agents-runtime/src/sandbox-docker.ts b/packages/agents-runtime/src/sandbox-docker.ts index c1c39dc6cd..ffb8e7c4bb 100644 --- a/packages/agents-runtime/src/sandbox-docker.ts +++ b/packages/agents-runtime/src/sandbox-docker.ts @@ -9,6 +9,8 @@ export { dockerSandbox, + reclaimDockerSandboxByKey, + shutdownAllDockerSandboxes, sweepOrphanedDockerSandboxes, __resetPersistentRegistryForTests, } from './sandbox/docker' diff --git a/packages/agents-runtime/src/sandbox.ts b/packages/agents-runtime/src/sandbox.ts index 9e1bc7ffe4..29b5839d2a 100644 --- a/packages/agents-runtime/src/sandbox.ts +++ b/packages/agents-runtime/src/sandbox.ts @@ -8,6 +8,8 @@ export type { RemoteProvider, RemoteSandboxOpts } from './sandbox/remote' export type { RemoteSandboxClient } from './sandbox/remote/types' export { isE2BAvailable } from './sandbox/remote/e2b' export { chooseDefaultSandbox } from './sandbox/default' +export { ensureSandboxMaterialized, lazySandbox } from './sandbox/lazy' +export type { LazySandboxOpts } from './sandbox/lazy' export { SandboxError } from './sandbox/types' export type { Sandbox, diff --git a/packages/agents-runtime/src/sandbox/docker.ts b/packages/agents-runtime/src/sandbox/docker.ts index f30bb1c53f..336f9dbc27 100644 --- a/packages/agents-runtime/src/sandbox/docker.ts +++ b/packages/agents-runtime/src/sandbox/docker.ts @@ -156,10 +156,35 @@ const SANDBOX_ENTITY_LABEL = `com.electric.sandbox.entity` * key) and only reclaims ephemeral leftovers. */ const SANDBOX_PERSISTENT_LABEL = `com.electric.sandbox.persistent` +/** + * PID of the process that created the container. The boot sweep probes it + * (same host as the daemon's clients) to tell a live sibling's container from + * a crash/quit leftover — the keepalive loop never exits on its own, so a dead + * owner's container stays RUNNING forever without this. + */ +const SANDBOX_OWNER_PID_LABEL = `com.electric.sandbox.owner-pid` +/** + * Compose-style grouping labels. Docker Desktop (and other GUI clients) group + * containers by compose project, so every sandbox shows up under one + * collapsible `electric-sandboxes` entry and can be stopped / deleted together + * — also via `docker compose -p electric-sandboxes down`. + */ +const COMPOSE_PROJECT_LABEL = `com.docker.compose.project` +const COMPOSE_SERVICE_LABEL = `com.docker.compose.service` +const COMPOSE_PROJECT_NAME = `electric-sandboxes` /** Common prefix for every container name this module assigns. */ const NAME_PREFIX = `electric-sbx` +/** + * In-container record of the process that most recently ADOPTED the container + * (reattached by key after its creator exited). Labels are immutable, so the + * creation-time owner pid goes stale on reattach; the boot sweep probes this + * marker before reclaiming a running container whose labelled owner is dead. + * Lives on the /tmp tmpfs, so a stop wipes it along with any stale adoption. + */ +const OWNER_MARKER_PATH = `/tmp/.electric-sbx-owner-pid` + /** Default warm window before an idle container is torn down (stop/remove). */ const DEFAULT_IDLE_GRACE_MS = 2 * 60 * 1000 @@ -209,6 +234,33 @@ function withKeyLock(key: string, fn: () => Promise): Promise { return run } +/** + * Bound on concurrent container CREATIONS (image resolve + create + start + + * init), so a burst of wakes materializing sandboxes at once — e.g. a backlog + * replay when a runner reconnects — queues against the daemon instead of + * stampeding it. Reattaches and execs are not limited; total creations are + * unbounded, only their concurrency. + */ +const MAX_CONCURRENT_CREATIONS = 4 +let creationSlots = MAX_CONCURRENT_CREATIONS +const creationQueue: Array<() => void> = [] +async function withCreationSlot(fn: () => Promise): Promise { + if (creationSlots > 0) { + creationSlots -= 1 + } else { + await new Promise((release) => creationQueue.push(release)) + } + try { + return await fn() + } finally { + // Hand the slot directly to the next waiter (no decrement on their side), + // so a new caller can't race in between release and re-acquire. + const next = creationQueue.shift() + if (next) next() + else creationSlots += 1 + } +} + /** Lowercase, DNS-safe slug from an arbitrary identity string (≤24 chars). */ function slugify(s: string): string { const slug = s @@ -421,6 +473,9 @@ export async function dockerSandbox( [SANDBOX_LABEL]: `1`, [SANDBOX_KEY_LABEL]: sandboxKey, [SANDBOX_PERSISTENT_LABEL]: persistent ? `true` : `false`, + [SANDBOX_OWNER_PID_LABEL]: String(process.pid), + [COMPOSE_PROJECT_LABEL]: COMPOSE_PROJECT_NAME, + [COMPOSE_SERVICE_LABEL]: opts.entityType ?? `sandbox`, ...(opts.entityType ? { [SANDBOX_ENTITY_TYPE_LABEL]: opts.entityType } : {}), @@ -429,39 +484,59 @@ export async function dockerSandbox( } // Create-and-start a fresh container. Image is pulled here only (skipped - // entirely when reattaching to an existing container). - const createStarted = async (): Promise => { - const image = resolveImage(opts) - await ensureImage(docker, image, opts) - const c = await docker.createContainer({ - // Spread (rather than a literal `name:`) so the `name` query param — - // which dockerode accepts but doesn't declare on its create-opts type — - // doesn't trip excess-property checking. - ...{ name: containerName }, - Image: image, - Cmd: [`sh`, `-c`, `while true; do sleep 3600; done`], - WorkingDir: containerCwd, - Env: Object.entries(baseEnv).map(([k, v]) => `${k}=${v}`), - Labels: labels, - ExposedPorts: exposedPorts, - HostConfig, + // entirely when reattaching to an existing container). Creation concurrency + // is bounded process-wide (see withCreationSlot). + const createStarted = (): Promise => + withCreationSlot(async (): Promise => { + const image = resolveImage(opts) + await ensureImage(docker, image, opts) + const c = await docker.createContainer({ + // Spread (rather than a literal `name:`) so the `name` query param — + // which dockerode accepts but doesn't declare on its create-opts type — + // doesn't trip excess-property checking. + ...{ name: containerName }, + Image: image, + Cmd: [`sh`, `-c`, `while true; do sleep 3600; done`], + WorkingDir: containerCwd, + Env: Object.entries(baseEnv).map(([k, v]) => `${k}=${v}`), + Labels: labels, + ExposedPorts: exposedPorts, + HostConfig, + }) + try { + await c.start() + } catch (err) { + await c.remove({ force: true, v: true }).catch(() => {}) + throw new SandboxError( + `runtime`, + `dockerSandbox: container start failed: ${ + err instanceof Error ? err.message : String(err) + }` + ) + } + // Tmpfs on `/work` is empty at start; ensure caller-supplied workingDir + // exists with sensible perms. If this post-start init fails — the exec + // call throws, or the mkdir can't run / exits non-zero (a null exit code + // means the exec never produced one, e.g. it couldn't spawn at all) — + // remove the already-started container before rethrowing: it isn't + // registered yet, so no dispose (and, while running, no sweep) would + // ever reclaim it. + try { + const init = await runOneOff(c, [`mkdir`, `-p`, containerCwd]) + if (init.exitCode !== 0) { + throw new SandboxError( + `runtime`, + `dockerSandbox: post-start init failed: \`mkdir -p ${containerCwd}\` exited ${init.exitCode ?? `without a code`}${ + init.stderr.length > 0 ? `: ${init.stderr.toString().trim()}` : `` + }` + ) + } + } catch (err) { + await c.remove({ force: true, v: true }).catch(() => {}) + throw err + } + return c }) - try { - await c.start() - } catch (err) { - await c.remove({ force: true, v: true }).catch(() => {}) - throw new SandboxError( - `runtime`, - `dockerSandbox: container start failed: ${ - err instanceof Error ? err.message : String(err) - }` - ) - } - // Tmpfs on `/work` is empty at start; ensure caller-supplied workingDir - // exists with sensible perms. - await runOneOff(c, [`mkdir`, `-p`, containerCwd]) - return c - } // Serialize reattach + registration against the debounced idle teardown // (which holds the same per-key lock), so a re-acquire can't race a teardown @@ -496,6 +571,25 @@ export async function dockerSandbox( }) } +/** + * Best-effort: record this process as the container's current owner in the + * in-container marker (see {@link OWNER_MARKER_PATH}). Called on the reattach + * paths — where the immutable creation-time owner-pid label may belong to an + * exited process — so the boot sweep can tell an adopted container from a + * crash leftover. + */ +async function writeOwnerMarker(c: DockerodeContainer): Promise { + try { + await runOneOff(c, [ + `sh`, + `-c`, + `echo ${process.pid} > ${OWNER_MARKER_PATH}`, + ]) + } catch { + /* best-effort */ + } +} + /** * Resolve a container by name: reattach to an existing one (starting it if a * persistent container had been stopped) or create it fresh. Handles the race @@ -520,6 +614,7 @@ async function reattachOrCreate( // and exec/fs round-trips work again. await existing.start().catch(() => {}) } + await writeOwnerMarker(existing) return existing } try { @@ -560,34 +655,72 @@ async function reattachExisting( ) } if (!running) await existing.start().catch(() => {}) + await writeOwnerMarker(existing) return existing } function isNameConflict(err: unknown): boolean { - const status = (err as { statusCode?: number; status?: number })?.statusCode - if (status === 409) return true - return /already in use|Conflict/i.test( + // Match the daemon's name-conflict message, not bare HTTP 409 — the daemon + // also answers 409 for unrelated conflicts that can surface from inside + // `createStarted` (e.g. exec on a container that just exited), and treating + // those as a lost create race would "reattach" to a container that is gone. + return /already in use/i.test( err instanceof Error ? err.message : String(err) ) } /** - * One-shot startup cleanup of *ephemeral* sandbox leftovers from a previous - * process (a crash or restart before disposes ran). Call once at runner boot. + * Same-host liveness probe for the pid recorded on a container at creation. + * `kill(pid, 0)` delivers no signal; EPERM means the pid exists but belongs to + * another user — still alive. A reused pid reads as alive (the orphan is then + * left for a later sweep), which errs on the safe side. + */ +function isPidAlive(pid: number): boolean { + try { + process.kill(pid, 0) + return true + } catch (err) { + return (err as NodeJS.ErrnoException).code === `EPERM` + } +} + +/** + * Read the adoption marker (see {@link OWNER_MARKER_PATH}) from a RUNNING + * container. `null` when absent or unreadable — the sweep then falls back to + * the creation-time owner-pid label alone. + */ +async function readOwnerMarker(c: DockerodeContainer): Promise { + try { + const r = await runOneOff(c, [`cat`, OWNER_MARKER_PATH]) + if (r.exitCode !== 0) return null + const pid = Number(r.stdout.toString().trim()) + return Number.isInteger(pid) && pid > 0 ? pid : null + } catch { + return null + } +} + +/** + * One-shot startup cleanup of sandbox leftovers from a previous process (a + * crash, force-quit, or shutdown before disposes ran). Call once at runner + * boot. * - * Two containers are deliberately left untouched: - * - RUNNING containers — they may belong to a live sibling runner (or a - * concurrent test run) sharing this Docker daemon; force-removing those - * would wipe another process's in-use sandbox. Reboot/crash leftovers are - * `Exited` once the daemon restarts, so the common case is still reclaimed; - * a still-running ephemeral orphan is left for a manual labelled prune - * rather than risk a live peer. - * - PERSISTENT containers — `persistent: true` exists precisely so a restarted - * process can reattach to the warm workspace by key, so they must survive a - * boot (only ephemeral leftovers are reclaimed here; a manual labelled prune - * reclaims truly-abandoned persistent ones). + * The keepalive loop never exits on its own, so a leftover is usually still + * RUNNING — but so is a live sibling runner's container on the same daemon. + * The owner-pid label disambiguates (the sweep runs on the same host as the + * daemon's clients): owner alive ⇒ leave it; owner dead ⇒ check the + * in-container adoption marker (a live process that reattached after the + * creator exited) and, if no adopter is alive either, reclaim — REMOVE an + * ephemeral orphan, STOP a persistent one (its writable layer must survive so + * the next wake can reattach by key). A running container without the label + * (created before it existed) can't be probed and is left for a manual + * labelled prune. * - * Returns the names removed. + * Non-running containers: ephemeral leftovers are removed; persistent ones are + * preserved — `persistent: true` exists precisely so a restarted process can + * reattach to the warm workspace by key. + * + * Returns the names reclaimed (removed or stopped). */ export async function sweepOrphanedDockerSandboxes(opts?: { dockerSocket?: string @@ -612,21 +745,116 @@ export async function sweepOrphanedDockerSandboxes(opts?: { return [] } - const removed: Array = [] + const reclaimed: Array = [] for (const c of listed) { const name = c.Names?.[0]?.replace(/^\//, ``) ?? c.Id - // Never touch a running container (possibly a live peer) or a persistent - // one (meant to be reattached by key). See the doc comment above. - if (c.State === `running`) continue - if (c.Labels?.[SANDBOX_PERSISTENT_LABEL] === `true`) continue + // Held by this very process right now — its lifecycle is already managed. + if (sandboxContainers.has(name)) continue + const persistent = c.Labels?.[SANDBOX_PERSISTENT_LABEL] === `true` try { + if (c.State === `running`) { + const ownerPid = Number(c.Labels?.[SANDBOX_OWNER_PID_LABEL]) + // No (or unparsable) owner label, or the owner is alive ⇒ possibly a + // live peer's in-use sandbox — never touch it. + if (!Number.isInteger(ownerPid) || isPidAlive(ownerPid)) continue + // The creator is gone, but a live sibling process may have since + // ADOPTED the container (reattached by key) — labels are immutable, so + // adoption is recorded in an in-container marker. Probe it before + // reclaiming. + const adopter = await readOwnerMarker(docker.getContainer(c.Id)) + if (adopter !== null && isPidAlive(adopter)) continue + if (persistent) { + // `t: 0` → straight to SIGKILL (PID 1 ignores SIGTERM); the writable + // layer survives for a later reattach by key. + await docker.getContainer(c.Id).stop({ t: 0 }) + } else { + await docker.getContainer(c.Id).remove({ force: true, v: true }) + } + reclaimed.push(name) + continue + } + if (persistent) continue await docker.getContainer(c.Id).remove({ force: true, v: true }) - removed.push(name) + reclaimed.push(name) } catch { /* already gone */ } } - return removed + return reclaimed +} + +/** + * Wipe the container for a resolved sandbox key WITHOUT creating one — the + * reclaim path for a lazy sandbox that was never materialized: a terminal + * entity's persistent workspace from an EARLIER wake must not survive just + * because the final wake never used its sandbox. If sibling leases currently + * hold the container, it is only MARKED reclaimed — the last lease draining + * wipes it (matching an owner's `dispose({ reclaim: true })`). A container + * that exists with no registry entry (left from a previous process) is removed + * directly; absent containers are a no-op. + */ +export async function reclaimDockerSandboxByKey( + sandboxKey: string, + opts?: { dockerSocket?: string } +): Promise { + const name = containerNameForKey(sandboxKey) + const Docker = await loadDockerode() + const docker: Dockerode = opts?.dockerSocket + ? new Docker({ socketPath: opts.dockerSocket }) + : new Docker() + await withKeyLock(name, async () => { + const entry = sandboxContainers.get(name) + if (entry && entry.refs > 0) { + entry.reclaim = true + return + } + if (entry?.idleTimer) clearTimeout(entry.idleTimer) + sandboxContainers.delete(name) + try { + await docker.getContainer(name).remove({ force: true, v: true }) + } catch { + /* nothing to reclaim */ + } + }) +} + +/** + * Immediate, lock-serialized teardown of every IDLE container this process + * tracks: pending debounced timers are cancelled and each entry's teardown + * action runs NOW — REMOVE for ephemeral/reclaimed containers, STOP for + * persistent ones (the workspace survives for the next process to reattach by + * key). + * + * Call on runtime shutdown. The debounced idle timers are unref'd and die with + * the process; without this flush every recently-used container is left + * RUNNING for the boot sweep of some future process to find. + * + * Entries with live leases (refs > 0) are left alone: the registry is shared + * process-wide, so they may belong to a sibling runtime that is not shutting + * down (or to a wake whose drain timed out). Their own dispose tears them + * down; if the process exits first, the boot sweep reclaims them by dead + * owner pid. + */ +export async function shutdownAllDockerSandboxes(): Promise { + await Promise.all( + [...sandboxContainers.keys()].map((name) => + withKeyLock(name, async () => { + const entry = sandboxContainers.get(name) + if (!entry || entry.refs > 0) return + if (entry.idleTimer) clearTimeout(entry.idleTimer) + try { + if (sandboxWipesOnDispose(entry.reclaim ?? false, entry.persistent)) { + await entry.container.remove({ force: true, v: true }) + } else { + await entry.container.stop({ t: 0 }) + } + } catch { + /* already stopped / removed / gone */ + } + sandboxContainers.delete(name) + }) + ) + ) } /** Test-only: drop the in-process container registry bookkeeping. */ diff --git a/packages/agents-runtime/src/sandbox/lazy.ts b/packages/agents-runtime/src/sandbox/lazy.ts new file mode 100644 index 0000000000..06b6d7c561 --- /dev/null +++ b/packages/agents-runtime/src/sandbox/lazy.ts @@ -0,0 +1,195 @@ +/** + * Lazy sandbox wrapper — defers the (potentially expensive) provider factory + * until the sandbox is actually USED. A wake that never runs a tool — a cron + * tick deciding there is nothing to do, child-status bookkeeping, a drained + * inbox — then never pays for, or leaks, a provider-side sandbox (e.g. a + * Docker container). Profile authors wrap their factory: + * + * factory: async (params) => + * lazySandbox({ + * workingDirectory: `/work`, + * factory: () => dockerSandbox({ ... }), + * reclaim: params.owner + * ? () => reclaimDockerSandboxByKey(params.sandboxKey) + * : undefined, + * }) + * + * Materialization is single-flight and retried on failure. Disposing a + * never-used wrapper is free — except `dispose({ reclaim: true })`, which runs + * the provider-supplied `reclaim` callback so a terminal entity's persistent + * workspace from an EARLIER wake is still wiped without creating a fresh + * sandbox just to remove it. + */ + +import { SandboxError } from './types' +import type { + DirEntry, + FileStat, + Sandbox, + SandboxExecOpts, + SandboxExecResult, +} from './types' + +/** + * Cross-module-instance brand for {@link ensureSandboxMaterialized}: a global + * symbol survives duplicate copies of this module in a bundle, where an + * `instanceof` check would not. + */ +const MATERIALIZE = Symbol.for(`electric.sandbox.lazy.materialize`) + +export interface LazySandboxOpts { + /** Reported as `Sandbox.name` until the inner sandbox materializes. */ + name?: string + /** + * The inner sandbox's primary writable root. MUST match what `factory` + * produces: tools resolve relative paths against it synchronously, before + * the first (materializing) sandbox call could run. + */ + workingDirectory: string + /** Builds the inner sandbox on first use. */ + factory: () => Promise + /** + * Wipe the provider-side sandbox WITHOUT materializing. Invoked when this + * lease is disposed with `reclaim: true` before the sandbox was ever used — + * an earlier wake's persistent workspace may still exist under the same key + * and must not outlive its terminal entity. Only pass this for OWNER leases: + * the wrapper itself cannot tell an owner from an attacher. + */ + reclaim?: () => Promise +} + +class LazySandbox implements Sandbox { + private readonly opts: LazySandboxOpts + private inner: Sandbox | null = null + private materializing: Promise | null = null + private disposed = false + + constructor(opts: LazySandboxOpts) { + this.opts = opts + } + + get name(): string { + return this.inner?.name ?? this.opts.name ?? `lazy` + } + + get workingDirectory(): string { + return this.opts.workingDirectory + } + + /** Single-flight; a failed factory clears so the next use retries. */ + private materialize(): Promise { + if (this.inner) return Promise.resolve(this.inner) + this.materializing ??= this.opts.factory().then( + (sandbox) => { + this.inner = sandbox + this.materializing = null + return sandbox + }, + (err: unknown) => { + this.materializing = null + throw err + } + ) + return this.materializing + } + + /** @internal see {@link ensureSandboxMaterialized} */ + [MATERIALIZE](): Promise { + this.assertLive() + return this.materialize() + } + + private assertLive(): void { + if (this.disposed) { + throw new SandboxError( + `runtime`, + `lazySandbox: this sandbox lease has been disposed.` + ) + } + } + + async exec(opts: SandboxExecOpts): Promise { + this.assertLive() + return (await this.materialize()).exec(opts) + } + + async readFile(path: string): Promise { + this.assertLive() + return (await this.materialize()).readFile(path) + } + + async writeFile(path: string, content: Buffer | string): Promise { + this.assertLive() + return (await this.materialize()).writeFile(path, content) + } + + async mkdir(path: string, opts?: { recursive?: boolean }): Promise { + this.assertLive() + return (await this.materialize()).mkdir(path, opts) + } + + async readdir(path: string): Promise> { + this.assertLive() + return (await this.materialize()).readdir(path) + } + + async exists(path: string): Promise { + this.assertLive() + return (await this.materialize()).exists(path) + } + + async remove(path: string, opts?: { recursive?: boolean }): Promise { + this.assertLive() + return (await this.materialize()).remove(path, opts) + } + + async stat(path: string): Promise { + this.assertLive() + return (await this.materialize()).stat(path) + } + + async fetch(input: string | URL, init?: RequestInit): Promise { + this.assertLive() + return (await this.materialize()).fetch(input, init) + } + + async dispose(opts?: { reclaim?: boolean }): Promise { + if (this.disposed) return + this.disposed = true + if (this.inner || this.materializing) { + let inner: Sandbox | null = null + try { + inner = await this.materialize() + } catch { + // The in-flight factory failed — nothing live to dispose. Fall + // through: a requested reclaim still wipes the provider-side state. + } + if (inner) { + // The inner dispose owns the reclaim semantics from here. + await inner.dispose(opts) + return + } + } + if (opts?.reclaim && this.opts.reclaim) await this.opts.reclaim() + } +} + +/** Wrap a provider factory so the sandbox materializes on first use. */ +export function lazySandbox(opts: LazySandboxOpts): Sandbox { + return new LazySandbox(opts) +} + +/** + * Force a lazy sandbox to materialize (no-op for any other sandbox). Used by + * the spawn-`inherit` path: a child ATTACHES to this wake's sandbox by key and + * never creates, so the owner's container/workspace must actually exist before + * the child can wake — even if the owner itself never ran a tool. + */ +export async function ensureSandboxMaterialized( + sandbox: Sandbox +): Promise { + const materialize = ( + sandbox as Sandbox & { [MATERIALIZE]?: () => Promise } + )[MATERIALIZE] + if (typeof materialize === `function`) await materialize.call(sandbox) +} diff --git a/packages/agents-runtime/test/sandbox-docker.test.ts b/packages/agents-runtime/test/sandbox-docker.test.ts index df69688a20..830811e42e 100644 --- a/packages/agents-runtime/test/sandbox-docker.test.ts +++ b/packages/agents-runtime/test/sandbox-docker.test.ts @@ -2,11 +2,15 @@ import { afterAll, afterEach, beforeAll, describe, expect, it } from 'vitest' import { mkdtemp, rm, symlink, writeFile } from 'node:fs/promises' import { tmpdir } from 'node:os' import { join } from 'node:path' +import { spawnSync } from 'node:child_process' import { __resetPersistentRegistryForTests, dockerSandbox, + reclaimDockerSandboxByKey, + shutdownAllDockerSandboxes, sweepOrphanedDockerSandboxes, } from '../src/sandbox/docker' +import { lazySandbox } from '../src/sandbox/lazy' import { loadDockerode } from '../src/sandbox/docker/loader' import { SandboxError } from '../src/sandbox/types' import { dockerAvailable, TEST_IMAGE, TEST_LABEL } from './helpers/docker-probe' @@ -469,8 +473,14 @@ d(`dockerSandbox keyed lifecycle`, () => { filters: { label: [`com.electric.sandbox.key=${sandboxKey}`] }, }) if (list.length === 0) return `absent` - const info = await docker.getContainer(list[0].Id).inspect() - return info.State.Running ? `running` : `stopped` + try { + const info = await docker.getContainer(list[0].Id).inspect() + return info.State.Running ? `running` : `stopped` + } catch (err) { + // Removed between list and inspect (e.g. a reclaim in flight). + if ((err as { statusCode?: number }).statusCode === 404) return `absent` + throw err + } } const waitForKeyState = async ( @@ -530,6 +540,12 @@ d(`dockerSandbox keyed lifecycle`, () => { // Per-entity identity lives in labels, since the shared name can't. expect(labels[`com.electric.sandbox.entity-type`]).toBe(`horton`) expect(labels[`com.electric.sandbox.entity`]).toBe(`/horton/abc123/main`) + // Owning process, for the boot sweep's same-host liveness probe. + expect(labels[`com.electric.sandbox.owner-pid`]).toBe(String(process.pid)) + // Compose-style grouping: docker GUIs collapse all sandboxes under one + // project, and `docker compose -p electric-sandboxes down` clears them. + expect(labels[`com.docker.compose.project`]).toBe(`electric-sandboxes`) + expect(labels[`com.docker.compose.service`]).toBe(`horton`) } finally { await sandbox.dispose() } @@ -743,9 +759,10 @@ d(`dockerSandbox keyed lifecycle`, () => { expect(await keyState(key)).toBe(`absent`) }, 90_000) - it(`sweepOrphanedDockerSandboxes leaves a *running* container untouched`, async () => { - // A running container may belong to a live sibling process sharing this - // daemon — force-removing it would wipe a peer's in-use sandbox. + it(`sweepOrphanedDockerSandboxes leaves a *running* container with a live owner untouched`, async () => { + // A running container whose owner-pid process (this test process) is alive + // belongs to a live sibling — force-removing it would wipe an in-use + // sandbox. const key = `${KEY}-orphan-running` await make(key, 60_000, /* persistent */ false) __resetPersistentRegistryForTests() @@ -767,4 +784,189 @@ d(`dockerSandbox keyed lifecycle`, () => { await sweepOrphanedDockerSandboxes() expect(await keyState(key)).toBe(`stopped`) }, 90_000) + + // A pid that is guaranteed dead: spawn a no-op child and reuse its pid after + // it has exited (synchronously, so it is reaped before we return). + const deadPid = (): number => { + const child = spawnSync(process.execPath, [`-e`, ``]) + if (child.pid === undefined) throw new Error(`spawnSync returned no pid`) + return child.pid + } + + // Create a container whose owner-pid label points at a dead process — the + // shape a crashed/quit runner leaves behind (its keepalive loop never exits + // on its own, so the container is still RUNNING). + const makeCrashOrphan = (sandboxKey: string, persistent: boolean) => + dockerSandbox({ + image: TEST_IMAGE, + labels: { + [TEST_LABEL]: `1`, + 'com.electric.sandbox.owner-pid': String(deadPid()), + }, + sandboxKey, + persistent, + sharedIdleGraceMs: 600_000, + }) + + it(`sweepOrphanedDockerSandboxes REMOVES a running *ephemeral* orphan whose owner died`, async () => { + const key = `${KEY}-crash-eph` + await makeCrashOrphan(key, /* persistent */ false) + __resetPersistentRegistryForTests() // the owning process is gone + expect(await keyState(key)).toBe(`running`) + + const reclaimed = await sweepOrphanedDockerSandboxes() + expect(reclaimed.length).toBeGreaterThanOrEqual(1) + expect(await keyState(key)).toBe(`absent`) + }, 90_000) + + it(`sweepOrphanedDockerSandboxes STOPS a running *persistent* orphan whose owner died`, async () => { + // Persistent workspaces survive (the next wake reattaches by key), but a + // dead owner can't stop the keepalive — the sweep does it instead. + const key = `${KEY}-crash-persist` + await makeCrashOrphan(key, /* persistent */ true) + __resetPersistentRegistryForTests() + expect(await keyState(key)).toBe(`running`) + + await sweepOrphanedDockerSandboxes() + expect(await keyState(key)).toBe(`stopped`) + }, 90_000) + + it(`sweepOrphanedDockerSandboxes spares a running orphan ADOPTED by a live process`, async () => { + // The owner-pid label is immutable, so it goes stale when a live process + // reattaches a dead creator's container — adoption is recorded in an + // in-container marker that the sweep probes before reclaiming. + const key = `${KEY}-adopted` + await makeCrashOrphan(key, /* persistent */ true) + __resetPersistentRegistryForTests() // the creator is gone… + await make(key, 600_000, /* persistent */ true) // …a live process adopts + __resetPersistentRegistryForTests() // bypass the in-registry skip + expect(await keyState(key)).toBe(`running`) + + await sweepOrphanedDockerSandboxes() + expect(await keyState(key)).toBe(`running`) + }, 90_000) + + it(`sweepOrphanedDockerSandboxes leaves a running container with no owner-pid label untouched`, async () => { + // Containers created before the owner-pid label existed can't be liveness- + // probed — fail safe and leave them for a manual labelled prune. + const Docker = await loadDockerode() + const docker = new Docker() + const c = await docker.createContainer({ + ...{ name: `electric-sbx-legacy-nolabel-test` }, + Image: TEST_IMAGE, + Cmd: [`sh`, `-c`, `while true; do sleep 3600; done`], + Labels: { + 'com.electric.sandbox': `1`, + 'com.electric.sandbox.persistent': `false`, + [TEST_LABEL]: `1`, + }, + HostConfig: {}, + }) + await c.start() + await sweepOrphanedDockerSandboxes() + const info = await c.inspect() + expect(info.State.Running).toBe(true) + }, 90_000) + + it(`removes the container when post-start init fails (no unregistered zombie)`, async () => { + // pidsLimit:1 — PID 1's keepalive holds the only slot, so the post-start + // `mkdir -p` exec cannot run and creation fails AFTER the container + // started. The failed create must remove it, not leave it behind running + // and unregistered (invisible to dispose and, while running, to the sweep). + const key = `${KEY}-init-fail` + await expect( + dockerSandbox({ + image: TEST_IMAGE, + labels: { [TEST_LABEL]: `1` }, + sandboxKey: key, + resources: { pidsLimit: 1 }, + }) + ).rejects.toThrow() + expect(await keyState(key)).toBe(`absent`) + }, 90_000) + + it(`shutdownAllDockerSandboxes flushes pending debounced teardowns immediately`, async () => { + // The debounced idle timers are unref'd and die with the process — on + // shutdown the pending action must run NOW: STOP persistent, REMOVE + // ephemeral. + const ephKey = `${KEY}-shutdown-eph` + const perKey = `${KEY}-shutdown-per` + const eph = await make(ephKey, 600_000, /* persistent */ false) + const per = await make(perKey, 600_000, /* persistent */ true) + await eph.dispose() + await per.dispose() + // Teardowns are debounced 10 minutes out — both still running. + expect(await keyState(ephKey)).toBe(`running`) + expect(await keyState(perKey)).toBe(`running`) + + await shutdownAllDockerSandboxes() + expect(await keyState(ephKey)).toBe(`absent`) + expect(await keyState(perKey)).toBe(`stopped`) + }, 90_000) + + it(`lazy docker sandbox creates no container until first use`, async () => { + // The shape bootstrap wires: profile.factory returns a lazy wrapper, so a + // trivial wake (no tool use) never starts a container. + const key = `${KEY}-lazy` + const sb = lazySandbox({ + name: `docker`, + workingDirectory: `/work`, + factory: () => make(key, 60_000), + }) + expect(await keyState(key)).toBe(`absent`) + const r = await sb.exec({ command: `echo materialized` }) + expect(r.stdout.toString().trim()).toBe(`materialized`) + expect(await keyState(key)).toBe(`running`) + await sb.dispose() + }, 90_000) + + it(`lazy docker sandbox disposed without use leaves nothing behind`, async () => { + const key = `${KEY}-lazy-unused` + const sb = lazySandbox({ + name: `docker`, + workingDirectory: `/work`, + factory: () => make(key, 60_000), + }) + await sb.dispose() + expect(await keyState(key)).toBe(`absent`) + }, 60_000) + + it(`reclaimDockerSandboxByKey removes an earlier wake's persistent container without creating one`, async () => { + // The lazy wrapper's reclaim path: a terminal wake that never used its + // sandbox must still wipe the persistent workspace earlier wakes created. + const key = `${KEY}-reclaim-by-key` + const earlier = await make(key, 600_000, /* persistent */ true) + await earlier.dispose() // idle: container stays (debounced stop pending) + expect(await keyState(key)).toBe(`running`) + + await reclaimDockerSandboxByKey(key) + expect(await keyState(key)).toBe(`absent`) + }, 90_000) + + it(`reclaimDockerSandboxByKey is a no-op when nothing exists under the key`, async () => { + await reclaimDockerSandboxByKey(`${KEY}-reclaim-absent`) + expect(await keyState(`${KEY}-reclaim-absent`)).toBe(`absent`) + }, 60_000) + + it(`reclaimDockerSandboxByKey defers to live leases — wiped once the last drains`, async () => { + const key = `${KEY}-reclaim-live` + const lease = await make(key, 600_000, /* persistent */ true) + await reclaimDockerSandboxByKey(key) + // The sibling lease still owns it — not torn down underneath it. + expect(await keyState(key)).toBe(`running`) + await lease.dispose() + // Marked reclaimed: the last lease draining wipes it immediately. + await waitForKeyState(key, `absent`, 5_000) + }, 90_000) + + it(`shutdownAllDockerSandboxes leaves containers with live leases alone`, async () => { + // The registry is process-wide: a still-open lease may belong to a sibling + // runtime that is NOT shutting down. Its own dispose tears it down — and if + // the process exits first, the boot sweep reclaims it by dead owner pid. + const key = `${KEY}-shutdown-live` + const live = await make(key, 600_000, /* persistent */ false) + await shutdownAllDockerSandboxes() + expect(await keyState(key)).toBe(`running`) + await live.dispose() + }, 90_000) }) diff --git a/packages/agents-runtime/test/sandbox-lazy.test.ts b/packages/agents-runtime/test/sandbox-lazy.test.ts new file mode 100644 index 0000000000..b529ba4744 --- /dev/null +++ b/packages/agents-runtime/test/sandbox-lazy.test.ts @@ -0,0 +1,197 @@ +import { describe, expect, it } from 'vitest' +import { ensureSandboxMaterialized, lazySandbox } from '../src/sandbox/lazy' +import type { Sandbox } from '../src/sandbox/types' + +/** + * lazySandbox defers the provider factory until first use, so trivial wakes + * (cron ticks, bookkeeping) never pay for — or leak — a provider-side sandbox + * (e.g. a Docker container). + */ + +interface FakeCalls { + exec: number + dispose: Array<{ reclaim?: boolean } | undefined> +} + +const makeFake = (calls: FakeCalls): Sandbox => ({ + name: `fake:inner`, + workingDirectory: `/work`, + exec: async () => { + calls.exec += 1 + return { + exitCode: 0, + signal: null, + stdout: Buffer.from(`ok`), + stderr: Buffer.alloc(0), + timedOut: false, + aborted: false, + outputTruncated: false, + } + }, + readFile: async () => Buffer.from(`content`), + writeFile: async () => {}, + mkdir: async () => {}, + readdir: async () => [], + exists: async () => true, + remove: async () => {}, + stat: async () => ({ type: `file`, size: 0, mtimeMs: 0 }), + fetch: async () => new Response(`ok`), + dispose: async (opts) => { + calls.dispose.push(opts) + }, +}) + +const harness = () => { + const calls: FakeCalls = { exec: 0, dispose: [] } + let factoryCalls = 0 + const reclaims: Array = [] + const sandbox = lazySandbox({ + name: `fake`, + workingDirectory: `/work`, + factory: async () => { + factoryCalls += 1 + return makeFake(calls) + }, + reclaim: async () => { + reclaims.push(true) + }, + }) + return { + sandbox, + calls, + reclaims, + factoryCalls: () => factoryCalls, + } +} + +describe(`lazySandbox`, () => { + it(`does not invoke the factory on construction or property access`, () => { + const h = harness() + expect(h.sandbox.name).toBe(`fake`) + expect(h.sandbox.workingDirectory).toBe(`/work`) + expect(h.factoryCalls()).toBe(0) + }) + + it(`materializes on first use and reuses the inner sandbox after`, async () => { + const h = harness() + await h.sandbox.exec({ command: `echo hi` }) + await h.sandbox.exec({ command: `echo again` }) + await h.sandbox.readFile(`/work/x`) + expect(h.factoryCalls()).toBe(1) + expect(h.calls.exec).toBe(2) + // After materialization the inner provider's name shows through. + expect(h.sandbox.name).toBe(`fake:inner`) + }) + + it(`single-flights concurrent first calls`, async () => { + let resolveFactory!: (sb: Sandbox) => void + const calls: FakeCalls = { exec: 0, dispose: [] } + let factoryCalls = 0 + const sandbox = lazySandbox({ + workingDirectory: `/work`, + factory: () => { + factoryCalls += 1 + return new Promise((r) => { + resolveFactory = r + }) + }, + }) + const a = sandbox.exec({ command: `a` }) + const b = sandbox.exec({ command: `b` }) + resolveFactory(makeFake(calls)) + await Promise.all([a, b]) + expect(factoryCalls).toBe(1) + expect(calls.exec).toBe(2) + }) + + it(`a failed factory rejects the call and is retried on the next use`, async () => { + let attempts = 0 + const calls: FakeCalls = { exec: 0, dispose: [] } + const sandbox = lazySandbox({ + workingDirectory: `/work`, + factory: async () => { + attempts += 1 + if (attempts === 1) throw new Error(`daemon hiccup`) + return makeFake(calls) + }, + }) + await expect(sandbox.exec({ command: `x` })).rejects.toThrow( + `daemon hiccup` + ) + await sandbox.exec({ command: `x` }) + expect(attempts).toBe(2) + expect(calls.exec).toBe(1) + }) + + it(`dispose without use never invokes the factory`, async () => { + const h = harness() + await h.sandbox.dispose() + expect(h.factoryCalls()).toBe(0) + expect(h.calls.dispose).toEqual([]) + expect(h.reclaims).toEqual([]) + }) + + it(`dispose({reclaim}) without use runs the reclaim callback instead`, async () => { + // A terminal entity's persistent workspace from an EARLIER wake must not + // survive just because the final wake never touched the sandbox. + const h = harness() + await h.sandbox.dispose({ reclaim: true }) + expect(h.factoryCalls()).toBe(0) + expect(h.reclaims).toEqual([true]) + }) + + it(`dispose({reclaim}) without use is a no-op when no reclaim callback exists`, async () => { + let factoryCalls = 0 + const calls: FakeCalls = { exec: 0, dispose: [] } + const sandbox = lazySandbox({ + workingDirectory: `/work`, + factory: async () => { + factoryCalls += 1 + return makeFake(calls) + }, + }) + await sandbox.dispose({ reclaim: true }) + expect(factoryCalls).toBe(0) + }) + + it(`dispose after use forwards to the inner sandbox (reclaim included)`, async () => { + const h = harness() + await h.sandbox.exec({ command: `x` }) + await h.sandbox.dispose({ reclaim: true }) + expect(h.calls.dispose).toEqual([{ reclaim: true }]) + // The inner dispose owns reclaim semantics — the callback must not ALSO run. + expect(h.reclaims).toEqual([]) + }) + + it(`operations after dispose are rejected with a runtime SandboxError`, async () => { + const h = harness() + await h.sandbox.dispose() + await expect(h.sandbox.exec({ command: `x` })).rejects.toMatchObject({ + kind: `runtime`, + }) + expect(h.factoryCalls()).toBe(0) + }) + + it(`repeated dispose is a no-op`, async () => { + const h = harness() + await h.sandbox.exec({ command: `x` }) + await h.sandbox.dispose() + await h.sandbox.dispose() + expect(h.calls.dispose).toHaveLength(1) + }) +}) + +describe(`ensureSandboxMaterialized`, () => { + it(`materializes a lazy sandbox without running any operation`, async () => { + const h = harness() + await ensureSandboxMaterialized(h.sandbox) + expect(h.factoryCalls()).toBe(1) + expect(h.calls.exec).toBe(0) + }) + + it(`is a no-op for a non-lazy sandbox`, async () => { + const calls: FakeCalls = { exec: 0, dispose: [] } + await ensureSandboxMaterialized(makeFake(calls)) + expect(calls.exec).toBe(0) + }) +}) diff --git a/packages/agents/src/bootstrap.ts b/packages/agents/src/bootstrap.ts index 597597e66b..c1202ca86c 100644 --- a/packages/agents/src/bootstrap.ts +++ b/packages/agents/src/bootstrap.ts @@ -12,6 +12,7 @@ import { createEventSourceTools } from '@electric-ax/agents-runtime/tools' import { chooseDefaultSandbox, isE2BAvailable, + lazySandbox, remoteSandbox, type SandboxProfile, } from '@electric-ax/agents-runtime/sandbox' @@ -40,6 +41,13 @@ export interface AgentHandlerResult { registry: EntityRegistry typeNames: Array skillsRegistry: SkillsRegistry | null + /** + * Immediately tears down the idle sandboxes this process created (set when + * a provider with host-local state — docker — is registered). MUST be called + * on shutdown, after wakes drain: the providers' debounced idle teardowns + * die with the process, which would leave containers running. + */ + shutdownSandboxes: (() => Promise) | null } export type BuiltinElectricToolsFactory = NonNullable< @@ -154,7 +162,8 @@ export async function createBuiltinAgentHandler( registerWorker(registry, { workingDirectory: cwd, streamFn, modelCatalog }) typeNames.push(`worker`) - const sandboxProfiles = await buildBuiltinSandboxProfiles(cwd) + const { profiles: sandboxProfiles, shutdownSandboxes } = + await buildBuiltinSandboxProfiles(cwd) const runtime = createRuntimeHandler({ baseUrl: agentServerUrl, @@ -176,6 +185,7 @@ export async function createBuiltinAgentHandler( registry, typeNames, skillsRegistry, + shutdownSandboxes, } } @@ -211,7 +221,7 @@ export const registerAgentTypes = registerBuiltinAgentTypes * Guard so repeated `buildBuiltinSandboxProfiles` calls in one process don't * re-run the boot sweep. */ -let dockerSweptOnBoot = false +let dockerBootSweep: Promise | null = null type SweepOrphanedDockerSandboxes = // eslint-disable-next-line quotes -- type-position import() requires a string literal @@ -219,17 +229,15 @@ type SweepOrphanedDockerSandboxes = function sweepOrphanedDockerSandboxesOnce( sweep: SweepOrphanedDockerSandboxes -): void { - if (dockerSweptOnBoot) return - dockerSweptOnBoot = true - // One-shot, at boot: reclaim any sandbox containers a previous (crashed) - // process left behind. Fire-and-forget — nothing is acquired yet, and within - // a live process shared containers are stopped (not removed) when idle. - sweep() - .then((removed) => { - if (removed.length > 0) { +): Promise { + // One-shot, at boot: reclaim any sandbox containers a previous (crashed or + // quit) process left behind. Awaited by the caller — the sweep can stop + // running orphans, so it must finish before any wake reattaches by key. + dockerBootSweep ??= sweep() + .then((reclaimed) => { + if (reclaimed.length > 0) { serverLog.info( - `[builtin-agents] docker sandbox boot sweep removed ${removed.length} leftover container(s)` + `[builtin-agents] docker sandbox boot sweep reclaimed ${reclaimed.length} leftover container(s)` ) } }) @@ -238,16 +246,23 @@ function sweepOrphanedDockerSandboxesOnce( `[builtin-agents] docker sandbox boot sweep error: ${err instanceof Error ? err.message : String(err)}` ) ) + return dockerBootSweep } /** * Built-in sandbox profiles. `local` is always available. `docker` is * gated on Docker being reachable so a user without Docker installed * sees only what works — the UI never offers a non-functional choice. + * + * Also returns `shutdownSandboxes` when a host-local provider registered: an + * immediate teardown of this process's live containers that the embedding + * server must run on shutdown (the providers' debounced idle teardowns die + * with the process). */ -async function buildBuiltinSandboxProfiles( - workingDirectory: string -): Promise> { +async function buildBuiltinSandboxProfiles(workingDirectory: string): Promise<{ + profiles: Array + shutdownSandboxes: (() => Promise) | null +}> { const profiles: Array = [ { name: `local`, @@ -257,25 +272,32 @@ async function buildBuiltinSandboxProfiles( chooseDefaultSandbox(resolveCwd(args, workingDirectory)), }, ] + let shutdownSandboxes: (() => Promise) | null = null try { const { isDockerAvailable } = await import( `@electric-ax/agents-runtime/sandbox/docker` ) if (await isDockerAvailable()) { - const { dockerSandbox, sweepOrphanedDockerSandboxes } = await import( - `@electric-ax/agents-runtime/sandbox/docker` - ) - // Reclaim containers a previous crashed process may have left running. - // No periodic reaper: within a live process, shared containers stop - // themselves a short while after their last lease disposes (debounced - // idle-stop), and ephemeral ones are killed on dispose. - sweepOrphanedDockerSandboxesOnce(sweepOrphanedDockerSandboxes) + const { + dockerSandbox, + reclaimDockerSandboxByKey, + shutdownAllDockerSandboxes, + sweepOrphanedDockerSandboxes, + } = await import(`@electric-ax/agents-runtime/sandbox/docker`) + // Reclaim containers a previous process (crash, force-quit, or a + // shutdown that raced the debounced teardowns) left behind. No periodic + // reaper: within a live process, shared containers stop themselves a + // short while after their last lease disposes (debounced idle-stop), + // ephemeral ones are killed on dispose, and the rest are flushed by + // `shutdownSandboxes` on clean shutdown. + await sweepOrphanedDockerSandboxesOnce(sweepOrphanedDockerSandboxes) + shutdownSandboxes = shutdownAllDockerSandboxes profiles.push({ name: `docker`, label: `Docker`, description: `Runs in a hardened Docker container: dropped capabilities, no privilege escalation, and CPU/memory/process limits. The chosen working directory is mounted read-write and, by default, network egress is unrestricted (allow-all).`, - factory: ({ + factory: async ({ args, sandboxKey, persistent, @@ -284,25 +306,41 @@ async function buildBuiltinSandboxProfiles( entityUrl, }) => { const cwd = readWorkingDirectoryArg(args) - return dockerSandbox({ - // Default to open egress for local development. Network policy is a - // capability of this profile, not a separate profile: like the - // working directory above, it can be made a per-spawn arg later so a - // single `docker` profile spans permissive → fully-isolated rather - // than splitting into `docker-permissive` / `docker-isolated`. - initialNetworkPolicy: { mode: `allow-all` }, - extraMounts: cwd - ? [{ hostPath: cwd, containerPath: `/work`, readOnly: false }] + // Lazy: the container is only created/started when the wake actually + // USES the sandbox, so trivial wakes (cron ticks, bookkeeping) don't + // spin one up. `/work` is the container cwd dockerSandbox defaults to. + return lazySandbox({ + name: `docker`, + workingDirectory: `/work`, + factory: () => + dockerSandbox({ + // Default to open egress for local development. Network policy + // is a capability of this profile, not a separate profile: like + // the working directory above, it can be made a per-spawn arg + // later so a single `docker` profile spans permissive → + // fully-isolated rather than splitting into `docker-permissive` + // / `docker-isolated`. + initialNetworkPolicy: { mode: `allow-all` }, + extraMounts: cwd + ? [{ hostPath: cwd, containerPath: `/work`, readOnly: false }] + : undefined, + // The container is always named-by-key and reattachable; + // `persistent` chooses idle teardown (stop vs remove) and + // `owner` gates creation (an attacher reattaches only). All + // resolved upstream from config. + sandboxKey, + persistent, + owner, + // Observability: tag the container/labels with who spawned it. + entityType, + entityUrl, + }), + // A terminal entity whose final wake never used the sandbox must + // still wipe the persistent workspace earlier wakes created. Owner + // leases only — an attacher can never reclaim the owner's sandbox. + reclaim: owner + ? () => reclaimDockerSandboxByKey(sandboxKey) : undefined, - // The container is always named-by-key and reattachable; `persistent` - // chooses idle teardown (stop vs remove) and `owner` gates creation - // (an attacher reattaches only). All resolved upstream from config. - sandboxKey, - persistent, - owner, - // Observability: tag the container/labels with who spawned it. - entityType, - entityUrl, }) }, }) @@ -354,7 +392,7 @@ async function buildBuiltinSandboxProfiles( console.log( `[builtin-agents] sandbox profiles advertised: ${profiles.map((p) => p.name).join(`, `)}` ) - return profiles + return { profiles, shutdownSandboxes } } function readWorkingDirectoryArg( diff --git a/packages/agents/src/server.ts b/packages/agents/src/server.ts index 3e7786763a..7ee61e8479 100644 --- a/packages/agents/src/server.ts +++ b/packages/agents/src/server.ts @@ -355,6 +355,20 @@ export class BuiltinAgentsServer { }), new Promise((resolve) => setTimeout(resolve, 5_000)), ]) + // Tear down this process's live sandbox containers NOW — their debounced + // idle teardowns die with the process and would leave them running. + // Bounded like the drain above so a hung daemon can't block quit. + if (this.bootstrap.shutdownSandboxes) { + await Promise.race([ + this.bootstrap.shutdownSandboxes().catch((err) => { + serverLog.error( + `[builtin-agents] sandbox shutdown failed during shutdown:`, + err + ) + }), + new Promise((resolve) => setTimeout(resolve, 5_000)), + ]) + } this.bootstrap = null }