From 78dbae6457c391d7e44d86711f16a8acdab881aa Mon Sep 17 00:00:00 2001 From: syn Date: Sun, 29 Mar 2026 14:58:41 -0500 Subject: [PATCH 01/15] feat(kiloclaw): add KiloClawRegistry DO + wire provision/destroy/proxy through registry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the KiloClawRegistry Durable Object (SQLite-backed via Drizzle ORM) that indexes instances per owner (user or org). Wire provision, destroy, and catch-all proxy flows through the registry. Enable lazy migration of legacy instances from Postgres on first access. Key changes: - KiloClawRegistry DO with listInstances, createInstance, destroyInstance, resolveDoKey, findInstancesForUser methods - Lazy migration: reads legacy instance from Postgres via Hyperdrive on first listInstances() call, with 60s retry cooldown - Catch-all proxy reads sandboxId from DO status (not middleware) for gateway token derivation — critical for instance-keyed DOs using ki_ sandboxIds - Registry create/destroy are best-effort (non-fatal errors) - resolveRegistryEntry falls back to legacy idFromName(userId) on registry failure - ensureActiveInstance supports org instances with instance-keyed sandboxId derivation - restoreFromPostgres accepts opts.sandboxId for precise multi-instance lookup - tRPC router threads instanceId to worker for all provisions/destroys --- kiloclaw/drizzle.config.ts | 7 + kiloclaw/drizzle/0000_messy_grim_reaper.sql | 7 + kiloclaw/drizzle/meta/0000_snapshot.json | 63 ++++++ kiloclaw/drizzle/meta/_journal.json | 13 ++ kiloclaw/drizzle/migrations.js | 9 + kiloclaw/package.json | 1 + kiloclaw/src/db/sqlite-schema.ts | 10 + .../kiloclaw-instance/index.ts | 4 +- .../kiloclaw-instance/postgres.ts | 25 +- .../src/durable-objects/kiloclaw-registry.ts | 214 ++++++++++++++++++ kiloclaw/src/index.ts | 95 ++++++-- kiloclaw/src/routes/platform.ts | 52 ++++- kiloclaw/src/test-utils.ts | 1 + kiloclaw/src/types.ts | 2 + kiloclaw/tsconfig.json | 7 +- 
kiloclaw/vitest.config.ts | 2 + kiloclaw/wrangler.jsonc | 8 + pnpm-lock.yaml | 25 +- src/lib/kiloclaw/instance-registry.ts | 62 +++-- src/routers/kiloclaw-router.ts | 24 +- 20 files changed, 577 insertions(+), 54 deletions(-) create mode 100644 kiloclaw/drizzle.config.ts create mode 100644 kiloclaw/drizzle/0000_messy_grim_reaper.sql create mode 100644 kiloclaw/drizzle/meta/0000_snapshot.json create mode 100644 kiloclaw/drizzle/meta/_journal.json create mode 100644 kiloclaw/drizzle/migrations.js create mode 100644 kiloclaw/src/db/sqlite-schema.ts create mode 100644 kiloclaw/src/durable-objects/kiloclaw-registry.ts diff --git a/kiloclaw/drizzle.config.ts b/kiloclaw/drizzle.config.ts new file mode 100644 index 000000000..6f1812413 --- /dev/null +++ b/kiloclaw/drizzle.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from 'drizzle-kit'; +export default defineConfig({ + out: './drizzle', + schema: './src/db/sqlite-schema.ts', + dialect: 'sqlite', + driver: 'durable-sqlite', +}); diff --git a/kiloclaw/drizzle/0000_messy_grim_reaper.sql b/kiloclaw/drizzle/0000_messy_grim_reaper.sql new file mode 100644 index 000000000..9e05eb8c6 --- /dev/null +++ b/kiloclaw/drizzle/0000_messy_grim_reaper.sql @@ -0,0 +1,7 @@ +CREATE TABLE `instances` ( + `instance_id` text PRIMARY KEY NOT NULL, + `do_key` text NOT NULL, + `assigned_user_id` text NOT NULL, + `created_at` text NOT NULL, + `destroyed_at` text +); diff --git a/kiloclaw/drizzle/meta/0000_snapshot.json b/kiloclaw/drizzle/meta/0000_snapshot.json new file mode 100644 index 000000000..0e4905830 --- /dev/null +++ b/kiloclaw/drizzle/meta/0000_snapshot.json @@ -0,0 +1,63 @@ +{ + "version": "6", + "dialect": "sqlite", + "id": "f488eead-1986-472a-8973-f487dc7599bf", + "prevId": "00000000-0000-0000-0000-000000000000", + "tables": { + "instances": { + "name": "instances", + "columns": { + "instance_id": { + "name": "instance_id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "do_key": { + 
"name": "do_key", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "assigned_user_id": { + "name": "assigned_user_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "created_at": { + "name": "created_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "destroyed_at": { + "name": "destroyed_at", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + } + }, + "views": {}, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + }, + "internal": { + "indexes": {} + } +} \ No newline at end of file diff --git a/kiloclaw/drizzle/meta/_journal.json b/kiloclaw/drizzle/meta/_journal.json new file mode 100644 index 000000000..23010e462 --- /dev/null +++ b/kiloclaw/drizzle/meta/_journal.json @@ -0,0 +1,13 @@ +{ + "version": "7", + "dialect": "sqlite", + "entries": [ + { + "idx": 0, + "version": "6", + "when": 1774809105002, + "tag": "0000_messy_grim_reaper", + "breakpoints": true + } + ] +} \ No newline at end of file diff --git a/kiloclaw/drizzle/migrations.js b/kiloclaw/drizzle/migrations.js new file mode 100644 index 000000000..809f6627d --- /dev/null +++ b/kiloclaw/drizzle/migrations.js @@ -0,0 +1,9 @@ +import journal from './meta/_journal.json'; +import m0000 from './0000_messy_grim_reaper.sql'; + +export default { + journal, + migrations: { + m0000, + }, +}; diff --git a/kiloclaw/package.json b/kiloclaw/package.json index f0260ec4f..951d5ad96 100644 --- a/kiloclaw/package.json +++ b/kiloclaw/package.json @@ -33,6 +33,7 @@ "@types/node": "^22.19.15", "@typescript/native-preview": "catalog:", "@vitest/coverage-v8": "^4.1.0", + "drizzle-kit": "catalog:", "typescript": "catalog:", "vitest": "^4.1.0", "wrangler": "catalog:" diff --git 
a/kiloclaw/src/db/sqlite-schema.ts b/kiloclaw/src/db/sqlite-schema.ts new file mode 100644 index 000000000..af81fdbaa --- /dev/null +++ b/kiloclaw/src/db/sqlite-schema.ts @@ -0,0 +1,10 @@ +import { sqliteTable, text } from 'drizzle-orm/sqlite-core'; + +/** Registry DO SQLite table: tracks instance ownership per registry (user or org). */ +export const registryInstances = sqliteTable('instances', { + instance_id: text('instance_id').primaryKey(), + do_key: text('do_key').notNull(), + assigned_user_id: text('assigned_user_id').notNull(), + created_at: text('created_at').notNull(), + destroyed_at: text('destroyed_at'), +}); diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts index aa7287672..e1bca1761 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts @@ -927,7 +927,9 @@ export class KiloClawInstance extends DurableObject { if (!this.s.userId || !this.s.sandboxId) { const restoreUserId = userId ?? 
this.s.userId; if (restoreUserId) { - await restoreFromPostgres(this.env, this.ctx, this.s, restoreUserId); + await restoreFromPostgres(this.env, this.ctx, this.s, restoreUserId, { + sandboxId: this.s.sandboxId, + }); } } diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/postgres.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/postgres.ts index cadbf003d..6b4f8f4cc 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/postgres.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/postgres.ts @@ -1,6 +1,11 @@ import type { KiloClawEnv } from '../../types'; import type { EncryptedEnvelope } from '../../schemas/instance-config'; -import { getWorkerDb, getActiveInstance, markInstanceDestroyed } from '../../db'; +import { + getWorkerDb, + getActiveInstance, + getInstanceBySandboxId, + markInstanceDestroyed, +} from '../../db'; import { appNameFromUserId } from '../../fly/apps'; import type { InstanceMutableState } from './types'; import { getFlyConfig } from './types'; @@ -8,14 +13,24 @@ import { storageUpdate } from './state'; import { attemptMetadataRecovery } from './reconcile'; import { doError, doWarn, toLoggable, createReconcileContext } from './log'; +type RestoreOpts = { + /** If the DO has a stored sandboxId, use it for precise lookup. */ + sandboxId?: string | null; +}; + /** * Restore DO state from Postgres backup if SQLite was wiped. + * + * Lookup priority: + * 1. If opts.sandboxId is provided, look up by sandbox_id (precise, multi-instance safe). + * 2. Otherwise, fall back to getActiveInstance(db, userId) (legacy single-instance). 
*/ export async function restoreFromPostgres( env: KiloClawEnv, ctx: DurableObjectState, state: InstanceMutableState, - userId: string + userId: string, + opts?: RestoreOpts ): Promise { const connectionString = env.HYPERDRIVE?.connectionString; if (!connectionString) { @@ -25,7 +40,11 @@ export async function restoreFromPostgres( try { const db = getWorkerDb(connectionString); - const instance = await getActiveInstance(db, userId); + + // Prefer sandboxId lookup (multi-instance safe) over userId lookup (ambiguous). + const instance = opts?.sandboxId + ? await getInstanceBySandboxId(db, opts.sandboxId) + : await getActiveInstance(db, userId); if (!instance) { doWarn(state, 'No active instance found in Postgres', { userId }); diff --git a/kiloclaw/src/durable-objects/kiloclaw-registry.ts b/kiloclaw/src/durable-objects/kiloclaw-registry.ts new file mode 100644 index 000000000..2f8213ed4 --- /dev/null +++ b/kiloclaw/src/durable-objects/kiloclaw-registry.ts @@ -0,0 +1,214 @@ +import { DurableObject } from 'cloudflare:workers'; +import { drizzle, type DrizzleSqliteDODatabase } from 'drizzle-orm/durable-sqlite'; +import { migrate } from 'drizzle-orm/durable-sqlite/migrator'; +import { eq, isNull, and } from 'drizzle-orm'; +import migrations from '../../drizzle/migrations'; +import { registryInstances } from '../db/sqlite-schema'; +import { getWorkerDb, getActiveInstance } from '../db'; +import type { KiloClawEnv } from '../types'; + +export type RegistryEntry = { + instanceId: string; + doKey: string; + assignedUserId: string; + createdAt: string; + destroyedAt: string | null; +}; + +function rowToEntry(row: typeof registryInstances.$inferSelect): RegistryEntry { + return { + instanceId: row.instance_id, + doKey: row.do_key, + assignedUserId: row.assigned_user_id, + createdAt: row.created_at, + destroyedAt: row.destroyed_at, + }; +} + +/** + * KiloClawRegistry DO — SQLite-backed index of instances per owner. 
+ * + * Keyed by `user:{userId}` (personal) or `org:{orgId}` (org). + * Each instance has its own isolated SQLite database. Migrations run + * per-instance on first access after deploy. + * + * Lazy migration: on first listInstances() for a user registry that has + * no entries, reads the legacy instance row from Postgres via Hyperdrive + * and backfills a registry entry. + */ +export class KiloClawRegistry extends DurableObject { + private db: DrizzleSqliteDODatabase; + private ownerKey: string | null = null; + private migrated = false; + private lastMigrationAttempt = 0; + + /** Cooldown between lazy migration retries when Hyperdrive/Postgres is unavailable. */ + private static MIGRATION_RETRY_COOLDOWN_MS = 60_000; + + constructor(ctx: DurableObjectState, env: KiloClawEnv) { + super(ctx, env); + this.db = drizzle(ctx.storage, { logger: false }); + void ctx.blockConcurrencyWhile(async () => { + await migrate(this.db, migrations); + this.ownerKey = (await ctx.storage.get('owner_key')) ?? null; + this.migrated = (await ctx.storage.get('migrated')) ?? false; + }); + } + + // -- Owner key management -------------------------------------------------- + + /** + * Store the owner key on first call. Subsequent calls validate consistency. + * Every public method receives ownerKey as its first argument; this method + * is called internally at the top of each. 
+ */ + private async ensureOwnerKey(ownerKey: string): Promise { + if (this.ownerKey === ownerKey) return; + if (this.ownerKey !== null) { + throw new Error( + `Registry owner key mismatch: stored="${this.ownerKey}", received="${ownerKey}"` + ); + } + this.ownerKey = ownerKey; + await this.ctx.storage.put('owner_key', ownerKey); + } + + // -- Public RPC methods ---------------------------------------------------- + + async listInstances(ownerKey: string): Promise { + await this.ensureOwnerKey(ownerKey); + + if (!this.migrated) { + const now = Date.now(); + if (now - this.lastMigrationAttempt >= KiloClawRegistry.MIGRATION_RETRY_COOLDOWN_MS) { + this.lastMigrationAttempt = now; + await this.lazyMigrate(); + } + } + + return this.db + .select() + .from(registryInstances) + .where(isNull(registryInstances.destroyed_at)) + .all() + .map(rowToEntry); + } + + async createInstance( + ownerKey: string, + assignedUserId: string, + instanceId: string, + doKey: string + ): Promise { + await this.ensureOwnerKey(ownerKey); + + this.db + .insert(registryInstances) + .values({ + instance_id: instanceId, + do_key: doKey, + assigned_user_id: assignedUserId, + created_at: new Date().toISOString(), + }) + .onConflictDoNothing() + .run(); + } + + async destroyInstance(ownerKey: string, instanceId: string): Promise { + await this.ensureOwnerKey(ownerKey); + + this.db + .update(registryInstances) + .set({ destroyed_at: new Date().toISOString() }) + .where( + and(eq(registryInstances.instance_id, instanceId), isNull(registryInstances.destroyed_at)) + ) + .run(); + } + + async resolveDoKey(ownerKey: string, instanceId: string): Promise { + await this.ensureOwnerKey(ownerKey); + + const row = this.db + .select({ do_key: registryInstances.do_key }) + .from(registryInstances) + .where( + and(eq(registryInstances.instance_id, instanceId), isNull(registryInstances.destroyed_at)) + ) + .get(); + + return row?.do_key ?? 
null; + } + + async findInstancesForUser(ownerKey: string, userId: string): Promise { + await this.ensureOwnerKey(ownerKey); + + return this.db + .select() + .from(registryInstances) + .where( + and(eq(registryInstances.assigned_user_id, userId), isNull(registryInstances.destroyed_at)) + ) + .all() + .map(rowToEntry); + } + + // -- Lazy migration -------------------------------------------------------- + + /** + * Backfill registry from Postgres for user registries. + * + * Only runs for `user:{userId}` registries. Org registries have no legacy + * instances to migrate. + * + * Migration reads the active instance row from Postgres via Hyperdrive. + * If Hyperdrive is unavailable, migration is deferred to the next access. + */ + private async lazyMigrate(): Promise { + const ownerKey = this.ownerKey; + if (!ownerKey?.startsWith('user:')) { + // Org registries have no legacy instances to migrate + this.migrated = true; + await this.ctx.storage.put('migrated', true); + return; + } + + const userId = ownerKey.slice('user:'.length); + + const connectionString = this.env.HYPERDRIVE?.connectionString; + if (!connectionString) { + // Hyperdrive unavailable — defer migration, next access will retry + console.warn('[Registry] HYPERDRIVE not configured, deferring lazy migration'); + return; + } + + try { + const db = getWorkerDb(connectionString); + const instance = await getActiveInstance(db, userId); + + if (instance) { + // Legacy instance found in Postgres — backfill registry entry. + // do_key = userId for legacy instances (DO keyed by idFromName(userId)). + this.db + .insert(registryInstances) + .values({ + instance_id: instance.id, + do_key: userId, + assigned_user_id: userId, + created_at: new Date().toISOString(), + }) + .onConflictDoNothing() + .run(); + } + // No Postgres row means no legacy instance — Postgres is the source of truth. 
+ // Orphaned DOs (state but no Postgres row) only occur via manual DB deletion + // and are handled by the resolveRegistryEntry fallback in index.ts. + + this.migrated = true; + await this.ctx.storage.put('migrated', true); + } catch (err) { + // Postgres/Hyperdrive error — defer migration, next access will retry after cooldown + console.error('[Registry] Lazy migration failed, will retry on next access:', err); + } + } +} diff --git a/kiloclaw/src/index.ts b/kiloclaw/src/index.ts index 3df12f7b5..36bd19678 100644 --- a/kiloclaw/src/index.ts +++ b/kiloclaw/src/index.ts @@ -27,10 +27,12 @@ import { startingUpPage } from './pages/starting-up'; import { buildForwardHeaders } from './utils/proxy-headers'; import { timingMiddleware } from './middleware/analytics'; import { writeEvent } from './utils/analytics'; +import type { RegistryEntry } from './durable-objects/kiloclaw-registry'; // Export DOs (match wrangler.jsonc class_name bindings) export { KiloClawInstance } from './durable-objects/kiloclaw-instance'; export { KiloClawApp } from './durable-objects/kiloclaw-app'; +export { KiloClawRegistry } from './durable-objects/kiloclaw-registry'; // ============================================================================= // Helpers @@ -313,6 +315,46 @@ app.all('/i/:instanceId/*', async c => { // CATCH-ALL: Proxy to per-user OpenClaw gateway via Fly Proxy // ============================================================================= +/** + * Resolve the user's default personal instance DO stub via the registry. + * Returns the stub and its DO key, or null if no instance exists. + * Triggers lazy migration on first access. + * + * Falls back to legacy direct userId-keyed DO lookup if the Registry DO + * is unavailable (e.g., migration error, transient failure). This ensures + * proxy access is preserved even when the registry is unhealthy. 
+ */ +async function resolveRegistryEntry(c: Context) { + const userId = c.get('userId'); + if (!userId) return null; + + try { + const registryKey = `user:${userId}`; + const registryStub = c.env.KILOCLAW_REGISTRY.get( + c.env.KILOCLAW_REGISTRY.idFromName(registryKey) + ); + const entries = await registryStub.listInstances(registryKey); + if (entries.length === 0) return null; + + const entry = entries[0]; + const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(entry.doKey)); + return { stub, entry }; + } catch (err) { + // Registry DO failed — fall back to legacy direct userId-keyed lookup. + // This preserves proxy access during registry outages / migration errors. + console.error('[PROXY] Registry lookup failed, falling back to legacy DO:', err); + const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(userId)); + const fallbackEntry: RegistryEntry = { + doKey: userId, + instanceId: '', + assignedUserId: userId, + createdAt: '', + destroyedAt: null, + }; + return { stub, entry: fallbackEntry }; + } +} + /** * Attempt crash recovery: if the user's instance has status 'running' but * the machine is dead, call start() to restart it transparently. @@ -323,7 +365,9 @@ async function attemptCrashRecovery(c: Context): Promise { const startedAt = performance.now(); try { - const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(userId)); + const resolved = await resolveRegistryEntry(c); + if (!resolved) return false; + const { stub } = resolved; const status = await stub.getStatus(); if (status.status !== 'running') { @@ -339,7 +383,7 @@ async function attemptCrashRecovery(c: Context): Promise { event: 'instance.crash_recovery_succeeded', delivery: 'http', userId, - sandboxId: c.get('sandboxId') ?? undefined, + sandboxId: freshStatus.sandboxId ?? undefined, flyMachineId: freshStatus.flyMachineId ?? undefined, flyAppName: freshStatus.flyAppName ?? undefined, status: freshStatus.status ?? 
undefined, @@ -362,36 +406,48 @@ async function attemptCrashRecovery(c: Context): Promise { } /** - * Resolve the flyMachineId, flyAppName, and status for the current user from their DO. + * Resolve the flyMachineId, flyAppName, sandboxId, and status for the current user from their DO. * Returns null machineId if the instance is destroying (blocks proxy during teardown). + * Routes through the user registry, which triggers lazy migration on first access. + * + * The returned sandboxId is the DO's authoritative value — it may differ from the + * middleware-derived `c.get('sandboxId')` for instance-keyed DOs (which use `ki_` prefix). + * Callers MUST use the returned sandboxId for gateway token derivation. */ async function resolveInstance(c: Context): Promise<{ machineId: string | null; flyAppName: string | null; + sandboxId: string | null; status: string | null; }> { - const userId = c.get('userId'); - if (!userId) return { machineId: null, flyAppName: null, status: null }; - - const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(userId)); - const s = await stub.getStatus(); - - if (s.status === 'destroying') return { machineId: null, flyAppName: null, status: 'destroying' }; - if (s.status === 'restoring') return { machineId: null, flyAppName: null, status: 'restoring' }; - - return { machineId: s.flyMachineId, flyAppName: s.flyAppName, status: s.status }; + const resolved = await resolveRegistryEntry(c); + if (!resolved) return { machineId: null, flyAppName: null, sandboxId: null, status: null }; + + const s = await resolved.stub.getStatus(); + + if (s.status === 'destroying') + return { machineId: null, flyAppName: null, sandboxId: null, status: 'destroying' }; + if (s.status === 'restoring') + return { machineId: null, flyAppName: null, sandboxId: null, status: 'restoring' }; + + return { + machineId: s.flyMachineId, + flyAppName: s.flyAppName, + sandboxId: s.sandboxId, + status: s.status, + }; } app.all('*', async c => { - const 
sandboxId = c.get('sandboxId'); - if (!sandboxId) { + // Auth gate: middleware-derived sandboxId proves the user is authenticated. + if (!c.get('sandboxId')) { return c.json( { error: 'Authentication required', hint: 'No active session. Please log in.' }, 401 ); } - const { machineId, flyAppName, status } = await resolveInstance(c); + const { machineId, flyAppName, sandboxId, status } = await resolveInstance(c); if (status === 'destroying') { return c.json( { error: 'Instance is being destroyed', hint: 'This instance is being torn down.' }, @@ -416,6 +472,9 @@ app.all('*', async c => { 404 ); } + if (!sandboxId) { + return c.json({ error: 'Instance has no sandboxId' }, 500); + } // Per-user app name, with legacy fallback for existing instances const appName = flyAppName ?? c.env.FLY_APP_NAME; @@ -436,6 +495,10 @@ app.all('*', async c => { return c.json({ error: 'Configuration error' }, 503); } + // Use the DO's authoritative sandboxId for gateway token derivation. + // This is critical: instance-keyed DOs derive sandboxId from instanceId (ki_ prefix), + // which differs from the middleware-derived value (sandboxIdFromUserId). The gateway + // token must match what the machine expects. const forwardHeaders = await buildForwardHeaders({ requestHeaders: request.headers, machineId, diff --git a/kiloclaw/src/routes/platform.ts b/kiloclaw/src/routes/platform.ts index 59c3b452b..4b4ed8f20 100644 --- a/kiloclaw/src/routes/platform.ts +++ b/kiloclaw/src/routes/platform.ts @@ -300,8 +300,9 @@ platform.post('/provision', async c => { pinnedImageTag, } = result.data; + let provision; try { - const provision = await withDORetry( + provision = await withDORetry( instanceStubFactory(c.env, userId, instanceId), stub => stub.provision( @@ -321,7 +322,6 @@ platform.post('/provision', async c => { ), 'provision' ); - return c.json(provision, 201); } catch (err) { const raw = err instanceof Error ? 
err.message : 'Unknown error'; if (raw.includes('duplicate key') || raw.includes('unique constraint')) { @@ -331,6 +331,24 @@ platform.post('/provision', async c => { const { message, status } = sanitizeError(err, 'provision'); return jsonError(message, status); } + + // Record the instance in the appropriate registry (best-effort). + // instanceId is always provided by Next.js (the Postgres row UUID). + if (instanceId) { + try { + const registryKey = orgId ? `org:${orgId}` : `user:${userId}`; + const registryStub = c.env.KILOCLAW_REGISTRY.get( + c.env.KILOCLAW_REGISTRY.idFromName(registryKey) + ); + // doKey = instanceId: all new provisions create DOs keyed by instanceId. + // For lazy-migrated legacy instances, doKey = userId (set in lazyMigrate). + await registryStub.createInstance(registryKey, userId, instanceId, instanceId); + } catch (registryErr) { + console.error('[platform] Registry create failed (non-fatal):', registryErr); + } + } + + return c.json(provision, 201); }); // PATCH /api/platform/kilocode-config @@ -1138,12 +1156,40 @@ platform.post('/destroy', async c => { if ('error' in iidResult) return iidResult.error; const { instanceId } = iidResult; + const { userId } = result.data; + + // Read the instance's orgId before destroying so we can update the correct registry. + let orgId: string | null = null; + if (instanceId) { + try { + const statusStub = instanceStubFactory(c.env, userId, instanceId)(); + const status = await statusStub.getStatus(); + orgId = status.orgId; + } catch { + // If we can't read status, proceed with destroy — registry cleanup is best-effort. + } + } + try { await withDORetry( - instanceStubFactory(c.env, result.data.userId, instanceId), + instanceStubFactory(c.env, userId, instanceId), stub => stub.destroy(), 'destroy' ); + + // Remove the instance from the registry (best-effort). + if (instanceId) { + try { + const registryKey = orgId ? 
`org:${orgId}` : `user:${userId}`; + const registryStub = c.env.KILOCLAW_REGISTRY.get( + c.env.KILOCLAW_REGISTRY.idFromName(registryKey) + ); + await registryStub.destroyInstance(registryKey, instanceId); + } catch (registryErr) { + console.error('[platform] Registry destroy failed (non-fatal):', registryErr); + } + } + return c.json({ ok: true }); } catch (err) { const { message, status } = sanitizeError(err, 'destroy'); diff --git a/kiloclaw/src/test-utils.ts b/kiloclaw/src/test-utils.ts index c00bc37f0..ce679c56d 100644 --- a/kiloclaw/src/test-utils.ts +++ b/kiloclaw/src/test-utils.ts @@ -29,6 +29,7 @@ export function createMockEnv(overrides: Partial = {}): KiloClawEnv return { KILOCLAW_INSTANCE: {} as unknown as KiloClawEnv['KILOCLAW_INSTANCE'], KILOCLAW_APP: {} as unknown as KiloClawEnv['KILOCLAW_APP'], + KILOCLAW_REGISTRY: {} as unknown as KiloClawEnv['KILOCLAW_REGISTRY'], KILOCLAW_CONTROLLER_AE: { writeDataPoint: vi.fn(), } as unknown as KiloClawEnv['KILOCLAW_CONTROLLER_AE'], diff --git a/kiloclaw/src/types.ts b/kiloclaw/src/types.ts index e4d5ae6bd..b2fb5f51b 100644 --- a/kiloclaw/src/types.ts +++ b/kiloclaw/src/types.ts @@ -1,5 +1,6 @@ import type { KiloClawInstance } from './durable-objects/kiloclaw-instance'; import type { KiloClawApp } from './durable-objects/kiloclaw-app'; +import type { KiloClawRegistry } from './durable-objects/kiloclaw-registry'; import type { SnapshotRestoreMessage } from './schemas/snapshot-restore'; /** @@ -8,6 +9,7 @@ import type { SnapshotRestoreMessage } from './schemas/snapshot-restore'; export type KiloClawEnv = { KILOCLAW_INSTANCE: DurableObjectNamespace; KILOCLAW_APP: DurableObjectNamespace; + KILOCLAW_REGISTRY: DurableObjectNamespace; KILOCLAW_AE?: AnalyticsEngineDataset; KILOCLAW_CONTROLLER_AE: AnalyticsEngineDataset; HYPERDRIVE?: Hyperdrive; diff --git a/kiloclaw/tsconfig.json b/kiloclaw/tsconfig.json index ce9943bf8..213613389 100644 --- a/kiloclaw/tsconfig.json +++ b/kiloclaw/tsconfig.json @@ -9,12 +9,15 @@ 
"forceConsistentCasingInFileNames": true, "strict": true, "skipLibCheck": true, - "noEmit": true + "noEmit": true, + "allowJs": true, + "resolveJsonModule": true }, "include": [ "worker-configuration.d.ts", "src/**/*.ts", "controller/src/**/*.ts", - "vitest.config.ts" + "vitest.config.ts", + "drizzle/**/*" ] } diff --git a/kiloclaw/vitest.config.ts b/kiloclaw/vitest.config.ts index b47859ea5..bc58e37f7 100644 --- a/kiloclaw/vitest.config.ts +++ b/kiloclaw/vitest.config.ts @@ -1,6 +1,8 @@ import { defineConfig } from 'vitest/config'; export default defineConfig({ + // Treat .sql imports as raw text (needed for drizzle-orm/durable-sqlite migrations) + assetsInclude: ['**/*.sql'], test: { name: 'unit', globals: true, diff --git a/kiloclaw/wrangler.jsonc b/kiloclaw/wrangler.jsonc index ce0a1e7e0..e693a5d1c 100644 --- a/kiloclaw/wrangler.jsonc +++ b/kiloclaw/wrangler.jsonc @@ -31,6 +31,10 @@ "class_name": "KiloClawApp", "name": "KILOCLAW_APP", }, + { + "class_name": "KiloClawRegistry", + "name": "KILOCLAW_REGISTRY", + }, ], }, "migrations": [ @@ -50,6 +54,10 @@ "new_sqlite_classes": ["KiloClawApp"], "tag": "v5", }, + { + "new_sqlite_classes": ["KiloClawRegistry"], + "tag": "v6", + }, ], "kv_namespaces": [ { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a9caf244b..29185143c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1503,7 +1503,7 @@ importers: version: 4.0.2 knip: specifier: ^5.86.0 - version: 5.86.0(@types/node@22.19.15)(typescript@5.9.3) + version: 5.86.0(@types/node@25.5.0)(typescript@5.9.3) typescript: specifier: 'catalog:' version: 5.9.3 @@ -1544,6 +1544,9 @@ importers: '@vitest/coverage-v8': specifier: ^4.1.0 version: 4.1.0(vitest@4.1.0(@opentelemetry/api@1.9.0)(@types/node@22.19.15)(esbuild@0.27.4)(jiti@2.6.1)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.2)) + drizzle-kit: + specifier: 'catalog:' + version: 0.31.9 typescript: specifier: 'catalog:' version: 5.9.3 @@ -21204,7 +21207,7 @@ snapshots: sirv: 3.0.2 tinyglobby: 0.2.15 tinyrainbow: 2.0.0 - vitest: 
3.2.4(@types/debug@4.1.12)(@types/node@25.5.0)(@vitest/ui@3.2.4)(esbuild@0.27.4)(jiti@2.6.1)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.2) + vitest: 3.2.4(@types/debug@4.1.12)(@types/node@22.19.15)(@vitest/ui@3.2.4)(esbuild@0.27.4)(jiti@2.6.1)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.2) '@vitest/utils@3.2.4': dependencies: @@ -25085,6 +25088,24 @@ snapshots: yaml: 2.8.2 zod: 4.3.6 + knip@5.86.0(@types/node@25.5.0)(typescript@5.9.3): + dependencies: + '@nodelib/fs.walk': 1.2.8 + '@types/node': 25.5.0 + fast-glob: 3.3.3 + formatly: 0.3.0 + jiti: 2.6.1 + minimist: 1.2.8 + oxc-resolver: 11.19.1 + picocolors: 1.1.1 + picomatch: 4.0.3 + smol-toml: 1.6.0 + strip-json-comments: 5.0.3 + typescript: 5.9.3 + unbash: 2.2.0 + yaml: 2.8.2 + zod: 4.3.6 + lan-network@0.2.0: {} leven@3.1.0: {} diff --git a/src/lib/kiloclaw/instance-registry.ts b/src/lib/kiloclaw/instance-registry.ts index fa896a009..e2bec6b0c 100644 --- a/src/lib/kiloclaw/instance-registry.ts +++ b/src/lib/kiloclaw/instance-registry.ts @@ -3,7 +3,7 @@ import 'server-only'; import { and, eq, isNull } from 'drizzle-orm'; import { kiloclaw_instances } from '@kilocode/db/schema'; import { db } from '@/lib/drizzle'; -import { sandboxIdFromUserId } from '@/lib/kiloclaw/sandbox-id'; +import { sandboxIdFromUserId, sandboxIdFromInstanceId } from '@/lib/kiloclaw/sandbox-id'; export type ActiveKiloClawInstance = { id: string; @@ -23,30 +23,58 @@ type EnsureActiveInstanceOpts = { * This is idempotent and safe under concurrent calls. * * The returned `id` (DB row UUID) serves as the instanceId for multi-instance - * routing. For legacy personal flow, sandboxId is derived from userId. - * For new multi-instance flows (PR 2+), callers use the returned `id` as the - * DO key and derive sandboxId from it via `sandboxIdFromInstanceId(id)`. + * routing. + * + * For legacy personal flow (no opts.orgId): sandboxId is derived from userId, + * DO key = userId. Idempotent via onConflictDoNothing on the unique index. 
+ * + * For org instances (opts.orgId present): sandboxId is derived from a freshly + * generated UUID (the row's id), DO key = instanceId. Not idempotent — each + * call creates a new instance row. */ export async function ensureActiveInstance( userId: string, opts?: EnsureActiveInstanceOpts ): Promise { - const sandboxId = sandboxIdFromUserId(userId); - - const values: { - user_id: string; - sandbox_id: string; - organization_id?: string; - } = { - user_id: userId, - sandbox_id: sandboxId, - }; - if (opts?.orgId) { - values.organization_id = opts.orgId; + // Org instance: generate UUID, derive sandboxId from it. + // Each call creates a new row (no idempotency — callers gate on existing rows). + const instanceId = crypto.randomUUID(); + const sandboxId = sandboxIdFromInstanceId(instanceId); + + const [row] = await db + .insert(kiloclaw_instances) + .values({ + id: instanceId, + user_id: userId, + sandbox_id: sandboxId, + organization_id: opts.orgId, + }) + .returning({ + id: kiloclaw_instances.id, + userId: kiloclaw_instances.user_id, + sandboxId: kiloclaw_instances.sandbox_id, + organizationId: kiloclaw_instances.organization_id, + name: kiloclaw_instances.name, + }); + + if (!row) { + throw new Error('Failed to create org instance row'); + } + + return row; } - await db.insert(kiloclaw_instances).values(values).onConflictDoNothing(); + // Legacy personal flow: derive sandboxId from userId. Idempotent. 
+ const sandboxId = sandboxIdFromUserId(userId); + + await db + .insert(kiloclaw_instances) + .values({ + user_id: userId, + sandbox_id: sandboxId, + }) + .onConflictDoNothing(); const [row] = await db .select({ diff --git a/src/routers/kiloclaw-router.ts b/src/routers/kiloclaw-router.ts index a530ed842..71cf69167 100644 --- a/src/routers/kiloclaw-router.ts +++ b/src/routers/kiloclaw-router.ts @@ -336,15 +336,19 @@ async function provisionInstance( const client = new KiloClawInternalClient(); try { - return await client.provision(user.id, { - envVars: input.envVars, - encryptedSecrets, - channels: buildWorkerChannels(input.channels), - kilocodeApiKey, - kilocodeApiKeyExpiresAt, - kilocodeDefaultModel: input.kilocodeDefaultModel ?? undefined, - pinnedImageTag, - }); + return await client.provision( + user.id, + { + envVars: input.envVars, + encryptedSecrets, + channels: buildWorkerChannels(input.channels), + kilocodeApiKey, + kilocodeApiKeyExpiresAt, + kilocodeDefaultModel: input.kilocodeDefaultModel ?? undefined, + pinnedImageTag, + }, + { instanceId: instanceRow.id } + ); } catch (error) { // Only clean up the exact row this attempt created. Target by primary // key so a concurrent request's row is never affected. 
@@ -698,7 +702,7 @@ export const kiloclawRouter = createTRPCRouter({ const client = new KiloClawInternalClient(); let result; try { - result = await client.destroy(ctx.user.id); + result = await client.destroy(ctx.user.id, destroyedRow?.id); } catch (error) { if (destroyedRow) { await restoreDestroyedInstance(destroyedRow.id); From f82197f6227f79c2b1fb8a16ea5feeebefc49821 Mon Sep 17 00:00:00 2001 From: syn Date: Sun, 29 Mar 2026 16:17:07 -0500 Subject: [PATCH 02/15] feat(kiloclaw): thread instanceId through all lifecycle callers Complete the instance-keyed DO migration by threading instanceId through every caller that resolves a KiloClawInstance DO stub: Worker: - All ~30 platform.ts routes now parse ?instanceId= and pass to instanceStubFactory (3-arg calls) - controller.ts handles ki_ sandboxIds via isInstanceKeyedSandboxId to resolve the correct DO key - Snapshot-restore queue message includes optional instanceId; consumer uses it as DO key when present Internal client: - All ~30 instance-scoped methods accept optional instanceId as last parameter, forwarded as ?instanceId= query param Next.js callers: - All tRPC router methods call getActiveInstance(userId) and pass instance?.id to internal client - Admin router methods pass instance.id from DB lookups - Billing cron + autoResumeIfSuspended already had instanceId (verified pre-existing) New exports from @kilocode/worker-utils/instance-id: - isInstanceKeyedSandboxId(sandboxId): boolean - instanceIdFromSandboxId(sandboxId): string --- .../kiloclaw-instance/index.ts | 8 + kiloclaw/src/index.ts | 8 +- kiloclaw/src/queue/snapshot-restore.ts | 5 +- kiloclaw/src/routes/controller.ts | 26 +- kiloclaw/src/routes/platform.ts | 186 +++++++++--- kiloclaw/src/schemas/snapshot-restore.ts | 1 + packages/worker-utils/src/instance-id.ts | 17 ++ src/lib/kiloclaw/kiloclaw-internal-client.ts | 271 ++++++++++++------ .../admin-kiloclaw-instances-router.ts | 79 +++-- src/routers/kiloclaw-router.ts | 110 ++++--- 10 files changed, 510 
insertions(+), 201 deletions(-) diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts index e1bca1761..3030febdf 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts @@ -22,6 +22,10 @@ import { DEFAULT_INSTANCE_FEATURES } from '../../schemas/instance-config'; import type { FlyVolume, FlyVolumeSnapshot } from '../../fly/types'; import * as fly from '../../fly/client'; import { sandboxIdFromUserId, sandboxIdFromInstanceId } from '../../auth/sandbox-id'; +import { + isInstanceKeyedSandboxId, + instanceIdFromSandboxId, +} from '@kilocode/worker-utils/instance-id'; import { resolveLatestVersion, resolveVersionByTag } from '../../lib/image-version'; import { lookupCatalogVersion } from '../../lib/catalog-registration'; import { ImageVariantSchema } from '../../schemas/image-version'; @@ -1709,6 +1713,10 @@ export class KiloClawInstance extends DurableObject { snapshotId, previousVolumeId, region: this.s.flyRegion, + instanceId: + this.s.sandboxId && isInstanceKeyedSandboxId(this.s.sandboxId) + ? instanceIdFromSandboxId(this.s.sandboxId) + : undefined, }); } catch (err) { this.s.status = previousStatus; diff --git a/kiloclaw/src/index.ts b/kiloclaw/src/index.ts index 36bd19678..fa5a5b0cf 100644 --- a/kiloclaw/src/index.ts +++ b/kiloclaw/src/index.ts @@ -340,8 +340,12 @@ async function resolveRegistryEntry(c: Context) { const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(entry.doKey)); return { stub, entry }; } catch (err) { - // Registry DO failed — fall back to legacy direct userId-keyed lookup. - // This preserves proxy access during registry outages / migration errors. + // Registry DO failed. Fall back to the legacy userId-keyed DO. + // Only preserves access for legacy instances (doKey = userId). 
+ // For instance-keyed DOs (doKey = instanceId), this hits the wrong + // (empty) DO — the user sees "not provisioned" until the registry + // recovers. Acceptable: a broken registry is transient, and silently + // routing to the wrong DO would be worse. console.error('[PROXY] Registry lookup failed, falling back to legacy DO:', err); const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(userId)); const fallbackEntry: RegistryEntry = { diff --git a/kiloclaw/src/queue/snapshot-restore.ts b/kiloclaw/src/queue/snapshot-restore.ts index f607c4a1a..17be7467f 100644 --- a/kiloclaw/src/queue/snapshot-restore.ts +++ b/kiloclaw/src/queue/snapshot-restore.ts @@ -54,8 +54,9 @@ export async function handleSnapshotRestoreQueue( continue; } - const { userId, snapshotId, previousVolumeId, region } = parsed.data; - const stub = env.KILOCLAW_INSTANCE.get(env.KILOCLAW_INSTANCE.idFromName(userId)); + const { userId, snapshotId, previousVolumeId, region, instanceId } = parsed.data; + const doKey = instanceId ?? userId; + const stub = env.KILOCLAW_INSTANCE.get(env.KILOCLAW_INSTANCE.idFromName(doKey)); try { // Step 0: Idempotency check — has the volume already been swapped? 
diff --git a/kiloclaw/src/routes/controller.ts b/kiloclaw/src/routes/controller.ts index 711953a59..744f7da72 100644 --- a/kiloclaw/src/routes/controller.ts +++ b/kiloclaw/src/routes/controller.ts @@ -4,6 +4,10 @@ import { z } from 'zod'; import { timingSafeEqual } from '@kilocode/encryption'; import type { AppEnv } from '../types'; import { userIdFromSandboxId } from '../auth/sandbox-id'; +import { + isInstanceKeyedSandboxId, + instanceIdFromSandboxId, +} from '@kilocode/worker-utils/instance-id'; import { deriveGatewayToken } from '../auth/gateway-token'; import { waitUntil } from 'cloudflare:workers'; import { getWorkerDb, findEmailByUserId } from '../db'; @@ -103,14 +107,26 @@ controller.post('/checkin', async (c: Context) => { return c.json({ error: 'Forbidden' }, 403); } + // For instance-keyed sandboxIds (ki_ prefix), the DO key is the instanceId. + // For legacy sandboxIds (base64url), the DO key is the userId. let userId: string; - try { - userId = userIdFromSandboxId(data.sandboxId); - } catch { - return c.json({ error: 'Invalid sandboxId' }, 400); + let doKey: string; + if (isInstanceKeyedSandboxId(data.sandboxId)) { + const instanceId = instanceIdFromSandboxId(data.sandboxId); + doKey = instanceId; + // We don't know the userId from the sandboxId alone for instance-keyed DOs. + // The DO will validate auth via the API key check below. 
+ userId = instanceId; // placeholder — not used for auth, only for logging + } else { + try { + userId = userIdFromSandboxId(data.sandboxId); + } catch { + return c.json({ error: 'Invalid sandboxId' }, 400); + } + doKey = userId; } - const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(userId)); + const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(doKey)); const config = await stub.getConfig().catch(() => null); if (!config?.kilocodeApiKey || !timingSafeEqual(apiKey, config.kilocodeApiKey)) { return c.json({ error: 'Forbidden' }, 403); diff --git a/kiloclaw/src/routes/platform.ts b/kiloclaw/src/routes/platform.ts index 4b4ed8f20..4a7a9a9bb 100644 --- a/kiloclaw/src/routes/platform.ts +++ b/kiloclaw/src/routes/platform.ts @@ -356,11 +356,14 @@ platform.patch('/kilocode-config', async c => { const result = await parseBody(c, KiloCodeConfigPatchSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, kilocodeApiKey, kilocodeApiKeyExpiresAt, kilocodeDefaultModel } = result.data; try { const updated = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.updateKiloCodeConfig({ kilocodeApiKey, @@ -381,11 +384,14 @@ platform.patch('/channels', async c => { const result = await parseBody(c, ChannelsPatchSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, channels } = result.data; try { const updated = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.updateChannels(channels), 'updateChannels' ); @@ -407,11 +413,14 @@ platform.patch('/exec-preset', async c => { const result = await parseBody(c, ExecPresetPatchSchema); if ('error' in result) return 
result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, security, ask } = result.data; try { const updated = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.updateExecPreset({ security, ask }), 'updateExecPreset' ); @@ -432,11 +441,14 @@ platform.post('/google-credentials', async c => { const result = await parseBody(c, GoogleCredentialsPatchSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, googleCredentials } = result.data; try { const updated = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.updateGoogleCredentials(googleCredentials), 'updateGoogleCredentials' ); @@ -452,9 +464,12 @@ platform.delete('/google-credentials', async c => { const userId = setValidatedQueryUserId(c); if (!userId) return c.json({ error: 'userId is required' }, 400); + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const updated = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.clearGoogleCredentials(), 'clearGoogleCredentials' ); @@ -470,11 +485,14 @@ platform.post('/gmail-notifications', async c => { const result = await parseBody(c, UserIdRequestSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId } = result.data; try { const updated = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.updateGmailNotifications(true), 'enableGmailNotifications' ); @@ -490,9 +508,12 @@ platform.delete('/gmail-notifications', async 
c => { const userId = setValidatedQueryUserId(c); if (!userId) return c.json({ error: 'userId is required' }, 400); + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const updated = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.updateGmailNotifications(false), 'disableGmailNotifications' ); @@ -508,11 +529,14 @@ platform.post('/gmail-history-id', async c => { const result = await parseBody(c, GmailHistoryIdSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, historyId } = result.data; try { await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.updateGmailHistoryId(historyId), 'updateGmailHistoryId' ); @@ -529,9 +553,12 @@ platform.get('/gmail-oidc-email', async c => { const userId = setValidatedQueryUserId(c); if (!userId) return c.json({ error: 'userId is required' }, 400); + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const result = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.getGmailOidcEmail(), 'getGmailOidcEmail' ); @@ -547,11 +574,14 @@ platform.patch('/secrets', async c => { const result = await parseBody(c, SecretsPatchSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, secrets, meta } = result.data; try { const updated = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.updateSecrets(secrets, meta), 'updateSecrets' ); @@ -567,11 +597,14 @@ platform.get('/pairing', async c => { const userId = 
setValidatedQueryUserId(c); if (!userId) return c.json({ error: 'userId is required' }, 400); + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const forceRefresh = c.req.query('refresh') === 'true'; try { const pairing = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.listPairingRequests(forceRefresh), 'listPairingRequests' ); @@ -593,11 +626,14 @@ platform.post('/pairing/approve', async c => { const result = await parseBody(c, PairingApproveSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, channel, code } = result.data; try { const approved = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.approvePairingRequest(channel, code), 'approvePairingRequest' ); @@ -613,11 +649,14 @@ platform.get('/device-pairing', async c => { const userId = setValidatedQueryUserId(c); if (!userId) return c.json({ error: 'userId is required' }, 400); + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const forceRefresh = c.req.query('refresh') === 'true'; try { const pairing = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.listDevicePairingRequests(forceRefresh), 'listDevicePairingRequests' ); @@ -638,11 +677,14 @@ platform.post('/device-pairing/approve', async c => { const result = await parseBody(c, DevicePairingApproveSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, requestId } = result.data; try { const approved = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, 
iidResult.instanceId), stub => stub.approveDevicePairingRequest(requestId), 'approveDevicePairingRequest' ); @@ -660,9 +702,12 @@ platform.get('/gateway/status', async c => { return c.json({ error: 'userId query parameter is required' }, 400); } + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const gatewayStatus = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.getGatewayProcessStatus(), 'getGatewayProcessStatus' ); @@ -682,9 +727,12 @@ platform.get('/gateway/ready', async c => { return c.json({ error: 'userId query parameter is required' }, 400); } + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const result = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.getGatewayReady(), 'getGatewayReady' ); @@ -702,9 +750,12 @@ platform.get('/controller-version', async c => { return c.json({ error: 'userId query parameter is required' }, 400); } + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const result = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.getControllerVersion(), 'getControllerVersion' ); @@ -723,9 +774,12 @@ platform.post('/gateway/start', async c => { const result = await parseBody(c, UserIdRequestSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const response = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.startGatewayProcess(), 'startGatewayProcess' ); @@ -741,9 +795,12 @@ platform.post('/gateway/stop', async c => { const result = 
await parseBody(c, UserIdRequestSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const response = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.stopGatewayProcess(), 'stopGatewayProcess' ); @@ -759,9 +816,12 @@ platform.post('/gateway/restart', async c => { const result = await parseBody(c, UserIdRequestSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const response = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.restartGatewayProcess(), 'restartGatewayProcess' ); @@ -782,11 +842,14 @@ platform.post('/config/restore', async c => { const result = await parseBody(c, ConfigRestoreSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, version } = result.data; try { const response = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.restoreConfig(version), 'restoreConfig' ); @@ -807,9 +870,12 @@ platform.get('/openclaw-config', async c => { return c.json({ error: 'userId query parameter is required' }, 400); } + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const config = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.getOpenclawConfig(), 'getOpenclawConfig' ); @@ -835,11 +901,14 @@ platform.post('/openclaw-config', async c => { const result = await parseBody(c, ReplaceOpenclawConfigSchema); if ('error' in result) 
return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, config, etag } = result.data; try { const response = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.replaceConfigOnMachine(config, etag), 'replaceConfigOnMachine' ); @@ -864,11 +933,14 @@ platform.patch('/openclaw-config', async c => { const result = await parseBody(c, PatchOpenclawConfigSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, patch } = result.data; try { const response = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.patchOpenclawConfig(patch), 'patchOpenclawConfig' ); @@ -885,9 +957,13 @@ platform.get('/files/tree', async c => { if (!userId) { return c.json({ error: 'userId query parameter is required' }, 400); } + + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const result = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.getFileTree(), 'getFileTree' ); @@ -915,9 +991,13 @@ platform.get('/files/read', async c => { if (!filePath) { return c.json({ error: 'path query parameter is required' }, 400); } + + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const result = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.readFile(filePath), 'readFile' ); @@ -946,10 +1026,14 @@ const WriteFileSchema = z.object({ platform.post('/files/write', async c => { const result = await parseBody(c, WriteFileSchema); if ('error' in result) return result.error; + + const iidResult = 
parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, path: filePath, content, etag } = result.data; try { const response = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.writeFile(filePath, content, etag), 'writeFile' ); @@ -972,9 +1056,12 @@ platform.post('/doctor', async c => { const result = await parseBody(c, UserIdRequestSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const doctor = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.runDoctor(), 'runDoctor' ); @@ -996,10 +1083,12 @@ const KiloCliRunStartSchema = z.object({ platform.post('/kilo-cli-run/start', async c => { const result = await parseBody(c, KiloCliRunStartSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; try { const response = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.startKiloCliRun(result.data.prompt), 'startKiloCliRun' ); @@ -1014,10 +1103,12 @@ platform.post('/kilo-cli-run/start', async c => { platform.get('/kilo-cli-run/status', async c => { const userId = c.req.query('userId'); if (!userId) return jsonError('Missing userId', 400); + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; try { const response = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.getKiloCliRunStatus(), 'getKiloCliRunStatus' ); @@ -1032,10 +1123,12 @@ platform.get('/kilo-cli-run/status', async c => { platform.post('/kilo-cli-run/cancel', async c => { 
const result = await parseBody(c, UserIdRequestSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; try { const response = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.cancelKiloCliRun(), 'cancelKiloCliRun' ); @@ -1095,11 +1188,15 @@ platform.post('/start', async c => { platform.post('/force-retry-recovery', async c => { const result = await parseBody(c, UserIdRequestSchema); if ('error' in result) return result.error; + + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const startedAt = performance.now(); try { const { ok } = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.forceRetryRecovery(), 'forceRetryRecovery' ); @@ -1356,9 +1453,12 @@ platform.get('/volume-snapshots', async c => { return c.json({ error: 'userId query parameter is required' }, 400); } + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const snapshots = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.listVolumeSnapshots(), 'listVolumeSnapshots' ); @@ -1377,9 +1477,12 @@ platform.get('/candidate-volumes', async c => { return c.json({ error: 'userId query parameter is required' }, 400); } + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const result = await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.listCandidateVolumes(), 'listCandidateVolumes' ); @@ -1402,9 +1505,12 @@ platform.post('/reassociate-volume', async c => { const result = await parseBody(c, 
ReassociateVolumeSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const response = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.reassociateVolume(result.data.newVolumeId, result.data.reason), 'reassociateVolume' ); @@ -1426,9 +1532,12 @@ platform.post('/restore-volume-snapshot', async c => { const result = await parseBody(c, RestoreVolumeSnapshotSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + try { const response = await withDORetry( - instanceStubFactory(c.env, result.data.userId), + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), stub => stub.enqueueSnapshotRestore(result.data.snapshotId), 'enqueueSnapshotRestore' ); @@ -1612,6 +1721,9 @@ platform.post('/destroy-fly-machine', async c => { const result = await parseBody(c, DestroyFlyMachineSchema); if ('error' in result) return result.error; + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + const { userId, appName, machineId } = result.data; const apiToken = c.env.FLY_API_TOKEN; if (!apiToken) { @@ -1639,7 +1751,7 @@ platform.post('/destroy-fly-machine', async c => { // Trigger immediate reconcile so the DO discovers the machine is gone. 
try { await withDORetry( - instanceStubFactory(c.env, userId), + instanceStubFactory(c.env, userId, iidResult.instanceId), stub => stub.forceRetryRecovery(), 'forceRetryRecovery' ); diff --git a/kiloclaw/src/schemas/snapshot-restore.ts b/kiloclaw/src/schemas/snapshot-restore.ts index 32d0d88dc..1a09c6c75 100644 --- a/kiloclaw/src/schemas/snapshot-restore.ts +++ b/kiloclaw/src/schemas/snapshot-restore.ts @@ -9,6 +9,7 @@ export const SnapshotRestoreMessageSchema = z.object({ snapshotId: z.string(), previousVolumeId: z.string(), region: z.string(), + instanceId: z.string().optional(), }); export type SnapshotRestoreMessage = z.infer; diff --git a/packages/worker-utils/src/instance-id.ts b/packages/worker-utils/src/instance-id.ts index 3c7e902bb..acfe50382 100644 --- a/packages/worker-utils/src/instance-id.ts +++ b/packages/worker-utils/src/instance-id.ts @@ -34,3 +34,20 @@ export function sandboxIdFromInstanceId(instanceId: string): string { } return prefixed; } + +/** Returns true if the sandboxId uses the `ki_` instance-keyed format. */ +export function isInstanceKeyedSandboxId(sandboxId: string): boolean { + return sandboxId.startsWith('ki_') && sandboxId.length === 35; +} + +/** + * Recover the instanceId (UUID with dashes) from a `ki_`-prefixed sandboxId. + * Throws if the sandboxId is not in the expected format. 
+ */ +export function instanceIdFromSandboxId(sandboxId: string): string { + if (!isInstanceKeyedSandboxId(sandboxId)) { + throw new Error('Not an instance-keyed sandboxId (expected ki_ prefix, 35 chars)'); + } + const hex = sandboxId.slice(3); // strip "ki_" + return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`; +} diff --git a/src/lib/kiloclaw/kiloclaw-internal-client.ts b/src/lib/kiloclaw/kiloclaw-internal-client.ts index 32ddef457..8e69ad29b 100644 --- a/src/lib/kiloclaw/kiloclaw-internal-client.ts +++ b/src/lib/kiloclaw/kiloclaw-internal-client.ts @@ -224,10 +224,12 @@ export class KiloClawInternalClient { async patchKiloCodeConfig( userId: string, - patch: KiloCodeConfigPatchInput + patch: KiloCodeConfigPatchInput, + instanceId?: string ): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/kilocode-config', + `/api/platform/kilocode-config${params}`, { method: 'PATCH', body: JSON.stringify({ userId, ...patch }), @@ -236,9 +238,14 @@ export class KiloClawInternalClient { ); } - async patchChannels(userId: string, input: ChannelsPatchInput): Promise { + async patchChannels( + userId: string, + input: ChannelsPatchInput, + instanceId?: string + ): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/channels', + `/api/platform/channels${params}`, { method: 'PATCH', body: JSON.stringify({ userId, ...input }), @@ -249,10 +256,12 @@ export class KiloClawInternalClient { async patchExecPreset( userId: string, - patch: { security?: string; ask?: string } + patch: { security?: string; ask?: string }, + instanceId?: string ): Promise<{ execSecurity: string | null; execAsk: string | null }> { + const params = instanceId ? 
`?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/exec-preset', + `/api/platform/exec-preset${params}`, { method: 'PATCH', body: JSON.stringify({ userId, ...patch }), @@ -261,9 +270,14 @@ export class KiloClawInternalClient { ); } - async patchSecrets(userId: string, input: SecretsPatchInput): Promise { + async patchSecrets( + userId: string, + input: SecretsPatchInput, + instanceId?: string + ): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/secrets', + `/api/platform/secrets${params}`, { method: 'PATCH', body: JSON.stringify({ userId, ...input }), @@ -272,27 +286,34 @@ export class KiloClawInternalClient { ); } - async listVolumeSnapshots(userId: string): Promise { - return this.request( - `/api/platform/volume-snapshots?userId=${encodeURIComponent(userId)}`, - undefined, - { userId } - ); + async listVolumeSnapshots(userId: string, instanceId?: string): Promise { + const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); + return this.request(`/api/platform/volume-snapshots?${params.toString()}`, undefined, { + userId, + }); } - async listPairingRequests(userId: string, refresh = false): Promise { + async listPairingRequests( + userId: string, + refresh = false, + instanceId?: string + ): Promise { const params = new URLSearchParams({ userId }); if (refresh) params.set('refresh', 'true'); + if (instanceId) params.set('instanceId', instanceId); return this.request(`/api/platform/pairing?${params.toString()}`, undefined, { userId }); } async approvePairingRequest( userId: string, channel: string, - code: string + code: string, + instanceId?: string ): Promise { + const params = instanceId ? 
`?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/pairing/approve', + `/api/platform/pairing/approve${params}`, { method: 'POST', body: JSON.stringify({ userId, channel, code }), @@ -303,19 +324,23 @@ export class KiloClawInternalClient { async listDevicePairingRequests( userId: string, - refresh = false + refresh = false, + instanceId?: string ): Promise { const params = new URLSearchParams({ userId }); if (refresh) params.set('refresh', 'true'); + if (instanceId) params.set('instanceId', instanceId); return this.request(`/api/platform/device-pairing?${params.toString()}`, undefined, { userId }); } async approveDevicePairingRequest( userId: string, - requestId: string + requestId: string, + instanceId?: string ): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/device-pairing/approve', + `/api/platform/device-pairing/approve${params}`, { method: 'POST', body: JSON.stringify({ userId, requestId }), @@ -324,9 +349,10 @@ export class KiloClawInternalClient { ); } - async runDoctor(userId: string): Promise { + async runDoctor(userId: string, instanceId?: string): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/doctor', + `/api/platform/doctor${params}`, { method: 'POST', body: JSON.stringify({ userId }), @@ -335,9 +361,14 @@ export class KiloClawInternalClient { ); } - async startKiloCliRun(userId: string, prompt: string): Promise { + async startKiloCliRun( + userId: string, + prompt: string, + instanceId?: string + ): Promise { + const params = instanceId ? 
`?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/kilo-cli-run/start', + `/api/platform/kilo-cli-run/start${params}`, { method: 'POST', body: JSON.stringify({ userId, prompt }), @@ -346,17 +377,18 @@ export class KiloClawInternalClient { ); } - async getKiloCliRunStatus(userId: string): Promise { - return this.request( - `/api/platform/kilo-cli-run/status?userId=${encodeURIComponent(userId)}`, - undefined, - { userId } - ); + async getKiloCliRunStatus(userId: string, instanceId?: string): Promise { + const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); + return this.request(`/api/platform/kilo-cli-run/status?${params.toString()}`, undefined, { + userId, + }); } - async cancelKiloCliRun(userId: string): Promise<{ ok: boolean }> { + async cancelKiloCliRun(userId: string, instanceId?: string): Promise<{ ok: boolean }> { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/kilo-cli-run/cancel', + `/api/platform/kilo-cli-run/cancel${params}`, { method: 'POST', body: JSON.stringify({ userId }), @@ -365,33 +397,36 @@ export class KiloClawInternalClient { ); } - async getGatewayStatus(userId: string): Promise { - return this.request( - `/api/platform/gateway/status?userId=${encodeURIComponent(userId)}`, - undefined, - { userId } - ); + async getGatewayStatus( + userId: string, + instanceId?: string + ): Promise { + const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); + return this.request(`/api/platform/gateway/status?${params.toString()}`, undefined, { userId }); } - async getGatewayReady(userId: string): Promise { - return this.request( - `/api/platform/gateway/ready?userId=${encodeURIComponent(userId)}`, - undefined, - { userId } - ); + async getGatewayReady(userId: string, instanceId?: string): Promise { + const params = new URLSearchParams({ userId }); + if 
(instanceId) params.set('instanceId', instanceId); + return this.request(`/api/platform/gateway/ready?${params.toString()}`, undefined, { userId }); } - async getControllerVersion(userId: string): Promise { - return this.request( - `/api/platform/controller-version?userId=${encodeURIComponent(userId)}`, - undefined, - { userId } - ); + async getControllerVersion( + userId: string, + instanceId?: string + ): Promise { + const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); + return this.request(`/api/platform/controller-version?${params.toString()}`, undefined, { + userId, + }); } - async startGateway(userId: string): Promise { + async startGateway(userId: string, instanceId?: string): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/gateway/start', + `/api/platform/gateway/start${params}`, { method: 'POST', body: JSON.stringify({ userId }), @@ -400,9 +435,10 @@ export class KiloClawInternalClient { ); } - async stopGateway(userId: string): Promise { + async stopGateway(userId: string, instanceId?: string): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/gateway/stop', + `/api/platform/gateway/stop${params}`, { method: 'POST', body: JSON.stringify({ userId }), @@ -411,9 +447,13 @@ export class KiloClawInternalClient { ); } - async restartGatewayProcess(userId: string): Promise { + async restartGatewayProcess( + userId: string, + instanceId?: string + ): Promise { + const params = instanceId ? 
`?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/gateway/restart', + `/api/platform/gateway/restart${params}`, { method: 'POST', body: JSON.stringify({ userId }), @@ -422,9 +462,14 @@ export class KiloClawInternalClient { ); } - async restoreConfig(userId: string, version = 'base'): Promise { + async restoreConfig( + userId: string, + version = 'base', + instanceId?: string + ): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/config/restore', + `/api/platform/config/restore${params}`, { method: 'POST', body: JSON.stringify({ userId, version }), @@ -433,21 +478,23 @@ export class KiloClawInternalClient { ); } - async getOpenclawConfig(userId: string): Promise { - return this.request( - `/api/platform/openclaw-config?userId=${encodeURIComponent(userId)}`, - undefined, - { userId } - ); + async getOpenclawConfig(userId: string, instanceId?: string): Promise { + const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); + return this.request(`/api/platform/openclaw-config?${params.toString()}`, undefined, { + userId, + }); } async replaceOpenclawConfig( userId: string, config: Record, - etag?: string + etag?: string, + instanceId?: string ): Promise<{ ok: true }> { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/openclaw-config', + `/api/platform/openclaw-config${params}`, { method: 'POST', body: JSON.stringify({ userId, config, ...(etag !== undefined && { etag }) }), @@ -458,10 +505,12 @@ export class KiloClawInternalClient { async patchOpenclawConfig( userId: string, - patch: Record + patch: Record, + instanceId?: string ): Promise<{ ok: boolean }> { + const params = instanceId ? 
`?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/openclaw-config', + `/api/platform/openclaw-config${params}`, { method: 'PATCH', body: JSON.stringify({ userId, patch }), @@ -470,13 +519,19 @@ export class KiloClawInternalClient { ); } - async getFileTree(userId: string): Promise<{ tree: FileNode[] }> { + async getFileTree(userId: string, instanceId?: string): Promise<{ tree: FileNode[] }> { const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); return this.request(`/api/platform/files/tree?${params.toString()}`); } - async readFile(userId: string, filePath: string): Promise<{ content: string; etag: string }> { + async readFile( + userId: string, + filePath: string, + instanceId?: string + ): Promise<{ content: string; etag: string }> { const params = new URLSearchParams({ userId, path: filePath }); + if (instanceId) params.set('instanceId', instanceId); return this.request(`/api/platform/files/read?${params.toString()}`); } @@ -484,9 +539,11 @@ export class KiloClawInternalClient { userId: string, filePath: string, content: string, - etag?: string + etag?: string, + instanceId?: string ): Promise<{ etag: string }> { - return this.request('/api/platform/files/write', { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; + return this.request(`/api/platform/files/write${params}`, { method: 'POST', body: JSON.stringify({ userId, path: filePath, content, etag }), }); @@ -494,10 +551,12 @@ export class KiloClawInternalClient { async updateGoogleCredentials( userId: string, - input: GoogleCredentialsInput + input: GoogleCredentialsInput, + instanceId?: string ): Promise { + const params = instanceId ? 
`?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/google-credentials', + `/api/platform/google-credentials${params}`, { method: 'POST', body: JSON.stringify({ userId, ...input }), @@ -506,9 +565,14 @@ export class KiloClawInternalClient { ); } - async clearGoogleCredentials(userId: string): Promise { + async clearGoogleCredentials( + userId: string, + instanceId?: string + ): Promise { + const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); return this.request( - `/api/platform/google-credentials?userId=${encodeURIComponent(userId)}`, + `/api/platform/google-credentials?${params.toString()}`, { method: 'DELETE', }, @@ -516,9 +580,13 @@ export class KiloClawInternalClient { ); } - async enableGmailNotifications(userId: string): Promise { + async enableGmailNotifications( + userId: string, + instanceId?: string + ): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/gmail-notifications', + `/api/platform/gmail-notifications${params}`, { method: 'POST', body: JSON.stringify({ userId }), @@ -527,9 +595,14 @@ export class KiloClawInternalClient { ); } - async disableGmailNotifications(userId: string): Promise { + async disableGmailNotifications( + userId: string, + instanceId?: string + ): Promise { + const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); return this.request( - `/api/platform/gmail-notifications?userId=${encodeURIComponent(userId)}`, + `/api/platform/gmail-notifications?${params.toString()}`, { method: 'DELETE', }, @@ -537,9 +610,10 @@ export class KiloClawInternalClient { ); } - async forceRetryRecovery(userId: string): Promise<{ ok: true }> { + async forceRetryRecovery(userId: string, instanceId?: string): Promise<{ ok: true }> { + const params = instanceId ? 
`?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/force-retry-recovery', + `/api/platform/force-retry-recovery${params}`, { method: 'POST', body: JSON.stringify({ userId }), @@ -548,21 +622,26 @@ export class KiloClawInternalClient { ); } - async listCandidateVolumes(userId: string): Promise { - return this.request( - `/api/platform/candidate-volumes?userId=${encodeURIComponent(userId)}`, - undefined, - { userId } - ); + async listCandidateVolumes( + userId: string, + instanceId?: string + ): Promise { + const params = new URLSearchParams({ userId }); + if (instanceId) params.set('instanceId', instanceId); + return this.request(`/api/platform/candidate-volumes?${params.toString()}`, undefined, { + userId, + }); } async reassociateVolume( userId: string, newVolumeId: string, - reason: string + reason: string, + instanceId?: string ): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/reassociate-volume', + `/api/platform/reassociate-volume${params}`, { method: 'POST', body: JSON.stringify({ userId, newVolumeId, reason }), @@ -573,10 +652,12 @@ export class KiloClawInternalClient { async restoreVolumeFromSnapshot( userId: string, - snapshotId: string + snapshotId: string, + instanceId?: string ): Promise { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/restore-volume-snapshot', + `/api/platform/restore-volume-snapshot${params}`, { method: 'POST', body: JSON.stringify({ userId, snapshotId }), @@ -588,10 +669,12 @@ export class KiloClawInternalClient { async destroyFlyMachine( userId: string, appName: string, - machineId: string + machineId: string, + instanceId?: string ): Promise<{ ok: true }> { + const params = instanceId ? 
`?instanceId=${encodeURIComponent(instanceId)}` : ''; return this.request( - '/api/platform/destroy-fly-machine', + `/api/platform/destroy-fly-machine${params}`, { method: 'POST', body: JSON.stringify({ userId, appName, machineId }), diff --git a/src/routers/admin-kiloclaw-instances-router.ts b/src/routers/admin-kiloclaw-instances-router.ts index 7b6cc9f22..41f20fd2d 100644 --- a/src/routers/admin-kiloclaw-instances-router.ts +++ b/src/routers/admin-kiloclaw-instances-router.ts @@ -10,6 +10,7 @@ import { import { KiloClawInternalClient, KiloClawApiError } from '@/lib/kiloclaw/kiloclaw-internal-client'; import { KiloClawUserClient } from '@/lib/kiloclaw/kiloclaw-user-client'; import { + getActiveInstance, markActiveInstanceDestroyed, restoreDestroyedInstance, } from '@/lib/kiloclaw/instance-registry'; @@ -193,7 +194,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const client = new KiloClawInternalClient(); - workerStatus = await client.getDebugStatus(instance.user_id); + workerStatus = await client.getDebugStatus(instance.user_id, instance.id); } catch (err) { workerStatusError = err instanceof KiloClawApiError @@ -415,8 +416,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ .query(async ({ input }): Promise<{ snapshots: VolumeSnapshot[] }> => { const fallbackMessage = 'Failed to fetch volume snapshots'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.listVolumeSnapshots(input.userId); + return await client.listVolumeSnapshots(input.userId, instance?.id); } catch (err) { console.error('Failed to fetch volume snapshots for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -426,8 +428,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ controllerVersion: adminProcedure.input(GatewayProcessSchema).query(async ({ input }) => { const fallbackMessage = 'Failed to fetch controller version'; try { + const 
instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.getControllerVersion(input.userId); + return await client.getControllerVersion(input.userId, instance?.id); } catch (err) { console.error('Failed to fetch controller version for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -437,8 +440,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ gatewayStatus: adminProcedure.input(GatewayProcessSchema).query(async ({ input }) => { const fallbackMessage = 'Failed to fetch gateway status'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.getGatewayStatus(input.userId); + return await client.getGatewayStatus(input.userId, instance?.id); } catch (err) { console.error('Failed to fetch gateway status for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage, { @@ -454,8 +458,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ gatewayStart: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to start gateway'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.startGateway(input.userId); + return await client.startGateway(input.userId, instance?.id); } catch (err) { console.error('Failed to start gateway for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -465,8 +470,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ gatewayStop: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to stop gateway'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.stopGateway(input.userId); + return await client.stopGateway(input.userId, instance?.id); } 
catch (err) { console.error('Failed to stop gateway for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -476,8 +482,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ gatewayRestart: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to restart gateway'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.restartGatewayProcess(input.userId); + return await client.restartGatewayProcess(input.userId, instance?.id); } catch (err) { console.error('Failed to restart gateway for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -487,8 +494,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ runDoctor: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to run doctor'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.runDoctor(input.userId); + return await client.runDoctor(input.userId, instance?.id); } catch (err) { console.error('Failed to run doctor for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -610,8 +618,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ restoreConfig: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to restore config'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.restoreConfig(input.userId); + return await client.restoreConfig(input.userId, undefined, instance?.id); } catch (err) { console.error('Failed to restore config for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -622,8 +631,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ .input(z.object({ 
userId: z.string().min(1) })) .query(async ({ input }) => { try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - const result = await client.getFileTree(input.userId); + const result = await client.getFileTree(input.userId, instance?.id); return result.tree; } catch (err) { throwKiloclawAdminError(err, 'Failed to fetch file tree'); @@ -634,8 +644,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ .input(z.object({ userId: z.string().min(1), path: z.string().min(1) })) .query(async ({ input }) => { try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.readFile(input.userId, input.path); + return await client.readFile(input.userId, input.path, instance?.id); } catch (err) { throwKiloclawAdminError(err, 'Failed to read file'); } @@ -652,8 +663,15 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ ) .mutation(async ({ input }) => { try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.writeFile(input.userId, input.path, input.content, input.etag); + return await client.writeFile( + input.userId, + input.path, + input.content, + input.etag, + instance?.id + ); } catch (err) { // Propagate file_etag_conflict with UpstreamApiError so the UI can detect it if (err instanceof KiloClawApiError && err.statusCode === 409) { @@ -673,8 +691,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ machineStart: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to start machine'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.start(input.userId, undefined, { skipCooldown: true }); + return await client.start(input.userId, instance?.id, { skipCooldown: true }); } catch (err) { 
console.error('Failed to start machine for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -684,8 +703,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ forceRetryRecovery: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to retry recovery'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.forceRetryRecovery(input.userId); + return await client.forceRetryRecovery(input.userId, instance?.id); } catch (err) { console.error('Failed to retry recovery for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -695,8 +715,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ machineStop: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to stop machine'; try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.stop(input.userId); + return await client.stop(input.userId, instance?.id); } catch (err) { console.error('Failed to stop machine for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -762,12 +783,13 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ console.log( `[admin-kiloclaw] destroyFlyMachine triggered by admin ${ctx.user.id} (${ctx.user.google_user_email}) app=${input.appName} machine=${input.machineId}` ); + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); // Verify the appName/machineId match the DO's actual state let status: Awaited>; try { - status = await client.getDebugStatus(input.userId); + status = await client.getDebugStatus(input.userId, instance?.id); } catch (err) { throwKiloclawAdminError(err, 'Failed to verify machine state before destroy'); } @@ -780,7 +802,12 @@ export const 
adminKiloclawInstancesRouter = createTRPCRouter({ const fallbackMessage = 'Failed to destroy Fly machine'; try { - const result = await client.destroyFlyMachine(input.userId, input.appName, input.machineId); + const result = await client.destroyFlyMachine( + input.userId, + input.appName, + input.machineId, + instance?.id + ); try { await createKiloClawAdminAuditLog({ @@ -835,7 +862,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ const destroyedRow = await markActiveInstanceDestroyed(instance.user_id); const client = new KiloClawInternalClient(); try { - await client.destroy(instance.user_id); + await client.destroy(instance.user_id, instance.id); } catch (error) { if (destroyedRow) { await restoreDestroyedInstance(destroyedRow.id); @@ -898,8 +925,9 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ .input(z.object({ userId: z.string().min(1) })) .query(async ({ input }): Promise => { try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.listCandidateVolumes(input.userId); + return await client.listCandidateVolumes(input.userId, instance?.id); } catch (err) { throwKiloclawAdminError(err, 'Failed to list candidate volumes'); } @@ -932,7 +960,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ for (const instance of activeInstances) { const destroyedRow = await markActiveInstanceDestroyed(instance.user_id); try { - await client.destroy(instance.user_id); + await client.destroy(instance.user_id, instance.id); destroyed++; } catch (err) { if (destroyedRow) { @@ -963,11 +991,13 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ `[admin-kiloclaw] Volume reassociation triggered by admin ${ctx.user.id} (${ctx.user.google_user_email}) for user ${input.userId}: newVolume=${input.newVolumeId} reason="${input.reason}"` ); try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); const 
result = await client.reassociateVolume( input.userId, input.newVolumeId, - input.reason + input.reason, + instance?.id ); try { @@ -1009,8 +1039,13 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ `[admin-kiloclaw] Snapshot restore triggered by admin ${ctx.user.id} (${ctx.user.google_user_email}) for user ${input.userId}: snapshot=${input.snapshotId} reason="${input.reason}"` ); try { + const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - const result = await client.restoreVolumeFromSnapshot(input.userId, input.snapshotId); + const result = await client.restoreVolumeFromSnapshot( + input.userId, + input.snapshotId, + instance?.id + ); try { await createKiloClawAdminAuditLog({ diff --git a/src/routers/kiloclaw-router.ts b/src/routers/kiloclaw-router.ts index 71cf69167..48e41a8b2 100644 --- a/src/routers/kiloclaw-router.ts +++ b/src/routers/kiloclaw-router.ts @@ -368,6 +368,7 @@ async function patchConfig( user: Parameters[0], input: z.infer ): Promise { + const instance = await getActiveInstance(user.id); const client = new KiloClawInternalClient(); const expiresInSeconds = TOKEN_EXPIRY.thirtyDays; const kilocodeApiKey = generateApiToken(user, undefined, { @@ -375,11 +376,15 @@ async function patchConfig( }); const kilocodeApiKeyExpiresAt = new Date(Date.now() + expiresInSeconds * 1000).toISOString(); - const response = await client.patchKiloCodeConfig(user.id, { - ...input, - kilocodeApiKey, - kilocodeApiKeyExpiresAt, - }); + const response = await client.patchKiloCodeConfig( + user.id, + { + ...input, + kilocodeApiKey, + kilocodeApiKeyExpiresAt, + }, + instance?.id + ); return sanitizeKiloCodeConfigResponse(response); } @@ -584,12 +589,11 @@ export const kiloclawRouter = createTRPCRouter({ }), getStatus: baseProcedure.query(async ({ ctx }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const status = await client.getStatus(ctx.user.id); 
+ const status = await client.getStatus(ctx.user.id, instance?.id); const workerUrl = KILOCLAW_API_URL || 'https://claw.kilo.ai'; - const instance = await getActiveInstance(ctx.user.id); - return { ...status, name: instance?.name ?? null, @@ -611,8 +615,9 @@ export const kiloclawRouter = createTRPCRouter({ }), getStreamChatCredentials: clawAccessProcedure.query(async ({ ctx }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.getStreamChatCredentials(ctx.user.id); + return client.getStreamChatCredentials(ctx.user.id, instance?.id); }), sendChatMessage: clawAccessProcedure @@ -679,8 +684,9 @@ export const kiloclawRouter = createTRPCRouter({ // Instance lifecycle start: clawAccessProcedure.mutation(async ({ ctx }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const result = await client.start(ctx.user.id); + const result = await client.start(ctx.user.id, instance?.id); // /api/platform/start always returns { ok: true } regardless of whether // the machine transitioned state, so this may fire for no-op requests. // The UI only enables Start when isStartable is true, so false fires are rare. 
@@ -693,8 +699,9 @@ export const kiloclawRouter = createTRPCRouter({ }), stop: clawAccessProcedure.mutation(async ({ ctx }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.stop(ctx.user.id); + return client.stop(ctx.user.id, instance?.id); }), destroy: baseProcedure.mutation(async ({ ctx }) => { @@ -791,17 +798,21 @@ export const kiloclawRouter = createTRPCRouter({ }), patchChannels: clawAccessProcedure.input(patchChannelsSchema).mutation(async ({ ctx, input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.patchChannels(ctx.user.id, { - channels: buildWorkerChannelsPatch(input), - }); + return client.patchChannels( + ctx.user.id, + { channels: buildWorkerChannelsPatch(input) }, + instance?.id + ); }), patchExecPreset: clawAccessProcedure .input(z.object({ security: z.string().optional(), ask: z.string().optional() })) .mutation(async ({ ctx, input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.patchExecPreset(ctx.user.id, input); + return client.patchExecPreset(ctx.user.id, input, instance?.id); }), /** @@ -883,12 +894,14 @@ export const kiloclawRouter = createTRPCRouter({ } // 4. 
Forward to worker — translate 4xx responses into TRPCErrors + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); try { - return await client.patchSecrets(ctx.user.id, { - secrets: encryptedPatch, - meta: input.meta, - }); + return await client.patchSecrets( + ctx.user.id, + { secrets: encryptedPatch, meta: input.meta }, + instance?.id + ); } catch (err) { if (err instanceof KiloClawApiError && err.statusCode >= 400 && err.statusCode < 500) { // Extract message from worker response body (JSON or plain text) @@ -1003,35 +1016,40 @@ export const kiloclawRouter = createTRPCRouter({ listPairingRequests: clawAccessProcedure .input(z.object({ refresh: z.boolean().optional() }).optional()) .query(async ({ ctx, input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.listPairingRequests(ctx.user.id, input?.refresh); + return client.listPairingRequests(ctx.user.id, input?.refresh, instance?.id); }), approvePairingRequest: clawAccessProcedure .input(z.object({ channel: z.string().min(1), code: z.string().min(1) })) .mutation(async ({ ctx, input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.approvePairingRequest(ctx.user.id, input.channel, input.code); + return client.approvePairingRequest(ctx.user.id, input.channel, input.code, instance?.id); }), listDevicePairingRequests: clawAccessProcedure .input(z.object({ refresh: z.boolean().optional() }).optional()) .query(async ({ ctx, input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.listDevicePairingRequests(ctx.user.id, input?.refresh); + return client.listDevicePairingRequests(ctx.user.id, input?.refresh, instance?.id); }), approveDevicePairingRequest: clawAccessProcedure .input(z.object({ requestId: z.string().uuid() })) .mutation(async ({ ctx, 
input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.approveDevicePairingRequest(ctx.user.id, input.requestId); + return client.approveDevicePairingRequest(ctx.user.id, input.requestId, instance?.id); }), gatewayStatus: baseProcedure.query(async ({ ctx }) => { try { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return await client.getGatewayStatus(ctx.user.id); + return await client.getGatewayStatus(ctx.user.id, instance?.id); } catch (err) { console.error('Failed to fetch gateway status for user:', ctx.user.id, err); if (err instanceof KiloClawApiError && (err.statusCode === 404 || err.statusCode === 409)) { @@ -1049,8 +1067,9 @@ export const kiloclawRouter = createTRPCRouter({ gatewayReady: baseProcedure.query(async ({ ctx }) => { try { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return await client.getGatewayReady(ctx.user.id); + return await client.getGatewayReady(ctx.user.id, instance?.id); } catch (err) { console.error('[gatewayReady] error for user:', ctx.user.id, err); if (err instanceof KiloClawApiError && (err.statusCode === 404 || err.statusCode === 409)) { @@ -1067,18 +1086,21 @@ export const kiloclawRouter = createTRPCRouter({ }), controllerVersion: baseProcedure.query(async ({ ctx }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.getControllerVersion(ctx.user.id); + return client.getControllerVersion(ctx.user.id, instance?.id); }), restartOpenClaw: clawAccessProcedure.mutation(async ({ ctx }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.restartGatewayProcess(ctx.user.id); + return client.restartGatewayProcess(ctx.user.id, instance?.id); }), runDoctor: clawAccessProcedure.mutation(async ({ ctx }) => { + 
const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.runDoctor(ctx.user.id); + return client.runDoctor(ctx.user.id, instance?.id); }), // ── Kilo CLI Run ────────────────────────────────────────────────── @@ -1086,8 +1108,9 @@ export const kiloclawRouter = createTRPCRouter({ startKiloCliRun: clawAccessProcedure .input(z.object({ prompt: z.string().min(1).max(10_000) })) .mutation(async ({ ctx, input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const result = await client.startKiloCliRun(ctx.user.id, input.prompt); + const result = await client.startKiloCliRun(ctx.user.id, input.prompt, instance?.id); // Persist the run in the database and return its ID const [row] = await db @@ -1144,8 +1167,9 @@ export const kiloclawRouter = createTRPCRouter({ } // Run is still active — poll the controller for live output. + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const controllerStatus = await client.getKiloCliRunStatus(ctx.user.id); + const controllerStatus = await client.getKiloCliRunStatus(ctx.user.id, instance?.id); // If controller reports the run finished, persist to the DB row. 
if ( @@ -1179,8 +1203,9 @@ export const kiloclawRouter = createTRPCRouter({ cancelKiloCliRun: clawAccessProcedure .input(z.object({ runId: z.string().uuid() })) .mutation(async ({ ctx, input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const result = await client.cancelKiloCliRun(ctx.user.id); + const result = await client.cancelKiloCliRun(ctx.user.id, instance?.id); // Mark the specific run as cancelled in DB if (result.ok) { @@ -1217,8 +1242,9 @@ export const kiloclawRouter = createTRPCRouter({ }), restoreConfig: clawAccessProcedure.mutation(async ({ ctx }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.restoreConfig(ctx.user.id); + return client.restoreConfig(ctx.user.id, undefined, instance?.id); }), getGoogleSetupCommand: clawAccessProcedure.query(({ ctx }) => { @@ -1238,19 +1264,21 @@ export const kiloclawRouter = createTRPCRouter({ }), disconnectGoogle: clawAccessProcedure.mutation(async ({ ctx }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.clearGoogleCredentials(ctx.user.id); + return client.clearGoogleCredentials(ctx.user.id, instance?.id); }), setGmailNotifications: baseProcedure .input(z.object({ enabled: z.boolean() })) .mutation(async ({ ctx, input }) => { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); try { if (input.enabled) { - return await client.enableGmailNotifications(ctx.user.id); + return await client.enableGmailNotifications(ctx.user.id, instance?.id); } - return await client.disableGmailNotifications(ctx.user.id); + return await client.disableGmailNotifications(ctx.user.id, instance?.id); } catch (err) { if (err instanceof KiloClawApiError && err.statusCode >= 400 && err.statusCode < 500) { let message = `Failed to update Gmail notifications (${err.statusCode})`; @@ 
-1546,8 +1574,9 @@ export const kiloclawRouter = createTRPCRouter({ fileTree: clawAccessProcedure.query(async ({ ctx }) => { try { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const result = await client.getFileTree(ctx.user.id); + const result = await client.getFileTree(ctx.user.id, instance?.id); return result.tree; } catch (err) { handleFileOperationError(err, 'fetch file tree'); @@ -1558,8 +1587,9 @@ export const kiloclawRouter = createTRPCRouter({ .input(z.object({ path: z.string().min(1) })) .query(async ({ ctx, input }) => { try { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return await client.readFile(ctx.user.id, input.path); + return await client.readFile(ctx.user.id, input.path, instance?.id); } catch (err) { handleFileOperationError(err, 'read file'); } @@ -1575,6 +1605,7 @@ export const kiloclawRouter = createTRPCRouter({ ) .mutation(async ({ ctx, input }) => { try { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); let content = input.content; @@ -1597,7 +1628,7 @@ export const kiloclawRouter = createTRPCRouter({ content = JSON.stringify(userConfig, null, 2); } - return await client.writeFile(ctx.user.id, input.path, content, input.etag); + return await client.writeFile(ctx.user.id, input.path, content, input.etag, instance?.id); } catch (err) { handleFileOperationError(err, 'write file'); } @@ -1607,8 +1638,9 @@ export const kiloclawRouter = createTRPCRouter({ .input(z.object({ patch: z.record(z.string(), z.unknown()) })) .mutation(async ({ ctx, input }) => { try { + const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return await client.patchOpenclawConfig(ctx.user.id, input.patch); + return await client.patchOpenclawConfig(ctx.user.id, input.patch, instance?.id); } catch (err) { handleFileOperationError(err, 'patch openclaw 
config'); } From 0ee64e29f3d47389e4d7689a3c931ac906ec1695 Mon Sep 17 00:00:00 2001 From: syn Date: Sun, 29 Mar 2026 16:25:50 -0500 Subject: [PATCH 03/15] fix(kiloclaw): resolve real userId from DO in controller checkin The controller checkin route used instanceId as a placeholder for userId when handling ki_ sandboxIds. This caused PostHog attribution and instance-ready emails to silently fail for instance-keyed DOs. Fix: call stub.getStatus() after auth to read the real userId from the DO, which always stores it during provision. --- kiloclaw/src/routes/controller.test.ts | 5 ++++- kiloclaw/src/routes/controller.ts | 16 ++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/kiloclaw/src/routes/controller.test.ts b/kiloclaw/src/routes/controller.test.ts index ac20c47e2..94b7ffb19 100644 --- a/kiloclaw/src/routes/controller.test.ts +++ b/kiloclaw/src/routes/controller.test.ts @@ -40,6 +40,9 @@ function makeEnv(options?: { const getConfig = vi.fn().mockResolvedValue({ kilocodeApiKey: options?.kilocodeApiKey ?? 'kilo-key-1', }); + const getStatus = vi.fn().mockResolvedValue({ + userId: 'user-1', + }); const tryMarkInstanceReady = options?.tryMarkInstanceReady ?? vi.fn().mockResolvedValue({ shouldNotify: false, userId: null }); @@ -50,7 +53,7 @@ function makeEnv(options?: { INTERNAL_API_SECRET: options?.internalApiSecret, KILOCLAW_INSTANCE: { idFromName: (userId: string) => userId, - get: () => ({ getConfig, tryMarkInstanceReady }), + get: () => ({ getConfig, getStatus, tryMarkInstanceReady }), }, KILOCLAW_CONTROLLER_AE: options?.writeDataPoint ? { diff --git a/kiloclaw/src/routes/controller.ts b/kiloclaw/src/routes/controller.ts index 744f7da72..1b9becd24 100644 --- a/kiloclaw/src/routes/controller.ts +++ b/kiloclaw/src/routes/controller.ts @@ -109,21 +109,15 @@ controller.post('/checkin', async (c: Context) => { // For instance-keyed sandboxIds (ki_ prefix), the DO key is the instanceId. 
// For legacy sandboxIds (base64url), the DO key is the userId. - let userId: string; let doKey: string; if (isInstanceKeyedSandboxId(data.sandboxId)) { - const instanceId = instanceIdFromSandboxId(data.sandboxId); - doKey = instanceId; - // We don't know the userId from the sandboxId alone for instance-keyed DOs. - // The DO will validate auth via the API key check below. - userId = instanceId; // placeholder — not used for auth, only for logging + doKey = instanceIdFromSandboxId(data.sandboxId); } else { try { - userId = userIdFromSandboxId(data.sandboxId); + doKey = userIdFromSandboxId(data.sandboxId); } catch { return c.json({ error: 'Invalid sandboxId' }, 400); } - doKey = userId; } const stub = c.env.KILOCLAW_INSTANCE.get(c.env.KILOCLAW_INSTANCE.idFromName(doKey)); @@ -132,6 +126,12 @@ controller.post('/checkin', async (c: Context) => { return c.json({ error: 'Forbidden' }, 403); } + // Resolve the real userId from the DO — needed for PostHog attribution and + // instance-ready emails. For legacy DOs, doKey IS the userId. For instance-keyed + // DOs, doKey is the instanceId and the DO stores the actual userId. + const status = await stub.getStatus(); + const userId = status.userId ?? doKey; + try { const flyRegion = c.req.header('fly-region') ?? 
''; c.env.KILOCLAW_CONTROLLER_AE.writeDataPoint({ From 533440995f51b38e9e36456ed7f743baba061ab1 Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 11:11:06 -0500 Subject: [PATCH 04/15] fix(test): update admin destroyFlyMachine assertions for instanceId param --- src/routers/admin-kiloclaw-instances-router.test.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/routers/admin-kiloclaw-instances-router.test.ts b/src/routers/admin-kiloclaw-instances-router.test.ts index 6e0ef35b6..d5fa52292 100644 --- a/src/routers/admin-kiloclaw-instances-router.test.ts +++ b/src/routers/admin-kiloclaw-instances-router.test.ts @@ -95,8 +95,13 @@ describe('admin.kiloclawInstances.destroyFlyMachine', () => { }); expect(result).toEqual({ ok: true }); - expect(mockGetDebugStatus).toHaveBeenCalledWith(testUserId); - expect(mockDestroyFlyMachine).toHaveBeenCalledWith(testUserId, testAppName, testMachineId); + expect(mockGetDebugStatus).toHaveBeenCalledWith(testUserId, undefined); + expect(mockDestroyFlyMachine).toHaveBeenCalledWith( + testUserId, + testAppName, + testMachineId, + undefined + ); }); it('throws BAD_REQUEST when appName does not match DO state', async () => { From 7b4dd2cefe151cf4fc7a3f1552100d7c9718be71 Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 11:31:36 -0500 Subject: [PATCH 05/15] fix(kiloclaw): align DB/DO sandboxId identity and fix registry/admin row targeting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three identity consistency fixes: 1. ensureActiveInstance personal flow now writes instance-keyed sandboxId (ki_ prefix) matching what the DO stores. Previously wrote sandboxIdFromUserId which diverged from the DO's sandboxIdFromInstanceId, breaking restoreFromPostgres lookups. getActiveInstance/markActiveInstanceDestroyed/renameInstance updated to find rows without filtering by sandboxId format. 2. 
Lazy migration derives doKey from Postgres row's sandboxId format (ki_ prefix → instanceId, base64url → userId) instead of blindly writing doKey=userId. Prevents wrong DO routing after transient registry create failures. 3. Admin destroy and devNukeAll pass instance.id to markActiveInstanceDestroyed for explicit row targeting instead of legacy sandboxId matching. --- .../src/durable-objects/kiloclaw-registry.ts | 15 +++- src/lib/kiloclaw/instance-registry.ts | 85 +++++++++---------- .../admin-kiloclaw-instances-router.ts | 4 +- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/kiloclaw/src/durable-objects/kiloclaw-registry.ts b/kiloclaw/src/durable-objects/kiloclaw-registry.ts index 2f8213ed4..4d1c0ec47 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-registry.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-registry.ts @@ -5,6 +5,10 @@ import { eq, isNull, and } from 'drizzle-orm'; import migrations from '../../drizzle/migrations'; import { registryInstances } from '../db/sqlite-schema'; import { getWorkerDb, getActiveInstance } from '../db'; +import { + isInstanceKeyedSandboxId, + instanceIdFromSandboxId, +} from '@kilocode/worker-utils/instance-id'; import type { KiloClawEnv } from '../types'; export type RegistryEntry = { @@ -187,13 +191,18 @@ export class KiloClawRegistry extends DurableObject { const instance = await getActiveInstance(db, userId); if (instance) { - // Legacy instance found in Postgres — backfill registry entry. - // do_key = userId for legacy instances (DO keyed by idFromName(userId)). + // Backfill registry entry from Postgres row. + // Derive doKey from the row's sandboxId format: + // - ki_ prefix → instance-keyed DO at idFromName(instanceId) + // - base64url → legacy DO at idFromName(userId) + const doKey = isInstanceKeyedSandboxId(instance.sandboxId) + ? 
instanceIdFromSandboxId(instance.sandboxId) + : userId; this.db .insert(registryInstances) .values({ instance_id: instance.id, - do_key: userId, + do_key: doKey, assigned_user_id: userId, created_at: new Date().toISOString(), }) diff --git a/src/lib/kiloclaw/instance-registry.ts b/src/lib/kiloclaw/instance-registry.ts index e2bec6b0c..82a64665d 100644 --- a/src/lib/kiloclaw/instance-registry.ts +++ b/src/lib/kiloclaw/instance-registry.ts @@ -3,7 +3,7 @@ import 'server-only'; import { and, eq, isNull } from 'drizzle-orm'; import { kiloclaw_instances } from '@kilocode/db/schema'; import { db } from '@/lib/drizzle'; -import { sandboxIdFromUserId, sandboxIdFromInstanceId } from '@/lib/kiloclaw/sandbox-id'; +import { sandboxIdFromInstanceId } from '@/lib/kiloclaw/sandbox-id'; export type ActiveKiloClawInstance = { id: string; @@ -20,22 +20,30 @@ type EnsureActiveInstanceOpts = { /** * Ensure the user has an active KiloClaw registry row before worker provisioning. - * This is idempotent and safe under concurrent calls. * - * The returned `id` (DB row UUID) serves as the instanceId for multi-instance - * routing. + * The returned `id` (DB row UUID) serves as the instanceId for DO keying. + * sandboxId is always derived from instanceId (`ki_` prefix) for consistency + * between DB and DO identity. Legacy rows with userId-derived sandboxIds are + * returned as-is if they already exist. * - * For legacy personal flow (no opts.orgId): sandboxId is derived from userId, - * DO key = userId. Idempotent via onConflictDoNothing on the unique index. + * Personal flow: returns existing active row if present, otherwise creates a + * new instance-keyed row. Idempotent under concurrent calls (second caller + * sees the first caller's row). * - * For org instances (opts.orgId present): sandboxId is derived from a freshly - * generated UUID (the row's id), DO key = instanceId. Not idempotent — each - * call creates a new instance row. + * Org flow: always creates a new row. 
Callers must gate on existing rows. */ export async function ensureActiveInstance( userId: string, opts?: EnsureActiveInstanceOpts ): Promise { + const selectFields = { + id: kiloclaw_instances.id, + userId: kiloclaw_instances.user_id, + sandboxId: kiloclaw_instances.sandbox_id, + organizationId: kiloclaw_instances.organization_id, + name: kiloclaw_instances.name, + }; + if (opts?.orgId) { // Org instance: generate UUID, derive sandboxId from it. // Each call creates a new row (no idempotency — callers gate on existing rows). @@ -50,13 +58,7 @@ export async function ensureActiveInstance( sandbox_id: sandboxId, organization_id: opts.orgId, }) - .returning({ - id: kiloclaw_instances.id, - userId: kiloclaw_instances.user_id, - sandboxId: kiloclaw_instances.sandbox_id, - organizationId: kiloclaw_instances.organization_id, - name: kiloclaw_instances.name, - }); + .returning(selectFields); if (!row) { throw new Error('Failed to create org instance row'); @@ -65,37 +67,26 @@ export async function ensureActiveInstance( return row; } - // Legacy personal flow: derive sandboxId from userId. Idempotent. - const sandboxId = sandboxIdFromUserId(userId); + // Personal flow: return existing active row if present. + const existing = await getActiveInstance(userId); + if (existing) return existing; - await db + // No active row — create a new instance-keyed row. + // sandboxId = sandboxIdFromInstanceId(uuid) ensures DB and DO identity match. 
+ const instanceId = crypto.randomUUID(); + const sandboxId = sandboxIdFromInstanceId(instanceId); + + const [row] = await db .insert(kiloclaw_instances) .values({ + id: instanceId, user_id: userId, sandbox_id: sandboxId, }) - .onConflictDoNothing(); - - const [row] = await db - .select({ - id: kiloclaw_instances.id, - userId: kiloclaw_instances.user_id, - sandboxId: kiloclaw_instances.sandbox_id, - organizationId: kiloclaw_instances.organization_id, - name: kiloclaw_instances.name, - }) - .from(kiloclaw_instances) - .where( - and( - eq(kiloclaw_instances.user_id, userId), - eq(kiloclaw_instances.sandbox_id, sandboxId), - isNull(kiloclaw_instances.destroyed_at) - ) - ) - .limit(1); + .returning(selectFields); if (!row) { - throw new Error('Failed to ensure active KiloClaw instance row'); + throw new Error('Failed to create personal instance row'); } return row; @@ -119,7 +110,7 @@ export async function markActiveInstanceDestroyed( ? and(eq(kiloclaw_instances.id, instanceId), isNull(kiloclaw_instances.destroyed_at)) : and( eq(kiloclaw_instances.user_id, userId), - eq(kiloclaw_instances.sandbox_id, sandboxIdFromUserId(userId)), + isNull(kiloclaw_instances.organization_id), isNull(kiloclaw_instances.destroyed_at) ); @@ -163,11 +154,14 @@ export async function restoreDestroyedInstance(instanceId: string): Promise { - const sandboxId = sandboxIdFromUserId(userId); - const [row] = await db .select({ id: kiloclaw_instances.id, @@ -180,7 +174,7 @@ export async function getActiveInstance(userId: string): Promise { - const sandboxId = sandboxIdFromUserId(userId); const trimmed = name?.trim() || null; if (trimmed !== null && trimmed.length > 50) { @@ -207,7 +200,7 @@ export async function renameInstance(userId: string, name: string | null): Promi .where( and( eq(kiloclaw_instances.user_id, userId), - eq(kiloclaw_instances.sandbox_id, sandboxId), + isNull(kiloclaw_instances.organization_id), isNull(kiloclaw_instances.destroyed_at) ) ); diff --git 
a/src/routers/admin-kiloclaw-instances-router.ts b/src/routers/admin-kiloclaw-instances-router.ts index 41f20fd2d..a8ac1c366 100644 --- a/src/routers/admin-kiloclaw-instances-router.ts +++ b/src/routers/admin-kiloclaw-instances-router.ts @@ -859,7 +859,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ `[admin-kiloclaw] Destroy triggered by admin ${ctx.user.id} (${ctx.user.google_user_email}) for instance ${instance.id} (user: ${instance.user_id})` ); - const destroyedRow = await markActiveInstanceDestroyed(instance.user_id); + const destroyedRow = await markActiveInstanceDestroyed(instance.user_id, instance.id); const client = new KiloClawInternalClient(); try { await client.destroy(instance.user_id, instance.id); @@ -958,7 +958,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ const errors: Array<{ userId: string; error: string }> = []; for (const instance of activeInstances) { - const destroyedRow = await markActiveInstanceDestroyed(instance.user_id); + const destroyedRow = await markActiveInstanceDestroyed(instance.user_id, instance.id); try { await client.destroy(instance.user_id, instance.id); destroyed++; From 0e2d672b7f53f7d5162d9fad046a9b4010606f54 Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 12:02:47 -0500 Subject: [PATCH 06/15] fix(kiloclaw): make getActiveInstance deterministic with ORDER BY created_at MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ORDER BY created_at ASC to getActiveInstance so LIMIT 1 always returns the oldest active row. Documents the known race window in ensureActiveInstance where concurrent callers can both insert — the consequence is a benign orphan row since all subsequent reads converge on the same (oldest) row. 
--- src/lib/kiloclaw/instance-registry.ts | 6 ++++++ src/lib/kiloclaw/kiloclaw-internal-client.ts | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/lib/kiloclaw/instance-registry.ts b/src/lib/kiloclaw/instance-registry.ts index 82a64665d..5c0a2a8a7 100644 --- a/src/lib/kiloclaw/instance-registry.ts +++ b/src/lib/kiloclaw/instance-registry.ts @@ -68,6 +68,11 @@ export async function ensureActiveInstance( } // Personal flow: return existing active row if present. + // Race note: two concurrent callers can both see no row and both insert. + // This is benign — getActiveInstance uses ORDER BY created_at ASC so all + // subsequent reads converge on the oldest row. The second row is an inert + // orphan (no DO created for it). The window is milliseconds on a user- + // initiated action already deduplicated by the frontend's useMutation. const existing = await getActiveInstance(userId); if (existing) return existing; @@ -178,6 +183,7 @@ export async function getActiveInstance(userId: string): Promise { + async getKiloCliRunStatus( + userId: string, + instanceId?: string + ): Promise { const params = new URLSearchParams({ userId }); if (instanceId) params.set('instanceId', instanceId); return this.request(`/api/platform/kilo-cli-run/status?${params.toString()}`, undefined, { From b02a1a96d51a22dc6439157df07a81eb529bb48d Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 12:44:04 -0500 Subject: [PATCH 07/15] fix(kiloclaw): try both registries on destroy when orgId is known MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Always attempt user registry cleanup on destroy. When getStatus() succeeds and reveals an orgId, also clean up the org registry. When getStatus() fails, log a warning — the org registry entry becomes stale but harmless (points to a destroyed DO with no machineId). 
--- kiloclaw/src/routes/platform.ts | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/kiloclaw/src/routes/platform.ts b/kiloclaw/src/routes/platform.ts index 4a7a9a9bb..8dd0d9e1b 100644 --- a/kiloclaw/src/routes/platform.ts +++ b/kiloclaw/src/routes/platform.ts @@ -1263,7 +1263,12 @@ platform.post('/destroy', async c => { const status = await statusStub.getStatus(); orgId = status.orgId; } catch { - // If we can't read status, proceed with destroy — registry cleanup is best-effort. + // Can't determine orgId. We'll clean up the user registry below; if the + // instance was org-owned, its org registry entry becomes stale but harmless + // (points to a destroyed DO that returns no machineId). + console.warn( + '[platform] Could not read orgId before destroy, org registry entry may be stale' + ); } } @@ -1276,14 +1281,19 @@ platform.post('/destroy', async c => { // Remove the instance from the registry (best-effort). if (instanceId) { - try { - const registryKey = orgId ? `org:${orgId}` : `user:${userId}`; - const registryStub = c.env.KILOCLAW_REGISTRY.get( - c.env.KILOCLAW_REGISTRY.idFromName(registryKey) - ); - await registryStub.destroyInstance(registryKey, instanceId); - } catch (registryErr) { - console.error('[platform] Registry destroy failed (non-fatal):', registryErr); + // Always try the user registry. If the instance was org-owned and we + // know the orgId, also clean up the org registry. 
+ const registryKeys = [`user:${userId}`]; + if (orgId) registryKeys.push(`org:${orgId}`); + for (const registryKey of registryKeys) { + try { + const registryStub = c.env.KILOCLAW_REGISTRY.get( + c.env.KILOCLAW_REGISTRY.idFromName(registryKey) + ); + await registryStub.destroyInstance(registryKey, instanceId); + } catch (registryErr) { + console.error('[platform] Registry destroy failed (non-fatal):', registryErr); + } } } From a0303f1970fe937466999909405761c27db2ebb8 Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 12:57:39 -0500 Subject: [PATCH 08/15] fix(kiloclaw): fix billing instance reassignment for instance-keyed rows getOrCreateInstanceForBilling searched for the destroyed personal instance by sandboxIdFromUserId, which doesn't match instance-keyed rows (ki_ prefix). Now finds the most recently destroyed personal instance by user_id + organization_id IS NULL without filtering by sandboxId format, matching the pattern used by getActiveInstance. --- src/routers/kiloclaw-router.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/routers/kiloclaw-router.ts b/src/routers/kiloclaw-router.ts index 48e41a8b2..2fa7da057 100644 --- a/src/routers/kiloclaw-router.ts +++ b/src/routers/kiloclaw-router.ts @@ -44,7 +44,7 @@ import { restoreDestroyedInstance, type ActiveKiloClawInstance, } from '@/lib/kiloclaw/instance-registry'; -import { sandboxIdFromUserId } from '@/lib/kiloclaw/sandbox-id'; + import { client as stripe } from '@/lib/stripe-client'; import { APP_URL } from '@/lib/constants'; import { getRewardfulReferral } from '@/lib/rewardful'; @@ -81,8 +81,8 @@ const UNSAFE_ERROR_CODES = new Set(['config_read_failed', 'config_replace_failed * exists (e.g. trial expired and personal instance was destroyed). * * When a new row is created, the subscription row linked to the user's - * destroyed personal instance (identified by sandboxIdFromUserId) is - * reassigned to the new instance_id. 
The update is scoped to that exact + * most recently destroyed personal instance is reassigned to the new + * instance_id. The update is scoped to that exact * destroyed instance row so that subscriptions on other (org or multi-) * instances are never touched and UQ_kiloclaw_subscriptions_instance is not * violated. @@ -94,20 +94,20 @@ async function getOrCreateInstanceForBilling(userId: string): Promise Date: Mon, 30 Mar 2026 14:10:04 -0500 Subject: [PATCH 09/15] fix(kiloclaw): derive access gateway token from DO sandboxId, not userId MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The access gateway's buildRedirectUrl derived the gateway token from sandboxIdFromUserId(userId), but instance-keyed DOs use sandboxIdFromInstanceId(instanceId) — a different sandboxId. This caused token_mismatch on every new instance's WebSocket auth. Fix: resolve the DO's actual sandboxId via the registry + getStatus() before deriving the token. Falls back to legacy derivation if the DO is unreachable. --- kiloclaw/src/routes/access-gateway.ts | 38 ++++++++++++++++++++------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/kiloclaw/src/routes/access-gateway.ts b/kiloclaw/src/routes/access-gateway.ts index afb24e0b2..d27e74063 100644 --- a/kiloclaw/src/routes/access-gateway.ts +++ b/kiloclaw/src/routes/access-gateway.ts @@ -6,6 +6,7 @@ import { getWorkerDb, validateAndRedeemAccessCode, findPepperByUserId } from '.. import { signKiloToken, validateKiloToken } from '../auth/jwt'; import { deriveGatewayToken } from '../auth/gateway-token'; import { sandboxIdFromUserId } from '../auth/sandbox-id'; +import type { KiloClawEnv } from '../types'; /** * Access gateway routes — unauthenticated. @@ -38,13 +39,32 @@ const BASE_STYLES = /* css */ ` * Derive the redirect URL with the gateway token hash fragment. * The token is computed server-side from the userId — never touches the client. 
*/ -async function buildRedirectUrl( - userId: string, - gatewayTokenSecret: string | undefined -): Promise { - if (!gatewayTokenSecret) return '/'; - const sandboxId = sandboxIdFromUserId(userId); - const token = await deriveGatewayToken(sandboxId, gatewayTokenSecret); +/** + * Resolve the DO's authoritative sandboxId for gateway token derivation. + * For instance-keyed DOs this is the ki_ value; for legacy DOs it's the + * userId-derived base64url value. Falls back to sandboxIdFromUserId if + * the DO can't be reached (e.g. not provisioned yet). + */ +async function resolveSandboxId(userId: string, env: KiloClawEnv): Promise { + try { + const registryKey = `user:${userId}`; + const registryStub = env.KILOCLAW_REGISTRY.get(env.KILOCLAW_REGISTRY.idFromName(registryKey)); + const entries = await registryStub.listInstances(registryKey); + if (entries.length > 0) { + const stub = env.KILOCLAW_INSTANCE.get(env.KILOCLAW_INSTANCE.idFromName(entries[0].doKey)); + const status = await stub.getStatus(); + if (status.sandboxId) return status.sandboxId; + } + } catch { + // Fall back to legacy derivation if registry/DO is unreachable + } + return sandboxIdFromUserId(userId); +} + +async function buildRedirectUrl(userId: string, env: KiloClawEnv): Promise { + if (!env.GATEWAY_TOKEN_SECRET) return '/'; + const sandboxId = await resolveSandboxId(userId, env); + const token = await deriveGatewayToken(sandboxId, env.GATEWAY_TOKEN_SECRET); return `/#token=${token}`; } @@ -210,7 +230,7 @@ async function redeemCodeAndSetCookie( maxAge: KILOCLAW_AUTH_COOKIE_MAX_AGE, }); - const redirectUrl = await buildRedirectUrl(redeemedUserId, c.env.GATEWAY_TOKEN_SECRET); + const redirectUrl = await buildRedirectUrl(redeemedUserId, c.env); return { redirectUrl }; } @@ -225,7 +245,7 @@ accessGatewayRoutes.get('/kilo-access-gateway', async c => { if (secret) { const cookie = getCookie(c, KILOCLAW_AUTH_COOKIE); if (await hasValidCookie(cookie, userId, secret, c.env.WORKER_ENV)) { - const redirectUrl = 
await buildRedirectUrl(userId, c.env.GATEWAY_TOKEN_SECRET); + const redirectUrl = await buildRedirectUrl(userId, c.env); return c.redirect(redirectUrl); } } From c8d38a0ad973a85ac49e3f553cb7cb1fff104963 Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 14:27:14 -0500 Subject: [PATCH 10/15] fix(kiloclaw): include missed instanceId threading from stash Three files with instanceId threading were in the git stash during rebase and not included in the lifecycle threading commit: - billing-lifecycle-cron.ts: pass instance_id to client.stop/destroy - instance-lifecycle.ts: pass targetInstanceId to client.start - admin-router.ts: pass activeInstance.id to client.start --- src/lib/kiloclaw/billing-lifecycle-cron.ts | 12 ++++++++---- src/lib/kiloclaw/instance-lifecycle.ts | 2 +- src/routers/admin-router.ts | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/lib/kiloclaw/billing-lifecycle-cron.ts b/src/lib/kiloclaw/billing-lifecycle-cron.ts index 21e2ef926..e03a60b26 100644 --- a/src/lib/kiloclaw/billing-lifecycle-cron.ts +++ b/src/lib/kiloclaw/billing-lifecycle-cron.ts @@ -674,6 +674,7 @@ export async function runKiloClawBillingLifecycleCron( .select({ user_id: kiloclaw_subscriptions.user_id, email: kilocode_users.google_user_email, + instance_id: kiloclaw_subscriptions.instance_id, }) .from(kiloclaw_subscriptions) .innerJoin(kilocode_users, eq(kiloclaw_subscriptions.user_id, kilocode_users.id)) @@ -691,7 +692,7 @@ export async function runKiloClawBillingLifecycleCron( // transition MUST proceed regardless, so transient outages don't // leave expired accounts active. try { - await client.stop(row.user_id); + await client.stop(row.user_id, row.instance_id ??
undefined); } catch (stopError) { const isExpected = stopError instanceof KiloClawApiError && @@ -748,6 +749,7 @@ export async function runKiloClawBillingLifecycleCron( .select({ user_id: kiloclaw_subscriptions.user_id, email: kilocode_users.google_user_email, + instance_id: kiloclaw_subscriptions.instance_id, }) .from(kiloclaw_subscriptions) .innerJoin(kilocode_users, eq(kiloclaw_subscriptions.user_id, kilocode_users.id)) @@ -762,7 +764,7 @@ export async function runKiloClawBillingLifecycleCron( for (const row of expiredSubscriptions) { try { try { - await client.stop(row.user_id); + await client.stop(row.user_id, row.instance_id ?? undefined); } catch (stopError) { const isExpected = stopError instanceof KiloClawApiError && @@ -862,6 +864,7 @@ export async function runKiloClawBillingLifecycleCron( .select({ user_id: kiloclaw_subscriptions.user_id, email: kilocode_users.google_user_email, + instance_id: kiloclaw_subscriptions.instance_id, }) .from(kiloclaw_subscriptions) .innerJoin(kilocode_users, eq(kiloclaw_subscriptions.user_id, kilocode_users.id)) @@ -875,7 +878,7 @@ export async function runKiloClawBillingLifecycleCron( for (const row of destructionCandidates) { try { try { - await client.destroy(row.user_id); + await client.destroy(row.user_id, row.instance_id ?? undefined); } catch (destroyError) { const isExpected = destroyError instanceof KiloClawApiError && @@ -944,6 +947,7 @@ export async function runKiloClawBillingLifecycleCron( .select({ user_id: kiloclaw_subscriptions.user_id, email: kilocode_users.google_user_email, + instance_id: kiloclaw_subscriptions.instance_id, }) .from(kiloclaw_subscriptions) .innerJoin(kilocode_users, eq(kiloclaw_subscriptions.user_id, kilocode_users.id)) @@ -958,7 +962,7 @@ export async function runKiloClawBillingLifecycleCron( for (const row of pastDueRows) { try { try { - await client.stop(row.user_id); + await client.stop(row.user_id, row.instance_id ?? 
undefined); } catch (stopError) { const isExpected = stopError instanceof KiloClawApiError && diff --git a/src/lib/kiloclaw/instance-lifecycle.ts b/src/lib/kiloclaw/instance-lifecycle.ts index 883799610..07807ab85 100644 --- a/src/lib/kiloclaw/instance-lifecycle.ts +++ b/src/lib/kiloclaw/instance-lifecycle.ts @@ -48,7 +48,7 @@ export async function autoResumeIfSuspended( if (targetInstanceId) { try { const client = new KiloClawInternalClient(); - await client.start(kiloUserId); + await client.start(kiloUserId, targetInstanceId); } catch (startError) { logError('Failed to auto-resume instance', { user_id: kiloUserId, diff --git a/src/routers/admin-router.ts b/src/routers/admin-router.ts index adc18a960..1a82b633c 100644 --- a/src/routers/admin-router.ts +++ b/src/routers/admin-router.ts @@ -680,7 +680,7 @@ export const adminRouter = createTRPCRouter({ if (activeInstance) { try { const client = new KiloClawInternalClient(); - await client.start(input.userId); + await client.start(input.userId, activeInstance.id); } catch { // Best effort — instance will be startable by the user from the dashboard } From 843d052a45f2e22af97e8bd528e0f5ba29321140 Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 16:19:03 -0500 Subject: [PATCH 11/15] fix(kiloclaw): don't pass instanceId to worker for legacy instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy instances have DOs at idFromName(userId), not idFromName(instanceId). Passing instance.id as instanceId to the worker causes it to resolve an empty DO — breaking status, start, stop, destroy, and all lifecycle operations for existing instances after upgrading to this branch. Add workerInstanceId() helper that checks the row's sandboxId: only returns instance.id for ki_-prefixed (instance-keyed) rows. Returns undefined for legacy base64url rows so the worker falls back to idFromName(userId). Applied across all callers: tRPC router, admin router, and admin instances router. 
--- src/lib/kiloclaw/instance-registry.ts | 29 ++++++ .../admin-kiloclaw-instances-router.ts | 45 ++++----- src/routers/admin-router.ts | 8 +- src/routers/kiloclaw-router.ts | 91 +++++++++++++------ 4 files changed, 119 insertions(+), 54 deletions(-) diff --git a/src/lib/kiloclaw/instance-registry.ts b/src/lib/kiloclaw/instance-registry.ts index 5c0a2a8a7..5b71425a1 100644 --- a/src/lib/kiloclaw/instance-registry.ts +++ b/src/lib/kiloclaw/instance-registry.ts @@ -13,6 +13,35 @@ export type ActiveKiloClawInstance = { name: string | null; }; +/** + * Returns true if this instance row uses the instance-keyed identity scheme + * (ki_ sandboxId prefix, DO keyed by instanceId). Legacy rows have + * userId-derived base64url sandboxIds and DOs keyed by userId. + */ +export function isInstanceKeyed(instance: ActiveKiloClawInstance): boolean { + return instance.sandboxId.startsWith('ki_'); +} + +/** + * Returns the instanceId to pass to the worker for DO routing, or undefined + * for legacy instances (where the DO is keyed by userId, not instanceId). + * + * This is the bridge between the Postgres row identity and the worker's + * instanceStubFactory. Legacy rows must NOT pass instanceId because + * their DO lives at idFromName(userId), not idFromName(instanceId). + * + * Accepts either an ActiveKiloClawInstance (camelCase) or a raw DB row + * with snake_case fields — checks for both `sandboxId` and `sandbox_id`. + */ +export function workerInstanceId( + instance: { id: string; sandboxId?: string; sandbox_id?: string } | null | undefined +): string | undefined { + if (!instance) return undefined; + const sandboxId = instance.sandboxId ?? instance.sandbox_id; + if (!sandboxId) return undefined; + return sandboxId.startsWith('ki_') ? instance.id : undefined; +} + type EnsureActiveInstanceOpts = { /** Organization ID. When provided, creates an org-owned instance. 
*/ orgId?: string; diff --git a/src/routers/admin-kiloclaw-instances-router.ts b/src/routers/admin-kiloclaw-instances-router.ts index a8ac1c366..90acb35a4 100644 --- a/src/routers/admin-kiloclaw-instances-router.ts +++ b/src/routers/admin-kiloclaw-instances-router.ts @@ -13,6 +13,7 @@ import { getActiveInstance, markActiveInstanceDestroyed, restoreDestroyedInstance, + workerInstanceId, } from '@/lib/kiloclaw/instance-registry'; import { flyAppNameFromUserId } from '@/lib/kiloclaw/fly-app-name'; import { @@ -194,7 +195,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const client = new KiloClawInternalClient(); - workerStatus = await client.getDebugStatus(instance.user_id, instance.id); + workerStatus = await client.getDebugStatus(instance.user_id, workerInstanceId(instance)); } catch (err) { workerStatusError = err instanceof KiloClawApiError @@ -418,7 +419,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.listVolumeSnapshots(input.userId, instance?.id); + return await client.listVolumeSnapshots(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to fetch volume snapshots for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -430,7 +431,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.getControllerVersion(input.userId, instance?.id); + return await client.getControllerVersion(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to fetch controller version for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -442,7 +443,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await 
getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.getGatewayStatus(input.userId, instance?.id); + return await client.getGatewayStatus(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to fetch gateway status for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage, { @@ -460,7 +461,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.startGateway(input.userId, instance?.id); + return await client.startGateway(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to start gateway for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -472,7 +473,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.stopGateway(input.userId, instance?.id); + return await client.stopGateway(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to stop gateway for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -484,7 +485,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.restartGatewayProcess(input.userId, instance?.id); + return await client.restartGatewayProcess(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to restart gateway for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -496,7 +497,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await 
client.runDoctor(input.userId, instance?.id); + return await client.runDoctor(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to run doctor for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -620,7 +621,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.restoreConfig(input.userId, undefined, instance?.id); + return await client.restoreConfig(input.userId, undefined, workerInstanceId(instance)); } catch (err) { console.error('Failed to restore config for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -633,7 +634,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - const result = await client.getFileTree(input.userId, instance?.id); + const result = await client.getFileTree(input.userId, workerInstanceId(instance)); return result.tree; } catch (err) { throwKiloclawAdminError(err, 'Failed to fetch file tree'); @@ -646,7 +647,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.readFile(input.userId, input.path, instance?.id); + return await client.readFile(input.userId, input.path, workerInstanceId(instance)); } catch (err) { throwKiloclawAdminError(err, 'Failed to read file'); } @@ -670,7 +671,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ input.path, input.content, input.etag, - instance?.id + workerInstanceId(instance) ); } catch (err) { // Propagate file_etag_conflict with UpstreamApiError so the UI can detect it @@ -693,7 +694,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await 
getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.start(input.userId, instance?.id, { skipCooldown: true }); + return await client.start(input.userId, workerInstanceId(instance), { skipCooldown: true }); } catch (err) { console.error('Failed to start machine for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -705,7 +706,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.forceRetryRecovery(input.userId, instance?.id); + return await client.forceRetryRecovery(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to retry recovery for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -717,7 +718,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.stop(input.userId, instance?.id); + return await client.stop(input.userId, workerInstanceId(instance)); } catch (err) { console.error('Failed to stop machine for user:', input.userId, err); throwKiloclawAdminError(err, fallbackMessage); @@ -789,7 +790,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ // Verify the appName/machineId match the DO's actual state let status: Awaited>; try { - status = await client.getDebugStatus(input.userId, instance?.id); + status = await client.getDebugStatus(input.userId, workerInstanceId(instance)); } catch (err) { throwKiloclawAdminError(err, 'Failed to verify machine state before destroy'); } @@ -806,7 +807,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ input.userId, input.appName, input.machineId, - instance?.id + workerInstanceId(instance) ); try { @@ -862,7 +863,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ 
const destroyedRow = await markActiveInstanceDestroyed(instance.user_id, instance.id); const client = new KiloClawInternalClient(); try { - await client.destroy(instance.user_id, instance.id); + await client.destroy(instance.user_id, workerInstanceId(instance)); } catch (error) { if (destroyedRow) { await restoreDestroyedInstance(destroyedRow.id); @@ -927,7 +928,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ try { const instance = await getActiveInstance(input.userId); const client = new KiloClawInternalClient(); - return await client.listCandidateVolumes(input.userId, instance?.id); + return await client.listCandidateVolumes(input.userId, workerInstanceId(instance)); } catch (err) { throwKiloclawAdminError(err, 'Failed to list candidate volumes'); } @@ -960,7 +961,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ for (const instance of activeInstances) { const destroyedRow = await markActiveInstanceDestroyed(instance.user_id, instance.id); try { - await client.destroy(instance.user_id, instance.id); + await client.destroy(instance.user_id, workerInstanceId(instance)); destroyed++; } catch (err) { if (destroyedRow) { @@ -997,7 +998,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ input.userId, input.newVolumeId, input.reason, - instance?.id + workerInstanceId(instance) ); try { @@ -1044,7 +1045,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ const result = await client.restoreVolumeFromSnapshot( input.userId, input.snapshotId, - instance?.id + workerInstanceId(instance) ); try { diff --git a/src/routers/admin-router.ts b/src/routers/admin-router.ts index 1a82b633c..937365f28 100644 --- a/src/routers/admin-router.ts +++ b/src/routers/admin-router.ts @@ -33,6 +33,7 @@ import { adminGastownRouter } from '@/routers/admin/gastown-router'; import { adminWebhookTriggersRouter } from '@/routers/admin-webhook-triggers-router'; import { adminAlertingRouter } from '@/routers/admin-alerting-router'; 
import { adminBotRequestsRouter } from '@/routers/admin-bot-requests-router'; +import { workerInstanceId } from '@/lib/kiloclaw/instance-registry'; import * as z from 'zod'; import { eq, and, ne, or, ilike, desc, asc, sql, isNull, inArray } from 'drizzle-orm'; import { findUsersByIds, findUserById } from '@/lib/user'; @@ -667,7 +668,10 @@ export const adminRouter = createTRPCRouter({ // For resets, attempt to start the instance (best effort, outside transaction) if (isReset) { const [activeInstance] = await db - .select({ id: kiloclaw_instances.id }) + .select({ + id: kiloclaw_instances.id, + sandbox_id: kiloclaw_instances.sandbox_id, + }) .from(kiloclaw_instances) .where( and( @@ -680,7 +684,7 @@ export const adminRouter = createTRPCRouter({ if (activeInstance) { try { const client = new KiloClawInternalClient(); - await client.start(input.userId, activeInstance.id); + await client.start(input.userId, workerInstanceId(activeInstance)); } catch { // Best effort — instance will be startable by the user from the dashboard } diff --git a/src/routers/kiloclaw-router.ts b/src/routers/kiloclaw-router.ts index 2fa7da057..677e1a41e 100644 --- a/src/routers/kiloclaw-router.ts +++ b/src/routers/kiloclaw-router.ts @@ -42,6 +42,7 @@ import { markInstanceDestroyedById, renameInstance, restoreDestroyedInstance, + workerInstanceId, type ActiveKiloClawInstance, } from '@/lib/kiloclaw/instance-registry'; @@ -347,7 +348,7 @@ async function provisionInstance( kilocodeDefaultModel: input.kilocodeDefaultModel ?? undefined, pinnedImageTag, }, - { instanceId: instanceRow.id } + workerInstanceId(instanceRow) ? { instanceId: instanceRow.id } : undefined ); } catch (error) { // Only clean up the exact row this attempt created. 
Target by primary @@ -383,7 +384,7 @@ async function patchConfig( kilocodeApiKey, kilocodeApiKeyExpiresAt, }, - instance?.id + workerInstanceId(instance) ); return sanitizeKiloCodeConfigResponse(response); @@ -591,7 +592,7 @@ export const kiloclawRouter = createTRPCRouter({ getStatus: baseProcedure.query(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const status = await client.getStatus(ctx.user.id, instance?.id); + const status = await client.getStatus(ctx.user.id, workerInstanceId(instance)); const workerUrl = KILOCLAW_API_URL || 'https://claw.kilo.ai'; return { @@ -617,7 +618,7 @@ export const kiloclawRouter = createTRPCRouter({ getStreamChatCredentials: clawAccessProcedure.query(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.getStreamChatCredentials(ctx.user.id, instance?.id); + return client.getStreamChatCredentials(ctx.user.id, workerInstanceId(instance)); }), sendChatMessage: clawAccessProcedure @@ -686,7 +687,7 @@ export const kiloclawRouter = createTRPCRouter({ start: clawAccessProcedure.mutation(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const result = await client.start(ctx.user.id, instance?.id); + const result = await client.start(ctx.user.id, workerInstanceId(instance)); // /api/platform/start always returns { ok: true } regardless of whether // the machine transitioned state, so this may fire for no-op requests. // The UI only enables Start when isStartable is true, so false fires are rare. 
@@ -701,7 +702,7 @@ export const kiloclawRouter = createTRPCRouter({ stop: clawAccessProcedure.mutation(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.stop(ctx.user.id, instance?.id); + return client.stop(ctx.user.id, workerInstanceId(instance)); }), destroy: baseProcedure.mutation(async ({ ctx }) => { @@ -709,7 +710,7 @@ export const kiloclawRouter = createTRPCRouter({ const client = new KiloClawInternalClient(); let result; try { - result = await client.destroy(ctx.user.id, destroyedRow?.id); + result = await client.destroy(ctx.user.id, workerInstanceId(destroyedRow)); } catch (error) { if (destroyedRow) { await restoreDestroyedInstance(destroyedRow.id); @@ -803,7 +804,7 @@ export const kiloclawRouter = createTRPCRouter({ return client.patchChannels( ctx.user.id, { channels: buildWorkerChannelsPatch(input) }, - instance?.id + workerInstanceId(instance) ); }), @@ -812,7 +813,7 @@ export const kiloclawRouter = createTRPCRouter({ .mutation(async ({ ctx, input }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.patchExecPreset(ctx.user.id, input, instance?.id); + return client.patchExecPreset(ctx.user.id, input, workerInstanceId(instance)); }), /** @@ -900,7 +901,7 @@ export const kiloclawRouter = createTRPCRouter({ return await client.patchSecrets( ctx.user.id, { secrets: encryptedPatch, meta: input.meta }, - instance?.id + workerInstanceId(instance) ); } catch (err) { if (err instanceof KiloClawApiError && err.statusCode >= 400 && err.statusCode < 500) { @@ -1018,7 +1019,7 @@ export const kiloclawRouter = createTRPCRouter({ .query(async ({ ctx, input }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.listPairingRequests(ctx.user.id, input?.refresh, instance?.id); + return client.listPairingRequests(ctx.user.id, input?.refresh, 
workerInstanceId(instance)); }), approvePairingRequest: clawAccessProcedure @@ -1026,7 +1027,12 @@ export const kiloclawRouter = createTRPCRouter({ .mutation(async ({ ctx, input }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.approvePairingRequest(ctx.user.id, input.channel, input.code, instance?.id); + return client.approvePairingRequest( + ctx.user.id, + input.channel, + input.code, + workerInstanceId(instance) + ); }), listDevicePairingRequests: clawAccessProcedure @@ -1034,7 +1040,11 @@ export const kiloclawRouter = createTRPCRouter({ .query(async ({ ctx, input }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.listDevicePairingRequests(ctx.user.id, input?.refresh, instance?.id); + return client.listDevicePairingRequests( + ctx.user.id, + input?.refresh, + workerInstanceId(instance) + ); }), approveDevicePairingRequest: clawAccessProcedure @@ -1042,14 +1052,18 @@ export const kiloclawRouter = createTRPCRouter({ .mutation(async ({ ctx, input }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.approveDevicePairingRequest(ctx.user.id, input.requestId, instance?.id); + return client.approveDevicePairingRequest( + ctx.user.id, + input.requestId, + workerInstanceId(instance) + ); }), gatewayStatus: baseProcedure.query(async ({ ctx }) => { try { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return await client.getGatewayStatus(ctx.user.id, instance?.id); + return await client.getGatewayStatus(ctx.user.id, workerInstanceId(instance)); } catch (err) { console.error('Failed to fetch gateway status for user:', ctx.user.id, err); if (err instanceof KiloClawApiError && (err.statusCode === 404 || err.statusCode === 409)) { @@ -1069,7 +1083,7 @@ export const kiloclawRouter = createTRPCRouter({ try { 
const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return await client.getGatewayReady(ctx.user.id, instance?.id); + return await client.getGatewayReady(ctx.user.id, workerInstanceId(instance)); } catch (err) { console.error('[gatewayReady] error for user:', ctx.user.id, err); if (err instanceof KiloClawApiError && (err.statusCode === 404 || err.statusCode === 409)) { @@ -1088,19 +1102,19 @@ export const kiloclawRouter = createTRPCRouter({ controllerVersion: baseProcedure.query(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.getControllerVersion(ctx.user.id, instance?.id); + return client.getControllerVersion(ctx.user.id, workerInstanceId(instance)); }), restartOpenClaw: clawAccessProcedure.mutation(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.restartGatewayProcess(ctx.user.id, instance?.id); + return client.restartGatewayProcess(ctx.user.id, workerInstanceId(instance)); }), runDoctor: clawAccessProcedure.mutation(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.runDoctor(ctx.user.id, instance?.id); + return client.runDoctor(ctx.user.id, workerInstanceId(instance)); }), // ── Kilo CLI Run ────────────────────────────────────────────────── @@ -1110,7 +1124,11 @@ export const kiloclawRouter = createTRPCRouter({ .mutation(async ({ ctx, input }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const result = await client.startKiloCliRun(ctx.user.id, input.prompt, instance?.id); + const result = await client.startKiloCliRun( + ctx.user.id, + input.prompt, + workerInstanceId(instance) + ); // Persist the run in the database and return its ID const [row] = await db @@ -1169,7 +1187,10 @@ export const 
kiloclawRouter = createTRPCRouter({ // Run is still active — poll the controller for live output. const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const controllerStatus = await client.getKiloCliRunStatus(ctx.user.id, instance?.id); + const controllerStatus = await client.getKiloCliRunStatus( + ctx.user.id, + workerInstanceId(instance) + ); // If controller reports the run finished, persist to the DB row. if ( @@ -1205,7 +1226,7 @@ export const kiloclawRouter = createTRPCRouter({ .mutation(async ({ ctx, input }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const result = await client.cancelKiloCliRun(ctx.user.id, instance?.id); + const result = await client.cancelKiloCliRun(ctx.user.id, workerInstanceId(instance)); // Mark the specific run as cancelled in DB if (result.ok) { @@ -1244,7 +1265,7 @@ export const kiloclawRouter = createTRPCRouter({ restoreConfig: clawAccessProcedure.mutation(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.restoreConfig(ctx.user.id, undefined, instance?.id); + return client.restoreConfig(ctx.user.id, undefined, workerInstanceId(instance)); }), getGoogleSetupCommand: clawAccessProcedure.query(({ ctx }) => { @@ -1266,7 +1287,7 @@ export const kiloclawRouter = createTRPCRouter({ disconnectGoogle: clawAccessProcedure.mutation(async ({ ctx }) => { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return client.clearGoogleCredentials(ctx.user.id, instance?.id); + return client.clearGoogleCredentials(ctx.user.id, workerInstanceId(instance)); }), setGmailNotifications: baseProcedure @@ -1276,9 +1297,9 @@ export const kiloclawRouter = createTRPCRouter({ const client = new KiloClawInternalClient(); try { if (input.enabled) { - return await client.enableGmailNotifications(ctx.user.id, 
instance?.id); + return await client.enableGmailNotifications(ctx.user.id, workerInstanceId(instance)); } - return await client.disableGmailNotifications(ctx.user.id, instance?.id); + return await client.disableGmailNotifications(ctx.user.id, workerInstanceId(instance)); } catch (err) { if (err instanceof KiloClawApiError && err.statusCode >= 400 && err.statusCode < 500) { let message = `Failed to update Gmail notifications (${err.statusCode})`; @@ -1576,7 +1597,7 @@ export const kiloclawRouter = createTRPCRouter({ try { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - const result = await client.getFileTree(ctx.user.id, instance?.id); + const result = await client.getFileTree(ctx.user.id, workerInstanceId(instance)); return result.tree; } catch (err) { handleFileOperationError(err, 'fetch file tree'); @@ -1589,7 +1610,7 @@ export const kiloclawRouter = createTRPCRouter({ try { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return await client.readFile(ctx.user.id, input.path, instance?.id); + return await client.readFile(ctx.user.id, input.path, workerInstanceId(instance)); } catch (err) { handleFileOperationError(err, 'read file'); } @@ -1628,7 +1649,13 @@ export const kiloclawRouter = createTRPCRouter({ content = JSON.stringify(userConfig, null, 2); } - return await client.writeFile(ctx.user.id, input.path, content, input.etag, instance?.id); + return await client.writeFile( + ctx.user.id, + input.path, + content, + input.etag, + workerInstanceId(instance) + ); } catch (err) { handleFileOperationError(err, 'write file'); } @@ -1640,7 +1667,11 @@ export const kiloclawRouter = createTRPCRouter({ try { const instance = await getActiveInstance(ctx.user.id); const client = new KiloClawInternalClient(); - return await client.patchOpenclawConfig(ctx.user.id, input.patch, instance?.id); + return await client.patchOpenclawConfig( + ctx.user.id, + input.patch, + 
workerInstanceId(instance) + ); } catch (err) { handleFileOperationError(err, 'patch openclaw config'); } From 82d951e78649910717a820dbedb0085a3c90cda6 Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 16:32:45 -0500 Subject: [PATCH 12/15] fix(kiloclaw): fix access-gateway fallback and admin destroy missing sandbox_id Two fixes: 1. access-gateway resolveSandboxId: when getStatus() throws but the registry entry exists, derive sandboxId from the entry's doKey (sandboxIdFromInstanceId for UUID doKeys) instead of falling back to sandboxIdFromUserId. Prevents wrong gateway token for instance-keyed DOs when the DO is temporarily unreachable. 2. Admin destroy and devNukeAll queries now select sandbox_id so workerInstanceId() can check the ki_ prefix. Without it, the function always returned undefined, routing to the legacy userId-keyed DO instead of the instance-keyed one. --- kiloclaw/src/routes/access-gateway.ts | 20 +++++++++++++++---- .../admin-kiloclaw-instances-router.ts | 2 ++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/kiloclaw/src/routes/access-gateway.ts b/kiloclaw/src/routes/access-gateway.ts index d27e74063..57eff90cc 100644 --- a/kiloclaw/src/routes/access-gateway.ts +++ b/kiloclaw/src/routes/access-gateway.ts @@ -6,6 +6,7 @@ import { getWorkerDb, validateAndRedeemAccessCode, findPepperByUserId } from '.. 
import { signKiloToken, validateKiloToken } from '../auth/jwt'; import { deriveGatewayToken } from '../auth/gateway-token'; import { sandboxIdFromUserId } from '../auth/sandbox-id'; +import { sandboxIdFromInstanceId, isValidInstanceId } from '@kilocode/worker-utils/instance-id'; import type { KiloClawEnv } from '../types'; /** @@ -51,12 +52,23 @@ async function resolveSandboxId(userId: string, env: KiloClawEnv): Promise 0) { - const stub = env.KILOCLAW_INSTANCE.get(env.KILOCLAW_INSTANCE.idFromName(entries[0].doKey)); - const status = await stub.getStatus(); - if (status.sandboxId) return status.sandboxId; + const entry = entries[0]; + // Try the DO's authoritative sandboxId first. + try { + const stub = env.KILOCLAW_INSTANCE.get(env.KILOCLAW_INSTANCE.idFromName(entry.doKey)); + const status = await stub.getStatus(); + if (status.sandboxId) return status.sandboxId; + } catch { + // DO unreachable — derive from the registry entry's doKey. + // If doKey is a UUID (instance-keyed), derive ki_ sandboxId from it. + // If doKey is a userId (legacy), fall through to sandboxIdFromUserId. 
+ if (isValidInstanceId(entry.doKey)) { + return sandboxIdFromInstanceId(entry.doKey); + } + } } } catch { - // Fall back to legacy derivation if registry/DO is unreachable + // Registry unreachable — fall back to legacy derivation } return sandboxIdFromUserId(userId); } diff --git a/src/routers/admin-kiloclaw-instances-router.ts b/src/routers/admin-kiloclaw-instances-router.ts index 90acb35a4..1213239cd 100644 --- a/src/routers/admin-kiloclaw-instances-router.ts +++ b/src/routers/admin-kiloclaw-instances-router.ts @@ -842,6 +842,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ .select({ id: kiloclaw_instances.id, user_id: kiloclaw_instances.user_id, + sandbox_id: kiloclaw_instances.sandbox_id, destroyed_at: kiloclaw_instances.destroyed_at, }) .from(kiloclaw_instances) @@ -946,6 +947,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ .select({ id: kiloclaw_instances.id, user_id: kiloclaw_instances.user_id, + sandbox_id: kiloclaw_instances.sandbox_id, }) .from(kiloclaw_instances) .where(isNull(kiloclaw_instances.destroyed_at)); From 72e49b40f62cd38a432011900b7847d817384f44 Mon Sep 17 00:00:00 2001 From: syn Date: Mon, 30 Mar 2026 17:00:35 -0500 Subject: [PATCH 13/15] fix(kiloclaw): clean up registry entries on legacy destroy (no instanceId) The destroy route only cleaned up registry entries when instanceId was provided. Legacy destroys (no instanceId, DO keyed by userId) skipped the cleanup entirely, leaving stale registry entries that point to destroyed DOs. On re-provision, the proxy picks the stale entry (oldest) instead of the new instance's entry. Fix: when no instanceId is provided, list the user registry's entries and find the one with doKey=userId (the legacy entry), then destroy it by its instanceId from the registry. 
--- kiloclaw/src/routes/platform.ts | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/kiloclaw/src/routes/platform.ts b/kiloclaw/src/routes/platform.ts index 8dd0d9e1b..38c7c1b29 100644 --- a/kiloclaw/src/routes/platform.ts +++ b/kiloclaw/src/routes/platform.ts @@ -1280,21 +1280,30 @@ platform.post('/destroy', async c => { ); // Remove the instance from the registry (best-effort). - if (instanceId) { - // Always try the user registry. If the instance was org-owned and we - // know the orgId, also clean up the org registry. + // When instanceId is provided, destroy by instanceId directly. + // When absent (legacy destroy), find the entry with doKey=userId + // and destroy it by its instanceId from the registry. + try { const registryKeys = [`user:${userId}`]; if (orgId) registryKeys.push(`org:${orgId}`); for (const registryKey of registryKeys) { - try { - const registryStub = c.env.KILOCLAW_REGISTRY.get( - c.env.KILOCLAW_REGISTRY.idFromName(registryKey) - ); + const registryStub = c.env.KILOCLAW_REGISTRY.get( + c.env.KILOCLAW_REGISTRY.idFromName(registryKey) + ); + if (instanceId) { await registryStub.destroyInstance(registryKey, instanceId); - } catch (registryErr) { - console.error('[platform] Registry destroy failed (non-fatal):', registryErr); + } else { + // Legacy destroy (no instanceId): the DO was keyed by userId, + // so find the registry entry with doKey=userId. 
+ const entries = await registryStub.listInstances(registryKey); + const legacyEntry = entries.find(e => e.doKey === userId); + if (legacyEntry) { + await registryStub.destroyInstance(registryKey, legacyEntry.instanceId); + } } } + } catch (registryErr) { + console.error('[platform] Registry destroy failed (non-fatal):', registryErr); } return c.json({ ok: true }); From 94bc618ffe238cf0cc2121bdc73e8e90b5ba2236 Mon Sep 17 00:00:00 2001 From: syn Date: Tue, 31 Mar 2026 08:29:36 -0500 Subject: [PATCH 14/15] feat(kiloclaw): registry lifecycle hardening + admin observability Registry lifecycle: - Instance DO destroy() now cleans up registry entries on finalization, covering alarm-initiated destroys that bypass the platform route - Production logging on registry create/destroy in platform routes Admin observability: - New GET /api/platform/registry-entries endpoint returns all entries (including destroyed) for a user's registry - Admin instance detail page shows Registry Status card with all entries, doKey, timestamps, and current/active badges - orgId added to PlatformDebugStatusResponse and shown in admin UI Known gap documented: GDPR soft-delete does not clean up registry or Fly resources (deferred to follow-up PR). 
--- .../kiloclaw-instance/index.ts | 38 +++++++++ .../src/durable-objects/kiloclaw-registry.ts | 15 ++++ kiloclaw/src/routes/platform.ts | 37 +++++++++ .../KiloclawInstanceDetail.tsx | 82 +++++++++++++++++++ src/lib/kiloclaw/kiloclaw-internal-client.ts | 8 ++ src/lib/kiloclaw/types.ts | 14 ++++ .../admin-kiloclaw-instances-router.ts | 7 ++ 7 files changed, 201 insertions(+) diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts index 3030febdf..0bf3c89b4 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts @@ -1306,6 +1306,11 @@ export class KiloClawInstance extends DurableObject { await tryDeleteMachine(flyConfig, this.ctx, this.s, destroyRctx); await tryDeleteVolume(flyConfig, this.ctx, this.s, destroyRctx); + // Capture identity before finalization wipes state + const preDestroyUserId = this.s.userId; + const preDestroyOrgId = this.s.orgId; + const preDestroySandboxId = this.s.sandboxId; + const finalized = await finalizeDestroyIfComplete( this.ctx, this.s, @@ -1313,6 +1318,39 @@ export class KiloClawInstance extends DurableObject { (userId, sandboxId) => markDestroyedInPostgresHelper(this.env, this.ctx, this.s, userId, sandboxId) ); + + // Clean up registry entry on finalization. This covers both platform-initiated + // and alarm-initiated destroys. The platform route's registry cleanup is + // redundant but harmless (destroyInstance is idempotent on already-destroyed entries). + if (finalized.finalized && preDestroyUserId && preDestroySandboxId) { + try { + const registryInstanceId = isInstanceKeyedSandboxId(preDestroySandboxId) + ? 
instanceIdFromSandboxId(preDestroySandboxId) + : null; + + const registryKeys = [`user:${preDestroyUserId}`]; + if (preDestroyOrgId) registryKeys.push(`org:${preDestroyOrgId}`); + + for (const registryKey of registryKeys) { + const registryStub = this.env.KILOCLAW_REGISTRY.get( + this.env.KILOCLAW_REGISTRY.idFromName(registryKey) + ); + if (registryInstanceId) { + await registryStub.destroyInstance(registryKey, registryInstanceId); + } else { + // Legacy: find entry by doKey=userId + const entries = await registryStub.listInstances(registryKey); + const legacyEntry = entries.find(e => e.doKey === preDestroyUserId); + if (legacyEntry) { + await registryStub.destroyInstance(registryKey, legacyEntry.instanceId); + } + } + } + } catch (registryErr) { + console.error('[DO] Registry cleanup on finalization failed (non-fatal):', registryErr); + } + } + if (!finalized.finalized) { doWarn(this.s, 'Destroy incomplete, alarm will retry', { pendingMachineId: this.s.pendingDestroyMachineId, diff --git a/kiloclaw/src/durable-objects/kiloclaw-registry.ts b/kiloclaw/src/durable-objects/kiloclaw-registry.ts index 4d1c0ec47..8ed2f3d34 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-registry.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-registry.ts @@ -98,6 +98,21 @@ export class KiloClawRegistry extends DurableObject { .map(rowToEntry); } + /** List all registry entries including destroyed ones (for admin inspection). 
*/ + async listAllInstances(ownerKey: string): Promise { + await this.ensureOwnerKey(ownerKey); + + if (!this.migrated) { + const now = Date.now(); + if (now - this.lastMigrationAttempt >= KiloClawRegistry.MIGRATION_RETRY_COOLDOWN_MS) { + this.lastMigrationAttempt = now; + await this.lazyMigrate(); + } + } + + return this.db.select().from(registryInstances).all().map(rowToEntry); + } + async createInstance( ownerKey: string, assignedUserId: string, diff --git a/kiloclaw/src/routes/platform.ts b/kiloclaw/src/routes/platform.ts index 38c7c1b29..374f5c6f6 100644 --- a/kiloclaw/src/routes/platform.ts +++ b/kiloclaw/src/routes/platform.ts @@ -343,6 +343,11 @@ platform.post('/provision', async c => { // doKey = instanceId: all new provisions create DOs keyed by instanceId. // For lazy-migrated legacy instances, doKey = userId (set in lazyMigrate). await registryStub.createInstance(registryKey, userId, instanceId, instanceId); + console.log('[platform] Registry entry created:', { + registryKey, + instanceId, + doKey: instanceId, + }); } catch (registryErr) { console.error('[platform] Registry create failed (non-fatal):', registryErr); } @@ -1283,6 +1288,7 @@ platform.post('/destroy', async c => { // When instanceId is provided, destroy by instanceId directly. // When absent (legacy destroy), find the entry with doKey=userId // and destroy it by its instanceId from the registry. + // Note: The Instance DO also cleans up on finalization (belt-and-suspenders). try { const registryKeys = [`user:${userId}`]; if (orgId) registryKeys.push(`org:${orgId}`); @@ -1292,6 +1298,7 @@ platform.post('/destroy', async c => { ); if (instanceId) { await registryStub.destroyInstance(registryKey, instanceId); + console.log('[platform] Registry entry destroyed:', { registryKey, instanceId }); } else { // Legacy destroy (no instanceId): the DO was keyed by userId, // so find the registry entry with doKey=userId. 
@@ -1299,6 +1306,17 @@ platform.post('/destroy', async c => { const legacyEntry = entries.find(e => e.doKey === userId); if (legacyEntry) { await registryStub.destroyInstance(registryKey, legacyEntry.instanceId); + console.log('[platform] Registry entry destroyed (legacy):', { + registryKey, + instanceId: legacyEntry.instanceId, + doKey: userId, + }); + } else { + console.log('[platform] No registry entry found for legacy destroy:', { + registryKey, + doKey: userId, + entriesCount: entries.length, + }); } } } @@ -1429,6 +1447,25 @@ platform.get('/debug-status', async c => { } }); +// GET /api/platform/registry-entries?userId=... +// Returns all registry entries (including destroyed) for admin inspection. +platform.get('/registry-entries', async c => { + const userId = setValidatedQueryUserId(c); + if (!userId) return c.json({ error: 'userId query parameter is required' }, 400); + + try { + const registryKey = `user:${userId}`; + const registryStub = c.env.KILOCLAW_REGISTRY.get( + c.env.KILOCLAW_REGISTRY.idFromName(registryKey) + ); + const entries = await registryStub.listAllInstances(registryKey); + return c.json({ entries, registryKey, migrated: true }); + } catch (err) { + const { message, status } = sanitizeError(err, 'registry-entries'); + return jsonError(message, status); + } +}); + // GET /api/platform/gateway-token?userId=...&instanceId=... // Returns the derived gateway token for a user's sandbox. The Next.js // dashboard calls this so it never needs GATEWAY_TOKEN_SECRET directly. 
diff --git a/src/app/admin/components/KiloclawInstances/KiloclawInstanceDetail.tsx b/src/app/admin/components/KiloclawInstances/KiloclawInstanceDetail.tsx index 95989b2c7..c5d30bbd5 100644 --- a/src/app/admin/components/KiloclawInstances/KiloclawInstanceDetail.tsx +++ b/src/app/admin/components/KiloclawInstances/KiloclawInstanceDetail.tsx @@ -1145,6 +1145,12 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { refetchInterval: awaitingRestartCompletion || awaitingRestoreCompletion ? 3000 : false, }); + const userId = data?.user_id; + const { data: registryData } = useQuery({ + ...trpc.admin.kiloclawInstances.registryEntries.queryOptions({ userId: userId ?? '' }), + enabled: !!userId, + }); + const { mutateAsync: destroyInstance, isPending: isDestroying } = useMutation( trpc.admin.kiloclawInstances.destroy.mutationOptions({ onSuccess: () => { @@ -1620,6 +1626,78 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { + {/* Registry Status */} + {registryData && ( + + + Registry Status + + Registry DO entries for {registryData.registryKey} + + + + {registryData.entries.length === 0 ? ( +

No registry entries found

+ ) : ( +
+ + + + + + + + + + + + {registryData.entries.map(entry => { + const isCurrent = entry.instanceId === data?.id; + const isDestroyed = entry.destroyedAt !== null; + return ( + + + + + + + + ); + })} + +
Instance IDDO KeyCreatedDestroyedStatus
+ {entry.instanceId.slice(0, 8)}... + {isCurrent && ( + + current + + )} + + + {entry.doKey === entry.instanceId + ? 'instanceId' + : entry.doKey.slice(0, 8) + '...'} + + + {new Date(entry.createdAt).toLocaleString()} + + {entry.destroyedAt + ? new Date(entry.destroyedAt).toLocaleString() + : '—'} + + + {isDestroyed ? 'Destroyed' : 'Active'} + +
+
+ )} +
+
+ )} +
@@ -1665,6 +1743,10 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { {data.workerStatus.sandboxId ?? '—'} + + {data.workerStatus.orgId ?? '—'} + +
diff --git a/src/lib/kiloclaw/kiloclaw-internal-client.ts b/src/lib/kiloclaw/kiloclaw-internal-client.ts index 85f040910..881605d98 100644 --- a/src/lib/kiloclaw/kiloclaw-internal-client.ts +++ b/src/lib/kiloclaw/kiloclaw-internal-client.ts @@ -6,6 +6,7 @@ import type { ProvisionInput, PlatformStatusResponse, PlatformDebugStatusResponse, + RegistryEntriesResponse, KiloCodeConfigPatchInput, KiloCodeConfigResponse, ChannelsPatchInput, @@ -222,6 +223,13 @@ export class KiloClawInternalClient { return this.request(`/api/platform/debug-status?${params.toString()}`, undefined, { userId }); } + async getRegistryEntries(userId: string): Promise { + const params = new URLSearchParams({ userId }); + return this.request(`/api/platform/registry-entries?${params.toString()}`, undefined, { + userId, + }); + } + async patchKiloCodeConfig( userId: string, patch: KiloCodeConfigPatchInput, diff --git a/src/lib/kiloclaw/types.ts b/src/lib/kiloclaw/types.ts index d253ead3b..42c7bce19 100644 --- a/src/lib/kiloclaw/types.ts +++ b/src/lib/kiloclaw/types.ts @@ -156,8 +156,22 @@ export type PlatformStatusResponse = { execAsk: string | null; }; +/** Response from GET /api/platform/registry-entries (admin only). */ +export type RegistryEntriesResponse = { + entries: Array<{ + instanceId: string; + doKey: string; + assignedUserId: string; + createdAt: string; + destroyedAt: string | null; + }>; + registryKey: string; + migrated: boolean; +}; + /** Response from GET /api/platform/debug-status (internal/admin only). 
*/ export type PlatformDebugStatusResponse = PlatformStatusResponse & { + orgId: string | null; pendingDestroyMachineId: string | null; pendingDestroyVolumeId: string | null; pendingPostgresMarkOnFinalize: boolean; diff --git a/src/routers/admin-kiloclaw-instances-router.ts b/src/routers/admin-kiloclaw-instances-router.ts index 1213239cd..ccb4af404 100644 --- a/src/routers/admin-kiloclaw-instances-router.ts +++ b/src/routers/admin-kiloclaw-instances-router.ts @@ -213,6 +213,13 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ } satisfies AdminKiloclawInstanceDetail; }), + registryEntries: adminProcedure + .input(z.object({ userId: z.string().min(1) })) + .query(async ({ input }) => { + const client = new KiloClawInternalClient(); + return client.getRegistryEntries(input.userId); + }), + list: adminProcedure.input(ListInstancesSchema).query(async ({ input }) => { const { offset, limit, sortBy, sortOrder, search, status } = input; const searchTerm = search?.trim() || ''; From 6469185c4870742ecfcd4ebcd6222557d1b7fad1 Mon Sep 17 00:00:00 2001 From: syn Date: Tue, 31 Mar 2026 08:47:50 -0500 Subject: [PATCH 15/15] fix(kiloclaw): registry admin endpoint queries org registry, real migration status, detailed logging Three review fixes: 1. Registry admin endpoint now accepts optional orgId param and queries both personal (user:userId) and org (org:orgId) registries. Admin instance detail page passes organization_id. AdminKiloclawInstance type and list/get queries updated to include organization_id. 2. migrated field now sourced from the Registry DO's actual migration state (listAllInstances returns { entries, migrated }) instead of hardcoded true. 3. DO finalization registry cleanup now logs success, legacy-path success, and no-op (already cleaned / never existed) with context for each case. 
--- .../kiloclaw-instance/index.ts | 20 +++++++++- .../src/durable-objects/kiloclaw-registry.ts | 9 +++-- kiloclaw/src/routes/platform.ts | 37 +++++++++++++++---- .../KiloclawInstanceDetail.tsx | 24 ++++++++---- src/lib/kiloclaw/kiloclaw-internal-client.ts | 3 +- src/lib/kiloclaw/types.ts | 11 ++++-- .../admin-kiloclaw-instances-router.ts | 7 +++- 7 files changed, 86 insertions(+), 25 deletions(-) diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts index 0bf3c89b4..39918a84e 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts @@ -1337,12 +1337,30 @@ export class KiloClawInstance extends DurableObject { ); if (registryInstanceId) { await registryStub.destroyInstance(registryKey, registryInstanceId); + console.log('[DO] Registry entry destroyed on finalization:', { + registryKey, + instanceId: registryInstanceId, + }); } else { - // Legacy: find entry by doKey=userId + // Legacy: find active entry by doKey=userId const entries = await registryStub.listInstances(registryKey); const legacyEntry = entries.find(e => e.doKey === preDestroyUserId); if (legacyEntry) { await registryStub.destroyInstance(registryKey, legacyEntry.instanceId); + console.log('[DO] Registry entry destroyed on finalization (legacy):', { + registryKey, + instanceId: legacyEntry.instanceId, + doKey: preDestroyUserId, + }); + } else { + console.log( + '[DO] Registry cleanup: no active entry found (already cleaned or never existed):', + { + registryKey, + doKey: preDestroyUserId, + activeEntryCount: entries.length, + } + ); } } } diff --git a/kiloclaw/src/durable-objects/kiloclaw-registry.ts b/kiloclaw/src/durable-objects/kiloclaw-registry.ts index 8ed2f3d34..6322ffdb2 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-registry.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-registry.ts @@ -98,8 +98,10 @@ export class KiloClawRegistry extends 
DurableObject { .map(rowToEntry); } - /** List all registry entries including destroyed ones (for admin inspection). */ - async listAllInstances(ownerKey: string): Promise { + /** List all registry entries including destroyed ones, plus migration status (admin). */ + async listAllInstances( + ownerKey: string + ): Promise<{ entries: RegistryEntry[]; migrated: boolean }> { await this.ensureOwnerKey(ownerKey); if (!this.migrated) { @@ -110,7 +112,8 @@ export class KiloClawRegistry extends DurableObject { } } - return this.db.select().from(registryInstances).all().map(rowToEntry); + const entries = this.db.select().from(registryInstances).all().map(rowToEntry); + return { entries, migrated: this.migrated }; } async createInstance( diff --git a/kiloclaw/src/routes/platform.ts b/kiloclaw/src/routes/platform.ts index 374f5c6f6..7c52cf592 100644 --- a/kiloclaw/src/routes/platform.ts +++ b/kiloclaw/src/routes/platform.ts @@ -1447,19 +1447,42 @@ platform.get('/debug-status', async c => { } }); -// GET /api/platform/registry-entries?userId=... +// GET /api/platform/registry-entries?userId=...&orgId=... // Returns all registry entries (including destroyed) for admin inspection. +// Queries the personal registry and optionally the org registry. platform.get('/registry-entries', async c => { const userId = setValidatedQueryUserId(c); if (!userId) return c.json({ error: 'userId query parameter is required' }, 400); + const orgId = c.req.query('orgId') ?? 
null; + + const results: Array<{ + registryKey: string; + entries: Array<{ + instanceId: string; + doKey: string; + assignedUserId: string; + createdAt: string; + destroyedAt: string | null; + }>; + migrated: boolean; + }> = []; try { - const registryKey = `user:${userId}`; - const registryStub = c.env.KILOCLAW_REGISTRY.get( - c.env.KILOCLAW_REGISTRY.idFromName(registryKey) - ); - const entries = await registryStub.listAllInstances(registryKey); - return c.json({ entries, registryKey, migrated: true }); + // Always query the personal registry + const userKey = `user:${userId}`; + const userStub = c.env.KILOCLAW_REGISTRY.get(c.env.KILOCLAW_REGISTRY.idFromName(userKey)); + const userResult = await userStub.listAllInstances(userKey); + results.push({ registryKey: userKey, ...userResult }); + + // If orgId is provided, also query the org registry + if (orgId) { + const orgKey = `org:${orgId}`; + const orgStub = c.env.KILOCLAW_REGISTRY.get(c.env.KILOCLAW_REGISTRY.idFromName(orgKey)); + const orgResult = await orgStub.listAllInstances(orgKey); + results.push({ registryKey: orgKey, ...orgResult }); + } + + return c.json({ registries: results }); } catch (err) { const { message, status } = sanitizeError(err, 'registry-entries'); return jsonError(message, status); diff --git a/src/app/admin/components/KiloclawInstances/KiloclawInstanceDetail.tsx b/src/app/admin/components/KiloclawInstances/KiloclawInstanceDetail.tsx index c5d30bbd5..618ef9753 100644 --- a/src/app/admin/components/KiloclawInstances/KiloclawInstanceDetail.tsx +++ b/src/app/admin/components/KiloclawInstances/KiloclawInstanceDetail.tsx @@ -1146,8 +1146,12 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { }); const userId = data?.user_id; + const orgId = data?.organization_id; const { data: registryData } = useQuery({ - ...trpc.admin.kiloclawInstances.registryEntries.queryOptions({ userId: userId ?? 
'' }), + ...trpc.admin.kiloclawInstances.registryEntries.queryOptions({ + userId: userId ?? '', + orgId: orgId ?? undefined, + }), enabled: !!userId, }); @@ -1627,17 +1631,21 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { {/* Registry Status */} - {registryData && ( - + {registryData?.registries.map(registry => ( + Registry Status - Registry DO entries for {registryData.registryKey} + {registry.registryKey} + {' · '} + + {registry.migrated ? 'migrated' : 'pending migration'} + - {registryData.entries.length === 0 ? ( -

No registry entries found

+ {registry.entries.length === 0 ? ( +

No registry entries

) : (
@@ -1651,7 +1659,7 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { - {registryData.entries.map(entry => { + {registry.entries.map(entry => { const isCurrent = entry.instanceId === data?.id; const isDestroyed = entry.destroyedAt !== null; return ( @@ -1696,7 +1704,7 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { )} - )} + ))} diff --git a/src/lib/kiloclaw/kiloclaw-internal-client.ts b/src/lib/kiloclaw/kiloclaw-internal-client.ts index 881605d98..6f27c815a 100644 --- a/src/lib/kiloclaw/kiloclaw-internal-client.ts +++ b/src/lib/kiloclaw/kiloclaw-internal-client.ts @@ -223,8 +223,9 @@ export class KiloClawInternalClient { return this.request(`/api/platform/debug-status?${params.toString()}`, undefined, { userId }); } - async getRegistryEntries(userId: string): Promise { + async getRegistryEntries(userId: string, orgId?: string): Promise { const params = new URLSearchParams({ userId }); + if (orgId) params.set('orgId', orgId); return this.request(`/api/platform/registry-entries?${params.toString()}`, undefined, { userId, }); diff --git a/src/lib/kiloclaw/types.ts b/src/lib/kiloclaw/types.ts index 42c7bce19..79ba8847e 100644 --- a/src/lib/kiloclaw/types.ts +++ b/src/lib/kiloclaw/types.ts @@ -156,8 +156,9 @@ export type PlatformStatusResponse = { execAsk: string | null; }; -/** Response from GET /api/platform/registry-entries (admin only). */ -export type RegistryEntriesResponse = { +/** A single registry DO's entries + migration status. */ +export type RegistryResult = { + registryKey: string; entries: Array<{ instanceId: string; doKey: string; @@ -165,10 +166,14 @@ export type RegistryEntriesResponse = { createdAt: string; destroyedAt: string | null; }>; - registryKey: string; migrated: boolean; }; +/** Response from GET /api/platform/registry-entries (admin only). 
*/ +export type RegistryEntriesResponse = { + registries: RegistryResult[]; +}; + /** Response from GET /api/platform/debug-status (internal/admin only). */ export type PlatformDebugStatusResponse = PlatformStatusResponse & { orgId: string | null; diff --git a/src/routers/admin-kiloclaw-instances-router.ts b/src/routers/admin-kiloclaw-instances-router.ts index ccb4af404..0c8dc7562 100644 --- a/src/routers/admin-kiloclaw-instances-router.ts +++ b/src/routers/admin-kiloclaw-instances-router.ts @@ -143,6 +143,7 @@ export type AdminKiloclawInstance = { id: string; user_id: string; sandbox_id: string; + organization_id: string | null; created_at: string; destroyed_at: string | null; suspended_at: string | null; @@ -180,6 +181,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ id: result.instance.id, user_id: result.instance.user_id, sandbox_id: result.instance.sandbox_id, + organization_id: result.instance.organization_id, created_at: result.instance.created_at, destroyed_at: result.instance.destroyed_at, suspended_at: result.suspended_at ?? null, @@ -214,10 +216,10 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ }), registryEntries: adminProcedure - .input(z.object({ userId: z.string().min(1) })) + .input(z.object({ userId: z.string().min(1), orgId: z.string().optional() })) .query(async ({ input }) => { const client = new KiloClawInternalClient(); - return client.getRegistryEntries(input.userId); + return client.getRegistryEntries(input.userId, input.orgId ?? undefined); }), list: adminProcedure.input(ListInstancesSchema).query(async ({ input }) => { @@ -295,6 +297,7 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ id: row.instance.id, user_id: row.instance.user_id, sandbox_id: row.instance.sandbox_id, + organization_id: row.instance.organization_id, created_at: row.instance.created_at, destroyed_at: row.instance.destroyed_at, suspended_at: row.suspended_at ?? null,