fix(webapp,run-engine): close cross-table gaps in the task_run_v2 mixed window

d-cs · d-cs · commit ef54cb979f57 · 2026-06-22T17:42:30.000+01:00
Routes that walk the run hierarchy through a Prisma relation only see one
physical table. During a runTableV2 flag flip a parent and its child can
land on opposite tables (TaskRun vs task_run_v2), and such routes then
silently miss the run in the other table. This closes the reachable cases:

- cancelRun resolves child runs across both tables, so cancelling a parent
  cascades to a child in the other table instead of leaving it executing
  and holding concurrency.
- updateMetadata routes metadata.parent/root operations to the scalar
  parent/root id, so they reach a parent in the other table instead of
  falling back to the child run.
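- a one-time-use token with no idempotency key now takes a cross-table
  claim for v2 orgs, so two presentations straddling a flip cannot each
  mint a run in a different table; the losing presentation is rejected as
  already used. triggerAndWait (resumeParentOnCompletion) triggers are
  excluded and stay on the per-table constraint.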
- the Electric shape merge reports up-to-date only when both tables are
  caught up, so a multi-chunk initial snapshot no longer drops the rows
  that arrive after the first chunk.
diff --git a/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts b/apps/webapp/app/runEngine/concerns/idempotencyKeys.server.ts
@@ -215,6 +215,73 @@ export class IdempotencyKeyConcern {
       new Date(Date.now() + 24 * 60 * 60 * 1000 * 30); // 30 days
 
     if (!idempotencyKey) {
+      // A one-time-use token with NO idempotency key would otherwise skip the
+      // claim path below entirely. During a `runTableV2` flag flip, two
+      // concurrent presentations of the same token can mint into DIFFERENT
+      // physical tables (cuid -> TaskRun, ksuid -> task_run_v2); the per-table
+      // unique constraint on `oneTimeUseToken` can't see across the two tables,
+      // so neither INSERT raises P2002 and one token spawns two runs. For
+      // v2-cutover orgs, serialise on the token via a Redis claim so the first
+      // presentation wins and the rest resolve to it. Excludes
+      // resumeParentOnCompletion (triggerAndWait) to match the buffer
+      // fallback's handling — a one-time PUBLIC_JWT token is a fire-and-forget
+      // public trigger, not a parent/child wait, so that case is left to the
+      // per-table constraint.
+      const oneTimeUseToken = request.options?.oneTimeUseToken;
+      if (oneTimeUseToken && !request.body.options?.resumeParentOnCompletion) {
+        const orgFeatureFlags =
+          (request.environment.organization?.featureFlags as
+            | Record<string, unknown>
+            | null
+            | undefined) ?? null;
+        if (shouldUseV2RunTable(orgFeatureFlags)) {
+          // Namespace the claim key so a token can never collide with a real
+          // idempotency key in the same (envId, taskIdentifier) slot. The TTL is
+          // a fixed pipeline-dwell bound, NOT the customer idempotencyKeyTTL:
+          // there is no idempotency key in this path, so a client-supplied TTL
+          // has no meaning here, and a tiny value would expire the claim
+          // mid-flight and reopen the cross-table dup window.
+          const claimKey = `otu:${oneTimeUseToken}`;
+          const outcome = await claimOrAwait({
+            envId: request.environment.id,
+            taskIdentifier: request.taskId,
+            idempotencyKey: claimKey,
+            ttlSeconds: env.TRIGGER_MOLLIFIER_CLAIM_TTL_SECONDS,
+            safetyNetMs: env.TRIGGER_MOLLIFIER_CLAIM_WAIT_MS,
+            pollStepMs: env.TRIGGER_MOLLIFIER_CLAIM_POLL_MS,
+          });
+          if (outcome.kind === "resolved") {
+            // A concurrent presentation of the same one-time token already won
+            // and committed a run. Reject this one exactly as the within-table
+            // path does (the per-table oneTimeUseToken unique constraint raises
+            // P2002 -> RunOneTimeUseTokenError -> this same 4xx), preserving the
+            // "token already used" contract while closing the cross-table gap.
+            throw new ServiceValidationError(
+              `Cannot trigger ${request.taskId} with a one-time use token as it has already been used.`
+            );
+          } else if (outcome.kind === "timed_out") {
+            throw new ServiceValidationError(
+              "One-time-use token claim resolution timed out",
+              503
+            );
+          } else if (outcome.kind === "claimed") {
+            // We own the claim. The trigger pipeline MUST publish (on success)
+            // or release (on error) it — wired through the returned `claim`,
+            // exactly like the idempotency-keyed path.
+            return {
+              isCached: false,
+              idempotencyKey,
+              idempotencyKeyExpiresAt,
+              claim: {
+                envId: request.environment.id,
+                taskIdentifier: request.taskId,
+                idempotencyKey: claimKey,
+                token: outcome.token,
+              },
+            };
+          }
+        }
+      }
       return { isCached: false, idempotencyKey, idempotencyKeyExpiresAt };
     }
 
@@ -329,17 +396,22 @@ export class IdempotencyKeyConcern {
         | Record<string, unknown>
         | null
         | undefined) ?? null;
-    // v2-cutover orgs: ANY idempotency-keyed trigger can straddle a
-    // `runTableV2` flag flip into different physical tables (cuid -> TaskRun,
-    // ksuid -> task_run_v2), so the claim must serialise all of them —
-    // including triggerAndWait (resumeParentOnCompletion), debounce, and
-    // oneTimeUseToken, whose per-table unique constraints (idempotencyKey,
-    // oneTimeUseToken) can't see across the two tables. The
+    // v2-cutover orgs: an idempotency-keyed trigger can straddle a `runTableV2`
+    // flag flip into different physical tables (cuid -> TaskRun, ksuid ->
+    // task_run_v2), and the per-table idempotency-key unique constraints can't
+    // see across the two tables, so this claim (keyed on the idempotency key)
+    // is the only backstop that serialises same-key triggers across the flip,
+    // including triggerAndWait (resumeParentOnCompletion) and debounce. The
     // resumeParentOnCompletion/debounce/oneTimeUseToken exclusions below are
     // mollifier-gate alignment optimisations (those requests always return
-    // pass_through from the gate, so there's no buffer to serialise against)
-    // and don't apply to the cross-table concern. shouldUseV2RunTable is
-    // checked first so a v2 org skips the mollifier-flag resolve entirely.
+    // pass_through from the gate, so there's no buffer to serialise against);
+    // they don't apply to v2 orgs, which short-circuit to claimEligible via
+    // shouldUseV2RunTable regardless. oneTimeUseToken triggers with NO
+    // idempotency key are serialised separately by the token claim in the
+    // early-return block above; the residual same-token-with-two-different-keys
+    // case is not covered here (each key claims its own slot) and would require
+    // a pathological client. shouldUseV2RunTable is checked first so a v2 org
+    // skips the mollifier-flag resolve entirely.
     const claimEligible =
       shouldUseV2RunTable(orgFeatureFlags) ||
       (!request.body.options?.resumeParentOnCompletion &&
diff --git a/apps/webapp/app/services/metadata/updateMetadata.server.ts b/apps/webapp/app/services/metadata/updateMetadata.server.ts
@@ -354,18 +354,14 @@ export class UpdateMetadataService {
           metadata: true,
           metadataType: true,
           metadataVersion: true,
-          parentTaskRun: {
-            select: {
-              id: true,
-              status: true,
-            },
-          },
-          rootTaskRun: {
-            select: {
-              id: true,
-              status: true,
-            },
-          },
+          // Scalar parent/root pointers, NOT the parentTaskRun/rootTaskRun
+          // relations: a relation select is bound to one physical run table and
+          // resolves to null when the parent/root lives in the other table (a
+          // v2 child of a legacy parent in the mixed window). The scalar id is
+          // table-agnostic, and #ingestRunOperations only needs the id — the
+          // flusher routes by id format across both tables.
+          parentTaskRunId: true,
+          rootTaskRunId: true,
         },
       },
       this._prisma
@@ -380,11 +376,11 @@ export class UpdateMetadataService {
     }
 
     if (body.parentOperations && body.parentOperations.length > 0) {
-      this.#ingestRunOperations(taskRun.parentTaskRun?.id ?? taskRun.id, body.parentOperations);
+      this.#ingestRunOperations(taskRun.parentTaskRunId ?? taskRun.id, body.parentOperations);
     }
 
     if (body.rootOperations && body.rootOperations.length > 0) {
-      this.#ingestRunOperations(taskRun.rootTaskRun?.id ?? taskRun.id, body.rootOperations);
+      this.#ingestRunOperations(taskRun.rootTaskRunId ?? taskRun.id, body.rootOperations);
     }
 
     const result = await this.#updateRunMetadata({
diff --git a/apps/webapp/app/services/realtime/electricShapeMerge.server.ts b/apps/webapp/app/services/realtime/electricShapeMerge.server.ts
@@ -51,6 +51,16 @@ export type MergedShape =
       offset: string;
       cursor?: string;
       schema?: string;
+      /**
+       * The composite is up-to-date only when BOTH shapes are. An Electric
+       * snapshot can span multiple chunks: every chunk but the last omits the
+       * `up-to-date` control message. If one table's snapshot is still mid-fetch
+       * (chunk 1 of N) while the other has completed, the merged response must
+       * NOT terminate with `up-to-date` — otherwise the client believes the
+       * whole snapshot is done, flips to live, and never fetches the remaining
+       * chunks (silently dropping that table's overflow rows).
+       */
+      upToDate: boolean;
     };
 
 /**
@@ -146,6 +156,12 @@ export function mergeParsedShapes(
     offset: encodeComposite(a.offset ?? prior.offsetA, b.offset ?? prior.offsetB),
     cursor,
     schema: a.schema ?? b.schema,
+    // Only terminate the composite when BOTH shapes have caught up; an
+    // un-up-to-date shape (a snapshot chunk that isn't the last) keeps the
+    // client requesting the remainder. unpolledShape() reports upToDate:true,
+    // so a live round that returns changes from one shape and carries the
+    // other forward still terminates iff the polled shape is itself up-to-date.
+    upToDate: a.upToDate && b.upToDate,
   };
 }
 
diff --git a/apps/webapp/app/services/realtimeClient.server.ts b/apps/webapp/app/services/realtimeClient.server.ts
@@ -573,7 +573,13 @@ export class RealtimeClient {
       responseHeaders.set("electric-schema", merged.schema);
     }
 
-    const body = JSON.stringify([...merged.changes, UP_TO_DATE_MESSAGE]);
+    // Only append the up-to-date terminator when BOTH upstream shapes are
+    // caught up. If one table's snapshot is still spanning chunks, omitting the
+    // terminator keeps the client in snapshot mode fetching the rest instead of
+    // prematurely flipping to live and dropping that table's remaining rows.
+    const body = JSON.stringify(
+      merged.upToDate ? [...merged.changes, UP_TO_DATE_MESSAGE] : [...merged.changes]
+    );
     const finalBody =
       apiVersion === CURRENT_API_VERSION ? body : this.#rewriteResponseBodyForNoneApiVersion(body);
     return new Response(finalBody, { status: 200, headers: responseHeaders });
diff --git a/apps/webapp/test/electricShapeMerge.test.ts b/apps/webapp/test/electricShapeMerge.test.ts
@@ -198,4 +198,53 @@ describe("mergeParsedShapes", () => {
     if (merged.mustRefetch) throw new Error("unexpected refetch");
     expect(merged.schema).toBe('{"id":{"type":"text"}}');
   });
+
+  it("is up-to-date only when BOTH shapes are caught up (multi-chunk snapshot guard)", () => {
+    // Baseline: both shapes caught up -> the composite terminates with up-to-date.
+    const both = mergeParsedShapes(shape({ upToDate: true }), shape({ upToDate: true }), PRIOR);
+    if (both.mustRefetch) throw new Error("unexpected refetch"); // upToDate is only read on the non-refetch result
+    expect(both.upToDate).toBe(true);
+
+    // Table A is mid-snapshot (chunk 1 of N: rows but no up-to-date control
+    // message); B has completed. The composite must NOT be up-to-date — else
+    // the client flips to live after chunk 1 and silently drops A's remaining
+    // rows. The rows seen so far still flow through (asserted via `changes`).
+    const aMidSnapshot = mergeParsedShapes(
+      shape({ changes: [INSERT], upToDate: false, handle: "hA", offset: "oA" }),
+      shape({ upToDate: true, handle: "hB", offset: "oB" }),
+      PRIOR
+    );
+    if (aMidSnapshot.mustRefetch) throw new Error("unexpected refetch");
+    expect(aMidSnapshot.upToDate).toBe(false);
+    expect(aMidSnapshot.changes).toEqual([INSERT]);
+
+    // Symmetric case: A has completed while B is mid-snapshot -> still not up-to-date.
+    const bMidSnapshot = mergeParsedShapes(
+      shape({ upToDate: true }),
+      shape({ changes: [UPDATE], upToDate: false }),
+      PRIOR
+    );
+    if (bMidSnapshot.mustRefetch) throw new Error("unexpected refetch");
+    expect(bMidSnapshot.upToDate).toBe(false);
+  });
+
+  it("a live round carrying the un-polled sibling terminates only when the polled shape is caught up", () => {
+    // unpolledShape reports upToDate:true, so `a.upToDate && b.upToDate` reduces
+    // to the polled shape's own flag: the composite terminates iff A is caught up.
+    const caughtUp = mergeParsedShapes(
+      shape({ changes: [INSERT], upToDate: true }),
+      unpolledShape("b", PRIOR), // presumably: B was not polled this round and is carried forward from PRIOR
+      PRIOR
+    );
+    if (caughtUp.mustRefetch) throw new Error("unexpected refetch"); // upToDate is only read on the non-refetch result
+    expect(caughtUp.upToDate).toBe(true);
+
+    const moreComing = mergeParsedShapes(
+      shape({ changes: [INSERT], upToDate: false }), // polled shape has not reported up-to-date yet
+      unpolledShape("b", PRIOR),
+      PRIOR
+    );
+    if (moreComing.mustRefetch) throw new Error("unexpected refetch");
+    expect(moreComing.upToDate).toBe(false);
+  });
 });
diff --git a/apps/webapp/test/oneTimeUseTokenClaim.test.ts b/apps/webapp/test/oneTimeUseTokenClaim.test.ts
@@ -0,0 +1,153 @@
+import { describe, expect, it, vi } from "vitest";
+
+// Stub `~/db.server` before importing the concern — the real module eagerly
+// calls `prisma.$connect()` at singleton construction. The concern under test
+// receives its prisma via the constructor, and the one-time-token path below
+// reaches the claim before any DB read, so the stub is never exercised.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+
+// claimOrAwait resolves its backend through getIdempotencyClaimBuffer; script
+// it via a hoisted handle so each test controls the claim outcome.
+const h = vi.hoisted(() => ({ buffer: null as unknown }));
+vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({
+  getMollifierBuffer: () => h.buffer,
+  getIdempotencyClaimBuffer: () => h.buffer,
+}));
+// The one-time-token claim runs BEFORE the mollifier-flag resolve, but the
+// concern still imports the gate module; stub it so loading doesn't pull in
+// extra feature-flag wiring.
+vi.mock("~/v3/mollifier/mollifierGate.server", () => ({
+  makeResolveMollifierFlag: () => async () => false,
+}));
+
+import type { MollifierBuffer } from "@trigger.dev/redis-worker";
+import { IdempotencyKeyConcern } from "~/runEngine/concerns/idempotencyKeys.server";
+import type { TriggerTaskRequest } from "~/runEngine/types";
+
+function makeConcern() {
+  // Stand-in Prisma client: findFirst on either run table finds nothing.
+  const prismaStub = {
+    taskRun: { findFirst: async () => null },
+    taskRunV2: { findFirst: async () => null },
+  } as never;
+  const unusedEngine = {} as never; // engine — unused on this path
+  const unusedTraceEvents = {} as never; // traceEventConcern — unused on this path
+  return new IdempotencyKeyConcern(prismaStub, unusedEngine, unusedTraceEvents);
+}
+
+function makeOtuRequest(
+  overrides: {
+    featureFlags?: Record<string, unknown>;
+    oneTimeUseToken?: string | undefined;
+    resumeParentOnCompletion?: boolean;
+  } = {}
+): TriggerTaskRequest {
+  // `in` rather than `??`: an explicit `oneTimeUseToken: undefined` override
+  // must produce a token-less request instead of falling back to "tok-1".
+  const token = "oneTimeUseToken" in overrides ? overrides.oneTimeUseToken : "tok-1";
+  const featureFlags = overrides.featureFlags ?? { runTableV2: true };
+  const bodyOptions = overrides.resumeParentOnCompletion ? { resumeParentOnCompletion: true } : {};
+
+  // The request deliberately carries no idempotencyKey — this is the path the
+  // per-table oneTimeUseToken unique constraint cannot cover across two tables.
+  return {
+    taskId: "my-task",
+    environment: { id: "env_a", organizationId: "org_1", organization: { featureFlags } },
+    options: { oneTimeUseToken: token },
+    body: { options: bodyOptions },
+  } as unknown as TriggerTaskRequest;
+}
+
+describe("IdempotencyKeyConcern · one-time-use token cross-table claim", () => {
+  it("v2 org: a one-time token with no idempotency key takes a claim keyed on the token", async () => {
+    const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const }));
+    h.buffer = {
+      claimIdempotency,
+      readClaim: vi.fn(async () => null),
+    } as unknown as MollifierBuffer;
+
+    const result = await makeConcern().handleTriggerRequest(makeOtuRequest(), undefined);
+
+    expect(result.isCached).toBe(false);
+    if (result.isCached === false) {
+      // The trigger pipeline must publish/release this claim. It is keyed on the
+      // "otu:"-namespaced token, so it can never collide with a real idempotency key.
+      expect(result.claim?.idempotencyKey).toBe("otu:tok-1");
+      expect(result.claim?.envId).toBe("env_a");
+      expect(result.claim?.taskIdentifier).toBe("my-task");
+    }
+    expect(claimIdempotency).toHaveBeenCalledTimes(1);
+    expect(claimIdempotency.mock.calls[0][0]).toMatchObject({ idempotencyKey: "otu:tok-1" }); // first argument of the first call
+  });
+
+  it("v2 org: a concurrent winner (claim resolved) rejects the second presentation as already-used", async () => {
+    // The winner committed a run under the token; the loser must be rejected
+    // exactly like the within-table P2002 path, NOT allowed to mint a duplicate
+    // into the other table.
+    h.buffer = {
+      claimIdempotency: vi.fn(async () => ({ kind: "resolved", runId: "run_winner" })),
+      readClaim: vi.fn(async () => null),
+    } as unknown as MollifierBuffer;
+
+    await expect(
+      makeConcern().handleTriggerRequest(makeOtuRequest(), undefined)
+    ).rejects.toThrow(/already been used/i);
+  });
+
+  it("non-v2 org: skips the token claim entirely (no Redis round-trip)", async () => {
+    const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const }));
+    h.buffer = {
+      claimIdempotency,
+      readClaim: vi.fn(async () => null),
+    } as unknown as MollifierBuffer;
+
+    const result = await makeConcern().handleTriggerRequest(
+      makeOtuRequest({ featureFlags: { mollifierEnabled: true } }), // runTableV2 flag absent
+      undefined
+    );
+
+    expect(result.isCached).toBe(false);
+    if (result.isCached === false) {
+      expect(result.claim).toBeUndefined(); // no claim for the pipeline to publish/release
+    }
+    expect(claimIdempotency).not.toHaveBeenCalled();
+  });
+
+  it("triggerAndWait one-time token: left to the per-table constraint (not claimed here)", async () => {
+    const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const }));
+    h.buffer = {
+      claimIdempotency,
+      readClaim: vi.fn(async () => null),
+    } as unknown as MollifierBuffer;
+
+    const result = await makeConcern().handleTriggerRequest(
+      makeOtuRequest({ resumeParentOnCompletion: true }), // still a v2 org (default flags)
+      undefined
+    );
+
+    expect(result.isCached).toBe(false);
+    if (result.isCached === false) {
+      expect(result.claim).toBeUndefined();
+    }
+    expect(claimIdempotency).not.toHaveBeenCalled();
+  });
+
+  it("no one-time token: ordinary no-idempotency-key trigger is unaffected", async () => {
+    const claimIdempotency = vi.fn(async () => ({ kind: "claimed" as const }));
+    h.buffer = {
+      claimIdempotency,
+      readClaim: vi.fn(async () => null),
+    } as unknown as MollifierBuffer;
+
+    const result = await makeConcern().handleTriggerRequest(
+      makeOtuRequest({ oneTimeUseToken: undefined }), // explicit undefined replaces the default "tok-1"
+      undefined
+    );
+
+    expect(result.isCached).toBe(false);
+    if (result.isCached === false) {
+      expect(result.claim).toBeUndefined();
+    }
+    expect(claimIdempotency).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/webapp/test/updateMetadata.test.ts b/apps/webapp/test/updateMetadata.test.ts
diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts