fix(webapp): mutations-layer code-review follow-ups

d-cs · claude · d-cs · commit f4b6064797a2 · 2026-05-26T16:23:50.000+01:00
- metadata route: drop the `as unknown as Parameters<...>` cast on
  the parent/root operations path. Widen `routeOperationsToRun`'s env
  parameter to `AuthenticatedEnvironment` so the service's typed
  signature carries through; the caller always has the full env in
  scope.
- replay route: validate the buffered fallback against a Zod
  `BufferedReplayInputSchema` covering the fields
  `ReplayTaskRunService.call` actually reads (id, friendlyId,
  runtimeEnvironmentId, taskIdentifier, payload, payloadType, queue,
  isTest, traceId, spanId, engine, runTags + nullable
  concurrencyKey/workerQueue/machinePreset/realtimeStreamsVersion).
  A schema failure logs each issue's path and code and 404s rather
  than passing a half-shaped object into the service.
- resetIdempotencyKey: distinguish "PG-empty + buffer-cleared-nothing"
  (genuine 404) from "PG-empty + buffer-unreachable" (partial outage —
  503 with retry hint). The previous behaviour silently returned 404
  on outage, hiding the partial failure and leaving a buffered key
  effectively un-reset. New regression test covers all five branches
  (PG-hit + buffer-throws, PG-empty + buffer-hit, PG-empty +
  buffer-clean-miss, PG-empty + buffer-outage, mollifier-disabled).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts
@@ -5,6 +5,7 @@ import type { RunMetadataChangeOperation } from "@trigger.dev/core/v3/schemas";
 import { UpdateMetadataRequestBody } from "@trigger.dev/core/v3";
 import { z } from "zod";
 import { $replica } from "~/db.server";
+import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
 import { authenticateApiRequest } from "~/services/apiAuth.server";
 import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server";
 import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
@@ -69,20 +70,17 @@ export async function loader({ request, params }: LoaderFunctionArgs) {
 async function routeOperationsToRun(
   targetRunId: string | undefined,
   operations: RunMetadataChangeOperation[] | undefined,
-  env: { id: string; organizationId: string }
+  env: AuthenticatedEnvironment
 ): Promise<void> {
   if (!targetRunId || !operations || operations.length === 0) return;
 
   // Try PG first via the existing service (this is how parent/root
-  // operations have always landed; preserve that).
+  // operations have always landed; preserve that). This function takes
+  // the full AuthenticatedEnvironment so we don't have to reintroduce the
+  // unsafe `as unknown` cast that the previous narrowed
+  // `{ id, organizationId }` signature forced on us.
   const [error] = await tryCatch(
-    updateMetadataService.call(
-      targetRunId,
-      { operations },
-      { id: env.id, organizationId: env.organizationId } as unknown as Parameters<
-        typeof updateMetadataService.call
-      >[2]
-    )
+    updateMetadataService.call(targetRunId, { operations }, env)
   );
   if (!error) return;
 
diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.replay.ts
@@ -14,6 +14,32 @@ const ParamsSchema = z.object({
   runParam: z.string(),
 });
 
+// Subset of TaskRun fields that ReplayTaskRunService.call actually
+// reads from `existingTaskRun`. Validate the buffered fallback against
+// this before casting to TaskRun so a buffer-format drift surfaces as a
+// 404 here rather than as a silent NaN/undefined deep inside
+// replay. The full TaskRun type has many more fields the service never
+// touches; we only assert the ones it reads.
+const BufferedReplayInputSchema = z.object({
+  id: z.string(),
+  friendlyId: z.string(),
+  runtimeEnvironmentId: z.string(),
+  taskIdentifier: z.string(),
+  payload: z.string(),
+  payloadType: z.string(),
+  queue: z.string(),
+  isTest: z.boolean(),
+  traceId: z.string(),
+  spanId: z.string(),
+  engine: z.string(),
+  runTags: z.array(z.string()),
+  // Nullable / optional fields the service tolerates via `??` fallbacks.
+  concurrencyKey: z.string().nullable().optional(),
+  workerQueue: z.string().nullable().optional(),
+  machinePreset: z.string().nullable().optional(),
+  realtimeStreamsVersion: z.string().nullable().optional(),
+});
+
 export async function action({ request, params }: ActionFunctionArgs) {
   // Ensure this is a POST request
   if (request.method.toUpperCase() !== "POST") {
@@ -49,16 +75,28 @@ export async function action({ request, params }: ActionFunctionArgs) {
     if (!taskRun) {
       // Buffered fallback (Q2). The SyntheticRun shape was extended in
       // Phase B4 to carry every field ReplayTaskRunService reads from a
-      // TaskRun. Cast through unknown — the synthesised object has the
-      // same field surface as a real PG row from the service's
-      // perspective.
+      // TaskRun. Validate the subset of fields the service consumes
+      // (BufferedReplayInputSchema above) before casting; a schema
+      // mismatch surfaces as a 404 here rather than as a silent
+      // undefined deep inside the service.
       const buffered = await findRunByIdWithMollifierFallback({
         runId: runParam,
         environmentId: env.id,
         organizationId: env.organizationId,
       });
       if (buffered) {
-        taskRun = buffered as unknown as TaskRun;
+        const parsed = BufferedReplayInputSchema.safeParse(buffered);
+        if (parsed.success) {
+          taskRun = parsed.data as unknown as TaskRun;
+        } else {
+          logger.warn("replay: buffered fallback failed schema validation", {
+            runParam,
+            issues: parsed.error.issues.map((issue) => ({
+              path: issue.path.join("."),
+              code: issue.code,
+            })),
+          });
+        }
       }
     }
 
diff --git a/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts b/apps/webapp/app/v3/services/resetIdempotencyKey.server.ts
@@ -26,6 +26,7 @@ export class ResetIdempotencyKeyService extends BaseService {
     // resetIdempotency clears both the snapshot fields and the Redis
     // lookup atomically. Returns null when nothing was bound there.
     const buffer = getMollifierBuffer();
+    let bufferResetFailed = false;
     const bufferResult = buffer
       ? await buffer
           .resetIdempotency({
@@ -34,8 +35,12 @@ export class ResetIdempotencyKeyService extends BaseService {
             idempotencyKey,
           })
           .catch((err) => {
-            // Buffer outage shouldn't 500 the reset endpoint if PG
-            // already cleared something. Log and treat as a miss.
+            // Don't drop a buffer outage on the floor. We log + flag so
+            // the 404 branch below can distinguish "no record anywhere"
+            // (legitimate not-found) from "PG cleared nothing AND we
+            // couldn't see the buffer" (partial outage — caller should
+            // retry, not be told "doesn't exist").
+            bufferResetFailed = true;
             logger.error("ResetIdempotencyKeyService: buffer reset failed", {
               idempotencyKey,
               taskIdentifier,
@@ -47,6 +52,16 @@ export class ResetIdempotencyKeyService extends BaseService {
 
     const totalCount = pgCount + (bufferResult.clearedRunId ? 1 : 0);
 
+    if (pgCount === 0 && bufferResetFailed) {
+      // PG saw nothing AND the buffer is unreachable. We can't truthfully
+      // say "not found" — there may be a buffered run we can't observe.
+      // Surface as 503 so the caller retries instead of being misled.
+      throw new ServiceValidationError(
+        "Unable to verify buffered idempotency state right now; please retry",
+        503
+      );
+    }
+
     if (totalCount === 0) {
       throw new ServiceValidationError(
         `No runs found with idempotency key: ${idempotencyKey} and task: ${taskIdentifier}`,
diff --git a/apps/webapp/test/mollifierResetIdempotencyKey.test.ts b/apps/webapp/test/mollifierResetIdempotencyKey.test.ts
@@ -0,0 +1,109 @@
+import { describe, expect, it, vi } from "vitest";
+
+// Mock the db module so the BaseService default prisma doesn't try to
+// open a real connection at module load. Each test wires its own
+// prisma stub.
+vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} }));
+// Prevent the runEngine singleton from instantiating and spinning up
+// PG/Redis workers at module load — without this CI fails with
+// unhandled `PrismaClientInitializationError`s even though the
+// assertions all pass (see `mollifierDrainerWorker.test.ts`).
+vi.mock("~/v3/runEngine.server", () => ({ engine: {} }));
+
+// Hoisted mock state so we can swap the buffer per test without
+// re-importing modules.
+const bufferMock: { current: unknown } = { current: null };
+vi.mock("~/v3/mollifier/mollifierBuffer.server", () => ({
+  getMollifierBuffer: () => bufferMock.current,
+}));
+
+import { ResetIdempotencyKeyService } from "~/v3/services/resetIdempotencyKey.server";
+import { ServiceValidationError } from "~/v3/services/baseService.server";
+
+type FakePrisma = {
+  taskRun: { updateMany: (...args: unknown[]) => Promise<{ count: number }> };
+};
+
+function makePrisma(pgCount: number): FakePrisma {
+  return {
+    taskRun: {
+      updateMany: vi.fn(async () => ({ count: pgCount })),
+    },
+  };
+}
+
+const env = {
+  id: "env_a",
+  organizationId: "org_1",
+} as unknown as Parameters<ResetIdempotencyKeyService["call"]>[2];
+
+describe("ResetIdempotencyKeyService — buffer-outage handling", () => {
+  it("returns success when PG cleared >=1 run, even if the buffer reset throws", async () => {
+    bufferMock.current = {
+      resetIdempotency: vi.fn(async () => {
+        throw new Error("ECONNREFUSED");
+      }),
+    };
+    const prisma = makePrisma(1);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    const result = await service.call("ikey", "task", env);
+    expect(result).toEqual({ id: "ikey" });
+  });
+
+  it("returns success when PG cleared nothing but the buffer cleared a run", async () => {
+    bufferMock.current = {
+      resetIdempotency: vi.fn(async () => ({ clearedRunId: "run_x" })),
+    };
+    const prisma = makePrisma(0);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    const result = await service.call("ikey", "task", env);
+    expect(result).toEqual({ id: "ikey" });
+  });
+
+  it("404s when PG and buffer both legitimately report 'nothing to clear'", async () => {
+    bufferMock.current = {
+      resetIdempotency: vi.fn(async () => ({ clearedRunId: null })),
+    };
+    const prisma = makePrisma(0);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    await expect(service.call("ikey", "task", env)).rejects.toMatchObject({
+      status: 404,
+    });
+  });
+
+  // Regression for the silent-not-found hazard CodeRabbit flagged: if PG
+  // sees nothing AND we can't read the buffer (Redis outage), the
+  // previous behaviour was to 404 — masking a partial outage and
+  // leaving a buffered key effectively un-reset while the caller was
+  // told "doesn't exist." We now surface 503 so the caller retries.
+  it("503s when PG cleared nothing AND the buffer reset failed (partial outage)", async () => {
+    bufferMock.current = {
+      resetIdempotency: vi.fn(async () => {
+        throw new Error("ECONNREFUSED");
+      }),
+    };
+    const prisma = makePrisma(0);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    const error = await service.call("ikey", "task", env).then(
+      () => null,
+      (err) => err,
+    );
+    expect(error).toBeInstanceOf(ServiceValidationError);
+    expect(error.status).toBe(503);
+    expect(error.message).toMatch(/retry/i);
+  });
+
+  it("404s normally when buffer is null (mollifier disabled) and PG cleared nothing", async () => {
+    bufferMock.current = null;
+    const prisma = makePrisma(0);
+    const service = new ResetIdempotencyKeyService(prisma as never);
+
+    await expect(service.call("ikey", "task", env)).rejects.toMatchObject({
+      status: 404,
+    });
+  });
+});