fix(webapp): mollifier read-fallback auth/retry parity + batch reconstruction

d-cs · claude · d-cs · commit 188b8c7c8e42 · 2026-05-28T10:47:48.000+01:00
Addresses the higher-confidence read-fallback review findings:

- attempts GET loader: rebuilt on createLoaderApiRoute so it matches the
  sibling read routes — accepts JWTs with run/task/tag/batch resource
  scoping (was bare authenticateApiRequest, rejecting PUBLIC_JWT and doing
  no scope check), and 404s with `x-should-retry: true` so SDK pollers keep
  retrying a not-yet-materialised run instead of giving up.
- batch reconstruction: the snapshot embeds the batch as `{ id, index }`
  (engine.trigger shape), but readFallback read a non-existent flat
  `batchId`, so SyntheticRun.batchId was always undefined. Read it from
  `snapshot.batch.id` (the internal cuid). synthesiseFoundRunFromBuffer now
  populates `batch` from it, and the spans/trace buffer-path authorization
  pushes the batch resource — so batch-scoped JWTs authorise against
  buffered runs and the retrieve response reports the correct batchId.
- metadata: coerce a non-string buffered metadata defensively (JSON
  stringify + warn) instead of silently dropping to null, mirroring
  synthesisePayload. In practice metadata is always a string, so this is a
  no-op guard, but it surfaces format drift to ops.
- tests: cover batchId extraction from the nested batch object and its
  absence for non-batched runs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiRetrieveRunPresenter.server.ts
@@ -9,6 +9,7 @@ import {
   logger,
 } from "@trigger.dev/core/v3";
 import { parsePacketAsJson } from "@trigger.dev/core/v3/utils/ioSerialization";
+import { BatchId } from "@trigger.dev/core/v3/isomorphic";
 import { getUserProvidedIdempotencyKey } from "@trigger.dev/core/v3/serverOnly";
 import { Prisma, TaskRunAttemptStatus, TaskRunStatus } from "@trigger.dev/database";
 import assertNever from "assert-never";
@@ -560,6 +561,32 @@ function synthesisePayload(buffered: SyntheticRun): string {
   }
 }
 
+// Mirror synthesisePayload for metadata. The PG path stores `TaskRun.metadata`
+// as `String?`, and the snapshot writes it from `metadataPacket.data` (also a
+// string), so in production it is always a string or absent. We coerce
+// defensively: a JSON-serialisable value is stringified (matching how the
+// trigger path serialises it) with a warning that surfaces format drift to
+// ops; anything unserialisable is logged as an error and mapped to null.
+function synthesiseMetadata(buffered: SyntheticRun): string | null {
+  const metadata = buffered.metadata;
+  if (typeof metadata === "string") return metadata;
+  if (metadata === undefined || metadata === null) return null;
+  const logFields = { runFriendlyId: buffered.friendlyId, metadataType: typeof metadata };
+  // JSON.stringify returns undefined for functions/symbols and throws on cycles/BigInt.
+  let serialised: unknown;
+  try {
+    serialised = JSON.stringify(metadata);
+  } catch {
+    serialised = undefined;
+  }
+  if (typeof serialised !== "string") {
+    logger.error("ApiRetrieveRunPresenter: buffered snapshot.metadata unserialisable", logFields);
+    return null;
+  }
+  logger.warn("ApiRetrieveRunPresenter: buffered snapshot.metadata non-string coerced", logFields);
+  return serialised;
+}
+
 function synthesiseFoundRunFromBuffer(buffered: SyntheticRun): FoundRun {
   const status: TaskRunStatus = bufferedStatusToTaskRunStatus(buffered.status);
 
@@ -570,8 +597,7 @@ function synthesiseFoundRunFromBuffer(buffered: SyntheticRun): FoundRun {
       }
     : null;
 
-  const metadata: Prisma.JsonValue =
-    typeof buffered.metadata === "string" ? buffered.metadata : null;
+  const metadata: string | null = synthesiseMetadata(buffered);
 
   return {
     // `id` is the internal cuid (Prisma TaskRun.id column), `friendlyId`
@@ -603,7 +629,13 @@ function synthesiseFoundRunFromBuffer(buffered: SyntheticRun): FoundRun {
     scheduleId: null,
     lockedToVersion: buffered.lockedToVersion ? { version: buffered.lockedToVersion } : null,
     resumeParentOnCompletion: buffered.resumeParentOnCompletion,
-    batch: null,
+    // Reconstruct the batch from the snapshot's internal id so a buffered
+    // run reports the same `batchId` / triggerFunction as it will once
+    // materialised, and so batch-scoped JWTs authorise against it (the
+    // route authorization callbacks read `run.batch?.friendlyId`).
+    batch: buffered.batchId
+      ? { id: buffered.batchId, friendlyId: BatchId.toFriendlyId(buffered.batchId) }
+      : null,
     runTags: buffered.tags,
     traceId: buffered.traceId ?? "",
     payload: synthesisePayload(buffered),
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts b/apps/webapp/app/routes/api.v1.runs.$runId.spans.$spanId.ts
@@ -72,6 +72,9 @@ export const loader = createLoaderApiRoute(
           ...(run.taskIdentifier ? [{ type: "tasks", id: run.taskIdentifier }] : []),
           ...run.tags.map((tag) => ({ type: "tags", id: tag })),
         ];
+        if (run.batchId) {
+          resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) });
+        }
         return anyResource(resources);
       },
     },
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts b/apps/webapp/app/routes/api.v1.runs.$runId.trace.ts
@@ -70,6 +70,9 @@ export const loader = createLoaderApiRoute(
           ...(run.taskIdentifier ? [{ type: "tasks", id: run.taskIdentifier }] : []),
           ...run.tags.map((tag) => ({ type: "tags", id: tag })),
         ];
+        if (run.batchId) {
+          resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) });
+        }
         return anyResource(resources);
       },
     },
diff --git a/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts b/apps/webapp/app/routes/api.v1.runs.$runParam.attempts.ts
@@ -1,9 +1,14 @@
-import type { ActionFunctionArgs, LoaderFunctionArgs } from "@remix-run/server-runtime";
+import type { ActionFunctionArgs } from "@remix-run/server-runtime";
 import { json } from "@remix-run/server-runtime";
+import { BatchId } from "@trigger.dev/core/v3/isomorphic";
 import { z } from "zod";
 import { $replica } from "~/db.server";
 import { authenticateApiRequest } from "~/services/apiAuth.server";
 import { logger } from "~/services/logger.server";
+import {
+  anyResource,
+  createLoaderApiRoute,
+} from "~/services/routeBuilders/apiBuilder.server";
 import { findRunByIdWithMollifierFallback } from "~/v3/mollifier/readFallback.server";
 import { ServiceValidationError } from "~/v3/services/baseService.server";
 import { CreateTaskRunAttemptService } from "~/v3/services/createTaskRunAttempt.server";
@@ -23,44 +28,76 @@ const ParamsSchema = z.object({
 // attempt list belongs on the v3 retrieve endpoint, not here — this is
 // the dual of the POST that creates attempts, and the empty-list shape
 // gives the parity script a stable contract to assert against.
-export async function loader({ request, params }: LoaderFunctionArgs) {
-  const authenticationResult = await authenticateApiRequest(request);
-  if (!authenticationResult) {
-    return json({ error: "Invalid or Missing API Key" }, { status: 401 });
-  }
+//
+// Built with createLoaderApiRoute so it matches the sibling read routes
+// (spans, trace, retrieve): it accepts JWTs (`allowJWT`) with the same
+// run/task/tag/batch resource scoping, and a not-found run returns 404
+// with `x-should-retry: true` (`shouldRetryNotFound`) so SDK pollers keep
+// retrying a run that the drainer hasn't materialised yet. PG-first then
+// buffer fallback; both lookups are keyed on the caller's environment, so a
+// third party can't distinguish "exists" from "doesn't exist" cross-environment.
+type PgRun = NonNullable<Awaited<ReturnType<typeof findPgRun>>>;
+type BufferedRun = NonNullable<Awaited<ReturnType<typeof findRunByIdWithMollifierFallback>>>;
+type ResolvedRun = { source: "pg"; run: PgRun } | { source: "buffer"; run: BufferedRun };
 
-  const parsed = ParamsSchema.safeParse(params);
-  if (!parsed.success) {
-    return json({ error: "Invalid or missing run ID" }, { status: 400 });
-  }
+async function findPgRun(runId: string, environmentId: string) {
+  return $replica.taskRun.findFirst({
+    where: { friendlyId: runId, runtimeEnvironmentId: environmentId },
+    select: { friendlyId: true, taskIdentifier: true, runTags: true, batchId: true },
+  });
+}
 
-  const { runParam } = parsed.data;
-  const env = authenticationResult.environment;
+export const loader = createLoaderApiRoute(
+  {
+    params: ParamsSchema,
+    allowJWT: true,
+    corsStrategy: "all",
+    findResource: async (params, auth): Promise<ResolvedRun | null> => {
+      const pgRun = await findPgRun(params.runParam, auth.environment.id);
+      if (pgRun) return { source: "pg", run: pgRun };
 
-  // Verify the run belongs to the authenticated environment before
-  // returning the parity-empty list. The response body is empty either
-  // way, but other run-scoped endpoints (spans, trace, retrieve) all
-  // 404 on cross-env access; matching that here means a third party
-  // can't distinguish "run exists" from "doesn't exist" via this
-  // endpoint either. PG-first then buffer fallback, consistent with
-  // the other read paths.
-  const pgRun = await $replica.taskRun.findFirst({
-    where: { friendlyId: runParam, runtimeEnvironmentId: env.id },
-    select: { id: true },
-  });
-  if (!pgRun) {
-    const buffered = await findRunByIdWithMollifierFallback({
-      runId: runParam,
-      environmentId: env.id,
-      organizationId: env.organizationId,
-    });
-    if (!buffered) {
-      return json({ error: "Run not found" }, { status: 404 });
-    }
-  }
+      const buffered = await findRunByIdWithMollifierFallback({
+        runId: params.runParam,
+        environmentId: auth.environment.id,
+        organizationId: auth.environment.organizationId,
+      });
+      if (buffered) return { source: "buffer", run: buffered };
 
-  return json({ attempts: [] }, { status: 200 });
-}
+      return null;
+    },
+    shouldRetryNotFound: true,
+    authorization: {
+      action: "read",
+      resource: (resolved) => {
+        if (resolved.source === "pg") {
+          const run = resolved.run;
+          const resources = [
+            { type: "runs", id: run.friendlyId },
+            { type: "tasks", id: run.taskIdentifier },
+            ...run.runTags.map((tag) => ({ type: "tags", id: tag })),
+          ];
+          if (run.batchId) {
+            resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) });
+          }
+          return anyResource(resources);
+        }
+        const run = resolved.run;
+        const resources = [
+          { type: "runs", id: run.friendlyId },
+          ...(run.taskIdentifier ? [{ type: "tasks", id: run.taskIdentifier }] : []),
+          ...run.tags.map((tag) => ({ type: "tags", id: tag })),
+        ];
+        if (run.batchId) {
+          resources.push({ type: "batch", id: BatchId.toFriendlyId(run.batchId) });
+        }
+        return anyResource(resources);
+      },
+    },
+  },
+  async () => {
+    return json({ attempts: [] }, { status: 200 });
+  }
+);
 
 export async function action({ request, params }: ActionFunctionArgs) {
   // Authenticate the request
diff --git a/apps/webapp/app/v3/mollifier/readFallback.server.ts b/apps/webapp/app/v3/mollifier/readFallback.server.ts
@@ -201,7 +201,12 @@ export async function findRunByIdWithMollifierFallback(
       annotations: snapshot.annotations,
       traceContext: snapshot.traceContext,
       scheduleId: asString(snapshot.scheduleId),
-      batchId: asString(snapshot.batchId),
+      // The engine.trigger input embeds the batch as `{ id, index }`
+      // (see triggerTask.server.ts #buildEngineTriggerInput), not as a
+      // flat `batchId`. `id` is the batch's internal cuid — the same value
+      // PG stores in `TaskRun.batchId` — so callers reconstruct the
+      // friendly id via `BatchId.toFriendlyId` exactly as the PG path does.
+      batchId: asString((snapshot.batch as { id?: unknown } | undefined)?.id),
       parentTaskRunFriendlyId: asString(snapshot.parentTaskRunFriendlyId),
       rootTaskRunFriendlyId: asString(snapshot.rootTaskRunFriendlyId),
 
diff --git a/apps/webapp/test/mollifierReadFallback.test.ts b/apps/webapp/test/mollifierReadFallback.test.ts
@@ -259,6 +259,45 @@ describe("findRunByIdWithMollifierFallback", () => {
     expect(result!.runTags).toEqual(["t1", "t2"]);
   });
 
+  it("extracts batchId from the snapshot's nested batch object (engine.trigger shape)", async () => {
+    // The engine.trigger input nests the batch as `{ id, index }`, where
+    // `id` is the batch's internal cuid; the snapshot carries no flat
+    // `batchId` field.
+    const snapshot = { taskIdentifier: "t", batch: { id: "batch_internal_cuid", index: 3 } };
+    const batchedEntry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify(snapshot),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const lookup = { runId: "run_1", environmentId: "env_a", organizationId: "org_1" };
+    const deps = { getBuffer: () => fakeBuffer(batchedEntry) };
+
+    const synthetic = await findRunByIdWithMollifierFallback(lookup, deps);
+
+    expect(synthetic!.batchId).toBe("batch_internal_cuid");
+  });
+
+  it("leaves batchId undefined when the snapshot has no batch (non-batched run)", async () => {
+    // The snapshot has no `batch` key at all, which is the non-batched case.
+    const unbatchedEntry: BufferEntry = {
+      runId: "run_1",
+      envId: "env_a",
+      orgId: "org_1",
+      payload: JSON.stringify({ taskIdentifier: "t" }),
+      status: "QUEUED",
+      attempts: 0,
+      createdAt: NOW,
+    };
+    const lookup = { runId: "run_1", environmentId: "env_a", organizationId: "org_1" };
+    const deps = { getBuffer: () => fakeBuffer(unbatchedEntry) };
+    const synthetic = await findRunByIdWithMollifierFallback(lookup, deps);
+    expect(synthetic!.batchId).toBeUndefined();
+  });
+
   it("treats invalid date strings as undefined and does not mis-classify status as CANCELED", async () => {
     const entry: BufferEntry = {
       runId: "run_1",