fix(webapp,redis-worker): enforce MAX_TAGS on buffered runs + surface metadata fallback errors

d-cs · claude · d-cs · commit 0f7365d015dc · 2026-05-28T10:40:55.000+01:00
The tags API skipped MAX_TAGS_PER_RUN enforcement on the buffered path,
letting a buffered run exceed the cap the trigger validator applies at
creation. Enforce it atomically in the mutateSnapshot Lua: append_tags
now accepts an optional maxTags and returns "limit_exceeded" (writing
nothing) when the deduped count would overflow. mutateWithFallback gains
a symmetric rejectedResponse builder + a "rejected" outcome; the tags
route returns 422, matching the PG path.

Also stop silently swallowing PG failures in the metadata route's
parent/root op fan-out: warn (with targetRunId + error) before the
best-effort buffer fallback so a genuine PG outage is observable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/.changeset/mollifier-tag-cap.md b/.changeset/mollifier-tag-cap.md
@@ -0,0 +1,5 @@
+---
+"@trigger.dev/redis-worker": patch
+---
+
+Mollifier `mutateSnapshot` now enforces a tag cap: an `append_tags` patch carrying `maxTags` returns `"limit_exceeded"` (writing nothing) when the deduped tag count would exceed the limit, so a buffered run can't accumulate more tags via the tags API than the trigger validator allows at creation.
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts
@@ -7,6 +7,7 @@ import { z } from "zod";
 import { $replica } from "~/db.server";
 import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
 import { authenticateApiRequest } from "~/services/apiAuth.server";
+import { logger } from "~/services/logger.server";
 import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server";
 import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server";
 import { ServiceValidationError } from "~/v3/services/common.server";
@@ -84,10 +85,17 @@ async function routeOperationsToRun(
   );
   if (!error) return;
 
-  // PG service threw — could be "Cannot update metadata for a completed
-  // run" or similar. If the target is buffered, route operations to its
-  // snapshot too. Best-effort; do not surface this failure to the
-  // caller — the parent/root ops are auxiliary.
+  // PG service threw — commonly "Cannot update metadata for a completed
+  // run", but it could also be a transient PG failure. The parent/root
+  // ops are auxiliary, so we stay best-effort and don't surface this to
+  // the caller — but we must not swallow the failure silently, otherwise
+  // a genuine PG outage on these ops is invisible. Warn, then try the
+  // buffer in case the target is itself buffered.
+  logger.warn("metadata route: parent/root PG op failed, falling back to buffer", {
+    targetRunId,
+    error: error instanceof Error ? error.message : String(error),
+  });
+
   await applyMetadataMutationToBufferedRun({
     runId: targetRunId,
     body: { operations },
diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts
@@ -44,11 +44,11 @@ export async function action({ request, params }: ActionFunctionArgs) {
     }
 
     const env = authenticationResult.environment;
-    const outcome = await mutateWithFallback({
+    const outcome = await mutateWithFallback<Response>({
       runId: parsedParams.data.runId,
       environmentId: env.id,
       organizationId: env.organizationId,
-      bufferPatch: { type: "append_tags", tags: nonEmptyTags },
+      bufferPatch: { type: "append_tags", tags: nonEmptyTags, maxTags: MAX_TAGS_PER_RUN },
       pgMutation: async (taskRun) => {
         const existing = taskRun.runTags ?? [];
         const newTags = nonEmptyTags.filter((t) => !existing.includes(t));
@@ -76,13 +76,20 @@ export async function action({ request, params }: ActionFunctionArgs) {
         return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 });
       },
       // Buffer-applied patch path. The mutateSnapshot Lua deduplicates
-      // against existing snapshot tags atomically. MAX_TAGS_PER_RUN
-      // enforcement is skipped on the buffered side — the drainer's
-      // engine.trigger writes the PG row without enforcement either,
-      // matching today's pre-buffer trigger semantics. A future
-      // refinement could push the limit check into the Lua.
+      // against existing snapshot tags atomically and enforces
+      // MAX_TAGS_PER_RUN via the `maxTags` we pass in `bufferPatch` —
+      // matching the PG-path cap above so a buffered run can't exceed the
+      // limit the trigger validator applies at creation.
       synthesisedResponse: () =>
         json({ message: `Successfully set ${nonEmptyTags.length} new tags.` }, { status: 200 }),
+      // Buffer rejected the append because it would exceed the cap. We
+      // don't know the exact deduped overflow count here (the Lua does),
+      // so report the limit rather than a precise "trying to set N".
+      rejectedResponse: () =>
+        json(
+          { error: `Runs can only have ${MAX_TAGS_PER_RUN} tags.` },
+          { status: 422 }
+        ),
       abortSignal: getRequestAbortSignal(),
     });
 
diff --git a/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts b/apps/webapp/app/v3/mollifier/mutateWithFallback.server.ts
@@ -28,6 +28,11 @@ export type MutateWithFallbackInput<TResponse> = {
   // Called when the patch landed cleanly on the buffer snapshot. The
   // drainer will see the patched payload on its next pop.
   synthesisedResponse: () => TResponse | Promise<TResponse>;
+  // Called when the buffer rejected the patch as invalid (e.g. an
+  // `append_tags` patch carrying `maxTags` would exceed the cap). Required
+  // only by callers that send a rejectable patch; the helper throws if the
+  // buffer reports a rejection and no builder was supplied.
+  rejectedResponse?: () => TResponse | Promise<TResponse>;
   abortSignal?: AbortSignal;
   // Override defaults for tests.
   safetyNetMs?: number;
@@ -47,6 +52,7 @@ export type MutateWithFallbackInput<TResponse> = {
 export type MutateWithFallbackOutcome<TResponse> =
   | { kind: "pg"; response: TResponse }
   | { kind: "snapshot"; response: TResponse }
+  | { kind: "rejected"; response: TResponse }
   | { kind: "not_found" }
   | { kind: "timed_out" };
 
@@ -86,6 +92,18 @@ export async function mutateWithFallback<TResponse>(
     return { kind: "snapshot", response: await input.synthesisedResponse() };
   }
 
+  if (result === "limit_exceeded") {
+    // The buffer refused the patch (e.g. tag cap). Nothing was written.
+    // Surface the caller's rejection body; a missing builder means the
+    // caller sent a rejectable patch without handling the rejection.
+    if (!input.rejectedResponse) {
+      throw new Error(
+        "mutateWithFallback: buffer returned 'limit_exceeded' but no rejectedResponse was provided",
+      );
+    }
+    return { kind: "rejected", response: await input.rejectedResponse() };
+  }
+
   if (result === "not_found") {
     // Disambiguate a genuine 404 from a replica-lag miss: ask the writer
     // directly. If the row just appeared post-drain we route through the
diff --git a/apps/webapp/test/mollifierMutateWithFallback.test.ts b/apps/webapp/test/mollifierMutateWithFallback.test.ts
@@ -283,6 +283,36 @@ describe("mutateWithFallback", () => {
     expect(writer.taskRun.findFirst).toHaveBeenCalledTimes(0);
   });
 
+  it("replica miss + buffer limit_exceeded → rejected via rejectedResponse builder", async () => {
+    const pgMutation = vi.fn(async () => "pg");
+    const synthesisedResponse = vi.fn(() => "snap");
+    const result = await mutateWithFallback({
+      ...baseInput,
+      pgMutation,
+      synthesisedResponse,
+      rejectedResponse: () => "too-many-tags",
+      prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+      prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+      getBuffer: () => bufferReturning("limit_exceeded"),
+    });
+    expect(result).toEqual({ kind: "rejected", response: "too-many-tags" });
+    expect(pgMutation).not.toHaveBeenCalled();
+    expect(synthesisedResponse).not.toHaveBeenCalled();
+  });
+
+  it("buffer limit_exceeded without a rejectedResponse builder → throws (programmer error)", async () => {
+    await expect(
+      mutateWithFallback({
+        ...baseInput,
+        pgMutation: async () => "pg",
+        synthesisedResponse: () => "snap",
+        prismaReplica: fakePrisma([null]) as unknown as typeof import("~/db.server").$replica,
+        prismaWriter: fakePrisma([]) as unknown as typeof import("~/db.server").prisma,
+        getBuffer: () => bufferReturning("limit_exceeded"),
+      })
+    ).rejects.toThrow(/limit_exceeded/);
+  });
+
   it("buffer is null (mollifier disabled) → not_found after replica miss", async () => {
     const result = await mutateWithFallback({
       ...baseInput,
diff --git a/packages/redis-worker/src/mollifier/buffer.test.ts b/packages/redis-worker/src/mollifier/buffer.test.ts
@@ -1937,6 +1937,61 @@ describe("MollifierBuffer.mutateSnapshot", () => {
     },
   );
 
+  redisTest(
+    "append_tags rejects with limit_exceeded when maxTags would be exceeded, writing nothing",
+    { timeout: 20_000 },
+    async ({ redisContainer }) => {
+      const buffer = new MollifierBuffer({
+        redisOptions: {
+          host: redisContainer.getHost(),
+          port: redisContainer.getPort(),
+          password: redisContainer.getPassword(),
+        },
+        logger: new Logger("test", "log"),
+      });
+      try {
+        await buffer.accept({
+          runId: "r_cap",
+          envId: "env_m",
+          orgId: "org_1",
+          payload: serialiseSnapshot({ tags: ["a", "b"] }),
+        });
+
+        // 2 existing + 2 new = 4 deduped > cap of 3 → rejected, nothing written.
+        const rejected = await buffer.mutateSnapshot("r_cap", {
+          type: "append_tags",
+          tags: ["c", "d"],
+          maxTags: 3,
+        });
+        expect(rejected).toBe("limit_exceeded");
+        const afterReject = await buffer.getEntry("r_cap");
+        const rejPayload = JSON.parse(afterReject!.payload) as { tags: string[] };
+        expect(rejPayload.tags).toEqual(["a", "b"]);
+
+        // Dedup keeps the count under the cap → applied.
+        const applied = await buffer.mutateSnapshot("r_cap", {
+          type: "append_tags",
+          tags: ["a", "c"],
+          maxTags: 3,
+        });
+        expect(applied).toBe("applied_to_snapshot");
+        const afterApply = await buffer.getEntry("r_cap");
+        const appPayload = JSON.parse(afterApply!.payload) as { tags: string[] };
+        expect(appPayload.tags).toEqual(["a", "b", "c"]);
+
+        // Landing exactly on the cap is allowed.
+        const exact = await buffer.mutateSnapshot("r_cap", {
+          type: "append_tags",
+          tags: ["a", "b", "c"],
+          maxTags: 3,
+        });
+        expect(exact).toBe("applied_to_snapshot");
+      } finally {
+        await buffer.close();
+      }
+    },
+  );
+
   redisTest(
     "set_metadata replaces metadata + metadataType (last-write-wins)",
     { timeout: 20_000 },
diff --git a/packages/redis-worker/src/mollifier/buffer.ts b/packages/redis-worker/src/mollifier/buffer.ts
@@ -37,12 +37,21 @@ export function mollifierReconnectDelayMs(
 }
 
 export type SnapshotPatch =
-  | { type: "append_tags"; tags: string[] }
+  // `maxTags`, when set, caps the deduped tag count atomically inside the
+  // Lua: if appending would push the snapshot over the limit the patch is
+  // rejected ("limit_exceeded") and nothing is written, mirroring the
+  // PG-path MAX_TAGS_PER_RUN check so a buffered run can't accumulate more
+  // tags than the trigger validator would have allowed at creation.
+  | { type: "append_tags"; tags: string[]; maxTags?: number }
   | { type: "set_metadata"; metadata: string; metadataType: string }
   | { type: "set_delay"; delayUntil: string }
   | { type: "mark_cancelled"; cancelledAt: string; cancelReason?: string };
 
-export type MutateSnapshotResult = "applied_to_snapshot" | "not_found" | "busy";
+export type MutateSnapshotResult =
+  | "applied_to_snapshot"
+  | "not_found"
+  | "busy"
+  | "limit_exceeded";
 
 export type CasSetMetadataResult =
   | { kind: "applied"; newVersion: number }
@@ -311,6 +320,8 @@ export class MollifierBuffer {
   //     FAILED entry, whose hash the drainer-terminal `fail` path DELs.
   //   - "busy": entry is DRAINING or materialised. The API
   //     wait-and-bounces through PG.
+  //   - "limit_exceeded": an `append_tags` patch carrying `maxTags` would
+  //     push the deduped tag count over the cap; nothing is written.
   async mutateSnapshot(runId: string, patch: SnapshotPatch): Promise<MutateSnapshotResult> {
     const result = (await this.redis.mutateMollifierSnapshot(
       `mollifier:entries:${runId}`,
@@ -319,7 +330,8 @@ export class MollifierBuffer {
     if (
       result === "applied_to_snapshot" ||
       result === "not_found" ||
-      result === "busy"
+      result === "busy" ||
+      result === "limit_exceeded"
     ) {
       return result;
     }
@@ -914,6 +926,12 @@ export class MollifierBuffer {
               table.insert(merged, t)
             end
           end
+          -- Cap the deduped count when the caller supplies a limit, so a
+          -- buffered run can't exceed MAX_TAGS_PER_RUN via the tags API.
+          -- Reject the whole patch (write nothing) rather than truncating.
+          if patch.maxTags ~= nil and #merged > patch.maxTags then
+            return 'limit_exceeded'
+          end
           payload.tags = merged
         elseif patch.type == 'set_metadata' then
           payload.metadata = patch.metadata

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@trigger.dev/redis-worker": patch
 +---
++
 +Mollifier `mutateSnapshot` now enforces a tag cap: an `append_tags` patch carrying `maxTags` returns `"limit_exceeded"` (writing nothing) when the deduped tag count would exceed the limit, so a buffered run can't accumulate more tags via the tags API than the trigger validator allows at creation.