fix(webapp,run-engine): replay-layer code-review follow-ups

d-cs · claude · d-cs · commit 062bcface6eb · 2026-05-26T15:05:53.000+01:00
- `isRetryablePgError`: also accept `errorCode === "P1001"` so
  `PrismaClientInitializationError` (which surfaces P1001 on a
  different field than `PrismaClientKnownRequestError`) retries.
- Drop `envId` from OTel metric labels on
  `mollifier.realtime_subscriptions.buffered`,
  `mollifier.stale_entries`, and the
  `mollifier.stale_entries.current` gauge. `envId` is a banned
  high-cardinality attribute; the structured warn log alongside each
  counter tick still carries envId for forensic drill-down.
- Stale-sweep test name + comments now match the assertion shape
  (all three entries stale, not "two stale + one fresh").
- `RunEngine.createCancelledRun` P2002 path now requires the existing
  row's status to be CANCELED; a non-canceled conflict throws rather
  than silently reporting success, so the caller can route to
  `engine.cancelRun()` or skip.
- Regression test pins the new conflict guard.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts
@@ -10,8 +10,15 @@ const tracer = trace.getTracer("mollifier-drainer");
 export function isRetryablePgError(err: unknown): boolean {
   if (!(err instanceof Error)) return false;
   const msg = err.message ?? "";
+  // Prisma surfaces P1001 ("Can't reach database server") via two
+  // different error classes — `PrismaClientKnownRequestError` exposes
+  // it as `err.code`, `PrismaClientInitializationError` exposes it as
+  // `err.errorCode`. Check both so reconnection-time errors retry
+  // regardless of which class fires.
   const code = (err as { code?: string }).code;
+  const errorCode = (err as { errorCode?: string }).errorCode;
   if (code === "P2024") return true;
+  if (code === "P1001" || errorCode === "P1001") return true;
   if (msg.includes("Can't reach database server")) return true;
   if (msg.includes("Connection lost")) return true;
   if (msg.includes("ECONNRESET")) return true;
diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts
@@ -22,7 +22,10 @@ export type StaleSweepConfig = {
 
 export type StaleSweepDeps = {
   getBuffer?: () => MollifierBuffer | null;
-  recordStaleEntry?: (envId: string) => void;
+  // No `envId` arg — `envId` is a high-cardinality metric attribute and
+  // is intentionally not emitted as a metric label. The structured warn
+  // log below carries envId for forensic drill-down.
+  recordStaleEntry?: () => void;
   reportStaleEntrySnapshot?: (snapshot: Map<string, number>) => void;
   logger?: { warn: (message: string, fields: Record<string, unknown>) => void };
   now?: () => number;
@@ -82,7 +85,7 @@ export async function runStaleSweepOnce(
         entriesScanned += 1;
         const dwellMs = now - entry.createdAt.getTime();
         if (dwellMs > config.staleThresholdMs) {
-          recordStale(envId);
+          recordStale();
           log.warn("mollifier.stale_entry", {
             runId: entry.runId,
             envId,
diff --git a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts
@@ -29,16 +29,21 @@ export const realtimeBufferedSubscriptionsCounter = meter.createCounter(
   },
 );
 
-export function recordRealtimeBufferedSubscription(envId: string): void {
-  realtimeBufferedSubscriptionsCounter.add(1, { envId });
+// No `envId` attribute — `envId` is a banned high-cardinality metric
+// label per the repo's OTel rules. Per-env detail belongs on the
+// structured log emitted alongside the counter tick at the call
+// site, where it remains available for forensic drill-down; the
+// metric stays an aggregate.
+export function recordRealtimeBufferedSubscription(): void {
+  realtimeBufferedSubscriptionsCounter.add(1);
 }
 
 // Counts buffer entries that have been waiting in the queue ZSET longer
-// than the configured stale threshold (typically half of entryTtlSeconds).
-// Useful for historical "stale events over time" views, but not directly
-// alertable on its own — a single stuck entry observed by N sweep ticks
-// adds N to the counter, so `rate()` over an alerting window reflects
-// (entries × ticks), not "entries that are stale right now".
+// than the configured stale threshold. Useful for historical "stale
+// events over time" views, but not directly alertable on its own — a
+// single stuck entry observed by N sweep ticks adds N to the counter,
+// so `rate()` over an alerting window reflects (entries × ticks), not
+// "entries that are stale right now".
 export const staleEntriesCounter = meter.createCounter(
   "mollifier.stale_entries",
   {
@@ -47,16 +52,16 @@ export const staleEntriesCounter = meter.createCounter(
   },
 );
 
-export function recordStaleEntry(envId: string): void {
-  staleEntriesCounter.add(1, { envId });
+// No `envId` label (high-cardinality); the sweep's warn log carries envId.
+export function recordStaleEntry(): void {
+  staleEntriesCounter.add(1);
 }
 
-// Alertable signal: the count of stale entries observed by the latest
-// sweep, per env. The sweep snapshots the full per-env picture on each
-// pass (including zeros for envs that no longer have any stale entries)
-// so an env that was paging can clear when the drainer catches up
-// instead of staying latched. Recommended alert:
-//   mollifier_stale_entries_current{envId=...} > 0 for 5m
+// Alertable signal: the total count of stale entries observed by the
+// latest sweep. The sweep snapshots the full picture on each pass so
+// the gauge drops back to 0 when the drainer catches up instead of
+// staying latched. Recommended alert:
+//   mollifier_stale_entries_current > 0 for 5m
 export const staleEntriesGauge = meter.createObservableGauge(
   "mollifier.stale_entries.current",
   {
@@ -65,23 +70,22 @@ export const staleEntriesGauge = meter.createObservableGauge(
   },
 );
 
-const latestStaleSnapshot = new Map<string, number>();
+let latestStaleTotal = 0;
 
 export function reportStaleEntrySnapshot(snapshot: Map<string, number>): void {
-  // Replace, don't merge — envs absent from the new snapshot have either
-  // drained or no longer exist; leaving their last value cached would
-  // keep alerts latched forever.
-  latestStaleSnapshot.clear();
-  for (const [envId, count] of snapshot) {
-    latestStaleSnapshot.set(envId, count);
+  // Sum across envs. Per-env breakdown is intentionally NOT emitted as
+  // a metric label (high-cardinality); the structured warn log lines
+  // from the sweep carry per-env detail for ops to drill down.
+  let total = 0;
+  for (const count of snapshot.values()) {
+    total += count;
   }
+  latestStaleTotal = total;
 }
 
 meter.addBatchObservableCallback(
   (result) => {
-    for (const [envId, count] of latestStaleSnapshot) {
-      result.observe(staleEntriesGauge, count, { envId });
-    }
+    result.observe(staleEntriesGauge, latestStaleTotal);
   },
   [staleEntriesGauge],
 );
diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts
@@ -14,16 +14,21 @@ const SNAPSHOT = {
 };
 
 function spyDeps() {
-  const recordedStaleEnvIds: string[] = [];
+  // Counter ticks — metric carries no `envId` label (high-cardinality)
+  // so the spy is a simple call count. Per-env detail lives on the
+  // structured warn log and the snapshot map.
+  let staleEntryCount = 0;
   const snapshots: Array<Map<string, number>> = [];
   const warnings: Array<{ message: string; fields: Record<string, unknown> }> = [];
   return {
-    recordedStaleEnvIds,
+    get staleEntryCount() {
+      return staleEntryCount;
+    },
     snapshots,
     warnings,
     deps: {
-      recordStaleEntry: (envId: string) => {
-        recordedStaleEnvIds.push(envId);
+      recordStaleEntry: () => {
+        staleEntryCount += 1;
       },
       reportStaleEntrySnapshot: (snapshot: Map<string, number>) => {
         // Clone so post-sweep assertions see what was reported *at that
@@ -45,19 +50,20 @@ describe("runStaleSweepOnce — unit", () => {
     // Mirrors the prod gate: if TRIGGER_MOLLIFIER_ENABLED=0 the buffer
     // singleton is null and the sweep is a no-op. We don't want it to
     // emit a metric (or throw) just because mollifier is disabled.
-    const { deps, recordedStaleEnvIds, warnings, snapshots } = spyDeps();
+    const spies = spyDeps();
     const result = await runStaleSweepOnce(
       { staleThresholdMs: 1000 },
-      { ...deps, getBuffer: () => null },
+      { ...spies.deps, getBuffer: () => null },
     );
     expect(result).toEqual({
       orgsScanned: 0,
       envsScanned: 0,
       entriesScanned: 0,
       staleCount: 0,
     });
-    expect(recordedStaleEnvIds).toEqual([]);
-    expect(warnings).toEqual([]);
+    expect(spies.staleEntryCount).toBe(0);
+    expect(spies.warnings).toEqual([]);
+    const snapshots = spies.snapshots;
     // An empty snapshot is still reported so any previously-paging env
     // (from a prior sweep before mollifier was disabled) clears.
     expect(snapshots).toHaveLength(1);
@@ -67,14 +73,15 @@ describe("runStaleSweepOnce — unit", () => {
 
 describe("runStaleSweepOnce — testcontainers", () => {
   redisTest(
-    "flags entries whose dwell exceeds the stale threshold and skips fresh ones",
+    "flags every entry whose dwell exceeds the stale threshold",
     async ({ redisOptions }) => {
       const buffer = new MollifierBuffer({ redisOptions });
       try {
-        // Two stale entries (one in each env) + one fresh entry. Sweep
-        // should flag the two stale, leave the fresh one alone, record
-        // the counter once per stale entry, and emit a warning per
-        // stale entry with the dwell + threshold.
+        // Three entries across two envs in the same org. The sweep below
+        // runs against a `now` advanced by 5 minutes, so all three have
+        // dwell ~5min and ALL THREE are stale against a 1-minute
+        // threshold — there is no "fresh" entry in this scenario. The
+        // assertions below pin the all-three-stale shape.
         await buffer.accept({
           runId: "run_stale_a",
           envId: "env_a",
@@ -88,7 +95,7 @@ describe("runStaleSweepOnce — testcontainers", () => {
           payload: JSON.stringify(SNAPSHOT),
         });
         await buffer.accept({
-          runId: "run_fresh",
+          runId: "run_stale_c",
           envId: "env_a",
           orgId: "org_1",
           payload: JSON.stringify(SNAPSHOT),
@@ -98,11 +105,11 @@ describe("runStaleSweepOnce — testcontainers", () => {
         // the threshold without actually waiting in real time.
         const futureNow = Date.now() + 5 * 60 * 1000;
 
-        const { deps, recordedStaleEnvIds, warnings, snapshots } = spyDeps();
+        const spies = spyDeps();
         const result = await runStaleSweepOnce(
           { staleThresholdMs: 60 * 1000 },
           {
-            ...deps,
+            ...spies.deps,
             getBuffer: () => buffer,
             now: () => futureNow,
           },
@@ -111,22 +118,21 @@ describe("runStaleSweepOnce — testcontainers", () => {
         expect(result.envsScanned).toBe(2);
         expect(result.entriesScanned).toBe(3);
         expect(result.staleCount).toBe(3);
-        // All three entries have dwell ~5min, all exceed the 1-min
-        // threshold; each emits one counter tick + one warning.
-        expect(recordedStaleEnvIds.sort()).toEqual(
-          ["env_a", "env_a", "env_b"].sort(),
-        );
-        expect(warnings).toHaveLength(3);
-        for (const w of warnings) {
+        // All three entries exceed the threshold; each emits one
+        // counter tick + one warning.
+        expect(spies.staleEntryCount).toBe(3);
+        expect(spies.warnings).toHaveLength(3);
+        for (const w of spies.warnings) {
           expect(w.message).toBe("mollifier.stale_entry");
           expect(w.fields.staleThresholdMs).toBe(60 * 1000);
           expect(w.fields.dwellMs).toBeGreaterThan(60 * 1000);
         }
         // Snapshot drives the alertable gauge — env_a has 2 stale
-        // entries, env_b has 1. Both must appear so a future alert can
-        // identify which env is paging.
-        expect(snapshots).toHaveLength(1);
-        expect(Object.fromEntries(snapshots[0])).toEqual({
+        // entries, env_b has 1. Per-env detail is still passed to
+        // `reportStaleEntrySnapshot` for forensic value even though the
+        // gauge itself aggregates the total.
+        expect(spies.snapshots).toHaveLength(1);
+        expect(Object.fromEntries(spies.snapshots[0])).toEqual({
           env_a: 2,
           env_b: 1,
         });
@@ -151,13 +157,13 @@ describe("runStaleSweepOnce — testcontainers", () => {
           orgId: "org_1",
           payload: JSON.stringify(SNAPSHOT),
         });
-        const { deps, snapshots } = spyDeps();
+        const spies = spyDeps();
         await runStaleSweepOnce(
           { staleThresholdMs: 60 * 1000 },
-          { ...deps, getBuffer: () => buffer },
+          { ...spies.deps, getBuffer: () => buffer },
         );
-        expect(snapshots).toHaveLength(1);
-        expect(Object.fromEntries(snapshots[0])).toEqual({ env_a: 0 });
+        expect(spies.snapshots).toHaveLength(1);
+        expect(Object.fromEntries(spies.snapshots[0])).toEqual({ env_a: 0 });
       } finally {
         await buffer.close();
       }
@@ -179,14 +185,14 @@ describe("runStaleSweepOnce — testcontainers", () => {
           orgId: "org_1",
           payload: JSON.stringify(SNAPSHOT),
         });
-        const { deps, recordedStaleEnvIds, warnings } = spyDeps();
+        const spies = spyDeps();
         const result = await runStaleSweepOnce(
           { staleThresholdMs: 60 * 1000 },
-          { ...deps, getBuffer: () => buffer },
+          { ...spies.deps, getBuffer: () => buffer },
         );
         expect(result.staleCount).toBe(0);
-        expect(recordedStaleEnvIds).toEqual([]);
-        expect(warnings).toEqual([]);
+        expect(spies.staleEntryCount).toBe(0);
+        expect(spies.warnings).toEqual([]);
       } finally {
         await buffer.close();
       }
@@ -215,10 +221,10 @@ describe("runStaleSweepOnce — testcontainers", () => {
           payload: JSON.stringify(SNAPSHOT),
         });
         const futureNow = Date.now() + 5 * 60 * 1000;
-        const { deps } = spyDeps();
+        const spies = spyDeps();
         const result = await runStaleSweepOnce(
           { staleThresholdMs: 60 * 1000 },
-          { ...deps, getBuffer: () => buffer, now: () => futureNow },
+          { ...spies.deps, getBuffer: () => buffer, now: () => futureNow },
         );
         expect(result.orgsScanned).toBe(2);
         expect(result.envsScanned).toBe(2);
diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts
@@ -599,7 +599,21 @@ export class RunEngine {
             { friendlyId: snapshot.friendlyId },
           );
           const existing = await prisma.taskRun.findFirst({ where: { id } });
-          if (existing) return existing;
+          if (existing) {
+            // Only treat the conflict as idempotent when the existing
+            // row is ALREADY canceled. If a non-canceled row landed
+            // first (e.g. the drainer's normal `engine.trigger` replay
+            // path raced ahead of the cancel) we surface a conflict
+            // rather than silently reporting "cancelled" — the run is
+            // genuinely live and the caller must decide between
+            // engine.cancelRun() and skipping.
+            if (existing.status === "CANCELED") {
+              return existing;
+            }
+            throw new Error(
+              `createCancelledRun conflict: existing run ${snapshot.friendlyId} has status ${existing.status}`,
+            );
+          }
         }
         throw err;
       }
diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts
@@ -230,4 +230,65 @@ describe("RunEngine.createCancelledRun", () => {
       }
     },
   );
+
+  // Regression: the P2002-on-id idempotency path used to return ANY
+  // existing row, which would silently report success even if a live
+  // (non-CANCELED) row landed first. The guard now requires the
+  // existing row's status to be CANCELED; anything else surfaces a
+  // conflict so the caller can route to engine.cancelRun() or skip.
+  containerTest(
+    "P2002 conflict with non-CANCELED existing row throws (does not silently succeed)",
+    async ({ prisma, redisOptions }) => {
+      const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+      const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) });
+      try {
+        const friendlyId = freshRunId();
+        const id = RunId.fromFriendlyId(friendlyId);
+
+        // Plant a live (non-CANCELED) row with the same id so the
+        // cancelled-run INSERT hits P2002 and the guard finds a row
+        // that ISN'T CANCELED.
+        await prisma.taskRun.create({
+          data: {
+            id,
+            friendlyId,
+            taskIdentifier: "test-task",
+            payload: "{}",
+            payloadType: "application/json",
+            status: "PENDING",
+            runtimeEnvironmentId: env.id,
+            projectId: env.project.id,
+            organizationId: env.organizationId,
+            queue: "task/test-task",
+            traceId: "0000000000000000aaaa000000000000",
+            spanId: "bbbb000000000000",
+            engine: "V2",
+          },
+        });
+
+        await expect(
+          engine.createCancelledRun({
+            snapshot: {
+              friendlyId,
+              environment: env,
+              taskIdentifier: "test-task",
+              payload: "{}",
+              payloadType: "application/json",
+              context: {},
+              traceContext: {},
+              traceId: "0000000000000000aaaa000000000000",
+              spanId: "bbbb000000000000",
+              queue: "task/test-task",
+              isTest: false,
+              tags: [],
+            },
+            cancelledAt: new Date(),
+            cancelReason: "Should not silently overwrite a live row",
+          }),
+        ).rejects.toThrow(/createCancelledRun conflict.*PENDING/);
+      } finally {
+        await engine.quit();
+      }
+    },
+  );
 });