fix(run-engine): reschedule batch drain on candidateIds count, not pendingRuns

ericallam · ericallam · commit ff6db49d1ea7 · 2026-05-22T16:53:20.000+01:00
After the ClickHouse migration, pendingRuns.length is counted after the
status guard: runs that have already left PENDING_VERSION between the
ClickHouse (CH) lookup and the Postgres refetch are filtered out. Using
it as the more-work signal under-reports when more candidates exist on
the worker, so the drain stops short.
Switch to candidateIds.length, which is the raw lookup result.
diff --git a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts
@@ -176,8 +176,12 @@ export class PendingVersionSystem {
       });
     }
 
-    //enqueue more if needed
-    if (pendingRuns.length > maxCount) {
+    // Reschedule when the lookup returned a full-plus-one batch — that's
+    // the signal there are more candidates to drain. Use `candidateIds`
+    // (the raw lookup result) rather than `pendingRuns` (post-status-guard)
+    // because runs that already left PENDING_VERSION shouldn't suppress
+    // the next batch.
+    if (candidateIds.length > maxCount) {
       await this.scheduleResolvePendingVersionRuns(backgroundWorkerId);
     }
   }

Original file line number	Diff line number	Diff line change
`@@ -176,8 +176,12 @@ export class PendingVersionSystem {`
`176`	`176`	`});`
`177`	`177`	`}`
`178`	`178`
`179`		`- //enqueue more if needed`
`180`		`- if (pendingRuns.length > maxCount) {`
	`179`	`+ // Reschedule when the lookup returned a full-plus-one batch — that's`
	`180`	``+ // the signal there are more candidates to drain. Use `candidateIds` ``
	`181`	``+ // (the raw lookup result) rather than `pendingRuns` (post-status-guard)``
	`182`	`+ // because runs that already left PENDING_VERSION shouldn't suppress`
	`183`	`+ // the next batch.`
	`184`	`+ if (candidateIds.length > maxCount) {`
`181`	`185`	`await this.scheduleResolvePendingVersionRuns(backgroundWorkerId);`
`182`	`186`	`}`
`183`	`187`	`}`