@@ -33,10 +33,7 @@ import {
3333} from "./services/warmStartVerificationService.js" ;
3434import { extractTraceparent , getRestoreRunnerId } from "./util.js" ;
3535import { Redis } from "ioredis" ;
36- import {
37- BackpressureMonitor ,
38- type BackpressureSignalSource ,
39- } from "./backpressure/backpressureMonitor.js" ;
36+ import { BackpressureMonitor } from "./backpressure/backpressureMonitor.js" ;
4037import { RedisBackpressureSignalSource } from "./backpressure/redisBackpressureSignalSource.js" ;
4138import { BackpressureMetrics } from "./backpressure/backpressureMetrics.js" ;
4239import { K8sPodCountSignalSource } from "./backpressure/k8sPodCountSignalSource.js" ;
@@ -76,7 +73,7 @@ class ManagedSupervisor {
7673 private readonly podCleaner ?: PodCleaner ;
7774 private readonly failedPodHandler ?: FailedPodHandler ;
7875 private readonly tracing ?: OtlpTraceService ;
79- private readonly backpressureMonitor ? : BackpressureMonitor ;
76+ private readonly backpressureMonitors : BackpressureMonitor [ ] = [ ] ;
8077 private readonly backpressureRedis ?: Redis ;
8178
8279 private readonly isKubernetes = isKubernetesEnvironment ( env . KUBERNETES_FORCE_ENABLED ) ;
@@ -217,71 +214,79 @@ class ManagedSupervisor {
217214 ) ;
218215 }
219216
217+ // Redis-verdict source (external aggregator). Keeps existing metric names.
220218 if ( env . TRIGGER_DEQUEUE_BACKPRESSURE_ENABLED ) {
221- let source : BackpressureSignalSource ;
222- let refreshIntervalMs = env . TRIGGER_DEQUEUE_BACKPRESSURE_REFRESH_MS ;
223-
224- if ( env . TRIGGER_DEQUEUE_BACKPRESSURE_SOURCE === "k8s-pod-count" ) {
225- refreshIntervalMs = env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_REFRESH_MS ;
226- if (
227- env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE >=
228- env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE
229- ) {
230- throw new Error (
231- "TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE must be less than TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE"
232- ) ;
233- }
234- const podCountGauge = new Gauge ( {
235- name : "supervisor_cluster_pod_count" ,
236- help : "Total pod objects stored in the cluster, scraped for backpressure" ,
237- registers : [ register ] ,
238- } ) ;
239- source = new K8sPodCountSignalSource ( {
240- fetchMetrics : createApiserverMetricsFetcher ( ) ,
241- engageThreshold : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE ,
242- releaseThreshold : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE ,
243- reportPodCount : ( count ) => podCountGauge . set ( count ) ,
244- } ) ;
245- this . logger . log ( "🛑 Dequeue backpressure enabled (pod-count source)" , {
246- engage : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE ,
247- release : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE ,
248- refreshIntervalMs,
249- dryRun : env . TRIGGER_DEQUEUE_BACKPRESSURE_DRY_RUN ,
250- } ) ;
251- } else {
252- this . backpressureRedis = new Redis ( {
253- host : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_HOST ,
254- port : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PORT ,
255- username : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_USERNAME ,
256- password : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PASSWORD ,
257- ...( env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_TLS_DISABLED ? { } : { tls : { } } ) ,
258- maxRetriesPerRequest : null ,
259- } ) ;
260- this . backpressureRedis . on ( "error" , ( error ) =>
261- this . logger . error ( "Backpressure redis error" , { error : error . message } )
262- ) ;
263- source = new RedisBackpressureSignalSource (
264- this . backpressureRedis ,
265- env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_KEY
266- ) ;
267- this . logger . log ( "🛑 Dequeue backpressure enabled (redis source)" , {
268- key : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_KEY ,
219+ this . backpressureRedis = new Redis ( {
220+ host : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_HOST ,
221+ port : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PORT ,
222+ username : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_USERNAME ,
223+ password : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_PASSWORD ,
224+ ...( env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_TLS_DISABLED ? { } : { tls : { } } ) ,
225+ maxRetriesPerRequest : null ,
226+ } ) ;
227+ this . backpressureRedis . on ( "error" , ( error ) =>
228+ this . logger . error ( "Backpressure redis error" , { error : error . message } )
229+ ) ;
230+ this . backpressureMonitors . push (
231+ new BackpressureMonitor ( {
232+ enabled : true ,
233+ source : new RedisBackpressureSignalSource (
234+ this . backpressureRedis ,
235+ env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_KEY
236+ ) ,
269237 refreshIntervalMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_REFRESH_MS ,
270238 maxVerdictAgeMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_MAX_VERDICT_AGE_MS ,
271239 rampMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_RAMP_MS ,
272240 dryRun : env . TRIGGER_DEQUEUE_BACKPRESSURE_DRY_RUN ,
273- } ) ;
274- }
275-
276- this . backpressureMonitor = new BackpressureMonitor ( {
277- enabled : true ,
278- source,
279- refreshIntervalMs,
280- maxVerdictAgeMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_MAX_VERDICT_AGE_MS ,
281- rampMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_RAMP_MS ,
241+ logger : this . logger ,
242+ metrics : new BackpressureMetrics ( { register } ) ,
243+ } )
244+ ) ;
245+ this . logger . log ( "🛑 Dequeue backpressure enabled (redis source)" , {
246+ key : env . TRIGGER_DEQUEUE_BACKPRESSURE_REDIS_KEY ,
247+ refreshIntervalMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_REFRESH_MS ,
282248 dryRun : env . TRIGGER_DEQUEUE_BACKPRESSURE_DRY_RUN ,
283- logger : this . logger ,
284- metrics : new BackpressureMetrics ( { register } ) ,
249+ } ) ;
250+ }
251+
252+ // Pod-count source (in-process apiserver scrape). Namespaced metrics so the
253+ // redis source's metric names are preserved.
254+ if ( env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENABLED ) {
255+ if (
256+ env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE >=
257+ env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE
258+ ) {
259+ throw new Error (
260+ "TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE must be less than TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE"
261+ ) ;
262+ }
263+ const podCountGauge = new Gauge ( {
264+ name : "supervisor_cluster_pod_count" ,
265+ help : "Total pod objects stored in the cluster, scraped for backpressure" ,
266+ registers : [ register ] ,
267+ } ) ;
268+ this . backpressureMonitors . push (
269+ new BackpressureMonitor ( {
270+ enabled : true ,
271+ source : new K8sPodCountSignalSource ( {
272+ fetchMetrics : createApiserverMetricsFetcher ( ) ,
273+ engageThreshold : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE ,
274+ releaseThreshold : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE ,
275+ reportPodCount : ( count ) => podCountGauge . set ( count ) ,
276+ } ) ,
277+ refreshIntervalMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_REFRESH_MS ,
278+ maxVerdictAgeMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_MAX_VERDICT_AGE_MS ,
279+ rampMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_RAMP_MS ,
280+ dryRun : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_DRY_RUN ,
281+ logger : this . logger ,
282+ metrics : new BackpressureMetrics ( { register, prefix : "supervisor_backpressure_pod_count" } ) ,
283+ } )
284+ ) ;
285+ this . logger . log ( "🛑 Dequeue backpressure enabled (pod-count source)" , {
286+ engage : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE ,
287+ release : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE ,
288+ refreshIntervalMs : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_REFRESH_MS ,
289+ dryRun : env . TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_DRY_RUN ,
285290 } ) ;
286291 }
287292
@@ -308,14 +313,14 @@ class ManagedSupervisor {
308313 dampingFactor : env . TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR ,
309314 // Freeze scale-up while backpressure is hard-engaged (not during the resume
310315 // ramp). Undefined when backpressure is disabled → no effect on scaling.
311- shouldPauseScaling : ( ) => this . backpressureMonitor ?. isEngaged ( ) ?? false ,
316+ shouldPauseScaling : ( ) => this . backpressureMonitors . some ( ( m ) => m . isEngaged ( ) ) ,
312317 } ,
313318 runNotificationsEnabled : env . TRIGGER_WORKLOAD_API_ENABLED ,
314319 heartbeatIntervalSeconds : env . TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS ,
315320 sendRunDebugLogs : env . SEND_RUN_DEBUG_LOGS ,
316321 preDequeue : async ( ) => {
317- // Synchronous, hot-path-safe cached read; undefined when backpressure is disabled .
318- const skipForBackpressure = this . backpressureMonitor ?. shouldSkipDequeue ( ) ?? false ;
322+ // Synchronous, hot-path-safe cached read; false when no monitors are active .
323+ const skipForBackpressure = this . backpressureMonitors . some ( ( m ) => m . shouldSkipDequeue ( ) ) ;
319324
320325 if ( ! env . RESOURCE_MONITOR_ENABLED || this . isKubernetes ) {
321326 // Resource monitor is not used in k8s; backpressure is the only gate there.
@@ -710,7 +715,7 @@ class ManagedSupervisor {
710715 this . logger . log ( "Starting up" ) ;
711716
712717 // Optional services
713- this . backpressureMonitor ?. start ( ) ;
718+ this . backpressureMonitors . forEach ( ( m ) => m . start ( ) ) ;
714719 await this . podCleaner ?. start ( ) ;
715720 await this . failedPodHandler ?. start ( ) ;
716721 await this . metricsServer ?. start ( ) ;
@@ -738,7 +743,7 @@ class ManagedSupervisor {
738743 await this . workerSession . stop ( ) ;
739744
740745 // Optional services
741- this . backpressureMonitor ?. stop ( ) ;
746+ this . backpressureMonitors . forEach ( ( m ) => m . stop ( ) ) ;
742747 await this . backpressureRedis ?. quit ( ) ;
743748 await this . podCleaner ?. stop ( ) ;
744749 await this . failedPodHandler ?. stop ( ) ;
0 commit comments