@@ -13,6 +13,7 @@ import { logger } from "~/services/logger.server";
1313import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server" ;
1414import { reportInvocationUsage } from "~/services/platform.v3.server" ;
1515import { MetadataTooLargeError } from "~/utils/packets" ;
16+ import { QueueSizeLimitExceededError } from "~/v3/services/common.server" ;
1617import { TriggerTaskService } from "~/v3/services/triggerTask.server" ;
1718import { tracer } from "~/v3/tracer.server" ;
1819import { createExceptionPropertiesFromError } from "./eventRepository/common.server" ;
@@ -637,6 +638,15 @@ export function registerRunEngineEventBusHandlers() {
637638 } ) ;
638639}
639640
641+ /**
642+ * `errorCode` returned by the batch process-item callback when a trigger is
643+ * rejected with `QueueSizeLimitExceededError` (the environment's queue is at its
644+ * maximum size). The callback also returns `skipRetries: true` to signal the
645+ * BatchQueue not to retry the item. When every failure in a batch carries this
646+ * code, the completion callback writes one aggregate `BatchTaskRunError` row, not one per item.
647+ */
648+ const QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE = "QUEUE_SIZE_LIMIT_EXCEEDED" ;
649+
640650/**
641651 * Set up the BatchQueue processing callbacks.
642652 * These handle creating runs from batch items and completing batches.
@@ -808,6 +818,37 @@ export function setupBatchQueueCallbacks() {
808818 } catch ( error ) {
809819 const errorMessage = error instanceof Error ? error . message : String ( error ) ;
810820
821+ // Queue-size-limit rejections are a customer-overload scenario (the
822+ // env's queue is at its configured max). Retrying is pointless — the
823+ // same item will fail again — and creating pre-failed TaskRuns for
824+ // every item of every retried batch is exactly what chews through
825+ // DB capacity when a noisy tenant fills their queue. Signal the
826+ // BatchQueue to skip retries and skip pre-failed run creation, and
827+ // let the completion callback collapse the per-item errors into a
828+ // single summary row.
829+ if ( error instanceof QueueSizeLimitExceededError ) {
830+ logger . warn ( "[BatchQueue] Batch item rejected: queue size limit reached" , {
831+ batchId,
832+ friendlyId,
833+ itemIndex,
834+ task : item . task ,
835+ environmentId : meta . environmentId ,
836+ maximumSize : error . maximumSize ,
837+ } ) ;
838+
839+ span . setAttribute ( "batch.result.error" , errorMessage ) ;
840+ span . setAttribute ( "batch.result.errorCode" , QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE ) ;
841+ span . setAttribute ( "batch.result.skipRetries" , true ) ;
842+ span . end ( ) ;
843+
844+ return {
845+ success : false as const ,
846+ error : errorMessage ,
847+ errorCode : QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE ,
848+ skipRetries : true ,
849+ } ;
850+ }
851+
811852 logger . error ( "[BatchQueue] Failed to trigger batch item" , {
812853 batchId,
813854 friendlyId,
@@ -889,20 +930,51 @@ export function setupBatchQueueCallbacks() {
889930 } ,
890931 } ) ;
891932
892- // Create error records if there were failures
933+ // Create error records if there were failures.
934+ //
935+ // Fast-path for queue-size-limit overload: when every failure is the
936+ // same QUEUE_SIZE_LIMIT_EXCEEDED error, collapse them into a single
937+ // aggregate row instead of writing one per item. This keeps the DB
938+ // write volume bounded to O(batches) instead of O(items) when a noisy
939+ // tenant fills their queue and all of their batches start bouncing.
893940 if ( failures . length > 0 ) {
894- await tx . batchTaskRunError . createMany ( {
895- data : failures . map ( ( failure ) => ( {
896- batchTaskRunId : batchId ,
897- index : failure . index ,
898- taskIdentifier : failure . taskIdentifier ,
899- payload : failure . payload ,
900- options : failure . options as Prisma . InputJsonValue | undefined ,
901- error : failure . error ,
902- errorCode : failure . errorCode ,
903- } ) ) ,
904- skipDuplicates : true ,
905- } ) ;
941+ const allQueueSizeLimit = failures . every (
942+ ( f ) => f . errorCode === QUEUE_SIZE_LIMIT_EXCEEDED_ERROR_CODE
943+ ) ;
944+
945+ if ( allQueueSizeLimit ) {
946+ const sample = failures [ 0 ] ! ;
947+ await tx . batchTaskRunError . createMany ( {
948+ data : [
949+ {
950+ batchTaskRunId : batchId ,
951+ // Use the first item's index as a stable anchor for the
952+ // (batchTaskRunId, index) unique constraint so callback
953+ // retries remain idempotent.
954+ index : sample . index ,
955+ taskIdentifier : sample . taskIdentifier ,
956+ payload : sample . payload ,
957+ options : sample . options as Prisma . InputJsonValue | undefined ,
958+ error : `${ sample . error } (${ failures . length } items in this batch failed with the same error)` ,
959+ errorCode : sample . errorCode ,
960+ } ,
961+ ] ,
962+ skipDuplicates : true ,
963+ } ) ;
964+ } else {
965+ await tx . batchTaskRunError . createMany ( {
966+ data : failures . map ( ( failure ) => ( {
967+ batchTaskRunId : batchId ,
968+ index : failure . index ,
969+ taskIdentifier : failure . taskIdentifier ,
970+ payload : failure . payload ,
971+ options : failure . options as Prisma . InputJsonValue | undefined ,
972+ error : failure . error ,
973+ errorCode : failure . errorCode ,
974+ } ) ) ,
975+ skipDuplicates : true ,
976+ } ) ;
977+ }
906978 }
907979 } ) ;
908980
0 commit comments