Skip to content

Commit bac383c

Browse files
committed
Fix Arrow iterator leak in LocalTableScanExec via mkBatches refactor
rowToArrowConverter wraps ArrowBatchWithSchemaIterator in a Scala .map(), which is not AutoCloseable. In the LocalTableScanExec branch, if sendBatch throws (e.g., on client disconnect), the underlying iterator was never closed, leaking 131072 bytes into ArrowUtils.rootAllocator. This change extracts mkBatches to create the raw ArrowBatchWithSchemaIterator directly. LocalTableScanExec uses mkBatches with try/finally to guarantee close(), and converter is rewritten to delegate to mkBatches, eliminating duplication of the toBatchWithSchemaIterator parameters.

Co-authored-by: Isaac
1 parent d57c504 commit bac383c

1 file changed

Lines changed: 30 additions & 11 deletions

File tree

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala

Lines changed: 30 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ import org.apache.spark.sql.connect.planner.{InvalidInputErrors, SparkConnectPla
3838
import org.apache.spark.sql.connect.service.ExecuteHolder
3939
import org.apache.spark.sql.connect.utils.{ErrorUtils, MetricGenerator, PipelineAnalysisContextUtils}
4040
import org.apache.spark.sql.execution.{DoNotCleanup, LocalTableScanExec, QueryExecution, RemoveShuffleFiles, SkipMigration, SQLExecution}
41-
import org.apache.spark.sql.execution.arrow.ArrowConverters
41+
import org.apache.spark.sql.execution.arrow.{ArrowBatchWithSchemaIterator, ArrowConverters}
4242
import org.apache.spark.sql.internal.SQLConf
4343
import org.apache.spark.sql.types.{DataType, StructType}
4444
import org.apache.spark.util.ThreadUtils
@@ -142,13 +142,25 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder)
142142
// Whether to enable arrow batch chunking for large result batches.
143143
val isResultChunkingEnabled = executePlan.resultChunkingEnabled
144144

145-
val converter = rowToArrowConverter(
146-
schema,
147-
maxRecordsPerBatch,
148-
maxBatchSize,
149-
timeZoneId,
150-
errorOnDuplicatedFieldNames = false,
151-
largeVarTypes = largeVarTypes)
145+
// mkBatches creates an ArrowBatchWithSchemaIterator (AutoCloseable). It is used directly
146+
// in the LocalTableScanExec branch so that we can close it in a finally block — the
147+
// converter wrapper below returns a plain Scala-mapped iterator that is NOT AutoCloseable,
148+
// so if sendBatch throws (e.g., client disconnect) the underlying iterator would leak
149+
// 131072 bytes into ArrowUtils.rootAllocator.
150+
val mkBatches: Iterator[InternalRow] => ArrowBatchWithSchemaIterator = rows =>
151+
ArrowConverters.toBatchWithSchemaIterator(
152+
rows,
153+
schema,
154+
maxRecordsPerBatch,
155+
maxBatchSize,
156+
timeZoneId,
157+
errorOnDuplicatedFieldNames = false,
158+
largeVarTypes = largeVarTypes)
159+
160+
val converter: Iterator[InternalRow] => Iterator[Batch] = rows => {
161+
val batches = mkBatches(rows)
162+
batches.map(b => b -> batches.rowCountInLastBatch)
163+
}
152164

153165
var numSent = 0
154166
def sendBatch(bytes: Array[Byte], count: Long, startOffset: Long): Unit = {
@@ -209,9 +221,16 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder)
209221
case LocalTableScanExec(_, rows, _) =>
210222
executePlan.eventsManager.postFinished(Some(rows.length))
211223
var offset = 0L
212-
converter(rows.iterator).foreach { case (bytes, count) =>
213-
sendBatch(bytes, count, offset)
214-
offset += count
224+
val batches = mkBatches(rows.iterator)
225+
try {
226+
while (batches.hasNext) {
227+
val batchBytes = batches.next()
228+
val count = batches.rowCountInLastBatch
229+
sendBatch(batchBytes, count, offset)
230+
offset += count
231+
}
232+
} finally {
233+
batches.close()
215234
}
216235
case _ =>
217236
SQLExecution.withNewExecutionId(dataframe.queryExecution, Some("collectArrow")) {

0 commit comments

Comments (0)