Commit 37e4a23

fix: Convert Spark columnar batches to Arrow in CometNativeWriteExec (#2944)
1 parent: 3a6452b

2 files changed: 148 additions & 8 deletions

spark/src/main/scala/org/apache/spark/sql/comet/CometNativeWriteExec.scala

Lines changed: 39 additions & 8 deletions
@@ -26,16 +26,19 @@ import scala.jdk.CollectionConverters._
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext, TaskAttemptID, TaskID, TaskType}
 import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
+import org.apache.spark.TaskContext
 import org.apache.spark.internal.io.FileCommitProtocol
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.comet.execution.arrow.CometArrowConverters
 import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
 import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.Utils

-import org.apache.comet.CometExecIterator
+import org.apache.comet.{CometConf, CometExecIterator}
 import org.apache.comet.serde.OperatorOuterClass.Operator
+import org.apache.comet.vector.CometVector

 /**
  * Comet physical operator for native Parquet write operations with FileCommitProtocol support.
@@ -138,16 +141,21 @@ case class CometNativeWriteExec(
   }

   override def doExecuteColumnar(): RDD[ColumnarBatch] = {
+    // Check if the child produces Arrow/Comet batches or Spark batches
+    val childIsComet = child.isInstanceOf[CometPlan]
+
     // Get the input data from the child operator
     val childRDD = if (child.supportsColumnar) {
       child.executeColumnar()
     } else {
-      // If child doesn't support columnar, convert to columnar
-      child.execute().mapPartitionsInternal { _ =>
-        // TODO this could delegate to CometRowToColumnar, but maybe Comet
-        // does not need to support this case?
-        throw new UnsupportedOperationException(
-          "Row-based child operators not yet supported for native write")
+      // If child doesn't support columnar, convert rows to Arrow columnar batches
+      val maxRecordsPerBatch = CometConf.COMET_BATCH_SIZE.get(conf)
+      val timeZoneId = conf.sessionLocalTimeZone
+      val schema = child.schema
+      child.execute().mapPartitionsInternal { rowIter =>
+        val context = TaskContext.get()
+        CometArrowConverters
+          .rowToArrowBatchIter(rowIter, schema, maxRecordsPerBatch, timeZoneId, context)
       }
     }

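For context on what "Arrow/Comet batches" means in the hunk above: Comet wraps Arrow-backed columns in CometVector (the import added in the first hunk), while plain Spark operators such as RangeExec emit OnHeapColumnVector. A hypothetical guard, not part of this commit, illustrates the distinction the writer now bridges:

import org.apache.comet.vector.CometVector
import org.apache.spark.sql.vectorized.ColumnarBatch

// Illustration only (hypothetical helper): a batch can go straight to Comet's
// native code when every column is the Arrow-backed CometVector wrapper;
// anything else (e.g. OnHeapColumnVector) must be converted first.
def isArrowBacked(batch: ColumnarBatch): Boolean =
  (0 until batch.numCols()).forall(i => batch.column(i).isInstanceOf[CometVector])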
@@ -158,6 +166,10 @@
     val capturedJobTrackerID = jobTrackerID
     val capturedNativeOp = nativeOp
     val capturedAccumulator = taskCommitMessagesAccum // Capture accumulator for use in tasks
+    val capturedChildIsComet = childIsComet
+    val capturedSchema = child.schema
+    val capturedMaxRecordsPerBatch = CometConf.COMET_BATCH_SIZE.get(conf)
+    val capturedTimeZoneId = conf.sessionLocalTimeZone

     // Execute native write operation with task-level commit protocol
     childRDD.mapPartitionsInternal { iter =>
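The captured* locals are not cosmetic renaming: anything referenced inside mapPartitionsInternal becomes part of the task closure that Spark serializes to executors, and touching conf or child there would pull the whole operator into that closure. A minimal sketch of the pattern, separate from the commit itself:

// Copy plain values out of the operator while still on the driver...
val capturedTimeZoneId = conf.sessionLocalTimeZone

childRDD.mapPartitionsInternal { iter =>
  // ...so the serialized closure carries only a String; referencing `conf`
  // here would capture `this`, i.e. the entire SparkPlan.
  val tz = capturedTimeZoneId
  iter
}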
@@ -201,9 +213,28 @@
       outputStream.close()
       val planBytes = outputStream.toByteArray

+      // Convert Spark columnar batches to Arrow format if child is not a Comet operator.
+      // Comet native execution expects Arrow arrays, but Spark operators like RangeExec
+      // produce OnHeapColumnVector which must be converted.
+      val arrowIter = if (capturedChildIsComet) {
+        // Child is already producing Arrow/Comet batches
+        iter
+      } else {
+        // Convert Spark columnar batches to Arrow format
+        val context = TaskContext.get()
+        iter.flatMap { sparkBatch =>
+          CometArrowConverters.columnarBatchToArrowBatchIter(
+            sparkBatch,
+            capturedSchema,
+            capturedMaxRecordsPerBatch,
+            capturedTimeZoneId,
+            context)
+        }
+      }
+
       val execIterator = new CometExecIterator(
         CometExec.newIterId,
-        Seq(iter),
+        Seq(arrowIter),
         numOutputCols,
         planBytes,
         nativeMetrics,

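One detail of the hunk above worth spelling out: the conversion uses flatMap rather than map because a single incoming Spark batch may be re-chunked into several Arrow batches once capturedMaxRecordsPerBatch (COMET_BATCH_SIZE) is applied. A condensed sketch of that per-partition shape, assuming only the CometArrowConverters signature visible in the diff:

import org.apache.spark.TaskContext
import org.apache.spark.sql.comet.execution.arrow.CometArrowConverters
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.vectorized.ColumnarBatch

// One Spark batch in, zero or more Arrow batches out, hence flatMap.
def toArrowBatches(
    batches: Iterator[ColumnarBatch],
    schema: StructType,
    maxRecordsPerBatch: Int,
    timeZoneId: String): Iterator[ColumnarBatch] =
  batches.flatMap { sparkBatch =>
    CometArrowConverters.columnarBatchToArrowBatchIter(
      sparkBatch, schema, maxRecordsPerBatch, timeZoneId, TaskContext.get())
  }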
spark/src/test/scala/org/apache/comet/parquet/CometParquetWriterSuite.scala

Lines changed: 109 additions & 0 deletions
@@ -228,4 +228,113 @@ class CometParquetWriterSuite extends CometTestBase {
       }
     }
   }
+
+  test("parquet write with spark.range() as data source - with spark-to-arrow conversion") {
+    // Test that spark.range() works when CometSparkToColumnarExec is enabled to convert
+    // Spark's OnHeapColumnVector to Arrow format
+    withTempPath { dir =>
+      val outputPath = new File(dir, "output.parquet").getAbsolutePath
+
+      withSQLConf(
+        CometConf.COMET_NATIVE_PARQUET_WRITE_ENABLED.key -> "true",
+        SQLConf.SESSION_LOCAL_TIMEZONE.key -> "America/Halifax",
+        CometConf.getOperatorAllowIncompatConfigKey(classOf[DataWritingCommandExec]) -> "true",
+        CometConf.COMET_EXEC_ENABLED.key -> "true",
+        CometConf.COMET_SPARK_TO_ARROW_ENABLED.key -> "true",
+        CometConf.COMET_SPARK_TO_ARROW_SUPPORTED_OPERATOR_LIST.key -> "Range") {
+
+        // Use a listener to capture the execution plan during write
+        var capturedPlan: Option[org.apache.spark.sql.execution.QueryExecution] = None
+
+        val listener = new org.apache.spark.sql.util.QueryExecutionListener {
+          override def onSuccess(
+              funcName: String,
+              qe: org.apache.spark.sql.execution.QueryExecution,
+              durationNs: Long): Unit = {
+            if (funcName == "save" || funcName.contains("command")) {
+              capturedPlan = Some(qe)
+            }
+          }
+
+          override def onFailure(
+              funcName: String,
+              qe: org.apache.spark.sql.execution.QueryExecution,
+              exception: Exception): Unit = {}
+        }
+
+        spark.listenerManager.register(listener)
+
+        try {
+          // spark.range() uses RangeExec which produces OnHeapColumnVector
+          // CometSparkToColumnarExec converts these to Arrow format
+          spark.range(1000).write.mode("overwrite").parquet(outputPath)
+
+          // Wait for listener
+          val maxWaitTimeMs = 15000
+          val checkIntervalMs = 100
+          var iterations = 0
+
+          while (capturedPlan.isEmpty && iterations < maxWaitTimeMs / checkIntervalMs) {
+            Thread.sleep(checkIntervalMs)
+            iterations += 1
+          }
+
+          // Verify that CometNativeWriteExec was used
+          capturedPlan.foreach { qe =>
+            val executedPlan = stripAQEPlan(qe.executedPlan)
+
+            var nativeWriteCount = 0
+            executedPlan.foreach {
+              case _: CometNativeWriteExec =>
+                nativeWriteCount += 1
+              case d: DataWritingCommandExec =>
+                d.child.foreach {
+                  case _: CometNativeWriteExec =>
+                    nativeWriteCount += 1
+                  case _ =>
+                }
+              case _ =>
+            }
+
+            assert(
+              nativeWriteCount == 1,
+              s"Expected exactly one CometNativeWriteExec in the plan, but found $nativeWriteCount:\n${executedPlan.treeString}")
+          }
+
+          // Verify the data was written correctly
+          val resultDf = spark.read.parquet(outputPath)
+          assert(resultDf.count() == 1000, "Expected 1000 rows to be written")
+        } finally {
+          spark.listenerManager.unregister(listener)
+        }
+      }
+    }
+  }
+
+  test("parquet write with spark.range() - issue #2944 without spark-to-arrow") {
+    // This test reproduces https://github.com/apache/datafusion-comet/issues/2944
+    // Without CometSparkToColumnarExec enabled, the native writer should handle
+    // Spark columnar batches by converting them to Arrow format internally.
+    withTempPath { dir =>
+      val outputPath = new File(dir, "output.parquet").getAbsolutePath
+
+      withSQLConf(
+        CometConf.COMET_NATIVE_PARQUET_WRITE_ENABLED.key -> "true",
+        SQLConf.SESSION_LOCAL_TIMEZONE.key -> "America/Halifax",
+        CometConf.getOperatorAllowIncompatConfigKey(classOf[DataWritingCommandExec]) -> "true",
+        CometConf.COMET_EXEC_ENABLED.key -> "true",
+        // Explicitly disable spark-to-arrow conversion to reproduce the issue
+        CometConf.COMET_SPARK_TO_ARROW_ENABLED.key -> "false") {
+
+        // spark.range() uses RangeExec which produces OnHeapColumnVector (not Arrow)
+        // Without the fix, this would fail with:
+        // "Comet execution only takes Arrow Arrays, but got OnHeapColumnVector"
+        spark.range(1000).write.mode("overwrite").parquet(outputPath)
+
+        // Verify the data was written correctly
+        val resultDf = spark.read.parquet(outputPath)
+        assert(resultDf.count() == 1000, "Expected 1000 rows to be written")
+      }
+    }
+  }
 }
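For anyone wanting to trigger the fixed path by hand, a minimal spark-shell sketch using the same CometConf constants as the tests above (the output path is arbitrary):

import org.apache.comet.CometConf
import org.apache.spark.sql.execution.command.DataWritingCommandExec

// Enable Comet execution and the native Parquet writer; spark-to-arrow
// conversion stays disabled so the writer's internal conversion is exercised.
spark.conf.set(CometConf.COMET_EXEC_ENABLED.key, "true")
spark.conf.set(CometConf.COMET_NATIVE_PARQUET_WRITE_ENABLED.key, "true")
spark.conf.set(
  CometConf.getOperatorAllowIncompatConfigKey(classOf[DataWritingCommandExec]),
  "true")

// Before this commit, this failed with:
//   "Comet execution only takes Arrow Arrays, but got OnHeapColumnVector"
spark.range(1000).write.mode("overwrite").parquet("/tmp/range-repro.parquet")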
