Skip to content

Commit 8420705

Browse files
baibaichen and Copilot committed
[GLUTEN-11550][UT] Add testGluten for PythonDataSourceSuite filter pushdown
Gluten replaces FilterExec with FilterExecTransformer and BatchScanExec with BatchScanExecTransformer. Add testGluten matching the Gluten operator names.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 29480f8 commit 8420705

4 files changed

Lines changed: 112 additions & 4 deletions

File tree

gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,7 @@ class VeloxTestSettings extends BackendTestSettings {
11391139
enableSuite[GlutenSQLCollectLimitExecSuite]
11401140
// Generated suites for org.apache.spark.sql.execution.python
11411141
enableSuite[GlutenPythonDataSourceSuite]
1142+
.exclude("SPARK-50426: should not trigger static Python data source lookup")
11421143
enableSuite[GlutenPythonUDFSuite]
11431144
.exclude("SPARK-48706: Negative test case for Python UDF in higher order functions")
11441145
enableSuite[GlutenPythonUDTFSuite]

gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/python/GlutenPythonDataSourceSuite.scala

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,36 @@
1616
*/
1717
package org.apache.spark.sql.execution.python

import org.apache.spark.sql.{GlutenSQLTestsTrait, IntegratedUDFTestUtils}
import org.apache.spark.sql.execution.datasources.DataSourceManager

/**
 * Gluten re-run of Spark's [[PythonDataSourceSuite]]. The vanilla version of the
 * SPARK-50426 test is excluded in VeloxTestSettings and replaced by the `testGluten`
 * override below, which differs only in resetting shared static state first.
 */
class GlutenPythonDataSourceSuite extends PythonDataSourceSuite with GlutenSQLTestsTrait {

  // Brings `shouldTestPandasUDFs` and `staticSourceName` into scope.
  import IntegratedUDFTestUtils._

  // In Gluten's single-JVM test runner, DataSourceManager.dataSourceBuilders (a mutable
  // static var on the companion object) may already be populated by earlier suites.
  // Reset it before the test so the log fires again.
  testGluten("SPARK-50426: should not trigger static Python data source lookup") {
    // Skipped entirely when the Python/pandas test environment is unavailable.
    assume(shouldTestPandasUDFs)
    DataSourceManager.dataSourceBuilders = None
    val testAppender = new LogAppender("Python data source lookup")
    // Phase 1: reads/writes that use built-in (non-Python) sources must NOT trigger
    // the static Python data source lookup, so the log line must be absent.
    withLogAppender(testAppender) {
      spark.read.format("org.apache.spark.sql.test").load()
      spark.range(3).write.mode("overwrite").format("noop").save()
    }
    assert(
      !testAppender.loggingEvents
        .exists(
          msg =>
            msg.getMessage.getFormattedMessage.contains("Loading static Python Data Sources.")))
    // Phase 2: reading a static Python data source SHOULD trigger the lookup exactly once.
    // NOTE(review): the same appender instance is reused, which is safe only because
    // phase 1 already asserted the message was absent from its events.
    withLogAppender(testAppender) {
      spark.read.format(staticSourceName).load()
    }
    assert(
      testAppender.loggingEvents
        .exists(
          msg =>
            msg.getMessage.getFormattedMessage.contains("Loading static Python Data Sources.")))
  }
}

gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1145,6 +1145,7 @@ class VeloxTestSettings extends BackendTestSettings {
11451145
enableSuite[GlutenSQLCollectLimitExecSuite]
11461146
// Generated suites for org.apache.spark.sql.execution.python
11471147
enableSuite[GlutenPythonDataSourceSuite]
1148+
.exclude("data source reader with filter pushdown")
11481149
enableSuite[GlutenPythonUDFSuite]
11491150
.exclude("SPARK-48706: Negative test case for Python UDF in higher order functions")
11501151
enableSuite[GlutenPythonUDTFSuite]

gluten-ut/spark41/src/test/scala/org/apache/spark/sql/execution/python/GlutenPythonDataSourceSuite.scala

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,82 @@
1616
*/
1717
package org.apache.spark.sql.execution.python

import org.apache.gluten.execution.FilterExecTransformerBase

import org.apache.spark.sql.{GlutenSQLTestsTrait, IntegratedUDFTestUtils, Row}
import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
import org.apache.spark.sql.execution.datasources.v2.python.PythonScan
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

/**
 * Gluten re-run of Spark's [[PythonDataSourceSuite]]. The vanilla filter-pushdown test is
 * excluded in VeloxTestSettings because Gluten rewrites the physical plan; the `testGluten`
 * override below asserts against the Gluten operator shapes instead.
 */
class GlutenPythonDataSourceSuite extends PythonDataSourceSuite with GlutenSQLTestsTrait {

  // Brings `shouldTestPandasUDFs`, `createUserDefinedPythonDataSource` etc. into scope.
  import IntegratedUDFTestUtils._

  // Gluten replaces FilterExec with FilterExecTransformer and
  // BatchScanExec with BatchScanExecTransformer
  testGluten("data source reader with filter pushdown") {
    // Skipped entirely when the Python/pandas test environment is unavailable.
    assume(shouldTestPandasUDFs)
    // Python data source whose reader accepts exactly the `partition = 0` filter
    // (pushFilters yields back every filter it does NOT handle). Each of the 2
    // partitions produces rows (0, p), (1, p), (2, p).
    // NOTE(review): the script's internal indentation was reconstructed to standard
    // 4-space Python from a whitespace-mangled view — confirm against upstream.
    val dataSourceScript =
      s"""
         |from pyspark.sql.datasource import (
         |    DataSource,
         |    DataSourceReader,
         |    EqualTo,
         |    InputPartition,
         |)
         |
         |class SimpleDataSourceReader(DataSourceReader):
         |    def partitions(self):
         |        return [InputPartition(i) for i in range(2)]
         |
         |    def pushFilters(self, filters):
         |        for filter in filters:
         |            if filter != EqualTo(("partition",), 0):
         |                yield filter
         |
         |    def read(self, partition):
         |        yield (0, partition.value)
         |        yield (1, partition.value)
         |        yield (2, partition.value)
         |
         |class SimpleDataSource(DataSource):
         |    def schema(self):
         |        return "id int, partition int"
         |
         |    def reader(self, schema):
         |        return SimpleDataSourceReader()
         |""".stripMargin
    val schema = StructType.fromDDL("id INT, partition INT")
    val dataSource =
      createUserDefinedPythonDataSource(name = dataSourceName, pythonScript = dataSourceScript)
    // Pushdown must be explicitly enabled for Python data sources.
    withSQLConf(SQLConf.PYTHON_FILTER_PUSHDOWN_ENABLED.key -> "true") {
      spark.dataSource.registerPython(dataSourceName, dataSource)
      val df =
        spark.read.format(dataSourceName).schema(schema).load().filter("id = 1 and partition = 0")
      val plan = df.queryExecution.executedPlan

      // The post-scan filter keeps only the residual predicate: `partition = 0` was
      // pushed into the source, so the transformer's condition must mention `= 1`
      // (the id predicate) and not `= 0`.
      val filter = collectFirst(plan) {
        case s: FilterExecTransformerBase =>
          val condition = s.cond.toString
          assert(!condition.contains("= 0"))
          assert(condition.contains("= 1"))
          s
      }.getOrElse(
        fail(s"FilterExecTransformerBase not found in the plan. Actual plan:\n$plan")
      )

      // Gluten does not replace PythonScan's BatchScanExec - it stays as vanilla
      // BatchScanExec with RowToVeloxColumnar transition
      collectFirst(filter) {
        case s: BatchScanExec if s.scan.isInstanceOf[PythonScan] =>
          val p = s.scan.asInstanceOf[PythonScan]
          // The accepted filter is reported via the scan's metadata.
          assert(p.getMetaData().get("PushedFilters").contains("[EqualTo(partition,0)]"))
      }.getOrElse(
        fail(s"BatchScanExec with PythonScan not found. Actual plan:\n$plan")
      )

      // Rows (1, 0) and (1, 1): `id = 1` survives the residual filter; `partition = 0`
      // was pushed down but the toy reader emits both partitions regardless, and the
      // residual plan no longer filters on partition — hence partition 1 also appears.
      checkAnswer(df, Seq(Row(1, 0), Row(1, 1)))
    }
  }
}

0 commit comments

Comments (0)