From d709aa015d86cc1a187b560c375ae7a4c4f8bcbd Mon Sep 17 00:00:00 2001 From: 0lai0 Date: Sat, 20 Jun 2026 02:26:52 +0800 Subject: [PATCH] Gate non-default collations for Spark 4 datetime expressions --- .../org/apache/comet/serde/datetime.scala | 177 +++++++++++++----- .../org/apache/comet/serde/unixtime.scala | 18 +- .../spark/sql/CometCollationSuite.scala | 131 +++++++++++++ .../spark/sql/CometCollationSuite.scala | 133 +++++++++++++ 4 files changed, 412 insertions(+), 47 deletions(-) diff --git a/spark/src/main/scala/org/apache/comet/serde/datetime.scala b/spark/src/main/scala/org/apache/comet/serde/datetime.scala index 2ce75ccc0d..24af91ae15 100644 --- a/spark/src/main/scala/org/apache/comet/serde/datetime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/datetime.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, FromUTCTimestamp, GetDateField, GetTimestamp, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUnixTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} +import org.apache.spark.sql.catalyst.expressions.{AddMonths, Attribute, ConvertTimezone, DateAdd, DateDiff, DateFormatClass, DateFromUnixDate, DateSub, DayOfMonth, DayOfWeek, DayOfYear, Days, Expression, FromUTCTimestamp, GetDateField, GetTimestamp, Hour, Hours, LastDay, Literal, MakeDate, MakeTimestamp, MicrosToTimestamp, MillisToTimestamp, Minute, Month, MonthsBetween, NextDay, Quarter, Second, SecondsToTimestamp, ToUnixTimestamp, ToUTCTimestamp, TruncDate, TruncTimestamp, UnixDate, UnixMicros, UnixMillis, UnixSeconds, UnixTimestamp, WeekDay, WeekOfYear, Year} import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, DateType, DoubleType, FloatType, IntegerType, LongType, StringType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.UTF8String @@ -32,6 +32,7 @@ import org.apache.comet.expressions.{CometCast, CometEvalMode} import org.apache.comet.serde.CometGetDateField.CometGetDateField import org.apache.comet.serde.ExprOuterClass.Expr import org.apache.comet.serde.QueryPlanSerde._ +import org.apache.comet.shims.CometTypeShim private object CometGetDateField extends Enumeration { type CometGetDateField = Value @@ -289,13 +290,26 @@ object CometSecond extends CometExpressionSerde[Second] with CodegenDispatchFall } } +private[serde] object DatetimeCollation extends CometTypeShim { + def reason(functionName: String): String = + s"$functionName does not support non-UTF8_BINARY collations " + + "(https://github.com/apache/datafusion-comet/issues/4646)" + + def hasNonDefaultCollation(expr: Expression): Boolean = + expr.children.exists(c => hasNonDefaultStringCollation(c.dataType)) +} + object CometUnixTimestamp extends CometExpressionSerde[UnixTimestamp] { + private val collationReason = DatetimeCollation.reason("unix_timestamp") + override def getUnsupportedReasons(): Seq[String] = Seq( "Only `TimestampType` and `DateType` inputs are supported." 
+ " `TimestampNTZType` is not supported because Comet incorrectly applies timezone" + " conversion to TimestampNTZ values.") + override def getIncompatibleReasons(): Seq[String] = Seq(collationReason) + private def isSupportedInputType(expr: UnixTimestamp): Boolean = { expr.children.head.dataType match { case TimestampType | DateType => true @@ -305,7 +319,9 @@ object CometUnixTimestamp extends CometExpressionSerde[UnixTimestamp] { } override def getSupportLevel(expr: UnixTimestamp): SupportLevel = { - if (isSupportedInputType(expr)) { + if (DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else if (isSupportedInputType(expr)) { Compatible() } else { val inputType = expr.children.head.dataType @@ -401,11 +417,18 @@ object CometConvertTimezone extends CometExpressionSerde[ConvertTimezone] with CodegenDispatchFallback { - override def getSupportLevel(expr: ConvertTimezone): SupportLevel = - Incompatible(Some(UTCTimestampSerde.tzParseIncompatReason)) + private val collationReason = DatetimeCollation.reason("convert_timezone") + + override def getSupportLevel(expr: ConvertTimezone): SupportLevel = { + if (DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else { + Incompatible(Some(UTCTimestampSerde.tzParseIncompatReason)) + } + } override def getIncompatibleReasons(): Seq[String] = - Seq(UTCTimestampSerde.tzParseIncompatReason) + Seq(UTCTimestampSerde.tzParseIncompatReason, collationReason) override def convert( expr: ConvertTimezone, @@ -427,6 +450,17 @@ object CometNextDay extends CometExpressionSerde[NextDay] { * `dayOfWeek` rather than returning NULL. The resolved flag is passed to native via the * `ScalarFunc.fail_on_error` field. 
*/ + private val collationReason = DatetimeCollation.reason("next_day") + + override def getIncompatibleReasons(): Seq[String] = Seq(collationReason) + + override def getSupportLevel(expr: NextDay): SupportLevel = { + if (DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else { + Compatible() + } + } override def convert(expr: NextDay, inputs: Seq[Attribute], binding: Boolean): Option[Expr] = { val childExpr = expr.children.map(exprToProtoInternal(_, inputs, binding)) val optExpr = scalarFunctionExprToProtoWithReturnType( @@ -508,6 +542,8 @@ object CometTruncDate extends CometExpressionSerde[TruncDate] with CodegenDispat val supportedFormats: Seq[String] = Seq("year", "yyyy", "yy", "quarter", "mon", "month", "mm", "week") + private val collationReason = DatetimeCollation.reason("trunc") + private val nonLiteralFormatIncompatReason: String = "Non-literal format strings will throw an exception instead of returning NULL" @@ -515,21 +551,26 @@ object CometTruncDate extends CometExpressionSerde[TruncDate] with CodegenDispat s"Format $fmt is not supported. 
Only the following formats are supported: " + supportedFormats.mkString(", ") - override def getIncompatibleReasons(): Seq[String] = Seq(nonLiteralFormatIncompatReason) + override def getIncompatibleReasons(): Seq[String] = + Seq(nonLiteralFormatIncompatReason, collationReason) override def getUnsupportedReasons(): Seq[String] = Seq( "Only the following formats are supported: " + supportedFormats.mkString(", ")) override def getSupportLevel(expr: TruncDate): SupportLevel = { - expr.format match { - case Literal(fmt: UTF8String, _) => - if (supportedFormats.contains(fmt.toString.toLowerCase(Locale.ROOT))) { - Compatible() - } else { - Unsupported(Some(unsupportedFormatReason(fmt))) - } - case _ => - Incompatible(Some(nonLiteralFormatIncompatReason)) + if (DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else { + expr.format match { + case Literal(fmt: UTF8String, _) => + if (supportedFormats.contains(fmt.toString.toLowerCase(Locale.ROOT))) { + Compatible() + } else { + Unsupported(Some(unsupportedFormatReason(fmt))) + } + case _ => + Incompatible(Some(nonLiteralFormatIncompatReason)) + } } } @@ -572,6 +613,8 @@ object CometTruncTimestamp "millisecond", "microsecond") + private val collationReason = DatetimeCollation.reason("date_trunc") + private val nonUtcIncompatReason: String = "Produces incorrect results when used with non-UTC timezones. Compatible when timezone is" + " UTC. 
(https://github.com/apache/datafusion-comet/issues/2649)" @@ -584,27 +627,31 @@ object CometTruncTimestamp supportedFormats.mkString(", ") override def getIncompatibleReasons(): Seq[String] = - Seq(nonUtcIncompatReason, nonLiteralFormatIncompatReason) + Seq(nonUtcIncompatReason, nonLiteralFormatIncompatReason, collationReason) override def getUnsupportedReasons(): Seq[String] = Seq( "Only the following formats are supported: " + supportedFormats.mkString(", ")) override def getSupportLevel(expr: TruncTimestamp): SupportLevel = { - val timezone = expr.timeZoneId.getOrElse("UTC") - val isUtc = timezone == "UTC" || timezone == "Etc/UTC" - expr.format match { - case Literal(fmt: UTF8String, _) => - if (supportedFormats.contains(fmt.toString.toLowerCase(Locale.ROOT))) { - if (isUtc) { - Compatible() + if (DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else { + val timezone = expr.timeZoneId.getOrElse("UTC") + val isUtc = timezone == "UTC" || timezone == "Etc/UTC" + expr.format match { + case Literal(fmt: UTF8String, _) => + if (supportedFormats.contains(fmt.toString.toLowerCase(Locale.ROOT))) { + if (isUtc) { + Compatible() + } else { + Incompatible(Some(nonUtcIncompatReason)) + } } else { - Incompatible(Some(nonUtcIncompatReason)) + Unsupported(Some(unsupportedFormatReason(fmt))) } - } else { - Unsupported(Some(unsupportedFormatReason(fmt))) - } - case _ => - Incompatible(Some(nonLiteralFormatIncompatReason)) + case _ => + Incompatible(Some(nonLiteralFormatIncompatReason)) + } } } @@ -648,7 +695,10 @@ object CometTruncTimestamp * by [[CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED]]. When that flag is disabled the operator * falls back to Spark. */ -object CometDateFormat extends CometExpressionSerde[DateFormatClass] { +object CometDateFormat + extends CometExpressionSerde[DateFormatClass] + with CodegenDispatchFallback + with CometTypeShim { /** * Mapping from Spark SimpleDateFormat patterns to strftime patterns. 
Only formats in this map @@ -686,18 +736,26 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { // ISO formats "yyyy-MM-dd'T'HH:mm:ss" -> "%Y-%m-%dT%H:%M:%S") - // Compatibility is decided inside `convert`: the native path covers a subset, and the codegen - // dispatcher covers everything else when enabled. Plan-time tagging happens via - // `withFallbackReason` on the path that returns None. - override def getSupportLevel(expr: DateFormatClass): SupportLevel = Compatible() + private val collationReason = DatetimeCollation.reason("date_format") + + override def getIncompatibleReasons(): Seq[String] = Seq(collationReason) + + // Non-default collations return Incompatible; all other inputs are Compatible. In both cases + // convert() decides between the native to_char path and the codegen dispatcher. + override def getSupportLevel(expr: DateFormatClass): SupportLevel = { + if (DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else { + Compatible() + } + } override def getCompatibleNotes(): Seq[String] = Seq( "Format strings in a curated allow-list run natively via DataFusion's `to_char` for UTC " + - "sessions. Other format strings (including non-literal formats), as well as non-UTC " + - "sessions, route through Spark's own `DateFormatClass.doGenCode` via the Arrow-direct " + - "codegen dispatcher when `spark.comet.exec.scalaUDF.codegen.enabled=true`. When the " + - "codegen dispatcher is disabled (default) the operator falls back to Spark in those " + - "cases.") + "sessions. Other format strings (including non-literal formats) and non-UTC sessions " + + "route through Spark's own `DateFormatClass.doGenCode` via the Arrow-direct codegen " + + "dispatcher when `spark.comet.exec.scalaUDF.codegen.enabled=true`. 
When the codegen " + + "dispatcher is disabled (default) the operator falls back to Spark in those cases.") override def convert( expr: DateFormatClass, @@ -711,9 +769,10 @@ object CometDateFormat extends CometExpressionSerde[DateFormatClass] { case _ => None } - val canUseNative = nativeFormat.isDefined && { - isUtc || CometConf.isExprAllowIncompat(getExprConfigName(expr)) - } + val canUseNative = nativeFormat.isDefined && + !expr.children.exists(c => hasNonDefaultStringCollation(c.dataType)) && { + isUtc || CometConf.isExprAllowIncompat(getExprConfigName(expr)) + } if (canUseNative) { val childExpr = exprToProtoInternal(expr.left, inputs, binding) @@ -829,7 +888,22 @@ object CometAddMonths extends CometCodegenDispatch[AddMonths] object CometMonthsBetween extends CometCodegenDispatch[MonthsBetween] -object CometMakeTimestamp extends CometCodegenDispatch[MakeTimestamp] +object CometMakeTimestamp + extends CometCodegenDispatch[MakeTimestamp] + with CodegenDispatchFallback { + + private val collationReason = DatetimeCollation.reason("make_timestamp") + + override def getIncompatibleReasons(): Seq[String] = Seq(collationReason) + + override def getSupportLevel(expr: MakeTimestamp): SupportLevel = { + if (DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else { + Compatible() + } + } +} object CometMicrosToTimestamp extends CometCodegenDispatch[MicrosToTimestamp] @@ -841,6 +915,21 @@ object CometUnixMillis extends CometCodegenDispatch[UnixMillis] object CometUnixMicros extends CometCodegenDispatch[UnixMicros] -object CometToUnixTimestamp extends CometCodegenDispatch[ToUnixTimestamp] +object CometToUnixTimestamp + extends CometCodegenDispatch[ToUnixTimestamp] + with CodegenDispatchFallback { + + private val collationReason = DatetimeCollation.reason("to_unix_timestamp") + + override def getIncompatibleReasons(): Seq[String] = Seq(collationReason) + + override def getSupportLevel(expr: ToUnixTimestamp): SupportLevel = { + if 
(DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else { + Compatible() + } + } +} object CometGetTimestamp extends CometCodegenDispatch[GetTimestamp] diff --git a/spark/src/main/scala/org/apache/comet/serde/unixtime.scala b/spark/src/main/scala/org/apache/comet/serde/unixtime.scala index 1a3a4ca677..363d6dff21 100644 --- a/spark/src/main/scala/org/apache/comet/serde/unixtime.scala +++ b/spark/src/main/scala/org/apache/comet/serde/unixtime.scala @@ -29,12 +29,24 @@ import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithFa // https://github.com/apache/datafusion/issues/16594 object CometFromUnixTime extends CometExpressionSerde[FromUnixTime] with CodegenDispatchFallback { - override def getIncompatibleReasons(): Seq[String] = Seq( + // Built by the shared DatetimeCollation helper so the message and issue link cannot drift + // from the datetime.scala serdes that gate on the same condition. + private val collationReason = DatetimeCollation.reason("from_unixtime") + + private val formatReason = "Only supports the default datetime format pattern `yyyy-MM-dd HH:mm:ss`." 
+ " DataFusion's valid timestamp range differs from Spark" + - " (https://github.com/apache/datafusion/issues/16594)") + " (https://github.com/apache/datafusion/issues/16594)" + + override def getIncompatibleReasons(): Seq[String] = Seq(formatReason, collationReason) - override def getSupportLevel(expr: FromUnixTime): SupportLevel = Incompatible(None) + override def getSupportLevel(expr: FromUnixTime): SupportLevel = { + if (DatetimeCollation.hasNonDefaultCollation(expr)) { + Incompatible(Some(collationReason)) + } else { + Incompatible(Some(formatReason)) + } + } override def convert( expr: FromUnixTime, diff --git a/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala index 2c8451fc36..6be49e4924 100644 --- a/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala +++ b/spark/src/test/spark-4.0/org/apache/spark/sql/CometCollationSuite.scala @@ -247,4 +247,135 @@ class CometCollationSuite extends CometTestBase { "string keys; the collation guard for #4051 must not over-block.") } } + + // ---- datetime expression collation guards (issue #4646) -------------------------------- + // + // Comet's native datetime functions use string arguments (format patterns, timezones, + // day-of-week) as raw bytes, so non-default collations on those arguments must not reach + // the native path silently. Expressions without a codegen-dispatcher fallback (next_day, + // unix_timestamp) fall back to Spark entirely. Expressions with CodegenDispatchFallback + // (trunc, date_trunc, date_format, from_unixtime, make_timestamp, to_unix_timestamp, + // convert_timezone) fall back to Spark when COMET_SCALA_UDF_CODEGEN_ENABLED is false + // or route through Spark codegen inside the Comet pipeline when it is true. 
+ + private def withDatetimeCollationTable(f: => Unit): Unit = { + withParquetTable( + Seq( + ( /* single row; queries read its fields as columns _1 to _10 */ + "2024-01-01", + "2024-01-01 00:00:00", + "2024-06-15", + "2024-06-15 10:00:00", + "MON", + "yyyy-MM-dd", + "yyyy-MM-dd HH:mm:ss", + "YEAR", + "UTC", + 1718451045L)), + "datetime_collation_tbl")(f) + } + + private def checkDatetimeFallback(query: String, fallbackReason: String): Unit = { + withDatetimeCollationTable { /* dispatcher disabled; checks answer and fallbackReason */ + withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") { + checkSparkAnswerAndFallbackReason(query, fallbackReason) + } + } + } + + private def checkDatetimeDispatcher(query: String): Unit = { + withDatetimeCollationTable { /* dispatcher enabled; checks answer and operator */ + withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "true") { + checkSparkAnswerAndOperator(query) + } + } + } + + test("next_day rejects non-UTF8_BINARY collated dayOfWeek (issue #4646)") { + checkDatetimeFallback( + "SELECT next_day(CAST(_1 AS DATE), _5 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl", + "next_day does not support non-UTF8_BINARY collations") + } + + test("unix_timestamp rejects non-UTF8_BINARY collated format (issue #4646)") { + checkDatetimeFallback( + "SELECT unix_timestamp(CAST(_2 AS TIMESTAMP), _7 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl", + "unix_timestamp does not support non-UTF8_BINARY collations") + } + + test("from_unixtime rejects non-UTF8_BINARY collated format (issue #4646)") { + checkDatetimeFallback( + "SELECT from_unixtime(_10, _6 COLLATE utf8_lcase) FROM datetime_collation_tbl", + "from_unixtime does not support non-UTF8_BINARY collations") + } + + test("make_timestamp rejects non-UTF8_BINARY collated timezone (issue #4646)") { + checkDatetimeFallback( + "SELECT make_timestamp(2024, 6, 15, 10, 30, 45.0, _9 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl", + "make_timestamp does not support non-UTF8_BINARY collations") + } + + test("to_unix_timestamp rejects non-UTF8_BINARY collated format (issue #4646)") { + checkDatetimeFallback( + "SELECT 
to_unix_timestamp(_3, _6 COLLATE utf8_lcase) FROM datetime_collation_tbl", + "to_unix_timestamp does not support non-UTF8_BINARY collations") + } + + test("convert_timezone rejects non-UTF8_BINARY collated timezone (issue #4646)") { + checkDatetimeFallback( + "SELECT convert_timezone(_9 COLLATE utf8_lcase, 'America/Los_Angeles', " + + "TIMESTAMP_NTZ '2024-01-01 00:00:00') FROM datetime_collation_tbl", + "convert_timezone does not support non-UTF8_BINARY collations") + } + + test("trunc falls back with collated format when codegen is disabled (issue #4646)") { + checkDatetimeFallback( + "SELECT trunc(CAST(_3 AS DATE), _8 COLLATE utf8_lcase) FROM datetime_collation_tbl", + "trunc does not support non-UTF8_BINARY collations") + } + + test("date_trunc falls back with collated format when codegen is disabled (issue #4646)") { + checkDatetimeFallback( + "SELECT date_trunc(_8 COLLATE utf8_lcase, CAST(_4 AS TIMESTAMP)) " + + "FROM datetime_collation_tbl", + "date_trunc does not support non-UTF8_BINARY collations") + } + + test("date_format falls back with collated format when codegen is disabled (issue #4646)") { + checkDatetimeFallback( + "SELECT date_format(CAST(_2 AS TIMESTAMP), _6 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl", + "date_format does not support non-UTF8_BINARY collations") + } + + test("trunc routes collated format through codegen dispatcher (issue #4646)") { + checkDatetimeDispatcher( + "SELECT trunc(CAST(_3 AS DATE), _8 COLLATE utf8_lcase) FROM datetime_collation_tbl") + } + + test("date_trunc routes collated format through codegen dispatcher (issue #4646)") { + checkDatetimeDispatcher( + "SELECT date_trunc(_8 COLLATE utf8_lcase, CAST(_4 AS TIMESTAMP)) " + + "FROM datetime_collation_tbl") + } + + test("date_format routes collated format through codegen dispatcher (issue #4646)") { + checkDatetimeDispatcher( + "SELECT date_format(CAST(_2 AS TIMESTAMP), _6 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl") + } + + test("datetime 
expressions still run with default UTF8_BINARY collation (issue #4646)") { + withDatetimeCollationTable { + checkSparkAnswerAndOperator( + "SELECT next_day(CAST(_1 AS DATE), _5) FROM datetime_collation_tbl") + checkSparkAnswerAndOperator( + "SELECT trunc(CAST(_3 AS DATE), _8) FROM datetime_collation_tbl") + checkSparkAnswerAndOperator( + "SELECT unix_timestamp(CAST(_2 AS TIMESTAMP), _7) FROM datetime_collation_tbl") + } + } } diff --git a/spark/src/test/spark-4.1/org/apache/spark/sql/CometCollationSuite.scala b/spark/src/test/spark-4.1/org/apache/spark/sql/CometCollationSuite.scala index 463e169b66..11e06818a5 100644 --- a/spark/src/test/spark-4.1/org/apache/spark/sql/CometCollationSuite.scala +++ b/spark/src/test/spark-4.1/org/apache/spark/sql/CometCollationSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql +import org.apache.comet.CometConf + class CometCollationSuite extends CometTestBase { // Queries that group, sort, or shuffle on a non-default collated string must fall back to @@ -66,4 +68,135 @@ class CometCollationSuite extends CometTestBase { checkSparkAnswerAndOperator("SELECT DISTINCT _1 FROM tbl ORDER BY _1") } } + + // ---- datetime expression collation guards (issue #4646) -------------------------------- + // + // Comet's native datetime functions use string arguments (format patterns, timezones, + // day-of-week) as raw bytes, so non-default collations on those arguments must not reach + // the native path silently. Expressions without a codegen-dispatcher fallback (next_day, + // unix_timestamp) fall back to Spark entirely. Expressions with CodegenDispatchFallback + // (trunc, date_trunc, date_format, from_unixtime, make_timestamp, to_unix_timestamp, + // convert_timezone) fall back to Spark when COMET_SCALA_UDF_CODEGEN_ENABLED is false + // or route through Spark codegen inside the Comet pipeline when it is true. 
+ + private def withDatetimeCollationTable(f: => Unit): Unit = { + withParquetTable( + Seq( + ( /* single row; queries read its fields as columns _1 to _10 */ + "2024-01-01", + "2024-01-01 00:00:00", + "2024-06-15", + "2024-06-15 10:00:00", + "MON", + "yyyy-MM-dd", + "yyyy-MM-dd HH:mm:ss", + "YEAR", + "UTC", + 1718451045L)), + "datetime_collation_tbl")(f) + } + + private def checkDatetimeFallback(query: String, fallbackReason: String): Unit = { + withDatetimeCollationTable { /* dispatcher disabled; checks answer and fallbackReason */ + withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "false") { + checkSparkAnswerAndFallbackReason(query, fallbackReason) + } + } + } + + private def checkDatetimeDispatcher(query: String): Unit = { + withDatetimeCollationTable { /* dispatcher enabled; checks answer and operator */ + withSQLConf(CometConf.COMET_SCALA_UDF_CODEGEN_ENABLED.key -> "true") { + checkSparkAnswerAndOperator(query) + } + } + } + + test("next_day rejects non-UTF8_BINARY collated dayOfWeek (issue #4646)") { + checkDatetimeFallback( + "SELECT next_day(CAST(_1 AS DATE), _5 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl", + "next_day does not support non-UTF8_BINARY collations") + } + + test("unix_timestamp rejects non-UTF8_BINARY collated format (issue #4646)") { + checkDatetimeFallback( + "SELECT unix_timestamp(CAST(_2 AS TIMESTAMP), _7 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl", + "unix_timestamp does not support non-UTF8_BINARY collations") + } + + test("from_unixtime rejects non-UTF8_BINARY collated format (issue #4646)") { + checkDatetimeFallback( + "SELECT from_unixtime(_10, _6 COLLATE utf8_lcase) FROM datetime_collation_tbl", + "from_unixtime does not support non-UTF8_BINARY collations") + } + + test("make_timestamp rejects non-UTF8_BINARY collated timezone (issue #4646)") { + checkDatetimeFallback( + "SELECT make_timestamp(2024, 6, 15, 10, 30, 45.0, _9 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl", + "make_timestamp does not support non-UTF8_BINARY collations") + } + + test("to_unix_timestamp rejects non-UTF8_BINARY collated format (issue #4646)") { + checkDatetimeFallback( + "SELECT 
to_unix_timestamp(_3, _6 COLLATE utf8_lcase) FROM datetime_collation_tbl", + "to_unix_timestamp does not support non-UTF8_BINARY collations") + } + + test("convert_timezone rejects non-UTF8_BINARY collated timezone (issue #4646)") { + checkDatetimeFallback( + "SELECT convert_timezone(_9 COLLATE utf8_lcase, 'America/Los_Angeles', " + + "TIMESTAMP_NTZ '2024-01-01 00:00:00') FROM datetime_collation_tbl", + "convert_timezone does not support non-UTF8_BINARY collations") + } + + test("trunc falls back with collated format when codegen is disabled (issue #4646)") { + checkDatetimeFallback( + "SELECT trunc(CAST(_3 AS DATE), _8 COLLATE utf8_lcase) FROM datetime_collation_tbl", + "trunc does not support non-UTF8_BINARY collations") + } + + test("date_trunc falls back with collated format when codegen is disabled (issue #4646)") { + checkDatetimeFallback( + "SELECT date_trunc(_8 COLLATE utf8_lcase, CAST(_4 AS TIMESTAMP)) " + + "FROM datetime_collation_tbl", + "date_trunc does not support non-UTF8_BINARY collations") + } + + test("date_format falls back with collated format when codegen is disabled (issue #4646)") { + checkDatetimeFallback( + "SELECT date_format(CAST(_2 AS TIMESTAMP), _6 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl", + "date_format does not support non-UTF8_BINARY collations") + } + + test("trunc routes collated format through codegen dispatcher (issue #4646)") { + checkDatetimeDispatcher( + "SELECT trunc(CAST(_3 AS DATE), _8 COLLATE utf8_lcase) FROM datetime_collation_tbl") + } + + test("date_trunc routes collated format through codegen dispatcher (issue #4646)") { + checkDatetimeDispatcher( + "SELECT date_trunc(_8 COLLATE utf8_lcase, CAST(_4 AS TIMESTAMP)) " + + "FROM datetime_collation_tbl") + } + + test("date_format routes collated format through codegen dispatcher (issue #4646)") { + checkDatetimeDispatcher( + "SELECT date_format(CAST(_2 AS TIMESTAMP), _6 COLLATE utf8_lcase) " + + "FROM datetime_collation_tbl") + } + + test("datetime 
expressions still run with default UTF8_BINARY collation (issue #4646)") { + withDatetimeCollationTable { + checkSparkAnswerAndOperator( + "SELECT next_day(CAST(_1 AS DATE), _5) FROM datetime_collation_tbl") + checkSparkAnswerAndOperator( + "SELECT trunc(CAST(_3 AS DATE), _8) FROM datetime_collation_tbl") + checkSparkAnswerAndOperator( + "SELECT unix_timestamp(CAST(_2 AS TIMESTAMP), _7) FROM datetime_collation_tbl") + } + } }