From 0511026a17736b3ae07c5b9538d84c9966e05fbe Mon Sep 17 00:00:00 2001
From: Parth Chandra
Date: Tue, 10 Mar 2026 12:42:16 -0700
Subject: [PATCH 1/3] fix: fix error reporting for string to timestamp cast

---
 .../spark-expr/src/conversion_funcs/string.rs |  94 ++++++++++-----
 native/spark-expr/src/error.rs                |  25 +++-
 .../apache/comet/SparkErrorConverter.scala    |   2 +-
 .../apache/comet/expressions/CometCast.scala  |   3 -
 .../comet/shims/ShimSparkErrorConverter.scala |  18 ++-
 .../comet/shims/ShimSparkErrorConverter.scala |  18 ++-
 .../comet/shims/ShimSparkErrorConverter.scala |   7 ++
 .../org/apache/comet/CometCastSuite.scala     | 112 ++++++++++++++----
 8 files changed, 220 insertions(+), 59 deletions(-)

diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs
index 7c193716d0..2d83484357 100644
--- a/native/spark-expr/src/conversion_funcs/string.rs
+++ b/native/spark-expr/src/conversion_funcs/string.rs
@@ -37,19 +37,29 @@ macro_rules! cast_utf8_to_timestamp {
     ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr) => {{
         let len = $array.len();
         let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC");
+        let mut cast_err: Option<SparkError> = None;
         for i in 0..len {
             if $array.is_null(i) {
                 cast_array.append_null()
-            } else if let Ok(Some(cast_value)) =
-                $cast_method($array.value(i).trim(), $eval_mode, $tz)
-            {
-                cast_array.append_value(cast_value);
             } else {
-                cast_array.append_null()
+                match $cast_method($array.value(i).trim(), $eval_mode, $tz) {
+                    Ok(Some(cast_value)) => cast_array.append_value(cast_value),
+                    Ok(None) => cast_array.append_null(),
+                    Err(e) => {
+                        if $eval_mode == EvalMode::Ansi {
+                            cast_err = Some(e);
+                            break;
+                        }
+                        cast_array.append_null()
+                    }
+                }
             }
         }
-        let result: ArrayRef = Arc::new(cast_array.finish()) as ArrayRef;
-        result
+        if let Some(e) = cast_err {
+            Err(e)
+        } else {
+            Ok(Arc::new(cast_array.finish()) as ArrayRef)
+        }
     }};
 }
 
@@ -668,15 +678,13 @@ pub(crate) fn cast_string_to_timestamp(
     let tz = &timezone::Tz::from_str(timezone_str).unwrap();
 
     let cast_array: ArrayRef = match to_type {
-        DataType::Timestamp(_, _) => {
-            cast_utf8_to_timestamp!(
-                string_array,
-                eval_mode,
-                TimestampMicrosecondType,
-                timestamp_parser,
-                tz
-            )
-        }
+        DataType::Timestamp(_, _) => cast_utf8_to_timestamp!(
+            string_array,
+            eval_mode,
+            TimestampMicrosecondType,
+            timestamp_parser,
+            tz
+        )?,
         _ => unreachable!("Invalid data type {:?} in cast from string", to_type),
     };
     Ok(cast_array)
@@ -1004,7 +1012,7 @@ fn get_timestamp_values(
             .with_second(second)
             .with_microsecond(microsecond),
         _ => {
-            return Err(SparkError::CastInvalidValue {
+            return Err(SparkError::InvalidInputInCastToDatetime {
                 value: value.to_string(),
                 from_type: "STRING".to_string(),
                 to_type: "TIMESTAMP".to_string(),
@@ -1095,31 +1103,31 @@ fn timestamp_parser(
     // Define regex patterns and corresponding parsing functions
     let patterns = &[
         (
-            Regex::new(r"^\d{4,5}$").unwrap(),
+            Regex::new(r"^\d{4,7}$").unwrap(),
             parse_str_to_year_timestamp as fn(&str, &T) -> SparkResult<Option<i64>>,
         ),
         (
-            Regex::new(r"^\d{4,5}-\d{2}$").unwrap(),
+            Regex::new(r"^\d{4,7}-\d{2}$").unwrap(),
             parse_str_to_month_timestamp,
         ),
         (
-            Regex::new(r"^\d{4,5}-\d{2}-\d{2}$").unwrap(),
+            Regex::new(r"^\d{4,7}-\d{2}-\d{2}$").unwrap(),
             parse_str_to_day_timestamp,
         ),
         (
-            Regex::new(r"^\d{4,5}-\d{2}-\d{2}T\d{1,2}$").unwrap(),
+            Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{1,2}$").unwrap(),
             parse_str_to_hour_timestamp,
         ),
         (
-            Regex::new(r"^\d{4,5}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap(),
+            Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap(),
             parse_str_to_minute_timestamp,
         ),
         (
-            Regex::new(r"^\d{4,5}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap(),
+            Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap(),
             parse_str_to_second_timestamp,
         ),
         (
-            Regex::new(r"^\d{4,5}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(),
+            Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap(),
             parse_str_to_microsecond_timestamp,
         ),
         (
@@ -1140,7 +1148,7 @@ fn timestamp_parser(
 
     if timestamp.is_none() {
         return if eval_mode == EvalMode::Ansi {
-            Err(SparkError::CastInvalidValue {
+            Err(SparkError::InvalidInputInCastToDatetime {
                 value: value.to_string(),
                 from_type: "STRING".to_string(),
                 to_type: "TIMESTAMP".to_string(),
@@ -1204,15 +1212,15 @@ fn date_parser(date_str: &str, eval_mode: EvalMode) -> SparkResult<Option<i32>>
 fn is_valid_digits(segment: i32, digits: usize) -> bool {
     // An integer is able to represent a date within [+-]5 million years.
     let max_digits_year = 7;
-    //year (segment 0) can be between 4 to 7 digits,
-    //month and day (segment 1 and 2) can be between 1 to 2 digits
+    // year (segment 0) can be between 4 to 7 digits,
+    // month and day (segment 1 and 2) can be between 1 to 2 digits
     (segment == 0 && digits >= 4 && digits <= max_digits_year)
         || (segment != 0 && digits > 0 && digits <= 2)
 }
 
 fn return_result(date_str: &str, eval_mode: EvalMode) -> SparkResult<Option<i32>> {
     if eval_mode == EvalMode::Ansi {
-        Err(SparkError::CastInvalidValue {
+        Err(SparkError::InvalidInputInCastToDatetime {
             value: date_str.to_string(),
             from_type: "STRING".to_string(),
             to_type: "DATE".to_string(),
@@ -1341,7 +1349,8 @@ mod tests {
             TimestampMicrosecondType,
             timestamp_parser,
             tz
-        );
+        )
+        .unwrap();
 
         assert_eq!(
             result.data_type(),
@@ -1350,6 +1359,33 @@
         assert_eq!(result.len(), 4);
     }
 
+    #[test]
+    fn test_cast_string_to_timestamp_ansi_error() {
+        // In ANSI mode, an invalid timestamp string must produce an error rather than null.
+        let array: ArrayRef = Arc::new(StringArray::from(vec![
+            Some("2020-01-01T12:34:56.123456"),
+            Some("not_a_timestamp"),
+        ]));
+        let tz = &timezone::Tz::from_str("UTC").unwrap();
+        let string_array = array
+            .as_any()
+            .downcast_ref::<GenericStringArray<i32>>()
+            .expect("Expected a string array");
+
+        let eval_mode = EvalMode::Ansi;
+        let result = cast_utf8_to_timestamp!(
+            &string_array,
+            eval_mode,
+            TimestampMicrosecondType,
+            timestamp_parser,
+            tz
+        );
+        assert!(
+            result.is_err(),
+            "ANSI mode should return Err for an invalid timestamp string"
+        );
+    }
+
     #[test]
     fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> {
         // prepare input data
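The hunks above change `cast_utf8_to_timestamp!` from silently appending null on a parse failure to failing fast in ANSI mode. A minimal, self-contained sketch of that control flow, with plain `i64` parsing standing in for `timestamp_parser` and a `Vec` standing in for the Arrow builder (all names here are illustrative, none of this is Comet or Arrow API):

```rust
#[derive(Clone, Copy, PartialEq)]
enum EvalMode {
    Ansi,
    Legacy,
}

// Mirrors the macro's loop: input nulls pass through, parse failures abort
// the whole batch in Ansi mode, or become nulls in Legacy mode.
fn cast_batch(values: &[Option<&str>], eval_mode: EvalMode) -> Result<Vec<Option<i64>>, String> {
    let mut out = Vec::with_capacity(values.len());
    for v in values {
        match v {
            None => out.push(None),
            Some(s) => match s.trim().parse::<i64>() {
                Ok(n) => out.push(Some(n)),
                Err(_) if eval_mode == EvalMode::Ansi => {
                    // Fail fast: abandon the partially built batch.
                    return Err(format!("[CAST_INVALID_INPUT] cannot cast '{s}'"));
                }
                Err(_) => out.push(None),
            },
        }
    }
    Ok(out)
}

fn main() {
    assert_eq!(
        cast_batch(&[Some("1"), Some("x"), None], EvalMode::Legacy),
        Ok(vec![Some(1), None, None])
    );
    assert!(cast_batch(&[Some("1"), Some("x"), None], EvalMode::Ansi).is_err());
}
```

Breaking out of the loop on the first error is what lets the macro return `Err` for the batch as a whole, which the `?` in `cast_string_to_timestamp` then propagates.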
diff --git a/native/spark-expr/src/error.rs b/native/spark-expr/src/error.rs
index ae3b5c0eda..267eabe66b 100644
--- a/native/spark-expr/src/error.rs
+++ b/native/spark-expr/src/error.rs
@@ -32,6 +32,19 @@ pub enum SparkError {
         to_type: String,
     },
 
+    /// Like CastInvalidValue but maps to SparkDateTimeException instead of SparkNumberFormatException.
+    /// Used for string → timestamp/date cast failures where Spark throws SparkDateTimeException
+    /// with the CAST_INVALID_INPUT error class.
+    #[error("[CAST_INVALID_INPUT] The value '{value}' of the type \"{from_type}\" cannot be cast to \"{to_type}\" \
+        because it is malformed. Correct the value as per the syntax, or change its target type. \
+        Use `try_cast` to tolerate malformed input and return NULL instead.
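```
(continued below)
```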
If necessary \
+        set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.")]
+    InvalidInputInCastToDatetime {
+        value: String,
+        from_type: String,
+        to_type: String,
+    },
+
     #[error("[NUMERIC_VALUE_OUT_OF_RANGE.WITH_SUGGESTION] {value} cannot be represented as Decimal({precision}, {scale}). If necessary set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error, and return NULL instead.")]
     NumericValueOutOfRange {
         value: String,
@@ -199,6 +212,7 @@ impl SparkError {
     fn error_type_name(&self) -> &'static str {
         match self {
             SparkError::CastInvalidValue { .. } => "CastInvalidValue",
+            SparkError::InvalidInputInCastToDatetime { .. } => "InvalidInputInCastToDatetime",
             SparkError::NumericValueOutOfRange { .. } => "NumericValueOutOfRange",
             SparkError::NumericOutOfRange { .. } => "NumericOutOfRange",
             SparkError::CastOverFlow { .. } => "CastOverFlow",
@@ -248,6 +262,11 @@ impl SparkError {
                 value,
                 from_type,
                 to_type,
+            }
+            | SparkError::InvalidInputInCastToDatetime {
+                value,
+                from_type,
+                to_type,
             } => {
                 serde_json::json!({
                     "value": value,
@@ -456,9 +475,12 @@ impl SparkError {
             // CastOverflow gets special handling with CastOverflowException
             SparkError::CastOverFlow { .. } => "org/apache/spark/sql/comet/CastOverflowException",
 
-            // NumberFormatException (for cast invalid input errors)
+            // NumberFormatException (for cast invalid input errors on numeric types)
             SparkError::CastInvalidValue { .. } => "org/apache/spark/SparkNumberFormatException",
 
+            // DateTimeException (for cast invalid input errors on datetime types)
+            SparkError::InvalidInputInCastToDatetime { .. } => "org/apache/spark/SparkDateTimeException",
+
             // ArrayIndexOutOfBoundsException
             SparkError::InvalidArrayIndex { .. }
             | SparkError::InvalidElementAtIndex { .. }
@@ -497,6 +519,7 @@ impl SparkError {
         match self {
             // Cast errors
             SparkError::CastInvalidValue { .. } => Some("CAST_INVALID_INPUT"),
+            SparkError::InvalidInputInCastToDatetime { .. } => Some("CAST_INVALID_INPUT"),
             SparkError::CastOverFlow { .. } => Some("CAST_OVERFLOW"),
             SparkError::NumericValueOutOfRange { .. }
=> {
                Some("NUMERIC_VALUE_OUT_OF_RANGE.WITH_SUGGESTION")
diff --git a/spark/src/main/scala/org/apache/comet/SparkErrorConverter.scala b/spark/src/main/scala/org/apache/comet/SparkErrorConverter.scala
index a8dea4cf46..2846718961 100644
--- a/spark/src/main/scala/org/apache/comet/SparkErrorConverter.scala
+++ b/spark/src/main/scala/org/apache/comet/SparkErrorConverter.scala
@@ -100,7 +100,7 @@ object SparkErrorConverter extends ShimSparkErrorConverter {
       case None => Array.empty[QueryContext] // No context
     }
 
-    val summary: String = errorJson.summary.orNull
+    val summary: String = errorJson.summary.getOrElse("")
 
     // Delegate to version-specific shim - let conversion exceptions propagate
     val optEx = convertErrorType(errorJson.errorType, errorClass, params, sparkContext, summary)
diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
index 95d5366907..d50aa5d8d8 100644
--- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
+++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala
@@ -217,10 +217,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim {
         Compatible(Some("Only supports years between 262143 BC and 262142 AD"))
       case DataTypes.TimestampType if timeZoneId.exists(tz => tz != "UTC") =>
         Incompatible(Some(s"Cast will use UTC instead of $timeZoneId"))
-      case DataTypes.TimestampType if evalMode == CometEvalMode.ANSI =>
-        Incompatible(Some("ANSI mode not supported"))
       case DataTypes.TimestampType =>
-        // https://github.com/apache/datafusion-comet/issues/328
         Incompatible(Some("Not all valid formats are supported"))
       case _ => unsupported(DataTypes.StringType, toType)
diff --git a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
index 8e6ed1a927..d5f4039cd9 100644
--- a/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
+++ b/spark/src/main/spark-3.4/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
@@ -19,7 +19,7 @@
 
 package org.apache.spark.sql.comet.shims
 
-import org.apache.spark.{QueryContext, SparkException}
+import org.apache.spark.{QueryContext, SparkDateTimeException, SparkException}
 import org.apache.spark.sql.catalyst.trees.SQLQueryContext
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.types._
@@ -164,6 +164,22 @@ trait ShimSparkErrorConverter {
           QueryExecutionErrors
             .invalidInputInCastToNumberError(targetType, str, sqlCtx(context)))
 
+      case "InvalidInputInCastToDatetime" =>
+        val expression =
+          s"'${params("value").toString.replace("\\", "\\\\").replace("'", "\\'")}'"
+        val sourceType = s""""${params("fromType").toString}""""
+        val targetType = s""""${params("toType").toString}""""
+        Some(
+          new SparkDateTimeException(
+            errorClass = "CAST_INVALID_INPUT",
+            messageParameters = Map(
+              "expression" -> expression,
+              "sourceType" -> sourceType,
+              "targetType" -> targetType,
+              "ansiConfig" -> "\"spark.sql.ansi.enabled\""),
+            context = context,
+            summary = summary))
+
       case "CastOverFlow" =>
         val fromType = getDataType(params("fromType").toString)
         val toType = getDataType(params("toType").toString)
diff --git a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
index 9bd8c7dba1..be4dafac9b 100644
---
a/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
+++ b/spark/src/main/spark-3.5/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
@@ -19,7 +19,7 @@
 
 package org.apache.spark.sql.comet.shims
 
-import org.apache.spark.{QueryContext, SparkException}
+import org.apache.spark.{QueryContext, SparkDateTimeException, SparkException}
 import org.apache.spark.sql.catalyst.trees.SQLQueryContext
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.types._
@@ -162,6 +162,22 @@ trait ShimSparkErrorConverter {
           QueryExecutionErrors
             .invalidInputInCastToNumberError(targetType, str, sqlCtx(context)))
 
+      case "InvalidInputInCastToDatetime" =>
+        val expression =
+          s"'${params("value").toString.replace("\\", "\\\\").replace("'", "\\'")}'"
+        val sourceType = s""""${params("fromType").toString}""""
+        val targetType = s""""${params("toType").toString}""""
+        Some(
+          new SparkDateTimeException(
+            errorClass = "CAST_INVALID_INPUT",
+            messageParameters = Map(
+              "expression" -> expression,
+              "sourceType" -> sourceType,
+              "targetType" -> targetType,
+              "ansiConfig" -> "\"spark.sql.ansi.enabled\""),
+            context = context,
+            summary = summary))
+
      case "CastOverFlow" =>
        val fromType = getDataType(params("fromType").toString)
        val toType = getDataType(params("toType").toString)
diff --git a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
index e49c789a77..25eea5f460 100644
--- a/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
+++ b/spark/src/main/spark-4.0/org/apache/spark/sql/comet/shims/ShimSparkErrorConverter.scala
@@ -175,6 +175,13 @@ trait ShimSparkErrorConverter {
           QueryExecutionErrors
             .invalidInputInCastToNumberError(targetType, str, context.headOption.orNull))
 
+      case "InvalidInputInCastToDatetime" =>
+        val str = UTF8String.fromString(params("value").toString)
+        val targetType = getDataType(params("toType").toString)
+        Some(
+          QueryExecutionErrors
+            .invalidInputInCastToDatetimeError(str, targetType, context.headOption.orNull))
+
       case "CastOverFlow" =>
         val fromType = getDataType(params("fromType").toString)
         val toType = getDataType(params("toType").toString)
diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
index 3d9acc39e4..ac4ac9e08c 100644
--- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala
@@ -23,7 +23,6 @@ import java.io.File
 
 import scala.collection.mutable.ListBuffer
 import scala.util.Random
-import scala.util.matching.Regex
 
 import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.{CometTestBase, DataFrame, Row, SaveMode}
@@ -916,11 +915,15 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
         "\r\n 962 \r\n",
         "\r\n 62 \r\n")
 
-    // due to limitations of NaiveDate we only support years between 262143 BC and 262142 AD"
-    val unsupportedYearPattern: Regex = "^\\s*[0-9]{5,}".r
+    // due to limitations of NaiveDate we only support years between 262143 BC and 262142 AD
+    // Filter out strings where the leading digit sequence represents a year > 262142.
+    // All 5-digit years (10000-99999) are within bounds; years with six or more digits may exceed the limit.
     val fuzzDates = gen
       .generateStrings(dataSize, datePattern, 8)
-      .filterNot(str => unsupportedYearPattern.findFirstMatchIn(str).isDefined)
+      .filterNot { str =>
+        val yearStr = str.trim.takeWhile(_.isDigit)
+        yearStr.length > 6 || (yearStr.length == 6 && yearStr.toInt > 262142)
+      }
     castTest((validDates ++ invalidDates ++ fuzzDates).toDF("a"), DataTypes.DateType)
   }
 
@@ -951,19 +954,47 @@
     }
   }
 
-  test("cast StringType to TimestampType disabled by default") {
-    withSQLConf((SQLConf.SESSION_LOCAL_TIMEZONE.key, "UTC")) {
-      val values = Seq("2020-01-01T12:34:56.123456", "T2").toDF("a")
-      castFallbackTest(
-        values.toDF("a"),
-        DataTypes.TimestampType,
-        "Not all valid formats are supported")
+  test("cast StringType to TimestampType - UTC") {
+    withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") {
+      val values = Seq(
+        "2020",
+        "2020-01",
+        "2020-01-01",
+        "2020-01-01T12",
+        "2020-01-01T12:34",
+        "2020-01-01T12:34:56",
+        "2020-01-01T12:34:56.123456",
+        "T2",
+        "-9?",
+        "0100",
+        "0100-01",
+        "0100-01-01",
+        "0100-01-01T12",
+        "0100-01-01T12:34",
+        "0100-01-01T12:34:56",
+        "0100-01-01T12:34:56.123456",
+        "10000",
+        "10000-01",
+        "10000-01-01",
+        "10000-01-01T12",
+        "10000-01-01T12:34",
+        "10000-01-01T12:34:56",
+        "10000-01-01T12:34:56.123456",
+        "213170",
+        "213170-06",
+        "213170-06-15",
+        "213170-06-15T12",
+        "213170-06-15T12:34",
+        "213170-06-15T12:34:56",
+        "213170-06-15T12:34:56.123456")
+      castTimestampTest(values.toDF("a"), DataTypes.TimestampType)
     }
   }
 
   ignore("cast StringType to TimestampType") {
-    // https://github.com/apache/datafusion-comet/issues/328
-    withSQLConf((CometConf.getExprAllowIncompatConfigKey(classOf[Cast]), "true")) {
+    // TODO: enable once all Spark timestamp formats are supported natively.
+    // Currently missing: time-only formats with colon (e.g. "T12:34", "4:4").
+    withSQLConf((SQLConf.SESSION_LOCAL_TIMEZONE.key, "UTC")) {
       val values = Seq("2020-01-01T12:34:56.123456", "T2") ++ gen.generateStrings(
         dataSize,
         timestampPattern,
@@ -994,6 +1025,13 @@
       "2020-01-01T12:34:56.123456",
       "T2",
       "-9?",
+      "100",
+      "100-01",
+      "100-01-01",
+      "100-01-01T12",
+      "100-01-01T12:34",
+      "100-01-01T12:34:56",
+      "100-01-01T12:34:56.123456",
       "0100",
       "0100-01",
       "0100-01-01",
@@ -1010,14 +1048,6 @@
       "10000-01-01T12:34:56.123456")
     castTimestampTest(values.toDF("a"), DataTypes.TimestampType)
   }
-
-    // test for invalid inputs
-    withSQLConf(
-      SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC",
-      CometConf.getExprAllowIncompatConfigKey(classOf[Cast]) -> "true") {
-      val values = Seq("-9?", "1-", "0.5")
-      castTimestampTest(values.toDF("a"), DataTypes.TimestampType)
-    }
   }
 
   // CAST from BinaryType
@@ -1476,7 +1506,10 @@
     }
   }
 
-  private def castTimestampTest(input: DataFrame, toType: DataType) = {
+  private def castTimestampTest(
+      input: DataFrame,
+      toType: DataType,
+      assertNative: Boolean = false) = {
     withTempPath { dir =>
       val data = roundtripParquet(input, dir).coalesce(1)
       data.createOrReplaceTempView("t")
@@ -1484,12 +1517,45 @@
       withSQLConf((SQLConf.ANSI_ENABLED.key, "false")) {
         // cast() should return null for invalid inputs when ansi mode is disabled
         val df = data.withColumn("converted", col("a").cast(toType))
-        checkSparkAnswer(df)
+        if (assertNative) {
+          checkSparkAnswerAndOperator(df)
+        } else {
+          checkSparkAnswer(df)
+        }
 
         // try_cast() should always return null for invalid inputs
         val df2 = spark.sql(s"select try_cast(a as ${toType.sql}) from t")
         checkSparkAnswer(df2)
       }
+
+      // with ANSI enabled, we should produce the same exception as Spark
+      withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") {
+        val df = data.withColumn("converted", col("a").cast(toType))
+        checkSparkAnswerMaybeThrows(df) match {
+          case (None, None) =>
+          // both succeeded, results already compared
+          case (None, Some(e)) =>
+            throw e
+          case (Some(e), None) =>
+            fail(s"Comet should have failed with ${e.getCause.getMessage}")
+          case (Some(sparkException), Some(cometException)) =>
+            val sparkMessage =
+              if (sparkException.getCause != null) sparkException.getCause.getMessage
+              else sparkException.getMessage
+            val cometMessage =
+              if (cometException.getCause != null) cometException.getCause.getMessage
+              else cometException.getMessage
+            if (CometSparkSessionExtensions.isSpark40Plus) {
+              assert(sparkMessage.contains("SQLSTATE"))
+              assert(sparkMessage.startsWith(cometMessage.substring(0, 40)))
+            } else {
+              assert(cometMessage == sparkMessage)
+            }
+        }
+        // try_cast()
+        val dfTryCast = spark.sql(s"select try_cast(a as ${toType.sql}) from t")
+        checkSparkAnswer(dfTryCast)
+      }
     }
   }
From 1fb44d27c37d34da3a88dcc39d0b181ee2b05aaf Mon Sep 17 00:00:00 2001
From: Parth Chandra
Date: Thu, 12 Mar 2026 16:59:52 -0700
Subject: [PATCH 2/3] stricter bounds check for year

---
 native/spark-expr/src/conversion_funcs/string.rs | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs
index 2d83484357..b0d4b8ea6b 100644
--- a/native/spark-expr/src/conversion_funcs/string.rs
+++ b/native/spark-expr/src/conversion_funcs/string.rs
@@ -969,6 +969,12 @@ fn get_timestamp_values(
 ) -> SparkResult<Option<i64>> {
     let values: Vec<_> = value.split(['T', '-', ':', '.']).collect();
     let year = values[0].parse::<i32>().unwrap_or_default();
+
+    // NaiveDate (used internally by chrono's with_ymd_and_hms) is bounded to ±262142.
+    if !(-262143..=262142).contains(&year) {
+        return Ok(None);
+    }
+
     let month = values.get(1).map_or(1, |m| m.parse::<u32>().unwrap_or(1));
     let day = values.get(2).map_or(1, |d| d.parse::<u32>().unwrap_or(1));
     let hour = values.get(3).map_or(0, |h| h.parse::<u32>().unwrap_or(0));
@@ -1210,7 +1216,10 @@ fn date_parser(date_str: &str, eval_mode: EvalMode) -> SparkResult<Option<i32>>
 }
 
 fn is_valid_digits(segment: i32, digits: usize) -> bool {
-    // An integer is able to represent a date within [+-]5 million years.
+    // NaiveDate is bounded to [-262143, 262142] (6 digits). We allow up to 7 digits to support
+    // leading-zero year strings like "0002020" (= year 2020), matching Spark's
+    // isValidDigits. Values outside the bounds are caught by an explicit bounds
+    // check below.
     let max_digits_year = 7;
     // year (segment 0) can be between 4 to 7 digits,
     // month and day (segment 1 and 2) can be between 1 to 2 digits
@@ -1293,8 +1302,14 @@ fn date_parser(date_str: &str, eval_mode: EvalMode) -> SparkResult<Option<i32>>
 
         date_segments[current_segment as usize] = current_segment_value.0;
 
+        // Reject out-of-range years explicitly
+        let year = sign * date_segments[0];
+        if !(-262143..=262142).contains(&year) {
+            return Ok(None);
+        }
+
         match NaiveDate::from_ymd_opt(
-            sign * date_segments[0],
+            year,
             date_segments[1] as u32,
             date_segments[2] as u32,
         ) {

From 71a6a2cd013bd566f6b797c64b8f7815f43f1081 Mon Sep 17 00:00:00 2001
From: Parth Chandra
Date: Thu, 12 Mar 2026 17:23:50 -0700
Subject: [PATCH 3/3] format

---
 native/spark-expr/src/conversion_funcs/string.rs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs
index b0d4b8ea6b..167d6f1769 100644
--- a/native/spark-expr/src/conversion_funcs/string.rs
+++ b/native/spark-expr/src/conversion_funcs/string.rs
@@ -1308,11 +1308,7 @@ fn date_parser(date_str: &str, eval_mode: EvalMode) -> SparkResult<Option<i32>>
             return Ok(None);
         }
 
-        match NaiveDate::from_ymd_opt(
-            year,
-            date_segments[1] as u32,
-            date_segments[2] as u32,
-        ) {
+        match NaiveDate::from_ymd_opt(year, date_segments[1] as u32, date_segments[2] as u32) {
             Some(date) => {
                 let duration_since_epoch = date
                     .signed_duration_since(DateTime::UNIX_EPOCH.naive_utc().date())
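The bounds checks in patches 2 and 3 follow from chrono's `NaiveDate`, which can only represent roughly ±262,000 years; outside that window `from_ymd_opt` returns `None`. A quick demonstration (assumes the `chrono` crate, which string.rs already uses; the assertions stay clear of the exact boundary constants, which are chrono internals):

```rust
use chrono::NaiveDate;

fn main() {
    // Years inside the range the patch allows ([-262143, 262142]) construct fine.
    assert!(NaiveDate::from_ymd_opt(262142, 6, 15).is_some());
    assert!(NaiveDate::from_ymd_opt(-262142, 6, 15).is_some());

    // Far outside chrono's representable range, from_ymd_opt yields None.
    // This is the condition the new explicit year check short-circuits, so
    // that such inputs become NULL (or an ANSI error) instead of falling
    // through to the date arithmetic below.
    assert!(NaiveDate::from_ymd_opt(1_000_000, 1, 1).is_none());
    assert!(NaiveDate::from_ymd_opt(-1_000_000, 1, 1).is_none());
}
```

Checking the year up front, rather than relying on `from_ymd_opt` alone, keeps the out-of-range and malformed-input paths distinct: the former yields NULL in both eval modes, while the latter raises `InvalidInputInCastToDatetime` under ANSI.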