[SPARK-56167][PS] Align astype with pandas 3 default string behavior

ueshin · HyukjinKwon · commit 992d932e1d5f · 2026-03-24T19:57:37.000+09:00
### What changes were proposed in this pull request?

This PR updates a few pandas-on-Spark `astype` paths to match pandas 3 behavior for the default string dtype.

In pandas 3, `astype(str)` returns the default string dtype and preserves missing values instead of converting them to string literals such as `"NaN"` or `"<NA>"`. pandas-on-Spark still used the older behavior in a few localized conversion paths, including numeric, null, string, and boolean casts.

This PR makes three small changes in `python/pyspark/pandas/data_type_ops/`:

- update the shared string cast helper so `astype(str)` preserves missing values for pandas 3 string results
- align boolean-to-string casting with the same pandas 3 behavior, including the nullable metadata on the result field
- align string-to-bool casting for pandas 3 string-backed data with pandas' current `astype(bool)` result

### Why are the changes needed?

Without this change, several pandas-on-Spark `astype` tests fail with pandas 3 because some conversion paths still follow the older string-casting behavior.

The failures came from two related mismatches:

- `astype(str)` converted missing values into string literals instead of preserving them as missing values
- some follow-up casts from pandas 3 string-backed data did not match pandas' current behavior

This patch fixes those localized mismatches while keeping the pandas 2 behavior unchanged.

### Does this PR introduce _any_ user-facing change?

Yes. For pandas 3 users, pandas-on-Spark `astype(str)` now preserves missing values in the affected paths instead of converting them to string literals. This also fixes related behavior for boolean and string-backed casts that depend on pandas 3's default string behavior.

### How was this patch tested?

The existing tests should pass.

### Was this patch authored or co-authored using generative AI tooling?

Generated-by: Codex (GPT-5)

Closes #54968 from ueshin/issues/SPARK-56167/astype.

Authored-by: Takuya Ueshin <ueshin@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py
@@ -51,6 +51,7 @@
     extension_float_dtypes_available,
     extension_object_dtypes_available,
     handle_dtype_as_extension_dtype,
+    is_str_dtype,
     spark_type_to_pandas_dtype,
 )
 
@@ -193,7 +194,7 @@ def _as_string_type(
     representing null Spark column. Note that `null_str` is for non-extension dtypes only.
     """
     spark_type = StringType()
-    if handle_dtype_as_extension_dtype(dtype):
+    if handle_dtype_as_extension_dtype(dtype) or is_str_dtype(dtype):
         scol = index_ops.spark.column.cast(spark_type)
     else:
         casted = index_ops.spark.column.cast(spark_type)
diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -39,6 +39,7 @@
 from pyspark.pandas.typedef.typehints import (
     as_spark_type,
     handle_dtype_as_extension_dtype,
+    is_str_dtype,
     pandas_on_spark_type,
 )
 from pyspark.pandas.utils import is_ansi_mode_enabled
@@ -326,12 +327,12 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
         elif isinstance(spark_type, BooleanType):
             return _as_bool_type(index_ops, dtype)
         elif isinstance(spark_type, StringType):
-            if handle_dtype_as_extension_dtype(dtype):
+            if handle_dtype_as_extension_dtype(dtype) or is_str_dtype(dtype):
                 scol = F.when(
                     index_ops.spark.column.isNotNull(),
                     F.when(index_ops.spark.column, "True").otherwise("False"),
                 )
-                nullable = index_ops.spark.nullable
+                nullable = index_ops.spark.nullable or is_str_dtype(dtype)
             else:
                 null_str = str(pd.NA) if isinstance(self, BooleanExtensionOps) else str(None)
                 casted = F.when(index_ops.spark.column, "True").otherwise("False")
diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py b/python/pyspark/pandas/data_type_ops/string_ops.py
@@ -33,7 +33,11 @@
     _as_string_type,
     _sanitize_list_like,
 )
-from pyspark.pandas.typedef import handle_dtype_as_extension_dtype, pandas_on_spark_type
+from pyspark.pandas.typedef import (
+    handle_dtype_as_extension_dtype,
+    is_str_dtype,
+    pandas_on_spark_type,
+)
 from pyspark.sql.types import BooleanType
 
 
@@ -128,7 +132,10 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
             if handle_dtype_as_extension_dtype(dtype):
                 scol = index_ops.spark.column.cast(spark_type)
             else:
-                scol = F.when(index_ops.spark.column.isNull(), F.lit(False)).otherwise(
+                # pandas 3 maps `str` to StringDtype, where astype(bool)
+                # treats missing values as True.
+                null_value = F.lit(True) if is_str_dtype(self.dtype) else F.lit(False)
+                scol = F.when(index_ops.spark.column.isNull(), null_value).otherwise(
                     F.length(index_ops.spark.column) > 0
                 )
             return index_ops._with_new_scol(