diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py index 21177847a3128..72ce6cf7d9301 100644 --- a/python/pyspark/pandas/data_type_ops/base.py +++ b/python/pyspark/pandas/data_type_ops/base.py @@ -51,6 +51,7 @@ extension_float_dtypes_available, extension_object_dtypes_available, handle_dtype_as_extension_dtype, + is_str_dtype, spark_type_to_pandas_dtype, ) @@ -193,7 +194,7 @@ def _as_string_type( representing null Spark column. Note that `null_str` is for non-extension dtypes only. """ spark_type = StringType() - if handle_dtype_as_extension_dtype(dtype): + if handle_dtype_as_extension_dtype(dtype) or is_str_dtype(dtype): scol = index_ops.spark.column.cast(spark_type) else: casted = index_ops.spark.column.cast(spark_type) diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py index 05797735b1cec..c52bc9b5051c9 100644 --- a/python/pyspark/pandas/data_type_ops/boolean_ops.py +++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py @@ -39,6 +39,7 @@ from pyspark.pandas.typedef.typehints import ( as_spark_type, handle_dtype_as_extension_dtype, + is_str_dtype, pandas_on_spark_type, ) from pyspark.pandas.utils import is_ansi_mode_enabled @@ -326,12 +327,12 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind elif isinstance(spark_type, BooleanType): return _as_bool_type(index_ops, dtype) elif isinstance(spark_type, StringType): - if handle_dtype_as_extension_dtype(dtype): + if handle_dtype_as_extension_dtype(dtype) or is_str_dtype(dtype): scol = F.when( index_ops.spark.column.isNotNull(), F.when(index_ops.spark.column, "True").otherwise("False"), ) - nullable = index_ops.spark.nullable + nullable = index_ops.spark.nullable or is_str_dtype(dtype) else: null_str = str(pd.NA) if isinstance(self, BooleanExtensionOps) else str(None) casted = F.when(index_ops.spark.column, "True").otherwise("False") diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py b/python/pyspark/pandas/data_type_ops/string_ops.py index f7eed649017be..de35568cf2de4 100644 --- a/python/pyspark/pandas/data_type_ops/string_ops.py +++ b/python/pyspark/pandas/data_type_ops/string_ops.py @@ -33,7 +33,11 @@ _as_string_type, _sanitize_list_like, ) -from pyspark.pandas.typedef import handle_dtype_as_extension_dtype, pandas_on_spark_type +from pyspark.pandas.typedef import ( + handle_dtype_as_extension_dtype, + is_str_dtype, + pandas_on_spark_type, +) from pyspark.sql.types import BooleanType @@ -128,7 +132,10 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind if handle_dtype_as_extension_dtype(dtype): scol = index_ops.spark.column.cast(spark_type) else: - scol = F.when(index_ops.spark.column.isNull(), F.lit(False)).otherwise( + # pandas 3 maps `str` to StringDtype, where astype(bool) + # treats missing values as True. + null_value = F.lit(True) if is_str_dtype(self.dtype) else F.lit(False) + scol = F.when(index_ops.spark.column.isNull(), null_value).otherwise( F.length(index_ops.spark.column) > 0 ) return index_ops._with_new_scol(