diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..fdbcd7d9 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - Fixed pandas 3.0 compatibility issues with StringDtype and StringArray diff --git a/policyengine_core/charts/formatting.py b/policyengine_core/charts/formatting.py index 8fcf536a..182774c0 100644 --- a/policyengine_core/charts/formatting.py +++ b/policyengine_core/charts/formatting.py @@ -1,7 +1,6 @@ import plotly.graph_objects as go from IPython.display import HTML - GREEN = "#29d40f" LIGHT_GREEN = "#C5E1A5" DARK_GREEN = "#558B2F" diff --git a/policyengine_core/parameters/vectorial_parameter_node_at_instant.py b/policyengine_core/parameters/vectorial_parameter_node_at_instant.py index 9a7ce385..c5ce1367 100644 --- a/policyengine_core/parameters/vectorial_parameter_node_at_instant.py +++ b/policyengine_core/parameters/vectorial_parameter_node_at_instant.py @@ -196,7 +196,10 @@ def __getitem__(self, key: str) -> Any: if isinstance(key, str): return self.__getattr__(key) # If the key is a vector, e.g. 
['zone_1', 'zone_2', 'zone_1'] - elif isinstance(key, numpy.ndarray): + # Convert any non-ndarray exposing __array__ (e.g., pandas 3 StringArray) to numpy + if hasattr(key, "__array__") and not isinstance(key, numpy.ndarray): + key = numpy.asarray(key) + if isinstance(key, numpy.ndarray): if not numpy.issubdtype(key.dtype, numpy.str_): # In case the key is not a string vector, stringify it if key.dtype == object and issubclass(type(key[0]), Enum): diff --git a/policyengine_core/populations/population.py b/policyengine_core/populations/population.py index a3c9f5aa..988485ec 100644 --- a/policyengine_core/populations/population.py +++ b/policyengine_core/populations/population.py @@ -41,6 +41,12 @@ def empty_array(self) -> numpy.ndarray: return numpy.zeros(self.count) def filled_array(self, value: Any, dtype: Any = None) -> numpy.ndarray: + import pandas as pd + + # Handle pandas extension dtypes (e.g., StringDtype in pandas 3) + # numpy.full() cannot handle these, so fall back to numpy's object dtype + if isinstance(dtype, pd.api.extensions.ExtensionDtype): + dtype = object return numpy.full(self.count, value, dtype) def __getattr__(self, attribute: str) -> Any: @@ -72,17 +78,13 @@ def check_period_validity( if period is None: stack = traceback.extract_stack() filename, line_number, function_name, line_of_code = stack[-3] - raise ValueError( - """ + raise ValueError(""" You requested computation of variable "{}", but you did not specify on which period in "{}:{}": {} When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example: computed_salary = person('salary', period). See more information at . 
-""".format( - variable_name, filename, line_number, line_of_code - ) - ) +""".format(variable_name, filename, line_number, line_of_code)) def __call__( self, diff --git a/tests/core/test_pandas3_compatibility.py b/tests/core/test_pandas3_compatibility.py new file mode 100644 index 00000000..1cce2978 --- /dev/null +++ b/tests/core/test_pandas3_compatibility.py @@ -0,0 +1,180 @@ +""" +Tests for pandas 3.0.0 compatibility. + +These tests verify that policyengine-core works correctly with pandas 3.0.0, +which introduces: +1. PyArrow-backed strings as default (StringDtype) +2. Copy-on-Write by default +""" + +import numpy as np +import pandas as pd +import pytest + + +class TestFilledArrayWithStringDtype: + """Test that filled_array works with pandas StringDtype.""" + + def test_filled_array_with_string_dtype(self): + """ + In pandas 3.0.0, string columns use StringDtype by default. + numpy.full() cannot handle StringDtype, so we need to handle this case. + """ + from policyengine_core.populations.population import Population + from policyengine_core.entities import Entity + + # Create a minimal entity for testing + entity = Entity( + key="person", + plural="people", + label="Person", + doc="Test person entity", + ) + + # Create a population with some count + population = Population(entity) + population.count = 5 + + # Test with regular numpy dtype - should work + result = population.filled_array("test_value", dtype=object) + assert len(result) == 5 + assert all(v == "test_value" for v in result) + + # Test with pandas StringDtype - this is what pandas 3 uses by default + # This should NOT raise an error + string_dtype = pd.StringDtype() + result = population.filled_array("test_value", dtype=string_dtype) + assert len(result) == 5 + assert all(v == "test_value" for v in result) + + def test_filled_array_with_pyarrow_string_dtype(self): + """ + Test with PyArrow-backed string dtype, which pandas 3 uses by default. 
+ """ + pa = pytest.importorskip("pyarrow") + + from policyengine_core.populations.population import Population + from policyengine_core.entities import Entity + + entity = Entity( + key="person", + plural="people", + label="Person", + doc="Test person entity", + ) + population = Population(entity) + population.count = 5 + + # PyArrow string dtype (proper way to create it) + arrow_string_dtype = pd.ArrowDtype(pa.string()) + result = population.filled_array( + "test_value", dtype=arrow_string_dtype + ) + assert len(result) == 5 + + +class TestParameterLookupWithStringArray: + """Test that parameter lookup works with pandas StringArray.""" + + def test_parameter_node_getitem_with_string_array(self): + """ + Pin the conversion the __getitem__ fix relies on: a pandas + string-dtype array is not a numpy.ndarray but exposes __array__, so + np.asarray() converts it. No parameter node is exercised here. + """ + # Create a pandas StringArray (what pandas 3 returns) + string_array = pd.array(["value1", "value2", "value3"], dtype="string") + + # Verify it's a StringArray (not numpy array) + assert not isinstance(string_array, np.ndarray) + assert hasattr(string_array, "__array__") + + # Convert to numpy - this is what the fix should do + numpy_array = np.asarray(string_array) + assert isinstance(numpy_array, np.ndarray) + + def test_vectorial_parameter_node_with_string_array(self): + """ + VectorialParameterNodeAtInstant.__getitem__ should handle pandas + StringArray by converting it to numpy array. 
+ """ + from policyengine_core.parameters.vectorial_parameter_node_at_instant import ( + VectorialParameterNodeAtInstant, + ) + + # Create a simple vectorial node for testing with proper structure + vector = np.array( + [(1.0, 2.0)], + dtype=[("zone_1", "float"), ("zone_2", "float")], + ).view(np.recarray) + + node = VectorialParameterNodeAtInstant("test", vector, "2024-01-01") + + # Test with numpy array - should work + key_numpy = np.array(["zone_1", "zone_2"]) + result_numpy = node[key_numpy] + assert len(result_numpy) == 2 + + # Test with pandas StringArray - this is what pandas 3 returns + key_string_array = pd.array(["zone_1", "zone_2"], dtype="string") + + # This should NOT raise TypeError: unhashable type: 'StringArray' + # The node should accept StringArray by converting to numpy + result_string_array = node[key_string_array] + assert len(result_string_array) == 2 + + # Results should be the same + np.testing.assert_array_equal(result_numpy, result_string_array) + + +class TestMicroSeriesCompatibility: + """Test pandas Series operations (no MicroSeries is constructed here).""" + + def test_series_subclass_preserved(self): + """ + Pandas 3.0.0 may change how Series subclasses are handled. + NOTE(review): only plain pd.Series is asserted, not a subclass. + """ + # This test documents expected behavior that may break in pandas 3 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + # Test that operations preserve Series type + result = df["a"] + df["b"] + assert isinstance(result, pd.Series) + + # With pandas 3, some operations may return different types + result = df["a"].astype(str) + # In pandas 3, this might return StringArray-backed Series + assert isinstance(result, pd.Series) + + +class TestStringDtypeConversion: + """Pin the numpy/pandas dtype behaviour that the StringDtype handling relies on.""" + + def test_convert_string_dtype_to_object(self): + """ + When pandas StringDtype is passed to numpy functions, + we should convert it to object dtype. 
+ """ + string_dtype = pd.StringDtype() + + # numpy.full doesn't understand StringDtype + with pytest.raises(TypeError): + np.full(5, "test", dtype=string_dtype) + + # But it works with object dtype + result = np.full(5, "test", dtype=object) + assert len(result) == 5 + + def test_is_pandas_extension_dtype(self): + """Test detection of pandas extension dtypes.""" + # pandas StringDtype is an ExtensionDtype + assert isinstance(pd.StringDtype(), pd.api.extensions.ExtensionDtype) + + # numpy dtypes are not + assert not isinstance( + np.dtype("float64"), pd.api.extensions.ExtensionDtype + ) + assert not isinstance( + np.dtype("object"), pd.api.extensions.ExtensionDtype + )