Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: patch
changes:
fixed:
- Fixed pandas 3.0 compatibility issues with StringDtype and StringArray
1 change: 0 additions & 1 deletion policyengine_core/charts/formatting.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import plotly.graph_objects as go
from IPython.display import HTML


GREEN = "#29d40f"
LIGHT_GREEN = "#C5E1A5"
DARK_GREEN = "#558B2F"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,10 @@ def __getitem__(self, key: str) -> Any:
if isinstance(key, str):
return self.__getattr__(key)
# If the key is a vector, e.g. ['zone_1', 'zone_2', 'zone_1']
elif isinstance(key, numpy.ndarray):
# Convert pandas arrays (e.g., StringArray from pandas 3) to numpy
if hasattr(key, "__array__") and not isinstance(key, numpy.ndarray):
key = numpy.asarray(key)
if isinstance(key, numpy.ndarray):
if not numpy.issubdtype(key.dtype, numpy.str_):
# In case the key is not a string vector, stringify it
if key.dtype == object and issubclass(type(key[0]), Enum):
Expand Down
14 changes: 8 additions & 6 deletions policyengine_core/populations/population.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ def empty_array(self) -> numpy.ndarray:
return numpy.zeros(self.count)

def filled_array(self, value: Any, dtype: Any = None) -> numpy.ndarray:
    """Return an array of length ``self.count`` filled with ``value``.

    Args:
        value: The value every element is set to.
        dtype: The requested dtype. Anything ``numpy.full`` accepts is
            passed through unchanged. pandas extension dtypes (e.g.
            ``StringDtype`` in pandas 3), which ``numpy.full`` cannot
            handle, are replaced by ``object``. This covers extension
            dtype instances as well as dtype specs that numpy rejects
            but pandas recognises (e.g. the ``"string"`` alias).

    Returns:
        The filled ``numpy.ndarray``.

    Raises:
        TypeError: If numpy rejects ``dtype`` and pandas does not
            recognise it as an extension dtype either.
    """
    import pandas as pd

    # Extension dtype instances are never valid for numpy.full(), so
    # switch to object dtype up front.
    if isinstance(dtype, pd.api.extensions.ExtensionDtype):
        dtype = object
    try:
        return numpy.full(self.count, value, dtype)
    except TypeError:
        # Only fall back when pandas identifies the rejected spec as an
        # extension dtype; any other TypeError is a genuine error.
        if not pd.api.types.is_extension_array_dtype(dtype):
            raise
        return numpy.full(self.count, value, object)

def __getattr__(self, attribute: str) -> Any:
Expand Down Expand Up @@ -72,17 +78,13 @@ def check_period_validity(
if period is None:
stack = traceback.extract_stack()
filename, line_number, function_name, line_of_code = stack[-3]
raise ValueError(
"""
raise ValueError("""
You requested computation of variable "{}", but you did not specify on which period in "{}:{}":
{}
When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example:
computed_salary = person('salary', period).
See more information at <https://openfisca.org/doc/coding-the-legislation/35_periods.html#periods-in-variable-definition>.
""".format(
variable_name, filename, line_number, line_of_code
)
)
""".format(variable_name, filename, line_number, line_of_code))

def __call__(
self,
Expand Down
180 changes: 180 additions & 0 deletions tests/core/test_pandas3_compatibility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
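"""
Tests for pandas 3.0.0 compatibility.

These tests verify that policyengine-core works correctly with pandas 3.0.0,
which introduces:
1. PyArrow-backed strings as default (StringDtype)
2. Copy-on-Write by default

The tests in this module exercise the first change (string dtypes and string
arrays); none of them targets Copy-on-Write behaviour directly.
"""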

import numpy as np
import pandas as pd
import pytest


class TestFilledArrayWithStringDtype:
    """Test that filled_array works with pandas StringDtype."""

    @staticmethod
    def _make_population(count: int = 5):
        """Build a minimal ``Population`` whose ``count`` is set to ``count``."""
        # Imported inside the helper so that importing this test module
        # does not itself require policyengine_core.
        from policyengine_core.entities import Entity
        from policyengine_core.populations.population import Population

        # Create a minimal entity for testing
        entity = Entity(
            key="person",
            plural="people",
            label="Person",
            doc="Test person entity",
        )
        population = Population(entity)
        population.count = count
        return population

    def test_filled_array_with_string_dtype(self):
        """
        In pandas 3.0.0, string columns use StringDtype by default.
        numpy.full() cannot handle StringDtype, so filled_array must
        handle this case itself.
        """
        population = self._make_population()

        # Regular numpy-compatible dtype: the baseline that must keep working
        result = population.filled_array("test_value", dtype=object)
        assert len(result) == 5
        assert all(v == "test_value" for v in result)

        # pandas StringDtype: this should NOT raise an error
        result = population.filled_array("test_value", dtype=pd.StringDtype())
        assert len(result) == 5
        assert all(v == "test_value" for v in result)

    def test_filled_array_with_pyarrow_string_dtype(self):
        """
        Test with a PyArrow-backed string dtype (skipped without pyarrow).
        """
        pa = pytest.importorskip("pyarrow")

        population = self._make_population()

        # PyArrow string dtype (proper way to create it)
        arrow_string_dtype = pd.ArrowDtype(pa.string())
        result = population.filled_array(
            "test_value", dtype=arrow_string_dtype
        )
        assert len(result) == 5
        # Check the contents too, not just the length
        assert all(v == "test_value" for v in result)


class TestParameterLookupWithStringArray:
    """Test that parameter lookup works with pandas StringArray."""

    def test_parameter_node_getitem_with_string_array(self):
        """
        Check the array conversion that string-array lookups depend on.

        A pandas string array is not a numpy array but exposes
        ``__array__``, so ``np.asarray`` must turn it into an ndarray
        holding the same values. This test covers only that conversion;
        it does not call any ``__getitem__`` itself (see
        ``test_vectorial_parameter_node_with_string_array`` for that).
        """
        labels = ["value1", "value2", "value3"]

        # Create a pandas string array
        string_array = pd.array(labels, dtype="string")

        # It is an array-like, but not a numpy array
        assert not isinstance(string_array, np.ndarray)
        assert hasattr(string_array, "__array__")

        # Converting to numpy must give an ndarray with the same values
        numpy_array = np.asarray(string_array)
        assert isinstance(numpy_array, np.ndarray)
        assert numpy_array.tolist() == labels

    def test_vectorial_parameter_node_with_string_array(self):
        """
        VectorialParameterNodeAtInstant.__getitem__ should handle pandas
        StringArray by converting it to numpy array.
        """
        from policyengine_core.parameters.vectorial_parameter_node_at_instant import (
            VectorialParameterNodeAtInstant,
        )

        # Create a simple vectorial node for testing with proper structure
        vector = np.array(
            [(1.0, 2.0)],
            dtype=[("zone_1", "float"), ("zone_2", "float")],
        ).view(np.recarray)

        node = VectorialParameterNodeAtInstant("test", vector, "2024-01-01")

        # Baseline: lookup with a numpy array key
        key_numpy = np.array(["zone_1", "zone_2"])
        result_numpy = node[key_numpy]
        assert len(result_numpy) == 2

        # Same lookup with a pandas string array key.
        # This should NOT raise TypeError: unhashable type: 'StringArray'
        key_string_array = pd.array(["zone_1", "zone_2"], dtype="string")
        result_string_array = node[key_string_array]
        assert len(result_string_array) == 2

        # Results should be the same
        np.testing.assert_array_equal(result_numpy, result_string_array)


class TestMicroSeriesCompatibility:
    """Test that MicroSeries operations work with pandas 3."""

    def test_series_subclass_preserved(self):
        """
        Operations on a Series subclass should return the subclass,
        not a plain Series.

        MicroSeries itself is not imported here. A minimal stand-in
        subclass that overrides ``_constructor`` (the pandas hook for
        Series subclasses) is used so that losing the subclass would
        actually fail this test.
        """

        class TaggedSeries(pd.Series):
            @property
            def _constructor(self):
                return TaggedSeries

        df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

        # Baseline: plain Series operations still return a Series
        result = df["a"] + df["b"]
        assert isinstance(result, pd.Series)
        result = df["a"].astype(str)
        assert isinstance(result, pd.Series)

        left = TaggedSeries(df["a"])
        right = TaggedSeries(df["b"])

        # Arithmetic must keep the subclass and the values
        total = left + right
        assert isinstance(total, TaggedSeries)
        assert total.tolist() == [5, 7, 9]

        # astype(str) may be backed by a string extension array, but
        # the result must still be the subclass
        as_text = left.astype(str)
        assert isinstance(as_text, TaggedSeries)
        assert as_text.tolist() == ["1", "2", "3"]


class TestStringDtypeConversion:
    """Test utilities for converting pandas StringDtype to numpy-compatible types."""

    def test_convert_string_dtype_to_object(self):
        """
        numpy.full rejects a pandas StringDtype but accepts ``object``,
        which is why the dtype has to be converted before calling numpy.
        """
        length, fill = 5, "test"
        pandas_string_dtype = pd.StringDtype()

        # numpy.full does not understand StringDtype, so this must fail
        with pytest.raises(TypeError):
            np.full(length, fill, dtype=pandas_string_dtype)

        # The object-dtype fallback succeeds
        filled = np.full(length, fill, dtype=object)
        assert len(filled) == 5

    def test_is_pandas_extension_dtype(self):
        """Test detection of pandas extension dtypes."""
        extension_base = pd.api.extensions.ExtensionDtype

        # pandas StringDtype is an ExtensionDtype
        assert isinstance(pd.StringDtype(), extension_base)

        # Plain numpy dtypes are not
        for numpy_name in ("float64", "object"):
            assert not isinstance(np.dtype(numpy_name), extension_base)