diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29b..58baa004 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: patch
+  changes:
+    fixed:
+    - Fixed pandas 3.0 compatibility in Enum.encode() by using positional access (.iloc[0]) for pandas Series instead of label-based access (array[0]), which fails with KeyError when Series has a non-integer index (fixes #427)
diff --git a/policyengine_core/enums/enum.py b/policyengine_core/enums/enum.py
index 5b6e4b9e..c13a99b6 100644
--- a/policyengine_core/enums/enum.py
+++ b/policyengine_core/enums/enum.py
@@ -66,7 +66,14 @@ def encode(cls, array: Union[EnumArray, np.ndarray]) -> EnumArray:
             return array
 
         # Handle Enum item arrays by extracting indices directly
-        if len(array) > 0 and isinstance(array[0], Enum):
+        # Use .iloc[0] for pandas Series to avoid KeyError with non-integer index
+        # (pandas 3.0 uses StringDtype by default, causing array[0] to do
+        # label-based lookup instead of positional access)
+        if len(array) > 0:
+            first_elem = array.iloc[0] if hasattr(array, "iloc") else array[0]
+        else:
+            first_elem = None
+        if first_elem is not None and isinstance(first_elem, Enum):
             indices = np.array(
                 [item.index for item in array], dtype=ENUM_ARRAY_DTYPE
             )
diff --git a/tests/core/enums/test_enum.py b/tests/core/enums/test_enum.py
index 3605ef6d..0fe586f0 100644
--- a/tests/core/enums/test_enum.py
+++ b/tests/core/enums/test_enum.py
@@ -94,3 +94,59 @@ class Sample(Enum):
 
     # Empty string should be in the error message (represented as '')
     assert "''" in str(exc_info.value) or '""' in str(exc_info.value)
+
+
+def test_enum_encode_pandas_series_with_enum_items():
+    """
+    Test that encoding a pandas Series containing Enum items works.
+
+    In pandas 3.0, Series may have StringDtype index. encode() previously
+    used array[0] to check if items are Enum instances, which fails with
+    KeyError when the Series has a non-integer index.
+
+    This test verifies the fix for GitHub issue #427.
+    """
+    import pandas as pd
+
+    class Sample(Enum):
+        MAXWELL = "maxwell"
+        DWORKIN = "dworkin"
+
+    # Create a pandas Series with Enum items (simulates what happens in
+    # policyengine-us county variable with pandas 3.0)
+    enum_items = [Sample.MAXWELL, Sample.DWORKIN, Sample.MAXWELL]
+    series = pd.Series(enum_items)
+
+    # Default integer index here; the non-integer-index case is tested below
+    encoded_array = Sample.encode(series)
+
+    assert len(encoded_array) == 3
+    assert isinstance(encoded_array, EnumArray)
+    assert encoded_array.dtype.kind == "i"
+    # Verify correct encoding
+    assert list(encoded_array) == [0, 1, 0]  # MAXWELL=0, DWORKIN=1
+
+
+def test_enum_encode_pandas_series_with_string_index():
+    """
+    Test that encoding a pandas Series with a string index works.
+
+    This specifically tests the pandas 3.0 case where StringDtype is used
+    and array[0] does label-based lookup instead of positional access.
+    """
+    import pandas as pd
+
+    class Sample(Enum):
+        MAXWELL = "maxwell"
+        DWORKIN = "dworkin"
+
+    # Create a Series with a string index (like pandas 3.0 StringDtype)
+    enum_items = [Sample.MAXWELL, Sample.DWORKIN, Sample.MAXWELL]
+    series = pd.Series(enum_items, index=["a", "b", "c"])
+
+    # This fails with KeyError: 0 when using array[0] instead of .iloc[0]
+    encoded_array = Sample.encode(series)
+
+    assert len(encoded_array) == 3
+    assert isinstance(encoded_array, EnumArray)
+    assert list(encoded_array) == [0, 1, 0]