Skip to content

Commit f4d2fe3

Browse files
MaxGhenis and claude committed
Fix pandas 3.0 KeyError in Enum.encode() for Series with non-integer index
In pandas 3.0, string columns use StringDtype by default. When a Series has a string index, array[0] does label-based lookup (looking for key "0") instead of positional access, causing KeyError. The fix uses .iloc[0] for pandas Series to ensure positional access.

Fixes #427

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent b9f6d03 commit f4d2fe3

4 files changed

Lines changed: 74 additions & 3 deletions

File tree

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
- bump: patch
2+
changes:
3+
fixed:
4+
- Fixed pandas 3.0 compatibility in Enum.encode() by using positional access (.iloc[0]) for pandas Series instead of label-based access (array[0]), which fails with KeyError when Series has a non-integer index (fixes #427)

policyengine_core/enums/enum.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,14 @@ def encode(cls, array: Union[EnumArray, np.ndarray]) -> EnumArray:
6666
return array
6767

6868
# Handle Enum item arrays by extracting indices directly
69-
if len(array) > 0 and isinstance(array[0], Enum):
69+
# Use .iloc[0] for pandas Series to avoid KeyError with non-integer index
70+
# (pandas 3.0 uses StringDtype by default, causing array[0] to do
71+
# label-based lookup instead of positional access)
72+
if len(array) > 0:
73+
first_elem = array.iloc[0] if hasattr(array, "iloc") else array[0]
74+
else:
75+
first_elem = None
76+
if first_elem is not None and isinstance(first_elem, Enum):
7077
indices = np.array(
7178
[item.index for item in array], dtype=ENUM_ARRAY_DTYPE
7279
)

policyengine_core/populations/population.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -78,13 +78,17 @@ def check_period_validity(
7878
if period is None:
7979
stack = traceback.extract_stack()
8080
filename, line_number, function_name, line_of_code = stack[-3]
81-
raise ValueError("""
81+
raise ValueError(
82+
"""
8283
You requested computation of variable "{}", but you did not specify on which period in "{}:{}":
8384
{}
8485
When you request the computation of a variable within a formula, you must always specify the period as the second parameter. The convention is to call this parameter "period". For example:
8586
computed_salary = person('salary', period).
8687
See more information at <https://openfisca.org/doc/coding-the-legislation/35_periods.html#periods-in-variable-definition>.
87-
""".format(variable_name, filename, line_number, line_of_code))
88+
""".format(
89+
variable_name, filename, line_number, line_of_code
90+
)
91+
)
8892

8993
def __call__(
9094
self,

tests/core/enums/test_enum.py

Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,3 +94,59 @@ class Sample(Enum):
9494

9595
# Empty string should be in the error message (represented as '')
9696
assert "''" in str(exc_info.value) or '""' in str(exc_info.value)
97+
98+
99+
def test_enum_encode_pandas_series_with_enum_items():
100+
"""
101+
Test that encoding a pandas Series containing Enum items works.
102+
103+
In pandas 3.0, Series may have StringDtype index. The encode() method
104+
uses array[0] to check if items are Enum instances, but this fails
105+
with KeyError when the Series has a non-integer index.
106+
107+
This test verifies the fix for GitHub issue #427.
108+
"""
109+
import pandas as pd
110+
111+
class Sample(Enum):
112+
MAXWELL = "maxwell"
113+
DWORKIN = "dworkin"
114+
115+
# Create a pandas Series with Enum items (simulates what happens in
116+
# policyengine-us county variable with pandas 3.0)
117+
enum_items = [Sample.MAXWELL, Sample.DWORKIN, Sample.MAXWELL]
118+
series = pd.Series(enum_items)
119+
120+
# This should work but fails with KeyError: 0 before the fix
121+
encoded_array = Sample.encode(series)
122+
123+
assert len(encoded_array) == 3
124+
assert isinstance(encoded_array, EnumArray)
125+
assert encoded_array.dtype.kind == "i"
126+
# Verify correct encoding
127+
assert list(encoded_array) == [0, 1, 0] # MAXWELL=0, DWORKIN=1
128+
129+
130+
def test_enum_encode_pandas_series_with_string_index():
131+
"""
132+
Test that encoding a pandas Series with a string index works.
133+
134+
This specifically tests the pandas 3.0 case where StringDtype is used
135+
and array[0] does label-based lookup instead of positional access.
136+
"""
137+
import pandas as pd
138+
139+
class Sample(Enum):
140+
MAXWELL = "maxwell"
141+
DWORKIN = "dworkin"
142+
143+
# Create a Series with a string index (like pandas 3.0 StringDtype)
144+
enum_items = [Sample.MAXWELL, Sample.DWORKIN, Sample.MAXWELL]
145+
series = pd.Series(enum_items, index=["a", "b", "c"])
146+
147+
# This fails with KeyError: 0 when using array[0] instead of .iloc[0]
148+
encoded_array = Sample.encode(series)
149+
150+
assert len(encoded_array) == 3
151+
assert isinstance(encoded_array, EnumArray)
152+
assert list(encoded_array) == [0, 1, 0]

0 commit comments

Comments
 (0)