From 8f218062cbc07e35ebc757504b88925b941a379b Mon Sep 17 00:00:00 2001 From: Kay Robbins Date: Tue, 16 Jun 2026 13:43:06 -0500 Subject: [PATCH 1/2] Fixed KeyMap issue with pandas 3.0.0 --- hed/tools/analysis/key_map.py | 6 +- spec_tests/hed-schemas | 2 +- spec_tests/hed-tests | 2 +- tests/tools/analysis/test_key_map.py | 132 +++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 3 deletions(-) diff --git a/hed/tools/analysis/key_map.py b/hed/tools/analysis/key_map.py index 0ca57c14..675dffac 100644 --- a/hed/tools/analysis/key_map.py +++ b/hed/tools/analysis/key_map.py @@ -146,7 +146,11 @@ def _remap(self, df): # Key series now contains row_number: hash for each row in the dataframe # Add a column containing the mapped index for each row - map_series = pd.Series(self.map_dict) # map_series is hash:row_index for each entry in the map_dict index + # Use explicit index/data to ensure pandas 3.0+ compatibility + map_series = pd.Series( + data=list(self.map_dict.values()), + index=list(self.map_dict.keys()) + ) # map_series is hash:row_index for each entry in the map_dict index key_values = key_series.map(map_series) # key_values is df_row_number:map_dict_index # e.g. a key_value entry of 0:79 means row 0 maps to row 79 in the map_dict diff --git a/spec_tests/hed-schemas b/spec_tests/hed-schemas index 73ecb358..0a4a7b61 160000 --- a/spec_tests/hed-schemas +++ b/spec_tests/hed-schemas @@ -1 +1 @@ -Subproject commit 73ecb358f61d7a470e1da5a73a10661db588071a +Subproject commit 0a4a7b613b82a9b09955bba9a22a36791d1de035 diff --git a/spec_tests/hed-tests b/spec_tests/hed-tests index 8c35600d..ea4237ca 160000 --- a/spec_tests/hed-tests +++ b/spec_tests/hed-tests @@ -1 +1 @@ -Subproject commit 8c35600d13ee65c65bd634490f2fb1f0ff0514f1 +Subproject commit ea4237cacb3bf2cad784e50808d370a9d0251c76 diff --git a/tests/tools/analysis/test_key_map.py b/tests/tools/analysis/test_key_map.py index 0beb4425..09e7e621 100644 --- a/tests/tools/analysis/test_key_map.py +++ b/tests/tools/analysis/test_key_map.py @@ -163,6 +163,138 @@ def test_update_map_not_unique(self): self.assertEqual(len(t_map.col_map.columns), 4, "update should produce correct number of columns") self.assertEqual(len(t_map.col_map), len(t_map.count_dict), "update should produce the correct number of rows") + def test_remap_numeric_keys_simple(self): + """Test remap with simple numeric keys (pandas 3.0 compatibility).""" + # Create a simple KeyMap with numeric keys + key_map = KeyMap(['col1'], ['result']) + + # Create a mapping DataFrame with numeric keys + map_df = pd.DataFrame({ + 'col1': [1, 2, 3], + 'result': ['one', 'two', 'three'] + }) + key_map.update(map_df) + + # Create test data with numeric values + test_df = pd.DataFrame({ + 'col1': [1, 2, 1, 3, 2] + }) + + # This should not raise ValueError on pandas 3.0.3 + df_result, missing = key_map.remap(test_df) + + self.assertEqual(len(df_result), 5, "remap should preserve number of rows") + self.assertEqual(df_result.iloc[0]['result'], 'one', "remap should map 1 to 'one'") + self.assertEqual(df_result.iloc[1]['result'], 'two', "remap should map 2 to 'two'") + self.assertEqual(df_result.iloc[2]['result'], 'one', "remap should map 1 to 'one'") + self.assertEqual(df_result.iloc[3]['result'], 'three', "remap should map 3 to 'three'") + self.assertFalse(missing, "remap should not have missing keys") + + def test_remap_numeric_keys_as_strings(self): + """Test remap with numeric keys stored as strings (common case).""" + key_map = KeyMap(['test_code'], ['test_label']) + + # Create a mapping where numeric keys are stored as strings + map_df = pd.DataFrame({ + 'test_code': ['1', '2', '3', '4'], + 'test_label': ['low', 'medium', 'high', 'critical'] + }) + key_map.update(map_df) + + # Create test data with numeric values as strings + test_df = pd.DataFrame({ + 'test_code': ['1', '2', '3', '1', '4', '2'] + }) + + df_result, missing = key_map.remap(test_df) + + self.assertEqual(len(df_result), 6, "remap should preserve number of rows") + self.assertEqual(df_result.iloc[0]['test_label'], 'low') + self.assertEqual(df_result.iloc[1]['test_label'], 'medium') + self.assertEqual(df_result.iloc[2]['test_label'], 'high') + self.assertEqual(df_result.iloc[4]['test_label'], 'critical') + self.assertFalse(missing, "remap should not have missing keys") + + def test_remap_numeric_keys_with_na(self): + """Test remap with numeric keys including n/a values.""" + key_map = KeyMap(['value'], ['category']) + + # Create mapping with numeric and string keys + map_df = pd.DataFrame({ + 'value': ['1', '2', '3'], + 'category': ['cat_a', 'cat_b', 'cat_c'] + }) + key_map.update(map_df) + + # Create test data with n/a values + test_df = pd.DataFrame({ + 'value': ['1', '2', 'n/a', '3', 'n/a'] + }) + + df_result, missing = key_map.remap(test_df) + + self.assertEqual(len(df_result), 5, "remap should preserve number of rows") + self.assertEqual(df_result.iloc[0]['category'], 'cat_a') + self.assertEqual(df_result.iloc[2]['category'], 'n/a', "remap should map n/a to n/a") + self.assertEqual(df_result.iloc[3]['category'], 'cat_c') + + def test_remap_multiple_numeric_keys_cascade(self): + """Test remap with multiple numeric keys cascading (the pandas 3.0.3 failing case).""" + # This is the exact scenario from pandas_fail.md that was failing + key_map = KeyMap(['test', 'response_accuracy'], ['result']) + + # Create mapping for multiple key combination + map_df = pd.DataFrame({ + 'test': ['1', '2'], + 'response_accuracy': ['correct', 'correct'], + 'result': ['correct_left', 'correct_right'] + }) + key_map.update(map_df) + + # Create test data matching the failure scenario + test_df = pd.DataFrame({ + 'test': ['1', '2', 'n/a', '3', '4', '5'], + 'response_accuracy': ['correct', 'correct', 'correct', 'n/a', 'correct', 'correct'] + }) + + # This was the failing line: map_series = pd.Series(self.map_dict) + # Should work now with explicit index/data parameters + df_result, missing = key_map.remap(test_df) + + self.assertEqual(len(df_result), 6, "remap should preserve number of rows") + self.assertEqual(df_result.iloc[0]['result'], 'correct_left') + self.assertEqual(df_result.iloc[1]['result'], 'correct_right') + # Rows with missing key combinations should get 'n/a' + self.assertEqual(df_result.iloc[2]['result'], 'n/a') + self.assertEqual(df_result.iloc[3]['result'], 'n/a') + + def test_remap_large_numeric_key_dict(self): + """Test remap with a large dictionary of numeric keys to ensure Series construction works.""" + key_map = KeyMap(['event_id'], ['event_name']) + + # Create a large mapping with numeric event IDs + size = 100 + map_data = { + 'event_id': [str(i) for i in range(size)], + 'event_name': [f'event_{i}' for i in range(size)] + } + map_df = pd.DataFrame(map_data) + key_map.update(map_df) + + # Create test data with random event IDs + test_data = { + 'event_id': [str(i % 50) for i in range(200)] # Use first 50 event IDs + } + test_df = pd.DataFrame(test_data) + + df_result, missing = key_map.remap(test_df) + + self.assertEqual(len(df_result), 200, "remap should preserve number of rows") + # Verify some mappings + self.assertEqual(df_result.iloc[0]['event_name'], 'event_0') + self.assertEqual(df_result.iloc[50]['event_name'], 'event_0') # 50 % 50 = 0 + self.assertEqual(df_result.iloc[99]['event_name'], 'event_49') # 99 % 50 = 49 + if __name__ == "__main__": unittest.main() From ce981b4753e3ed405b98a7ee6ccdf65cff2f593e Mon Sep 17 00:00:00 2001 From: Kay Robbins Date: Tue, 16 Jun 2026 14:37:44 -0500 Subject: [PATCH 2/2] Addressed copilot comments --- hed/tools/analysis/key_map.py | 3 +- tests/tools/analysis/test_key_map.py | 150 ++++++++++++--------------- 2 files changed, 70 insertions(+), 83 deletions(-) diff --git a/hed/tools/analysis/key_map.py b/hed/tools/analysis/key_map.py index 675dffac..51b925a9 100644 --- a/hed/tools/analysis/key_map.py +++ b/hed/tools/analysis/key_map.py @@ -148,8 +148,7 @@ def _remap(self, df): # Add a column containing the mapped index for each row # Use explicit index/data to ensure pandas 3.0+ compatibility map_series = pd.Series( - data=list(self.map_dict.values()), - index=list(self.map_dict.keys()) + data=list(self.map_dict.values()), index=list(self.map_dict.keys()) ) # map_series is hash:row_index for each entry in the map_dict index key_values = key_series.map(map_series) # key_values is df_row_number:map_dict_index # e.g. a key_value entry of 0:79 means row 0 maps to row 79 in the map_dict diff --git a/tests/tools/analysis/test_key_map.py b/tests/tools/analysis/test_key_map.py index 09e7e621..6f8a52f1 100644 --- a/tests/tools/analysis/test_key_map.py +++ b/tests/tools/analysis/test_key_map.py @@ -166,134 +166,122 @@ def test_update_map_not_unique(self): def test_remap_numeric_keys_simple(self): """Test remap with simple numeric keys (pandas 3.0 compatibility).""" # Create a simple KeyMap with numeric keys - key_map = KeyMap(['col1'], ['result']) - + key_map = KeyMap(["col1"], ["result"]) + # Create a mapping DataFrame with numeric keys - map_df = pd.DataFrame({ - 'col1': [1, 2, 3], - 'result': ['one', 'two', 'three'] - }) + map_df = pd.DataFrame({"col1": [1, 2, 3], "result": ["one", "two", "three"]}) key_map.update(map_df) - + # Create test data with numeric values - test_df = pd.DataFrame({ - 'col1': [1, 2, 1, 3, 2] - }) - + test_df = pd.DataFrame({"col1": [1, 2, 1, 3, 2]}) + # This should not raise ValueError on pandas 3.0.3 df_result, missing = key_map.remap(test_df) - + self.assertEqual(len(df_result), 5, "remap should preserve number of rows") - self.assertEqual(df_result.iloc[0]['result'], 'one', "remap should map 1 to 'one'") - self.assertEqual(df_result.iloc[1]['result'], 'two', "remap should map 2 to 'two'") - self.assertEqual(df_result.iloc[2]['result'], 'one', "remap should map 1 to 'one'") - self.assertEqual(df_result.iloc[3]['result'], 'three', "remap should map 3 to 'three'") + self.assertEqual(df_result.iloc[0]["result"], "one", "remap should map 1 to 'one'") + self.assertEqual(df_result.iloc[1]["result"], "two", "remap should map 2 to 'two'") + self.assertEqual(df_result.iloc[2]["result"], "one", "remap should map 1 to 'one'") + self.assertEqual(df_result.iloc[3]["result"], "three", "remap should map 3 to 'three'") self.assertFalse(missing, "remap should not have missing keys") def test_remap_numeric_keys_as_strings(self): """Test remap with numeric keys stored as strings (common case).""" - key_map = KeyMap(['test_code'], ['test_label']) - + key_map = KeyMap(["test_code"], ["test_label"]) + # Create a mapping where numeric keys are stored as strings - map_df = pd.DataFrame({ - 'test_code': ['1', '2', '3', '4'], - 'test_label': ['low', 'medium', 'high', 'critical'] - }) + map_df = pd.DataFrame({"test_code": ["1", "2", "3", "4"], "test_label": ["low", "medium", "high", "critical"]}) key_map.update(map_df) - + # Create test data with numeric values as strings - test_df = pd.DataFrame({ - 'test_code': ['1', '2', '3', '1', '4', '2'] - }) - + test_df = pd.DataFrame({"test_code": ["1", "2", "3", "1", "4", "2"]}) + df_result, missing = key_map.remap(test_df) - + self.assertEqual(len(df_result), 6, "remap should preserve number of rows") - self.assertEqual(df_result.iloc[0]['test_label'], 'low') - self.assertEqual(df_result.iloc[1]['test_label'], 'medium') - self.assertEqual(df_result.iloc[2]['test_label'], 'high') - self.assertEqual(df_result.iloc[4]['test_label'], 'critical') + self.assertEqual(df_result.iloc[0]["test_label"], "low") + self.assertEqual(df_result.iloc[1]["test_label"], "medium") + self.assertEqual(df_result.iloc[2]["test_label"], "high") + self.assertEqual(df_result.iloc[4]["test_label"], "critical") self.assertFalse(missing, "remap should not have missing keys") def test_remap_numeric_keys_with_na(self): """Test remap with numeric keys including n/a values.""" - key_map = KeyMap(['value'], ['category']) - + key_map = KeyMap(["value"], ["category"]) + # Create mapping with numeric and string keys - map_df = pd.DataFrame({ - 'value': ['1', '2', '3'], - 'category': ['cat_a', 'cat_b', 'cat_c'] - }) + map_df = pd.DataFrame({"value": ["1", "2", "3"], "category": ["cat_a", "cat_b", "cat_c"]}) key_map.update(map_df) - + # Create test data with n/a values - test_df = pd.DataFrame({ - 'value': ['1', '2', 'n/a', '3', 'n/a'] - }) - + test_df = pd.DataFrame({"value": ["1", "2", "n/a", "3", "n/a"]}) + df_result, missing = key_map.remap(test_df) - + self.assertEqual(len(df_result), 5, "remap should preserve number of rows") - self.assertEqual(df_result.iloc[0]['category'], 'cat_a') - self.assertEqual(df_result.iloc[2]['category'], 'n/a', "remap should map n/a to n/a") - self.assertEqual(df_result.iloc[3]['category'], 'cat_c') + self.assertEqual(df_result.iloc[0]["category"], "cat_a") + self.assertEqual(df_result.iloc[2]["category"], "n/a", "remap should map n/a to n/a") + self.assertEqual(df_result.iloc[3]["category"], "cat_c") + self.assertEqual(missing, [2, 4], "remap should report rows with unmapped n/a key") def test_remap_multiple_numeric_keys_cascade(self): """Test remap with multiple numeric keys cascading (the pandas 3.0.3 failing case).""" - # This is the exact scenario from pandas_fail.md that was failing - key_map = KeyMap(['test', 'response_accuracy'], ['result']) - + # This is the exact scenario from issue #1329 that was failing + key_map = KeyMap(["test", "response_accuracy"], ["result"]) + # Create mapping for multiple key combination - map_df = pd.DataFrame({ - 'test': ['1', '2'], - 'response_accuracy': ['correct', 'correct'], - 'result': ['correct_left', 'correct_right'] - }) + map_df = pd.DataFrame( + { + "test": ["1", "2"], + "response_accuracy": ["correct", "correct"], + "result": ["correct_left", "correct_right"], + } + ) key_map.update(map_df) - + # Create test data matching the failure scenario - test_df = pd.DataFrame({ - 'test': ['1', '2', 'n/a', '3', '4', '5'], - 'response_accuracy': ['correct', 'correct', 'correct', 'n/a', 'correct', 'correct'] - }) - + test_df = pd.DataFrame( + { + "test": ["1", "2", "n/a", "3", "4", "5"], + "response_accuracy": ["correct", "correct", "correct", "n/a", "correct", "correct"], + } + ) + # This was the failing line: map_series = pd.Series(self.map_dict) # Should work now with explicit index/data parameters df_result, missing = key_map.remap(test_df) - + self.assertEqual(len(df_result), 6, "remap should preserve number of rows") - self.assertEqual(df_result.iloc[0]['result'], 'correct_left') - self.assertEqual(df_result.iloc[1]['result'], 'correct_right') - # Rows with missing key combinations should get 'n/a' - self.assertEqual(df_result.iloc[2]['result'], 'n/a') - self.assertEqual(df_result.iloc[3]['result'], 'n/a') + self.assertEqual(df_result.iloc[0]["result"], "correct_left") + self.assertEqual(df_result.iloc[1]["result"], "correct_right") + # Rows with missing key combinations should get n/a + self.assertEqual(df_result.iloc[2]["result"], "n/a") + self.assertEqual(df_result.iloc[3]["result"], "n/a") + self.assertEqual(missing, [2, 3, 4, 5], "remap should report rows with unmapped key combinations") def test_remap_large_numeric_key_dict(self): """Test remap with a large dictionary of numeric keys to ensure Series construction works.""" - key_map = KeyMap(['event_id'], ['event_name']) - + key_map = KeyMap(["event_id"], ["event_name"]) + # Create a large mapping with numeric event IDs size = 100 - map_data = { - 'event_id': [str(i) for i in range(size)], - 'event_name': [f'event_{i}' for i in range(size)] - } + map_data = {"event_id": [str(i) for i in range(size)], "event_name": [f"event_{i}" for i in range(size)]} map_df = pd.DataFrame(map_data) key_map.update(map_df) - + # Create test data with random event IDs test_data = { - 'event_id': [str(i % 50) for i in range(200)] # Use first 50 event IDs + "event_id": [str(i % 50) for i in range(200)] # Use first 50 event IDs } test_df = pd.DataFrame(test_data) - - df_result, missing = key_map.remap(test_df) - + + df_result, _missing = key_map.remap(test_df) + self.assertEqual(len(df_result), 200, "remap should preserve number of rows") # Verify some mappings - self.assertEqual(df_result.iloc[0]['event_name'], 'event_0') - self.assertEqual(df_result.iloc[50]['event_name'], 'event_0') # 50 % 50 = 0 - self.assertEqual(df_result.iloc[99]['event_name'], 'event_49') # 99 % 50 = 49 + self.assertEqual(df_result.iloc[0]["event_name"], "event_0") + self.assertEqual(df_result.iloc[50]["event_name"], "event_0") # 50 % 50 = 0 + self.assertEqual(df_result.iloc[99]["event_name"], "event_49") # 99 % 50 = 49 if __name__ == "__main__":