From 7d4d62e73bf2685be742282a7ae143f52fbd4afa Mon Sep 17 00:00:00 2001 From: Satvik Mishra <112589278+satvshr@users.noreply.github.com> Date: Fri, 23 Jan 2026 01:16:26 +0530 Subject: [PATCH 1/3] Update dataset dtype and flow external_version test expectations --- tests/test_datasets/test_dataset.py | 10 +++++----- tests/test_flows/test_flow_functions.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 6dc4c7d5d..60727a4b8 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -105,18 +105,18 @@ def test_get_data_pandas(self): col_dtype = { "pclass": "uint8", "survived": "category", - "name": "object", + "name": "str", "sex": "category", "age": "float64", "sibsp": "uint8", "parch": "uint8", - "ticket": "object", + "ticket": "str", "fare": "float64", - "cabin": "object", + "cabin": "str", "embarked": "category", - "boat": "object", + "boat": "str", "body": "float64", - "home.dest": "object", + "home.dest": "str", } for col_name in data.columns: assert data[col_name].dtype.name == col_dtype[col_name] diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 2339b27c8..0ecec8964 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -41,8 +41,9 @@ def _check_flow(self, flow): assert isinstance(flow["full_name"], str) assert isinstance(flow["version"], str) # There are some runs on openml.org that can have an empty external version + ext_version = flow["external_version"] ext_version_str_or_none = ( - isinstance(flow["external_version"], str) or flow["external_version"] is None + isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version) ) assert ext_version_str_or_none From ab8be1a2dc978509cd230ceee1f724b423721900 Mon Sep 17 00:00:00 2001 From: Satvik Mishra <112589278+satvshr@users.noreply.github.com> Date: Fri, 23 Jan 2026 01:28:47 +0530 
Subject: [PATCH 2/3] Pass ndarray to pd.factorize; use .iloc for positional dtypes access --- openml/datasets/dataset.py | 2 +- tests/test_datasets/test_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index a77fd1040..d9eee278d 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -488,7 +488,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915 try: # checks if the strings which should be the class labels # can be encoded into integers - pd.factorize(type_)[0] + pd.factorize(np.array(type_))[0] except ValueError as e: raise ValueError( "Categorical data needs to be numeric when using sparse ARFF." diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 60727a4b8..7573c0774 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -357,7 +357,7 @@ def setUp(self): def test_get_sparse_dataset_dataframe_with_target(self): X, y, _, attribute_names = self.sparse_dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) - assert isinstance(X.dtypes[0], pd.SparseDtype) + assert isinstance(X.dtypes.iloc[0], pd.SparseDtype) assert X.shape == (600, 20000) assert isinstance(y, pd.Series) From 4cca09e9edce87d406c33c8f1dd4faa35869d241 Mon Sep 17 00:00:00 2001 From: Satvik Mishra <112589278+satvshr@users.noreply.github.com> Date: Fri, 23 Jan 2026 01:36:51 +0530 Subject: [PATCH 3/3] Derive string dtype name from data in test_get_data_pandas --- tests/test_datasets/test_dataset.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 7573c0774..b13bac30b 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -102,21 +102,24 @@ def test_get_data_pandas(self): assert isinstance(data, pd.DataFrame) assert data.shape[1] == len(self.titanic.features) assert data.shape[0] == 1309 + # Dynamically detect what this version of Pandas calls 
string columns. + str_dtype = data["name"].dtype.name + col_dtype = { "pclass": "uint8", "survived": "category", - "name": "str", + "name": str_dtype, "sex": "category", "age": "float64", "sibsp": "uint8", "parch": "uint8", - "ticket": "str", + "ticket": str_dtype, "fare": "float64", - "cabin": "str", + "cabin": str_dtype, "embarked": "category", - "boat": "str", + "boat": str_dtype, "body": "float64", - "home.dest": "str", + "home.dest": str_dtype, } for col_name in data.columns: assert data[col_name].dtype.name == col_dtype[col_name]