From 7d4d62e73bf2685be742282a7ae143f52fbd4afa Mon Sep 17 00:00:00 2001
From: Satvik Mishra <112589278+satvshr@users.noreply.github.com>
Date: Fri, 23 Jan 2026 01:16:26 +0530
Subject: [PATCH 1/3] Fix flow external_version check and string dtype expectations in tests

---
 tests/test_datasets/test_dataset.py     | 10 +++++-----
 tests/test_flows/test_flow_functions.py |  3 ++-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index 6dc4c7d5d..60727a4b8 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -105,18 +105,18 @@ def test_get_data_pandas(self):
         col_dtype = {
             "pclass": "uint8",
             "survived": "category",
-            "name": "object",
+            "name": "str",
             "sex": "category",
             "age": "float64",
             "sibsp": "uint8",
             "parch": "uint8",
-            "ticket": "object",
+            "ticket": "str",
             "fare": "float64",
-            "cabin": "object",
+            "cabin": "str",
             "embarked": "category",
-            "boat": "object",
+            "boat": "str",
             "body": "float64",
-            "home.dest": "object",
+            "home.dest": "str",
         }
         for col_name in data.columns:
             assert data[col_name].dtype.name == col_dtype[col_name]
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 2339b27c8..0ecec8964 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -41,8 +41,9 @@ def _check_flow(self, flow):
         assert isinstance(flow["full_name"], str)
         assert isinstance(flow["version"], str)
         # There are some runs on openml.org that can have an empty external version
+        ext_version = flow["external_version"]
         ext_version_str_or_none = (
-            isinstance(flow["external_version"], str) or flow["external_version"] is None
+            isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
         )
         assert ext_version_str_or_none
 

From ab8be1a2dc978509cd230ceee1f724b423721900 Mon Sep 17 00:00:00 2001
From: Satvik Mishra <112589278+satvshr@users.noreply.github.com>
Date: Fri, 23 Jan 2026 01:28:47 +0530
Subject: [PATCH 2/3] Fix pd.factorize input in sparse ARFF parsing and use .iloc for dtypes in sparse test

---
 openml/datasets/dataset.py          | 2 +-
 tests/test_datasets/test_dataset.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index a77fd1040..d9eee278d 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -488,7 +488,7 @@ def _parse_data_from_arff(  # noqa: C901, PLR0912, PLR0915
                 try:
                     # checks if the strings which should be the class labels
                     # can be encoded into integers
-                    pd.factorize(type_)[0]
+                    pd.factorize(np.array(type_))[0]
                 except ValueError as e:
                     raise ValueError(
                         "Categorical data needs to be numeric when using sparse ARFF."
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index 60727a4b8..7573c0774 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -357,7 +357,7 @@ def setUp(self):
     def test_get_sparse_dataset_dataframe_with_target(self):
         X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
         assert isinstance(X, pd.DataFrame)
-        assert isinstance(X.dtypes[0], pd.SparseDtype)
+        assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
         assert X.shape == (600, 20000)
 
         assert isinstance(y, pd.Series)

From 4cca09e9edce87d406c33c8f1dd4faa35869d241 Mon Sep 17 00:00:00 2001
From: Satvik Mishra <112589278+satvshr@users.noreply.github.com>
Date: Fri, 23 Jan 2026 01:36:51 +0530
Subject: [PATCH 3/3] Detect string dtype name dynamically in test_get_data_pandas

---
 tests/test_datasets/test_dataset.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index 7573c0774..b13bac30b 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -102,21 +102,24 @@ def test_get_data_pandas(self):
         assert isinstance(data, pd.DataFrame)
         assert data.shape[1] == len(self.titanic.features)
         assert data.shape[0] == 1309
+        # Dynamically detect what this version of Pandas calls string columns.
+        str_dtype = data["name"].dtype.name
+
         col_dtype = {
             "pclass": "uint8",
             "survived": "category",
-            "name": "str",
+            "name": str_dtype,
             "sex": "category",
             "age": "float64",
             "sibsp": "uint8",
             "parch": "uint8",
-            "ticket": "str",
+            "ticket": str_dtype,
             "fare": "float64",
-            "cabin": "str",
+            "cabin": str_dtype,
             "embarked": "category",
-            "boat": "str",
+            "boat": str_dtype,
             "body": "float64",
-            "home.dest": "str",
+            "home.dest": str_dtype,
         }
         for col_name in data.columns:
             assert data[col_name].dtype.name == col_dtype[col_name]