Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915
try:
# checks if the strings which should be the class labels
# can be encoded into integers
pd.factorize(type_)[0]
pd.factorize(np.array(type_))[0]
except ValueError as e:
raise ValueError(
"Categorical data needs to be numeric when using sparse ARFF."
Expand Down
15 changes: 9 additions & 6 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,21 +102,24 @@ def test_get_data_pandas(self):
assert isinstance(data, pd.DataFrame)
assert data.shape[1] == len(self.titanic.features)
assert data.shape[0] == 1309
# Dynamically detect what this version of Pandas calls string columns.
str_dtype = data["name"].dtype.name

Comment on lines +105 to +107
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't that cheating? Why would the check ever fail for the "name" column if we are dynamically extracting the dtype from that same column? Is it to make sure it works with pandas<3.0.0?

It would make sense to simply replace "object" with "str" in col_dtype if we are setting pandas>=3.0.0 in the dependencies.

CC: @fkiraly, are we bounding pandas to this version in pyproject.toml?

Copy link
Contributor Author

@satvshr satvshr Jan 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't that cheating?

It is definitely cheating! I thought it was better to get a quick fix out and then fine-tune the fixes later on. It was between this and bounding pandas, and for me this was the better/less intense option of the two.

col_dtype = {
"pclass": "uint8",
"survived": "category",
"name": "object",
"name": str_dtype,
"sex": "category",
"age": "float64",
"sibsp": "uint8",
"parch": "uint8",
"ticket": "object",
"ticket": str_dtype,
"fare": "float64",
"cabin": "object",
"cabin": str_dtype,
"embarked": "category",
"boat": "object",
"boat": str_dtype,
"body": "float64",
"home.dest": "object",
"home.dest": str_dtype,
}
for col_name in data.columns:
assert data[col_name].dtype.name == col_dtype[col_name]
Expand Down Expand Up @@ -357,7 +360,7 @@ def setUp(self):
def test_get_sparse_dataset_dataframe_with_target(self):
X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
assert isinstance(X, pd.DataFrame)
assert isinstance(X.dtypes[0], pd.SparseDtype)
assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
assert X.shape == (600, 20000)

assert isinstance(y, pd.Series)
Expand Down
3 changes: 2 additions & 1 deletion tests/test_flows/test_flow_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,9 @@ def _check_flow(self, flow):
assert isinstance(flow["full_name"], str)
assert isinstance(flow["version"], str)
# There are some runs on openml.org that can have an empty external version
ext_version = flow["external_version"]
ext_version_str_or_none = (
isinstance(flow["external_version"], str) or flow["external_version"] is None
isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
)
assert ext_version_str_or_none

Expand Down