diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d65cc3796..b10721f55 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -22,7 +22,7 @@ concurrency: jobs: test: - name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }},sk-only:${{ matrix.sklearn-only }}) + name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }}${{ matrix.pandas-version != '' && format(',pd:{0}', matrix.pandas-version) || '' }},sk-only:${{ matrix.sklearn-only }}) runs-on: ${{ matrix.os }} strategy: @@ -64,6 +64,14 @@ jobs: sklearn-only: "false" code-cov: true + # Pandas 2 run + - os: ubuntu-latest + python-version: "3.12" + scikit-learn: "1.5.*" + sklearn-only: "false" + pandas-version: "2.*" + code-cov: false + steps: - uses: actions/checkout@v6 with: @@ -74,10 +82,16 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install test dependencies and scikit-learn + - name: Install test dependencies, scikit-learn, and optional pandas + shell: bash run: | python -m pip install --upgrade pip pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }} + + if [ "${{ matrix.pandas-version }}" != "" ]; then + echo "Installing specific pandas version: ${{ matrix.pandas-version }}" + pip install "pandas==${{ matrix.pandas-version }}" + fi - name: Store repository status id: status-before diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index a77fd1040..d9eee278d 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -488,7 +488,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915 try: # checks if the strings which should be the class labels # can be encoded into integers - pd.factorize(type_)[0] + pd.factorize(np.array(type_))[0] except ValueError as e: raise ValueError( "Categorical data needs to be numeric when using sparse ARFF." diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 6dc4c7d5d..b13bac30b 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -102,21 +102,24 @@ def test_get_data_pandas(self): assert isinstance(data, pd.DataFrame) assert data.shape[1] == len(self.titanic.features) assert data.shape[0] == 1309 + # Dynamically detect what this version of Pandas calls string columns. + str_dtype = data["name"].dtype.name + col_dtype = { "pclass": "uint8", "survived": "category", - "name": "object", + "name": str_dtype, "sex": "category", "age": "float64", "sibsp": "uint8", "parch": "uint8", - "ticket": "object", + "ticket": str_dtype, "fare": "float64", - "cabin": "object", + "cabin": str_dtype, "embarked": "category", - "boat": "object", + "boat": str_dtype, "body": "float64", - "home.dest": "object", + "home.dest": str_dtype, } for col_name in data.columns: assert data[col_name].dtype.name == col_dtype[col_name] @@ -357,7 +360,7 @@ def setUp(self): def test_get_sparse_dataset_dataframe_with_target(self): X, y, _, attribute_names = self.sparse_dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) - assert isinstance(X.dtypes[0], pd.SparseDtype) + assert isinstance(X.dtypes.iloc[0], pd.SparseDtype) assert X.shape == (600, 20000) assert isinstance(y, pd.Series) diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 875ba8517..5aa99cd62 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -41,8 +41,9 @@ def _check_flow(self, flow): assert isinstance(flow["full_name"], str) assert isinstance(flow["version"], str) # There are some runs on openml.org that can have an empty external version + ext_version = flow["external_version"] ext_version_str_or_none = ( - isinstance(flow["external_version"], str) or flow["external_version"] is None + isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version) ) assert ext_version_str_or_none