[MNT] Use new test database image (#236)

PGijsbers · web-flow · commit 294ca47b856e · 2026-02-12T10:20:49.000+02:00
* Update to newer database image
* Update validation to work with new user keys (temporarily)
* Update constant with new unprocessed datasets
* Update test users with their new IDs
* Update for new ownership of studies
* Ignore ontology of features for now as it is not in Python API
* Add back in key for private dataset owner
* Use private dataset owner
* Update constants to match new test database state
* Database setup script not needed with new image
* Allow more lenient api key through configuration
* Apply suggestion from @PGijsbers
* Wrap line to adhere to line limit
diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -1,7 +1,7 @@
 services:
   database:
     profiles: ["python", "php", "all"]
-    image: "openml/test-database:20240105"
+    image: "openml/test-database:v0.1.20260204"
     container_name: "openml-test-database"
     environment:
       MYSQL_ROOT_PASSWORD: ok
@@ -15,17 +15,6 @@ services:
       interval: 5s
       retries: 10
 
-  database-setup:
-    profiles: ["python", "php", "all"]
-    image: mysql
-    container_name: "openml-test-database-setup"
-    volumes:
-      - ./docker/database/update.sh:/database-update.sh
-    command: /bin/sh -c "/database-update.sh"
-    depends_on:
-      database:
-        condition: service_healthy
-
   docs:
     profiles: ["all"]
     build:
diff --git a/src/config.toml b/src/config.toml
@@ -1,6 +1,9 @@
 arff_base_url="https://test.openml.org"
 minio_base_url="https://openml1.win.tue.nl"
 
+[development]
+allow_test_api_keys=true
+
 [fastapi]
 root_path=""
 
diff --git a/src/database/users.py b/src/database/users.py
@@ -5,8 +5,19 @@
 from pydantic import StringConstraints
 from sqlalchemy import Connection, text
 
+from config import load_configuration
+
 # Enforces str is 32 hexadecimal characters, does not check validity.
-APIKey = Annotated[str, StringConstraints(pattern=r"^[0-9a-fA-F]{32}$")]
+# If `allow_test_api_keys` is set, the key may also be one of `normaluser`,
+# `normaluser2`, or `abc` (admin).
+api_key_pattern = r"^[0-9a-fA-F]{32}$"
+if load_configuration()["development"].get("allow_test_api_keys"):
+    api_key_pattern = r"^([0-9a-fA-F]{32}|normaluser|normaluser2|abc)$"
+
+APIKey = Annotated[
+    str,
+    StringConstraints(pattern=api_key_pattern),
+]
 
 
 class UserGroup(IntEnum):
diff --git a/tests/constants.py b/tests/constants.py
@@ -1,7 +1,7 @@
 PRIVATE_DATASET_ID = {130}
-IN_PREPARATION_ID = {33}
+IN_PREPARATION_ID = {33, 161, 162, 163}
 DEACTIVATED_DATASETS = {131}
-DATASETS = set(range(1, 132))
+DATASETS = set(range(1, 132)) | {161, 162, 163}
 
 NUMBER_OF_DATASETS = len(DATASETS)
 NUMBER_OF_DEACTIVATED_DATASETS = len(DEACTIVATED_DATASETS)
diff --git a/tests/routers/openml/datasets_list_datasets_test.py b/tests/routers/openml/datasets_list_datasets_test.py
@@ -47,7 +47,7 @@ def test_list_filter_active(status: str, amount: int, py_api: TestClient) -> Non
     ("api_key", "amount"),
     [
         (ApiKey.ADMIN, constants.NUMBER_OF_DATASETS),
-        (ApiKey.OWNER_USER, constants.NUMBER_OF_DATASETS),
+        (ApiKey.DATASET_130_OWNER, constants.NUMBER_OF_DATASETS),
         (ApiKey.SOME_USER, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS),
         (None, constants.NUMBER_OF_DATASETS - constants.NUMBER_OF_PRIVATE_DATASETS),
     ],
@@ -91,13 +91,15 @@ def test_list_data_name_absent(name: str, py_api: TestClient) -> None:
 
 
 @pytest.mark.parametrize("limit", [None, 5, 10, 200])
-@pytest.mark.parametrize("offset", [None, 0, 5, 129, 130, 200])
+@pytest.mark.parametrize("offset", [None, 0, 5, 129, 140, 200])
 def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClient) -> None:
+    # dataset ids are contiguous until 131, then there are 161, 162, and 163.
+    extra_datasets = [161, 162, 163]
     all_ids = [
         did
-        for did in range(1, 1 + constants.NUMBER_OF_DATASETS)
+        for did in range(1, 1 + constants.NUMBER_OF_DATASETS - len(extra_datasets))
         if did not in constants.PRIVATE_DATASET_ID
-    ]
+    ] + extra_datasets
 
     start = 0 if offset is None else offset
     end = start + (100 if limit is None else limit)
@@ -108,7 +110,7 @@ def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClie
     filters = {"status": "all", "pagination": offset_body | limit_body}
     response = py_api.post("/datasets/list", json=filters)
 
-    if offset in [130, 200]:
+    if offset in [140, 200]:
         _assert_empty_result(response)
         return
 
@@ -119,7 +121,7 @@ def test_list_pagination(limit: int | None, offset: int | None, py_api: TestClie
 
 @pytest.mark.parametrize(
     ("version", "count"),
-    [(1, 100), (2, 6), (5, 1)],
+    [(1, 100), (2, 7), (5, 1)],
 )
 def test_list_data_version(version: int, count: int, py_api: TestClient) -> None:
     response = py_api.post(
@@ -133,16 +135,17 @@ def test_list_data_version(version: int, count: int, py_api: TestClient) -> None
 
 
 def test_list_data_version_no_result(py_api: TestClient) -> None:
+    version_with_no_datasets = 42
     response = py_api.post(
         f"/datasets/list?api_key={ApiKey.ADMIN}",
-        json={"status": "all", "data_version": 4},
+        json={"status": "all", "data_version": version_with_no_datasets},
     )
     _assert_empty_result(response)
 
 
 @pytest.mark.parametrize(
     "key",
-    [ApiKey.SOME_USER, ApiKey.OWNER_USER, ApiKey.ADMIN],
+    [ApiKey.SOME_USER, ApiKey.DATASET_130_OWNER, ApiKey.ADMIN],
 )
 @pytest.mark.parametrize(
     ("user_id", "count"),
@@ -211,7 +214,7 @@ def test_list_data_tag_empty(py_api: TestClient) -> None:
         ("number_classes", "2", 51),
         ("number_classes", "2..3", 56),
         ("number_missing_values", "2", 1),
-        ("number_missing_values", "2..100000", 22),
+        ("number_missing_values", "2..100000", 23),
     ],
 )
 def test_list_data_quality(quality: str, range_: str, count: int, py_api: TestClient) -> None:
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
@@ -9,7 +9,7 @@
 from routers.openml.datasets import get_dataset
 from schemas.datasets.openml import DatasetMetadata, DatasetStatus
 from tests import constants
-from tests.users import ADMIN_USER, NO_USER, OWNER_USER, SOME_USER, ApiKey
+from tests.users import ADMIN_USER, DATASET_130_OWNER, NO_USER, SOME_USER, ApiKey
 
 
 @pytest.mark.parametrize(
@@ -92,7 +92,7 @@ def test_private_dataset_no_access(
 
 
 @pytest.mark.parametrize(
-    "user", [OWNER_USER, ADMIN_USER, pytest.param(SOME_USER, marks=pytest.mark.xfail)]
+    "user", [DATASET_130_OWNER, ADMIN_USER, pytest.param(SOME_USER, marks=pytest.mark.xfail)]
 )
 def test_private_dataset_access(user: User, expdb_test: Connection, user_test: Connection) -> None:
     dataset = get_dataset(
@@ -165,7 +165,7 @@ def test_dataset_features_no_access(py_api: TestClient) -> None:
 
 @pytest.mark.parametrize(
     "api_key",
-    [ApiKey.ADMIN, ApiKey.OWNER_USER],
+    [ApiKey.ADMIN, ApiKey.DATASET_130_OWNER],
 )
 def test_dataset_features_access_to_private(api_key: ApiKey, py_api: TestClient) -> None:
     response = py_api.get(f"/datasets/features/130?api_key={api_key}")
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
@@ -123,7 +123,7 @@ def test_private_dataset_no_user_no_access(
 
 @pytest.mark.parametrize(
     "api_key",
-    [ApiKey.OWNER_USER, ApiKey.ADMIN],
+    [ApiKey.DATASET_130_OWNER, ApiKey.ADMIN],
 )
 def test_private_dataset_owner_access(
     py_api: TestClient,
@@ -225,4 +225,7 @@ def test_datasets_feature_is_identical(
             else:
                 # The old API formats bool as string in lower-case
                 feature[key] = str(value) if not isinstance(value, bool) else str(value).lower()
-    assert python_body == original.json()["data_features"]["feature"]
+    original_features = original.json()["data_features"]["feature"]
+    for feature in original_features:
+        feature.pop("ontology", None)
+    assert python_body == original_features
diff --git a/tests/routers/openml/study_test.py b/tests/routers/openml/study_test.py
@@ -6,6 +6,7 @@
 from starlette.testclient import TestClient
 
 from schemas.study import StudyType
+from tests.users import ApiKey
 
 
 def test_get_task_study_by_id(py_api: TestClient) -> None:
@@ -458,7 +459,7 @@ def test_get_task_study_by_alias(py_api: TestClient) -> None:
 
 def test_create_task_study(py_api: TestClient) -> None:
     response = py_api.post(
-        "/studies?api_key=00000000000000000000000000000000",
+        f"/studies?api_key={ApiKey.SOME_USER}",
         json={
             "name": "Test Study",
             "alias": "test-study",
@@ -518,27 +519,28 @@ def _attach_tasks_to_study(
 
 
 def test_attach_task_to_study(py_api: TestClient, expdb_test: Connection) -> None:
+    expdb_test.execute(text("UPDATE study SET status = 'in_preparation' WHERE id = 7"))
     response = _attach_tasks_to_study(
-        study_id=1,
-        task_ids=[2, 3, 4],
-        api_key="AD000000000000000000000000000000",
+        study_id=7,
+        task_ids=[50],
+        api_key=ApiKey.OWNER_USER,
         py_api=py_api,
         expdb_test=expdb_test,
     )
-    assert response.status_code == HTTPStatus.OK
-    assert response.json() == {"study_id": 1, "main_entity_type": StudyType.TASK}
+    assert response.status_code == HTTPStatus.OK, response.content
+    assert response.json() == {"study_id": 7, "main_entity_type": StudyType.TASK}
 
 
 def test_attach_task_to_study_needs_owner(py_api: TestClient, expdb_test: Connection) -> None:
-    expdb_test.execute(text("UPDATE study SET status = 'in_preparation' WHERE id = 1"))
+    expdb_test.execute(text("UPDATE study SET status = 'in_preparation' WHERE id = 7"))
     response = _attach_tasks_to_study(
         study_id=1,
         task_ids=[2, 3, 4],
-        api_key="00000000000000000000000000000000",
+        api_key=ApiKey.OWNER_USER,
         py_api=py_api,
         expdb_test=expdb_test,
     )
-    assert response.status_code == HTTPStatus.FORBIDDEN
+    assert response.status_code == HTTPStatus.FORBIDDEN, response.content
 
 
 def test_attach_task_to_study_already_linked_raises(
@@ -549,11 +551,11 @@ def test_attach_task_to_study_already_linked_raises(
     response = _attach_tasks_to_study(
         study_id=1,
         task_ids=[1, 3, 4],
-        api_key="AD000000000000000000000000000000",
+        api_key=ApiKey.ADMIN,
         py_api=py_api,
         expdb_test=expdb_test,
     )
-    assert response.status_code == HTTPStatus.CONFLICT
+    assert response.status_code == HTTPStatus.CONFLICT, response.content
     assert response.json() == {"detail": "Task 1 is already attached to study 1."}
 
 
@@ -565,7 +567,7 @@ def test_attach_task_to_study_but_task_not_exist_raises(
     response = _attach_tasks_to_study(
         study_id=1,
         task_ids=[80123, 78914],
-        api_key="AD000000000000000000000000000000",
+        api_key=ApiKey.ADMIN,
         py_api=py_api,
         expdb_test=expdb_test,
     )
diff --git a/tests/routers/openml/users_test.py b/tests/routers/openml/users_test.py
@@ -18,7 +18,7 @@ def test_fetch_user(api_key: str, user: User, user_test: Connection) -> None:
     db_user = fetch_user(api_key, user_data=user_test)
     assert db_user is not None
     assert user.user_id == db_user.user_id
-    assert user.groups == db_user.groups
+    assert set(user.groups) == set(db_user.groups)
 
 
 def test_fetch_user_invalid_key_returns_none(user_test: Connection) -> None:
diff --git a/tests/users.py b/tests/users.py
@@ -4,12 +4,14 @@
 
 NO_USER = None
 SOME_USER = User(user_id=2, _database=None, _groups=[UserGroup.READ_WRITE])
-OWNER_USER = User(user_id=16, _database=None, _groups=[UserGroup.READ_WRITE])
-ADMIN_USER = User(user_id=1, _database=None, _groups=[UserGroup.ADMIN, UserGroup.READ_WRITE])
+OWNER_USER = User(user_id=3229, _database=None, _groups=[UserGroup.READ_WRITE])
+DATASET_130_OWNER = User(user_id=16, _database=None, _groups=[UserGroup.READ_WRITE])
+ADMIN_USER = User(user_id=1159, _database=None, _groups=[UserGroup.ADMIN, UserGroup.READ_WRITE])
 
 
 class ApiKey(StrEnum):
-    ADMIN = "AD000000000000000000000000000000"
-    SOME_USER = "00000000000000000000000000000000"
-    OWNER_USER = "DA1A0000000000000000000000000000"
+    ADMIN = "abc"
+    SOME_USER = "normaluser2"
+    OWNER_USER = "normaluser"
+    DATASET_130_OWNER = "DA1A0000000000000000000000000000"
     INVALID = "11111111111111111111111111111111"