diff --git a/.github/workflows/build-with-profiles.yml b/.github/workflows/build-with-profiles.yml
index ba77faf..2ad137d 100644
--- a/.github/workflows/build-with-profiles.yml
+++ b/.github/workflows/build-with-profiles.yml
@@ -39,8 +39,13 @@ jobs:
uses: docker/build-push-action@v6
with:
context: .
- file: ./Dockerfile.fivesafes-profile
push: true
+ # Add the five-safes profile (via EXTRA_PROFILES_PATH) and warm its
+ # cache, using the shared Dockerfile's build args.
+ build-args: |
+ FIVE_SAFES_PROFILE_VERSION=five-safes-0.7.4-beta
+ PROFILES_ARCHIVE_URL=https://github.com/eScienceLab/rocrate-validator/archive/refs/tags/five-safes-0.7.4-beta.tar.gz
+ EXTRA_PROFILES_PATH=/app/extra-profiles
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..1e760d2
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,35 @@
+# Lint workflow: runs ruff's lint rules and formatter check on pull requests.
+name: Lint
+
+on:
+  pull_request:
+    branches: [ develop ]
+
+# Least privilege: the job only needs to read the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML keeps it a string rather than a float.
+          python-version: '3.11'
+
+      - name: Install ruff
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff
+
+      - name: Lint
+        run: ruff check .
+
+      - name: Format check
+        run: ruff format --check .
diff --git a/.github/workflows/test_docker.yml b/.github/workflows/test_docker.yml
index 9af7b23..b0c8c1e 100644
--- a/.github/workflows/test_docker.yml
+++ b/.github/workflows/test_docker.yml
@@ -20,16 +20,15 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install pytest requests minio docker
+ pip install pytest requests boto3
- - name: Build Docker Compose Containers
+ - name: Run integration tests (brings up the compose stack)
run: |
cp example.env .env
- docker compose -f docker-compose-develop.yml build
+ pytest -s -v tests/test_integration.py
- - name: Spin Up Docker Compose and Run Tests
- run: pytest -s -v tests/test_integration.py
-
- - name: Ensure that Docker Compose is Shutdown
+ - name: Ensure Docker Compose is shut down
if: always()
- run: docker compose down
+ run: >
+ docker compose -f docker-compose-develop.yml -p cratey_integration
+ --profile objectstore down -v || true
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index 7c0a996..3d44ed4 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -20,8 +20,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install -r requirements.txt
- pip install pytest pytest-mock
+ pip install -r requirements-dev.txt
- name: Run tests (excluding integration tests)
run: |
diff --git a/Dockerfile b/Dockerfile
index 19fca58..5ed6bb6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
FROM python:3.11-slim
-# Install required system packages, including git
-RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+# git is needed by some dependencies; wget is only used when baking a profile.
+RUN apt-get update && apt-get install -y git wget && rm -rf /var/lib/apt/lists/*
WORKDIR /app
@@ -9,9 +9,38 @@ COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt
-COPY cratey.py LICENSE /app/
+COPY wsgi.py LICENSE /app/
COPY app /app/app
+# Optionally fetch an extra RO-Crate profile into a normal directory. It is
+# *added* to the bundled profiles at runtime via EXTRA_PROFILES_PATH.
+# A plain build leaves PROFILES_ARCHIVE_URL empty
+# and skips this; the "with profiles" image build passes it as --build-arg.
+ARG PROFILES_ARCHIVE_URL=""
+ARG FIVE_SAFES_PROFILE_VERSION=""
+# Set EXTRA_PROFILES_PATH only for the profiles build (passed as a build arg).
+ARG EXTRA_PROFILES_PATH=""
+ENV EXTRA_PROFILES_PATH=${EXTRA_PROFILES_PATH}
+ENV CACHE_PATH=/app/.rocrate-cache
+RUN if [ -n "$PROFILES_ARCHIVE_URL" ]; then \
+# Extract to where EXTRA_PROFILES_PATH points (default: /app/extra-profiles).
+    dest="${EXTRA_PROFILES_PATH:-/app/extra-profiles}" && \
+    mkdir -p "$dest" && \
+    wget -O /tmp/profiles.tar.gz "$PROFILES_ARCHIVE_URL" && \
+    tar -xzf /tmp/profiles.tar.gz -C "$dest" --strip-components=3 \
+      "rocrate-validator-${FIVE_SAFES_PROFILE_VERSION}/rocrate_validator/profiles/five-safes-crate" && \
+    rm /tmp/profiles.tar.gz ; \
+    fi
+
+# Warm the validator's HTTP cache at build time (extra profiles included when
+# set) so opt-in offline validation (VALIDATION_OFFLINE=true) needs no network.
+RUN if [ -n "$EXTRA_PROFILES_PATH" ]; then \
+    rocrate-validator cache warm --all-profiles \
+      --extra-profiles-path "$EXTRA_PROFILES_PATH" --cache-path "$CACHE_PATH" ; \
+    else \
+    rocrate-validator cache warm --all-profiles --cache-path "$CACHE_PATH" ; \
+    fi
+
RUN useradd -ms /bin/bash flaskuser
RUN chown -R flaskuser:flaskuser /app
@@ -21,4 +50,5 @@ EXPOSE 5000
CMD ["flask", "run", "--host=0.0.0.0"]
-LABEL org.opencontainers.image.source="https://github.com/eScienceLab/Cratey-Validator"
\ No newline at end of file
+LABEL org.opencontainers.image.source="https://github.com/eScienceLab/RO-Crate-Validation-Service"
+LABEL org.ro-crate-validation-service.five-safes-profile-version="${FIVE_SAFES_PROFILE_VERSION}"
diff --git a/Dockerfile.fivesafes-profile b/Dockerfile.fivesafes-profile
deleted file mode 100644
index df234b9..0000000
--- a/Dockerfile.fivesafes-profile
+++ /dev/null
@@ -1,39 +0,0 @@
-FROM python:3.11-slim
-
-ARG FIVE_SAFES_PROFILE_VERSION=five-safes-0.7.4-beta
-ARG PROFILES_ARCHIVE_URL=https://github.com/eScienceLab/rocrate-validator/archive/refs/tags/${FIVE_SAFES_PROFILE_VERSION}.tar.gz
-ARG PY_VER=3.11
-
-# Install required system packages, including git
-RUN apt-get update && apt-get install -y git wget && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-
-COPY requirements.txt .
-RUN pip install --upgrade pip
-RUN pip install --no-cache-dir -r requirements.txt
-
-COPY cratey.py LICENSE /app/
-COPY app /app/app
-RUN <e.g., RustFS / AWS S3")]
+
+ Client -->|HTTP| API
+ API -->|"Crate metadata-only flow: validate inline"| Validator
+ API -->|"Crate ID flow: resolve, then queue"| Broker
+ Broker --> Worker
+ Worker --> Validator
+ API -->|"resolve / read result"| Store
+ Worker -->|"fetch crate / write result"| Store
+ Worker -.->|"optional webhook"| Client
+```
-### Optional MinIO object storage
+- **Flask API** handles HTTP. Metadata-only validation runs **inline** (stateless).
+ S3-backed requests are validated by the **Celery worker**.
+- **Redis** is the Celery broker.
+- **S3-compatible store** holds crates and validation results. Credentials live
+ **server-side** (the service is configured with them); clients never send
+ storage credentials. Any S3-compatible store should work — the dev stack uses
+ [RustFS](https://rustfs.com/).
-The RO-Crate Validation Service can validate an RO-Crate's metadata directly from a JSON payload (the `POST v1/ro_crates/validate_metadata` endpoint) without storing anything. This is the default mode.
+## Concepts
-Optionally, the service can read crates from — and write validation results
-back to — a [MinIO](https://min.io/) object store. This is disabled by
-default and controlled by the `MINIO_ENABLED` environment variable:
+### Crate ID
-- `MINIO_ENABLED=false` (default): only a stateless validation endpoint is available and nothing is stored.
-- `MINIO_ENABLED=true`: the ID endpoints (`POST`/`GET v1/ro_crates/{crate_id}/validation`) are also registered, and a MinIO instance is required. With Docker Compose, start MinIO with its opt-in profile: `docker compose --profile minio up`.
+In the S3 flow, a crate is addressed by a **Crate ID**. This is a short,
+opaque label chosen by the caller (e.g. `my-dataset-2026`). It is **not** a
+filename, a path, or a URL: the service composes the actual object keys from it.
-When MinIO is disabled the ID-based endpoints are not registered and return `404`.
+Crate IDs are validated strictly: they must match
+`^[A-Za-z0-9][A-Za-z0-9._-]{0,127}$` (start alphanumeric; only letters, digits,
+`.`, `_`, `-`; max 128 characters; no `/` or `..`). This:
-## API
+- keeps IDs safe to compose into object keys and local paths;
+- means the ID is treated as a single opaque token — a `.zip` **inside** an ID is
+ harmless, because the service never parses meaning out of the ID.
-#### Request Validation of RO-Crate
-
-
- POST v1/ro_crates/{crate_id}/validation (Request validation of RO-Crate validation in Object Store)
-
-##### Path Parameters
-
-| name | type | data type | description |
-|------------|-----------|-------------------------|-----------------------------------------------------------------------|
-| crate_id | required | string | RO-Crate identifer string |
-
-##### Parameters
-
-| name | type | data type | description |
-|------------|-----------|-------------------------|-----------------------------------------------------------------------|
-| root_path | optional | string | Root path which contains the RO-Crate |
-| webhook_url | optional | string | Webhook to send validation result to |
-| profile_name | optional | string | RO-Crate profile to validate against |
-| minio_config | required | dictionary | MinIO Configuration Details |
+Invalid IDs are rejected with `400` before anything else happens.
-`minio_config`
-> | name | type | data type | description |
-> |------------|-----------|-------------------------|-----------------------------------------------------------------------|
-> | endpoint | required | string | MinIO endpoint |
-> | accesskey | required | string | MinIO access key or username |
-> | secret | required | string | MinIO secret or password |
-> | ssl | required | boolean | Use SSL encryption for MinIO access? |
-> | bucket | required | string | MinIO bucket containing RO-Crate |
+### Storage layout
-##### Responses
+Crates and their results live under separate, configurable prefixes in one
+bucket, so a result can never be confused with (or collide with) a crate:
-| http code | content-type | response |
-|---------------|-----------------------------------|---------------------------------------------------------------------|
-| `202` | `application/json` | `{"message": "Validation in progress"}` |
-| `400` | `application/json` | `{"message": "No RO-Crate with prefix: "}` |
-| `500` | `application/json` | `{"message": "Internal server errors"}` |
+| Item | Object key |
+|------|------------|
+| Crate (zip) | `{S3_CRATE_PREFIX}/{id}.zip` |
+| Crate (directory) | `{S3_CRATE_PREFIX}/{id}/` containing `ro-crate-metadata.json` |
+| Validation result | `{S3_RESULTS_PREFIX}/{id}.json` |
-```javascript
-curl -X 'POST' \
- 'http://localhost:5001/v1/ro_crates//validation' \
- -H 'accept: application/json' \
- -H 'Content-Type: application/json' \
- -d '{
- "minio_config": {
- "accesskey": "",
- "bucket": "ro-crates",
- "endpoint": "minio:9000",
- "secret": "",
- "ssl": false
- },
- "profile_name": "",
- "webhook_url": ""
-}'
-```
+Defaults: `S3_CRATE_PREFIX=crates`, `S3_RESULTS_PREFIX=validation-results`.
-
+> **Zip layout matters:** for a zip crate, `ro-crate-metadata.json` must be at
+> the **root** of the archive. Zipping a folder (so entries look like
+> `mycrate/ro-crate-metadata.json`) makes the crate invalid. Zip the crate's
+> **contents**, not its containing folder.
+### Crate resolution
-#### Get RO-Crate Validation Result
+The service resolves a Crate ID to a concrete object by **direct existence
+checks on the canonical keys**, rather than by listing a prefix and assuming. This
+makes resolution deterministic and unambiguous:
-
- GET v1/ro_crates/{crate_id}/validation (Obtain RO-Crate validation result from Object Store)
+Where the **zip object** is `{prefix}/{id}.zip` and the **directory metadata** is
+`{prefix}/{id}/ro-crate-metadata.json`:
-##### Path Parameters
+```mermaid
+flowchart TD
+ A["crate_id"] --> B{"valid format?"}
+ B -- no --> E["400 Invalid crate ID"]
+ B -- yes --> C{"zip object exists?"}
+ C -- yes --> Z{"directory metadata also exists?"}
+ Z -- yes --> AMB["409 Ambiguous"]
+ Z -- no --> ZIP["resolve as zip crate"]
+ C -- no --> D{"directory metadata exists?"}
+ D -- yes --> DIR["resolve as directory crate"]
+ D -- no --> NF["404 Not found"]
+```
-| name | type | data type | description |
-|------------|-----------|-------------------------|-----------------------------------------------------------------------|
-| crate_id | required | string | RO-Crate identifer string |
+### Validation outcome
-##### Parameters
+Every validation produces a single **outcome** object with an explicit status:
-| name | type | data type | description |
-|------------|-----------|-------------------------|-----------------------------------------------------------------------|
-| root_path | optional | string | Root path which contains the RO-Crate |
-| minio_config | required | dictionary | MinIO Configuration Details |
+| status | meaning |
+|--------|---------|
+| `valid` | the crate/metadata conforms to the profile |
+| `invalid` | it was validated but has conformance issues (see `detail`) |
+| `error` | it could not be validated (bad input, validator failure) |
-`minio_config`
-> | name | type | data type | description |
-> |------------|-----------|-------------------------|-----------------------------------------------------------------------|
-> | endpoint | required | string | MinIO endpoint |
-> | accesskey | required | string | MinIO access key or username |
-> | secret | required | string | MinIO secret or password |
-> | ssl | required | boolean | Use SSL encryption for MinIO access? |
-> | bucket | required | string | MinIO bucket containing RO-Crate |
+```json
+{ "status": "invalid", "profile": "ro-crate-1.2", "created_at": "…", "detail": { … } }
+```
-##### Responses
+## Request flows
-| http code | content-type | response |
-|---------------|-----------------------------------|---------------------------------------------------------------------|
-| `200` | `application/json` | `Successful Validation` |
-| `422` | `application/json` | `Error: Details of Validation Error` |
-| `404` | `application/json` | `Not found` |
+### Metadata-only (synchronous)
-##### Example cURL
+```mermaid
+sequenceDiagram
+ actor Client
+ participant API as Flask API
+ participant V as rocrate-validator
+ Client->>API: POST /v1/ro_crates/validate_metadata { crate_json }
+ API->>V: validate metadata (inline)
+ V-->>API: outcome
+ API-->>Client: 200 valid/invalid · 422 error/bad input
+```
-```javascript
- curl -X 'GET' \
- 'http://localhost:5001/v1/ro_crates//validation' \
- -H 'accept: application/json' \
- -H 'Content-Type: application/json' \
- -d '{
- "minio_config": {
- "accesskey": "",
- "bucket": "ro-crates",
- "endpoint": "minio:9000",
- "secret": "",
- "ssl": false
- }
-}'
+### S3-backed (asynchronous)
+
+```mermaid
+sequenceDiagram
+ actor Client
+ participant API as Flask API
+ participant S as S3 store
+ participant Q as Redis
+ participant W as Celery worker
+ Client->>API: POST /v1/ro_crates/{id}/validation
+ API->>S: resolve crate (stat canonical keys)
+ alt invalid id / not found / ambiguous
+ API-->>Client: 400 / 404 / 409
+ else exists
+ API->>Q: queue (id, profile?, webhook?)
+ API-->>Client: 202 Validation in progress
+ Q->>W: deliver task
+ W->>S: download crate
+ W->>W: validate
+ W->>S: persist {results_prefix}/{id}.json
+ opt webhook_url given
+ W-->>Client: POST outcome (retried with backoff)
+ end
+ end
+ Client->>API: GET /v1/ro_crates/{id}/validation
+ API->>S: read result object
+ API-->>Client: 200 outcome · 404 not validated yet
```
-
+The worker runs the stages in order (**fetch → validate → persist → webhook**), so
+a storage write failure can never trigger a "success" webhook, and the outcome
+(including an `error` outcome) is always persisted so `GET` reflects it.
-#### Validate RO-Crate Metadata
+## API
-
- POST v1/ro_crates/validate_metadata (validates submitted RO-Crate Metadata)
+Base URL in the dev stack: `http://localhost:5001`.
-##### Parameters
+### `POST /v1/ro_crates/validate_metadata`
-| name | type | data type | description |
-|------------|-----------|-------------------------|-----------------------------------------------------------------------|
-| crate_json | required | string | RO-Crate metadata, stored as a single string |
-| profile_name | optional | string | RO-Crate profile to validate against |
+Validate an RO-Crate metadata document inline. This is always available.
+| field | required | type | description |
+|-------|----------|------|-------------|
+| `crate_json` | yes | string | RO-Crate metadata JSON-LD, as a string |
+| `profile_name` | no | string | profile to validate against (default: auto/base profile) |
-##### Responses
+Responses: `200` (valid/invalid outcome), `422` (missing/empty/invalid JSON, or
+an `error` outcome).
-| http code | content-type | response |
-|---------------|-----------------------------------|---------------------------------------------------------------------|
-| `200` | `application/json` | `Successful Validation` |
-| `422` | `application/json` | `Error: Details of Validation Error` |
+`crate_json` is the metadata document as a **string**, so the easiest way to
+validate a file is to let `jq` read and escape it (`-R` raw, `-s` slurp), and then
+post the result:
-##### Example cURL
+```bash
+jq -Rs '{crate_json: .}' ro-crate-metadata.json \
+ | curl -X POST http://localhost:5001/v1/ro_crates/validate_metadata \
+ -H 'Content-Type: application/json' -d @-
+```
+
+Add a profile with `jq -Rs '{crate_json: ., profile_name: "ro-crate-1.2"}' …`.
-```javascript
- curl -X 'POST' \
- 'http://localhost:5001/v1/ro_crates/validate_metadata' \
- -H 'accept: application/json' \
- -H 'Content-Type: application/json' \
- -d '{
- "crate_json": "{'\''test1'\'':'\''test2'\''}"
- }'
+Or inline a small document directly:
+
+```bash
+curl -X POST http://localhost:5001/v1/ro_crates/validate_metadata \
+ -H 'Content-Type: application/json' \
+ -d '{"crate_json": "{\"@context\": \"https://w3id.org/ro/crate/1.2/context\", \"@graph\": []}"}'
```
-
+### `POST /v1/ro_crates/{crate_id}/validation`
+
+Queue a stored crate for validation. **This is only registered when storage is enabled**
+(otherwise `404`). The request body carries no credentials.
+| field | required | type | description |
+|-------|----------|------|-------------|
+| `profile_name` | no | string | profile to validate against |
+| `webhook_url` | no | string | URL to POST the result to when done |
-## Setting up the project
+Responses: `202` queued, `400` invalid ID, `404` crate not found, `409`
+ambiguous (both zip and directory exist in the object store), `503` storage unavailable.
+
+```bash
+curl -X POST http://localhost:5001/v1/ro_crates/my-crate/validation \
+ -H 'Content-Type: application/json' -d '{"profile_name": "ro-crate-1.2"}'
+```
+
+### `GET /v1/ro_crates/{crate_id}/validation`
+
+Fetch a stored validation result. Only registered when storage is enabled.
+
+Responses: `200` (the stored outcome, including a persisted `error` outcome),
+`400` invalid ID, `404` no result stored yet.
+
+```bash
+curl http://localhost:5001/v1/ro_crates/my-crate/validation
+```
+
+### Health
+
+- `GET /healthz` - liveness; `200 {"status": "ok"}` whenever the process is up.
+- `GET /readyz` - readiness; checks the object store and Celery broker. `200`
+ when ready/available, `503` otherwise. When storage is disabled, those dependencies
+ report `disabled`.
+
+## Configuration
+
+Configuration is read once at startup and validated; a misconfigured deployment
+fails quickly with a clear error rather than at the first request.
+
+| config | default | description |
+|----------|---------|-------------|
+| `STORAGE_ENABLED` | `false` | enable the S3 ID endpoints |
+| `S3_ENDPOINT` | — | object store endpoint, e.g. `objectstore:9000` |
+| `S3_ACCESS_KEY` | — | access key |
+| `S3_SECRET_KEY` | — | secret key |
+| `S3_BUCKET` | — | bucket holding crates and results |
+| `S3_USE_SSL` | `false` | use HTTPS to the store |
+| `S3_REGION` | `us-east-1` | region (for AWS; not used elsewhere) |
+| `S3_CRATE_PREFIX` | `crates` | key prefix for crates |
+| `S3_RESULTS_PREFIX` | `validation-results` | key prefix for results |
+| `CELERY_BROKER_URL` | — | Redis broker URL |
+| `CELERY_RESULT_BACKEND` | — | Celery result backend URL |
+| `PROFILES_PATH` | — | directory of profiles that **replaces** the bundled set (optional) |
+| `EXTRA_PROFILES_PATH` | — | directory of profiles **added** to the bundled set (optional) |
+| `CACHE_PATH` | per-user dir | HTTP cache location for the validator |
+| `VALIDATION_OFFLINE` | `false` | validate using only the cache (no network) |
+| `FLASK_ENV` | `development` | `production` disables debug |
+
+When `STORAGE_ENABLED=true`, the `S3_*` and `CELERY_*` variables above are
+**required** — startup fails if any are missing.
+
+### Profiles, cache, and offline validation
+
+Custom profiles (e.g. `five-safes-crate`) are best added with
+`EXTRA_PROFILES_PATH`, which **adds** them to the validator's bundled profiles —
+unlike `PROFILES_PATH`, which replaces the bundled set entirely. The published
+"with profiles" image bakes the five-safes profile in this way.
+
+The validator caches the profile/context HTTP resources it fetches. The Docker
+image pre-populates this cache at build time (`rocrate-validator cache warm`),
+so setting `VALIDATION_OFFLINE=true` runs validation entirely from the cache
+with no network access. Online validation (the default) also uses and refreshes
+the cache. (Requires rocrate-validator ≥ 0.10.0.)
+
+## Running the service
### Prerequisites
- Docker with Docker Compose
-### Installation
-
-1. Clone the repository:
- ```bash
- git clone https://github.com/eScienceLab/Cratey-Validator.git
- cd crate-validation-service
- ```
-
-2. Create the `.env` file for shared environment information. An example environment file is included (`example.env`), which can be copied for this purpose. But make sure to change any security settings (username and passwords).
-
-3. A directory containing RO-Crate profiles to replace the default RO-Crate profiles for validation may be provided. Note that this will need to contain all profile files, as the default profile data will not be used. An example of this is given in the `docker-compose-develop.yml` file, and described here:
- 1. Store the profiles in a convenient directory, e.g.: `./local/rocrate_validator_profiles`
- 2. Add a volume to the celery worker container for these, e.g.:
- ```
- volumes:
- - ./local/rocrate_validator_profiles:/app/profiles:ro
- ```
- 3. Provide the `PROFILES_PATH` environment to the flask container (not the celery worker container) to match the internal path, e.g.:
- ```
- - PROFILES_PATH=/app/profiles
- ```
-
-4. Build and start the services using Docker Compose:
- ```bash
- docker compose up --build
- ```
- This runs in the default (metadata-only) mode. To enable the MinIO-backed
- endpoints, set `MINIO_ENABLED=true` in your `.env` and start the `minio`
- profile:
- ```bash
- docker compose --profile minio up --build
- ```
-
-5. **(Only when `MINIO_ENABLED=true`)** Set up the MinIO bucket
- 1. Open the MinIO web interface at `http://localhost:9000`.
- 2. Log in with your MinIO credentials.
- 3. Create a new bucket named `ro-crates`.
- 4. **Enable versioning** for the `ro-crates` bucket — this is important for tracking unique object versions.
-
- 
-
- 5. Upload your RO-Crate files to the `ro-crates` bucket.
- 6. To verify that versioning is enabled:
- - Select the uploaded RO-Crate object in the `ro-crates` bucket.
- - Navigate to the **Actions** panel on the right.
- - The **Display Object Versions** option should be clickable.
-
- 
+### Quick start
+```bash
+git clone https://github.com/eScienceLab/RO-Crate-Validation-Service.git
+cd RO-Crate-Validation-Service
+cp example.env .env # and then edit credentials
+docker compose up --build
+```
+
+This runs in **metadata-only** mode (no storage). The API is at
+`http://localhost:5001`.
+
+### With object storage (RustFS)
+
+Set `STORAGE_ENABLED=true` in `.env`, then start the local object store with its
+opt-in profile. This uses the default compose file (prebuilt images); add
+`-f docker-compose-develop.yml` to build locally:
+
+```bash
+docker compose --profile objectstore up --build
+```
+
+The RustFS console is at `http://localhost:9001` (default credentials are
+`rustfsadmin` / `rustfsadmin`). Create the bucket named in `S3_BUCKET`
+(default `ro-crates`) and upload crates under the crate prefix
+(`crates/{id}.zip` or `crates/{id}/…`).
+
+> The service does not create the bucket; create it once via the console, the RustFS
+> Web UI, AWS CLI, or `boto3`.
+
+### Custom profiles
+
+To validate against profiles other than the bundled ones, mount a profiles
+directory into **both** the flask and worker containers (metadata validation
+runs in flask; crate validation runs in the worker) and set `PROFILES_PATH` to
+the mounted path. See `docker-compose-develop.yml` for a working example.
## Development
-For standard usage the Docker Compose script uses prebuilt containers.
-For testing locally developed containers use the alternate Docker Compose file:
```bash
- docker compose --file docker-compose-develop.yml up --build
-```
+docker compose -f docker-compose-develop.yml --profile objectstore up --build
+```
+
+### Dependencies
+
+Direct dependencies live in `pyproject.toml`.
+
+```bash
+pip-compile pyproject.toml -o requirements.txt # runtime lock
+pip-compile --extra dev pyproject.toml -o requirements-dev.txt # + dev tools
+```
+
+### Tests & linting
+
+```bash
+pip install -r requirements-dev.txt
+pytest --ignore=tests/test_integration.py # unit tests (no Docker needed)
+pytest tests/test_integration.py # integration tests (needs Docker)
+ruff check . && ruff format --check . # lint + format
+```
+
+`tests/` mirrors the `app/` package layout. The integration tests bring up the
+compose stack and seed crates via `boto3`.
-### Project Structure
+## Project structure
```
app/
-├── ro_crates/
-│ ├── routes/
-│ │ ├── __init__.py # Registers blueprints
-│ │ └── post_routes.py # POST API routes
-│ └── __init__.py
+├── __init__.py # app factory: config, blueprints, error handlers, request IDs
+├── health.py # /healthz and /readyz
+├── storage/ # object-storage abstraction
+│ ├── base.py # StorageBackend protocol + ObjectStat
+│ ├── s3.py # boto3 implementation (any S3-compatible store)
+│ ├── memory.py # in-memory backend (tests / local)
+│ └── errors.py # StorageError, ObjectNotFound
+├── crates/ # crate identity, layout, resolution
+│ ├── ids.py # 'Crate ID' validation
+│ ├── layout.py # object keys
+│ └── resolver.py # deterministic zip/dir resolution
+├── validation/ # validation boundary
+│ ├── results.py # ValidationOutcome (valid/invalid/error)
+│ └── runner.py # wraps rocrate-validator
+├── ro_crates/routes/ # HTTP endpoints (metadata + ID-based)
├── services/
-│ ├── logging_service.py # Centralised logging
-│ └── validation_service.py # Queue RO-Crates for validation
-├── tasks/
-│ └── validation_tasks.py # Validate RO-Crates
-├── utils/
-│ ├── config.py # Configuration
-│ ├── minio_utils.py # Methods for interacting with MinIO
-│ └── webhook_utils.py # Methods for sending webhooks
-```
\ No newline at end of file
+│ ├── validation_service.py # request handling: resolve, queue, read results
+│ └── logging_service.py # JSON logging, request IDs, redaction
+├── tasks/validation_tasks.py # Celery task: fetch → validate → persist → webhook
+└── utils/
+ ├── config.py # validated Settings
+ └── webhook_utils.py # webhook delivery with retry/backoff
+```
+
+## License
+
+MIT — © eScience Lab, The University of Manchester.
diff --git a/app/__init__.py b/app/__init__.py
index dc3d67b..8dcd59a 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -1,64 +1,96 @@
"""Initialises and configures Flask, integrates Celery, and registers application blueprints."""
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
-
import logging
-import os
from apiflask import APIFlask
+from flask import jsonify, request
-from app.ro_crates.routes import v1_post_bp, v1_minio_post_bp, v1_minio_get_bp
+from app.crates.ids import InvalidCrateId
+from app.crates.resolver import AmbiguousCrate, CrateNotFound
+from app.health import health_bp
+from app.ro_crates.routes import v1_minio_get_bp, v1_minio_post_bp, v1_post_bp
+from app.services.logging_service import (
+ get_request_id,
+ new_request_id,
+ set_request_id,
+)
+from app.storage.errors import StorageError
from app.utils.config import (
- DevelopmentConfig,
- ProductionConfig,
InvalidAPIUsage,
+ Settings,
make_celery,
)
-from flask import jsonify
logger = logging.getLogger(__name__)
+REQUEST_ID_HEADER = "X-Request-ID"
+
-def create_app() -> APIFlask:
+def create_app(settings: Settings | None = None) -> APIFlask:
"""
- Creates and configures Flask application.
+ Creates and configures the Flask application.
- :return: Flask: A configured Flask application instance.
+ Configuration is loaded and validated up front via :class:`Settings`, so a
+ misconfigured deployment fails at startup with a clear error rather than at
+ the first request. A ``settings`` object may be injected for testing.
+
+ :param settings: Pre-built settings; if omitted, loaded from the environment.
+ :return: A configured Flask application instance.
+ :raises ConfigError: If required configuration is missing or invalid.
"""
+ if settings is None:
+ settings = Settings.from_env()
+
app = APIFlask(__name__)
- # Load config before registering blueprints, so MINIO_ENABLED can
- # decide whether the backed endpoints are exposed.
- if os.getenv("FLASK_ENV") == "production":
- app.config.from_object(ProductionConfig)
- else:
- # Development environment:
- app.debug = True
- app.config.from_object(DevelopmentConfig)
+ app.debug = settings.debug
+ app.config["SETTINGS"] = settings
+ app.config["STORAGE_ENABLED"] = settings.storage_enabled
+ app.config["PROFILES_PATH"] = settings.profiles_path
# Always available:
+ app.register_blueprint(health_bp)
app.register_blueprint(v1_post_bp, url_prefix="/v1/ro_crates")
- # MinIO is optional and disabled by default. Only register
- # the MinIO ID routes when enabled:
- if app.config.get("MINIO_ENABLED"):
+ # Object storage is optional and disabled by default. Only register the
+ # ID-based, store-backed routes when storage is enabled.
+ if settings.storage_enabled:
app.register_blueprint(v1_minio_post_bp, url_prefix="/v1/ro_crates")
app.register_blueprint(v1_minio_get_bp, url_prefix="/v1/ro_crates")
- logger.info("MinIO storage enabled: ID-based validation endpoints registered.")
+ logger.info("Storage enabled: ID-based validation endpoints registered.")
else:
- logger.info("MinIO storage disabled: only metadata validation is available.")
+ logger.info("Storage disabled: only metadata validation is available.")
- if app.debug:
- print("URL Map:")
- for rule in app.url_map.iter_rules():
- print(rule)
+ @app.before_request
+ def assign_request_id():
+ set_request_id(request.headers.get(REQUEST_ID_HEADER) or new_request_id())
+
+ @app.after_request
+ def attach_request_id(response):
+ response.headers[REQUEST_ID_HEADER] = get_request_id()
+ return response
@app.errorhandler(InvalidAPIUsage)
def invalid_api_usage(e):
return jsonify(e.to_dict()), e.status_code
+ @app.errorhandler(InvalidCrateId)
+ def invalid_crate_id(e):
+ return jsonify({"error": str(e)}), 400
+
+ @app.errorhandler(CrateNotFound)
+ def crate_not_found(e):
+ return jsonify({"error": str(e)}), 404
+
+ @app.errorhandler(AmbiguousCrate)
+ def ambiguous_crate(e):
+ return jsonify({"error": str(e)}), 409
+
+ @app.errorhandler(StorageError)
+ def storage_error(e):
+ logger.error("Storage error: %s", e)
+ return jsonify({"error": "Storage backend unavailable"}), 503
+
# Integrate Celery
make_celery(app)
diff --git a/app/celery_worker.py b/app/celery_worker.py
index 778407c..90899eb 100644
--- a/app/celery_worker.py
+++ b/app/celery_worker.py
@@ -6,5 +6,4 @@
from celery import Celery
-
celery = Celery()
diff --git a/app/crates/__init__.py b/app/crates/__init__.py
new file mode 100644
index 0000000..2415d5e
--- /dev/null
+++ b/app/crates/__init__.py
@@ -0,0 +1 @@
+"""Crate identification, layout, and resolution within object storage."""
diff --git a/app/crates/ids.py b/app/crates/ids.py
new file mode 100644
index 0000000..973ceed
--- /dev/null
+++ b/app/crates/ids.py
@@ -0,0 +1,37 @@
+"""Strict validation for crate identifiers.
+
+A crate ID is treated as a single-segment label. Constraining it to a safe charset
+means it can be composed into object keys and local paths without risk of collisions
+or traversal, and removes the need to parse meaning (such as a ``.zip`` suffix) back
+out of it.
+"""
+
+import re
+
+# Alphanumeric first (no leading dot/dash), then up to 127 more from a restricted
+# ASCII set (max 128; no slashes/whitespace). End with ``\Z``: ``$`` would allow "\n".
+_CRATE_ID_PATTERN = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,127}\Z")
+
+
+class InvalidCrateId(ValueError):
+    """Raised by ``validate_crate_id`` when a crate ID fails the format rules."""
+
+
+def is_valid_crate_id(crate_id: object) -> bool:
+    """Return whether ``crate_id`` is a well-formed crate identifier; non-strings are not."""
+    # Reject non-strings and any ".." run, which the charset alone would allow.
+    if not isinstance(crate_id, str) or ".." in crate_id:
+        return False
+    # fullmatch, not match: with ``$`` alone, ``match`` accepts a trailing "\n".
+    return _CRATE_ID_PATTERN.fullmatch(crate_id) is not None
+
+
+def validate_crate_id(crate_id: object) -> str:
+    """Pass a well-formed ``crate_id`` through as-is; raise :class:`InvalidCrateId` otherwise."""
+    if is_valid_crate_id(crate_id):
+        return crate_id
+    raise InvalidCrateId(
+        f"Invalid crate ID: {crate_id!r}. Crate IDs must start with a letter "
+        "or digit and contain only letters, digits, '.', '_' or '-' "
+        "(max 128 characters, no '/' or '..')."
+    )
diff --git a/app/crates/layout.py b/app/crates/layout.py
new file mode 100644
index 0000000..d3e4649
--- /dev/null
+++ b/app/crates/layout.py
@@ -0,0 +1,40 @@
+"""Canonical object-key layout for crates and their validation results.
+
+A single place defines where crates and results live in the bucket. Crates and
+results use *separate* prefixes so a result object can never collide with, or be
+mistaken for, a crate object.
+
+Layout (given a ``crate_prefix`` and ``results_prefix``):
+
+- Crate (zip): ``{crate_prefix}/{id}.zip``
+- Crate (directory): ``{crate_prefix}/{id}/`` containing ``ro-crate-metadata.json``
+- Validation result: ``{results_prefix}/{id}.json``
+"""
+
+METADATA_FILENAME = "ro-crate-metadata.json"
+
+
+def _join(prefix: str, suffix: str) -> str:
+    """Prefix ``suffix`` with ``prefix`` (outer slashes trimmed) and one "/"; empty adds none."""
+    trimmed = prefix.strip("/")
+    return suffix if not trimmed else f"{trimmed}/{suffix}"
+
+
+def crate_zip_key(crate_prefix: str, crate_id: str) -> str:
+    """Build the key of the ``{id}.zip`` archive object under ``crate_prefix``."""
+    return _join(crate_prefix, crate_id + ".zip")
+
+
+def crate_dir_prefix(crate_prefix: str, crate_id: str) -> str:
+    """Build the ``{id}/`` key prefix (trailing slash kept) of a directory-style crate."""
+    return _join(crate_prefix, crate_id + "/")
+
+
+def crate_metadata_key(crate_prefix: str, crate_id: str) -> str:
+    """Build the key of ``METADATA_FILENAME`` directly inside a directory-style crate."""
+    return _join(crate_prefix, "/".join((crate_id, METADATA_FILENAME)))
+
+
+def result_key(results_prefix: str, crate_id: str) -> str:
+    """Build the key of the ``{id}.json`` validation result under ``results_prefix``."""
+    return _join(results_prefix, crate_id + ".json")
diff --git a/app/crates/resolver.py b/app/crates/resolver.py
new file mode 100644
index 0000000..06b5726
--- /dev/null
+++ b/app/crates/resolver.py
@@ -0,0 +1,74 @@
+"""Deterministic crate resolution over a StorageBackend.
+
+Resolution is by direct existence checks on canonical keys, never by listing a
+prefix and substring-matching. This removes the previous fragilities: it cannot
+false-match a sibling prefix, it confirms a directory crate actually contains
+``ro-crate-metadata.json``, and it treats the ID as opaque (so a ``.zip`` in the
+ID is harmless). Ambiguity and absence are reported explicitly.
+"""
+
+from dataclasses import dataclass
+
+from app.crates.ids import validate_crate_id
+from app.crates.layout import crate_dir_prefix, crate_metadata_key, crate_zip_key
+from app.storage.base import StorageBackend
+from app.storage.errors import ObjectNotFound
+
+
+@dataclass(frozen=True)
+class ResolvedCrate:
+    """Immutable record of where a crate was found in storage.
+
+    ``key`` is the zip object key when ``is_zip`` is true; otherwise it is the
+    crate's directory prefix, ending in "/" (see ``resolve_crate``).
+    """
+
+    crate_id: str
+    key: str
+    is_zip: bool
+
+
+class CrateNotFound(Exception):
+    """Raised by ``resolve_crate`` when neither a zip nor a directory crate exists for the ID."""
+
+
+class AmbiguousCrate(Exception):
+    """Raised by ``resolve_crate`` when the ID matches both a zip and a directory crate."""
+
+
+def _object_exists(storage: StorageBackend, key: str) -> bool:
+    try:  # existence probe: only "missing" is swallowed; other errors propagate
+        storage.stat(key)
+    except ObjectNotFound:
+        return False
+    return True
+
+
+def resolve_crate(storage: StorageBackend, crate_id: str, crate_prefix: str) -> ResolvedCrate:
+    """Locate the stored form (zip or directory) of ``crate_id`` under ``crate_prefix``.
+
+    A directory crate counts as present only if its metadata file exists.
+
+    :raises InvalidCrateId: If the ID is malformed.
+    :raises AmbiguousCrate: If both zip and directory forms exist.
+    :raises CrateNotFound: If neither form exists.
+    """
+    validate_crate_id(crate_id)
+
+    archive_key = crate_zip_key(crate_prefix, crate_id)
+    # Both forms are always probed so that ambiguity can be detected.
+    has_archive = _object_exists(storage, archive_key)
+    has_directory = _object_exists(storage, crate_metadata_key(crate_prefix, crate_id))
+
+    if has_archive and has_directory:
+        raise AmbiguousCrate(
+            f"Crate {crate_id!r} exists as both a zip and a directory; refusing to guess."
+        )
+    if not (has_archive or has_directory):
+        raise CrateNotFound(f"No crate found for ID {crate_id!r}")
+    # Exactly one form exists from here on.
+    if has_archive:
+        return ResolvedCrate(crate_id=crate_id, key=archive_key, is_zip=True)
+    return ResolvedCrate(
+        crate_id=crate_id, key=crate_dir_prefix(crate_prefix, crate_id), is_zip=False
+    )
diff --git a/app/health.py b/app/health.py
new file mode 100644
index 0000000..3988c33
--- /dev/null
+++ b/app/health.py
@@ -0,0 +1,70 @@
+"""Liveness and readiness endpoints for orchestration.
+
+``/healthz`` reports that the process is up. ``/readyz`` reports whether the
+service can actually serve S3 requests, by checking the object store and the
+Celery broker. When storage is disabled, those dependencies are not required
+and report ``disabled``.
+"""
+
+import logging
+
+from apiflask import APIBlueprint
+from flask import current_app, jsonify
+
+from app.storage.s3 import S3Backend
+
+logger = logging.getLogger(__name__)
+
+health_bp = APIBlueprint("health", __name__)
+
+# Short connection timeout (seconds) so readiness checks fail quickly.
+_BROKER_TIMEOUT = 3
+
+
+def check_storage(settings) -> tuple[bool, str]:
+    """Probe the object store; ``(True, "disabled")`` when storage is turned off."""
+    if settings.storage_enabled:
+        try:
+            S3Backend.from_settings(settings).health_check()
+        except Exception as error:  # noqa: BLE001 - any failure means not ready
+            logger.warning("Storage readiness check failed: %s", error)
+            return False, str(error)
+        return True, "ok"
+    return True, "disabled"
+
+
+def check_broker(settings) -> tuple[bool, str]:
+    """Probe the Celery broker; ``(True, "disabled")`` when storage is turned off."""
+    if settings.storage_enabled:
+        try:
+            from kombu import Connection  # imported lazily, inside the guarded block
+
+            with Connection(settings.celery_broker_url) as broker:
+                broker.ensure_connection(max_retries=1, timeout=_BROKER_TIMEOUT)
+        except Exception as error:  # noqa: BLE001 - any failure means not ready
+            logger.warning("Broker readiness check failed: %s", error)
+            return False, str(error)
+        return True, "ok"
+    return True, "disabled"
+
+
+@health_bp.get("/healthz")
+def healthz():
+    """Liveness probe: answers 200 with ``{"status": "ok"}`` whenever it is reached."""
+    return jsonify(status="ok"), 200
+
+
+@health_bp.get("/readyz")
+def readyz():
+    """Readiness probe: 200 when the storage and broker checks both pass, else 503."""
+    settings = current_app.config["SETTINGS"]
+
+    # Both checks always run, so the body reports each dependency's detail.
+    storage_ok, storage_detail = check_storage(settings)
+    broker_ok, broker_detail = check_broker(settings)
+
+    checks = {"storage": storage_detail, "broker": broker_detail}
+    # Any failed check makes the whole response "not ready".
+    if storage_ok and broker_ok:
+        return jsonify({"status": "ready", "checks": checks}), 200
+    return jsonify({"status": "not ready", "checks": checks}), 503
diff --git a/app/ro_crates/routes/__init__.py b/app/ro_crates/routes/__init__.py
index f4ae897..6002963 100644
--- a/app/ro_crates/routes/__init__.py
+++ b/app/ro_crates/routes/__init__.py
@@ -1,15 +1,11 @@
"""Defines main Blueprint and registers sub-Blueprints for organising related routes."""
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
-
-from app.ro_crates.routes.post_routes import post_routes_bp, minio_post_routes_bp
from app.ro_crates.routes.get_routes import get_routes_bp
+from app.ro_crates.routes.post_routes import minio_post_routes_bp, post_routes_bp
# Always registered:
v1_post_bp = post_routes_bp
-# Registered only when MinIO is enabled:
+# Registered only when object storage is enabled:
v1_minio_post_bp = minio_post_routes_bp
v1_minio_get_bp = get_routes_bp
diff --git a/app/ro_crates/routes/get_routes.py b/app/ro_crates/routes/get_routes.py
index d6b23c6..5a477ad 100644
--- a/app/ro_crates/routes/get_routes.py
+++ b/app/ro_crates/routes/get_routes.py
@@ -1,12 +1,6 @@
-"""Defines get API endpoints for validating RO-Crates using their IDs from MinIO."""
+"""GET endpoint for retrieving a stored RO-Crate validation result by ID."""
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
-
-from apiflask import APIBlueprint, Schema
-from apiflask.fields import String, Boolean
-from marshmallow.fields import Nested
+from apiflask import APIBlueprint
from flask import Response
from app.services.validation_service import get_ro_crate_validation_task
@@ -14,49 +8,17 @@
get_routes_bp = APIBlueprint("get_routes", __name__)
-class MinioConfig(Schema):
- endpoint = String(required=True)
- accesskey = String(required=True)
- secret = String(required=True)
- ssl = Boolean(required=True)
- bucket = String(required=True)
-
-
-class ValidateResult(Schema):
- minio_config = Nested(MinioConfig, required=True)
- root_path = String(required=False)
-
-
@get_routes_bp.get("/validation")
-@get_routes_bp.input(ValidateResult(partial=False), location='json')
-def get_ro_crate_validation_by_id(json_data, crate_id) -> tuple[Response, int]:
+def get_ro_crate_validation_by_id(crate_id) -> tuple[Response, int]:
"""
- Endpoint to obtain an RO-Crate validation result using its ID from MinIO.
+ Obtain a stored RO-Crate validation result by its ID.
Path Parameters:
- **crate_id**: The RO-Crate ID. _Required_.
- Request Body Parameters:
- - **minio_config**: The MinIO bucket containing the RO-Crate. _Required_
- - **endpoint**: Endpoint, e.g. 'localhost:9000'
- - **accesskey**: Access key / username
- - **secret**: Secret / password
- - **ssl**: Use SSL encryption? True/False
- - **bucket**: The MinIO bucket to access
- - **root_path**: The root path containing the RO-Crate. _Optional_
-
Returns:
- - A tuple containing the validation result and an HTTP status code.
-
- Raises:
- - KeyError: If required parameters (`crate_id`) are missing.
+ - A tuple containing the stored validation result and an HTTP status code.
+ Returns 404 if no result has been stored for the crate yet.
"""
- minio_config = json_data["minio_config"]
-
- if "root_path" in json_data:
- root_path = json_data["root_path"]
- else:
- root_path = None
-
- return get_ro_crate_validation_task(minio_config, crate_id, root_path)
+ return get_ro_crate_validation_task(crate_id)
diff --git a/app/ro_crates/routes/post_routes.py b/app/ro_crates/routes/post_routes.py
index 5fb1fda..e0eb3e3 100644
--- a/app/ro_crates/routes/post_routes.py
+++ b/app/ro_crates/routes/post_routes.py
@@ -1,38 +1,23 @@
-"""Defines post API endpoints for validating RO-Crates using their IDs from MinIO."""
-
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
+"""POST endpoints for validating RO-Crates by stored ID or by inline metadata."""
from apiflask import APIBlueprint, Schema
-from apiflask.fields import String, Boolean
-from marshmallow.fields import Nested
+from apiflask.fields import String
from flask import Response, current_app
from app.services.validation_service import (
queue_ro_crate_validation_task,
- queue_ro_crate_metadata_validation_task,
+ run_metadata_validation,
)
# Always-on blueprint:
post_routes_bp = APIBlueprint("post_routes", __name__)
-# MinIO blueprint. Only registered when MINIO_ENABLED is true
+# Store-backed blueprint. Only registered when storage is enabled
# (see app.create_app), so the ID-based routes are unreachable by default.
minio_post_routes_bp = APIBlueprint("minio_post_routes", __name__)
-class MinioConfig(Schema):
- endpoint = String(required=True)
- accesskey = String(required=True)
- secret = String(required=True)
- ssl = Boolean(required=True)
- bucket = String(required=True)
-
-
class ValidateCrate(Schema):
- minio_config = Nested(MinioConfig, required=True)
- root_path = String(required=False)
profile_name = String(required=False)
webhook_url = String(required=False)
@@ -46,51 +31,26 @@ class ValidateJSON(Schema):
@minio_post_routes_bp.input(ValidateCrate(partial=False), location="json")
def validate_ro_crate_via_id(json_data, crate_id) -> tuple[Response, int]:
"""
- Endpoint to validate an RO-Crate using its ID from MinIO.
+ Validate a stored RO-Crate by its ID.
+
+ Storage credentials and layout are configured server-side; the request body
+ carries only optional fields.
Path Parameters:
- **crate_id**: The RO-Crate ID. _Required_.
Request Body Parameters:
- - **minio_config**: The MinIO bucket containing the RO-Crate. _Required_
- - **endpoint**: Endpoint, e.g. 'localhost:9000'
- - **accesskey**: Access key / username
- - **secret**: Secret / password
- - **ssl**: Use SSL encryption? True/False
- - **bucket**: The MinIO bucket to access
- - **root_path**: The root path containing the RO-Crate. _Optional_
- **profile_name**: The profile name for validation. _Optional_.
- - **webhook_url**: The webhook URL where validation results will be sent. _Optional_.
+ - **webhook_url**: The webhook URL where the validation result will be sent. _Optional_.
Returns:
- A tuple containing the validation task's response and an HTTP status code.
-
- Raises:
- - KeyError: If required parameters (`crate_id` or `webhook_url`) are missing.
"""
- minio_config = json_data["minio_config"]
+ profile_name = json_data.get("profile_name")
+ webhook_url = json_data.get("webhook_url")
- if "root_path" in json_data:
- root_path = json_data["root_path"]
- else:
- root_path = None
-
- if "webhook_url" in json_data:
- webhook_url = json_data["webhook_url"]
- else:
- webhook_url = None
-
- if "profile_name" in json_data:
- profile_name = json_data["profile_name"]
- else:
- profile_name = None
-
- profiles_path = current_app.config["PROFILES_PATH"]
-
- return queue_ro_crate_validation_task(
- minio_config, crate_id, root_path, profile_name, webhook_url, profiles_path
- )
+ return queue_ro_crate_validation_task(crate_id, profile_name, webhook_url)
@post_routes_bp.post("/validate_metadata")
@@ -111,14 +71,15 @@ def validate_ro_crate_metadata(json_data) -> tuple[Response, int]:
"""
crate_json = json_data["crate_json"]
+ profile_name = json_data.get("profile_name")
- if "profile_name" in json_data:
- profile_name = json_data["profile_name"]
- else:
- profile_name = None
-
- profiles_path = current_app.config["PROFILES_PATH"]
+ settings = current_app.config["SETTINGS"]
- return queue_ro_crate_metadata_validation_task(
- crate_json, profile_name, profiles_path=profiles_path
+ return run_metadata_validation(
+ crate_json,
+ profile_name,
+ profiles_path=settings.profiles_path,
+ extra_profiles_path=settings.extra_profiles_path,
+ cache_path=settings.cache_path,
+ offline=settings.validation_offline,
)
diff --git a/app/services/logging_service.py b/app/services/logging_service.py
index 94c0dc0..ccd8efe 100644
--- a/app/services/logging_service.py
+++ b/app/services/logging_service.py
@@ -1,19 +1,89 @@
-"""Logging service for the application."""
-
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
+"""Structured JSON logging with request IDs and secret redaction."""
+import json
import logging
+import uuid
+from contextvars import ContextVar
+from typing import Iterable, Optional
+
+# Per-request correlation ID, readable from any logging call in the same context
+# (request handler or Celery task). Defaults to "-" when unset.
+_request_id: ContextVar[str] = ContextVar("request_id", default="-")
+
+
+def new_request_id() -> str:
+    """Generate a random (version 4) UUID string to use as a request ID."""
+    return f"{uuid.uuid4()}"
+
+
+def set_request_id(request_id: Optional[str]) -> None:
+    """Bind ``request_id`` to the current context; ``None`` or ``""`` falls back to ``"-"``."""
+    _request_id.set(request_id if request_id else "-")
+
+
+def get_request_id() -> str:
+    """Return the request ID bound to the current context (``"-"`` if none was set)."""
+    return _request_id.get()
+
+
+class RequestIdFilter(logging.Filter):
+    """Stamps ``record.request_id`` with the current request ID; never drops a record."""
+    def filter(self, record: logging.LogRecord) -> bool:
+        record.request_id = get_request_id()
+        return True
-def setup_logging(level: int = logging.INFO) -> None:
+
+class RedactionFilter(logging.Filter):
+    """Masks known secret values in the formatted log message (``exc_info`` is not touched)."""
+
+    def __init__(self, secrets: Iterable[Optional[str]]):
+        super().__init__()
+        # Drop empty values; longest first, so a secret containing another is masked whole.
+        self._secrets = sorted({s for s in secrets if s}, key=len, reverse=True)
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        if self._secrets:
+            message = record.getMessage()
+            for secret in self._secrets:
+                message = message.replace(secret, "***")
+            record.msg, record.args = message, None  # already formatted: drop args
+        return True
+
+
+class JsonFormatter(logging.Formatter):
+    """Renders each log record as one JSON object on a single line."""
+
+    def format(self, record: logging.LogRecord) -> str:
+        entry = {"timestamp": self.formatTime(record), "level": record.levelname}
+        entry["logger"] = record.name
+        entry["message"] = record.getMessage()
+        # "-" stands in when no filter has stamped a request ID on the record.
+        entry["request_id"] = getattr(record, "request_id", "-")
+
+        # Exception details, when present, go in as formatted text.
+        if record.exc_info:
+            entry["exc_info"] = self.formatException(record.exc_info)
+        return json.dumps(entry)
+
+
+def setup_logging(settings=None, level: int = logging.INFO) -> None:
"""
- Configure the logging for the application.
+ Configure root logging: JSON output, request IDs, and secret redaction.
- :param level: The logging level to set. Defaults to INFO.
+ :param settings: Optional Settings; its credentials are redacted from logs.
+ :param level: The logging level to set.
"""
- logging.basicConfig(
- level=level,
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
- )
+ secrets = []
+ if settings is not None:
+ secrets = [settings.s3_secret_key, settings.s3_access_key]
+
+ handler = logging.StreamHandler()
+ handler.setFormatter(JsonFormatter())
+ handler.addFilter(RequestIdFilter())
+ handler.addFilter(RedactionFilter(secrets))
+
+ root = logging.getLogger()
+ root.handlers.clear()
+ root.addHandler(handler)
+ root.setLevel(level)
diff --git a/app/services/validation_service.py b/app/services/validation_service.py
index 37c5c05..a9f407c 100644
--- a/app/services/validation_service.py
+++ b/app/services/validation_service.py
@@ -1,136 +1,120 @@
-"""Service methods to queue RO-Crates for validation using the CRS4 validator and Celery."""
+"""Service layer for RO-Crate validation requests."""
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
-
-import logging
import json
+import logging
-from flask import jsonify, Response
-
-from app.tasks.validation_tasks import (
- process_validation_task_by_id,
- process_validation_task_by_metadata,
- return_ro_crate_validation,
- check_ro_crate_exists,
- check_validation_exists
- )
+from flask import Response, current_app, jsonify
+from app.crates.ids import validate_crate_id
+from app.crates.layout import result_key
+from app.crates.resolver import resolve_crate
+from app.storage.errors import ObjectNotFound
+from app.storage.s3 import S3Backend
+from app.tasks.validation_tasks import process_validation_task_by_id
from app.utils.config import InvalidAPIUsage
-from app.utils.minio_utils import get_minio_client
-
+from app.validation.results import ValidationStatus
+from app.validation.runner import validate_metadata
logger = logging.getLogger(__name__)
+def _build_storage() -> S3Backend:
+    """Create an :class:`S3Backend` from the app's ``SETTINGS`` config entry."""
+    # Reads ``current_app``, so an application context must be active.
+    return S3Backend.from_settings(current_app.config["SETTINGS"])
+
+
def queue_ro_crate_validation_task(
- minio_config, crate_id, root_path=None, profile_name=None, webhook_url=None,
- profiles_path=None
+ crate_id: str, profile_name=None, webhook_url=None
) -> tuple[Response, int]:
"""
- Queues an RO-Crate for validation with Celery.
+ Resolve a crate by ID and queue it for asynchronous validation.
+
+ Credentials and layout are server-side; the request carries only the ID and
+ optional profile/webhook. Resolution happens before queueing so a bad or
+ missing crate is reported immediately. ``InvalidCrateId`` / ``CrateNotFound``
+ / ``AmbiguousCrate`` / ``StorageError`` propagate to the app error handlers.
- :param minio_config: Access settings for Minio instance containing the RO-Crate.
:param crate_id: The ID of the RO-Crate to validate.
- :param root_path: The root path containing the RO-Crate.
:param profile_name: The profile to validate against.
- :param webhook_url: The URL to POST the validation results to.
- :return: A tuple containing a JSON response and an HTTP status code.
- :raises: Exception: If an error occurs whilst queueing the task.
+ :param webhook_url: The URL to POST the validation result to.
+ :return: A JSON response and HTTP status code.
"""
+ settings = current_app.config["SETTINGS"]
+ storage = _build_storage()
- logging.info(f"Processing: {crate_id}, {profile_name}, {webhook_url}")
- logging.info(f"Minio Bucket: {minio_config['bucket']}; Root path: {root_path}")
-
- minio_client = get_minio_client(minio_config)
-
- if check_ro_crate_exists(minio_client, minio_config["bucket"], crate_id, root_path):
- logging.info("RO-Crate exists")
- else:
- logging.info("RO-Crate does not exist")
- raise InvalidAPIUsage(f"No RO-Crate with prefix: {crate_id}", 400)
-
- try:
- process_validation_task_by_id.delay(minio_config, crate_id, root_path,
- profile_name, webhook_url, profiles_path)
- return jsonify({"message": "Validation in progress"}), 202
+ # Raises if the crate is missing/ambiguous/invalid -> handled as 4xx.
+ resolve_crate(storage, crate_id, settings.s3_crate_prefix)
- except Exception as e:
- return jsonify({"error": str(e)}), 500
+ process_validation_task_by_id.delay(crate_id, profile_name, webhook_url)
+ return jsonify({"message": "Validation in progress"}), 202
-def queue_ro_crate_metadata_validation_task(
- crate_json: str, profile_name=None, webhook_url=None, profiles_path=None
+def run_metadata_validation(
+ crate_json: str,
+ profile_name=None,
+ profiles_path=None,
+ extra_profiles_path=None,
+ cache_path=None,
+ offline=False,
) -> tuple[Response, int]:
"""
- Queues an RO-Crate for validation with Celery.
+ Validate RO-Crate metadata synchronously and return the result inline.
- :param crate_id: The ID of the RO-Crate to validate.
+ Metadata-only validation is fast and stateless, so it runs in the request
+ rather than via Celery. Returns 200 for a valid/invalid outcome and 422
+ when the input cannot be validated (bad JSON, empty, or a validator error).
+
+ :param crate_json: The RO-Crate JSON-LD metadata, as a string.
:param profile_name: The profile to validate against.
- :param webhook_url: The URL to POST the validation results to.
- :param profiles_path: A path to the profile definition directory.
- :return: A tuple containing a JSON response and an HTTP status code.
- :raises: Exception: If an error occurs whilst queueing the task.
+ :param profiles_path: A profiles directory that replaces the bundled set.
+ :param extra_profiles_path: A profiles directory added to the bundled set.
+ :param cache_path: HTTP cache location for the validator.
+ :param offline: Validate using only the cache (no network).
+ :return: A JSON response and HTTP status code.
"""
-
- logging.info(f"Processing: {crate_json}, {profile_name}, {webhook_url}")
-
if not crate_json:
return jsonify({"error": "Missing required parameter: crate_json"}), 422
try:
- json_dict = json.loads(crate_json)
- except json.decoder.JSONDecodeError as err:
- return jsonify({"error": f"Required parameter crate_json is not valid JSON: {err}"}), 422
- else:
- if len(json_dict) == 0:
- return jsonify({"error": "Required parameter crate_json is empty"}), 422
+ metadata = json.loads(crate_json)
+ except json.JSONDecodeError as err:
+ return jsonify({"error": f"crate_json is not valid JSON: {err}"}), 422
+
+ if not metadata:
+ return jsonify({"error": "Required parameter crate_json is empty"}), 422
+
+ outcome = validate_metadata(
+ metadata,
+ profile_name=profile_name,
+ profiles_path=profiles_path,
+ extra_profiles_path=extra_profiles_path,
+ cache_path=cache_path,
+ offline=offline,
+ )
+ status_code = 422 if outcome.status is ValidationStatus.ERROR else 200
+ return jsonify(outcome.to_dict()), status_code
- try:
- result = process_validation_task_by_metadata.delay(
- crate_json,
- profile_name,
- webhook_url,
- profiles_path
- )
- if webhook_url:
- return jsonify({"message": "Validation in progress"}), 202
- else:
- return jsonify({"result": result.get()}), 200
-
- except Exception as e:
- return jsonify({"error": str(e)}), 500
-
-
-def get_ro_crate_validation_task(
- minio_config: dict,
- crate_id: str,
- root_path: str,
-) -> tuple[Response, int]:
- """
- Retrieves an RO-Crate validation result.
- :param minio_config: Access settings for Minio instance containing the RO-Crate.
- :param crate_id: The ID of the RO-Crate to validate.
- :param root_path: The root path containing the RO-Crate.
- :return: A tuple containing a JSON response and an HTTP status code.
- :raises Exception: If an error occurs whilst retreiving validation result
+def get_ro_crate_validation_task(crate_id: str) -> tuple[Response, int]:
"""
- logging.info(f"Retrieving validation for: {crate_id}")
+ Return a crate's stored validation result.
- minio_client = get_minio_client(minio_config)
+ Reads the result object from the results prefix. A missing result yields a
+ 404; the stored outcome (including a persisted ``error`` outcome) is returned
+ as-is otherwise.
- if check_ro_crate_exists(minio_client, minio_config["bucket"], crate_id, root_path):
- logging.info("RO-Crate exists")
- else:
- logging.info("RO-Crate does not exist")
- raise InvalidAPIUsage(f"No RO-Crate with prefix: {crate_id}", 400)
+ :param crate_id: The ID of the RO-Crate whose result is requested.
+ :return: A JSON response and HTTP status code.
+ """
+ settings = current_app.config["SETTINGS"]
+ storage = _build_storage()
+
+ validate_crate_id(crate_id) # raises InvalidCrateId -> 400
- if check_validation_exists(minio_client, minio_config["bucket"], crate_id, root_path):
- logging.info("Validation result exists")
- else:
- logging.info("Validation does not exist")
- raise InvalidAPIUsage(f"No validation result yet for RO-Crate: {crate_id}", 400)
+ try:
+ data = storage.get_bytes(result_key(settings.s3_results_prefix, crate_id))
+ except ObjectNotFound:
+ raise InvalidAPIUsage(f"No validation result yet for RO-Crate: {crate_id}", 404)
- return return_ro_crate_validation(minio_client, minio_config["bucket"], crate_id, root_path), 200
+ return jsonify(json.loads(data)), 200
diff --git a/app/storage/__init__.py b/app/storage/__init__.py
new file mode 100644
index 0000000..9db1e79
--- /dev/null
+++ b/app/storage/__init__.py
@@ -0,0 +1,11 @@
+"""Object-storage abstraction.
+
+The rest of the application depends only on the :class:`StorageBackend`
+protocol, rather than a specific client, so backends (S3/MinIO/RustFS) and tests
+are interchangeable.
+"""
+
+from app.storage.base import ObjectStat, StorageBackend
+from app.storage.errors import ObjectNotFound, StorageError
+
+__all__ = ["StorageBackend", "ObjectStat", "StorageError", "ObjectNotFound"]
diff --git a/app/storage/base.py b/app/storage/base.py
new file mode 100644
index 0000000..0e545fb
--- /dev/null
+++ b/app/storage/base.py
@@ -0,0 +1,46 @@
+"""The storage backend protocol and shared value types."""
+
+from dataclasses import dataclass
+from typing import List, Optional, Protocol, runtime_checkable
+
+
+@dataclass(frozen=True)
+class ObjectStat:
+    """Immutable metadata for one stored object: its key and its size in bytes."""
+
+    key: str
+    size: int
+
+
+@runtime_checkable
+class StorageBackend(Protocol):
+    """Minimal object-storage interface the application depends on.
+
+    Implementations translate backend-specific failures into
+    :class:`~app.storage.errors.StorageError` (and ``ObjectNotFound`` for a
+    missing key), so callers never handle backend-specific exceptions.
+    """
+
+    def stat(self, key: str) -> ObjectStat:
+        """Return the ``ObjectStat`` for ``key``, or raise ``ObjectNotFound`` if absent."""
+        ...
+
+    def get_bytes(self, key: str) -> bytes:
+        """Return the full content stored at ``key``, or raise ``ObjectNotFound``."""
+        ...
+
+    def put_bytes(self, key: str, data: bytes, content_type: Optional[str] = None) -> None:
+        """Store ``data`` at ``key``, overwriting any existing object."""
+        ...
+
+    def list(self, prefix: str) -> List[str]:
+        """Return, in sorted order, every key that starts with ``prefix``."""
+        ...
+
+    def download_tree(self, prefix: str, dest_dir: str) -> None:
+        """Download every object under ``prefix`` into ``dest_dir``.
+
+        Each key is written at its path relative to ``prefix``, so the
+        directory structure is recreated beneath ``dest_dir``.
+        """
+        ...
diff --git a/app/storage/errors.py b/app/storage/errors.py
new file mode 100644
index 0000000..64dde9e
--- /dev/null
+++ b/app/storage/errors.py
@@ -0,0 +1,13 @@
+"""Storage-layer exceptions, decoupled from any specific client."""
+
+
+class StorageError(Exception):
+    """Root of the object-storage exception hierarchy (``ObjectNotFound`` derives from it)."""
+
+
+class ObjectNotFound(StorageError):
+    """Signals that no object exists at the requested key (kept on ``.key``)."""
+
+    def __init__(self, key: str):
+        self.key = key
+        super().__init__(f"Object not found: {key}")
diff --git a/app/storage/memory.py b/app/storage/memory.py
new file mode 100644
index 0000000..bfc9ca6
--- /dev/null
+++ b/app/storage/memory.py
@@ -0,0 +1,42 @@
+"""An in-memory storage backend for tests and local use."""
+
+import os
+from typing import Dict, List, Optional
+
+from app.storage.base import ObjectStat
+from app.storage.errors import ObjectNotFound
+
+
+class InMemoryStorage:
+    """A dict-backed :class:`StorageBackend` for tests and local development without S3."""
+
+    def __init__(self) -> None:
+        self._objects: Dict[str, bytes] = {}  # object key -> stored bytes
+
+    def stat(self, key: str) -> ObjectStat:
+        return ObjectStat(key=key, size=len(self.get_bytes(key)))  # missing -> ObjectNotFound
+
+    def get_bytes(self, key: str) -> bytes:
+        """Return the stored bytes; raise ``ObjectNotFound`` for a missing key."""
+        try:
+            return self._objects[key]
+        except KeyError:
+            raise ObjectNotFound(key) from None  # hide the internal KeyError
+
+    def put_bytes(self, key: str, data: bytes, content_type: Optional[str] = None) -> None:
+        self._objects[key] = data  # ``content_type`` is accepted but not stored
+
+    def list(self, prefix: str) -> List[str]:
+        return sorted(key for key in self._objects if key.startswith(prefix))
+
+    def download_tree(self, prefix: str, dest_dir: str) -> None:
+        """Write every object under ``prefix`` beneath ``dest_dir``, keeping sub-paths."""
+        for key in self.list(prefix):
+            relative_path = key[len(prefix) :]
+            # Directory markers (empty or "/"-terminated paths) name no file: skip them.
+            if not relative_path or relative_path.endswith("/"):
+                continue
+            local_path = os.path.join(dest_dir, *relative_path.split("/"))
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            with open(local_path, "wb") as handle:
+                handle.write(self._objects[key])
diff --git a/app/storage/s3.py b/app/storage/s3.py
new file mode 100644
index 0000000..586abf8
--- /dev/null
+++ b/app/storage/s3.py
@@ -0,0 +1,111 @@
+"""A boto3-backed StorageBackend for any S3-compatible object store.
+
+Works against AWS S3, MinIO, RustFS, Ceph, and similar via an explicit
+``endpoint_url``. Backend-specific failures are translated into the storage
+error vocabulary so callers never see botocore exceptions.
+"""
+
+import os
+from typing import List, Optional
+
+import boto3
+from botocore.exceptions import BotoCoreError, ClientError
+
+from app.storage.base import ObjectStat
+from app.storage.errors import ObjectNotFound, StorageError
+
+# botocore error codes that mean "this key isn't here", as opposed to a
+# transport/auth/bucket failure.
+_NOT_FOUND_CODES = {"404", "NoSuchKey"}
+
+
+class S3Backend:
+    """StorageBackend implementation over a boto3 S3 client.
+
+    Failures raised by the client calls below are re-raised as
+    :class:`StorageError` (or its subclass :class:`ObjectNotFound`), chained
+    to the original botocore exception.
+    """
+
+    def __init__(self, client, bucket: str) -> None:
+        """Wrap an existing S3 ``client`` and remember the target ``bucket``."""
+        self._client = client
+        self.bucket = bucket
+
+    @classmethod
+    def from_settings(cls, settings) -> "S3Backend":
+        """Build a backend from validated :class:`Settings`.
+
+        A bare ``host[:port]`` endpoint is prefixed with the scheme implied by
+        ``s3_use_ssl``. An endpoint that already carries a scheme is used as
+        given, so it is never double-prefixed.
+
+        :raises StorageError: If no endpoint is configured.
+        """
+        endpoint = settings.s3_endpoint
+        if not endpoint:
+            # Without this guard the URL would silently become "http://None".
+            raise StorageError("S3 endpoint is not configured")
+        if "://" not in endpoint:
+            scheme = "https" if settings.s3_use_ssl else "http"
+            endpoint = f"{scheme}://{endpoint}"
+        client = boto3.client(
+            "s3",
+            endpoint_url=endpoint,
+            aws_access_key_id=settings.s3_access_key,
+            aws_secret_access_key=settings.s3_secret_key,
+            region_name=settings.s3_region or "us-east-1",
+            use_ssl=settings.s3_use_ssl,
+        )
+        return cls(client, settings.s3_bucket)
+
+    def stat(self, key: str) -> ObjectStat:
+        """Return the key and ``ContentLength`` of an object via ``head_object``.
+
+        :raises ObjectNotFound: If the store reports the key as missing.
+        :raises StorageError: For any other client failure.
+        """
+        try:
+            response = self._client.head_object(Bucket=self.bucket, Key=key)
+        except ClientError as error:
+            raise self._translate(error, key) from error
+        except BotoCoreError as error:
+            raise StorageError(f"Storage error for {key}: {error}") from error
+        return ObjectStat(key=key, size=response["ContentLength"])
+
+    def get_bytes(self, key: str) -> bytes:
+        """Return the full body of the object stored under ``key``.
+
+        :raises ObjectNotFound: If the store reports the key as missing.
+        :raises StorageError: For any other client failure.
+        """
+        try:
+            response = self._client.get_object(Bucket=self.bucket, Key=key)
+            # Reading stays inside the try so a failure while reading the
+            # body is translated as well.
+            return response["Body"].read()
+        except ClientError as error:
+            raise self._translate(error, key) from error
+        except BotoCoreError as error:
+            raise StorageError(f"Storage error for {key}: {error}") from error
+
+    def put_bytes(self, key: str, data: bytes, content_type: Optional[str] = None) -> None:
+        """Store ``data`` under ``key``; ``content_type`` is sent only if given.
+
+        :raises StorageError: If the upload fails.
+        """
+        kwargs = {"Bucket": self.bucket, "Key": key, "Body": data}
+        if content_type:
+            kwargs["ContentType"] = content_type
+        try:
+            self._client.put_object(**kwargs)
+        except (ClientError, BotoCoreError) as error:
+            raise StorageError(f"Failed to store {key}: {error}") from error
+
+    def list(self, prefix: str) -> List[str]:
+        """Return, in sorted order, every key under ``prefix`` (all pages).
+
+        :raises StorageError: If listing fails.
+        """
+        keys: List[str] = []
+        try:
+            paginator = self._client.get_paginator("list_objects_v2")
+            for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
+                # A page without "Contents" contributes no keys.
+                keys.extend(obj["Key"] for obj in page.get("Contents", []))
+        except (ClientError, BotoCoreError) as error:
+            raise StorageError(f"Failed to list {prefix}: {error}") from error
+        return sorted(keys)
+
+    def download_tree(self, prefix: str, dest_dir: str) -> None:
+        """Download every object under ``prefix`` into ``dest_dir``.
+
+        The part of each key after ``prefix`` becomes the relative file path.
+        Keys that name no file (the prefix itself, or a "folder marker"
+        ending in ``/``) are skipped.
+
+        :raises ValueError: If a key would resolve to a path outside
+            ``dest_dir`` (for example via ``..`` segments).
+        :raises StorageError: If listing or fetching an object fails.
+        """
+        dest_root = os.path.realpath(dest_dir)
+        for key in self.list(prefix):
+            relative_path = key[len(prefix) :]
+            # Nothing to write: opening such a path would target a directory.
+            if not relative_path or relative_path.endswith("/"):
+                continue
+            local_path = os.path.realpath(
+                os.path.join(dest_root, *relative_path.split("/"))
+            )
+            # Object keys are external input; never follow them out of dest_dir.
+            if os.path.commonpath([dest_root, local_path]) != dest_root:
+                raise ValueError(f"Refusing to write outside {dest_dir}: {key}")
+            os.makedirs(os.path.dirname(local_path), exist_ok=True)
+            with open(local_path, "wb") as handle:
+                handle.write(self.get_bytes(key))
+
+    def health_check(self) -> None:
+        """Verify the bucket is reachable; raise ``StorageError`` if not."""
+        try:
+            self._client.head_bucket(Bucket=self.bucket)
+        except (ClientError, BotoCoreError) as error:
+            raise StorageError(f"Bucket {self.bucket} not reachable: {error}") from error
+
+    @staticmethod
+    def _translate(error: ClientError, key: str) -> StorageError:
+        """Map a botocore ClientError to the storage error vocabulary.
+
+        Only a missing *object* becomes ``ObjectNotFound``; a missing bucket or
+        an auth failure is an infrastructure problem and stays a ``StorageError``.
+        ``head_object`` reports a missing key with code ``"404"`` (no error body);
+        ``get_object`` reports ``"NoSuchKey"``.
+
+        :param error: The client error to classify.
+        :param key: The object key the failed call was about.
+        :return: The exception for the caller to raise; nothing is raised here.
+        """
+        code = error.response.get("Error", {}).get("Code", "")
+        if code in _NOT_FOUND_CODES:
+            return ObjectNotFound(key)
+        return StorageError(f"Storage error for {key}: {error}")
diff --git a/app/tasks/validation_tasks.py b/app/tasks/validation_tasks.py
index d0d6925..48a8608 100644
--- a/app/tasks/validation_tasks.py
+++ b/app/tasks/validation_tasks.py
@@ -1,311 +1,134 @@
-"""Tasks and helper methods for processing RO-Crate validation."""
+"""Store-backed RO-Crate validation: orchestration and the Celery task.
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
+The orchestration lives in :func:`run_validation_job`, a plain function that is
+fully testable with an in-memory storage backend. The Celery task is a thin
+wrapper that builds the backend from server-side settings and adds retries for
+transient storage failures.
+
+Stages are ordered fetch -> validate -> persist -> webhook, so a failed store
+write can never trigger a "success" webhook. The outcome (including ``error``
+outcomes) is always persisted, so a later GET reflects what happened.
+"""
import logging
import os
import shutil
-import json
+import tempfile
+from datetime import datetime, timezone
from typing import Optional
-from rocrate_validator import services
-from rocrate_validator.models import ValidationResult
-
from app.celery_worker import celery
-from app.utils.minio_utils import (
- fetch_ro_crate_from_minio,
- update_validation_status_in_minio,
- get_validation_status_from_minio,
- get_minio_client,
- find_rocrate_object_on_minio,
- find_validation_object_on_minio,
+from app.crates.ids import InvalidCrateId
+from app.crates.layout import result_key
+from app.crates.resolver import (
+ AmbiguousCrate,
+ CrateNotFound,
+ ResolvedCrate,
+ resolve_crate,
)
+from app.storage.base import StorageBackend
+from app.storage.errors import StorageError
+from app.storage.s3 import S3Backend
+from app.utils.config import Settings
from app.utils.webhook_utils import send_webhook_notification
+from app.validation.results import ValidationOutcome
+from app.validation.runner import validate_crate_path
logger = logging.getLogger(__name__)
-@celery.task
-def process_validation_task_by_id(
- minio_config: dict,
- crate_id: str,
- root_path: str,
- profile_name: str | None,
- webhook_url: str | None,
- profiles_path: str | None,
-) -> None:
- """
- Background task to process the RO-Crate validation by ID.
-
- :param minio_config: The MinIO configuration.
- :param crate_id: The ID of the RO-Crate to validate.
- :param root_path: The root path containing the RO-Crate.
- :param profile_name: The name of the validation profile to use. Defaults to None.
- :param webhook_url: The webhook URL to send notifications to. Defaults to None.
- :raises Exception: If an error occurs during the validation process.
-
- """
-
- # TODO: Split try statements: (1) fetch and validate; (2) write to minio; (3) webhook
-
- minio_client = get_minio_client(minio_config)
-
- file_path = None
-
- try:
- # Fetch the RO-Crate from MinIO using the provided ID:
- file_path = fetch_ro_crate_from_minio(
- minio_client, minio_config["bucket"], crate_id, root_path
- )
-
- logging.info(f"Processing validation task for {file_path}")
-
- # Perform validation:
- validation_result = perform_ro_crate_validation(
- file_path, profile_name, profiles_path=profiles_path
- )
-
- if isinstance(validation_result, str):
- logging.error(f"Validation failed: {validation_result}")
- # TODO: Send webhook with failure notification
- raise Exception(f"Validation failed: {validation_result}")
-
- if not validation_result.has_issues():
- logging.info(f"RO Crate {crate_id} is valid.")
- else:
- logging.info(f"RO Crate {crate_id} is invalid.")
-
- # Update the validation status in MinIO:
- update_validation_status_in_minio(
- minio_client,
- minio_config["bucket"],
- crate_id,
- root_path,
- validation_result.to_json(),
- )
+def _utcnow_iso() -> str:
+    """Return the current UTC time as a timezone-aware ISO 8601 string."""
+    now_utc = datetime.now(tz=timezone.utc)
+    return now_utc.isoformat()
- # TODO: Prepare the data to send to the webhook, and send the webhook notification.
- if webhook_url:
- send_webhook_notification(webhook_url, validation_result.to_json())
+def _download_crate(storage: StorageBackend, resolved: ResolvedCrate, temp_dir: str) -> str:
+ """Download a resolved crate into ``temp_dir`` and return its local path.
+
+ A zipped crate is written as a single ``<crate_id>.zip`` file. Any other
+ crate is fetched with ``download_tree`` into a ``<crate_id>`` directory.
+
+ :param storage: Backend the crate is read from.
+ :param resolved: The crate to fetch (its ``key``, ``crate_id`` and ``is_zip``).
+ :param temp_dir: Existing directory that receives the download.
+ :return: Path of the zip file, or of the crate directory.
+ """
+ if resolved.is_zip:
+ local_path = os.path.join(temp_dir, f"{resolved.crate_id}.zip")
+ with open(local_path, "wb") as handle:
+ handle.write(storage.get_bytes(resolved.key))
+ return local_path
- except Exception as e:
- logging.error(f"Error processing validation task: {e}")
+ # Not a zip: fetch everything under the resolved key into its own folder.
+ dest = os.path.join(temp_dir, resolved.crate_id)
+ os.makedirs(dest, exist_ok=True)
+ storage.download_tree(resolved.key, dest)
+ return dest
- # TODO: Should we write error messages to the minio instance too?
- # Send failure notification via webhook
- if webhook_url:
- error_data = {"profile_name": profile_name, "error": str(e)}
- send_webhook_notification(webhook_url, error_data)
-
- finally:
- # Clean up the temporary file if it was created:
- if file_path and os.path.exists(file_path):
- if os.path.isfile(file_path):
- os.remove(file_path)
- elif os.path.isdir(file_path):
- shutil.rmtree(file_path)
-
-
-@celery.task
-def process_validation_task_by_metadata(
- crate_json: str,
- profile_name: str | None,
- webhook_url: str | None,
- profiles_path: Optional[str] = None,
-) -> ValidationResult | str:
- """
- Background task to process the RO-Crate validation for a given json metadata string.
-
- :param crate_json: A string containing the RO-Crate JSON metadata to validate.
- :param profile_name: The name of the validation profile to use. Defaults to None.
- :param webhook_url: The webhook URL to send notifications to. Defaults to None.
- :param profiles_path: The path to the profiles definition directory. Defaults to None.
- :raises Exception: If an error occurs during the validation process.
-
- :todo: Replace the Crate ID with a more comprehensive system, and replace profile name with URI.
+def run_validation_job(
+ storage: StorageBackend,
+ crate_id: str,
+ settings: Settings,
+ profile_name: Optional[str] = None,
+ webhook_url: Optional[str] = None,
+ created_at: Optional[str] = None,
+) -> ValidationOutcome:
+ """Fetch, validate, persist, and (optionally) notify, for one crate.
+
+ Resolution/validation problems become ``error`` outcomes that are persisted
+ like any other. A :class:`StorageError` (transient infrastructure failure)
+ propagates so the caller can retry.
+
+ :param storage: Backend used to resolve and download the crate and to store the result.
+ :param crate_id: Identifier of the crate to validate.
+ :param settings: Supplies the crate/results prefixes and the validator options.
+ :param profile_name: Optional validation profile passed to the validator.
+ :param webhook_url: If set, the outcome is sent here after it has been persisted.
+ :param created_at: Timestamp recorded on the outcome; defaults to the current UTC time.
+ :return: The outcome that was persisted.
"""
-
+ # Stamp the job once so an error outcome and a validation outcome are
+ # given the same timestamp.
+ created_at = created_at or _utcnow_iso()
+ temp_dir = tempfile.mkdtemp()
try:
- logging.info("Processing validation task for provided metadata string")
-
- # Perform validation:
- validation_result = perform_metadata_validation(
- crate_json, profile_name, profiles_path=profiles_path
- )
-
- if isinstance(validation_result, str):
- logging.error(f"Validation failed: {validation_result}")
- # TODO: Send webhook with failure notification
- raise Exception(f"Validation failed: {validation_result}")
-
- if not validation_result.has_issues():
- logging.info("RO Crate metadata is valid.")
+ try:
+ resolved = resolve_crate(storage, crate_id, settings.s3_crate_prefix)
+ local_path = _download_crate(storage, resolved, temp_dir)
+ except (CrateNotFound, AmbiguousCrate, InvalidCrateId) as error:
+ # These are recorded as an error outcome instead of being raised.
+ logger.error("Cannot validate crate %s: %s", crate_id, error)
+ outcome = ValidationOutcome.from_error(
+ str(error), profile=profile_name, created_at=created_at
+ )
else:
- logging.info("RO Crate metadata is invalid.")
-
- if webhook_url:
- send_webhook_notification(webhook_url, validation_result.to_json())
-
- except Exception as e:
- logging.error(f"Error processing validation task: {e}")
-
- # Send failure notification via webhook
- error_data = {"profile_name": profile_name, "error": str(e)}
- if webhook_url:
- send_webhook_notification(webhook_url, error_data)
-
+ outcome = validate_crate_path(
+ local_path,
+ profile_name=profile_name,
+ profiles_path=settings.profiles_path,
+ extra_profiles_path=settings.extra_profiles_path,
+ cache_path=settings.cache_path,
+ offline=settings.validation_offline,
+ created_at=created_at,
+ )
finally:
- if isinstance(validation_result, str):
- return validation_result
- else:
- return validation_result.to_json()
+ # Always discard the scratch directory; cleanup failures are ignored.
+ shutil.rmtree(temp_dir, ignore_errors=True)
+ # Persist before notifying, so a write failure cannot precede a webhook.
+ # NOTE(review): after an InvalidCrateId the result key is still built from
+ # the rejected crate_id - confirm result_key validates or normalises it.
+ storage.put_bytes(
+ result_key(settings.s3_results_prefix, crate_id),
+ outcome.to_json().encode("utf-8"),
+ content_type="application/json",
+ )
-def perform_ro_crate_validation(
- file_path: str,
- profile_name: str | None,
- skip_checks_list: Optional[list] = None,
- profiles_path: Optional[str] = None,
-) -> ValidationResult | str:
- """
- Validates an RO-Crate using the provided file path and profile name.
-
- :param file_path: The path to the RO-Crate file to validate
- :param profile_name: The name of the validation profile to use. Defaults to None. If None, the CRS4 validator will
- attempt to determine the profile.
- :param profiles_path: The path to the profiles definition directory
- :param skip_checks_list: A list of checks to skip, if needed
- :return: The validation result.
- :raises Exception: If an error occurs during the validation process.
- """
-
- try:
- logging.info(f"Validating {file_path} with profile {profile_name}")
-
- full_file_path = os.path.join(
- os.path.dirname(
- os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- ),
- file_path,
- )
- settings = services.ValidationSettings(
- rocrate_uri=full_file_path,
- **({"profile_identifier": profile_name} if profile_name else {}),
- **({"skip_checks": skip_checks_list} if skip_checks_list else {}),
- **({"profiles_path": profiles_path} if profiles_path else {}),
- )
-
- return services.validate(settings)
-
- except Exception as e:
- logging.error(f"Unexpected error during validation: {e}")
- return str(e)
-
-
-def perform_metadata_validation(
- crate_json: str,
- profile_name: str | None,
- skip_checks_list: Optional[list] = None,
- profiles_path: Optional[str] = None,
-) -> ValidationResult | str:
- """
- Validates only RO-Crate metadata provided as a json string.
-
- :param crate_json: The JSON string containing the metadata
- :param profile_name: The name of the validation profile to use. Defaults to None. If None, the CRS4 validator will
- attempt to determine the profile.
- :param profiles_path: The path to the profiles definition directory
- :param skip_checks_list: A list of checks to skip, if needed
- :return: The validation result.
- :raises Exception: If an error occurs during the validation process.
- """
-
- try:
- logging.info(f"Validating ro-crate metadata with profile {profile_name}")
-
- settings = services.ValidationSettings(
- **({"metadata_only": True}),
- **({"metadata_dict": json.loads(crate_json)}),
- **({"profile_identifier": profile_name} if profile_name else {}),
- **({"skip_checks": skip_checks_list} if skip_checks_list else {}),
- **({"profiles_path": profiles_path} if profiles_path else {}),
- )
-
- return services.validate(settings)
-
- except Exception as e:
- logging.error(f"Unexpected error during validation: {e}")
- return str(e)
-
-
-def check_ro_crate_exists(
- minio_client: object,
- bucket_name: str,
- crate_id: str,
- root_path: str,
-) -> bool:
- """
- Checks for the existence of an RO-Crate using the provided Crate ID.
-
- :param minio_client: The MinIO client
- :param bucket_name: The MinIO bucket containing the RO-Crate.
- :param crate_id: The ID of the RO-Crate to validate.
- :param root_path: The root path containing the RO-Crate.
- :return: Boolean indicating existence
- """
-
- logging.info(f"Checking for existence of RO-Crate {crate_id}")
-
- if find_rocrate_object_on_minio(crate_id, minio_client, bucket_name, root_path):
- return True
- else:
- return False
-
-
-def check_validation_exists(
- minio_client: object,
- bucket_name: str,
- crate_id: str,
- root_path: str,
-) -> bool:
- """
- Checks for the existence of a validation result using the provided Crate ID.
-
- :param minio_client: The MinIO client
- :param minio_bucket: The MinIO bucket containing the RO-Crate.
- :param crate_id: The ID of the RO-Crate to validate.
- :param root_path: The root path containing the RO-Crate.
- :return: Boolean indicating existence
- """
-
- logging.info(f"Checking for existence of RO-Crate {crate_id}")
+ if webhook_url:
+ send_webhook_notification(webhook_url, outcome.to_dict())
- if find_validation_object_on_minio(crate_id, minio_client, bucket_name, root_path):
- return True
- else:
- return False
+ return outcome
-def return_ro_crate_validation(
- minio_client: object,
- bucket_name: str,
+@celery.task(
+ autoretry_for=(StorageError,),
+ max_retries=3,
+ retry_backoff=True,
+ retry_backoff_max=60,
+)
+def process_validation_task_by_id(
crate_id: str,
- root_path: str,
-) -> dict | str:
- """
- Retrieves the validation result for an RO-Crate using the provided Crate ID.
+ profile_name: Optional[str] = None,
+ webhook_url: Optional[str] = None,
+) -> None:
+ """Celery entry point: validate a stored crate by ID.
- :param minio_client: The MinIO client
- :param crate_id: The ID of the RO-Crate that has been validated
- :return: The validation result
+ Credentials and layout come from server-side settings (never the request),
+ so no secrets travel through the broker. Transient storage failures are
+ retried with exponential backoff.
+
+ :param crate_id: Identifier of the stored crate to validate.
+ :param profile_name: Optional validation profile name.
+ :param webhook_url: Optional URL that is notified with the outcome.
"""
-
- logging.info(f"Fetching validation result for RO-Crate {crate_id}")
-
- return get_validation_status_from_minio(
- minio_client, bucket_name, crate_id, root_path
+ # Settings and the S3 client are rebuilt from the environment on each run.
+ settings = Settings.from_env()
+ storage = S3Backend.from_settings(settings)
+ # The returned outcome is discarded; run_validation_job persists it.
+ run_validation_job(
+ storage,
+ crate_id,
+ settings,
+ profile_name=profile_name,
+ webhook_url=webhook_url,
)
diff --git a/app/utils/config.py b/app/utils/config.py
index 465ca60..70e9d02 100644
--- a/app/utils/config.py
+++ b/app/utils/config.py
@@ -1,54 +1,103 @@
"""Configuration module for the Flask application."""
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
-
import os
+from dataclasses import dataclass
+from typing import Mapping, Optional
from celery import Celery
from flask import Flask
-def get_env(name: str, default=None, required=False):
- value = os.environ.get(name, default)
- if required and value is None:
- raise RuntimeError(f"Missing required environment variable: {name}")
- return value
-
-
-def get_bool_env(name: str, default: bool = False) -> bool:
- value = get_env(name)
- if value is None:
- return default
- return value.strip().lower() in ("true", "1", "yes", "on")
-
-
-class Config:
- """Base configuration class for the Flask application."""
+class ConfigError(RuntimeError):
+    """Signals required configuration that is missing or invalid at startup."""
- # Celery configuration:
- CELERY_BROKER_URL = get_env("CELERY_BROKER_URL", required=False)
- CELERY_RESULT_BACKEND = get_env("CELERY_RESULT_BACKEND", required=False)
- # rocrate validator configuration:
- PROFILES_PATH = get_env("PROFILES_PATH", required=False)
+_TRUE_VALUES = ("true", "1", "yes", "on")
- # Optional MinIO storage. Disabled by default - when False the
- # ID validation endpoints are not registered:
- MINIO_ENABLED = get_bool_env("MINIO_ENABLED", default=False)
-
-
-class DevelopmentConfig(Config):
- """Development configuration class."""
-
- DEBUG = True
+def _parse_bool(value: Optional[str], default: bool = False) -> bool:
+    """Interpret an environment string as a boolean.
+
+    ``None`` yields ``default``. Any other value is true only when its
+    trimmed, lower-cased text is one of ``_TRUE_VALUES``.
+    """
+    if value is not None:
+        return value.strip().lower() in _TRUE_VALUES
+    return default
-class ProductionConfig(Config):
- """Production configuration class."""
- DEBUG = False
+def _clean(value: Optional[str]) -> Optional[str]:
+    """Trim surrounding whitespace; ``None`` and blank strings become ``None``."""
+    stripped = value.strip() if value is not None else ""
+    return stripped if stripped else None
+
+
+@dataclass(frozen=True)
+class Settings:
+    """Immutable snapshot of the application's validated configuration."""
+
+    flask_env: str
+    debug: bool
+    storage_enabled: bool
+    profiles_path: Optional[str]
+    celery_broker_url: Optional[str]
+    celery_result_backend: Optional[str]
+    s3_endpoint: Optional[str]
+    s3_access_key: Optional[str]
+    s3_secret_key: Optional[str]
+    s3_region: Optional[str]
+    s3_bucket: Optional[str]
+    s3_use_ssl: bool
+    s3_crate_prefix: str
+    s3_results_prefix: str
+    extra_profiles_path: Optional[str]
+    cache_path: Optional[str]
+    validation_offline: bool
+
+    @classmethod
+    def from_env(cls, env: Optional[Mapping[str, str]] = None) -> "Settings":
+        """Read configuration from ``env`` (``os.environ`` when omitted).
+
+        :raises ConfigError: If storage is enabled but a required variable is
+            missing or blank.
+        """
+        source = os.environ if env is None else env
+
+        def text(name: str) -> Optional[str]:
+            return _clean(source.get(name))
+
+        def flag(name: str) -> bool:
+            return _parse_bool(source.get(name))
+
+        environment = text("FLASK_ENV") or "development"
+        storage_on = flag("STORAGE_ENABLED")
+
+        # Storage-backed validation needs an object store and a broker, so
+        # their variables are checked here, not at the first request.
+        if storage_on:
+            needed = (
+                "S3_ENDPOINT",
+                "S3_ACCESS_KEY",
+                "S3_SECRET_KEY",
+                "S3_BUCKET",
+                "CELERY_BROKER_URL",
+                "CELERY_RESULT_BACKEND",
+            )
+            absent = [name for name in needed if text(name) is None]
+            if absent:
+                raise ConfigError(
+                    "STORAGE_ENABLED is true but these required variables are "
+                    f"missing or blank: {', '.join(absent)}"
+                )
+
+        return cls(
+            flask_env=environment,
+            # Anything other than "production" runs with debug enabled.
+            debug=environment != "production",
+            storage_enabled=storage_on,
+            profiles_path=text("PROFILES_PATH"),
+            celery_broker_url=text("CELERY_BROKER_URL"),
+            celery_result_backend=text("CELERY_RESULT_BACKEND"),
+            s3_endpoint=text("S3_ENDPOINT"),
+            s3_access_key=text("S3_ACCESS_KEY"),
+            s3_secret_key=text("S3_SECRET_KEY"),
+            s3_region=text("S3_REGION"),
+            s3_bucket=text("S3_BUCKET"),
+            s3_use_ssl=flag("S3_USE_SSL"),
+            s3_crate_prefix=text("S3_CRATE_PREFIX") or "crates",
+            s3_results_prefix=text("S3_RESULTS_PREFIX") or "validation-results",
+            extra_profiles_path=text("EXTRA_PROFILES_PATH"),
+            cache_path=text("CACHE_PATH"),
+            validation_offline=flag("VALIDATION_OFFLINE"),
+        )
class InvalidAPIUsage(Exception):
@@ -74,13 +123,12 @@ def make_celery(app: Flask = None) -> Celery:
:param app: The Flask application to use.
:return: The Celery instance.
"""
- env = os.environ.get("FLASK_ENV", "development")
- config_cls = ProductionConfig if env == "production" else DevelopmentConfig
+ settings: Optional[Settings] = app.config.get("SETTINGS") if app else None
celery = Celery(
app.import_name if app else __name__,
- broker=config_cls.CELERY_BROKER_URL,
- backend=config_cls.CELERY_RESULT_BACKEND,
+ broker=settings.celery_broker_url if settings else None,
+ backend=settings.celery_result_backend if settings else None,
)
if app:
diff --git a/app/utils/minio_utils.py b/app/utils/minio_utils.py
deleted file mode 100644
index 1612f90..0000000
--- a/app/utils/minio_utils.py
+++ /dev/null
@@ -1,323 +0,0 @@
-"""Utility methods for interacting with MinIO."""
-
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
-
-import json
-import logging
-import os
-import tempfile
-
-from io import BytesIO
-from minio import Minio, S3Error
-from app.utils.config import InvalidAPIUsage
-
-
-logger = logging.getLogger(__name__)
-
-
-def fetch_ro_crate_from_minio(minio_client: object, minio_bucket: str, crate_id: str, root_path: str) -> str:
- """
- Fetches an RO-Crate from MinIO based on the crate ID. Downloads the crate as a file and returns local file path.
-
- :param minio_client: The MinIO client
- :param minio_bucket: The MinIO bucket containing the RO-Crate.
- :param crate_id: The ID of the RO-Crate to fetch from MinIO.
- :param root_path: The root path containing the RO-Crate.
- :return: The local file path where the RO-Crate is saved.
- """
-
- rocrate_object = find_rocrate_object_on_minio(crate_id, minio_client, minio_bucket, root_path)
-
- rocrate_minio_path = rocrate_object.object_name
- rocrate_name = rocrate_minio_path.split('/')[-1]
-
- temp_dir = tempfile.mkdtemp()
- local_root_path = os.path.join(temp_dir, rocrate_name)
-
- logging.info(
- f"Fetching RO-Crate {rocrate_name} from MinIO bucket {minio_bucket}. File path {local_root_path}"
- )
-
- if rocrate_object.is_dir:
- os.makedirs(os.path.dirname(local_root_path), exist_ok=True)
-
- objects_list = get_minio_object_list(rocrate_minio_path, minio_client, minio_bucket, recursive=True)
- for obj in objects_list:
- relative_path = obj.object_name[len(rocrate_minio_path):].lstrip("/")
- local_file_path = os.path.join(local_root_path, relative_path)
- os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
- download_file_from_minio(minio_client, minio_bucket, obj.object_name, local_file_path)
-
- else:
- file_path = local_root_path
- download_file_from_minio(minio_client, minio_bucket, rocrate_minio_path, file_path)
-
- logging.info(
- f"RO-Crate {rocrate_name} fetched successfully and saved to {local_root_path}."
- )
-
- return local_root_path
-
-
-def update_validation_status_in_minio(minio_client: object, minio_bucket: str, crate_id: str, root_path: str, validation_status: str) -> None:
- """
- Uploads the validation status to the MinIO bucket.
-
- :param minio_client: The MinIO client
- :param minio_bucket: The MinIO bucket containing the RO-Crate.
- :param crate_id: The ID of the RO-Crate in MinIO
- :param validation_status: The validation result to upload
- :raises S3Error: If an error occurs during the MinIO operation
- :raises ValueError: If the required environment variables are not set
- :raises Exception: If an unexpected error occurs
- """
-
- # The object in MinIO is _validation/validation_status.txt
- if root_path:
- object_name = f"{root_path}/{crate_id}_validation/validation_status.txt"
- else:
- object_name = f"{crate_id}_validation/validation_status.txt"
-
- # convert pretty string to dictionary, then back to plain utf-8 encoded string
- validation_string = json.dumps(json.loads(validation_status), indent=None).encode("utf-8")
-
- try:
- minio_client.put_object(
- minio_bucket,
- object_name,
- data=BytesIO(validation_string),
- length=len(validation_string),
- content_type="application/json",
- )
-
- except S3Error as s3_error:
- logging.error(f"MinIO S3 Error: {s3_error}")
- raise InvalidAPIUsage(f"MinIO S3 Error: {s3_error}", 500)
-
- except ValueError as value_error:
- logging.error(f"Configuration Error: {value_error}")
- raise InvalidAPIUsage(f"Configuration Error: {value_error}", 500)
-
- except Exception as e:
- logging.error(f"Unexpected error updating validation status in MinIO: {e}")
- raise InvalidAPIUsage(f"Unknown Error: {e}", 500)
-
- logging.info(
- f"Validation status file uploaded to {minio_bucket}/{object_name} successfully."
- )
-
-
-def get_validation_status_from_minio(minio_client: object, minio_bucket: str, crate_id: str, root_path: str) -> dict:
- """
- Checks for the existence of a validation report for the given RO-Crate in the MinIO bucket.
- Returns validation message if it exists, or notification that it is missing if not.
-
- :param minio_client: The MinIO client
- :param minio_bucket: The MinIO bucket containing the RO-Crate.
- :param crate_id: The ID of the RO-Crate in MinIO
- :return validation_status: Either the validation status, or note that this does not exist
-
- """
-
- # The object in MinIO is _validation/validation_status.txt
- if root_path:
- object_name = f"{root_path}/{crate_id}_validation/validation_status.txt"
- else:
- object_name = f"{crate_id}_validation/validation_status.txt"
-
- logging.info(f"Getting object {object_name}")
-
- try:
- response = minio_client.get_object(
- minio_bucket,
- object_name,
- )
-
- validation_message = json.loads(response.data.decode())
- response.close()
- response.release_conn()
-
- except S3Error as s3_error:
- logging.error(f"MinIO S3 Error: {s3_error}")
- raise InvalidAPIUsage(f"MinIO S3 Error: {s3_error}", 500)
-
- except ValueError as value_error:
- logging.error(f"Configuration Error: {value_error}")
- raise InvalidAPIUsage(f"Configuration Error: {value_error}", 500)
-
- except Exception as e:
- logging.error(f"Unexpected error retrieving validation status from MinIO: {e}")
- raise InvalidAPIUsage(f"Unknown Error: {e}", 500)
-
- else:
- return validation_message
-
-
-def download_file_from_minio(minio_client: object, minio_bucket: str, object_path: str, file_path: str) -> None:
- """
- Downloads a file from MinIO
-
- :param minio_client: MinIO object
- :param minio_bucket: name of MinIO bucket, string
- :param object_path: path to object on MinIO, string
- :param file_path: local path, string
- :raises S3Error: If an error occurs during the MinIO operation
- :raises ValueError: If the required environment variables are not set
- :raises Exception: If an unexpected error occurs
- """
-
- try:
- minio_client.fget_object(minio_bucket, object_path, file_path)
-
- except S3Error as s3_error:
- logging.error(f"MinIO S3 Error: {s3_error}")
- raise InvalidAPIUsage(f"MinIO S3 Error: {s3_error}", 500)
-
- except ValueError as value_error:
- logging.error(f"Configuration Error: {value_error}")
- raise InvalidAPIUsage(f"Configuration Error: {value_error}", 500)
-
- except Exception as e:
- logging.error(f"Unexpected error retrieving file from MinIO: {e}")
- raise InvalidAPIUsage(f"Unknown Error: {e}", 500)
-
-
-def find_validation_object_on_minio(rocrate_id: str, minio_client, minio_bucket: str, root_path: str) -> object:
- """
- Checks that the requested object exists on the MinIO instance.
-
- If it does not exist then a False value is returned.
- If it does exist then the minio.datatypes.Object is returned.
-
- :param rocrate_id: string containing the name of ro-crate
- :param root_path: string containing the path within which the ro-crate should be
- :param minio_client: minio object
- :param minio_bucket: string containing bucket on minio
- :return return_object: rocrate object we require
- :raise Exception: If validation result can't be found, 400
- """
-
- logging.info(f"Finding Validation result: {rocrate_id}_validation/validation_status.txt")
-
- if root_path:
- file_path = f"{root_path}/{rocrate_id}_validation/validation_status.txt"
- else:
- file_path = f"{rocrate_id}_validation/validation_status.txt"
-
- file_list = get_minio_object_list(file_path, minio_client, minio_bucket)
-
- return_object = False
- for obj in file_list:
- if obj.object_name == file_path:
- return_object = obj
- break
-
- if not return_object:
- logging.error(f"No validation result yet for RO-Crate: {rocrate_id}")
- return False
- else:
- return return_object
-
-
-def find_rocrate_object_on_minio(rocrate_id: str, minio_client, minio_bucket: str, root_path: str) -> object | bool:
- """
- Checks that the requested object exists on the MinIO instance.
-
- If it does not exist then a False value is returned.
- If it does exist then the minio.datatypes.Object is returned.
-
- :param rocrate_id: string containing the name of ro-crate
- :param root_path: string containing the path within which the ro-crate should be
- :param minio_client: minio object
- :param minio_bucket: string containing bucket on minio
- :return return_object or False: rocrate object we require, or False result
- :raise Exception: If RO-Crate can't be found, 400
- """
-
- logging.info(f"Finding RO-Crate: {rocrate_id}")
-
- if root_path:
- rocrate_path = f"{root_path}/{rocrate_id}"
- else:
- rocrate_path = rocrate_id
-
- rocrate_list = get_minio_object_list(rocrate_path, minio_client, minio_bucket)
-
- return_object = False
- for obj in rocrate_list:
- # TODO: We should be checking here for the existence of the ro-crate metadata file within this object too
- if (obj.object_name == f"{rocrate_path}/" and obj.is_dir) or obj.object_name == f"{rocrate_path}.zip":
- return_object = obj
- break
-
- if not return_object:
- logging.error(f"No RO-Crate with prefix: {rocrate_path}")
- return False
- else:
- return return_object
-
-
-def get_minio_object_list(object_path: str, minio_client, minio_bucket: str, recursive: bool = False) -> list:
- """
- Creates a list of objects which match the object_id and path_prefix
-
- :param object_path: The object ID, string
- :param path_prefix: Path prefix, string, optional
- :param minio_client: MinIO client object
- :param minio_bucket: string
- :param recursive: boolean, default = False
- :return object_list: List containing objects of type minio.datatypes.Object
- :raises S3Error: If an error occurs during the MinIO operation, 500
- :raises ValueError: If the required environment variables are not set, 500
- :raises Exception: If an unexpected error occurs, 500
- """
-
- try:
- response = minio_client.list_objects(
- minio_bucket,
- object_path,
- recursive=recursive
- )
- object_list = [obj for obj in response]
-
- response.close()
-
- except S3Error as s3_error:
- logging.error(f"MinIO S3 Error: {s3_error}")
- raise InvalidAPIUsage(f"MinIO S3 Error: {s3_error}", 500)
-
- except ValueError as value_error:
- logging.error(f"Configuration Error: {value_error}")
- raise InvalidAPIUsage(f"Configuration Error: {value_error}", 500)
-
- except Exception as e:
- logging.error(f"Unexpected error getting object list from MinIO: {e}")
- raise InvalidAPIUsage(f"Unknown Error: {e}", 500)
-
- else:
- return object_list
-
-
-def get_minio_client(minio_config: dict) -> Minio:
- """
- Initialises the MinIO client from provided settings.
-
- :param minio_config: A dictionary containing the below parameters
- :param endpoint: A string containing host and port. E.g. 'localhost:9000'
- :param access_key: A string containing the access key / username
- :param secret_key: A string containing the secret key / password
- :param use_ssl: Boolean defining if SSL connection should be used or not
- :return: The MinIO client.
- :raises ValueError: If required environment variables are not set.
- """
-
- minio_client = Minio(
- endpoint=minio_config["endpoint"],
- access_key=minio_config["accesskey"],
- secret_key=minio_config["secret"],
- secure=minio_config["ssl"],
- )
-
- return minio_client
diff --git a/app/utils/webhook_utils.py b/app/utils/webhook_utils.py
index 3a77fb1..29f506f 100644
--- a/app/utils/webhook_utils.py
+++ b/app/utils/webhook_utils.py
@@ -1,29 +1,61 @@
-"""Utility methods for sending webhook notifications."""
-
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
+"""Webhook delivery with bounded retries and backoff."""
import logging
-import requests
+import time
+from typing import Any, Callable
-from typing import Any
+import requests
logger = logging.getLogger(__name__)
+DEFAULT_TIMEOUT = 10
-def send_webhook_notification(url: str, data: Any) -> None:
- """
- Sends a POST request to the specified webhook URL with the given data.
- :param url: The URL to send the webhook notification to.
- :param data: The data to send in the POST request.
- :raises requests.RequestException: If an error occurs when sending the notification.
- """
+class WebhookDeliveryError(Exception):
+ """Raised by ``send_webhook_notification`` once every delivery attempt has failed."""
+
- try:
- response = requests.post(url, json=data)
- response.raise_for_status()
- logging.info(f"Webhook notification sent successfully to {url}")
- except requests.RequestException as e:
- logging.error(f"Failed to send webhook notification: {e}")
+def send_webhook_notification(
+    url: str,
+    data: Any,
+    max_attempts: int = 3,
+    base_delay: float = 0.5,
+    sleep: Callable[[float], None] = time.sleep,
+) -> None:
+    """
+    POST ``data`` to ``url`` as JSON, retrying failed attempts with backoff.
+
+    Any ``requests.RequestException`` (including the HTTP error statuses from
+    ``raise_for_status``) fails an attempt. Failed attempt ``k`` is followed by
+    a ``base_delay * 2 ** (k - 1)`` second sleep, except after the last one.
+
+    :param url: The webhook URL to POST to.
+    :param data: JSON-serialisable payload.
+    :param max_attempts: Total attempts; below 1 no request is sent at all.
+    :param base_delay: Base backoff delay in seconds (doubled each retry).
+    :param sleep: Sleep function (injectable for testing).
+    :raises WebhookDeliveryError: If no attempt succeeds; chained to the last error.
+    """
+    last_error = None
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            response = requests.post(url, json=data, timeout=DEFAULT_TIMEOUT)
+            response.raise_for_status()
+            logger.info("Webhook delivered to %s (attempt %d)", url, attempt)
+            return
+        except requests.RequestException as error:
+            last_error = error
+            logger.warning(
+                "Webhook attempt %d/%d to %s failed: %s",
+                attempt,
+                max_attempts,
+                url,
+                error,
+            )
+            if attempt < max_attempts:
+                sleep(base_delay * (2 ** (attempt - 1)))
+
+    raise WebhookDeliveryError(
+        f"Failed to deliver webhook to {url} after {max_attempts} attempts: {last_error}"
+    ) from last_error
diff --git a/app/validation/__init__.py b/app/validation/__init__.py
new file mode 100644
index 0000000..2e275e1
--- /dev/null
+++ b/app/validation/__init__.py
@@ -0,0 +1 @@
+"""RO-Crate validation: a single outcome type and the runner that produces it."""
diff --git a/app/validation/results.py b/app/validation/results.py
new file mode 100644
index 0000000..cd1418c
--- /dev/null
+++ b/app/validation/results.py
@@ -0,0 +1,77 @@
+"""Defines an explicit result type for validation."""
+
+import json
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional
+
+
+class ValidationStatus(str, Enum):
+ """Outcome of a validation run; members are also ``str`` values such as ``"valid"``."""
+
+ VALID = "valid"  # the validator result reported no issues
+ INVALID = "invalid"  # the validator result reported issues
+ ERROR = "error"  # used by ValidationOutcome.from_error for failures
+
+
+@dataclass(frozen=True)
+class ValidationOutcome:
+    """Immutable record of one validation run over a crate or its metadata.
+
+    For ``valid``/``invalid`` outcomes ``detail`` carries the validator's
+    report; for an ``error`` outcome ``error`` carries the message instead.
+    The two fields are mutually exclusive; ``from_validator_result`` and
+    ``from_error`` each fill exactly one of them.
+    """
+
+    status: ValidationStatus
+    profile: Optional[str] = None
+    detail: Optional[dict] = None
+    error: Optional[str] = None
+    created_at: Optional[str] = None
+
+    @property
+    def is_valid(self) -> bool:
+        """Whether the status is ``VALID``."""
+        return self.status is ValidationStatus.VALID
+
+    def to_dict(self) -> dict:
+        """Return the outcome as a plain dict.
+
+        ``detail`` and ``error`` are included only when they are not ``None``.
+        """
+        optional = {"detail": self.detail, "error": self.error}
+        return {
+            "status": self.status.value,
+            "profile": self.profile,
+            "created_at": self.created_at,
+            **{name: value for name, value in optional.items() if value is not None},
+        }
+
+    def to_json(self) -> str:
+        """Serialise ``to_dict()`` as a JSON string."""
+        return json.dumps(self.to_dict())
+
+    @classmethod
+    def from_validator_result(
+        cls, result, profile: Optional[str] = None, created_at: Optional[str] = None
+    ) -> "ValidationOutcome":
+        """Wrap a rocrate_validator ``ValidationResult`` as a valid/invalid outcome."""
+        if result.has_issues():
+            status = ValidationStatus.INVALID
+        else:
+            status = ValidationStatus.VALID
+        report = json.loads(result.to_json())
+        return cls(status=status, profile=profile, detail=report, created_at=created_at)
+
+    @classmethod
+    def from_error(
+        cls, message: str, profile: Optional[str] = None, created_at: Optional[str] = None
+    ) -> "ValidationOutcome":
+        """Wrap a failure message as an ``error`` outcome."""
+        return cls(
+            status=ValidationStatus.ERROR,
+            profile=profile,
+            error=message,
+            created_at=created_at,
+        )
diff --git a/app/validation/runner.py b/app/validation/runner.py
new file mode 100644
index 0000000..62c4ba0
--- /dev/null
+++ b/app/validation/runner.py
@@ -0,0 +1,97 @@
+"""Runs rocrate_validator and adapts its output to a ValidationOutcome.
+
+This is the boundary to the external validator. Both entry points always
+return a :class:`ValidationOutcome` - a validator exception becomes an ``error``
+outcome rather than a string, so callers never have to type-check the result.
+"""
+
+import logging
+from typing import Optional
+
+from rocrate_validator import services
+
+from app.validation.results import ValidationOutcome
+
+logger = logging.getLogger(__name__)
+
+
+def validate_crate_path(
+ rocrate_uri: str,
+ profile_name: Optional[str] = None,
+ profiles_path: Optional[str] = None,
+ extra_profiles_path: Optional[str] = None,
+ skip_checks: Optional[list] = None,
+ cache_path: Optional[str] = None,
+ offline: bool = False,
+ created_at: Optional[str] = None,
+) -> ValidationOutcome:
+ """Validate the crate stored at ``rocrate_uri`` (a directory or a zip archive on disk)."""
+ return _run(
+ {"rocrate_uri": rocrate_uri},  # the only setting specific to this entry point
+ profile_name=profile_name,
+ profiles_path=profiles_path,
+ extra_profiles_path=extra_profiles_path,
+ skip_checks=skip_checks,
+ cache_path=cache_path,
+ offline=offline,  # _run forwards this to the validator only when it is True
+ created_at=created_at,  # handed on unchanged to the returned outcome
+ )
+
+
+def validate_metadata(
+ metadata: dict,
+ profile_name: Optional[str] = None,
+ profiles_path: Optional[str] = None,
+ extra_profiles_path: Optional[str] = None,
+ skip_checks: Optional[list] = None,
+ cache_path: Optional[str] = None,
+ offline: bool = False,
+ created_at: Optional[str] = None,
+) -> ValidationOutcome:
+ """Validate an in-memory RO-Crate metadata graph (``metadata``), with no crate on disk."""
+ return _run(
+ {"metadata_only": True, "metadata_dict": metadata},  # in-memory graph, no crate path
+ profile_name=profile_name,
+ profiles_path=profiles_path,
+ extra_profiles_path=extra_profiles_path,
+ skip_checks=skip_checks,
+ cache_path=cache_path,
+ offline=offline,  # _run forwards this to the validator only when it is True
+ created_at=created_at,  # handed on unchanged to the returned outcome
+ )
+
+
+def _run(
+    base_settings: dict,
+    profile_name: Optional[str],
+    profiles_path: Optional[str],
+    extra_profiles_path: Optional[str],
+    skip_checks: Optional[list],
+    cache_path: Optional[str],
+    offline: bool,
+    created_at: Optional[str],
+) -> ValidationOutcome:
+    """Build ``ValidationSettings``, run the validator and adapt the result to an outcome."""
+    # Falsy options are left out, so ValidationSettings never receives them.
+    optional = {
+        "profile_identifier": profile_name,
+        "profiles_path": profiles_path,
+        "extra_profiles_path": extra_profiles_path,
+        "skip_checks": skip_checks,
+        "cache_path": cache_path,
+        "offline": offline,
+    }
+    options = {**base_settings, **{key: value for key, value in optional.items() if value}}
+
+    try:
+        settings = services.ValidationSettings(**options)
+        result = services.validate(settings)
+    except Exception as error:  # noqa: BLE001 - adapt any validator failure to an outcome
+        # exc_info keeps the traceback; the class name stands in for an empty message.
+        logger.error("Validation failed: %s", error, exc_info=True)
+        message = str(error) or type(error).__name__
+        return ValidationOutcome.from_error(message, profile=profile_name, created_at=created_at)
+
+    return ValidationOutcome.from_validator_result(
+        result, profile=profile_name, created_at=created_at
+    )
diff --git a/docker-compose-develop.yml b/docker-compose-develop.yml
index 334b0d6..f7c08e4 100644
--- a/docker-compose-develop.yml
+++ b/docker-compose-develop.yml
@@ -1,4 +1,7 @@
-version: '3.8'
+# Development stack: builds the app image locally from ./Dockerfile and mounts
+# the custom RO-Crate profiles into both the flask and worker containers. Use
+# this when working on the service itself; use docker-compose.yml to run the
+# published image.
services:
flask:
@@ -8,20 +11,25 @@ services:
ports:
- "5001:5000"
environment:
- - FLASK_APP=cratey.py
+ - FLASK_APP=wsgi.py
- FLASK_ENV=development
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- # Optional object storage. Set MINIO_ENABLED=true and start the "minio"
- # profile (docker compose --profile minio up) to use
- - MINIO_ENABLED=${MINIO_ENABLED:-false}
- - MINIO_ENDPOINT=${MINIO_ENDPOINT}
- - MINIO_ROOT_USER=${MINIO_ROOT_USER}
- - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}
- - MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
- - PROFILES_PATH=/app/profiles
+ # Optional object storage. Set STORAGE_ENABLED=true and start the
+ # "objectstore" profile (docker compose --profile objectstore up).
+ - STORAGE_ENABLED=${STORAGE_ENABLED:-false}
+ - S3_ENDPOINT=${S3_ENDPOINT}
+ - S3_ACCESS_KEY=${S3_ACCESS_KEY}
+ - S3_SECRET_KEY=${S3_SECRET_KEY}
+ - S3_BUCKET=${S3_BUCKET}
+ - S3_USE_SSL=${S3_USE_SSL:-false}
+ - EXTRA_PROFILES_PATH=/app/profiles
depends_on:
- redis
+ # Metadata validation runs synchronously in this process, so the flask
+ # service needs the custom profiles mounted too (not just the worker).
+ volumes:
+ - ./tests/data/rocrate_validator_profiles:/app/profiles:ro
celery_worker:
build:
@@ -31,7 +39,15 @@ services:
environment:
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- - MINIO_ENABLED=${MINIO_ENABLED:-false}
+ # The worker builds its storage client from these, so it needs the full
+ # S3 config (not just the enabled flag).
+ - STORAGE_ENABLED=${STORAGE_ENABLED:-false}
+ - S3_ENDPOINT=${S3_ENDPOINT}
+ - S3_ACCESS_KEY=${S3_ACCESS_KEY}
+ - S3_SECRET_KEY=${S3_SECRET_KEY}
+ - S3_BUCKET=${S3_BUCKET}
+ - S3_USE_SSL=${S3_USE_SSL:-false}
+ - EXTRA_PROFILES_PATH=/app/profiles
depends_on:
- redis
volumes:
@@ -42,21 +58,23 @@ services:
ports:
- "6379:6379"
- minio:
- image: "minio/minio"
- # Started with `docker compose --profile minio up`.
+ objectstore:
+ image: "rustfs/rustfs:latest"
+ # Local S3-compatible object store (RustFS) for development.
+ # Started with `docker compose --profile objectstore up`.
profiles:
- - minio
+ - objectstore
ports:
- "9000:9000"
- "9001:9001"
environment:
- - MINIO_ROOT_USER=${MINIO_ROOT_USER}
- - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}
- - MINIO_BROWSER_REDIRECT_PORT=9001
- command: server --console-address ":9001" /data
+ - RUSTFS_ACCESS_KEY=${RUSTFS_ACCESS_KEY}
+ - RUSTFS_SECRET_KEY=${RUSTFS_SECRET_KEY}
+ - RUSTFS_VOLUMES=/data
+ - RUSTFS_ADDRESS=0.0.0.0:9000
+ - RUSTFS_CONSOLE_ADDRESS=0.0.0.0:9001
volumes:
- - minio_data:/data
+ - objectstore_data:/data
volumes:
- minio_data:
+ objectstore_data:
diff --git a/docker-compose.yml b/docker-compose.yml
index 2f0bb3b..73c9e2d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,34 +1,46 @@
-version: '3.8'
+# Default stack: runs the service from the published image
+# (ghcr.io/.../ro-crate-validation-service). Object storage is opt-in: set STORAGE_ENABLED=true
+# and start the "objectstore" profile (docker compose --profile objectstore up).
+# For local development against a freshly built image, use
+# docker-compose-develop.yml instead.
services:
flask:
platform: linux/x86_64
- image: "ghcr.io/esciencelab/cratey-validator:0.1"
+ image: "ghcr.io/esciencelab/ro-crate-validation-service:0.1"
ports:
- "5001:5000"
environment:
- - FLASK_APP=cratey.py
+ - FLASK_APP=wsgi.py
- FLASK_ENV=development
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- # Optional object storage. Set MINIO_ENABLED=true and start the "minio"
- # profile (docker compose --profile minio up) to use
- - MINIO_ENABLED=${MINIO_ENABLED:-false}
- - MINIO_ENDPOINT=${MINIO_ENDPOINT}
- - MINIO_ROOT_USER=${MINIO_ROOT_USER}
- - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}
- - MINIO_BUCKET_NAME=${MINIO_BUCKET_NAME}
+ # Optional object storage. Set STORAGE_ENABLED=true and start the
+ # "objectstore" profile (docker compose --profile objectstore up).
+ - STORAGE_ENABLED=${STORAGE_ENABLED:-false}
+ - S3_ENDPOINT=${S3_ENDPOINT}
+ - S3_ACCESS_KEY=${S3_ACCESS_KEY}
+ - S3_SECRET_KEY=${S3_SECRET_KEY}
+ - S3_BUCKET=${S3_BUCKET}
+ - S3_USE_SSL=${S3_USE_SSL:-false}
depends_on:
- redis
celery_worker:
platform: linux/x86_64
- image: "ghcr.io/esciencelab/cratey-validator:0.1"
+ image: "ghcr.io/esciencelab/ro-crate-validation-service:0.1"
command: celery -A app.celery_worker.celery worker --loglevel=info -E
environment:
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- - MINIO_ENABLED=${MINIO_ENABLED:-false}
+ # The worker builds its storage client from these, so it needs the full
+ # S3 config (not just the enabled flag).
+ - STORAGE_ENABLED=${STORAGE_ENABLED:-false}
+ - S3_ENDPOINT=${S3_ENDPOINT}
+ - S3_ACCESS_KEY=${S3_ACCESS_KEY}
+ - S3_SECRET_KEY=${S3_SECRET_KEY}
+ - S3_BUCKET=${S3_BUCKET}
+ - S3_USE_SSL=${S3_USE_SSL:-false}
depends_on:
- redis
@@ -37,21 +49,23 @@ services:
ports:
- "6379:6379"
- minio:
- image: "minio/minio"
- # Started with `docker compose --profile minio up`.
+ objectstore:
+ image: "rustfs/rustfs:latest"
+ # Local S3-compatible object store (RustFS) for development.
+ # Started with `docker compose --profile objectstore up`.
profiles:
- - minio
+ - objectstore
ports:
- "9000:9000"
- "9001:9001"
environment:
- - MINIO_ROOT_USER=${MINIO_ROOT_USER}
- - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}
- - MINIO_BROWSER_REDIRECT_PORT=9001
- command: server --console-address ":9001" /data
+ - RUSTFS_ACCESS_KEY=${RUSTFS_ACCESS_KEY}
+ - RUSTFS_SECRET_KEY=${RUSTFS_SECRET_KEY}
+ - RUSTFS_VOLUMES=/data
+ - RUSTFS_ADDRESS=0.0.0.0:9000
+ - RUSTFS_CONSOLE_ADDRESS=0.0.0.0:9001
volumes:
- - minio_data:/data
+ - objectstore_data:/data
volumes:
- minio_data:
+ objectstore_data:
diff --git a/docs/assets/minio-versioning-enabled.webp b/docs/assets/minio-versioning-enabled.webp
deleted file mode 100644
index d69b2dd..0000000
Binary files a/docs/assets/minio-versioning-enabled.webp and /dev/null differ
diff --git a/docs/assets/validate-minio-versioning-enabled.webp b/docs/assets/validate-minio-versioning-enabled.webp
deleted file mode 100644
index 3cbee52..0000000
Binary files a/docs/assets/validate-minio-versioning-enabled.webp and /dev/null differ
diff --git a/example.env b/example.env
index 5235486..8544ea4 100644
--- a/example.env
+++ b/example.env
@@ -1,9 +1,21 @@
-# MinIO is off by default; only the stateless validation endpoint is exposed when
-# disabled. The MINIO_* vars below and the "minio" docker-compose profile are
-#only needed when this is true.
-MINIO_ENABLED=false
+# Object storage is disabled by default; only the stateless metadata validation
+# endpoint is exposed. Set STORAGE_ENABLED=true (and start the "objectstore"
+# compose profile) to enable the store-backed, ID-based endpoints.
+STORAGE_ENABLED=false
-MINIO_ROOT_USER=minioadmin
-MINIO_ROOT_PASSWORD=minioadmin
-MINIO_BUCKET_NAME=ro-crates
-MINIO_ENDPOINT=minio:9000
+# Application object-storage client. Works against any S3-compatible store
+# (RustFS, MinIO, Ceph, AWS S3) via the endpoint below. For local development
+# this points at the "objectstore" container (RustFS).
+S3_ENDPOINT=objectstore:9000
+S3_ACCESS_KEY=rustfsadmin
+S3_SECRET_KEY=rustfsadmin
+S3_BUCKET=ro-crates
+S3_USE_SSL=false
+# S3_REGION=us-east-1
+# S3_CRATE_PREFIX=crates
+# S3_RESULTS_PREFIX=validation-results
+
+# Credentials for the local RustFS dev container (the "objectstore" compose
+# profile, development only). These match the S3 credentials above.
+RUSTFS_ACCESS_KEY=rustfsadmin
+RUSTFS_SECRET_KEY=rustfsadmin
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b791232
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,50 @@
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "ro-crate-validation-service"
+version = "0.1.0"
+description = "A service for validating RO-Crates."
+readme = "README.md"
+requires-python = ">=3.11"
+license = { text = "MIT" }
+authors = [{ name = "eScience Lab, The University of Manchester" }]
+
+# Direct runtime dependencies:
+dependencies = [
+ "celery==5.6.3",
+ "boto3==1.43.29",
+ "requests==2.33.1",
+ "Flask==3.1.3",
+ "Werkzeug==3.1.8",
+ "redis==7.4.0",
+ "python-dotenv==1.2.2",
+ "apiflask==3.1.0",
+ "roc-validator==0.10.0",
+]
+
+[project.optional-dependencies]
+dev = [
+ "pytest",
+ "pytest-mock",
+ "moto[s3]",
+ "ruff",
+ "pip-tools",
+]
+
+[tool.setuptools.packages.find]
+include = ["app*"]
+
+[tool.pytest.ini_options]
+addopts = "--import-mode=importlib"
+pythonpath = ["."]
+log_format = "%(asctime)s %(levelname)s %(message)s"
+log_date_format = "%Y-%m-%d %H:%M:%S"
+
+[tool.ruff]
+line-length = 100
+target-version = "py311"
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I"]
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 96735eb..0000000
--- a/pytest.ini
+++ /dev/null
@@ -1,3 +0,0 @@
-[pytest]
-log_format = %(asctime)s %(levelname)s %(message)s
-log_date_format = %Y-%m-%d %H:%M:%S
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..c38068a
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,265 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+# pip-compile --extra=dev --output-file=requirements-dev.txt pyproject.toml
+#
+amqp==5.3.1
+ # via kombu
+annotated-types==0.7.0
+ # via pydantic
+apiflask==3.1.0
+ # via ro-crate-validation-service (pyproject.toml)
+apispec==6.10.0
+ # via apiflask
+async-timeout==5.0.1
+ # via redis
+attrs==26.1.0
+ # via
+ # cattrs
+ # requests-cache
+billiard==4.2.4
+ # via celery
+blinker==1.9.0
+ # via flask
+boto3==1.43.29
+ # via
+ # moto
+ # ro-crate-validation-service (pyproject.toml)
+botocore==1.43.31
+ # via
+ # boto3
+ # moto
+ # s3transfer
+build==1.5.0
+ # via pip-tools
+cattrs==26.1.0
+ # via requests-cache
+celery==5.6.3
+ # via ro-crate-validation-service (pyproject.toml)
+certifi==2026.6.17
+ # via requests
+cffi==2.0.0
+ # via cryptography
+charset-normalizer==3.4.7
+ # via requests
+click==8.4.1
+ # via
+ # celery
+ # click-didyoumean
+ # click-plugins
+ # click-repl
+ # flask
+ # pip-tools
+ # rich-click
+ # roc-validator
+click-didyoumean==0.3.1
+ # via celery
+click-plugins==1.1.1.2
+ # via celery
+click-repl==0.3.0
+ # via celery
+colorlog==6.10.1
+ # via roc-validator
+cryptography==49.0.0
+ # via moto
+dnspython==2.8.0
+ # via email-validator
+email-validator==2.3.0
+ # via pydantic
+enum-tools==0.12.0
+ # via roc-validator
+flask==3.1.3
+ # via
+ # apiflask
+ # flask-httpauth
+ # flask-marshmallow
+ # ro-crate-validation-service (pyproject.toml)
+flask-httpauth==4.8.1
+ # via apiflask
+flask-marshmallow==1.5.0
+ # via apiflask
+html5rdf==1.2.1
+ # via rdflib
+idna==3.18
+ # via
+ # email-validator
+ # requests
+ # url-normalize
+importlib-metadata==9.0.0
+ # via pyshacl
+iniconfig==2.3.0
+ # via pytest
+inquirerpy==0.3.4
+ # via roc-validator
+itsdangerous==2.2.0
+ # via flask
+jinja2==3.1.6
+ # via flask
+jmespath==1.1.0
+ # via
+ # boto3
+ # botocore
+kombu==5.6.2
+ # via celery
+markdown-it-py==4.2.0
+ # via rich
+markupsafe==3.0.3
+ # via
+ # flask
+ # jinja2
+ # werkzeug
+marshmallow==4.3.0
+ # via
+ # apiflask
+ # flask-marshmallow
+ # webargs
+mdurl==0.1.2
+ # via markdown-it-py
+moto[s3]==5.2.2
+ # via ro-crate-validation-service (pyproject.toml)
+owlrl==7.1.4
+ # via pyshacl
+packaging==26.2
+ # via
+ # apispec
+ # build
+ # kombu
+ # pyshacl
+ # pytest
+ # webargs
+ # wheel
+pfzy==0.3.4
+ # via inquirerpy
+pip-tools==7.5.3
+ # via ro-crate-validation-service (pyproject.toml)
+platformdirs==4.10.0
+ # via requests-cache
+pluggy==1.6.0
+ # via pytest
+prettytable==3.17.0
+ # via pyshacl
+prompt-toolkit==3.0.52
+ # via
+ # click-repl
+ # inquirerpy
+py-partiql-parser==0.6.3
+ # via moto
+pycparser==3.0
+ # via cffi
+pydantic[email]==2.13.4
+ # via apiflask
+pydantic-core==2.46.4
+ # via pydantic
+pygments==2.20.0
+ # via
+ # enum-tools
+ # pytest
+ # rich
+pyparsing==3.3.2
+ # via rdflib
+pyproject-hooks==1.2.0
+ # via
+ # build
+ # pip-tools
+pyshacl==0.31.0
+ # via roc-validator
+pytest==9.1.0
+ # via
+ # pytest-mock
+ # ro-crate-validation-service (pyproject.toml)
+pytest-mock==3.15.1
+ # via ro-crate-validation-service (pyproject.toml)
+python-dateutil==2.9.0.post0
+ # via
+ # botocore
+ # celery
+python-dotenv==1.2.2
+ # via ro-crate-validation-service (pyproject.toml)
+pyyaml==6.0.3
+ # via
+ # moto
+ # responses
+rdflib[html]==7.6.0
+ # via
+ # owlrl
+ # pyshacl
+ # roc-validator
+redis==7.4.0
+ # via ro-crate-validation-service (pyproject.toml)
+requests==2.33.1
+ # via
+ # moto
+ # requests-cache
+ # responses
+ # ro-crate-validation-service (pyproject.toml)
+ # roc-validator
+requests-cache==1.3.2
+ # via roc-validator
+responses==0.26.1
+ # via moto
+rich==13.9.4
+ # via
+ # rich-click
+ # roc-validator
+rich-click==1.9.8
+ # via roc-validator
+roc-validator==0.10.0
+ # via ro-crate-validation-service (pyproject.toml)
+ruff==0.15.17
+ # via ro-crate-validation-service (pyproject.toml)
+s3transfer==0.18.0
+ # via boto3
+six==1.17.0
+ # via python-dateutil
+toml==0.10.2
+ # via roc-validator
+typing-extensions==4.15.0
+ # via
+ # cattrs
+ # enum-tools
+ # pydantic
+ # pydantic-core
+ # typing-inspection
+typing-inspection==0.4.2
+ # via pydantic
+typos==1.47.2
+ # via roc-validator
+tzdata==2026.2
+ # via kombu
+tzlocal==5.4.3
+ # via celery
+url-normalize==3.0.0
+ # via requests-cache
+urllib3==2.7.0
+ # via
+ # botocore
+ # requests
+ # requests-cache
+ # responses
+vine==5.1.0
+ # via
+ # amqp
+ # celery
+ # kombu
+wcwidth==0.8.1
+ # via
+ # prettytable
+ # prompt-toolkit
+webargs==8.7.1
+ # via apiflask
+werkzeug==3.1.8
+ # via
+ # flask
+ # moto
+ # ro-crate-validation-service (pyproject.toml)
+wheel==0.47.0
+ # via pip-tools
+xmltodict==1.0.4
+ # via moto
+zipp==4.1.0
+ # via importlib-metadata
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
+# setuptools
diff --git a/requirements.in b/requirements.in
deleted file mode 100644
index df00c22..0000000
--- a/requirements.in
+++ /dev/null
@@ -1,9 +0,0 @@
-celery==5.6.3
-minio==7.2.20
-requests==2.33.1
-Flask==3.1.3
-Werkzeug==3.1.8
-redis==7.4.0
-python-dotenv==1.2.2
-apiflask==3.1.0
-roc-validator==0.9.0
diff --git a/requirements.txt b/requirements.txt
index b8f8d34..5a94f45 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,20 +2,18 @@
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
-# pip-compile --output-file=requirements.txt requirements.in
+# pip-compile --output-file=requirements.txt pyproject.toml
#
amqp==5.3.1
# via kombu
annotated-types==0.7.0
# via pydantic
apiflask==3.1.0
- # via -r requirements.in
+ # via ro-crate-validation-service (pyproject.toml)
apispec==6.8.2
# via apiflask
-argon2-cffi==25.1.0
- # via minio
-argon2-cffi-bindings==25.1.0
- # via argon2-cffi
+async-timeout==5.0.1
+ # via redis
attrs==25.3.0
# via
# cattrs
@@ -24,16 +22,18 @@ billiard==4.2.1
# via celery
blinker==1.9.0
# via flask
+boto3==1.43.29
+ # via ro-crate-validation-service (pyproject.toml)
+botocore==1.43.29
+ # via
+ # boto3
+ # s3transfer
cattrs==25.1.1
# via requests-cache
celery==5.6.3
- # via -r requirements.in
+ # via ro-crate-validation-service (pyproject.toml)
certifi==2025.8.3
- # via
- # minio
- # requests
-cffi==1.17.1
- # via argon2-cffi-bindings
+ # via requests
charset-normalizer==3.4.2
# via requests
click==8.2.1
@@ -61,10 +61,10 @@ enum-tools==0.12.0
# via roc-validator
flask==3.1.3
# via
- # -r requirements.in
# apiflask
# flask-httpauth
# flask-marshmallow
+ # ro-crate-validation-service (pyproject.toml)
flask-httpauth==4.8.1
# via apiflask
flask-marshmallow==1.3.0
@@ -84,6 +84,10 @@ itsdangerous==2.2.0
# via flask
jinja2==3.1.6
# via flask
+jmespath==1.1.0
+ # via
+ # boto3
+ # botocore
kombu==5.6.2
# via celery
markdown-it-py==3.0.0
@@ -100,8 +104,6 @@ marshmallow==4.1.2
# webargs
mdurl==0.1.2
# via markdown-it-py
-minio==7.2.20
- # via -r requirements.in
owlrl==7.1.4
# via pyshacl
packaging==25.0
@@ -120,10 +122,6 @@ prompt-toolkit==3.0.51
# via
# click-repl
# inquirerpy
-pycparser==2.22
- # via cffi
-pycryptodome==3.23.0
- # via minio
pydantic[email]==2.12.4
# via apiflask
pydantic-core==2.41.5
@@ -137,20 +135,22 @@ pyparsing==3.2.3
pyshacl==0.30.1
# via roc-validator
python-dateutil==2.9.0.post0
- # via celery
+ # via
+ # botocore
+ # celery
python-dotenv==1.2.2
- # via -r requirements.in
+ # via ro-crate-validation-service (pyproject.toml)
rdflib[html]==7.1.4
# via
# owlrl
# pyshacl
# roc-validator
redis==7.4.0
- # via -r requirements.in
+ # via ro-crate-validation-service (pyproject.toml)
requests==2.33.1
# via
- # -r requirements.in
# requests-cache
+ # ro-crate-validation-service (pyproject.toml)
# roc-validator
requests-cache==1.2.1
# via roc-validator
@@ -160,8 +160,10 @@ rich==13.9.4
# roc-validator
rich-click==1.8.9
# via roc-validator
-roc-validator==0.9.0
- # via -r requirements.in
+roc-validator==0.10.0
+ # via ro-crate-validation-service (pyproject.toml)
+s3transfer==0.18.0
+ # via boto3
six==1.17.0
# via python-dateutil
toml==0.10.2
@@ -170,7 +172,6 @@ typing-extensions==4.14.1
# via
# cattrs
# enum-tools
- # minio
# pydantic
# pydantic-core
# rich-click
@@ -187,7 +188,7 @@ url-normalize==2.2.1
# via requests-cache
urllib3==2.6.3
# via
- # minio
+ # botocore
# requests
# requests-cache
vine==5.1.0
@@ -203,7 +204,7 @@ webargs==8.7.0
# via apiflask
werkzeug==3.1.8
# via
- # -r requirements.in
# flask
+ # ro-crate-validation-service (pyproject.toml)
zipp==3.23.0
# via importlib-metadata
diff --git a/tests/crates/test_ids.py b/tests/crates/test_ids.py
new file mode 100644
index 0000000..ed18a4e
--- /dev/null
+++ b/tests/crates/test_ids.py
@@ -0,0 +1,54 @@
+"""Tests for strict crate-ID validation."""
+
+import pytest
+
+from app.crates.ids import InvalidCrateId, is_valid_crate_id, validate_crate_id
+
+
+@pytest.mark.parametrize(
+ "crate_id",
+ [
+ "a",  # single character
+ "crate-123",
+ "my_crate.v2",
+ "ABC.def-123_456",  # mixed case with ".", "-" and "_" together
+ "release.zip", # ".zip" in the ID is harmless now: IDs are opaque
+ "x" * 128, # max length
+ ],
+)
+def test_valid_ids_are_accepted(crate_id):
+ assert validate_crate_id(crate_id) == crate_id  # a valid ID comes back unchanged
+ assert is_valid_crate_id(crate_id) is True
+
+
+@pytest.mark.parametrize(
+ "crate_id",
+ [
+ "", # empty
+ ".hidden", # leading dot
+ "-leading-dash", # must start alphanumeric
+ "a/b", # path separator
+ "../etc/passwd", # traversal
+ "a..b", # parent-dir sequence
+ "with space", # whitespace
+ "tab\tchar", # control char
+ "x" * 129, # too long
+ "unicodé", # non-ASCII
+ ],
+)
+def test_invalid_ids_are_rejected(crate_id):
+ assert is_valid_crate_id(crate_id) is False  # predicate form: returns a bool
+ with pytest.raises(InvalidCrateId):  # strict form: raises for the same input
+ validate_crate_id(crate_id)
+
+
+def test_non_string_is_rejected():
+ assert is_valid_crate_id(None) is False  # None stands in for non-string input
+ with pytest.raises(InvalidCrateId):
+ validate_crate_id(None)
+
+
+def test_error_message_names_the_offending_id():
+ with pytest.raises(InvalidCrateId) as exc_info:
+ validate_crate_id("a/b")
+ assert "a/b" in str(exc_info.value)  # the message must quote the rejected ID
diff --git a/tests/crates/test_layout.py b/tests/crates/test_layout.py
new file mode 100644
index 0000000..7f0a199
--- /dev/null
+++ b/tests/crates/test_layout.py
@@ -0,0 +1,33 @@
+"""Tests for canonical crate/result key construction."""
+
+import pytest
+
+from app.crates.layout import (
+ crate_dir_prefix,
+ crate_metadata_key,
+ crate_zip_key,
+ result_key,
+)
+
+
+def test_crate_zip_key_under_prefix():
+ assert crate_zip_key("crates", "foo") == "crates/foo.zip"  # <prefix>/<id>.zip
+
+
+def test_crate_dir_prefix_under_prefix():
+ assert crate_dir_prefix("crates", "foo") == "crates/foo/"  # <prefix>/<id>/ with trailing slash
+
+
+def test_crate_metadata_key_under_prefix():
+ assert crate_metadata_key("crates", "foo") == "crates/foo/ro-crate-metadata.json"  # under <id>/
+
+
+def test_result_key_uses_separate_results_prefix():
+ assert result_key("validation-results", "foo") == "validation-results/foo.json"  # <id>.json
+
+
+@pytest.mark.parametrize("prefix", ["", "crates/"])
+def test_prefix_edge_cases_are_normalised(prefix):
+    """With no prefix the key sits at the bucket root; a trailing slash is not doubled."""
+    expected_by_prefix = {"": "foo.zip", "crates/": "crates/foo.zip"}
+    assert crate_zip_key(prefix, "foo") == expected_by_prefix[prefix]
diff --git a/tests/crates/test_resolver.py b/tests/crates/test_resolver.py
new file mode 100644
index 0000000..5d9a41b
--- /dev/null
+++ b/tests/crates/test_resolver.py
@@ -0,0 +1,92 @@
+"""Tests for deterministic crate resolution over a StorageBackend."""
+
+import pytest
+
+from app.crates.ids import InvalidCrateId
+from app.crates.resolver import (
+ AmbiguousCrate,
+ CrateNotFound,
+ ResolvedCrate,
+ resolve_crate,
+)
+from app.storage.memory import InMemoryStorage
+
+PREFIX = "crates"
+
+
+@pytest.fixture
+def storage() -> InMemoryStorage:
+ return InMemoryStorage()  # default fixture scope, so each test gets its own instance
+
+
+def test_resolves_zip_crate(storage):
+ storage.put_bytes("crates/foo.zip", b"PK...")  # placeholder bytes, not a real zip archive
+
+ resolved = resolve_crate(storage, "foo", PREFIX)
+
+ assert isinstance(resolved, ResolvedCrate)
+ assert resolved.crate_id == "foo"
+ assert resolved.is_zip is True
+ assert resolved.key == "crates/foo.zip"
+
+
+def test_resolves_directory_crate_with_metadata(storage):
+ storage.put_bytes("crates/foo/ro-crate-metadata.json", b"{}")
+ storage.put_bytes("crates/foo/data.csv", b"x")
+
+ resolved = resolve_crate(storage, "foo", PREFIX)
+
+ assert resolved.is_zip is False
+ assert resolved.key == "crates/foo/"  # the directory prefix, with trailing slash
+
+
+def test_directory_without_metadata_is_not_a_crate(storage):
+ """A directory lacking ro-crate-metadata.json is not a crate: CrateNotFound is raised."""
+ storage.put_bytes("crates/foo/data.csv", b"x")
+
+ with pytest.raises(CrateNotFound):
+ resolve_crate(storage, "foo", PREFIX)
+
+
+def test_ambiguous_when_both_zip_and_directory_exist(storage):
+ storage.put_bytes("crates/foo.zip", b"PK...")
+ storage.put_bytes("crates/foo/ro-crate-metadata.json", b"{}")
+
+ with pytest.raises(AmbiguousCrate):  # neither form is silently preferred
+ resolve_crate(storage, "foo", PREFIX)
+
+
+def test_missing_crate_raises_not_found(storage):
+ with pytest.raises(CrateNotFound):  # this test stores nothing before resolving
+ resolve_crate(storage, "absent", PREFIX)
+
+
+def test_sibling_prefix_does_not_false_match(storage):
+ """Resolving 'foo' must not match the sibling key 'foobar.zip' via a shared prefix."""
+ storage.put_bytes("crates/foobar.zip", b"PK...")
+
+ with pytest.raises(CrateNotFound):
+ resolve_crate(storage, "foo", PREFIX)
+
+
+def test_zip_suffix_in_id_resolves_as_directory(storage):
+ """An ID containing '.zip' is opaque: a directory crate named 'data.zip' resolves."""
+ storage.put_bytes("crates/data.zip/ro-crate-metadata.json", b"{}")  # ".zip" is a directory name
+
+ resolved = resolve_crate(storage, "data.zip", PREFIX)
+
+ assert resolved.is_zip is False
+ assert resolved.key == "crates/data.zip/"
+
+
+def test_invalid_id_propagates(storage):
+ with pytest.raises(InvalidCrateId):  # the ID error surfaces to the caller as-is
+ resolve_crate(storage, "../etc", PREFIX)
+
+
+def test_result_object_does_not_satisfy_crate_resolution(storage):
+ """A stored result under a separate prefix never counts as the crate itself."""
+ storage.put_bytes("validation-results/foo.json", b"{}")  # key lies outside PREFIX ("crates")
+
+ with pytest.raises(CrateNotFound):
+ resolve_crate(storage, "foo", PREFIX)
diff --git a/tests/ro_crates/test_routes.py b/tests/ro_crates/test_routes.py
new file mode 100644
index 0000000..3743016
--- /dev/null
+++ b/tests/ro_crates/test_routes.py
@@ -0,0 +1,177 @@
+from unittest.mock import patch
+
+import pytest
+from flask.testing import FlaskClient
+
+from app import create_app
+from app.utils.config import Settings
+
+
+def _storage_env() -> dict:
+    """Return env-style settings that enable storage, for building a storage-backed app."""
+    return {
+        "STORAGE_ENABLED": "true",
+        "S3_ENDPOINT": "localhost:9000",
+        "S3_ACCESS_KEY": "minioadmin",
+        "S3_SECRET_KEY": "minioadmin",
+        "S3_BUCKET": "test_bucket",
+        "CELERY_BROKER_URL": "redis://localhost:6379/0",
+        "CELERY_RESULT_BACKEND": "redis://localhost:6379/1",
+    }
+
+
+@pytest.fixture
+def client():
+    """Test client for an app built from an empty environment (storage off by default)."""
+    settings = Settings.from_env({})
+    return create_app(settings=settings).test_client()
+
+
+@pytest.fixture
+def storage_client():
+    """Test client for a storage-enabled app, which registers the ID-based endpoints."""
+    settings = Settings.from_env(_storage_env())
+    return create_app(settings=settings).test_client()
+
+
+# Test POST API: /v1/ro_crates/{crate_id}/validation
+
+
+@pytest.mark.parametrize(
+    "payload, expected_args",
+    [
+        (
+            {"profile_name": "ro-crate", "webhook_url": "https://hook"},
+            ("crate-123", "ro-crate", "https://hook"),
+        ),
+        ({"profile_name": "ro-crate"}, ("crate-123", "ro-crate", None)),
+        ({"webhook_url": "https://hook"}, ("crate-123", None, "https://hook")),
+        ({}, ("crate-123", None, None)),
+    ],
+    ids=["all_fields", "no_webhook", "no_profile", "empty_body"],
+)
+def test_validate_by_id_queues_and_returns_202(storage_client, payload, expected_args):
+    queued = ({"message": "Validation in progress"}, 202)  # stubbed (body, status) pair
+    with patch("app.ro_crates.routes.post_routes.queue_ro_crate_validation_task") as enqueue:
+        enqueue.return_value = queued
+        reply = storage_client.post("/v1/ro_crates/crate-123/validation", json=payload)
+
+    assert reply.status_code == 202
+    assert reply.json == {"message": "Validation in progress"}
+    enqueue.assert_called_once_with(*expected_args)
+
+
+def test_validate_by_id_no_longer_accepts_credentials(storage_client):
+    """With no credentials in the body, only (crate_id, profile, webhook) reach the queue."""
+    with patch("app.ro_crates.routes.post_routes.queue_ro_crate_validation_task") as mock_queue:
+        mock_queue.return_value = ({"message": "Validation in progress"}, 202)
+
+        response = storage_client.post(
+            "/v1/ro_crates/crate-123/validation",
+            json={"profile_name": "ro-crate"},  # NOTE(review): sends no legacy minio_config
+        )
+
+    assert response.status_code == 202
+    # Exactly three positional arguments are forwarded; none of them is a minio_config.
+    mock_queue.assert_called_once_with("crate-123", "ro-crate", None)
+
+
+# Test POST API: /v1/ro_crates/validate_metadata
+
+
+@pytest.mark.parametrize(
+    "payload, status_code, response_json, profiles_path",
+    [
+        (
+            {
+                "crate_json": '{"@context": "https://w3id.org/ro/crate/1.1/context"}',
+                "profile_name": "default",
+            },
+            200,
+            {"status": "valid"},
+            None,
+        ),
+        (
+            {"crate_json": '{"@context": "https://w3id.org/ro/crate/1.1/context"}'},
+            200,
+            {"status": "valid"},
+            None,
+        ),
+    ],
+    ids=["success_with_all_fields", "success_without_profile_name"],
+)
+def test_validate_metadata_success(
+    client: FlaskClient, payload, status_code, response_json, profiles_path
+):
+    """The route hands crate_json/profile_name to the runner and relays its (body, status)."""
+    with patch("app.ro_crates.routes.post_routes.run_metadata_validation") as runner:
+        runner.return_value = (response_json, status_code)
+        reply = client.post("/v1/ro_crates/validate_metadata", json=payload)
+    # Storage-disabled client => all validation-tuning settings are unset.
+    tuning = {
+        "extra_profiles_path": None,
+        "cache_path": None,
+        "offline": False,
+    }
+    runner.assert_called_once_with(
+        payload.get("crate_json"),
+        payload.get("profile_name"),
+        profiles_path=profiles_path,
+        **tuning,
+    )
+    assert reply.status_code == status_code
+    assert reply.json == response_json
+
+
+@pytest.mark.parametrize(
+    "payload, status_code, response_text",
+    [
+        ({"profile_name": "default"}, 422, "Missing data for required field"),  # no crate_json
+        ({"crate_json": ""}, 422, "Missing required parameter"),  # present but blank
+        ({"crate_json": "{"}, 422, "not valid JSON"),  # text that does not parse as JSON
+        ({"crate_json": "{}"}, 422, "empty"),  # parses, but to an empty object
+    ],
+    ids=["missing_crate", "blank_crate", "malformed_crate", "empty_crate"],
+)
+def test_validate_metadata_failure(client: FlaskClient, payload, status_code, response_text):
+    response = client.post("/v1/ro_crates/validate_metadata", json=payload)
+    assert response.status_code == status_code
+    assert response_text in response.get_data(as_text=True)  # substring of the raw body
+
+
+# Test GET API: /v1/ro_crates/{crate_id}/validation
+
+
+def test_get_validation_by_id_returns_result(storage_client):
+    stored = ({"status": "valid"}, 200)  # what the patched lookup hands back
+    with patch("app.ro_crates.routes.get_routes.get_ro_crate_validation_task") as lookup:
+        lookup.return_value = stored
+        reply = storage_client.get("/v1/ro_crates/crate-123/validation")
+
+    assert reply.status_code == 200
+    assert reply.json == {"status": "valid"}
+    lookup.assert_called_once_with("crate-123")
+
+
+# Test store-backed endpoints are unavailable when storage is disabled (the default)
+
+
+def test_post_route_not_registered_when_storage_disabled(client: FlaskClient):
+    response = client.post("/v1/ro_crates/crate-123/validation", json={})
+    assert response.status_code == 404  # the POST route does not exist without storage
+
+
+def test_get_route_not_registered_when_storage_disabled(client: FlaskClient):
+    response = client.get("/v1/ro_crates/crate-123/validation")
+    assert response.status_code == 404  # the GET route does not exist without storage
+
+
+def test_metadata_route_available_when_storage_disabled(client: FlaskClient):
+    """The metadata endpoint answers even though the client's app has storage disabled."""
+    body = {"crate_json": '{"@context": "https://w3id.org/ro/crate/1.1/context"}'}
+    with patch("app.ro_crates.routes.post_routes.run_metadata_validation") as runner:
+        runner.return_value = ({"status": "valid"}, 200)
+        reply = client.post("/v1/ro_crates/validate_metadata", json=body)
+
+    assert reply.status_code == 200
+    runner.assert_called_once()
diff --git a/tests/services/test_logging_service.py b/tests/services/test_logging_service.py
new file mode 100644
index 0000000..7a0667a
--- /dev/null
+++ b/tests/services/test_logging_service.py
@@ -0,0 +1,74 @@
+"""Tests for structured logging, request IDs, and secret redaction."""
+
+import json
+import logging
+
+from app.services.logging_service import (
+ JsonFormatter,
+ RedactionFilter,
+ RequestIdFilter,
+ get_request_id,
+ new_request_id,
+ set_request_id,
+)
+
+
+def _record(msg, args=None):  # INFO LogRecord on logger "svc" carrying msg/args, no exc_info
+    return logging.LogRecord("svc", logging.INFO, "path", 1, msg, args, None)
+
+
+def test_json_formatter_emits_expected_fields():
+    """JsonFormatter output parses as JSON and carries the expected keys and values."""
+    rec = _record("hello")
+    rec.request_id = "r1"
+
+    doc = json.loads(JsonFormatter().format(rec))
+
+    expected = {"level": "INFO", "logger": "svc", "message": "hello", "request_id": "r1"}
+    for field, value in expected.items():
+        assert doc[field] == value
+    assert "timestamp" in doc
+
+
+def test_redaction_filter_masks_secret_values():
+    """Secrets supplied through %-style args are replaced by '***' in the final message."""
+    secrets = ["supersecret", "AKIAEXAMPLE"]
+    record = _record("connecting with key=%s token=%s", ("AKIAEXAMPLE", "supersecret"))
+
+    RedactionFilter(secrets).filter(record)
+
+    rendered = record.getMessage()
+    assert not any(secret in rendered for secret in secrets)
+    assert rendered.count("***") == 2
+
+
+def test_redaction_filter_ignores_empty_secrets():
+    """None and "" entries in the secret list do not stop "real" from being masked."""
+    record = _record("value=real")
+    RedactionFilter([None, "", "real"]).filter(record)
+    assert record.getMessage() == "value=***"
+
+
+def test_request_id_filter_injects_current_id():
+    set_request_id("abc-123")  # NOTE(review): not reset afterwards; may leak to later tests
+    record = _record("anything")
+
+    RequestIdFilter().filter(record)
+
+    assert record.request_id == "abc-123"  # the filter copied the current ID onto the record
+
+
+def test_request_id_filter_defaults_when_unset():
+    set_request_id(None)  # explicitly clear any ID left behind by an earlier test
+    record = _record("anything")
+    RequestIdFilter().filter(record)
+    assert record.request_id == "-"  # placeholder used when no request ID is set
+
+
+def test_new_request_id_is_unique():
+    assert new_request_id() != new_request_id()  # two consecutive calls must differ
+
+
+def test_get_request_id_round_trips():
+    set_request_id("xyz")  # NOTE(review): value is not cleared after the test
+    assert get_request_id() == "xyz"  # reads back the value stored on the line above
diff --git a/tests/services/test_validation_service.py b/tests/services/test_validation_service.py
new file mode 100644
index 0000000..1c393e9
--- /dev/null
+++ b/tests/services/test_validation_service.py
@@ -0,0 +1,173 @@
+"""Tests for the validation service layer."""
+
+from unittest.mock import patch
+
+import pytest
+from flask import Flask
+
+from app import create_app
+from app.crates.ids import InvalidCrateId
+from app.crates.layout import result_key
+from app.crates.resolver import AmbiguousCrate, CrateNotFound
+from app.services.validation_service import (
+ get_ro_crate_validation_task,
+ queue_ro_crate_validation_task,
+ run_metadata_validation,
+)
+from app.storage.memory import InMemoryStorage
+from app.utils.config import InvalidAPIUsage, Settings
+from app.validation.results import ValidationOutcome, ValidationStatus
+
+
+def _storage_env() -> dict:  # storage-enabled env mapping: S3 and Celery keys, dummy values
+    return {
+        "STORAGE_ENABLED": "true",
+        "S3_ENDPOINT": "minio:9000",
+        "S3_ACCESS_KEY": "a",
+        "S3_SECRET_KEY": "b",
+        "S3_BUCKET": "ro-crates",
+        "CELERY_BROKER_URL": "redis://r/0",
+        "CELERY_RESULT_BACKEND": "redis://r/1",
+    }
+
+
+@pytest.fixture
+def flask_app():
+    """Yield a bare Flask app inside its app context, for functions that only need jsonify."""
+    app = Flask(__name__)
+    with app.app_context():
+        yield app  # the context stays pushed while the test body runs
+
+
+@pytest.fixture
+def app_ctx():
+    """Storage-enabled app context, so current_app.config['SETTINGS'] is set."""
+    app = create_app(settings=Settings.from_env(_storage_env()))
+    with app.app_context():
+        yield app  # the context stays pushed while the test body runs
+
+
+# --- queue_ro_crate_validation_task --------------------------------------
+
+
+@patch("app.services.validation_service.process_validation_task_by_id.delay")
+@patch("app.services.validation_service.resolve_crate")
+@patch("app.services.validation_service._build_storage")
+def test_queue_resolves_then_delays(mock_storage, mock_resolve, mock_delay, app_ctx):
+    call_args = ("crate123", "ro-crate", "https://hook")  # (crate_id, profile, webhook)
+    response, status = queue_ro_crate_validation_task(*call_args)
+    assert status == 202
+    assert response.json == {"message": "Validation in progress"}
+    mock_resolve.assert_called_once()
+    mock_delay.assert_called_once_with(*call_args)
+
+
+@patch("app.services.validation_service.process_validation_task_by_id.delay")
+@patch("app.services.validation_service.resolve_crate", side_effect=CrateNotFound("nope"))
+@patch("app.services.validation_service._build_storage")
+def test_queue_not_found_propagates_without_queueing(
+    mock_storage, mock_resolve, mock_delay, app_ctx
+):
+    with pytest.raises(CrateNotFound):  # raised by the patched resolve_crate
+        queue_ro_crate_validation_task("missing")
+    mock_delay.assert_not_called()  # resolution failed, so no task was queued
+
+
+@patch("app.services.validation_service.process_validation_task_by_id.delay")
+@patch("app.services.validation_service.resolve_crate", side_effect=AmbiguousCrate("both"))
+@patch("app.services.validation_service._build_storage")
+def test_queue_ambiguous_propagates_without_queueing(
+    mock_storage, mock_resolve, mock_delay, app_ctx
+):
+    with pytest.raises(AmbiguousCrate):  # raised by the patched resolve_crate
+        queue_ro_crate_validation_task("dup")
+    mock_delay.assert_not_called()  # resolution failed, so no task was queued
+
+
+# --- run_metadata_validation (synchronous) -------------------------------
+
+
+@patch("app.services.validation_service.validate_metadata")
+def test_run_metadata_validation_valid_is_200(mock_validate, flask_app):
+    """A VALID outcome maps to HTTP 200; the validator receives the parsed JSON object."""
+    mock_validate.return_value = ValidationOutcome(
+        status=ValidationStatus.VALID, profile="ro-crate", detail={"report": "ok"}
+    )
+
+    body, code = run_metadata_validation('{"@graph": []}', "ro-crate", "/app/profiles")
+
+    assert (code, body.json["status"]) == (200, "valid")
+    mock_validate.assert_called_once_with(
+        {"@graph": []},
+        profile_name="ro-crate",
+        profiles_path="/app/profiles",
+        extra_profiles_path=None,
+        cache_path=None,
+        offline=False,
+    )
+
+
+@patch("app.services.validation_service.validate_metadata")
+def test_run_metadata_validation_invalid_is_200(mock_validate, flask_app):
+    """An INVALID outcome is still answered with HTTP 200 and status "invalid"."""
+    invalid = ValidationOutcome(status=ValidationStatus.INVALID, detail={"issues": [1]})
+    mock_validate.return_value = invalid
+    body, code = run_metadata_validation('{"@graph": []}')
+    assert code == 200
+    assert body.json["status"] == "invalid"
+
+
+@patch("app.services.validation_service.validate_metadata")
+def test_run_metadata_validation_error_outcome_is_422(mock_validate, flask_app):
+    reason = "validator blew up"  # error text expected to surface in the response
+    mock_validate.return_value = ValidationOutcome.from_error(reason)
+    body, code = run_metadata_validation('{"@graph": []}')
+    assert (code, body.json["status"]) == (422, "error")
+    assert reason in body.json["error"]
+
+
+@pytest.mark.parametrize(
+    "crate_json, response_error",
+    [
+        (None, "Missing required parameter: crate_json"),
+        ("", "Missing required parameter: crate_json"),
+        ("{", "not valid JSON"),
+        ("{}", "empty"),
+    ],
+    ids=["missing", "blank", "invalid_json", "empty_json"],
+)
+def test_run_metadata_validation_json_errors(flask_app, crate_json, response_error):
+    response, status = run_metadata_validation(crate_json)  # validator is not patched here
+    assert status == 422  # every malformed-input case is reported with the same status
+    assert response_error in response.json["error"]  # substring match on the error text
+
+
+# --- get_ro_crate_validation_task ----------------------------------------
+
+
+@patch("app.services.validation_service._build_storage")
+def test_get_returns_stored_result(mock_storage, app_ctx):
+    """A result stored under the crate's result key is returned with HTTP 200."""
+    backend = mock_storage.return_value = InMemoryStorage()
+    backend.put_bytes(result_key("validation-results", "crate123"), b'{"status": "valid"}')
+
+    response, status = get_ro_crate_validation_task("crate123")
+
+    assert status == 200
+    assert response.json["status"] == "valid"
+
+
+@patch("app.services.validation_service._build_storage")
+def test_get_missing_result_is_404(mock_storage, app_ctx):
+    mock_storage.return_value = InMemoryStorage()  # no result object is written to it
+
+    with pytest.raises(InvalidAPIUsage) as exc_info:
+        get_ro_crate_validation_task("crate123")
+    assert exc_info.value.status_code == 404  # checked after the block, on the caught error
+
+
+@patch("app.services.validation_service._build_storage")
+def test_get_invalid_id_raises(mock_storage, app_ctx):
+    mock_storage.return_value = InMemoryStorage()
+    with pytest.raises(InvalidCrateId):  # "../bad" must be rejected as a crate ID
+        get_ro_crate_validation_task("../bad")
diff --git a/tests/storage/test_memory.py b/tests/storage/test_memory.py
new file mode 100644
index 0000000..a2af99d
--- /dev/null
+++ b/tests/storage/test_memory.py
@@ -0,0 +1,61 @@
+"""Tests for the storage abstraction and its in-memory fake."""
+
+import pytest
+
+from app.storage.base import ObjectStat, StorageBackend
+from app.storage.errors import ObjectNotFound
+from app.storage.memory import InMemoryStorage
+
+
+@pytest.fixture
+def storage() -> InMemoryStorage:
+    return InMemoryStorage()  # default fixture scope: a new instance for every test
+
+
+def test_put_then_get_round_trips_bytes(storage):
+    storage.put_bytes("crates/foo.zip", b"payload")
+    assert storage.get_bytes("crates/foo.zip") == b"payload"  # bytes come back unchanged
+
+
+def test_get_missing_key_raises_object_not_found(storage):
+    with pytest.raises(ObjectNotFound):  # the key was never written
+        storage.get_bytes("crates/missing.zip")
+
+
+def test_stat_returns_size_for_existing_object(storage):
+    """stat() returns an ObjectStat carrying the object's key and its size in bytes."""
+    storage.put_bytes("crates/foo.zip", b"12345")
+    info = storage.stat("crates/foo.zip")
+    assert isinstance(info, ObjectStat)
+    assert (info.key, info.size) == ("crates/foo.zip", 5)
+
+
+def test_stat_missing_key_raises_object_not_found(storage):
+    with pytest.raises(ObjectNotFound):  # the key was never written
+        storage.stat("crates/missing.zip")
+
+
+def test_list_returns_only_keys_under_prefix(storage):
+    storage.put_bytes("crates/a/ro-crate-metadata.json", b"{}")
+    storage.put_bytes("crates/a/data.csv", b"x")  # stored second, expected first in the list
+    storage.put_bytes("crates/b.zip", b"y")  # sibling key outside "crates/a/"
+    storage.put_bytes("results/a.json", b"z")  # different top-level prefix
+
+    assert storage.list("crates/a/") == [
+        "crates/a/data.csv",
+        "crates/a/ro-crate-metadata.json",
+    ]
+
+
+def test_download_tree_preserves_relative_structure(storage, tmp_path):
+    storage.put_bytes("crates/a/ro-crate-metadata.json", b"{}")
+    storage.put_bytes("crates/a/sub/data.csv", b"col\n1\n")  # one level below the prefix
+
+    storage.download_tree("crates/a/", str(tmp_path))
+
+    assert (tmp_path / "ro-crate-metadata.json").read_bytes() == b"{}"  # prefix is stripped
+    assert (tmp_path / "sub" / "data.csv").read_bytes() == b"col\n1\n"  # sub-directory kept
+
+
+def test_in_memory_storage_satisfies_protocol(storage):
+    assert isinstance(storage, StorageBackend)  # presumably a runtime-checkable Protocol
diff --git a/tests/storage/test_s3.py b/tests/storage/test_s3.py
new file mode 100644
index 0000000..d1b4bcf
--- /dev/null
+++ b/tests/storage/test_s3.py
@@ -0,0 +1,107 @@
+"""Tests for the boto3-backed S3 storage backend, tested against moto."""
+
+import boto3
+import pytest
+from moto import mock_aws
+
+from app.storage.base import StorageBackend
+from app.storage.errors import ObjectNotFound, StorageError
+from app.storage.s3 import S3Backend
+from app.utils.config import Settings
+
+BUCKET = "test-bucket"
+
+
+@pytest.fixture
+def s3_backend():
+    with mock_aws():  # moto mocks the AWS calls made inside this block
+        client = boto3.client("s3", region_name="us-east-1")
+        client.create_bucket(Bucket=BUCKET)
+        yield S3Backend(client, BUCKET)  # the mock stays active while the test body runs
+
+
+def test_put_then_get_round_trips_bytes(s3_backend):
+    s3_backend.put_bytes("crates/foo.zip", b"payload")
+    assert s3_backend.get_bytes("crates/foo.zip") == b"payload"  # bytes come back unchanged
+
+
+def test_get_missing_key_raises_object_not_found(s3_backend):
+    with pytest.raises(ObjectNotFound):  # the bucket exists, the key does not
+        s3_backend.get_bytes("crates/missing.zip")
+
+
+def test_stat_returns_size_for_existing_object(s3_backend):
+    """stat() on a stored object reports its key and its size in bytes."""
+    s3_backend.put_bytes("crates/foo.zip", b"12345")
+    info = s3_backend.stat("crates/foo.zip")
+    assert (info.key, info.size) == ("crates/foo.zip", 5)
+
+
+def test_stat_missing_key_raises_object_not_found(s3_backend):
+    with pytest.raises(ObjectNotFound):  # the bucket exists, the key does not
+        s3_backend.stat("crates/missing.zip")
+
+
+def test_list_returns_only_keys_under_prefix_sorted(s3_backend):
+    s3_backend.put_bytes("crates/a/ro-crate-metadata.json", b"{}")
+    s3_backend.put_bytes("crates/a/data.csv", b"x")  # stored second, expected first
+    s3_backend.put_bytes("crates/b.zip", b"y")  # sibling key outside "crates/a/"
+    s3_backend.put_bytes("results/a.json", b"z")  # different top-level prefix
+
+    assert s3_backend.list("crates/a/") == [
+        "crates/a/data.csv",
+        "crates/a/ro-crate-metadata.json",
+    ]
+
+
+def test_list_paginates_beyond_one_thousand_objects(s3_backend):
+    for i in range(1500):  # more keys than a single S3 list page (max 1000) can return
+        s3_backend.put_bytes(f"many/{i:04d}.txt", b"x")
+    assert len(s3_backend.list("many/")) == 1500  # nothing dropped at the page boundary
+
+
+def test_download_tree_preserves_relative_structure(s3_backend, tmp_path):
+    s3_backend.put_bytes("crates/a/ro-crate-metadata.json", b"{}")
+    s3_backend.put_bytes("crates/a/sub/data.csv", b"col\n1\n")  # one level below the prefix
+
+    s3_backend.download_tree("crates/a/", str(tmp_path))
+
+    assert (tmp_path / "ro-crate-metadata.json").read_bytes() == b"{}"  # prefix is stripped
+    assert (tmp_path / "sub" / "data.csv").read_bytes() == b"col\n1\n"  # sub-directory kept
+
+
+def test_non_missing_client_error_becomes_storage_error(s3_backend):
+    """A failure other than a missing key surfaces as StorageError, not ObjectNotFound."""
+    broken = S3Backend(s3_backend._client, "nonexistent-bucket")  # same client, no such bucket
+    with pytest.raises(StorageError) as exc_info:
+        broken.get_bytes("whatever")
+    assert not isinstance(exc_info.value, ObjectNotFound)  # must not be the not-found type
+
+
+def test_health_check_passes_for_existing_bucket(s3_backend):
+    s3_backend.health_check()  # must not raise: the fixture created this bucket
+
+
+def test_health_check_fails_for_missing_bucket(s3_backend):
+    broken = S3Backend(s3_backend._client, "nonexistent-bucket")  # same client, no such bucket
+    with pytest.raises(StorageError):
+        broken.health_check()
+
+
+def test_s3_backend_satisfies_protocol(s3_backend):
+    assert isinstance(s3_backend, StorageBackend)  # presumably a runtime-checkable Protocol
+
+
+def test_from_settings_builds_backend_for_s3_compatible_endpoint():
+    env = {
+        "STORAGE_ENABLED": "true",
+        "S3_ENDPOINT": "minio:9000",  # host:port of an S3-compatible service
+        "S3_ACCESS_KEY": "minioadmin",
+        "S3_SECRET_KEY": "minioadmin",
+        "S3_BUCKET": "ro-crates",
+        "CELERY_BROKER_URL": "redis://redis:6379/0",
+        "CELERY_RESULT_BACKEND": "redis://redis:6379/1",
+    }
+    backend = S3Backend.from_settings(Settings.from_env(env))  # built outside mock_aws
+    assert isinstance(backend, StorageBackend)
+    assert backend.bucket == "ro-crates"  # bucket name comes from S3_BUCKET
diff --git a/tests/tasks/test_validation_tasks.py b/tests/tasks/test_validation_tasks.py
new file mode 100644
index 0000000..e0ad932
--- /dev/null
+++ b/tests/tasks/test_validation_tasks.py
@@ -0,0 +1,133 @@
+"""Tests for the store-backed validation orchestration (run_validation_job)."""
+
+import json
+from unittest import mock
+
+import pytest
+
+from app.crates.layout import result_key
+from app.storage.errors import StorageError
+from app.storage.memory import InMemoryStorage
+from app.tasks.validation_tasks import run_validation_job
+from app.utils.config import Settings
+from app.utils.webhook_utils import WebhookDeliveryError
+from app.validation.results import ValidationOutcome, ValidationStatus
+
+RUNNER = "app.tasks.validation_tasks.validate_crate_path"
+WEBHOOK = "app.tasks.validation_tasks.send_webhook_notification"
+
+
+def _settings() -> Settings:  # storage-enabled Settings built from dummy S3/Celery values
+    return Settings.from_env(
+        {
+            "STORAGE_ENABLED": "true",
+            "S3_ENDPOINT": "minio:9000",
+            "S3_ACCESS_KEY": "a",
+            "S3_SECRET_KEY": "b",
+            "S3_BUCKET": "ro-crates",
+            "CELERY_BROKER_URL": "redis://r/0",
+            "CELERY_RESULT_BACKEND": "redis://r/1",
+        }
+    )
+
+
+def _stored_outcome(storage: InMemoryStorage, crate_id: str) -> dict:
+    """Fetch the bytes at the crate's "validation-results" key and decode them as JSON."""
+    return json.loads(storage.get_bytes(result_key("validation-results", crate_id)))
+
+
+@pytest.fixture
+def storage() -> InMemoryStorage:
+    return InMemoryStorage()  # default fixture scope: a new instance for every test
+
+
+def test_valid_zip_crate_is_validated_and_persisted(storage):
+    storage.put_bytes("crates/foo.zip", b"PK\x03\x04")
+    valid = ValidationOutcome(
+        status=ValidationStatus.VALID, profile=None, detail={"r": 1}, created_at="t"
+    )
+
+    with mock.patch(RUNNER, return_value=valid) as runner, mock.patch(WEBHOOK) as webhook:
+        result = run_validation_job(storage, "foo", _settings(), created_at="t")
+
+    assert result.status is ValidationStatus.VALID
+    assert _stored_outcome(storage, "foo")["status"] == "valid"
+    # The validator's first positional argument is the path of the downloaded zip.
+    assert runner.call_args.args[0].endswith("foo.zip")
+    webhook.assert_not_called()
+
+
+def test_directory_crate_is_downloaded_and_persisted(storage):
+    storage.put_bytes("crates/foo/ro-crate-metadata.json", b"{}")  # directory-style crate
+    storage.put_bytes("crates/foo/data.csv", b"x")
+
+    with mock.patch(RUNNER) as run, mock.patch(WEBHOOK):
+        run.return_value = ValidationOutcome(
+            status=ValidationStatus.INVALID, detail={"issues": [1]}, created_at="t"
+        )
+        run_validation_job(storage, "foo", _settings(), created_at="t")
+
+    assert _stored_outcome(storage, "foo")["status"] == "invalid"  # outcome was written back
+
+
+def test_missing_crate_persists_error_outcome(storage):
+    with mock.patch(RUNNER) as run, mock.patch(WEBHOOK):  # nothing is stored for "absent"
+        outcome = run_validation_job(storage, "absent", _settings(), created_at="t")
+
+    assert outcome.status is ValidationStatus.ERROR
+    assert _stored_outcome(storage, "absent")["status"] == "error"  # error is persisted too
+    run.assert_not_called()  # never reached the validator
+
+
+def test_webhook_is_sent_with_outcome_when_url_given(storage):
+    storage.put_bytes("crates/foo.zip", b"PK")
+    valid = ValidationOutcome(status=ValidationStatus.VALID, created_at="t")
+
+    with mock.patch(RUNNER, return_value=valid), mock.patch(WEBHOOK) as hook:
+        run_validation_job(storage, "foo", _settings(), webhook_url="https://hook", created_at="t")
+
+    hook.assert_called_once()
+    target, body = hook.call_args.args  # the webhook helper got exactly (url, payload)
+    assert target == "https://hook"
+    assert body["status"] == "valid"
+
+
+def test_transient_storage_error_propagates_for_retry():
+    class FlakyStorage(InMemoryStorage):
+        def get_bytes(self, key):
+            raise StorageError("temporary outage")  # every read fails
+
+    storage = FlakyStorage()
+    storage.put_bytes("crates/foo.zip", b"PK")  # so resolution finds the zip
+
+    with mock.patch(RUNNER), mock.patch(WEBHOOK):
+        with pytest.raises(StorageError):  # raised to the caller, not turned into an outcome
+            run_validation_job(storage, "foo", _settings(), created_at="t")
+
+
+def test_webhook_failure_surfaces_but_result_is_already_persisted(storage):
+    """WebhookDeliveryError reaches the caller, and the outcome is in storage regardless."""
+    storage.put_bytes("crates/foo.zip", b"PK")
+
+    with mock.patch(RUNNER) as run, mock.patch(WEBHOOK) as hook:
+        run.return_value = ValidationOutcome(status=ValidationStatus.VALID, created_at="t")
+        hook.side_effect = WebhookDeliveryError("gave up")
+
+        with pytest.raises(WebhookDeliveryError):
+            run_validation_job(
+                storage, "foo", _settings(), webhook_url="https://hook", created_at="t"
+            )
+
+    # The stored result is readable even though the webhook call raised.
+    assert _stored_outcome(storage, "foo")["status"] == "valid"
+
+
+def test_created_at_is_persisted(storage):
+    """The created_at timestamp appears unchanged in the stored outcome."""
+    stamp = "2026-06-16T00:00:00Z"
+    storage.put_bytes("crates/foo.zip", b"PK")
+    valid = ValidationOutcome(status=ValidationStatus.VALID, created_at=stamp)
+    with mock.patch(RUNNER, return_value=valid), mock.patch(WEBHOOK):
+        run_validation_job(storage, "foo", _settings(), created_at=stamp)
+
+    assert _stored_outcome(storage, "foo")["created_at"] == stamp
diff --git a/tests/test_api_routes.py b/tests/test_api_routes.py
deleted file mode 100644
index 2f1071b..0000000
--- a/tests/test_api_routes.py
+++ /dev/null
@@ -1,340 +0,0 @@
-from flask.testing import FlaskClient
-import pytest
-from unittest.mock import patch
-from app import create_app
-
-
-@pytest.fixture
-def client():
- """Client with MinIO disabled (the default): only metadata validation is exposed."""
- app = create_app()
- return app.test_client()
-
-
-@pytest.fixture
-def minio_client(monkeypatch):
- """Client with MinIO enabled, so the ID-based validation endpoints are registered."""
- # MINIO_ENABLED is resolved on the Config class at import time, so override
- # the class attribute before building the app rather than the env var.
- monkeypatch.setattr("app.utils.config.DevelopmentConfig.MINIO_ENABLED", True)
- app = create_app()
- return app.test_client()
-
-
-# Test POST API: /v1/ro_crates/{crate_id}/validation
-
-@pytest.mark.parametrize(
- "crate_id, payload, profiles_path, status_code, response_json",
- [
- (
- "crate-123", {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "root_path": "base_path",
- "webhook_url": "https://webhook.example.com",
- "profile_name": "default"
- },
- None,
- 202, {"message": "Validation in progress"}
- ),
- (
- "crate-123", {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "root_path": "base_path",
- "webhook_url": "https://webhook.example.com",
- },
- None,
- 202, {"message": "Validation in progress"}
- ),
- (
- "crate-123", {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "root_path": "base_path",
- "profile_name": "default"
- },
- None,
- 202, {"message": "Validation in progress"}
- ),
- (
- "crate-123", {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "webhook_url": "https://webhook.example.com",
- "profile_name": "default"
- },
- None,
- 202, {"message": "Validation in progress"}
- ),
- (
- "crate-123", {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- },
- None,
- 202, {"message": "Validation in progress"}
- ),
- ],
- ids=["validate_by_id", "validate_with_missing_profile_name",
- "validate_with_missing_webhook_url", "validate_with_missing_root_path",
- "validate_with_missing_root_path_and_profile_name_and_webhook_url"]
-)
-def test_validate_by_id_success(minio_client: FlaskClient, crate_id: str, payload: dict,
- profiles_path: str, status_code: int, response_json: dict):
- with patch("app.ro_crates.routes.post_routes.queue_ro_crate_validation_task") as mock_queue:
- mock_queue.return_value = (response_json, status_code)
-
- response = minio_client.post(f"/v1/ro_crates/{crate_id}/validation", json=payload)
-
- minio_config = payload["minio_config"] if "minio_config" in payload else None
- root_path = payload["root_path"] if "root_path" in payload else None
- profile_name = payload["profile_name"] if "profile_name" in payload else None
- webhook_url = payload["webhook_url"] if "webhook_url" in payload else None
- assert response.status_code == status_code
- assert response.json == response_json
- mock_queue.assert_called_once_with(minio_config, crate_id, root_path, profile_name, webhook_url, profiles_path)
-
-
-@pytest.mark.parametrize(
- "crate_id, payload, status_code",
- [
- (
- "", {
- "minio_bucket": "test_bucket",
- "root_path": "base_path",
- "webhook_url": "https://webhook.example.com",
- "profile_name": "default"
- }, 404
- ),
- (
- "crate-123", {
- "root_path": "base_path",
- "webhook_url": "https://webhook.example.com",
- "profile_name": "default"
- }, 422
- ),
- ],
- ids=[
- "missing_crate_id_returns_404",
- "missing_minio_bucket_returns_422"
- ]
-)
-def test_validate_fails_missing_elements(minio_client: FlaskClient, crate_id: str, payload: dict, status_code: int):
- response = minio_client.post(f"/v1/ro_crates/{crate_id}/validation", json=payload)
- assert response.status_code == status_code
-
-
-# Test POST API: /v1/ro_crates/validate_metadata
-
-# TODO: Write tests for profiles_path environment variable. This will require a refactoring of the create_app function.
-@pytest.mark.parametrize(
- "payload, status_code, response_json, profiles_path",
- [
- (
- {
- "crate_json": '{"@context": "https://w3id.org/ro/crate/1.1/context"}',
- "profile_name": "default"
- }, 200, {"status": "success"}, None
- ),
- (
- {
- "crate_json": '{"@context": "https://w3id.org/ro/crate/1.1/context"}',
- }, 200, {"status": "success"}, None
- ),
- ],
- ids=["success_with_all_fields", "success_without_profile_name"]
-)
-def test_validate_metadata_success(client: FlaskClient, payload: dict, status_code: int,
- response_json: dict, profiles_path: str):
- with patch("app.ro_crates.routes.post_routes.queue_ro_crate_metadata_validation_task") as mock_queue:
- mock_queue.return_value = (response_json, status_code)
-
- response = client.post("/v1/ro_crates/validate_metadata", json=payload)
-
- crate_json = payload["crate_json"] if "crate_json" in payload else None
- profile_name = payload["profile_name"] if "profile_name" in payload else None
-
- mock_queue.assert_called_once_with(crate_json, profile_name, profiles_path=profiles_path)
- assert response.status_code == status_code
- assert response.json == response_json
-
-
-@pytest.mark.parametrize(
- "payload, status_code, response_text",
- [
- (
- {
- "profile_name": "default"
- }, 422, "Missing data for required field"
- ),
- (
- {
- "crate_json": '',
- }, 422, "Missing required parameter"
- ),
- (
- {
- "crate_json": '{',
- }, 422, "not valid JSON"
- ),
- (
- {
- "crate_json": '{}',
- }, 422, "Required parameter crate_json is empty"
- ),
- ],
- ids=["failure_missing_crate", "failure_empty_crate",
- "failure_malformed_crate", "failure_empty_crate"]
-)
-def test_validate_metadata_failure(client: FlaskClient, payload: dict, status_code: int, response_text: str):
- response = client.post("/v1/ro_crates/validate_metadata", json=payload)
- assert response.status_code == status_code
- assert response_text in response.get_data(as_text=True)
-
-
-# Test GET API: /v1/ro_crates/{crate_id}/validation
-
-@pytest.mark.parametrize(
- "crate_id, payload, status_code",
- [
- (
- "", {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "root_path": "base_path"
- }, 404
- ),
- (
- "crate-123", {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- },
- "root_path": "base_path"
- }, 422
- ),
- ],
- ids=["failure_missing_crate_id", "failure_missing_minio_bucket"]
-)
-def test_get_validation_by_id_failures(minio_client: FlaskClient, crate_id: str, payload: dict, status_code: int):
- response = minio_client.get(f"/v1/ro_crates/{crate_id}/validation", json=payload)
- assert response.status_code == status_code
-
-
-def test_get_validation_by_id_success(minio_client):
- crate_id = "crate-123"
- payload = {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "root_path": "base_path"
- }
-
- with patch("app.ro_crates.routes.get_routes.get_ro_crate_validation_task") as mock_get:
- mock_get.return_value = ({"status": "valid"}, 200)
-
- response = minio_client.get(f"/v1/ro_crates/{crate_id}/validation", json=payload)
-
- assert response.status_code == 200
- assert response.json == {"status": "valid"}
- mock_get.assert_called_once_with(payload["minio_config"], "crate-123", "base_path")
-
-
-def test_get_validation_by_id_missing_root_path(minio_client):
- crate_id = "crate-123"
- payload = {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- }
- }
-
- with patch("app.ro_crates.routes.get_routes.get_ro_crate_validation_task") as mock_get:
- mock_get.return_value = ({"status": "valid"}, 200)
-
- response = minio_client.get(f"/v1/ro_crates/{crate_id}/validation", json=payload)
-
- assert response.status_code == 200
- assert response.json == {"status": "valid"}
- mock_get.assert_called_once_with(payload["minio_config"], "crate-123", None)
-
-
-# Test MinIO-backed endpoints are unavailable when MinIO is disabled (the default)
-
-def test_minio_post_route_not_registered_when_disabled(client: FlaskClient):
- payload = {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- }
- }
- response = client.post("/v1/ro_crates/crate-123/validation", json=payload)
- assert response.status_code == 404
-
-
-def test_minio_get_route_not_registered_when_disabled(client: FlaskClient):
- payload = {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- }
- }
- response = client.get("/v1/ro_crates/crate-123/validation", json=payload)
- assert response.status_code == 404
-
-
-def test_metadata_route_available_when_minio_disabled(client: FlaskClient):
- payload = {"crate_json": '{"@context": "https://w3id.org/ro/crate/1.1/context"}'}
- with patch("app.ro_crates.routes.post_routes.queue_ro_crate_metadata_validation_task") as mock_queue:
- mock_queue.return_value = ({"status": "success"}, 200)
-
- response = client.post("/v1/ro_crates/validate_metadata", json=payload)
-
- assert response.status_code == 200
- mock_queue.assert_called_once()
diff --git a/tests/test_app_factory.py b/tests/test_app_factory.py
new file mode 100644
index 0000000..defcfb3
--- /dev/null
+++ b/tests/test_app_factory.py
@@ -0,0 +1,95 @@
+"""Tests for the application factory's configuration."""
+
+import pytest
+
+from app import create_app
+from app.utils.config import ConfigError, Settings
+
+
+def _storage_env() -> dict:
+    """Environment mapping with storage enabled and all S3/Celery values set."""
+    return {
+        "STORAGE_ENABLED": "true",
+        "S3_ENDPOINT": "minio:9000",
+        "S3_ACCESS_KEY": "minioadmin",
+        "S3_SECRET_KEY": "minioadmin",
+        "S3_BUCKET": "ro-crates",
+        "CELERY_BROKER_URL": "redis://redis:6379/0",
+        "CELERY_RESULT_BACKEND": "redis://redis:6379/1",
+    }
+
+
+def _route_paths(app) -> set:
+    """Return the set of URL rule strings registered on ``app``."""
+    rules = app.url_map.iter_rules()
+    return {rule.rule for rule in rules}
+
+
+def test_create_app_fails_fast_on_invalid_storage_config(monkeypatch):
+    """Storage enabled with its S3/Celery variables unset must raise ConfigError."""
+    missing = (
+        "S3_ENDPOINT",
+        "S3_ACCESS_KEY",
+        "S3_SECRET_KEY",
+        "S3_BUCKET",
+        "CELERY_BROKER_URL",
+        "CELERY_RESULT_BACKEND",
+    )
+    monkeypatch.setenv("STORAGE_ENABLED", "true")
+    for name in missing:
+        monkeypatch.delenv(name, raising=False)
+
+    with pytest.raises(ConfigError):
+        create_app()  # called without explicit settings (the default startup path)
+
+
+def test_storage_routes_absent_when_disabled():
+    """An empty environment registers the metadata route but no validation routes."""
+    settings = Settings.from_env({})
+    app = create_app(settings=settings)
+
+    registered = _route_paths(app)
+    assert "/v1/ro_crates/validate_metadata" in registered
+    assert all("validation" not in path for path in registered)
+    assert app.config["STORAGE_ENABLED"] is False
+
+
+def test_storage_routes_registered_when_enabled():
+    """A storage-enabled environment registers at least one ``/validation`` route."""
+    settings = Settings.from_env(_storage_env())
+    app = create_app(settings=settings)
+
+    matches = [path for path in _route_paths(app) if path.endswith("/validation")]
+    assert matches
+    assert app.config["STORAGE_ENABLED"] is True
+
+
+def test_profiles_path_exposed_to_app_config():
+    """``PROFILES_PATH`` from the environment is copied into ``app.config``."""
+    custom_path = "/custom/profiles"
+    app = create_app(settings=Settings.from_env({"PROFILES_PATH": custom_path}))
+    assert app.config["PROFILES_PATH"] == custom_path
+
+
+def test_response_includes_generated_request_id_header():
+    """A request sent without ``X-Request-ID`` gets a non-empty one in the response."""
+    client = create_app(settings=Settings.from_env({})).test_client()
+
+    response = client.post("/v1/ro_crates/validate_metadata", json={"crate_json": "{}"})
+
+    request_id = response.headers.get("X-Request-ID")
+    assert request_id
+
+
+def test_incoming_request_id_is_echoed():
+    """A caller-supplied ``X-Request-ID`` is returned unchanged."""
+    supplied_id = "caller-supplied-id"
+    client = create_app(settings=Settings.from_env({})).test_client()
+
+    response = client.post(
+        "/v1/ro_crates/validate_metadata",
+        json={"crate_json": "{}"},
+        headers={"X-Request-ID": supplied_id},
+    )
+
+    assert response.headers["X-Request-ID"] == supplied_id
diff --git a/tests/test_health.py b/tests/test_health.py
new file mode 100644
index 0000000..064865d
--- /dev/null
+++ b/tests/test_health.py
@@ -0,0 +1,86 @@
+"""Tests for the health and readiness endpoints."""
+
+from unittest import mock
+
+import pytest
+
+from app import create_app
+from app.utils.config import Settings
+
+
+def _storage_env() -> dict:
+    """Environment mapping that enables storage, using placeholder credentials."""
+    return {
+        "STORAGE_ENABLED": "true",
+        "S3_ENDPOINT": "minio:9000",
+        "S3_ACCESS_KEY": "a",
+        "S3_SECRET_KEY": "b",
+        "S3_BUCKET": "ro-crates",
+        "CELERY_BROKER_URL": "redis://r/0",
+        "CELERY_RESULT_BACKEND": "redis://r/1",
+    }
+
+
+@pytest.fixture
+def disabled_client():
+    """Test client for an app built from an empty environment."""
+    return create_app(settings=Settings.from_env({})).test_client()
+
+
+@pytest.fixture
+def storage_client():
+    """Test client for an app built from the storage-enabled environment."""
+    return create_app(settings=Settings.from_env(_storage_env())).test_client()
+
+
+def test_healthz_is_always_ok(disabled_client):
+    """``/healthz`` answers 200 with status ``ok``."""
+    response = disabled_client.get("/healthz")
+    assert response.status_code == 200
+    assert response.json["status"] == "ok"
+
+
+def test_readyz_ready_when_storage_disabled(disabled_client):
+    """Without storage, ``/readyz`` is ready and reports storage as ``disabled``."""
+    response = disabled_client.get("/readyz")
+    assert response.status_code == 200
+    assert response.json["status"] == "ready"
+    assert response.json["checks"]["storage"] == "disabled"
+
+
+def test_readyz_ok_when_all_checks_pass(storage_client):
+    """With both dependency checks patched to succeed, ``/readyz`` is ready."""
+    with (
+        mock.patch("app.health.check_storage", return_value=(True, "ok")),
+        mock.patch("app.health.check_broker", return_value=(True, "ok")),
+    ):
+        response = storage_client.get("/readyz")
+
+    assert response.status_code == 200
+    assert response.json["status"] == "ready"
+
+
+def test_readyz_503_when_storage_unreachable(storage_client):
+    """A failing storage check yields 503 and surfaces the check's detail message."""
+    with (
+        mock.patch("app.health.check_storage", return_value=(False, "bucket down")),
+        mock.patch("app.health.check_broker", return_value=(True, "ok")),
+    ):
+        response = storage_client.get("/readyz")
+
+    assert response.status_code == 503
+    assert response.json["status"] == "not ready"
+    assert response.json["checks"]["storage"] == "bucket down"
+
+
+def test_readyz_503_when_broker_unreachable(storage_client):
+    """A failing broker check yields 503 and surfaces the check's detail message."""
+    with (
+        mock.patch("app.health.check_storage", return_value=(True, "ok")),
+        mock.patch("app.health.check_broker", return_value=(False, "broker down")),
+    ):
+        response = storage_client.get("/readyz")
+
+    assert response.status_code == 503
+    # Same overall status as the storage-failure case, so both paths are pinned.
+    assert response.json["status"] == "not ready"
+    assert response.json["checks"]["broker"] == "broker down"
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 0c3b5fe..0f0a29a 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -1,584 +1,186 @@
-import pytest
-import subprocess
-import time
-import requests
-import json
-import os
-import docker
-from minio import Minio
-import uuid
-
-
-@pytest.fixture(scope="session")
-def docker_client():
- return docker.from_env()
-
-
-@pytest.fixture(scope="session", autouse=True)
-def docker_compose(docker_client):
- """Start Docker Compose before tests, shut down after."""
- print("Starting Docker Compose...")
+"""Integration tests against the full Docker stack. Brings up the dev compose stack (flask +
+celery + redis + RustFS objectstore), seeds crates into the canonical layout via boto3, and
+drives the HTTP API.
- PROJECT = f"test_{uuid.uuid4().hex}"
+Run with Docker available: pytest -s -v tests/test_integration.py
+Excluded from the unit-test run (it needs Docker).
+"""
- # Integration tests use the MinIO endpoints, so enable
- # MinIO and start the opt-in "minio" compose profile.
- compose_env = {**os.environ, "MINIO_ENABLED": "true"}
-
- subprocess.run(
- [
- "docker",
- "compose",
- "-f",
- "docker-compose-develop.yml",
- "-p",
- PROJECT,
- "--profile",
- "minio",
- "up",
- "-d",
- ],
- check=True,
- env=compose_env,
- )
- time.sleep(10) # Wait for services to start — adjust as needed
-
- load_test_data_into_minio()
+import os
+import subprocess
+import time
- yield # Run the tests
+import boto3
+import pytest
+import requests
- for container in docker_client.containers.list():
- if "cratey-validator" in container.name:
- logs = container.logs().decode("utf-8")
+BASE_URL = "http://localhost:5001"
+S3_URL = "http://localhost:9000"
+BUCKET = "ro-crates"
+CRATE_PREFIX = "crates"
+ACCESS_KEY = "rustfsadmin"
+SECRET_KEY = "rustfsadmin"
+COMPOSE_FILE = "docker-compose-develop.yml"
+PROJECT = "cratey_integration"
+TEST_DATA = "tests/data/ro_crates"
- print(f"\n======= Logs from {container.name} container =======")
- print(logs)
- print("Stopping Docker Compose...")
+def _compose(*args, env=None):
subprocess.run(
[
"docker",
"compose",
"-f",
- "docker-compose-develop.yml",
+ COMPOSE_FILE,
"-p",
PROJECT,
"--profile",
- "minio",
- "down",
- "-v",
+ "objectstore",
+ *args,
],
check=True,
+ env=env,
)
-def load_test_data_into_minio():
- """Connect to MinIO and upload test files."""
- minio_client = Minio(
- endpoint="localhost:9000",
- access_key="minioadmin",
- secret_key="minioadmin",
- secure=False,
- )
-
- bucket_name = "ro-crates"
- test_data_dir = "tests/data/ro_crates"
-
- minio_client.make_bucket(bucket_name)
-
- # Walk and upload files
- for root, _, files in os.walk(test_data_dir):
- for file_name in files:
- file_path = os.path.join(root, file_name)
- object_name = os.path.relpath(file_path, test_data_dir)
-
- print(f"Uploading {file_path} as {object_name} to bucket {bucket_name}")
- minio_client.fput_object(bucket_name, object_name, file_path)
-
-
-def test_validate_metadata():
- url = "http://localhost:5001/v1/ro_crates/validate_metadata"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # Load the JSON from file
- filepath = os.path.join("tests/data", "ro-crate-metadata.json")
- with open(filepath, "r", encoding="utf-8") as f:
- crate_json_data = json.load(f)
-
- # The API expects the JSON to be passed as a string
- payload = {"crate_json": json.dumps(crate_json_data)}
-
- response = requests.post(url, json=payload, headers=headers)
-
- response_result = json.loads(response.json()["result"])
+def _wait_for(url, timeout=90):
+    """Poll a URL until it responds, or fail after timeout seconds."""
+    deadline = time.monotonic() + timeout  # monotonic: unaffected by wall-clock jumps
+    while time.monotonic() < deadline:
+        try:
+            if requests.get(url, timeout=2).status_code:  # any HTTP status counts
+                return
+        except requests.RequestException:
+            pass  # not accepting connections yet; retry after the sleep
+        time.sleep(2)
+    raise RuntimeError(f"Timed out waiting for {url}")
+
+
+def _seed_crates(s3):
+    """Create the bucket and upload test crates under the crates/ prefix."""
+    try:
+        s3.create_bucket(Bucket=BUCKET)
+    except (s3.exceptions.BucketAlreadyOwnedByYou, s3.exceptions.BucketAlreadyExists):
+        pass  # bucket already exists; any other client error should surface
+
+    for root, _, files in os.walk(TEST_DATA):
+        for name in files:
+            path = os.path.join(root, name)
+            rel = os.path.relpath(path, TEST_DATA)
+            s3.upload_file(path, BUCKET, f"{CRATE_PREFIX}/{rel}")
+
+
+def _poll_result(crate_id, timeout=90):
+    """Poll the GET endpoint until a stored result appears (past 404)."""
+    url = f"{BASE_URL}/v1/ro_crates/{crate_id}/validation"
+    deadline = time.monotonic() + timeout  # monotonic: unaffected by wall-clock jumps
+    response = requests.get(url, timeout=10)  # bound each request so CI cannot hang
+    while response.status_code == 404 and time.monotonic() < deadline:
+        time.sleep(3)
+        response = requests.get(url, timeout=10)
+    return response
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
- # Assertions — update based on expected API behavior
- assert response.status_code == 200
- assert response_result["passed"] is True
-
-
-def test_no_rocrate_for_validation():
- ro_crate = "ro_crate_10"
- url = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- }
- }
-
- response = requests.post(url, json=payload, headers=headers)
-
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- # Assertions — update based on expected API behavior
- assert response.status_code == 400
- assert response_result["message"] == f"No RO-Crate with prefix: {ro_crate}"
-
-
-def test_no_validation_result_for_missing_crate():
- ro_crate = "ro_crate_10"
- url_get = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- }
- }
-
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- # Assertions
- assert response.status_code == 400
- assert response_result["message"] == f"No RO-Crate with prefix: {ro_crate}"
-
-
-def test_get_existing_validation_result():
- ro_crate = "ro_crate_3"
- url_get = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- }
+@pytest.fixture(scope="session", autouse=True)
+def stack():  # brings the compose stack up once per session and always tears it down
+    env = {
+        **os.environ,  # keep the caller's environment for the docker CLI
+        "STORAGE_ENABLED": "true",
+        "S3_ENDPOINT": "objectstore:9000",
+        "S3_ACCESS_KEY": ACCESS_KEY,
+        "S3_SECRET_KEY": SECRET_KEY,
+        "S3_BUCKET": BUCKET,
+        "S3_USE_SSL": "false",
+        "RUSTFS_ACCESS_KEY": ACCESS_KEY,
+        "RUSTFS_SECRET_KEY": SECRET_KEY,
     }
+    try:  # "up" runs inside the try so a failed or partial start is still torn down
+        _compose("up", "-d", "--build", env=env)
+        _wait_for(f"{BASE_URL}/healthz")
+        s3 = boto3.client(
+            "s3",
+            endpoint_url=S3_URL,
+            aws_access_key_id=ACCESS_KEY,
+            aws_secret_access_key=SECRET_KEY,
+            region_name="us-east-1",
+        )
+        _seed_crates(s3)
+        yield
+    finally:
+        _compose("down", "-v", env=env)
+
+
+def test_healthz_and_readyz():
+    liveness = requests.get(f"{BASE_URL}/healthz")
+    assert liveness.json()["status"] == "ok"
+    readiness = requests.get(f"{BASE_URL}/readyz")
+    assert readiness.status_code == 200
+    assert readiness.json()["status"] == "ready"
+    assert readiness.json()["checks"] == {"storage": "ok", "broker": "ok"}
+
+
+def test_validate_metadata_inline():
+ with open("tests/data/ro-crate-metadata.json", encoding="utf-8") as f:
+ crate_json = f.read()
+
+ response = requests.post(
+ f"{BASE_URL}/v1/ro_crates/validate_metadata",
+ json={"crate_json": crate_json},
+ )
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- # Assertions
assert response.status_code == 200
- assert response_result["passed"] is False
-
-
-def test_rocrate_not_validated_yet():
- ro_crate = "ro_crate_not_validated"
- url_get = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- }
- }
-
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- # Assertions
- assert response.status_code == 400
- assert (
- response_result["message"]
- == f"No validation result yet for RO-Crate: {ro_crate}"
- )
+ assert response.json()["status"] == "valid"
-def test_zipped_rocrate_validation():
- ro_crate = "ro_crate_1"
- url_post = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- url_get = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- }
- }
+def test_missing_crate_returns_404():
+    crate_id = "does_not_exist"
+    reply = requests.post(f"{BASE_URL}/v1/ro_crates/{crate_id}/validation", json={})
+    assert reply.status_code == 404
+    assert reply.json()["error"] == f"No crate found for ID '{crate_id}'"
- # POST action and tests
- response = requests.post(url_post, json=payload, headers=headers)
- response_result = response.json()["message"]
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
+def test_get_missing_result_returns_404():
+    crate_id = "does_not_exist"
+    reply = requests.get(f"{BASE_URL}/v1/ro_crates/{crate_id}/validation")
+    assert reply.status_code == 404
+    assert reply.json()["message"] == f"No validation result yet for RO-Crate: {crate_id}"
- # Assertions
- assert response.status_code == 202
- assert response_result == "Validation in progress"
-
- # wait for ro-crate to be validated
- time.sleep(10)
-
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- start_time = time.time()
- while response.status_code == 400:
- time.sleep(10)
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- elapsed = time.time() - start_time
- if elapsed > 60:
- print("60 seconds passed. Exiting loop")
- break
-
- # Assertions
- assert response.status_code == 200
- assert response_result["passed"] is False
-
-
-def test_directory_rocrate_validation():
- ro_crate = "ro_crate_2"
- url_post = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- url_get = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- }
- }
- # POST action and tests
- response = requests.post(url_post, json=payload, headers=headers)
- response_result = response.json()["message"]
+def test_get_result_for_unvalidated_crate_returns_404():
+    """GET for a crate with no stored validation result answers 404 with a message."""
+    crate_id = "ro_crate_not_validated"
+    reply = requests.get(f"{BASE_URL}/v1/ro_crates/{crate_id}/validation")
+    assert reply.status_code == 404
+    assert reply.json()["message"] == f"No validation result yet for RO-Crate: {crate_id}"
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
- # Assertions
+def test_zipped_crate_validation():
+ response = requests.post(f"{BASE_URL}/v1/ro_crates/ro_crate_1/validation", json={})
assert response.status_code == 202
- assert response_result == "Validation in progress"
-
- # wait for ro-crate to be validated
- time.sleep(10)
-
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- start_time = time.time()
- while response.status_code == 400:
- time.sleep(10)
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- elapsed = time.time() - start_time
- if elapsed > 60:
- print("60 seconds passed. Exiting loop")
- break
-
- # Assertions
- assert response.status_code == 200
- assert response_result["passed"] is False
-
-
-def test_extra_profile_rocrate_validation():
- ro_crate = "ro_crate_2"
- profile_name = "alpha-crate-0.1"
- url_post = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- url_get = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- post_payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- },
- "profile_name": profile_name,
- }
- get_payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- }
- }
+ assert response.json()["message"] == "Validation in progress"
- # POST action and tests
- response = requests.post(url_post, json=post_payload, headers=headers)
- response_result = response.json()["message"]
+ result = _poll_result("ro_crate_1")
+ assert result.status_code == 200
+ assert result.json()["status"] == "invalid"
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
- # Assertions
+def test_directory_crate_validation():
+ response = requests.post(f"{BASE_URL}/v1/ro_crates/ro_crate_2/validation", json={})
assert response.status_code == 202
- assert response_result == "Validation in progress"
-
- # wait for ro-crate to be validated
- time.sleep(10)
-
- # GET action and tests
- response = requests.get(url_get, json=get_payload, headers=headers)
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- start_time = time.time()
- while response.status_code == 400:
- time.sleep(10)
- # GET action and tests
- response = requests.get(url_get, json=get_payload, headers=headers)
- response_result = response.json()
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- elapsed = time.time() - start_time
- if elapsed > 60:
- print("60 seconds passed. Exiting loop")
- break
-
- # Assertions
- assert response.status_code == 200
- assert response_result["passed"] is False
-
-
-def test_ignore_rocrates_not_on_basepath():
- ro_crate = "ro_crate_4"
- url_post = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- }
- }
- # POST action and tests
- response = requests.post(url_post, json=payload, headers=headers)
- response_result = response.json()["message"]
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- # Assertions
- assert response.status_code == 400
- assert response_result == "No RO-Crate with prefix: ro_crate_4"
-
-
-def test_zipped_rocrate_in_subdirectory_validation():
- ro_crate = "ro_crate_4"
- subdir_path = "project_a"
- url_post = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- url_get = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- },
- "root_path": subdir_path,
- }
-
- # POST action and tests
- response = requests.post(url_post, json=payload, headers=headers)
- response_result = response.json()["message"]
+ result = _poll_result("ro_crate_2")
+ assert result.status_code == 200
+ assert result.json()["status"] == "invalid"
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
- # Assertions
+def test_validation_with_explicit_profile():
+ """A request carrying an explicit profile_name is accepted and produces a result."""
+ response = requests.post(
+ f"{BASE_URL}/v1/ro_crates/ro_crate_3/validation",
+ json={"profile_name": "alpha-crate-0.1"},
+ )
assert response.status_code == 202
- assert response_result == "Validation in progress"
-
- # wait for ro-crate to be validated
- time.sleep(10)
-
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- start_time = time.time()
- while response.status_code == 400:
- time.sleep(10)
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- elapsed = time.time() - start_time
- if elapsed > 60:
- print("60 seconds passed. Exiting loop")
- break
-
- # Assertions
- assert response.status_code == 200
- assert response_result["passed"] is False
-
-
-def test_directory_rocrate_in_subdirectory_validation():
- ro_crate = "ro_crate_5"
- subdir_path = "project_a"
- url_post = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- url_get = f"http://localhost:5001/v1/ro_crates/{ro_crate}/validation"
- headers = {"accept": "application/json", "Content-Type": "application/json"}
-
- # The API expects the JSON to be passed as a string
- payload = {
- "minio_config": {
- "endpoint": "minio:9000",
- "accesskey": "minioadmin",
- "secret": "minioadmin",
- "ssl": False,
- "bucket": "ro-crates",
- },
- "root_path": subdir_path,
- }
- # POST action and tests
- response = requests.post(url_post, json=payload, headers=headers)
- response_result = response.json()["message"]
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- # Assertions
- assert response.status_code == 202
- assert response_result == "Validation in progress"
-
- # wait for ro-crate to be validated
- time.sleep(10)
-
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
-
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- start_time = time.time()
- while response.status_code == 400:
- time.sleep(10)
- # GET action and tests
- response = requests.get(url_get, json=payload, headers=headers)
- response_result = response.json()
- # Print response for debugging
- print("Status Code:", response.status_code)
- print("Response JSON:", response_result)
-
- elapsed = time.time() - start_time
- if elapsed > 60:
- print("60 seconds passed. Exiting loop")
- break
-
- # Assertions
- assert response.status_code == 200
- assert response_result["passed"] is False
+ result = _poll_result("ro_crate_3")
+ assert result.status_code == 200
+ assert result.json()["status"] == "invalid"
diff --git a/tests/test_minio.py b/tests/test_minio.py
deleted file mode 100644
index 426d901..0000000
--- a/tests/test_minio.py
+++ /dev/null
@@ -1,506 +0,0 @@
-import json
-import pytest
-from io import BytesIO
-from minio import Minio
-from minio.error import S3Error
-from unittest.mock import MagicMock, patch
-from unittest import mock
-
-
-@pytest.fixture
-def mock_minio_response():
- response = MagicMock()
- response.data.decode.return_value = json.dumps({"status": "valid"})
- return response
-
-
-class DummyObject:
- def __init__(self, name, is_dir=False):
- self.object_name = name
- self.is_dir = is_dir
-
-
-# Testing function: get_minio_client
-
-@pytest.mark.parametrize(
- "minio_config",
- [
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False
- },
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "ignore_this"
- }
- ],
- ids=["base_case", "ignore_extra_items"]
-)
-def test_get_minio_client_success(minio_config: dict):
-
- from app.utils.minio_utils import get_minio_client
- client = get_minio_client(minio_config)
-
- assert isinstance(client, Minio)
- assert client._base_url.host == "localhost:9000"
-
-
-# Testing function: get_minio_object_list
-
-def test_get_minio_object_list_success():
- # Setup mock response
- mock_response = MagicMock()
- mock_objects = [DummyObject("file1.txt"), DummyObject("file2.txt")]
- mock_response.__iter__.return_value = iter(mock_objects)
-
- # Patch minio_client
- mock_minio_client = MagicMock()
- mock_minio_client.list_objects.return_value = mock_response
-
- # Call function
- from app.utils.minio_utils import get_minio_object_list
- result = get_minio_object_list("path/", mock_minio_client, "my-bucket", recursive=True)
-
- # Assert
- assert result == mock_objects
- mock_minio_client.list_objects.assert_called_once_with("my-bucket", "path/", recursive=True)
- mock_response.close.assert_called_once()
-
-
-@pytest.mark.parametrize(
- "bucket, path, status_code, list_side_effect, error_check",
- [
- (
- "my-bucket", "path/rocrate.zip", 500,
- S3Error(code="S3 error",
- message=None,
- resource=None,
- request_id=None,
- host_id=None,
- response=None),
- "MinIO S3 Error"
- ),
- (
- "my-bucket", "path/rocrate.zip", 500,
- ValueError("Missing config"),
- "Configuration Error"
- ),
- (
- "my-bucket", "path/rocrate.zip", 500,
- RuntimeError("Something went wrong"),
- "Unknown Error"
- ),
- ],
- ids=["s3error", "value_error", "unexpected_error"]
-)
-def test_get_minio_object_list_errors(bucket: str, path: str, status_code: int, list_side_effect, error_check: str):
- mock_minio_client = MagicMock()
- mock_minio_client.list_objects.side_effect = list_side_effect
-
- from app.utils.minio_utils import get_minio_object_list, InvalidAPIUsage
- with pytest.raises(InvalidAPIUsage) as exc:
- get_minio_object_list(path, mock_minio_client, bucket)
-
- assert exc.value.status_code == status_code
- assert error_check in str(exc.value.message)
-
-
-# Testing function: find_rocrate_object_on_minio
-
-
-@pytest.mark.parametrize(
- "rocrate_object, crateid, bucket, root_path",
- [
- (
- DummyObject("my/path/rocrate123/", is_dir=True),
- "rocrate123", "bucket", "my/path"
- ),
- (
- DummyObject("my/path/rocrate123.zip"),
- "rocrate123", "bucket", "my/path"
- ),
- (
- DummyObject("rocrate123.zip"),
- "rocrate123", "bucket", None
- ),
- ],
- ids=["rocrate_directory", "rocrate_zip", "rootpath_none"]
-)
-@patch("app.utils.minio_utils.get_minio_object_list")
-def test_finding_rocrate_on_minio(
- mock_get_list,
- rocrate_object: DummyObject, crateid: str, bucket: str, root_path: str):
- # Simulate a directory object match
- mock_get_list.return_value = [rocrate_object]
- minio_client = MagicMock()
-
- from app.utils.minio_utils import find_rocrate_object_on_minio
- result = find_rocrate_object_on_minio(crateid, minio_client, bucket, root_path)
- assert result == rocrate_object
-
-
-@patch("app.utils.minio_utils.get_minio_object_list")
-def test_rocrate_not_found(mock_get_list):
- # Simulate no matching object
- mock_get_list.return_value = [
- DummyObject("something_else"),
- DummyObject("another_dir", is_dir=True)
- ]
- minio_client = MagicMock()
-
- from app.utils.minio_utils import find_rocrate_object_on_minio
- result = find_rocrate_object_on_minio("rocrate123", minio_client, "bucket", None)
-
- mock_get_list.assert_called_once()
- assert not result
-
-
-# Testing function: find_validation_object_on_minio
-
-@pytest.mark.parametrize(
- "object_path, crateid, bucket, root_path",
- [
- (
- "my/storage/rocrate123_validation/validation_status.txt",
- "rocrate123", "bucket", "my/storage"
- ),
- (
- "rocrate123_validation/validation_status.txt",
- "rocrate123", "bucket", None
- ),
- ],
- ids=["with_storage_path", "without_storage_path"]
-)
-@patch("app.utils.minio_utils.get_minio_object_list")
-def test_validation_object_found_with_storage_path(
- mock_get_list,
- object_path: str, crateid: str, bucket: str, root_path: str):
- # Setup
- obj = DummyObject(object_path)
- mock_get_list.return_value = [obj]
-
- from app.utils.minio_utils import find_validation_object_on_minio
- # Execute
- result = find_validation_object_on_minio(crateid, MagicMock(), bucket, root_path)
-
- # Assert
- assert result == obj
- mock_get_list.assert_called_once_with(object_path, mock.ANY, bucket)
-
-
-@pytest.mark.parametrize(
- "object_list, crateid, bucket, root_path",
- [
- (
- [DummyObject("some/other/object.txt")],
- "rocrate999", "bucket", None
- ),
- (
- [],
- "rocrate999", "bucket", None
- ),
- ],
- ids=["other_objects", "empty_list"]
-)
-@patch("app.utils.minio_utils.get_minio_object_list")
-def test_validation_object_not_found(
- mock_get_list,
- object_list: list, crateid: str, bucket: str, root_path: str):
- # Setup: no objects returned
- mock_get_list.return_value = object_list
-
- from app.utils.minio_utils import find_validation_object_on_minio
- result = find_validation_object_on_minio(crateid, MagicMock(), bucket, root_path)
-
- assert result is False
-
-
-# Testing function: download_file_from_minio
-
-@patch("app.utils.minio_utils.logging")
-def test_download_success(mock_logging):
- mock_minio = MagicMock()
-
- from app.utils.minio_utils import download_file_from_minio
- # No exceptions raised
- download_file_from_minio(mock_minio, "bucket", "remote/path.txt", "local/path.txt")
-
- mock_minio.fget_object.assert_called_once_with("bucket", "remote/path.txt", "local/path.txt")
- mock_logging.error.assert_not_called()
-
-
-@pytest.mark.parametrize(
- "bucket, remotepath, localpath, status_code, get_side_effect, error_check",
- [
- (
- "my-bucket", "remote/path.txt", "local/path.txt", 500,
- S3Error(code="S3 error",
- message=None,
- resource=None,
- request_id=None,
- host_id=None,
- response=None),
- "MinIO S3 Error"
- ),
- (
- "my-bucket", "remote/path.txt", "local/path.txt", 500,
- ValueError("Missing config"),
- "Configuration Error"
- ),
- (
- "my-bucket", "remote/path.txt", "local/path.txt", 500,
- RuntimeError("Something went wrong"),
- "Unknown Error"
- ),
- ],
- ids=["s3error", "value_error", "unexpected_error"]
-)
-@patch("app.utils.minio_utils.logging")
-def test_download_s3error(
- mock_logging,
- bucket: str, remotepath: str, localpath: str, status_code: int,
- get_side_effect, error_check: str
-):
- mock_minio = MagicMock()
- mock_minio.fget_object.side_effect = get_side_effect
-
- from app.utils.minio_utils import download_file_from_minio, InvalidAPIUsage
- with pytest.raises(InvalidAPIUsage) as exc:
- download_file_from_minio(mock_minio, bucket, remotepath, localpath)
-
- assert exc.value.status_code == status_code
- assert error_check in str(exc.value.message)
- mock_logging.error.assert_called_once()
-
-
-# Testing function: get_validation_status_from_minio
-
-def test_successful_retrieval(mocker, mock_minio_response):
- mock_client = MagicMock()
- mock_client.get_object.return_value = mock_minio_response
-
- from app.utils.minio_utils import get_validation_status_from_minio
- result = get_validation_status_from_minio(mock_client, "test_bucket", "crate123", None)
-
- assert result == {"status": "valid"}
- mock_minio_response.close.assert_called_once()
- mock_minio_response.release_conn.assert_called_once()
-
-
-@pytest.mark.parametrize(
- "bucket, crateid, root_path, status_code, get_side_effect, error_check",
- [
- (
- "my-bucket", "crate123", None, 500,
- S3Error(code="S3 error",
- message=None,
- resource=None,
- request_id=None,
- host_id=None,
- response=None),
- "MinIO S3 Error"
- ),
- (
- "my-bucket", "crate123", None, 500,
- ValueError("Missing env var"),
- "Configuration Error"
- ),
- (
- "my-bucket", "crate123", None, 500,
- RuntimeError("Unexpected failure"),
- "Unknown Error"
- ),
- ],
- ids=["s3error", "value_error", "unexpected_error"]
-)
-def test_get_validation_error_raised(
- mocker,
- bucket: str, crateid: str, root_path: str, status_code: int, get_side_effect, error_check: str
-):
- mock_client = MagicMock()
- mock_client.get_object.side_effect = get_side_effect
-
- from app.utils.minio_utils import get_validation_status_from_minio, InvalidAPIUsage
- with pytest.raises(InvalidAPIUsage) as exc:
- get_validation_status_from_minio(mock_client, bucket, crateid, root_path)
-
- assert exc.value.status_code == status_code
- assert error_check in str(exc.value.message)
-
-
-# Testing function: update_validation_status_in_minio
-
-def test_update_validation_status_success():
- mock_minio_client = mock.Mock()
-
- crate_id = "crate123"
- validation_status = json.dumps({"status": "valid", "errors": []})
-
- from app.utils.minio_utils import update_validation_status_in_minio
- update_validation_status_in_minio(mock_minio_client, "test_bucket", crate_id, "", validation_status)
-
- expected_object_name = f"{crate_id}_validation/validation_status.txt"
- expected_data = json.dumps(json.loads(validation_status), indent=None).encode("utf-8")
-
- mock_minio_client.put_object.assert_called_once()
- args, kwargs = mock_minio_client.put_object.call_args
-
- # FIXME: Original suggested test expected 4 values in args, but returned only 2.
- # Solution was to check both args and kwargs for the 'data' and 'length' objects.
- # Do we need to chose one format of call_args for our tests, or is this ambiguity okay?
- bucket_name = args[0] if args else kwargs["bucket_name"]
- object_name = args[1] if len(args) > 1 else kwargs["object_name"]
- actual_data_stream = args[2] if len(args) > 2 else kwargs["data"]
- length = args[3] if len(args) > 3 else kwargs["length"]
-
- assert bucket_name == "test_bucket"
- assert object_name == expected_object_name
- assert isinstance(actual_data_stream, BytesIO)
- actual_data_stream.seek(0)
- assert actual_data_stream.read() == expected_data
- assert length == len(expected_data)
- assert kwargs["content_type"] == "application/json"
-
-
-@pytest.mark.parametrize(
- "bucket, crateid, root_path, validation_result, put_side_effect, error_check, status_code",
- [
- (
- "my-bucket", "crate123", None,
- {"status": "valid"},
- S3Error(code="S3 error",
- message=None,
- resource=None,
- request_id=None,
- host_id=None,
- response=None),
- "MinIO S3 Error", 500
- ),
- (
- "my-bucket", "crate123", None,
- {"status": "valid"},
- ValueError("Missing env vars"),
- "Configuration Error", 500
- ),
- (
- "my-bucket", "crate123", None,
- {"status": "valid"},
- RuntimeError("Unexpected failure"),
- "Unknown Error", 500
- ),
- ],
- ids=["s3error", "value_error", "unexpected_error"]
-)
-def test_update_validation_status_erro(
- bucket: str, crateid: str, root_path: str, validation_result: dict,
- put_side_effect, error_check: str, status_code: int
-):
- mock_minio_client = mock.Mock()
- mock_minio_client.put_object.side_effect = put_side_effect
-
- from app.utils.minio_utils import update_validation_status_in_minio, InvalidAPIUsage
- with pytest.raises(InvalidAPIUsage) as exc:
- update_validation_status_in_minio(mock_minio_client, bucket, crateid, root_path, json.dumps(validation_result))
-
- assert exc.value.status_code == status_code
- assert error_check in str(exc.value.message)
-
-
-# Testing function: fetch_ro_crate_from_minio
-
-@patch("app.utils.minio_utils.download_file_from_minio")
-@patch("app.utils.minio_utils.get_minio_object_list")
-@patch("app.utils.minio_utils.find_rocrate_object_on_minio")
-def test_fetch_rocrate_zip(
- mock_find_object,
- mock_get_list,
- mock_download,
- tmp_path,
-):
- # Setup mocks
- minio_client = "minio_client"
- rocrate_obj = DummyObject("some/path/rocrate123.zip", is_dir=False)
- mock_find_object.return_value = rocrate_obj
-
- from app.utils.minio_utils import fetch_ro_crate_from_minio
-
- with patch("app.utils.minio_utils.tempfile.mkdtemp", return_value=str(tmp_path)):
- # Execute
- result = fetch_ro_crate_from_minio(minio_client, "test_bucket", "rocrate123", "some/path")
-
- # Assert
- expected_path = tmp_path / "rocrate123.zip"
- assert result == str(expected_path)
- mock_download.assert_called_once_with(
- "minio_client", "test_bucket",
- "some/path/rocrate123.zip", str(expected_path))
-
-
-@patch("app.utils.minio_utils.download_file_from_minio")
-@patch("app.utils.minio_utils.get_minio_object_list")
-@patch("app.utils.minio_utils.find_rocrate_object_on_minio")
-def test_fetch_rocrate_directory(
- mock_find_object,
- mock_get_list,
- mock_download,
- tmp_path,
-):
- # Setup mocks
- minio_client = "minio_client"
- rocrate_obj = DummyObject("rocrates/rocrate124", is_dir=True)
- mock_find_object.return_value = rocrate_obj
-
- from app.utils.minio_utils import fetch_ro_crate_from_minio
-
- with patch("app.utils.minio_utils.tempfile.mkdtemp", return_value=str(tmp_path)):
- # Objects inside the RO-Crate
- mock_get_list.return_value = [
- DummyObject("rocrates/rocrate124/metadata.json"),
- DummyObject("rocrates/rocrate124/data/file1.txt"),
- ]
-
- # Execute
- result = fetch_ro_crate_from_minio(minio_client, "test_bucket", "rocrate124", "rocrates")
-
- # Assert
- expected_root = tmp_path / "rocrate124"
- assert result == str(expected_root)
- mock_download.assert_any_call(
- "minio_client", "test_bucket",
- "rocrates/rocrate124/metadata.json",
- str(expected_root / "metadata.json")
- )
- mock_download.assert_any_call(
- "minio_client", "test_bucket",
- "rocrates/rocrate124/data/file1.txt",
- str(expected_root / "data/file1.txt")
- )
-
-
-@patch("app.utils.minio_utils.download_file_from_minio")
-@patch("app.utils.minio_utils.get_minio_object_list")
-@patch("app.utils.minio_utils.find_rocrate_object_on_minio")
-def test_fetch_rocrate_handles_empty_dir(
- mock_find_object,
- mock_get_list,
- mock_download,
- tmp_path,
-):
- minio_client = "minio_client"
- rocrate_obj = DummyObject("rocrate456", is_dir=True)
- mock_find_object.return_value = rocrate_obj
- mock_get_list.return_value = []
-
- from app.utils.minio_utils import fetch_ro_crate_from_minio
-
- with patch("app.utils.minio_utils.tempfile.mkdtemp", return_value=str(tmp_path)):
- result = fetch_ro_crate_from_minio(minio_client, "test_bucket", "rocrate456", "")
-
- expected_root = tmp_path / "rocrate456"
- assert result == str(expected_root)
- mock_download.assert_not_called()
diff --git a/tests/test_services.py b/tests/test_services.py
deleted file mode 100644
index 0413e17..0000000
--- a/tests/test_services.py
+++ /dev/null
@@ -1,292 +0,0 @@
-import pytest
-from unittest.mock import patch, MagicMock
-from flask import Flask
-from flask.testing import FlaskClient
-
-from app.services.validation_service import (
- queue_ro_crate_validation_task,
- queue_ro_crate_metadata_validation_task,
- get_ro_crate_validation_task
-)
-
-from app.utils.minio_utils import InvalidAPIUsage
-
-
-@pytest.fixture
-def flask_app():
- app = Flask(__name__)
- with app.app_context():
- yield app
-
-
-# Test function: queue_ro_crate_validation_task
-
-@pytest.mark.parametrize(
- "crate_id, rocrate_exists, minio_client, delay_side_effects, payload, profiles_path, status_code, response_dict",
- [
- (
- "crate123", True, "minio_client", None,
- {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "root_path": "base_path",
- "webhook_url": "https://webhook.example.com",
- "profile_name": "default"
- },
- None,
- 202, {"message": "Validation in progress"}
- ),
- (
- "crate123", True, "minio_client", Exception("Celery down"),
- {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "root_path": "base_path",
- "webhook_url": "https://webhook.example.com",
- "profile_name": "default"
- },
- None,
- 500, {"error": "Celery down"}
- ),
- ],
- ids=["successful_queue", "celery_server_down"]
-)
-@patch("app.services.validation_service.process_validation_task_by_id.delay")
-@patch("app.services.validation_service.check_ro_crate_exists")
-@patch("app.services.validation_service.get_minio_client")
-def test_queue_ro_crate_validation_task(
- mock_client,
- mock_exists,
- mock_delay,
- flask_app: FlaskClient, crate_id: str, rocrate_exists: bool, minio_client: str,
- delay_side_effects: Exception, payload: dict, profiles_path: str, status_code: int, response_dict: dict
-):
- mock_delay.side_effect = delay_side_effects
- mock_exists.return_value = rocrate_exists
- mock_client.return_value = minio_client
-
- minio_config = payload["minio_config"] if "minio_config" in payload else None
- root_path = payload["root_path"] if "root_path" in payload else None
- profile_name = payload["profile_name"] if "profile_name" in payload else None
- webhook_url = payload["webhook_url"] if "webhook_url" in payload else None
-
- response, status_code = queue_ro_crate_validation_task(minio_config, crate_id, root_path,
- profile_name, webhook_url, profiles_path)
-
- mock_client.assert_called_once_with(minio_config)
- mock_exists.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, root_path)
- mock_delay.assert_called_once_with(minio_config, crate_id, root_path, profile_name, webhook_url, profiles_path)
- assert status_code == status_code
- assert response.json == response_dict
-
-
-@pytest.mark.parametrize(
- "crate_id, rocrate_exists, minio_client, payload, iau_message",
- [
- (
- "crate12z", False, "minio_client",
- {
- "minio_config": {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "root_path": "base_path",
- "webhook_url": "https://webhook.example.com",
- "profile_name": "default"
- }, "No RO-Crate with prefix: crate12z"
- ),
- ],
- ids=["no_rocrate_exists"]
-)
-@patch("app.services.validation_service.process_validation_task_by_id.delay")
-@patch("app.services.validation_service.check_ro_crate_exists")
-@patch("app.services.validation_service.get_minio_client")
-def test_queue_ro_crate_validation_task_failure(
- mock_client,
- mock_exists,
- mock_delay,
- flask_app: FlaskClient, crate_id: str, rocrate_exists: bool,
- minio_client: str, payload: dict, iau_message: str
-):
- mock_exists.return_value = rocrate_exists
- mock_client.return_value = minio_client
-
- minio_config = payload["minio_config"] if "minio_config" in payload else None
- root_path = payload["root_path"] if "root_path" in payload else None
- profile_name = payload["profile_name"] if "profile_name" in payload else None
- webhook_url = payload["webhook_url"] if "webhook_url" in payload else None
-
- with pytest.raises(InvalidAPIUsage) as exc_info:
- queue_ro_crate_validation_task(minio_config, crate_id, root_path, profile_name, webhook_url)
-
- assert iau_message in str(exc_info.value.message)
- mock_client.assert_called_once_with(minio_config)
- mock_exists.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, root_path)
- mock_delay.assert_not_called()
-
-
-# Test function: queue_ro_crate_metadata_validation_task
-
-@pytest.mark.parametrize(
- "crate_json, profile, webhook, status_code, return_value, response_json, delay_side_effect, profiles_path",
- [
- (
- '{"@context": "https://w3id.org/ro/crate/1.1/context"}',
- "default", "http://webhook",
- 202, None, {"message": "Validation in progress"},
- None, None
- ),
- (
- '{"@context": "https://w3id.org/ro/crate/1.1/context"}',
- "default", None,
- 200, {"status": "ok"}, {"result": {"status": "ok"}},
- None, None
- ),
- (
- '{"@context": "https://w3id.org/ro/crate/1.1/context"}',
- "default", "http://webhook",
- 500, None, {"error": "Celery error"},
- Exception("Celery error"), None
- ),
- ],
- ids=["success_with_webhook", "success_without_webhook", "failure_celery_error"]
-)
-def test_queue_metadata(flask_app, crate_json: dict, profile: str, webhook: str,
- status_code: int, return_value: dict, response_json: dict,
- delay_side_effect: Exception, profiles_path: str):
- with patch("app.services.validation_service.process_validation_task_by_metadata.delay",
- side_effect=delay_side_effect) as mock_delay:
- mock_result = MagicMock()
- if return_value is not None:
- mock_result.get.return_value = return_value
- if delay_side_effect is None:
- mock_delay.return_value = mock_result
-
- response, status = queue_ro_crate_metadata_validation_task(crate_json, profile, webhook, profiles_path)
-
- mock_delay.assert_called_once_with(crate_json, profile, webhook, profiles_path)
- assert status == status_code
- assert response.json == response_json
-
-
-@pytest.mark.parametrize(
- "crate_json, status_code, response_error",
- [
- (
- None,
- 422, "Missing required parameter: crate_json"
- ),
- (
- "{",
- 422, "not valid JSON"
- ),
- (
- "{}",
- 422, "Required parameter crate_json is empty"
- ),
- ],
- ids=["missing_crate_json","invalid_json","empty_json"]
-)
-def test_queue_metadata_json_errors(flask_app, crate_json: str, status_code: int, response_error: str):
- response, status = queue_ro_crate_metadata_validation_task(crate_json)
- assert status == status_code
- assert response_error in response.json["error"]
-
-
-# Test function: get_ro_crate_validation_task
-
-@pytest.mark.parametrize(
- "minio_config, crate_id, crate_exists, validation_exists, " +
- "validation_value, status_code, error_message, minio_client",
- [
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", True, True, {"status": "valid"}, 200, None,
- "minio_client"
- ),
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", False, False, None, 400, "No RO-Crate with prefix: crate123",
- "minio_client"
- ),
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", True, False, None, 400, "No validation result yet for RO-Crate: crate123",
- "minio_client"
- ),
- ],
- ids=["validation_exists", "rocrate_missing", "validation_missing"]
-)
-@patch("app.services.validation_service.check_ro_crate_exists")
-@patch("app.services.validation_service.check_validation_exists")
-@patch("app.services.validation_service.return_ro_crate_validation")
-@patch("app.services.validation_service.get_minio_client")
-def test_get_validation(
- mock_client,
- mock_return,
- mock_validation,
- mock_rocrate,
- flask_app, minio_config: dict, crate_id: str, crate_exists: bool,
- validation_exists: bool, validation_value: dict,
- status_code: int, error_message: str, minio_client: str
-):
- mock_client.return_value = minio_client
- mock_rocrate.return_value = crate_exists
- mock_validation.return_value = validation_exists
- mock_return.return_value = validation_value
-
- if crate_exists and validation_exists:
- response, status = get_ro_crate_validation_task(minio_config, crate_id, "base_path")
-
- mock_client.assert_called_once_with(minio_config)
- mock_return.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, "base_path")
- mock_rocrate.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, "base_path")
- mock_validation.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, "base_path")
-
- assert status == status_code
- assert response == validation_value
-
- else:
- with pytest.raises(InvalidAPIUsage) as exc_info:
- get_ro_crate_validation_task(minio_config, crate_id, "base_path")
-
- assert exc_info.value.status_code == status_code
- assert error_message in str(exc_info.value.message)
-
- mock_rocrate.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, "base_path")
- if crate_exists:
- mock_validation.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, "base_path")
- else:
- mock_validation.assert_not_called()
- mock_return.assert_not_called()
diff --git a/tests/test_validation_tasks.py b/tests/test_validation_tasks.py
deleted file mode 100644
index 5d36e1d..0000000
--- a/tests/test_validation_tasks.py
+++ /dev/null
@@ -1,525 +0,0 @@
-from unittest import mock
-import pytest
-import json
-
-from app.tasks.validation_tasks import (
- process_validation_task_by_id,
- perform_ro_crate_validation,
- perform_metadata_validation,
- return_ro_crate_validation,
- process_validation_task_by_metadata,
- check_ro_crate_exists,
- check_validation_exists
-)
-
-from app.utils.minio_utils import InvalidAPIUsage
-
-
-# Test function: process_validation_task_by_id
-
-@pytest.mark.parametrize(
- "minio_config, crate_id, os_path_exists, os_path_isfile, os_path_isdir, " +
- "return_value, webhook, profile, profiles_path, val_success, val_result, minio_client",
- [
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", True, True, False, "/tmp/crate.zip",
- "https://example.com/hook", "profileA", None, True, '{"status": "valid"}',
- "minio_client"
- ),
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", True, False, True, "/tmp/crate123",
- "https://example.com/hook", "profileA", None, True, '{"status": "valid"}',
- "minio_client"
- ),
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", True, False, True, "/tmp/crate123",
- None, "profileA", None, True, '{"status": "valid"}',
- "minio_client"
- ),
- ],
- ids=["successful_validation_zip", "successful_validation_dir", "successful_validation_nowebhook"]
-)
-@mock.patch("app.tasks.validation_tasks.get_minio_client")
-@mock.patch("app.tasks.validation_tasks.shutil.rmtree")
-@mock.patch("app.tasks.validation_tasks.os.remove")
-@mock.patch("app.tasks.validation_tasks.os.path.exists")
-@mock.patch("app.tasks.validation_tasks.os.path.isfile")
-@mock.patch("app.tasks.validation_tasks.os.path.isdir")
-@mock.patch("app.tasks.validation_tasks.send_webhook_notification")
-@mock.patch("app.tasks.validation_tasks.update_validation_status_in_minio")
-@mock.patch("app.tasks.validation_tasks.perform_ro_crate_validation")
-@mock.patch("app.tasks.validation_tasks.fetch_ro_crate_from_minio")
-def test_process_validation(
- mock_fetch,
- mock_validate,
- mock_update,
- mock_webhook,
- mock_isdir,
- mock_isfile,
- mock_exists,
- mock_remove,
- mock_rmtree,
- mock_client,
- minio_config: dict, crate_id: str, os_path_exists: bool, os_path_isfile: bool, os_path_isdir: bool,
- return_value: str, webhook: str, profile: str, profiles_path: str, val_success: bool, val_result: str, minio_client: str
-):
- mock_exists.return_value = os_path_exists
- mock_isfile.return_value = os_path_isfile
- mock_isdir.return_value = os_path_isdir
- mock_fetch.return_value = return_value
- mock_client.return_value = minio_client
-
- mock_validation_result = mock.Mock()
- mock_validation_result.has_issues.return_value = val_success
- mock_validation_result.to_json.return_value = val_result
- mock_validate.return_value = mock_validation_result
-
- process_validation_task_by_id(minio_config, crate_id, "", profile, webhook, profiles_path)
-
- mock_client.assert_called_once_with(minio_config)
- mock_fetch.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, "")
- mock_validate.assert_called_once_with(return_value, profile, profiles_path=profiles_path)
- mock_update.assert_called_once_with(minio_client, minio_config["bucket"], crate_id, "", val_result)
- if webhook is not None:
- mock_webhook.assert_called_once_with(webhook, val_result)
- else:
- mock_webhook.assert_not_called()
- if os_path_exists and os_path_isfile:
- mock_remove.assert_called_once_with(return_value)
- mock_rmtree.assert_not_called()
- elif os_path_exists and os_path_isdir:
- mock_rmtree.assert_called_once_with(return_value)
- mock_remove.assert_not_called()
-
-
-@pytest.mark.parametrize(
- "minio_config, crate_id, os_path_exists, os_path_isfile, os_path_isdir, return_fetch, "
- + "webhook, profile, profiles_path, return_validate, validate_side_effect, fetch_side_effect, minio_client",
- [
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", True, True, False, "/tmp/crate.zip",
- "https://example.com/hook", "profileA", None, "Validation failed", None, None,
- "minio_client"
- ),
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", True, True, False, "/tmp/crate.zip",
- "https://example.com/hook", "profileA", None, None, Exception("Unexpected error"), None,
- "minio_client"
- ),
- (
- {
- "endpoint": "localhost:9000",
- "accesskey": "admin",
- "secret": "password123",
- "ssl": False,
- "bucket": "test_bucket"
- },
- "crate123", False, False, False, None,
- "https://example.com/hook", "profileA", None, None, None, Exception("MinIO fetch failed"),
- "minio_client"
- ),
- ],
- ids=["validation_fails_with_message", "validation_fails_with_validation_exception",
- "validation_fails_with_fetch_exception"]
-)
-@mock.patch("app.tasks.validation_tasks.get_minio_client")
-@mock.patch("app.tasks.validation_tasks.shutil.rmtree")
-@mock.patch("app.tasks.validation_tasks.os.remove")
-@mock.patch("app.tasks.validation_tasks.os.path.exists")
-@mock.patch("app.tasks.validation_tasks.os.path.isfile")
-@mock.patch("app.tasks.validation_tasks.os.path.isdir")
-@mock.patch("app.tasks.validation_tasks.send_webhook_notification")
-@mock.patch("app.tasks.validation_tasks.update_validation_status_in_minio")
-@mock.patch("app.tasks.validation_tasks.perform_ro_crate_validation")
-@mock.patch("app.tasks.validation_tasks.fetch_ro_crate_from_minio")
-def test_process_validation_failure(
- mock_fetch,
- mock_validate,
- mock_update,
- mock_webhook,
- mock_isdir,
- mock_isfile,
- mock_exists,
- mock_remove,
- mock_rmtree,
- mock_client,
- minio_config: dict, crate_id: str, os_path_exists: bool, os_path_isfile: bool, os_path_isdir: bool,
- return_fetch: str, webhook: str, profile: str, profiles_path: str, return_validate: str,
- validate_side_effect: Exception, fetch_side_effect: Exception, minio_client: str
-):
- mock_exists.return_value = os_path_exists
- mock_isfile.return_value = os_path_isfile
- mock_isdir.return_value = os_path_isdir
- mock_client.return_value = minio_client
-
- if fetch_side_effect is None:
- mock_fetch.return_value = return_fetch
- else:
- mock_fetch.side_effect = fetch_side_effect
-
- if validate_side_effect is None:
- mock_validate.return_value = return_validate
- else:
- mock_validate.side_effect = validate_side_effect
-
- process_validation_task_by_id(minio_config, crate_id, "", profile, webhook, profiles_path)
-
- if fetch_side_effect is None:
- mock_validate.assert_called_once_with(return_fetch, profile, profiles_path=profiles_path)
- else:
- mock_validate.assert_not_called()
-
- mock_update.assert_not_called()
- mock_webhook.assert_called_once()
- args, kwargs = mock_webhook.call_args
- assert args[0] == webhook
- if fetch_side_effect is not None:
- assert fetch_side_effect.args[0] in args[1]["error"]
- elif validate_side_effect is not None:
- assert validate_side_effect.args[0] in args[1]["error"]
- else:
- assert return_validate in args[1]["error"]
-
- if not os_path_exists:
- mock_remove.assert_not_called()
- mock_rmtree.assert_not_called()
- elif os_path_exists and os_path_isfile:
- mock_remove.assert_called_once_with(return_fetch)
- mock_rmtree.assert_not_called()
- elif os_path_exists and os_path_isdir:
- mock_rmtree.assert_called_once_with(return_fetch)
- mock_remove.assert_not_called()
-
-
-# Test function: process_validation_task_by_metadata
-
-@pytest.mark.parametrize(
- "crate_json, profile_name, webhook_url, profiles_path, validation_json, validation_value",
- [
- (
- '{"@context": "https://w3id.org/ro/crate/1.1/context", "@graph": []}',
- "test-profile", "https://example.com/webhook",
- "/app/profiles",
- '{"status": "valid"}', False
- ),
- (
- '{"@context": "https://w3id.org/ro/crate/1.1/context", "@graph": []}',
- "test-profile", "https://example.com/webhook",
- None,
- '{"status": "invalid"}', True
- )
- ],
- ids=["success_no_issues", "success_with_issues"]
-)
-@mock.patch("app.tasks.validation_tasks.send_webhook_notification")
-@mock.patch("app.tasks.validation_tasks.perform_metadata_validation")
-def test_metadata_validation(
- mock_validate, mock_webhook,
- crate_json: str, profile_name: str, webhook_url: str, profiles_path: str | None,
- validation_json: str, validation_value: bool,
-):
- mock_result = mock.Mock()
- mock_result.has_issues.return_value = validation_value
- mock_result.to_json.return_value = validation_json
- mock_validate.return_value = mock_result
-
- result = process_validation_task_by_metadata(
- crate_json, profile_name, webhook_url, profiles_path
- )
-
- assert result == validation_json
- mock_validate.assert_called_once_with(
- crate_json, profile_name, profiles_path=profiles_path
- )
- mock_webhook.assert_called_once_with(webhook_url, validation_json)
-
-
-@pytest.mark.parametrize(
- "crate_json, profile_name, webhook_url, profiles_path, validation_message",
- [
- (
- '{"@context": "https://w3id.org/ro/crate/1.1/context", "@graph": []}',
- "test-profile", "https://example.com/webhook",
- "/app/profiles",
- "Validation error"
- ),
- (
- '{"@context": "https://w3id.org/ro/crate/1.1/context", "@graph": []}',
- "test-profile", None,
- None,
- "Validation error"
- )
- ],
- ids=["validation_fails", "validation_fails_no_webhook"]
-)
-@mock.patch("app.tasks.validation_tasks.send_webhook_notification")
-@mock.patch("app.tasks.validation_tasks.perform_metadata_validation")
-def test_validation_fails_and_sends_error_notification_to_webhook(
- mock_validate, mock_webhook,
- crate_json: str, profile_name: str, webhook_url: str, profiles_path: str | None,
- validation_message: str
-):
-
- mock_validate.return_value = validation_message
-
- result = process_validation_task_by_metadata(
- crate_json, profile_name, webhook_url, profiles_path
- )
-
- assert isinstance(result, str)
- assert validation_message in result
- mock_validate.assert_called_once_with(
- crate_json, profile_name, profiles_path=profiles_path
- )
-
- if webhook_url is not None:
- # Error webhook should be sent
- mock_webhook.assert_called_once()
- args, kwargs = mock_webhook.call_args
- assert kwargs is None or "error" in args[1]
- else:
- # Make sure webhook not sent
- mock_webhook.assert_not_called()
-
-
-# Test function: perform_ro_crate_validation
-
-@pytest.mark.parametrize(
- "file_path, profile_name, skip_checks",
- [
- ("crates/test_crate", "ro_profile", ["check1", "check2"]),
- ("crates/test_crate", None, None)
- ],
- ids=["success_with_all_args", "success_with_only_crate"]
-)
-@mock.patch("app.tasks.validation_tasks.services.validate")
-@mock.patch("app.tasks.validation_tasks.services.ValidationSettings")
-def test_validation_success_with_all_args(
- mock_validation_settings, mock_validate,
- file_path: str, profile_name: str, skip_checks: list
-):
- mock_result = mock.Mock()
- mock_validate.return_value = mock_result
-
- result = perform_ro_crate_validation(file_path, profile_name, skip_checks)
-
- # Assert that result was returned
- assert result == mock_result
-
- # Validate proper construction of ValidationSettings
- mock_validation_settings.assert_called_once()
- args, kwargs = mock_validation_settings.call_args
- assert kwargs["rocrate_uri"].endswith(file_path)
- if profile_name is not None:
- assert kwargs["profile_identifier"] == profile_name
- else:
- assert "profile_identifier" not in kwargs
- if skip_checks is not None:
- assert kwargs["skip_checks"] == skip_checks
- else:
- assert "skip_checks" not in kwargs
-
- mock_validate.assert_called_once_with(mock_validation_settings.return_value)
-
-
-@mock.patch("app.tasks.validation_tasks.services.validate", side_effect=RuntimeError("Validation error"))
-@mock.patch("app.tasks.validation_tasks.services.ValidationSettings")
-def test_validation_raises_exception_and_returns_string(mock_validation_settings, mock_validate):
- file_path = "crates/test_crate"
- result = perform_ro_crate_validation(file_path, "profile", skip_checks_list=None)
-
- assert isinstance(result, str)
- assert "Validation error" in result
- mock_validate.assert_called_once()
-
-
-@mock.patch("app.tasks.validation_tasks.services.validate")
-@mock.patch("app.tasks.validation_tasks.services.ValidationSettings", side_effect=ValueError("Bad config"))
-def test_validation_settings_error(mock_validation_settings, mock_validate):
- file_path = "crates/test_crate"
- result = perform_ro_crate_validation(file_path, None)
-
- assert isinstance(result, str)
- assert "Bad config" in result
- mock_validate.assert_not_called()
-
-
-# Test function: perform_metadata_validation
-
-@pytest.mark.parametrize(
- "crate_json, profile_name, skip_checks",
- [
- ('{"id":"dummy json"}', "ro_profile", ["check1", "check2"]),
- ('{"id":"dummy json"}', None, None)
- ],
- ids=["success_with_all_args", "success_with_only_crate"]
-)
-@mock.patch("app.tasks.validation_tasks.services.validate")
-@mock.patch("app.tasks.validation_tasks.services.ValidationSettings")
-def test_metadata_validation_success_with_all_args(
- mock_validation_settings, mock_validate,
- crate_json: str, profile_name: str, skip_checks: list
-):
- mock_result = mock.Mock()
- mock_validate.return_value = mock_result
-
- result = perform_metadata_validation(crate_json, profile_name, skip_checks)
-
- # Assert that result was returned
- assert result == mock_result
-
- # Validate proper construction of ValidationSettings
- mock_validation_settings.assert_called_once()
- args, kwargs = mock_validation_settings.call_args
- assert kwargs["metadata_dict"] == json.loads(crate_json)
- if profile_name is not None:
- assert kwargs["profile_identifier"] == profile_name
- else:
- assert "profile_identifier" not in kwargs
- if skip_checks is not None:
- assert kwargs["skip_checks"] == skip_checks
- else:
- assert "skip_checks" not in kwargs
-
- mock_validate.assert_called_once_with(mock_validation_settings.return_value)
-
-
-@mock.patch("app.tasks.validation_tasks.services.validate", side_effect=RuntimeError("Validation error"))
-@mock.patch("app.tasks.validation_tasks.services.ValidationSettings")
-def test_metadata_validation_raises_exception_and_returns_string(mock_validation_settings, mock_validate):
- crate_json = '{"id":"test metadata"}'
- result = perform_metadata_validation(crate_json, "profile", skip_checks_list=None)
-
- assert isinstance(result, str)
- assert "Validation error" in result
- mock_validate.assert_called_once()
-
-
-@mock.patch("app.tasks.validation_tasks.services.validate")
-@mock.patch("app.tasks.validation_tasks.services.ValidationSettings", side_effect=ValueError("Bad config"))
-def test_metadata_validation_settings_error(mock_validation_settings, mock_validate):
- crate_json = '{"id":"test metadata"}'
- result = perform_metadata_validation(crate_json, None)
-
- assert isinstance(result, str)
- assert "Bad config" in result
- mock_validate.assert_not_called()
-
-
-# Test function: return_ro_crate_validation
-
-@mock.patch("app.tasks.validation_tasks.get_validation_status_from_minio")
-def test_return_validation_returns_dict(mock_get_status):
- # Simulate dict result
- mock_get_status.return_value = {"status": "passed", "errors": []}
-
- result = return_ro_crate_validation("minio_client", "test_bucket", "crate123", None)
- assert isinstance(result, dict)
- assert result["status"] == "passed"
- mock_get_status.assert_called_once_with("minio_client", "test_bucket", "crate123", None)
-
-
-@mock.patch("app.tasks.validation_tasks.get_validation_status_from_minio")
-def test_return_validation_returns_string(mock_get_status):
- # Simulate string result
- mock_get_status.return_value = "Validation result: OK"
-
- result = return_ro_crate_validation("minio_client", "test_bucket", "crate456", None)
- assert isinstance(result, str)
- assert "OK" in result
- mock_get_status.assert_called_once_with("minio_client", "test_bucket", "crate456", None)
-
-
-@mock.patch("app.tasks.validation_tasks.get_validation_status_from_minio")
-def test_return_validation_raises_error(mock_get_status):
- # Simulate exception
- mock_get_status.side_effect = InvalidAPIUsage("MinIO S3 Error: empty", 500)
-
- with pytest.raises(InvalidAPIUsage) as exc_info:
- return_ro_crate_validation("minio_client", "test_bucket", "crate789", None)
-
- assert "MinIO S3 Error" in str(exc_info.value.message)
- mock_get_status.assert_called_once_with("minio_client", "test_bucket", "crate789", None)
-
-
-# Test function: check_ro_crate_exists
-
-@pytest.mark.parametrize(
- "minio_client, bucket, crate_id, base_path, ro_object_return, rocrate_exists",
- [
- ("minio_client", "test_bucket", "crate123", "base_path", "crate123", True),
- ("minio_client", "test_bucket", "crate12z", "base_path", False, False)
- ],
- ids=["rocrate_exists", "rocrate_does_not_exist"]
-)
-@mock.patch("app.tasks.validation_tasks.find_rocrate_object_on_minio")
-def test_ro_crate_exists(
- mock_find_rocrate,
- minio_client: str, bucket: str, crate_id: str, base_path: str,
- ro_object_return: str, rocrate_exists: bool
-):
- mock_find_rocrate.return_value = ro_object_return
-
- result = check_ro_crate_exists(minio_client, bucket, crate_id, base_path)
-
- mock_find_rocrate.assert_called_once_with(crate_id, minio_client, bucket, base_path)
- assert result is rocrate_exists
-
-
-# Test function: check_validation_exists
-
-@pytest.mark.parametrize(
- "minio_client, bucket, crate_id, base_path, val_object_return, validate_exists",
- [
- ("minio_client", "test_bucket", "crate123", "base_path", "crate123", True),
- ("minio_client", "test_bucket", "crate12z", "base_path", False, False)
- ],
- ids=["validation_exists", "validation_does_not_exist"]
-)
-@mock.patch("app.tasks.validation_tasks.find_validation_object_on_minio")
-def test_validation_exists(
- mock_find_validation,
- minio_client: str, bucket: str, crate_id: str, base_path: str,
- val_object_return: str, validate_exists: bool
-):
- mock_find_validation.return_value = val_object_return
-
- result = check_validation_exists(minio_client, bucket, crate_id, base_path)
-
- mock_find_validation.assert_called_once_with(crate_id, minio_client, bucket, base_path)
- assert result is validate_exists
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
new file mode 100644
index 0000000..74275a2
--- /dev/null
+++ b/tests/utils/test_config.py
@@ -0,0 +1,112 @@
+"""Tests for the validated Settings configuration object."""
+
+import pytest
+
+from app.utils.config import ConfigError, Settings
+
+
+def test_defaults_when_storage_disabled():
+ """With storage off, S3/Celery vars are not required and sensible defaults apply."""
+ settings = Settings.from_env({})  # empty mapping: no variables set at all
+
+ assert settings.storage_enabled is False
+ assert settings.flask_env == "development"
+ assert settings.debug is True
+ assert settings.profiles_path is None  # the optional paths stay unset
+ assert settings.extra_profiles_path is None
+ assert settings.cache_path is None
+ assert settings.validation_offline is False
+
+
+def test_validation_tuning_vars_are_read():  # env vars map onto Settings fields
+ settings = Settings.from_env(
+ {
+ "EXTRA_PROFILES_PATH": "/app/extra-profiles",
+ "CACHE_PATH": "/app/.rocrate-cache",
+ "VALIDATION_OFFLINE": "true",
+ }
+ )
+ assert settings.extra_profiles_path == "/app/extra-profiles"
+ assert settings.cache_path == "/app/.rocrate-cache"
+ assert settings.validation_offline is True  # the string "true" becomes a bool
+
+
+def test_storage_enabled_requires_s3_and_broker_config():
+ """Enabling storage without the needed vars fails early, naming every missing var."""
+ with pytest.raises(ConfigError) as exc_info:
+ Settings.from_env({"STORAGE_ENABLED": "true"})
+
+ message = str(exc_info.value)  # one error is expected to list all six names
+ for var in (
+ "S3_ENDPOINT",
+ "S3_ACCESS_KEY",
+ "S3_SECRET_KEY",
+ "S3_BUCKET",
+ "CELERY_BROKER_URL",
+ "CELERY_RESULT_BACKEND",
+ ):
+ assert var in message
+
+
+def test_blank_required_value_is_treated_as_missing():
+ """A whitespace-only required var counts as missing, not as a valid value."""
+ env = _storage_env()  # complete config, so only S3_BUCKET is at fault
+ env["S3_BUCKET"] = " "
+
+ with pytest.raises(ConfigError) as exc_info:
+ Settings.from_env(env)
+
+ assert "S3_BUCKET" in str(exc_info.value)  # the blank var is named in the error
+
+
+def test_valid_storage_config_populates_fields():
+ """A complete storage config loads cleanly and parses booleans."""
+ env = _storage_env()
+ env["S3_USE_SSL"] = "true"  # not in _storage_env(); exercises bool parsing
+
+ settings = Settings.from_env(env)
+
+ assert settings.storage_enabled is True
+ assert settings.s3_endpoint == "minio:9000"
+ assert settings.s3_bucket == "ro-crates"
+ assert settings.s3_use_ssl is True
+ assert settings.celery_broker_url == "redis://redis:6379/0"
+
+
+def test_flask_env_production_disables_debug():  # debug follows FLASK_ENV
+ settings = Settings.from_env({"FLASK_ENV": "production"})
+ assert settings.flask_env == "production"
+ assert settings.debug is False  # the development default has debug True
+
+
+@pytest.mark.parametrize(
+ "raw, expected",
+ [
+ ("true", True),
+ ("1", True),
+ ("yes", True),
+ ("on", True),
+ ("false", False),
+ ("no", False),
+ ("", False),  # set-but-empty is falsy
+ ("anything", False),  # unrecognised text is falsy; no error is expected
+ ],
+)
+def test_storage_enabled_boolean_parsing(raw, expected):
+ # Truthy values require a complete storage config; falsy values need nothing.
+ env = _storage_env() if expected else {}
+ env["STORAGE_ENABLED"] = raw
+ assert Settings.from_env(env).storage_enabled is expected
+
+
+def _storage_env() -> dict:
+ """Returns a new, complete, valid storage-enabled environment for tests to mutate."""
+ return {
+ "STORAGE_ENABLED": "true",
+ "S3_ENDPOINT": "minio:9000",
+ "S3_ACCESS_KEY": "minioadmin",
+ "S3_SECRET_KEY": "minioadmin",
+ "S3_BUCKET": "ro-crates",
+ "CELERY_BROKER_URL": "redis://redis:6379/0",
+ "CELERY_RESULT_BACKEND": "redis://redis:6379/1",
+ }
diff --git a/tests/utils/test_webhook_utils.py b/tests/utils/test_webhook_utils.py
new file mode 100644
index 0000000..3e8ecfa
--- /dev/null
+++ b/tests/utils/test_webhook_utils.py
@@ -0,0 +1,56 @@
+"""Tests for webhook delivery with retry/backoff."""
+
+from unittest import mock
+
+import pytest
+import requests
+
+from app.utils import webhook_utils
+from app.utils.webhook_utils import WebhookDeliveryError, send_webhook_notification
+
+
+def _ok_response():  # stand-in for a successful HTTP response
+ response = mock.Mock()
+ response.raise_for_status.return_value = None  # raise_for_status() does not raise
+ return response
+
+
+def test_successful_delivery_posts_once():
+ with mock.patch.object(webhook_utils.requests, "post", return_value=_ok_response()) as post:
+ send_webhook_notification("https://hook", {"status": "valid"}, sleep=lambda _: None)
+ post.assert_called_once()  # no retry when the first attempt succeeds
+ # The payload is sent as JSON and a timeout is set (no unbounded hang).
+ assert post.call_args.kwargs["json"] == {"status": "valid"}
+ assert "timeout" in post.call_args.kwargs  # only presence is checked, not the value
+
+
+def test_retries_then_succeeds():
+ flaky = [requests.ConnectionError("boom"), requests.ConnectionError("boom"), _ok_response()]
+ sleeps = []  # records each delay passed to the injected sleep callable
+ with mock.patch.object(webhook_utils.requests, "post", side_effect=flaky) as post:
+ send_webhook_notification("https://hook", {"x": 1}, max_attempts=3, sleep=sleeps.append)
+ assert post.call_count == 3  # two failures, then the success
+ assert len(sleeps) == 2 # slept between the three attempts
+
+
+def test_terminal_failure_raises_after_exhausting_attempts():
+ with mock.patch.object(
+ webhook_utils.requests, "post", side_effect=requests.ConnectionError("down")
+ ) as post:
+ with pytest.raises(WebhookDeliveryError) as exc_info:
+ send_webhook_notification(
+ "https://hook", {"x": 1}, max_attempts=3, sleep=lambda _: None
+ )
+ assert post.call_count == 3  # every one of the max_attempts was used
+ assert "https://hook" in str(exc_info.value)  # message names the URL
+
+
+def test_http_error_status_is_retried():
+ bad = mock.Mock()  # response whose raise_for_status() raises
+ bad.raise_for_status.side_effect = requests.HTTPError("500")
+ with mock.patch.object(webhook_utils.requests, "post", return_value=bad) as post:
+ with pytest.raises(WebhookDeliveryError):
+ send_webhook_notification(
+ "https://hook", {"x": 1}, max_attempts=2, sleep=lambda _: None
+ )
+ assert post.call_count == 2  # posted again after the first HTTP error
diff --git a/tests/validation/test_results.py b/tests/validation/test_results.py
new file mode 100644
index 0000000..80d1503
--- /dev/null
+++ b/tests/validation/test_results.py
@@ -0,0 +1,66 @@
+"""Tests for the ValidationOutcome result type."""
+
+import json
+
+from app.validation.results import ValidationOutcome, ValidationStatus
+
+
+class FakeResult:
+ """Stand-in for a rocrate_validator ValidationResult."""
+
+ def __init__(self, has_issues: bool, report: dict):
+ self._has_issues = has_issues
+ self._report = report  # returned, JSON-encoded, by to_json()
+
+ def has_issues(self) -> bool:  # True maps to INVALID, False to VALID in these tests
+ return self._has_issues
+
+ def to_json(self) -> str:  # serialises the canned report
+ return json.dumps(self._report)
+
+
+def test_from_validator_result_without_issues_is_valid():
+ outcome = ValidationOutcome.from_validator_result(
+ FakeResult(False, {"report": "ok"}), profile="ro-crate"
+ )
+ assert outcome.status is ValidationStatus.VALID
+ assert outcome.is_valid is True
+ assert outcome.profile == "ro-crate"
+ assert outcome.detail == {"report": "ok"}  # a dict, not a JSON string
+ assert outcome.error is None
+
+
+def test_from_validator_result_with_issues_is_invalid():
+ outcome = ValidationOutcome.from_validator_result(FakeResult(True, {"issues": [1]}))
+ assert outcome.status is ValidationStatus.INVALID
+ assert outcome.is_valid is False
+ assert outcome.detail == {"issues": [1]}  # detail is still populated when invalid
+
+
+def test_from_error_records_message_and_has_no_detail():
+ outcome = ValidationOutcome.from_error("boom", profile="ro-crate")
+ assert outcome.status is ValidationStatus.ERROR
+ assert outcome.is_valid is False  # an error outcome is not a valid one
+ assert outcome.error == "boom"
+ assert outcome.detail is None  # no validator report exists for an error
+
+
+def test_to_dict_serialises_status_as_string_and_omits_absent_fields():
+ outcome = ValidationOutcome.from_validator_result(FakeResult(False, {"r": 1}))
+ data = outcome.to_dict()
+ assert data["status"] == "valid"  # plain string, not the enum member
+ assert data["detail"] == {"r": 1}
+ assert "error" not in data  # key is dropped rather than set to None
+
+
+def test_to_json_round_trips():
+ outcome = ValidationOutcome.from_error("nope")
+ parsed = json.loads(outcome.to_json())  # to_json() must emit parseable JSON
+ assert parsed["status"] == "error"
+ assert parsed["error"] == "nope"
+
+
+def test_created_at_is_propagated_when_provided():
+ outcome = ValidationOutcome.from_error("x", created_at="2026-06-16T00:00:00Z")
+ assert outcome.created_at == "2026-06-16T00:00:00Z"  # kept as the given string
+ assert outcome.to_dict()["created_at"] == "2026-06-16T00:00:00Z"
diff --git a/tests/validation/test_runner.py b/tests/validation/test_runner.py
new file mode 100644
index 0000000..35079fb
--- /dev/null
+++ b/tests/validation/test_runner.py
@@ -0,0 +1,109 @@
+"""Tests for the validation runner that wraps rocrate_validator."""
+
+import json
+
+from app.validation import runner
+from app.validation.results import ValidationStatus
+
+
+class FakeResult:  # minimal validator result: only has_issues() and to_json()
+ def __init__(self, has_issues: bool):
+ self._has_issues = has_issues
+
+ def has_issues(self) -> bool:
+ return self._has_issues
+
+ def to_json(self) -> str:  # report body simply echoes the has_issues flag
+ return json.dumps({"issues": self._has_issues})
+
+
+class FakeServices:
+ """A stand-in for rocrate_validator.services."""
+
+ def __init__(self, result=None, raises=None):
+ self._result = result  # value validate() returns when not raising
+ self._raises = raises  # exception instance validate() raises, if any
+ self.last_settings = None  # kwargs of the latest ValidationSettings() call
+
+ def ValidationSettings(self, **kwargs): # noqa: N802 - mirrors the real API
+ self.last_settings = kwargs
+ return kwargs  # the "settings" object is just the kwargs dict
+
+ def validate(self, settings):
+ if self._raises is not None:  # simulate the validator failing
+ raise self._raises
+ return self._result
+
+
+def test_validate_metadata_success_is_valid(monkeypatch):
+ fake = FakeServices(result=FakeResult(has_issues=False))
+ monkeypatch.setattr(runner, "services", fake)  # replace runner.services with the fake
+
+ outcome = runner.validate_metadata({"@graph": []}, profile_name="ro-crate")
+
+ assert outcome.status is ValidationStatus.VALID
+ assert outcome.profile == "ro-crate"
+ assert fake.last_settings["metadata_only"] is True  # set by the runner itself
+ assert fake.last_settings["metadata_dict"] == {"@graph": []}
+
+
+def test_validate_metadata_with_issues_is_invalid(monkeypatch):
+ monkeypatch.setattr(runner, "services", FakeServices(result=FakeResult(True)))
+ outcome = runner.validate_metadata({"@graph": []})  # no profile_name given
+ assert outcome.status is ValidationStatus.INVALID
+
+
+def test_validate_metadata_exception_becomes_error_outcome(monkeypatch):
+ monkeypatch.setattr(runner, "services", FakeServices(raises=RuntimeError("kaboom")))
+ outcome = runner.validate_metadata({"@graph": []}, profile_name="ro-crate")
+ assert outcome.status is ValidationStatus.ERROR
+ assert "kaboom" in outcome.error  # exception text is carried into the outcome
+ assert outcome.profile == "ro-crate"  # profile is kept even on error
+
+
+def test_validate_crate_path_success(monkeypatch):
+ fake = FakeServices(result=FakeResult(has_issues=False))
+ monkeypatch.setattr(runner, "services", fake)  # replace runner.services with the fake
+
+ outcome = runner.validate_crate_path("/tmp/crate", profile_name="ro-crate")
+
+ assert outcome.status is ValidationStatus.VALID
+ assert fake.last_settings["rocrate_uri"] == "/tmp/crate"  # path passed as-is
+
+
+def test_validate_crate_path_exception_becomes_error_outcome(monkeypatch):
+ monkeypatch.setattr(runner, "services", FakeServices(raises=ValueError("bad crate")))
+ outcome = runner.validate_crate_path("/tmp/crate")  # must return, not raise
+ assert outcome.status is ValidationStatus.ERROR
+ assert "bad crate" in outcome.error
+
+
+def test_offline_cache_and_extra_profiles_are_passed_through(monkeypatch):
+ fake = FakeServices(result=FakeResult(has_issues=False))
+ monkeypatch.setattr(runner, "services", fake)
+
+ runner.validate_crate_path(
+ "/tmp/crate",
+ profile_name="five-safes-crate",
+ extra_profiles_path="/app/extra-profiles",
+ cache_path="/app/.rocrate-cache",
+ offline=True,
+ )
+
+ s = fake.last_settings  # kwargs the runner passed to ValidationSettings
+ assert s["extra_profiles_path"] == "/app/extra-profiles"
+ assert s["cache_path"] == "/app/.rocrate-cache"
+ assert s["offline"] is True
+
+
+def test_offline_and_cache_omitted_when_not_set(monkeypatch):
+ fake = FakeServices(result=FakeResult(has_issues=False))
+ monkeypatch.setattr(runner, "services", fake)
+
+ runner.validate_metadata({"@graph": []})  # none of the tuning options supplied
+
+ s = fake.last_settings
+ assert "extra_profiles_path" not in s
+ assert "cache_path" not in s
+ # offline defaults to False and is only forwarded when enabled
+ assert s.get("offline", False) is False  # accepts an absent key or False
diff --git a/cratey.py b/wsgi.py
similarity index 67%
rename from cratey.py
rename to wsgi.py
index 338a4a5..c0f7d15 100644
--- a/cratey.py
+++ b/wsgi.py
@@ -1,14 +1,10 @@
"""Entry point for the Flask application."""
-# Author: Alexander Hambley
-# License: MIT
-# Copyright (c) 2025 eScience Lab, The University of Manchester
-
from app import create_app
from app.services.logging_service import setup_logging
app = create_app()
-setup_logging()
+setup_logging(app.config["SETTINGS"])
if __name__ == "__main__":
# Run the Flask development server: