From 4fdf5f1858feafe7187cb527b7e259fa8ae5bb8c Mon Sep 17 00:00:00 2001 From: "roller100 (BearingNode)" Date: Wed, 18 Mar 2026 13:04:59 +0000 Subject: [PATCH 1/3] feat(dbt): add dbt test assertions coverage to csv_to_postgres scenario Add `dbt-ol test` to test/run.sh so test-phase events are captured alongside seed and run events. Add three expected event files for the dataQualityAssertions facet: - events/customers/data_quality_event.json (not_null + unique on customer_id) - events/orders/data_quality_event.json (not_null on customer_id, not_null + unique on order_id) - events/analytics/data_quality_event.json (not_null on customer_id + total_revenue, unique on customer_id) Register all three as test cases in config.json with the dataQualityAssertions facet tag. Expected event content derived from actual dbt-ol 1.23.0 output against a live PostgreSQL 15 instance. Signed-off-by: roller100 (BearingNode) --- .../dbt/scenarios/csv_to_postgres/config.json | 51 +++++++++++++++++++ .../events/analytics/data_quality_event.json | 34 +++++++++++++ .../events/customers/data_quality_event.json | 29 +++++++++++ .../events/orders/data_quality_event.json | 34 +++++++++++++ .../dbt/scenarios/csv_to_postgres/test/run.sh | 1 + 5 files changed, 149 insertions(+) create mode 100644 producer/dbt/scenarios/csv_to_postgres/events/analytics/data_quality_event.json create mode 100644 producer/dbt/scenarios/csv_to_postgres/events/customers/data_quality_event.json create mode 100644 producer/dbt/scenarios/csv_to_postgres/events/orders/data_quality_event.json diff --git a/producer/dbt/scenarios/csv_to_postgres/config.json b/producer/dbt/scenarios/csv_to_postgres/config.json index 6a737208..c28fc7f2 100644 --- a/producer/dbt/scenarios/csv_to_postgres/config.json +++ b/producer/dbt/scenarios/csv_to_postgres/config.json @@ -265,6 +265,57 @@ ] } } + }, + { + "name": "customers_data_quality_assertions_test", + "path": "events/customers/data_quality_event.json", + "openlineage_versions": { + "min": "1.38.0" + }, + "tags": { + "facets": [ + "dataQualityAssertions" + ], + "lineage_level": { + "postgres": [ + "dataset" + ] + } + } + }, + { + "name": "orders_data_quality_assertions_test", + "path": "events/orders/data_quality_event.json", + "openlineage_versions": { + "min": "1.38.0" + }, + "tags": { + "facets": [ + "dataQualityAssertions" + ], + "lineage_level": { + "postgres": [ + "dataset" + ] + } + } + }, + { + "name": "analytics_data_quality_assertions_test", + "path": "events/analytics/data_quality_event.json", + "openlineage_versions": { + "min": "1.38.0" + }, + "tags": { + "facets": [ + "dataQualityAssertions" + ], + "lineage_level": { + "postgres": [ + "dataset" + ] + } + } } ] } \ No newline at end of file diff --git a/producer/dbt/scenarios/csv_to_postgres/events/analytics/data_quality_event.json b/producer/dbt/scenarios/csv_to_postgres/events/analytics/data_quality_event.json new file mode 100644 index 00000000..d7d89c16 --- /dev/null +++ b/producer/dbt/scenarios/csv_to_postgres/events/analytics/data_quality_event.json @@ -0,0 +1,34 @@ +{ + "eventType": "COMPLETE", + "job": { + "namespace": "dbt", + "name": "dbt_test.main.openlineage_compatibility_test.customer_analytics.test" + }, + "inputs": [ + { + "name": "dbt_test.main.customer_analytics", + "namespace": "postgres://localhost:5432", + "facets": { + "dataQualityAssertions": { + "assertions": [ + { + "assertion": "not_null", + "column": "customer_id", + "success": true + }, + { + "assertion": "not_null", + "column": "total_revenue", + "success": true + }, + { + "assertion": "unique", + "column": "customer_id", + "success": true + } + ] + } + } + } + ] +} diff --git a/producer/dbt/scenarios/csv_to_postgres/events/customers/data_quality_event.json b/producer/dbt/scenarios/csv_to_postgres/events/customers/data_quality_event.json new file mode 100644 index 00000000..97635709 --- /dev/null +++ b/producer/dbt/scenarios/csv_to_postgres/events/customers/data_quality_event.json @@ -0,0 +1,29 @@ +{ + "eventType": "COMPLETE", + "job": { + "namespace": "dbt", + "name": "dbt_test.main.openlineage_compatibility_test.stg_customers.test" + }, + "inputs": [ + { + "name": "dbt_test.main.stg_customers", + "namespace": "postgres://localhost:5432", + "facets": { + "dataQualityAssertions": { + "assertions": [ + { + "assertion": "not_null", + "column": "customer_id", + "success": true + }, + { + "assertion": "unique", + "column": "customer_id", + "success": true + } + ] + } + } + } + ] +} diff --git a/producer/dbt/scenarios/csv_to_postgres/events/orders/data_quality_event.json b/producer/dbt/scenarios/csv_to_postgres/events/orders/data_quality_event.json new file mode 100644 index 00000000..4d9efe83 --- /dev/null +++ b/producer/dbt/scenarios/csv_to_postgres/events/orders/data_quality_event.json @@ -0,0 +1,34 @@ +{ + "eventType": "COMPLETE", + "job": { + "namespace": "dbt", + "name": "dbt_test.main.openlineage_compatibility_test.stg_orders.test" + }, + "inputs": [ + { + "name": "dbt_test.main.stg_orders", + "namespace": "postgres://localhost:5432", + "facets": { + "dataQualityAssertions": { + "assertions": [ + { + "assertion": "not_null", + "column": "customer_id", + "success": true + }, + { + "assertion": "not_null", + "column": "order_id", + "success": true + }, + { + "assertion": "unique", + "column": "order_id", + "success": true + } + ] + } + } + } + ] +} diff --git a/producer/dbt/scenarios/csv_to_postgres/test/run.sh b/producer/dbt/scenarios/csv_to_postgres/test/run.sh index acb9dbd3..63035380 100644 --- a/producer/dbt/scenarios/csv_to_postgres/test/run.sh +++ b/producer/dbt/scenarios/csv_to_postgres/test/run.sh @@ -14,6 +14,7 @@ EOF dbt-ol seed --project-dir="../../../runner" --profiles-dir="../../../runner" --target=postgres --no-version-check dbt-ol run --project-dir="../../../runner" --profiles-dir="../../../runner" --target=postgres --no-version-check + dbt-ol test --project-dir="../../../runner" --profiles-dir="../../../runner" --target=postgres --no-version-check jq -c '.' "${PRODUCER_OUTPUT_EVENTS_DIR}/events.jsonl" | nl -w1 -s' ' | while read -r i line; do echo "$line" | jq '.' > "${PRODUCER_OUTPUT_EVENTS_DIR}/event-$i.json" From 9816cb3be445a769293c618982a8c4f5d2237085 Mon Sep 17 00:00:00 2001 From: "roller100 (BearingNode)" Date: Wed, 18 Mar 2026 13:05:26 +0000 Subject: [PATCH 2/3] fix(dbt): update README to reflect actual run.sh workflow The local debugging section referenced run_dbt_tests.sh, which was removed during the PR #211 cleanup. Replace with accurate instructions using docker compose + run.sh directly. Update the workflow description, test structure layout, and validation scope (add dataQualityAssertions) to match the current architecture. Signed-off-by: roller100 (BearingNode) --- producer/dbt/README.md | 58 +++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/producer/dbt/README.md b/producer/dbt/README.md index 282a8da3..bf153fc2 100644 --- a/producer/dbt/README.md +++ b/producer/dbt/README.md @@ -13,23 +13,21 @@ It is important to note that this is a **compatibility validation framework** us ## Test Architecture and Workflow -The test is orchestrated by the `run_dbt_tests.sh` script and follows a clear, sequential workflow designed for reliability and ease of use. This structure ensures that each component of the integration is validated systematically. +The test is orchestrated by the scenario's `test/run.sh` script and follows a clear, sequential workflow designed for reliability and ease of use. This structure ensures that each component of the integration is validated systematically. The end-to-end process is as follows: -1. **Test Orchestration**: The `run_dbt_tests.sh` script serves as the main entry point. It sets up the environment and initiates over the scenarios folder to execute each test scenario. +1. **Scenario Execution**: The `test/run.sh` script executes the dbt project defined in the `runner/` directory using `dbt-ol seed`, `dbt-ol run`, and `dbt-ol test`. -2. **Scenario Execution**: The test runner executes the dbt project defined in the `runner/` directory. The specific dbt commands to be run (e.g., `dbt seed`, `dbt run`, `dbt test`) are defined in the test scenarios run script (`test/run.sh`). - -3. **Event Generation and Capture**: During the execution, the `dbt-ol` wrapper intercepts the dbt commands and emits OpenLineage events. The `test/openlineage.yml` configuration directs these events to be captured as a local file (`{directory_input_param}/events.jsonl`) using the `file` transport. +2. **Event Generation and Capture**: During the execution, the `dbt-ol` wrapper intercepts the dbt commands and emits OpenLineage events. The `test/run.sh` script writes an `openlineage.yml` configuration that directs these events to be captured as a local file (`{output_dir}/events.jsonl`) using the `file` transport. -4. **Extract events**: OpenLineage emits events reliable to one file ('append: true' causes overwrites and events to be lost) so it is required to extract the before validation. +3. **Extract events**: OpenLineage emits all events to one file, so `run.sh` splits them into individual numbered files (`event-1.json`, `event-2.json`, …) before deleting the combined `.jsonl`. -5. **Event Validation**: Once the dbt process is complete, the test framework performs a two-stage validation on the generated events: - * **Syntax Validation**: Each event is validated against the official OpenLineage JSON schema (e.g., version `1.40.1`) to ensure it is structurally correct. - * **Semantic Validation**: The content of the events is compared against expected templates. This deep comparison, powered by the `scripts/compare_events.py` utility, verifies the accuracy of job names, dataset identifiers, lineage relationships, and the presence and structure of key facets. +4. **Event Validation**: Once the dbt process is complete, the shared framework (`scripts/validate_ol_events.py`) performs a two-stage validation on the generated events: + * **Syntax Validation**: Each event is validated against the official OpenLineage JSON schema to ensure it is structurally correct. + * **Semantic Validation**: The content of the events is compared against expected templates in `scenarios/csv_to_postgres/events/`. This comparison, powered by the `scripts/compare_events.py` utility, verifies the accuracy of job names, dataset identifiers, lineage relationships, and the presence and structure of key facets. -6. **Reporting**: Upon completion, the test runner generates a standardized JSON report (`dbt_producer_report.json`) that details the results of each validation step. This report is designed to be consumed by higher-level aggregation scripts in a CI/CD environment. +5. **Reporting**: Upon completion, the framework generates a standardised JSON report that details the results of each validation step for consumption by CI/CD aggregation scripts. ## Validation Scope @@ -38,6 +36,7 @@ This test validates that the `openlineage-dbt` integration correctly generates O #### dbt Operations Covered: - `dbt seed`: To load initial data. - `dbt run`: To execute dbt models. +- `dbt test`: To run data quality tests and capture `dataQualityAssertions` facets. #### Validation Checks: - **Event Generation**: Correctly creates `START` and `COMPLETE` events for jobs and runs. @@ -50,6 +49,7 @@ This test validates that the `openlineage-dbt` integration correctly generates O - `schema` - `dataSource` - `columnLineage` + - `dataQualityAssertions` - **Specification Compliance**: Events are validated against the OpenLineage specification schema (version `2-0-2`). ## Test Structure @@ -58,16 +58,14 @@ The test is organized into the following key directories, each with a specific r ``` producer/dbt/ -├── run_dbt_tests.sh # Main test execution script -├── scenarios/ # Defines the dbt commands and expected outcomes for each test case -├── output/ # Default output directory for generated OpenLineage events (generated during execution) +├── scenarios/ # Test scenarios; each defines expected events and a run script ├── runner/ # A self-contained dbt project used as the test target -└── specs/ # Stores OpenLineage spcification get from local repository (generated during execution) +├── versions.json # Supported component and OpenLineage version ranges +└── maintainers.json # Maintainer contact information ``` - **`runner/`**: A self-contained dbt project with models, seeds, and configuration. This is the target of the `dbt-ol` command. -- **`scenarios/`**: Defines the dbt commands to be executed and contains the expected event templates for validation. -- **`output/`**: The default output directory for the generated `events.jsonl` file and extracted events. +- **`scenarios/`**: Contains one directory per scenario. Each scenario has a `config.json` defining expected event templates, an `events/` directory of expected event JSON files, and a `test/` directory with `run.sh` and `compose.yml`. ## How to Run the Tests @@ -106,34 +104,30 @@ The GitHub Actions workflow: If you need to debug event generation locally: -1. **Start PostgreSQL (Optional)**: +1. **Start PostgreSQL**: ```bash - cd producer/dbt/scenarions/csv_to_postgres/test - docker compose up + docker compose -f producer/dbt/scenarios/csv_to_postgres/test/compose.yml up -d ``` -2. **Install Python Dependencies**: +2. **Install dbt and the OpenLineage wrapper** (use a virtual environment outside the repo): ```bash - # Activate virtual environment (recommended) - python -m venv venv - source venv/bin/activate # On Windows: venv\Scripts\activate + python -m venv ~/.venvs/dbt-compat-test + source ~/.venvs/dbt-compat-test/bin/activate + pip install dbt-core==1.8.0 dbt-postgres openlineage-dbt==1.23.0 ``` -3. **Run Test Scenario**: +3. **Run the scenario**: ```bash - ./producer/dbt/run_dbt_tests.sh --openlineage-directory + mkdir -p /tmp/dbt-events + bash producer/dbt/scenarios/csv_to_postgres/test/run.sh /tmp/dbt-events ``` -4. **Inspect Generated Events**: +4. **Inspect generated events**: ```bash - # View events - cat ./producer/dbt/output/csv_to_postgres/event-{id}.json | jq '.' - - # check report - cat ./producer/dbt/dbt_producer_report.json | jq '.' + cat /tmp/dbt-events/event-1.json | jq '.' ``` -**Note**: Local debugging is entirely optional. All official validation happens in GitHub Actions with PostgreSQL service containers. The test runner (`test/run.sh`) is the same code used by CI/CD, ensuring consistency. +**Note**: Local debugging is entirely optional. All official validation happens in GitHub Actions with PostgreSQL service containers. The `test/run.sh` script is the same code used by CI/CD, ensuring consistency. ## Important dbt Integration Notes From 4322364b11d8f7cf202c6cdc58a338b9ebaf3dcd Mon Sep 17 00:00:00 2001 From: "roller100 (BearingNode)" Date: Wed, 18 Mar 2026 13:34:12 +0000 Subject: [PATCH 3/3] chore: ignore jsonl event files generated during local testing Add **/events/*.jsonl to the existing local-testing ignore block. dbt-ol writes events.jsonl before extraction; this prevents it from being accidentally staged in any producer's events directory. Signed-off-by: roller100 (BearingNode) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 2c95801b..db6ef1f7 100644 --- a/.gitignore +++ b/.gitignore @@ -173,4 +173,5 @@ bin/ **/specs/ **/output/ **/test/openlineage.yml +**/events/*.jsonl dbt_producer_report.json \ No newline at end of file