From 07a5a0f22ec71182a085428f4a84f6decdc08167 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 6 Feb 2026 18:49:36 -0300 Subject: [PATCH 01/95] feat(ui): add flagged status to test definitions (TG-976 Phase 1) Allow users to flag test definitions for attention. The flag persists across test runs, giving users a lightweight way to track work-in-progress issues without needing to leave the app. - Add `flagged` boolean column to test_definitions table - Add flag/unflag disposition actions on Test Definitions and Test Results pages - Add "Flagged" filter on both pages - Add flagged toggle to test definition edit form - Include flagged column in grid display, detail panel, and Excel export - Fix pre-existing bug where inherited dataclass fields were dropped by to_dataframe Co-Authored-By: Claude Opus 4.6 --- testgen/common/models/test_definition.py | 15 ++++-- .../030_initialize_new_schema_structure.sql | 15 +++--- .../dbupgrade/0173_incremental_upgrade.sql | 4 ++ testgen/ui/queries/test_result_queries.py | 7 +++ testgen/ui/views/test_definitions.py | 43 +++++++++++++---- testgen/ui/views/test_results.py | 47 ++++++++++++++++--- 6 files changed, 104 insertions(+), 27 deletions(-) create mode 100644 testgen/template/dbupgrade/0173_incremental_upgrade.sql diff --git a/testgen/common/models/test_definition.py b/testgen/common/models/test_definition.py index a18b8223..99a518e8 100644 --- a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -1,11 +1,12 @@ from collections.abc import Iterable from dataclasses import dataclass from datetime import datetime -from typing import Literal +from typing import ClassVar, Literal from uuid import UUID import streamlit as st from sqlalchemy import ( + Boolean, Column, ForeignKey, String, @@ -91,7 +92,8 @@ class TestDefinitionSummary(TestTypeSummary): profiling_as_of_date: datetime last_manual_update: datetime export_to_observability: bool - prediction: str | None + prediction: dict[str, 
dict[str, float]] | None + flagged: bool @dataclass @@ -211,6 +213,7 @@ class TestDefinition(Entity): last_manual_update: datetime = Column(UpdateTimestamp, nullable=False) export_to_observability: bool = Column(YNString) prediction: dict[str, dict[str, float]] | None = Column(postgresql.JSONB) + flagged: bool = Column(Boolean, default=False, nullable=False) _default_order_by = (asc(func.lower(schema_name)), asc(func.lower(table_name)), asc(func.lower(column_name)), asc(test_type)) _summary_columns = ( @@ -276,10 +279,12 @@ def select_minimal_where( ) return [TestDefinitionMinimal(**row) for row in results] + _yn_columns: ClassVar = {"test_active", "lock_refresh"} + @classmethod def set_status_attribute( cls, - status_type: Literal["test_active", "lock_refresh"], + status_type: Literal["test_active", "lock_refresh", "flagged"], test_definition_ids: list[str | UUID], value: bool, ) -> None: @@ -296,7 +301,7 @@ def set_status_attribute( """ params = { "test_definition_ids": test_definition_ids, - "value": YNString().process_bind_param(value, None), + "value": YNString().process_bind_param(value, None) if status_type in cls._yn_columns else value, } db_session = get_current_session() @@ -318,7 +323,7 @@ def move( SELECT UNNEST(ARRAY [:test_definition_ids]) AS id ) UPDATE test_definitions - SET + SET {"table_name = :target_table_name," if target_table_name else ""} {"column_name = :target_column_name," if target_column_name else ""} table_groups_id = :target_table_group, diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 822cd28b..6bc7eaad 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -235,6 +235,7 @@ CREATE TABLE test_definitions ( profiling_as_of_date TIMESTAMP, last_manual_update TIMESTAMP DEFAULT NULL, export_to_observability VARCHAR(5), + flagged BOOLEAN DEFAULT 
FALSE NOT NULL, CONSTRAINT test_definitions_test_suites_test_suite_id_fk FOREIGN KEY (test_suite_id) REFERENCES test_suites ); @@ -745,20 +746,20 @@ CREATE INDEX ix_td_ts_tc CREATE UNIQUE INDEX uix_td_autogen_schema ON test_definitions (test_suite_id, test_type, schema_name) - WHERE last_auto_gen_date IS NOT NULL - AND table_name IS NULL + WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NULL AND column_name IS NULL; CREATE UNIQUE INDEX uix_td_autogen_table ON test_definitions (test_suite_id, test_type, schema_name, table_name) - WHERE last_auto_gen_date IS NOT NULL - AND table_name IS NOT NULL + WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NOT NULL AND column_name IS NULL; CREATE UNIQUE INDEX uix_td_autogen_column ON test_definitions (test_suite_id, test_type, schema_name, table_name, column_name) - WHERE last_auto_gen_date IS NOT NULL - AND table_name IS NOT NULL + WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NOT NULL AND column_name IS NOT NULL; -- Index test_runs @@ -794,7 +795,7 @@ CREATE INDEX ix_tr_ts_tctt ON test_results(test_suite_id, table_name, column_names, test_type); -- Index data_structure_log -CREATE INDEX ix_dsl_tg_tcd +CREATE INDEX ix_dsl_tg_tcd ON data_structure_log (table_groups_id, table_name, change_date); -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/testgen/template/dbupgrade/0173_incremental_upgrade.sql b/testgen/template/dbupgrade/0173_incremental_upgrade.sql new file mode 100644 index 00000000..eff17ba6 --- /dev/null +++ b/testgen/template/dbupgrade/0173_incremental_upgrade.sql @@ -0,0 +1,4 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE test_definitions + ADD COLUMN IF NOT EXISTS flagged BOOLEAN DEFAULT FALSE NOT NULL; diff --git a/testgen/ui/queries/test_result_queries.py b/testgen/ui/queries/test_result_queries.py index ad35a8b4..86b83ff7 100644 --- a/testgen/ui/queries/test_result_queries.py +++ b/testgen/ui/queries/test_result_queries.py @@ -15,6 +15,7 
@@ def get_test_results( column_name: str | None = None, action: Literal["Confirmed", "Dismissed", "Muted", "No Action"] | None = None, sorting_columns: list[str] | None = None, + flagged: bool | None = None, ) -> pd.DataFrame: query = f""" WITH run_results @@ -59,6 +60,7 @@ def get_test_results( c.id::VARCHAR as connection_id, r.test_suite_id::VARCHAR, r.test_definition_id::VARCHAR, r.auto_gen, + td.flagged, -- These are used in the PDF report tt.threshold_description, tt.usage_notes, r.test_time, @@ -94,6 +96,9 @@ def get_test_results( AND r.column_names = dcc.column_name) LEFT JOIN data_table_chars dtc ON dcc.table_id = dtc.table_id + LEFT JOIN test_definitions td + ON (r.test_definition_id = td.id) + {"WHERE td.flagged = :flagged" if flagged is not None else ""} {f"ORDER BY {', '.join(' '.join(col) for col in sorting_columns)}" if sorting_columns else ""}; """ params = { @@ -105,10 +110,12 @@ def get_test_results( "disposition": { "Muted": "Inactive", }.get(action, action), + "flagged": flagged, } df = fetch_df_from_db(query, params) df["test_date"] = pd.to_datetime(df["test_date"]) + df["flagged_display"] = df["flagged"].apply(lambda value: "Yes" if value else "No") return df diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 9487db24..67cd47af 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -17,7 +17,7 @@ from testgen.common.models import with_database_session from testgen.common.models.connection import Connection from testgen.common.models.table_group import TableGroup, TableGroupMinimal -from testgen.common.models.test_definition import TestDefinition, TestDefinitionMinimal, TestDefinitionSummary +from testgen.common.models.test_definition import TestDefinition, TestDefinitionMinimal from testgen.common.models.test_suite import TestSuite, TestSuiteMinimal from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( 
@@ -51,6 +51,7 @@ def render( table_name: str | None = None, column_name: str | None = None, test_type: str | None = None, + flagged: str | None = None, **_kwargs, ) -> None: test_suite = TestSuite.get(test_suite_id) @@ -75,7 +76,7 @@ def render( ], ) - table_filter_column, column_filter_column, test_filter_column, sort_column, table_actions_column = st.columns([.2, .2, .2, .1, .25], vertical_alignment="bottom") + table_filter_column, column_filter_column, test_filter_column, flagged_filter_column, sort_column, table_actions_column = st.columns([.2, .2, .15, .1, .1, .25], vertical_alignment="bottom") testgen.flex_row_end(table_actions_column) actions_column, disposition_column = st.columns([.5, .5]) @@ -83,7 +84,7 @@ def render( testgen.flex_row_end(disposition_column) filters_changed = False - current_filters = (table_name, column_name, test_type) + current_filters = (table_name, column_name, test_type, flagged) if (query_filters := st.session_state.get("test_definitions:filters")) != current_filters: if query_filters: filters_changed = True @@ -124,6 +125,14 @@ def render( label="Test Type", ) + with flagged_filter_column: + flagged = testgen.select( + options=["Flagged", "Not Flagged"], + default_value=flagged, + bind_to_query="flagged", + label="Flagged", + ) + with sort_column: sortable_columns = ( ("Table", "table_name"), @@ -152,7 +161,7 @@ def render( with st.container(): with st.spinner("Loading data ..."): - df = get_test_definitions(test_suite, table_name, column_name, test_type, sorting_columns) + df = get_test_definitions(test_suite, table_name, column_name, test_type, sorting_columns, flagged) selected, selected_test_def = render_grid(df, multi_select, filters_changed) @@ -193,6 +202,11 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: { "icon": "🔐", "help": "Unlock for future test generation", "attribute": "lock_refresh", "value": False, "message": "Unlocked" }, ]) + disposition_actions.extend([ + { "icon": "🚩", "help": "Flag for 
attention", "attribute": "flagged", "value": True, "message": "Flagged" }, + { "icon": "⌀", "help": "Clear flag", "attribute": "flagged", "value": False, "message": "Flag cleared" }, + ]) + for action in disposition_actions: action_disabled = not selected or all(sel[action["attribute"]] == action["value"] for sel in selected) action["button"] = disposition_column.button(action["icon"], help=action["help"], disabled=action_disabled) @@ -241,6 +255,7 @@ def render_grid(df: pd.DataFrame, multi_select: bool, filters_changed: bool) -> "test_name_short", "test_active_display", "lock_refresh_display", + "flagged_display", "urgency", "export_to_observability_display", "profiling_as_of_date", @@ -258,6 +273,7 @@ def render_grid(df: pd.DataFrame, multi_select: bool, filters_changed: bool) -> "Test Type", "Active", "Locked", + "Flagged", "Urgency", "Export to Observabilty", "Based on Profiling", @@ -282,6 +298,7 @@ def render_selected_details(selected_test: dict, table_group: TableGroupMinimal) "test_active_display", "test_definition_status", "lock_refresh_display", + "flagged_display", "urgency", "export_to_observability", ] @@ -294,6 +311,7 @@ def render_selected_details(selected_test: dict, table_group: TableGroupMinimal) "test_active", "test_definition_status", "lock_refresh", + "flagged", "urgency", "export_to_observability", ] @@ -426,6 +444,7 @@ def show_test_form( skip_errors = selected_test_def["skip_errors"] or 0 if mode == "edit" else 0 test_active = bool(selected_test_def["test_active"]) if mode == "edit" else True lock_refresh = bool(selected_test_def["lock_refresh"]) if mode == "edit" else False + test_flagged = bool(selected_test_def["flagged"]) if mode == "edit" else False test_definition_status = selected_test_def["test_definition_status"] if mode == "edit" else "" column_name = empty_if_null(selected_test_def["column_name"]) if mode == "edit" else empty_if_null(column_name) last_auto_gen_date = empty_if_null(selected_test_def["last_auto_gen_date"]) if mode 
== "edit" else "" @@ -531,6 +550,7 @@ def show_test_form( help="Protects test parameters from being overwritten when tests in this Test Suite are regenerated.", ), "test_active": left_column.toggle(label="Test Active", value=test_active), + "flagged": left_column.toggle(label="Flagged", value=test_flagged, help="Flag this test for attention."), "custom_query": custom_query, "baseline_ct": baseline_ct, "baseline_unique_ct": baseline_unique_ct, @@ -740,14 +760,14 @@ def render_dynamic_attribute(attribute: str, container: DeltaGenerator): value = None placeholder = "Max" disabled = True - + if test_definition.get("history_calculation") == "Value" and ( "history_calculation_upper" not in dynamic_attributes or test_definition.get("history_calculation_upper") == "Value" ): value = 1 disabled = True - + test_definition[attribute] = container.number_input( label=label_text, step=1, @@ -1085,7 +1105,7 @@ def get_excel_report_data( else: data = get_test_definitions(test_suite) - for key in ["test_active_display", "lock_refresh_display"]: + for key in ["test_active_display", "lock_refresh_display", "flagged_display"]: data[key] = data[key].apply(lambda val: val if val == "Yes" else None) for key in ["profiling_as_of_date", "last_manual_update"]: @@ -1102,6 +1122,7 @@ def get_excel_report_data( "export_uom": {"header": "Unit of measure"}, "test_active_display": {"header": "Active"}, "lock_refresh_display": {"header": "Locked"}, + "flagged_display": {"header": "Flagged"}, "urgency": {"header": "Severity"}, "profiling_as_of_date": {"header": "From profiling as-of (UTC)"}, "last_manual_update": {"header": "Last manual update (UTC)"}, @@ -1222,6 +1243,7 @@ def get_test_definitions( column_name: str | None = None, test_type: str | None = None, sorting_columns: list[str] | None = None, + flagged_filter: str | None = None, ) -> pd.DataFrame: clauses = [TestDefinition.test_suite_id == test_suite.id] if table_name: @@ -1230,6 +1252,10 @@ def get_test_definitions( 
clauses.append(TestDefinition.column_name.ilike(column_name)) if test_type: clauses.append(TestDefinition.test_type == test_type) + if flagged_filter == "Flagged": + clauses.append(TestDefinition.flagged == True) + elif flagged_filter == "Not Flagged": + clauses.append(TestDefinition.flagged == False) sort_funcs = {"ASC": asc, "DESC": desc} test_definitions = TestDefinition.select_where( @@ -1240,13 +1266,14 @@ def get_test_definitions( ]) if sorting_columns else None, ) - df = to_dataframe(test_definitions, TestDefinitionSummary.columns()) + df = to_dataframe(test_definitions) date_service.accommodate_dataframe_to_timezone(df, st.session_state) for key in ["id", "table_groups_id", "profile_run_id", "test_suite_id"]: df[key] = df[key].apply(lambda value: str(value)) df["test_active_display"] = df["test_active"].apply(lambda value: "Yes" if value else "No") df["lock_refresh_display"] = df["lock_refresh"].apply(lambda value: "Yes" if value else "No") + df["flagged_display"] = df["flagged"].apply(lambda value: "Yes" if value else "No") df["urgency"] = df.apply(lambda row: row["severity"] or test_suite.severity or row["default_severity"], axis=1) df["final_test_description"] = df.apply(lambda row: row["test_description"] or row["default_test_description"], axis=1) df["export_uom"] = df.apply(lambda row: row["measure_uom_description"] or row["measure_uom"], axis=1) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index dce7f095..910aa1d1 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -63,6 +63,7 @@ def render( column_name: str | None = None, test_type: str | None = None, action: str | None = None, + flagged: str | None = None, **_kwargs, ) -> None: run = TestRun.get_minimal(run_id) @@ -86,15 +87,15 @@ def render( ) summary_column, score_column, actions_column, export_button_column = st.columns([.3, .15, .3, .15], vertical_alignment="bottom") - status_filter_column, table_filter_column, 
column_filter_column, test_type_filter_column, action_filter_column, sort_column = st.columns( - [.175, .2, .2, .175, .15, .1], vertical_alignment="bottom" + status_filter_column, table_filter_column, column_filter_column, test_type_filter_column, flagged_filter_column, action_filter_column, sort_column = st.columns( + [.15, .175, .175, .15, .1, .15, .1], vertical_alignment="bottom" ) testgen.flex_row_end(actions_column, wrap=True) testgen.flex_row_end(export_button_column) filters_changed = False - current_filters = (status, table_name, column_name, test_type, action) + current_filters = (status, table_name, column_name, test_type, flagged, action) if (query_filters := st.session_state.get("test_results:filters")) != current_filters: if query_filters: filters_changed = True @@ -157,6 +158,14 @@ def render( label="Test Type", ) + with flagged_filter_column: + flagged = testgen.select( + options=["Flagged", "Not Flagged"], + default_value=flagged, + bind_to_query="flagged", + label="Flagged", + ) + with action_filter_column: action = testgen.select( options=["✓ Confirmed", "✘ Dismissed", "🔇 Muted", "â†Šī¸Ž No Action"], @@ -196,8 +205,9 @@ def render( with st.container(): with st.spinner("Loading data ..."): # Retrieve test results (always cached, action as null) + flagged_bool = True if flagged == "Flagged" else False if flagged == "Not Flagged" else None df = test_result_queries.get_test_results( - run_id, status, test_type, table_name, column_name, action, sorting_columns + run_id, status, test_type, table_name, column_name, action, sorting_columns, flagged_bool ) # Retrieve disposition action (cache refreshed) df_action = get_test_disposition(run_id) @@ -222,6 +232,7 @@ def render( "measure_uom", "result_status", "action", + "flagged_display", "result_message", ], [ @@ -232,6 +243,7 @@ def render( "Unit of Measure", "Status", "Action", + "Flagged", "Details", ], id_column="test_result_id", @@ -298,6 +310,26 @@ def open_download_dialog(data: pd.DataFrame | None = 
None) -> None: lst_cached_functions=affected_cached_functions, ) + if session.auth.user_has_permission("disposition"): + flag_actions = [ + { "icon": "🚩", "help": "Flag test for attention", "value": True, "message": "Flagged" }, + { "icon": "⌀", "help": "Clear flag", "value": False, "message": "Flag cleared" }, + ] + for flag_action in flag_actions: + flag_disabled = not selected or all(sel["flagged"] == flag_action["value"] for sel in selected) + flag_action["button"] = actions_column.button(flag_action["icon"], help=flag_action["help"], disabled=flag_disabled) + + for flag_action in flag_actions: + if flag_action["button"]: + test_definition_ids = list({row["test_definition_id"] for row in selected}) + TestDefinition.set_status_attribute("flagged", test_definition_ids, flag_action["value"]) + fm.reset_post_updates( + None, + as_toast=True, + clear_cache=True, + lst_cached_functions=affected_cached_functions, + ) + # Needs to be after all data loading/updating # Otherwise the database session is lost for any queries after the fragment -_- with score_column: @@ -441,11 +473,11 @@ def get_test_result_summary(test_run_id: str) -> list[dict]: def show_test_def_detail(test_definition_id: str, test_suite: TestSuiteMinimal): def readable_boolean(v: bool): return "Yes" if v else "No" - + if not test_definition_id: st.warning("Test definition no longer exists.") return - + test_definition = TestDefinition.get(test_definition_id) if test_definition: @@ -640,6 +672,7 @@ def get_excel_report_data( "result_status": {"header": "Status"}, "result_message": {"header": "Message"}, "action": {}, + "flagged_display": {"header": "Flagged"}, } return get_excel_file_data( data, @@ -816,7 +849,7 @@ def source_data_dialog(selected_row): st.markdown(f"#### {selected_row['test_name_short']}") st.caption(selected_row["test_description"]) - + st.markdown("#### Test Parameters") testgen.caption(selected_row["input_parameters"], styles="max-height: 75px; overflow: auto;") From 
84b0ad19042f7c813a3e68ce9766cfed801839c7 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 11 Feb 2026 13:19:19 -0300 Subject: [PATCH 02/95] feat(ui): add notes dialog, review column, and sorting improvements (TG-980) - Notes dialog: styled subtitle (Table/Column/Test), tooltip fix, auto-expand when empty, form reset on add/edit - Test results: merged Review column (disposition + flagged + notes count), Flagged/Has Notes sorting - Test definitions: notes count column, Flagged/Has Notes sorting - Model: add get_notes_count_by_ids, use uuid4 for TestDefinitionNote id - Grid: add column_styles parameter to render_grid_select Co-Authored-By: Claude Opus 4.6 --- testgen/common/models/test_definition.py | 72 ++++- .../030_initialize_new_schema_structure.sql | 11 + .../dbupgrade/0174_incremental_upgrade.sql | 12 + testgen/ui/components/frontend/js/main.js | 1 + .../js/pages/test_definition_notes.js | 296 ++++++++++++++++++ testgen/ui/queries/test_result_queries.py | 1 + testgen/ui/services/form_service.py | 9 +- .../dialogs/test_definition_notes_dialog.py | 39 +++ testgen/ui/views/test_definitions.py | 56 +++- testgen/ui/views/test_results.py | 266 ++++++++-------- 10 files changed, 626 insertions(+), 137 deletions(-) create mode 100644 testgen/template/dbupgrade/0174_incremental_upgrade.sql create mode 100644 testgen/ui/components/frontend/js/pages/test_definition_notes.js create mode 100644 testgen/ui/views/dialogs/test_definition_notes_dialog.py diff --git a/testgen/common/models/test_definition.py b/testgen/common/models/test_definition.py index 99a518e8..d34d451b 100644 --- a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from datetime import datetime from typing import ClassVar, Literal -from uuid import UUID +from uuid import UUID, uuid4 import streamlit as st from sqlalchemy import ( @@ -13,6 +13,7 @@ Text, TypeDecorator, asc, + delete, func, insert, select, @@ -23,7 
+24,7 @@ from sqlalchemy.orm import InstrumentedAttribute from sqlalchemy.sql.expression import case, literal -from testgen.common.models import get_current_session +from testgen.common.models import Base, get_current_session from testgen.common.models.custom_types import NullIfEmptyString, UpdateTimestamp, YNString, ZeroIfEmptyInteger from testgen.common.models.entity import ENTITY_HASH_FUNCS, Entity, EntityMinimal from testgen.utils import is_uuid4 @@ -406,3 +407,70 @@ def save(self) -> None: super().save() TestDefinition.clear_cache() + + +class TestDefinitionNote(Base): + __tablename__ = "test_definition_notes" + + id: UUID = Column(postgresql.UUID(as_uuid=True), default=uuid4, primary_key=True) + test_definition_id: UUID = Column( + postgresql.UUID(as_uuid=True), ForeignKey("test_definitions.id", ondelete="CASCADE"), nullable=False + ) + detail: str = Column(Text, nullable=False) + created_by: str = Column(String(100), nullable=False) + created_at: datetime = Column(postgresql.TIMESTAMP, server_default=text("CURRENT_TIMESTAMP")) + updated_at: datetime = Column(postgresql.TIMESTAMP) + + @classmethod + def add_note(cls, test_definition_id: str | UUID, detail: str, username: str) -> None: + db_session = get_current_session() + db_session.execute( + insert(cls).values(test_definition_id=test_definition_id, detail=detail, created_by=username) + ) + db_session.commit() + + @classmethod + def update_note(cls, note_id: str | UUID, detail: str) -> None: + db_session = get_current_session() + db_session.execute( + update(cls).where(cls.id == note_id).values(detail=detail, updated_at=func.now()) + ) + db_session.commit() + + @classmethod + def delete_note(cls, note_id: str | UUID) -> None: + db_session = get_current_session() + db_session.execute(delete(cls).where(cls.id == note_id)) + db_session.commit() + + @classmethod + def get_notes_count_by_ids(cls, test_definition_ids: list[str]) -> dict[str, int]: + """Returns {test_definition_id: count} for all given IDs.""" + 
db_session = get_current_session() + rows = db_session.execute( + text(""" + SELECT test_definition_id::VARCHAR, COUNT(*) as cnt + FROM test_definition_notes + WHERE test_definition_id = ANY(:ids) + GROUP BY test_definition_id + """), + {"ids": [UUID(td_id) for td_id in test_definition_ids]}, + ).all() + return {str(row[0]): row[1] for row in rows} + + @classmethod + def get_notes(cls, test_definition_id: str | UUID) -> list[dict]: + db_session = get_current_session() + results = db_session.execute( + select(cls).where(cls.test_definition_id == test_definition_id).order_by(cls.created_at.desc()) + ).scalars().all() + return [ + { + "id": str(note.id), + "detail": note.detail, + "created_by": note.created_by, + "created_at": note.created_at.isoformat() if note.created_at else None, + "updated_at": note.updated_at.isoformat() if note.updated_at else None, + } + for note in results + ] diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 6bc7eaad..82629d01 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -240,6 +240,17 @@ CREATE TABLE test_definitions ( FOREIGN KEY (test_suite_id) REFERENCES test_suites ); +CREATE TABLE test_definition_notes ( + id UUID DEFAULT gen_random_uuid() PRIMARY KEY, + test_definition_id UUID NOT NULL REFERENCES test_definitions ON DELETE CASCADE, + detail TEXT NOT NULL, + created_by VARCHAR(100) NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP +); + +CREATE INDEX ix_tdn_tdid ON test_definition_notes(test_definition_id, created_at DESC); + CREATE TABLE profile_results ( id UUID DEFAULT gen_random_uuid() CONSTRAINT profile_results_id_pk diff --git a/testgen/template/dbupgrade/0174_incremental_upgrade.sql b/testgen/template/dbupgrade/0174_incremental_upgrade.sql new file mode 100644 index 00000000..b540b93b --- 
/dev/null +++ b/testgen/template/dbupgrade/0174_incremental_upgrade.sql @@ -0,0 +1,12 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +CREATE TABLE IF NOT EXISTS test_definition_notes ( + id UUID DEFAULT gen_random_uuid() PRIMARY KEY, + test_definition_id UUID NOT NULL REFERENCES test_definitions ON DELETE CASCADE, + detail TEXT NOT NULL, + created_by VARCHAR(100) NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS ix_tdn_tdid ON test_definition_notes(test_definition_id, created_at DESC); diff --git a/testgen/ui/components/frontend/js/main.js b/testgen/ui/components/frontend/js/main.js index 8819548e..e8ebbd9e 100644 --- a/testgen/ui/components/frontend/js/main.js +++ b/testgen/ui/components/frontend/js/main.js @@ -45,6 +45,7 @@ const componentLoaders = { monitors_dashboard: () => import('./pages/monitors_dashboard.js').then(m => m.MonitorsDashboard), table_monitoring_trends: () => import('./pages/table_monitoring_trends.js').then(m => m.TableMonitoringTrend), test_results_chart: () => import('./pages/test_results_chart.js').then(m => m.TestResultsChart), + test_definition_notes: () => import('./pages/test_definition_notes.js').then(m => m.TestDefinitionNotes), schema_changes_list: () => import('./components/schema_changes_list.js').then(m => m.SchemaChangesList), edit_monitor_settings: () => import('./pages/edit_monitor_settings.js').then(m => m.EditMonitorSettings), }; diff --git a/testgen/ui/components/frontend/js/pages/test_definition_notes.js b/testgen/ui/components/frontend/js/pages/test_definition_notes.js new file mode 100644 index 00000000..bf9983b3 --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/test_definition_notes.js @@ -0,0 +1,296 @@ +/** + * @typedef Note + * @type {object} + * @property {string} id + * @property {string} detail + * @property {string} created_by + * @property {string?} created_at + * @property {string?} updated_at + * + * @typedef Properties + * @type {object} + * 
@property {{table: string, column: string, test: string}} test_label + * @property {Array} notes + * @property {string} current_user + */ +import van from '../van.min.js'; +import { Button } from '../components/button.js'; +import { Icon } from '../components/icon.js'; +import { Streamlit } from '../streamlit.js'; +import { emitEvent, getValue, loadStylesheet } from '../utils.js'; +import { ExpansionPanel } from '../components/expansion_panel.js'; + +const minHeight = 400; +const { div, span, textarea, p } = van.tags; + +/** + * @param {string?} isoString + * @returns {string} + */ +function formatDate(isoString) { + if (!isoString) return ''; + const date = new Date(isoString); + return Intl.DateTimeFormat('en-US', { dateStyle: 'medium', timeStyle: 'short' }).format(date); +} + +/** + * @param {Properties} props + * @returns + */ +const TestDefinitionNotes = (props) => { + loadStylesheet('test-definition-notes', stylesheet); + window.testgen.isPage = true; + + // Form state: shared between add and edit modes + const editNoteId = van.state(null); + const noteText = van.state(''); + const isEdit = van.state(false); + + const resetForm = () => { + editNoteId.val = null; + noteText.val = ''; + isEdit.val = false; + }; + + /** + * @param {Note} note + * @param {string} currentUser + * @returns + */ + const NoteItem = (note, currentUser) => { + const confirmingDelete = van.state(false); + const isOwner = note.created_by === currentUser; + + return div( + { class: () => `tdn-note ${isEdit.val && editNoteId.val === note.id ? 'tdn-editing' : ''}` }, + div( + { class: 'tdn-note-header' }, + span({ class: 'tdn-note-author' }, `@${note.created_by}`), + span({ class: 'tdn-note-separator' }, '\u2014'), + span({ class: 'tdn-note-date' }, + formatDate(note.created_at), + note.updated_at ? ' (edited)' : '', + ), + isOwner ? div( + { class: 'tdn-note-actions' }, + () => isEdit.val && editNoteId.val === note.id + ? 
div( + { class: 'flex-row fx-gap-1 fx-align-center' }, + Icon({ size: 18, classes: 'tdn-editing-indicator' }, 'edit'), + span({ class: 'tdn-editing-indicator text-caption' }, 'Editing'), + ) + : div( + { class: 'flex-row fx-gap-1' }, + Button({ + type: 'icon', + icon: 'edit', + tooltip: 'Edit note', + onclick: () => { + isEdit.val = true; + editNoteId.val = note.id; + noteText.val = note.detail; + }, + }), + () => confirmingDelete.val + ? div( + { class: 'flex-row fx-gap-1 fx-align-center' }, + span({ class: 'text-caption' }, 'Delete?'), + Button({ + label: 'Yes', + type: 'stroked', + color: 'warn', + onclick: () => emitEvent('NoteDeleted', { payload: { id: note.id } }), + }), + Button({ + label: 'No', + type: 'stroked', + color: 'basic', + onclick: () => { confirmingDelete.val = false; }, + }), + ) + : Button({ + type: 'icon', + icon: 'delete', + tooltip: 'Delete note', + tooltipPosition: 'top-left', + onclick: () => { confirmingDelete.val = true; }, + }), + ), + ) : null, + ), + p({ class: 'tdn-note-detail' }, note.detail), + ); + }; + + return div( + { id: 'test-definition-notes', class: 'flex-column fx-gap-2', style: 'height: 100%; overflow-y: auto;' }, + () => { + const label = getValue(props.test_label); + return div( + { class: 'tdn-label' }, + span({ class: 'text-secondary' }, 'Table: '), span(label.table), + span({ class: 'tdn-separator' }, '|'), + span({ class: 'text-secondary' }, 'Column: '), span(label.column), + span({ class: 'tdn-separator' }, '|'), + span({ class: 'text-secondary' }, 'Test: '), span(label.test), + ); + }, + () => ExpansionPanel( + { + title: isEdit.val + ? 
span({ class: 'tdn-editing-indicator' }, 'Edit Note') + : span({ class: 'text-green' }, 'Add Note'), + expanded: isEdit.val || getValue(props.notes).length === 0, + }, + div( + { class: 'flex-column' }, + textarea({ + class: 'tdn-form-textarea', + placeholder: 'Type a note...', + value: noteText, + oninput: (e) => noteText.val = e.target.value, + rows: 3, + }), + div( + { class: 'flex-row fx-justify-content-flex-end fx-gap-2 mt-3' }, + () => isEdit.val + ? Button({ + type: 'stroked', + label: 'Cancel', + width: 'auto', + onclick: resetForm, + }) + : '', + Button({ + type: 'stroked', + label: isEdit.val ? 'Save Changes' : 'Add Note', + width: 'auto', + disabled: () => !noteText.val.trim(), + onclick: () => { + const text = noteText.rawVal.trim(); + if (isEdit.rawVal) { + const id = editNoteId.rawVal; + resetForm(); + emitEvent('NoteUpdated', { payload: { id, text } }); + } else { + resetForm(); + emitEvent('NoteAdded', { payload: { text } }); + } + }, + }), + ), + ), + ), + () => { + const notes = getValue(props.notes); + const currentUser = getValue(props.current_user); + Streamlit.setFrameHeight(Math.max(minHeight, 80 * notes.length + 200)); + + return notes.length > 0 + ? div( + { class: 'tdn-notes-list' }, + ...notes.map(note => NoteItem(note, currentUser)), + ) + : div( + { class: 'tdn-empty-state text-secondary' }, + 'No notes yet. 
Add one above.', + ); + }, + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tdn-label { + font-size: 14px; + display: flex; + flex-wrap: wrap; + align-items: center; + gap: 4px; +} +.tdn-separator { + color: var(--disabled-text-color); + margin: 0 4px; +} +.tdn-form-textarea { + box-sizing: border-box; + width: 100%; + border-radius: 8px; + border: 1px solid transparent; + transition: border-color 0.3s; + background-color: var(--form-field-color); + padding: 8px 12px; + color: var(--primary-text-color); + font-family: inherit; + font-size: 14px; + resize: vertical; +} +.tdn-form-textarea:focus, +.tdn-form-textarea:focus-visible { + outline: none; + border-color: var(--primary-color); +} +.tdn-form-textarea::placeholder { + font-style: italic; + color: var(--disabled-text-color); +} +.tdn-notes-list { + display: flex; + flex-direction: column; + gap: 4px; +} +.tdn-note { + padding: 12px; + border-radius: 8px; + background-color: var(--dk-card-background); + border: 1px solid var(--dk-card-border-color, rgba(0,0,0,0.06)); + transition: background-color 0.2s; +} +.tdn-note.tdn-editing { + background-color: var(--select-hover-background); +} +.tdn-note-header { + display: flex; + flex-direction: row; + align-items: center; + gap: 6px; + margin-bottom: 6px; +} +.tdn-editing-indicator { + color: var(--purple); +} +.tdn-note-author { + font-weight: 600; + font-size: 13px; + color: var(--primary-text-color); +} +.tdn-note-separator { + color: var(--disabled-text-color); + font-size: 12px; +} +.tdn-note-date { + font-size: 12px; + color: var(--secondary-text-color); +} +.tdn-note-actions { + display: flex; + flex-direction: row; + align-items: center; + margin-left: auto; + gap: 2px; +} +.tdn-note-detail { + margin: 0; + font-size: 14px; + line-height: 1.5; + color: var(--primary-text-color); + white-space: pre-wrap; +} +.tdn-empty-state { + text-align: center; + padding: 24px 0; + font-style: italic; +} +`); + +export { TestDefinitionNotes }; 
diff --git a/testgen/ui/queries/test_result_queries.py b/testgen/ui/queries/test_result_queries.py index 86b83ff7..928cfb27 100644 --- a/testgen/ui/queries/test_result_queries.py +++ b/testgen/ui/queries/test_result_queries.py @@ -61,6 +61,7 @@ def get_test_results( r.test_definition_id::VARCHAR, r.auto_gen, td.flagged, + (SELECT COUNT(*) FROM test_definition_notes tdn WHERE tdn.test_definition_id = td.id) as notes_count, -- These are used in the PDF report tt.threshold_description, tt.usage_notes, r.test_time, diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index 70e8f752..8426f3c2 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -151,6 +151,7 @@ def render_grid_select( reset_pagination: bool = False, bind_to_query: bool = False, render_highlights: bool = True, + column_styles: dict[str, dict] | None = None, key: str = "aggrid", ) -> tuple[list[dict], dict]: """ @@ -343,6 +344,8 @@ def on_page_change(): # Merge common and date-time specific kwargs all_kwargs = {**common_kwargs, **date_time_kwargs} + elif column_styles and column in column_styles: + all_kwargs = {**common_kwargs, "cellStyle": column_styles[column]} else: if render_highlights == True: # Merge common and highlight-specific kwargs @@ -394,14 +397,14 @@ def on_page_change(): selection.update([row[id_column] for row in selected_rows]) st.session_state[f"{key}_multiselection"] = selection - if selection: + if selection: # We need to get the data from the original dataframe # Otherwise changes to the dataframe (e.g., editing the current selection) do not get reflected in the returned rows # Adding "modelUpdated" to AgGrid(update_on=...) 
does not work # because it causes unnecessary reruns that cause dialogs to close abruptly selected_df = df[df[id_column].isin(selection)] selected_data = json.loads(selected_df.to_json(orient="records")) - + selected_id, selected_item = None, None if selected_rows: selected_id = selected_rows[len(selected_rows) - 1][id_column] @@ -414,5 +417,5 @@ def on_page_change(): testgen.caption(f"{count} item{'s' if count != 1 else ''} selected") return selected_data, selected_item - + return None, None diff --git a/testgen/ui/views/dialogs/test_definition_notes_dialog.py b/testgen/ui/views/dialogs/test_definition_notes_dialog.py new file mode 100644 index 00000000..f6686e26 --- /dev/null +++ b/testgen/ui/views/dialogs/test_definition_notes_dialog.py @@ -0,0 +1,39 @@ +import streamlit as st + +from testgen.common.models import with_database_session +from testgen.common.models.test_definition import TestDefinitionNote +from testgen.ui.components import widgets as testgen +from testgen.ui.session import session + + +@st.dialog(title="Test Definition Notes") +@with_database_session +def test_definition_notes_dialog(test_definition_id: str, test_label: dict) -> None: + current_user = session.auth.user.username if session.auth.user else "unknown" + notes = TestDefinitionNote.get_notes(test_definition_id) + + def on_note_added(payload: dict) -> None: + TestDefinitionNote.add_note(test_definition_id, payload["text"], current_user) + st.rerun() + + def on_note_updated(payload: dict) -> None: + TestDefinitionNote.update_note(payload["id"], payload["text"]) + st.rerun() + + def on_note_deleted(payload: dict) -> None: + TestDefinitionNote.delete_note(payload["id"]) + st.rerun() + + testgen.testgen_component( + "test_definition_notes", + props={ + "test_label": test_label, + "notes": notes, + "current_user": current_user, + }, + on_change_handlers={ + "NoteAdded": on_note_added, + "NoteUpdated": on_note_updated, + "NoteDeleted": on_note_deleted, + }, + ) diff --git 
a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 67cd47af..026596e1 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -7,7 +7,8 @@ import pandas as pd import streamlit as st -from sqlalchemy import and_, asc, desc, func, or_, tuple_ +from sqlalchemy import and_, asc, case, desc, func, or_, tuple_ +from sqlalchemy import select as sa_select from streamlit.delta_generator import DeltaGenerator from streamlit_extras.no_default_selectbox import selectbox @@ -17,7 +18,7 @@ from testgen.common.models import with_database_session from testgen.common.models.connection import Connection from testgen.common.models.table_group import TableGroup, TableGroupMinimal -from testgen.common.models.test_definition import TestDefinition, TestDefinitionMinimal +from testgen.common.models.test_definition import TestDefinition, TestDefinitionMinimal, TestDefinitionNote from testgen.common.models.test_suite import TestSuite, TestSuiteMinimal from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( @@ -33,6 +34,7 @@ from testgen.ui.session import session, temp_value from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button from testgen.ui.views.dialogs.run_tests_dialog import run_tests_dialog +from testgen.ui.views.dialogs.test_definition_notes_dialog import test_definition_notes_dialog from testgen.utils import to_dataframe LOG = logging.getLogger("testgen") @@ -135,11 +137,13 @@ def render( with sort_column: sortable_columns = ( + ("Flagged", "flagged"), + ("Has Notes", "notes_count"), ("Table", "table_name"), ("Column", "column_name"), ("Test Type", "test_type"), ) - default = [(sortable_columns[i][1], "ASC") for i in (0, 1, 2)] + default = [(sortable_columns[i][1], "ASC") for i in (2, 3, 4)] sorting_columns = testgen.sorting_selector(sortable_columns, default) if user_can_disposition: @@ -225,6 +229,17 @@ def 
open_download_dialog(data: pd.DataFrame | None = None) -> None: lst_cached_functions=[], ) + if actions_column.button( + ":material/sticky_note_2: Notes", + disabled=not selected or len(selected) != 1, + help="View and add notes for this test definition", + ): + row = selected[0] + test_definition_notes_dialog( + str(row["id"]), + {"table": row["table_name"], "column": row["column_name"], "test": row["test_name_short"]}, + ) + if user_can_edit: if actions_column.button( ":material/edit: Edit", @@ -256,6 +271,7 @@ def render_grid(df: pd.DataFrame, multi_select: bool, filters_changed: bool) -> "test_active_display", "lock_refresh_display", "flagged_display", + "notes_display", "urgency", "export_to_observability_display", "profiling_as_of_date", @@ -274,6 +290,7 @@ def render_grid(df: pd.DataFrame, multi_select: bool, filters_changed: bool) -> "Active", "Locked", "Flagged", + "Notes", "Urgency", "Export to Observabilty", "Based on Profiling", @@ -1258,12 +1275,30 @@ def get_test_definitions( clauses.append(TestDefinition.flagged == False) sort_funcs = {"ASC": asc, "DESC": desc} + + notes_count_expr = ( + sa_select(func.count(TestDefinitionNote.id)) + .where(TestDefinitionNote.test_definition_id == TestDefinition.id) + .correlate(TestDefinition) + .scalar_subquery() + ) + + sort_expressions = { + "flagged": lambda d: sort_funcs[d](case((TestDefinition.flagged == True, 0), else_=1)), + "notes_count": lambda d: sort_funcs[d](case((notes_count_expr > 0, 0), else_=1)), + } + + order_by = [] + if sorting_columns: + for (attribute, direction) in sorting_columns: + if attribute in sort_expressions: + order_by.append(sort_expressions[attribute](direction)) + else: + order_by.append(sort_funcs[direction](func.lower(getattr(TestDefinition, attribute)))) + test_definitions = TestDefinition.select_where( *clauses, - order_by=tuple([ - sort_funcs[direction](func.lower(getattr(TestDefinition, attribute))) - for (attribute, direction) in sorting_columns - ]) if sorting_columns else 
None, + order_by=tuple(order_by) if order_by else None, ) df = to_dataframe(test_definitions) @@ -1274,6 +1309,13 @@ def get_test_definitions( df["test_active_display"] = df["test_active"].apply(lambda value: "Yes" if value else "No") df["lock_refresh_display"] = df["lock_refresh"].apply(lambda value: "Yes" if value else "No") df["flagged_display"] = df["flagged"].apply(lambda value: "Yes" if value else "No") + + if not df.empty: + notes_counts = TestDefinitionNote.get_notes_count_by_ids([str(td_id) for td_id in df["id"]]) + df["notes_count"] = df["id"].map(notes_counts).fillna(0).astype(int) + else: + df["notes_count"] = pd.Series(dtype=int) + df["notes_display"] = df["notes_count"].apply(lambda x: f"📝 {x}" if x > 0 else "") df["urgency"] = df.apply(lambda row: row["severity"] or test_suite.severity or row["default_severity"], axis=1) df["final_test_description"] = df.apply(lambda row: row["test_description"] or row["default_test_description"], axis=1) df["export_uom"] = df.apply(lambda row: row["measure_uom_description"] or row["measure_uom"], axis=1) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 910aa1d1..1e2b9f08 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -41,7 +41,8 @@ from testgen.ui.services.database_service import execute_db_query, fetch_df_from_db, fetch_one_from_db from testgen.ui.services.string_service import snake_case_to_title_case from testgen.ui.session import session -from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button +from testgen.ui.views.dialogs.profiling_results_dialog import profiling_results_dialog +from testgen.ui.views.dialogs.test_definition_notes_dialog import test_definition_notes_dialog from testgen.ui.views.test_definitions import show_test_form_by_id from testgen.utils import friendly_score, str_to_timestamp @@ -86,12 +87,11 @@ def render( ], ) - summary_column, score_column, actions_column, export_button_column = 
st.columns([.3, .15, .3, .15], vertical_alignment="bottom") + summary_column, score_column, export_button_column = st.columns([.35, .15, .5], vertical_alignment="bottom") status_filter_column, table_filter_column, column_filter_column, test_type_filter_column, flagged_filter_column, action_filter_column, sort_column = st.columns( [.15, .175, .175, .15, .1, .15, .1], vertical_alignment="bottom" ) - testgen.flex_row_end(actions_column, wrap=True) testgen.flex_row_end(export_button_column) filters_changed = False @@ -177,6 +177,8 @@ def render( with sort_column: sortable_columns = ( + ("Flagged", "CASE WHEN td.flagged THEN 0 ELSE 1 END"), + ("Has Notes", "CASE WHEN (SELECT COUNT(*) FROM test_definition_notes tdn WHERE tdn.test_definition_id = td.id) > 0 THEN 0 ELSE 1 END"), ("Table", "LOWER(r.table_name)"), ("Columns/Focus", "LOWER(r.column_names)"), ("Test Type", "r.test_type"), @@ -185,10 +187,16 @@ def render( ("Status", "result_status"), ("Action", "r.disposition"), ) - default = [(sortable_columns[i][1], "ASC") for i in (0, 1, 2)] + default = [(sortable_columns[i][1], "ASC") for i in (2, 3, 4)] sorting_columns = testgen.sorting_selector(sortable_columns, default) - with actions_column: + actions_column, disposition_column = st.columns([.5, .5]) + testgen.flex_row_start(actions_column) + testgen.flex_row_end(disposition_column) + + user_can_edit = session.auth.user_has_permission("edit") + + with disposition_column: multi_select = st.toggle( "Multi-Select", help="Toggle on to perform actions on multiple results", @@ -219,6 +227,18 @@ def render( action_map = df_action.set_index("id")["action"].to_dict() df["action"] = df["test_result_id"].map(action_map).fillna(df["action"]) + def build_review_column(row): + parts = [] + if row["action"]: + parts.append(row["action"]) + if row["flagged"]: + parts.append("🚩") + if row.get("notes_count", 0) > 0: + parts.append(f"📝{row['notes_count']}") + return " ".join(parts) + + df["review"] = df.apply(build_review_column, axis=1) 
+ test_suite = TestSuite.get_minimal(run.test_suite_id) table_group = TableGroup.get_minimal(test_suite.table_groups_id) @@ -231,8 +251,7 @@ def render( "result_measure", "measure_uom", "result_status", - "action", - "flagged_display", + "review", "result_message", ], [ @@ -242,14 +261,14 @@ def render( "Result Measure", "Unit of Measure", "Status", - "Action", - "Flagged", + "Review", "Details", ], id_column="test_result_id", selection_mode="multiple" if multi_select else "single", reset_pagination=filters_changed, bind_to_query=True, + column_styles={"review": {"textAlign": "center", "fontSize": "1.1em"}}, ) popover_container = export_button_column.empty() @@ -282,6 +301,91 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: # Need to render toolbar buttons after grid, so selection status is maintained affected_cached_functions = [get_test_disposition, test_result_queries.get_test_results] + # === Action buttons (left side, near the grid) === + + if actions_column.button( + ":material/sticky_note_2: Notes", + disabled=not selected or len(selected) != 1, + help="View and add notes for this test definition", + ): + row = selected[0] + test_definition_notes_dialog( + str(row["test_definition_id"]), + {"table": row["table_name"], "column": row["column_names"], "test": row["test_name_short"]}, + ) + + if actions_column.button( + ":material/edit: Edit Test", + disabled=not selected_row or not user_can_edit, + help="Edit the Test Definition", + ): + show_test_form_by_id(selected_row["test_definition_id"]) + + if actions_column.button( + ":material/visibility: Source Data", + disabled=not selected_row, + help="View current source data for highlighted result", + ): + MixpanelService().send_event( + "view-source-data", + page=PAGE_PATH, + test_type=selected_row["test_name_short"], + ) + source_data_dialog(selected_row) + + can_view_profiling = ( + selected_row + and selected_row.get("test_scope") == "column" + and selected_row.get("column_names") not in 
(None, "(multi-column)", "N/A") + and selected_row.get("table_name") not in (None, "(multi-table)") + ) + if actions_column.button( + ":material/insert_chart: Profiling", + disabled=not can_view_profiling, + help="View profiling for highlighted column", + ): + profiling_results_dialog( + selected_row["column_names"], + selected_row["table_name"], + selected_row["table_groups_id"], + ) + + report_eligible_rows = [ + row for row in selected + if row["result_status"] != "Passed" and row["disposition"] in (None, "Confirmed") + ] if selected else [] + report_btn_help = ( + "Generate PDF reports for the selected results that are not muted or dismissed and are not Passed" + if multi_select + else "Generate PDF report for selected result" + ) + if actions_column.button( + ":material/download: Issue Report", + disabled=not report_eligible_rows, + help=report_btn_help, + ): + MixpanelService().send_event( + "download-issue-report", + page=PAGE_PATH, + issue_count=len(report_eligible_rows), + ) + dialog_title = "Download Issue Report" + if len(report_eligible_rows) == 1: + download_dialog( + dialog_title=dialog_title, + file_content_func=get_report_file_data, + args=(report_eligible_rows[0],), + ) + else: + zip_func = zip_multi_file_data( + "testgen_test_issue_reports.zip", + get_report_file_data, + [(arg,) for arg in selected], + ) + download_dialog(dialog_title=dialog_title, file_content_func=zip_func) + + # === Disposition buttons (right side) === + disposition_actions = [ { "icon": "✓", "help": "Confirm this issue as relevant for this run", "status": "Confirmed" }, { "icon": "✘", "help": "Dismiss this issue as not relevant for this run", "status": "Dismissed" }, @@ -298,7 +402,7 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: or sel["result_status"] == "Passed" for sel in selected ) - action["button"] = actions_column.button(action["icon"], help=action["help"], disabled=disable_dispo) + action["button"] = disposition_column.button(action["icon"], 
help=action["help"], disabled=disable_dispo) # This has to be done as a second loop - otherwise, the rest of the buttons after the clicked one are not displayed briefly while refreshing for action in disposition_actions: @@ -317,7 +421,7 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: ] for flag_action in flag_actions: flag_disabled = not selected or all(sel["flagged"] == flag_action["value"] for sel in selected) - flag_action["button"] = actions_column.button(flag_action["icon"], help=flag_action["help"], disabled=flag_disabled) + flag_action["button"] = disposition_column.button(flag_action["icon"], help=flag_action["help"], disabled=flag_disabled) for flag_action in flag_actions: if flag_action["button"]: @@ -335,14 +439,8 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: with score_column: render_score(run.project_code, run_id) - if selected: - render_selected_details( - selected, - selected_row, - test_suite, - session.auth.user_has_permission("edit"), - multi_select, - ) + if selected_row: + render_selected_details(selected_row, test_suite) # Help Links st.markdown("[Help on Test Types](https://docs.datakitchen.io/article/dataops-testgen-help/testgen-test-types)") @@ -541,109 +639,34 @@ def readable_boolean(v: bool): @with_database_session def render_selected_details( - selected_rows: list[dict], selected_item: dict, test_suite: TestSuiteMinimal, - user_can_edit: bool, - multi_select: bool = False, ) -> None: - if not selected_rows: - st.markdown(":orange[Select a record to see more information.]") - else: - pg_col1, pg_col2 = st.columns([0.5, 0.5]) - - with pg_col2: - v_col1, v_col2, v_col3, v_col4 = st.columns([.25, .25, .25, .25]) - - if selected_item: - dfh = test_result_queries.get_test_result_history(selected_item) - show_hist_columns = ["test_date", "threshold_value", "result_measure", "result_status"] - - time_columns = ["test_date"] - date_service.accommodate_dataframe_to_timezone(dfh, st.session_state, 
time_columns) - - if user_can_edit: - view_edit_test(v_col1, selected_item["test_definition_id"]) - - if selected_item["test_scope"] == "column": - with v_col2: - view_profiling_button( - selected_item["column_names"], - selected_item["table_name"], - selected_item["table_groups_id"], - ) - - with v_col3: - if st.button( - ":material/visibility: Source Data", help="View current source data for highlighted result", - use_container_width=True - ): - MixpanelService().send_event( - "view-source-data", - page=PAGE_PATH, - test_type=selected_item["test_name_short"], - ) - source_data_dialog(selected_item) - - with v_col4: - - report_eligible_rows = [ - row for row in selected_rows - if row["result_status"] != "Passed" and row["disposition"] in (None, "Confirmed") - ] - - if multi_select: - report_btn_help = ( - "Generate PDF reports for the selected results that are not muted or dismissed and are not Passed" - ) + dfh = test_result_queries.get_test_result_history(selected_item) + show_hist_columns = ["test_date", "threshold_value", "result_measure", "result_status"] + + time_columns = ["test_date"] + date_service.accommodate_dataframe_to_timezone(dfh, st.session_state, time_columns) + + pg_col1, pg_col2 = st.columns([0.5, 0.5]) + + with pg_col1: + fm.show_subheader(selected_item["test_name_short"]) + st.markdown(f"###### {selected_item['test_description']}") + if selected_item["measure_uom_description"]: + st.caption(selected_item["measure_uom_description"]) + if selected_item["result_message"]: + st.caption(selected_item["result_message"].replace("*", "\\*")) + fm.render_grid_select(dfh, show_hist_columns, selection_mode="disabled", key="test_history") + with pg_col2: + ut_tab1, ut_tab2 = st.tabs(["History", "Test Definition"]) + with ut_tab1: + if dfh.empty: + st.write("Test history not available.") else: - report_btn_help = "Generate PDF report for selected result" - - if st.button( - ":material/download: Issue Report", - use_container_width=True, - disabled=not 
report_eligible_rows, - help=report_btn_help, - ): - MixpanelService().send_event( - "download-issue-report", - page=PAGE_PATH, - issue_count=len(report_eligible_rows), - ) - dialog_title = "Download Issue Report" - if len(report_eligible_rows) == 1: - download_dialog( - dialog_title=dialog_title, - file_content_func=get_report_file_data, - args=(report_eligible_rows[0],), - ) - else: - zip_func = zip_multi_file_data( - "testgen_test_issue_reports.zip", - get_report_file_data, - [(arg,) for arg in selected_rows], - ) - download_dialog(dialog_title=dialog_title, file_content_func=zip_func) - - if selected_item: - with pg_col1: - fm.show_subheader(selected_item["test_name_short"]) - st.markdown(f"###### {selected_item['test_description']}") - if selected_item["measure_uom_description"]: - st.caption(selected_item["measure_uom_description"]) - if selected_item["result_message"]: - st.caption(selected_item["result_message"].replace("*", "\\*")) - fm.render_grid_select(dfh, show_hist_columns, selection_mode="disabled", key="test_history") - with pg_col2: - ut_tab1, ut_tab2 = st.tabs(["History", "Test Definition"]) - with ut_tab1: - if dfh.empty: - st.write("Test history not available.") - else: - # write_history_graph(dfh) - write_history_chart_v2(dfh) - with ut_tab2: - show_test_def_detail(selected_item["test_definition_id"], test_suite) + write_history_chart_v2(dfh) + with ut_tab2: + show_test_def_detail(selected_item["test_definition_id"], test_suite) @with_database_session @@ -888,13 +911,6 @@ def source_data_dialog(selected_row): st.dataframe(df_bad, width=1050, hide_index=True) -def view_edit_test(button_container, test_definition_id): - if test_definition_id: - with button_container: - if st.button(":material/edit: Edit Test", help="Edit the Test Definition", use_container_width=True): - show_test_form_by_id(test_definition_id) - - def get_report_file_data(update_progress, tr_data) -> FILE_DATA_TYPE: tr_id = tr_data["test_result_id"][:8] tr_time = 
pd.Timestamp(tr_data["test_date"]).strftime("%Y%m%d_%H%M%S") From 07b63272b4a68d9852dcce1656b07f49ec6b1830 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 13 Feb 2026 10:24:40 -0300 Subject: [PATCH 03/95] test: extract testable logic from command functions and add 67 unit tests Extract pure decision/transformation logic from I/O-heavy command functions into standalone functions, then add unit tests for each. No behavioral changes. Extractions: - calculate_sampling_params() from run_profiling.py into profiling_query.py - collect_test_identifiers() and check_identifiers() from run_test_validation.py - build_cat_expressions(), group_cat_tests(), parse_cat_results() from execute_tests_query.py - calculate_prediction_tolerances() and Z_SCORE_MAP from test_thresholds_prediction.py Tests for _score_card_to_results() added directly (already a standalone function). Co-Authored-By: Claude Opus 4.6 --- .../commands/queries/execute_tests_query.py | 256 +++++++++----- testgen/commands/queries/profiling_query.py | 34 ++ testgen/commands/run_profiling.py | 26 +- testgen/commands/run_test_validation.py | 90 +++-- .../commands/test_thresholds_prediction.py | 64 ++-- .../queries/test_execute_tests_query.py | 314 ++++++++++++++++++ .../commands/queries/test_profiling_query.py | 93 ++++++ .../unit/commands/test_run_test_validation.py | 248 ++++++++++++++ tests/unit/commands/test_score_cards.py | 82 +++++ .../commands/test_thresholds_prediction.py | 93 ++++++ 10 files changed, 1163 insertions(+), 137 deletions(-) create mode 100644 tests/unit/commands/queries/test_execute_tests_query.py create mode 100644 tests/unit/commands/queries/test_profiling_query.py create mode 100644 tests/unit/commands/test_run_test_validation.py create mode 100644 tests/unit/commands/test_score_cards.py create mode 100644 tests/unit/commands/test_thresholds_prediction.py diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index 
6e3c18df..c853ca9c 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -67,6 +67,150 @@ class AggregateResult(TypedDict): result_codes: str +def build_cat_expressions( + measure: str, + test_operator: str, + test_condition: str, + history_calculation: str, + lower_tolerance: str, + upper_tolerance: str, + varchar_type: str, + concat_operator: str, + null_value: str = "", +) -> tuple[str, str]: + """Build measure_expression and condition_expression for a CAT test. + + Args: + measure: Already-resolved measure SQL expression. + test_operator: Comparison operator (e.g., "=", "BETWEEN"). + test_condition: Already-resolved test condition SQL expression. + history_calculation: "PREDICT" for prediction mode, anything else for normal. + lower_tolerance: Lower tolerance value (empty/None means training mode for PREDICT). + upper_tolerance: Upper tolerance value (empty/None means training mode for PREDICT). + varchar_type: DB-specific varchar type (e.g., "VARCHAR", "STRING"). + concat_operator: DB-specific concat operator (e.g., "||", "+"). + null_value: Sentinel string for NULL values. 
+ + Returns: + (measure_expression, condition_expression) + """ + measure_expression = f"COALESCE(CAST({measure} AS {varchar_type}) {concat_operator} '|', '{null_value}|')" + + # For prediction mode, return -1 during training period + if history_calculation == "PREDICT" and (not lower_tolerance or not upper_tolerance): + condition_expression = "'-1,'" + else: + condition = ( + f"{measure} {test_operator} {test_condition}" + if "BETWEEN" in test_operator + else f"{measure}{test_operator}{test_condition}" + ) + condition_expression = f"CASE WHEN {condition} THEN '0,' ELSE '1,' END" + + return measure_expression, condition_expression + + +def group_cat_tests( + test_defs: list[TestExecutionDef], + max_query_chars: int, + concat_operator: str, + single: bool = False, +) -> list[list[TestExecutionDef]]: + """Group test defs into batches respecting character limit. + + All test defs must have measure_expression and condition_expression set. + + Args: + test_defs: List of test defs with expressions already set. + max_query_chars: Maximum characters per query. + concat_operator: DB-specific concat operator for calculating expression size. + single: If True, put each test def in its own group. + + Returns: + List of groups, where each group is a list of test defs. 
+ """ + if single: + return [[td] for td in test_defs] + + test_defs_by_table: dict[tuple[str, str], list[TestExecutionDef]] = {} + for td in test_defs: + table = (td.schema_name, td.table_name) + if not test_defs_by_table.get(table): + test_defs_by_table[table] = [] + test_defs_by_table[table].append(td) + + groups: list[list[TestExecutionDef]] = [] + for table_test_defs in test_defs_by_table.values(): + current_chars = 0 + current_group: list[TestExecutionDef] = [] + + for td in table_test_defs: + td_chars = len(td.measure_expression) + len(td.condition_expression) + 2 * len(concat_operator) + if (current_chars + td_chars) > max_query_chars: + if current_group: + groups.append(current_group) + current_chars = 0 + current_group = [] + + current_chars += td_chars + current_group.append(td) + + if current_group: + groups.append(current_group) + + return groups + + +def parse_cat_results( + aggregate_results: list[AggregateResult], + aggregate_test_defs: list[list[TestExecutionDef]], + test_run_id: UUID, + test_suite_id: UUID | str, + test_starttime: datetime, + input_parameters_fn, + null_value: str = "", +) -> list[list]: + """Parse aggregate query results into individual test result rows. + + Args: + aggregate_results: List of aggregate result dicts from DB. + aggregate_test_defs: List of test def groups matching the queries. + test_run_id: ID of the current test run. + test_suite_id: ID of the test suite. + test_starttime: Start time of the test run. + input_parameters_fn: Callable that takes a TestExecutionDef and returns input params string. + null_value: Sentinel string for NULL values. + + Returns: + List of result rows (each row is a list of values). 
+ """ + test_results: list[list] = [] + for result in aggregate_results: + test_defs = aggregate_test_defs[result["query_index"]] + result_measures = result["result_measures"].split("|") + result_codes = result["result_codes"].split(",") + + for index, td in enumerate(test_defs): + test_results.append([ + test_run_id, + test_suite_id, + test_starttime, + td.id, + td.test_type, + td.schema_name, + td.table_name, + td.column_name, + td.skip_errors or 0, + input_parameters_fn(td), + result_codes[index], + None, # result_status will be calculated later + None, # No result_message + result_measures[index] if result_measures[index] != null_value else None, + ]) + + return test_results + + class TestExecutionSQL: null_value = "" @@ -175,7 +319,7 @@ def _get_query( query = query.replace(":", "\\:") return query, None if no_bind else params - + def has_schema_changes(self) -> tuple[dict]: # Runs on App database return self._get_query("has_schema_changes.sql") @@ -263,69 +407,37 @@ def aggregate_cat_tests( measure = replace_params(td.measure, params) measure = replace_templated_functions(measure, self.flavor) - td.measure_expression = f"COALESCE(CAST({measure} AS {varchar_type}) {concat_operator} '|', '{self.null_value}|')" - - # For prediction mode, return -1 during training period - if td.history_calculation == "PREDICT" and (not td.lower_tolerance or not td.upper_tolerance): - td.condition_expression = "'-1,'" - else: - condition = ( - f"{td.measure} {td.test_operator} {td.test_condition}" - if "BETWEEN" in td.test_operator - else f"{td.measure}{td.test_operator}{td.test_condition}" - ) - condition = replace_params(condition, params) - condition = replace_templated_functions(condition, self.flavor) - td.condition_expression = f"CASE WHEN {condition} THEN '0,' ELSE '1,' END" + condition = replace_params(td.test_condition, params) + condition = replace_templated_functions(condition, self.flavor) + + td.measure_expression, td.condition_expression = build_cat_expressions( + 
measure=measure, + test_operator=td.test_operator, + test_condition=condition, + history_calculation=td.history_calculation, + lower_tolerance=td.lower_tolerance, + upper_tolerance=td.upper_tolerance, + varchar_type=varchar_type, + concat_operator=concat_operator, + null_value=self.null_value, + ) + + max_query_chars = self.connection.max_query_chars - 400 + groups = group_cat_tests(test_defs, max_query_chars, concat_operator, single) aggregate_queries: list[tuple[str, None]] = [] aggregate_test_defs: list[list[TestExecutionDef]] = [] - - def add_query(test_defs: list[TestExecutionDef]) -> str: - if not test_defs: - return - + for group in groups: query = ( f"SELECT {len(aggregate_queries)} AS query_index, " - f"{concat_operator.join([td.measure_expression for td in test_defs])} AS result_measures, " - f"{concat_operator.join([td.condition_expression for td in test_defs])} AS result_codes " - f"FROM {quote}{test_defs[0].schema_name}{quote}.{quote}{test_defs[0].table_name}{quote}" + f"{concat_operator.join([td.measure_expression for td in group])} AS result_measures, " + f"{concat_operator.join([td.condition_expression for td in group])} AS result_codes " + f"FROM {quote}{group[0].schema_name}{quote}.{quote}{group[0].table_name}{quote}" ) query = query.replace(":", "\\:") aggregate_queries.append((query, None)) - aggregate_test_defs.append(test_defs) - - if single: - for td in test_defs: - # Add separate query for each test - add_query([td]) - else: - test_defs_by_table: dict[tuple[str, str], list[TestExecutionDef]] = {} - for td in test_defs: - table = (td.schema_name, td.table_name) - if not test_defs_by_table.get(table): - test_defs_by_table[table] = [] - test_defs_by_table[table].append(td) - - max_query_chars = self.connection.max_query_chars - 400 - for test_defs in test_defs_by_table.values(): - # Add new query for each table - current_chars = 0 - current_test_defs = [] - - for td in test_defs: - td_chars = len(td.measure_expression) + 
len(td.condition_expression) + 2 * len(concat_operator) - # Add new query if current query will become bigger than character limit - if (current_chars + td_chars) > max_query_chars: - add_query(current_test_defs) - current_chars = 0 - current_test_defs = [] - - current_chars += td_chars - current_test_defs.append(td) - - add_query(current_test_defs) + aggregate_test_defs.append(group) return aggregate_queries, aggregate_test_defs @@ -334,31 +446,15 @@ def get_cat_test_results( aggregate_results: list[AggregateResult], aggregate_test_defs: list[list[TestExecutionDef]], ) -> list[list[UUID | str | datetime | int | None]]: - test_results: list[list[UUID | str | datetime | int | None]] = [] - for result in aggregate_results: - test_defs = aggregate_test_defs[result["query_index"]] - result_measures = result["result_measures"].split("|") - result_codes = result["result_codes"].split(",") - - for index, td in enumerate(test_defs): - test_results.append([ - self.test_run.id, - self.test_run.test_suite_id, - self.test_run.test_starttime, - td.id, - td.test_type, - td.schema_name, - td.table_name, - td.column_name, - td.skip_errors or 0, - self._get_input_parameters(td), - result_codes[index], - None, # result_status will be calculated later - None, # No result_message - result_measures[index] if result_measures[index] != self.null_value else None, - ]) - - return test_results + return parse_cat_results( + aggregate_results=aggregate_results, + aggregate_test_defs=aggregate_test_defs, + test_run_id=self.test_run.id, + test_suite_id=self.test_run.test_suite_id, + test_starttime=self.test_run.test_starttime, + input_parameters_fn=self._get_input_parameters, + null_value=self.null_value, + ) def update_test_results(self) -> list[tuple[str, dict]]: # Runs on App database diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index c1ec78fe..4f67fde6 100644 --- a/testgen/commands/queries/profiling_query.py +++ 
b/testgen/commands/queries/profiling_query.py @@ -19,6 +19,40 @@ class TableSampling: sample_percent: float +def calculate_sampling_params( + table_name: str, + record_count: int, + sample_percent_raw: str | float, + min_sample: int, + max_sample: int = 999000, +) -> TableSampling | None: + """Calculate sampling parameters for a table based on record count and sample percent. + + Returns None if sampling is not applicable (invalid percent, or record_count <= min_sample). + """ + if isinstance(sample_percent_raw, str): + cleaned = sample_percent_raw.replace(".", "", 1) if sample_percent_raw else "" + sample_percent = float(sample_percent_raw) if cleaned.isdigit() else 30 + else: + sample_percent = float(sample_percent_raw) if sample_percent_raw is not None else 30 + + if not (0 < sample_percent < 100): + return None + + if record_count <= min_sample: + return None + + calc_sample = round(sample_percent * record_count / 100) + sample_count = min(max(calc_sample, min_sample), max_sample) + + return TableSampling( + table_name=table_name, + sample_count=sample_count, + sample_ratio=record_count / sample_count, + sample_percent=round(100 * sample_count / record_count, 4), + ) + + @dataclasses.dataclass class HygieneIssueType: id: str diff --git a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py index 2d9499ff..adcdedd6 100644 --- a/testgen/commands/run_profiling.py +++ b/testgen/commands/run_profiling.py @@ -6,7 +6,7 @@ import testgen.common.process_service as process_service from testgen import settings -from testgen.commands.queries.profiling_query import HygieneIssueType, ProfilingSQL, TableSampling +from testgen.commands.queries.profiling_query import HygieneIssueType, ProfilingSQL, TableSampling, calculate_sampling_params from testgen.commands.queries.refresh_data_chars_query import ColumnChars from testgen.commands.queries.rollup_scores_query import RollupScoresSQL from testgen.commands.run_refresh_data_chars import run_data_chars_refresh @@ 
-142,25 +142,17 @@ def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnCh LOG.info(f"Running column profiling queries: {len(data_chars)}") table_group = sql_generator.table_group sampling_params: dict[str, TableSampling] = {} - sample_percent = ( - float(table_group.profile_sample_percent) - if str(table_group.profile_sample_percent).replace(".", "", 1).isdigit() - else 30 - ) - if table_group.profile_use_sampling and 0 < sample_percent < 100: - min_sample = table_group.profile_sample_min_count - max_sample = 999000 + if table_group.profile_use_sampling: for column in data_chars: - if not sampling_params.get(column.table_name) and column.record_ct > min_sample: - calc_sample = round(sample_percent * column.record_ct / 100) - sample_count = min(max(calc_sample, min_sample), max_sample) - - sampling_params[column.table_name] = TableSampling( + if not sampling_params.get(column.table_name): + result = calculate_sampling_params( table_name=column.table_name, - sample_count=sample_count, - sample_ratio=column.record_ct / sample_count, - sample_percent=round(100 * sample_count / column.record_ct, 4), + record_count=column.record_ct, + sample_percent_raw=table_group.profile_sample_percent, + min_sample=table_group.profile_sample_min_count, ) + if result: + sampling_params[column.table_name] = result def update_column_progress(progress: ThreadedProgress) -> None: profiling_run.set_progress( diff --git a/testgen/commands/run_test_validation.py b/testgen/commands/run_test_validation.py index 55fb6185..cdb961be 100644 --- a/testgen/commands/run_test_validation.py +++ b/testgen/commands/run_test_validation.py @@ -9,20 +9,29 @@ LOG = logging.getLogger("testgen") -def run_test_validation(sql_generator: TestExecutionSQL, test_defs: list[TestExecutionDef]) -> list[TestExecutionDef]: - test_defs_by_id: dict[UUID, TestExecutionDef] = {td.id: td for td in test_defs} +def collect_test_identifiers( + test_defs: list[TestExecutionDef], + quote_char: str, +) -> 
tuple[dict[tuple[str, str, str | None], set[UUID]], set[str], dict[UUID, list[str]]]: + """Collect identifiers (schema, table, column) that need validation from test definitions. + + Returns: + identifiers_to_check: {(schema, table, column|None): {test_ids}} + target_schemas: set of schemas to query + errors: {test_id: [error_messages]} + """ identifiers_to_check: dict[tuple[str, str, str | None], set[UUID]] = {} - target_schemas = set() - quote = sql_generator.flavor_service.quote_character + target_schemas: set[str] = set() + errors: dict[UUID, list[str]] = {} def add_identifiers(test_id: UUID, schema: str, table: str, columns: str | None = None, single_column: bool = False) -> None: target_schemas.add(schema) if columns: if single_column: - identifiers = [(schema.lower(), table.lower(), columns.strip(f" {quote}").lower())] + identifiers = [(schema.lower(), table.lower(), columns.strip(f" {quote_char}").lower())] else: - column_names = re.split(rf",(?=(?:[^\{quote}]*\{quote}[^\{quote}]*\{quote})*[^\{quote}]*$)", columns) - column_names = [col.strip(f" {quote}") for col in column_names] + column_names = re.split(rf",(?=(?:[^\{quote_char}]*\{quote_char}[^\{quote_char}]*\{quote_char})*[^\{quote_char}]*$)", columns) + column_names = [col.strip(f" {quote_char}") for col in column_names] identifiers = [(schema.lower(), table.lower(), col.lower()) for col in column_names if col] else: identifiers = [(schema.lower(), table.lower(), None)] @@ -32,11 +41,10 @@ def add_identifiers(test_id: UUID, schema: str, table: str, columns: str | None identifiers_to_check[key] = set() identifiers_to_check[key].add(test_id) - def add_test_error(test_ids: list[UUID], error: str) -> None: - for test_id in test_ids: - if not test_defs_by_id[test_id].errors: - test_defs_by_id[test_id].errors.append("Deactivated") - test_defs_by_id[test_id].errors.append(error) + def add_error(test_id: UUID, error: str) -> None: + if test_id not in errors: + errors[test_id] = ["Deactivated"] + 
errors[test_id].append(error) for td in test_defs: # No validation needed for custom query or table group tests @@ -64,9 +72,50 @@ def add_test_error(test_ids: list[UUID], error: str) -> None: if td.match_groupby_names: add_identifiers(td.id, td.match_schema_name, td.match_table_name, td.match_groupby_names) else: - add_test_error([td.id], "Invalid test: match schema, table, or column not defined") + add_error(td.id, "Invalid test: match schema, table, or column not defined") + else: + add_error(td.id, "Invalid test: schema, table, or column not defined") + + return identifiers_to_check, target_schemas, errors + + +def check_identifiers( + identifiers_to_check: dict[tuple[str, str, str | None], set[UUID]], + target_tables: set[tuple[str, str]], + target_columns: set[tuple[str, str, str]], +) -> dict[UUID, list[str]]: + """Check collected identifiers against actual target tables/columns. + + Returns {test_id: [error_messages]} for identifiers that don't exist. + """ + errors: dict[UUID, list[str]] = {} + + for identifier, test_ids in identifiers_to_check.items(): + table = (identifier[0], identifier[1]) + if table not in target_tables: + error = f"Missing table: {'.'.join(table)}" + elif identifier[2] and identifier not in target_columns: + error = f"Missing column: {'.'.join(identifier)}" else: - add_test_error([td.id], "Invalid test: schema, table, or column not defined") + continue + + for test_id in test_ids: + if test_id not in errors: + errors[test_id] = ["Deactivated"] + errors[test_id].append(error) + + return errors + + +def run_test_validation(sql_generator: TestExecutionSQL, test_defs: list[TestExecutionDef]) -> list[TestExecutionDef]: + quote = sql_generator.flavor_service.quote_character + + identifiers_to_check, target_schemas, collection_errors = collect_test_identifiers(test_defs, quote) + + # Apply collection errors to test defs + test_defs_by_id: dict[UUID, TestExecutionDef] = {td.id: td for td in test_defs} + for test_id, error_list in 
collection_errors.items(): + test_defs_by_id[test_id].errors = error_list if target_schemas: LOG.info("Getting tables and columns in target schemas for validation") @@ -84,12 +133,13 @@ def add_test_error(test_ids: list[UUID], error: str) -> None: for item in target_identifiers } - for identifier, test_ids in identifiers_to_check.items(): - table = (identifier[0], identifier[1]) - if table not in target_tables: - add_test_error(test_ids, f"Missing table: {'.'.join(table)}") - elif identifier[2] and identifier not in target_columns: - add_test_error(test_ids, f"Missing column: {'.'.join(identifier)}") + check_errors = check_identifiers(identifiers_to_check, target_tables, target_columns) + for test_id, error_list in check_errors.items(): + if not test_defs_by_id[test_id].errors: + test_defs_by_id[test_id].errors = error_list + else: + # Skip "Deactivated" prefix since it's already there from collection_errors or we add it + test_defs_by_id[test_id].errors.extend(error_list[1:] if test_defs_by_id[test_id].errors else error_list) error_results = sql_generator.get_test_errors(test_defs_by_id.values()) if error_results: diff --git a/testgen/commands/test_thresholds_prediction.py b/testgen/commands/test_thresholds_prediction.py index 7641b78b..b90fc0d3 100644 --- a/testgen/commands/test_thresholds_prediction.py +++ b/testgen/commands/test_thresholds_prediction.py @@ -17,6 +17,45 @@ LOG = logging.getLogger("testgen") +Z_SCORE_MAP = { + ("lower_tolerance", PredictSensitivity.low): -2.0, # 2.5th percentile + ("lower_tolerance", PredictSensitivity.medium): -1.5, # 7th percentile + ("lower_tolerance", PredictSensitivity.high): -1.0, # 16th percentile + ("upper_tolerance", PredictSensitivity.high): 1.0, # 84th percentile + ("upper_tolerance", PredictSensitivity.medium): 1.5, # 93rd percentile + ("upper_tolerance", PredictSensitivity.low): 2.0, # 97.5th percentile +} + + +def calculate_prediction_tolerances( + forecast: pd.DataFrame, + sensitivity: PredictSensitivity, + 
z_score_map: dict | None = None, +) -> tuple[float | None, float | None, str | None]: + """Compute lower/upper tolerance from a SARIMAX forecast using z-score map. + + The forecast DataFrame must have 'mean' and 'se' columns, indexed by date. + Tolerances are computed for the first forecast date at the given sensitivity. + + Returns: + (lower_tolerance, upper_tolerance, forecast_json) or (None, None, None) if NaN. + """ + if z_score_map is None: + z_score_map = Z_SCORE_MAP + + for key, z_score in z_score_map.items(): + column = f"{key[0]}|{key[1].value}" + forecast[column] = forecast["mean"] + (z_score * forecast["se"]) + + next_date = forecast.index[0] + lower_tolerance = forecast.at[next_date, f"lower_tolerance|{sensitivity.value}"] + upper_tolerance = forecast.at[next_date, f"upper_tolerance|{sensitivity.value}"] + + if pd.isna(lower_tolerance) or pd.isna(upper_tolerance): + return None, None, None + + return lower_tolerance, upper_tolerance, forecast.to_json() + class TestThresholdsPrediction: staging_table = "stg_test_definition_updates" @@ -29,14 +68,7 @@ class TestThresholdsPrediction: "prediction", ) num_forecast = 10 - z_score_map: ClassVar = { - ("lower_tolerance", PredictSensitivity.low): -2.0, # 2.5th percentile - ("lower_tolerance", PredictSensitivity.medium): -1.5, # 7th percentile - ("lower_tolerance", PredictSensitivity.high): -1.0, # 16th percentile - ("upper_tolerance", PredictSensitivity.high): 1.0, # 84th percentile - ("upper_tolerance", PredictSensitivity.medium): 1.5, # 93rd percentile - ("upper_tolerance", PredictSensitivity.low): 2.0, # 97.5th percentile - } + z_score_map: ClassVar = Z_SCORE_MAP def __init__(self, test_suite: TestSuite, run_date: datetime): self.test_suite = test_suite @@ -71,19 +103,11 @@ def run(self) -> None: ] if self.test_suite.predict_holiday_codes else None, ) - for key, z_score in self.z_score_map.items(): - column = f"{key[0]}|{key[1].value}" - forecast[column] = forecast["mean"] + (z_score * forecast["se"]) - - 
next_date = forecast.index[0] sensitivity = self.test_suite.predict_sensitivity or PredictSensitivity.medium - lower_tolerance = forecast.at[next_date, f"lower_tolerance|{sensitivity.value}"] - upper_tolerance = forecast.at[next_date, f"upper_tolerance|{sensitivity.value}"] - - if pd.isna(lower_tolerance) or pd.isna(upper_tolerance): - test_prediction.extend([None, None, None]) - else: - test_prediction.extend([lower_tolerance, upper_tolerance, forecast.to_json()]) + lower, upper, forecast_json = calculate_prediction_tolerances( + forecast, sensitivity, self.z_score_map, + ) + test_prediction.extend([lower, upper, forecast_json]) except NotEnoughData: test_prediction.extend([None, None, None]) else: diff --git a/tests/unit/commands/queries/test_execute_tests_query.py b/tests/unit/commands/queries/test_execute_tests_query.py new file mode 100644 index 00000000..0f4bcf4c --- /dev/null +++ b/tests/unit/commands/queries/test_execute_tests_query.py @@ -0,0 +1,314 @@ +from datetime import datetime, UTC +from uuid import uuid4 + +import pytest + +from testgen.commands.queries.execute_tests_query import ( + TestExecutionDef, + build_cat_expressions, + group_cat_tests, + parse_cat_results, +) + +pytestmark = pytest.mark.unit + + +def _make_td(**overrides) -> TestExecutionDef: + """Build a minimal TestExecutionDef with sensible defaults.""" + defaults = dict( + id=uuid4(), + test_type="Alpha", + schema_name="public", + table_name="orders", + column_name="amount", + skip_errors=0, + history_calculation="NONE", + custom_query="", + run_type="CAT", + test_scope="column", + template="", + measure="COUNT(*)", + test_operator=">=", + test_condition="100", + baseline_ct="", + baseline_unique_ct="", + baseline_value="", + baseline_value_ct="", + threshold_value="", + baseline_sum="", + baseline_avg="", + baseline_sd="", + lower_tolerance="", + upper_tolerance="", + subset_condition="", + groupby_names="", + having_condition="", + window_date_column="", + window_days="", + 
match_schema_name="", + match_table_name="", + match_column_names="", + match_subset_condition="", + match_groupby_names="", + match_having_condition="", + ) + defaults.update(overrides) + return TestExecutionDef(**defaults) + + +def _make_input_params_fn(): + return lambda td: f"params_for_{td.test_type}" + + +# --- build_cat_expressions --- + + +def test_build_basic_measure_with_coalesce_cast(): + measure_expr, _ = build_cat_expressions( + measure="COUNT(*)", + test_operator=">=", + test_condition="100", + history_calculation="NONE", + lower_tolerance="10", + upper_tolerance="200", + varchar_type="VARCHAR", + concat_operator="||", + ) + assert "COALESCE(CAST(COUNT(*) AS VARCHAR)" in measure_expr + assert "||" in measure_expr + assert "'|'" in measure_expr + assert "|" in measure_expr + + +def test_build_normal_pass_fail_condition(): + _, cond_expr = build_cat_expressions( + measure="COUNT(*)", + test_operator=">=", + test_condition="100", + history_calculation="NONE", + lower_tolerance="10", + upper_tolerance="200", + varchar_type="VARCHAR", + concat_operator="||", + ) + assert "CASE WHEN" in cond_expr + assert "COUNT(*)>=100" in cond_expr + assert "THEN '0,'" in cond_expr + assert "ELSE '1,'" in cond_expr + + +def test_build_between_operator_spacing(): + _, cond_expr = build_cat_expressions( + measure="AVG(price)", + test_operator=" BETWEEN ", + test_condition="10 AND 200", + history_calculation="NONE", + lower_tolerance="10", + upper_tolerance="200", + varchar_type="VARCHAR", + concat_operator="||", + ) + # BETWEEN branch uses f"{measure} {operator} {condition}" — double spaces expected + # since operator already includes spaces + assert "AVG(price) BETWEEN 10 AND 200" in cond_expr + + +def test_build_non_between_operator_no_spacing(): + _, cond_expr = build_cat_expressions( + measure="COUNT(*)", + test_operator="<=", + test_condition="500", + history_calculation="NONE", + lower_tolerance="10", + upper_tolerance="200", + varchar_type="VARCHAR", + 
concat_operator="||", + ) + assert "COUNT(*)<=500" in cond_expr + + +def test_build_prediction_mode_training(): + """PREDICT mode without tolerances should return -1 (training).""" + _, cond_expr = build_cat_expressions( + measure="COUNT(*)", + test_operator=">=", + test_condition="100", + history_calculation="PREDICT", + lower_tolerance="", + upper_tolerance="", + varchar_type="VARCHAR", + concat_operator="||", + ) + assert cond_expr == "'-1,'" + + +def test_build_prediction_mode_with_tolerances(): + """PREDICT mode with tolerances should produce normal condition.""" + _, cond_expr = build_cat_expressions( + measure="COUNT(*)", + test_operator=">=", + test_condition="100", + history_calculation="PREDICT", + lower_tolerance="50", + upper_tolerance="200", + varchar_type="VARCHAR", + concat_operator="||", + ) + assert "CASE WHEN" in cond_expr + + +def test_build_prediction_partial_tolerance_is_training(): + """PREDICT with only lower tolerance set should still be training mode.""" + _, cond_expr = build_cat_expressions( + measure="COUNT(*)", + test_operator=">=", + test_condition="100", + history_calculation="PREDICT", + lower_tolerance="50", + upper_tolerance="", + varchar_type="VARCHAR", + concat_operator="||", + ) + assert cond_expr == "'-1,'" + + +def test_build_custom_null_value(): + measure_expr, _ = build_cat_expressions( + measure="COUNT(*)", + test_operator=">=", + test_condition="100", + history_calculation="NONE", + lower_tolerance="", + upper_tolerance="", + varchar_type="VARCHAR", + concat_operator="||", + null_value="MISSING", + ) + assert "'MISSING|'" in measure_expr + + +# --- group_cat_tests --- + + +def test_group_single_mode(): + tds = [_make_td(measure_expression="m1", condition_expression="c1"), + _make_td(measure_expression="m2", condition_expression="c2")] + groups = group_cat_tests(tds, max_query_chars=10000, concat_operator="||", single=True) + assert len(groups) == 2 + assert len(groups[0]) == 1 + assert len(groups[1]) == 1 + + +def 
test_group_all_fit_in_one(): + tds = [_make_td(measure_expression="m1", condition_expression="c1"), + _make_td(measure_expression="m2", condition_expression="c2")] + groups = group_cat_tests(tds, max_query_chars=10000, concat_operator="||") + assert len(groups) == 1 + assert len(groups[0]) == 2 + + +def test_group_character_overflow_splits(): + # Each td takes len("m"*50) + len("c"*50) + 2*len("||") = 104 chars + tds = [_make_td(measure_expression="m" * 50, condition_expression="c" * 50) for _ in range(3)] + # max_query_chars = 250 fits 2 tds (208 <= 250), third overflows (312 > 250) + groups = group_cat_tests(tds, max_query_chars=250, concat_operator="||") + assert len(groups) == 2 + assert len(groups[0]) == 2 + assert len(groups[1]) == 1 + + +def test_group_different_tables_separate(): + td1 = _make_td(schema_name="public", table_name="orders", + measure_expression="m1", condition_expression="c1") + td2 = _make_td(schema_name="public", table_name="customers", + measure_expression="m2", condition_expression="c2") + groups = group_cat_tests([td1, td2], max_query_chars=10000, concat_operator="||") + assert len(groups) == 2 + + +def test_group_empty_input(): + groups = group_cat_tests([], max_query_chars=10000, concat_operator="||") + assert groups == [] + + +def test_group_same_table_together(): + tds = [_make_td(schema_name="s", table_name="t", + measure_expression="m", condition_expression="c") for _ in range(5)] + groups = group_cat_tests(tds, max_query_chars=10000, concat_operator="||") + assert len(groups) == 1 + assert len(groups[0]) == 5 + + +# --- parse_cat_results --- + + +def test_parse_basic_single_result(): + td = _make_td(test_type="Alpha") + test_defs = [[td]] + results = [{"query_index": 0, "result_measures": "42|", "result_codes": "1,"}] + run_id = uuid4() + suite_id = uuid4() + start = datetime.now(UTC) + + rows = parse_cat_results(results, test_defs, run_id, suite_id, start, + _make_input_params_fn()) + assert len(rows) == 1 + row = rows[0] + 
assert row[0] == run_id + assert row[1] == suite_id + assert row[2] == start + assert row[3] == td.id + assert row[10] == "1" # result_code + assert row[13] == "42" # result_measure + + +def test_parse_null_value_handling(): + td = _make_td() + test_defs = [[td]] + results = [{"query_index": 0, "result_measures": "|", "result_codes": "0,"}] + + rows = parse_cat_results(results, test_defs, uuid4(), uuid4(), + datetime.now(UTC), _make_input_params_fn()) + assert rows[0][13] is None # should become None + + +def test_parse_multi_test_per_query(): + td1 = _make_td(test_type="Alpha") + td2 = _make_td(test_type="Beta") + test_defs = [[td1, td2]] + results = [{"query_index": 0, "result_measures": "10|20|", "result_codes": "1,0,"}] + + rows = parse_cat_results(results, test_defs, uuid4(), uuid4(), + datetime.now(UTC), _make_input_params_fn()) + assert len(rows) == 2 + assert rows[0][13] == "10" + assert rows[1][13] == "20" + assert rows[0][10] == "1" + assert rows[1][10] == "0" + + +def test_parse_multiple_queries(): + td1 = _make_td(test_type="Alpha") + td2 = _make_td(test_type="Beta") + test_defs = [[td1], [td2]] + results = [ + {"query_index": 0, "result_measures": "10|", "result_codes": "1,"}, + {"query_index": 1, "result_measures": "20|", "result_codes": "0,"}, + ] + + rows = parse_cat_results(results, test_defs, uuid4(), uuid4(), + datetime.now(UTC), _make_input_params_fn()) + assert len(rows) == 2 + assert rows[0][4] == "Alpha" + assert rows[1][4] == "Beta" + + +def test_parse_result_code_negative_one(): + """Training mode result (-1) should pass through.""" + td = _make_td() + test_defs = [[td]] + results = [{"query_index": 0, "result_measures": "42|", "result_codes": "-1,"}] + + rows = parse_cat_results(results, test_defs, uuid4(), uuid4(), + datetime.now(UTC), _make_input_params_fn()) + assert rows[0][10] == "-1" diff --git a/tests/unit/commands/queries/test_profiling_query.py b/tests/unit/commands/queries/test_profiling_query.py new file mode 100644 index 
00000000..61ad7df0 --- /dev/null +++ b/tests/unit/commands/queries/test_profiling_query.py @@ -0,0 +1,93 @@ +import pytest + +from testgen.commands.queries.profiling_query import calculate_sampling_params + +pytestmark = pytest.mark.unit + + +# --- calculate_sampling_params --- + + +def test_sampling_basic_calculation(): + result = calculate_sampling_params("orders", 10000, "30", min_sample=100) + assert result is not None + assert result.table_name == "orders" + assert result.sample_count == 3000 + assert result.sample_ratio == pytest.approx(10000 / 3000) + assert result.sample_percent == pytest.approx(30.0) + + +def test_sampling_non_numeric_percent_fallback(): + """Non-numeric string should fall back to 30%.""" + result = calculate_sampling_params("orders", 10000, "abc", min_sample=100) + assert result is not None + assert result.sample_count == 3000 + + +def test_sampling_empty_string_percent_fallback(): + result = calculate_sampling_params("orders", 10000, "", min_sample=100) + assert result is not None + assert result.sample_count == 3000 + + +def test_sampling_none_percent_fallback(): + result = calculate_sampling_params("orders", 10000, None, min_sample=100) + assert result is not None + assert result.sample_count == 3000 + + +def test_sampling_percent_out_of_range_zero(): + result = calculate_sampling_params("orders", 10000, "0", min_sample=100) + assert result is None + + +def test_sampling_percent_out_of_range_100(): + result = calculate_sampling_params("orders", 10000, "100", min_sample=100) + assert result is None + + +def test_sampling_record_count_below_min_sample(): + result = calculate_sampling_params("small_table", 50, "30", min_sample=100) + assert result is None + + +def test_sampling_record_count_equals_min_sample(): + result = calculate_sampling_params("small_table", 100, "30", min_sample=100) + assert result is None + + +def test_sampling_clamped_to_min_sample(): + """When calculated sample is below min_sample, clamp up to min_sample.""" + 
result = calculate_sampling_params("orders", 1000, "5", min_sample=200) + # 5% of 1000 = 50, but min_sample is 200 + assert result is not None + assert result.sample_count == 200 + + +def test_sampling_clamped_to_max_sample(): + """When calculated sample exceeds max, clamp down to max.""" + result = calculate_sampling_params("huge_table", 10_000_000, "50", min_sample=100, max_sample=999000) + # 50% of 10M = 5M, but max is 999000 + assert result is not None + assert result.sample_count == 999000 + + +def test_sampling_ratio_and_percent_math(): + result = calculate_sampling_params("orders", 5000, "20", min_sample=100) + # 20% of 5000 = 1000 + assert result.sample_count == 1000 + assert result.sample_ratio == pytest.approx(5.0) + assert result.sample_percent == pytest.approx(20.0) + + +def test_sampling_float_percent(): + result = calculate_sampling_params("orders", 10000, 25.5, min_sample=100) + # 25.5% of 10000 = 2550 + assert result is not None + assert result.sample_count == 2550 + + +def test_sampling_decimal_string_percent(): + result = calculate_sampling_params("orders", 10000, "15.5", min_sample=100) + assert result is not None + assert result.sample_count == 1550 diff --git a/tests/unit/commands/test_run_test_validation.py b/tests/unit/commands/test_run_test_validation.py new file mode 100644 index 00000000..d37ce81b --- /dev/null +++ b/tests/unit/commands/test_run_test_validation.py @@ -0,0 +1,248 @@ +from uuid import uuid4 + +import pytest + +from testgen.commands.queries.execute_tests_query import TestExecutionDef +from testgen.commands.run_test_validation import check_identifiers, collect_test_identifiers + +pytestmark = pytest.mark.unit + + +def _make_td(**overrides) -> TestExecutionDef: + """Build a minimal TestExecutionDef with sensible defaults.""" + defaults = dict( + id=uuid4(), + test_type="Alpha", + schema_name="public", + table_name="orders", + column_name="amount", + skip_errors=0, + history_calculation="NONE", + custom_query="", + 
run_type="CAT", + test_scope="column", + template="", + measure="", + test_operator="=", + test_condition="", + baseline_ct="", + baseline_unique_ct="", + baseline_value="", + baseline_value_ct="", + threshold_value="", + baseline_sum="", + baseline_avg="", + baseline_sd="", + lower_tolerance="", + upper_tolerance="", + subset_condition="", + groupby_names="", + having_condition="", + window_date_column="", + window_days="", + match_schema_name="", + match_table_name="", + match_column_names="", + match_subset_condition="", + match_groupby_names="", + match_having_condition="", + ) + defaults.update(overrides) + return TestExecutionDef(**defaults) + + +# --- collect_test_identifiers --- + + +def test_collect_custom_type_skipped(): + td = _make_td(test_type="CUSTOM") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert len(identifiers) == 0 + assert len(schemas) == 0 + assert len(errors) == 0 + + +def test_collect_tablegroup_scope_skipped(): + td = _make_td(test_scope="tablegroup") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert len(identifiers) == 0 + + +def test_collect_table_scope_collects_table_only(): + td = _make_td(test_scope="table", column_name="irrelevant") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + # Should have table-level identifier (column=None), not column-level + assert (td.schema_name.lower(), td.table_name.lower(), None) in identifiers + + +def test_collect_column_scope_single_column(): + td = _make_td(test_scope="column", column_name="amount") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert ("public", "orders", "amount") in identifiers + + +def test_collect_column_scope_multi_column(): + """Multi-column scope (not single_column) should split on commas.""" + td = _make_td(test_scope="referential", column_name="col_a,col_b", match_schema_name="", match_table_name="") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + 
assert ("public", "orders", "col_a") in identifiers + assert ("public", "orders", "col_b") in identifiers + + +def test_collect_quoted_multi_column_parsing(): + """Columns with quoted identifiers should be parsed correctly.""" + td = _make_td(test_scope="referential", column_name='"col,a","col_b"', match_schema_name="", match_table_name="") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert ("public", "orders", "col,a") in identifiers + assert ("public", "orders", "col_b") in identifiers + + +def test_collect_groupby_names(): + td = _make_td(groupby_names="region,country") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert ("public", "orders", "region") in identifiers + assert ("public", "orders", "country") in identifiers + + +def test_collect_referential_window_date_column(): + td = _make_td( + test_scope="referential", + column_name="col_a", + window_date_column="created_at", + match_schema_name="public", + match_table_name="customers", + match_column_names="cust_id", + ) + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert ("public", "orders", "created_at") in identifiers + + +def test_collect_referential_match_columns(): + td = _make_td( + test_scope="referential", + column_name="order_id", + match_schema_name="public", + match_table_name="customers", + match_column_names="cust_id", + ) + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert ("public", "customers", "cust_id") in identifiers + + +def test_collect_referential_match_groupby(): + td = _make_td( + test_scope="referential", + column_name="order_id", + match_schema_name="public", + match_table_name="customers", + match_column_names="", + match_groupby_names="region", + ) + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert ("public", "customers", "region") in identifiers + + +def test_collect_referential_missing_match_schema_errors(): + td = _make_td( + 
test_scope="referential", + column_name="order_id", + match_schema_name="", + match_table_name="", + match_column_names="cust_id", + ) + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert td.id in errors + assert any("match schema" in e for e in errors[td.id]) + + +def test_collect_missing_schema_or_table_errors(): + td = _make_td(schema_name="", table_name="") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert td.id in errors + assert any("schema, table, or column not defined" in e for e in errors[td.id]) + + +def test_collect_aggregate_type_validates_table_only(): + td = _make_td(test_type="Aggregate_Balance", test_scope="referential", + column_name="amount", match_schema_name="public", + match_table_name="customers", match_column_names="balance") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + # Table-level check for main table + assert ("public", "orders", None) in identifiers + # Match columns should NOT be checked for Aggregate_ types + assert ("public", "customers", "balance") not in identifiers + + +def test_collect_target_schemas_populated(): + td1 = _make_td(schema_name="schema_a") + td2 = _make_td(schema_name="schema_b") + identifiers, schemas, errors = collect_test_identifiers([td1, td2], '"') + assert "schema_a" in schemas + assert "schema_b" in schemas + + +def test_collect_error_format_starts_with_deactivated(): + td = _make_td(schema_name="", table_name="") + identifiers, schemas, errors = collect_test_identifiers([td], '"') + assert errors[td.id][0] == "Deactivated" + + +# --- check_identifiers --- + + +def test_check_all_identifiers_present(): + test_id = uuid4() + identifiers = {("public", "orders", "amount"): {test_id}} + tables = {("public", "orders")} + columns = {("public", "orders", "amount")} + errors = check_identifiers(identifiers, tables, columns) + assert len(errors) == 0 + + +def test_check_missing_table(): + test_id = uuid4() + identifiers = {("public", 
"orders", None): {test_id}} + tables = set() # No tables exist + columns = set() + errors = check_identifiers(identifiers, tables, columns) + assert test_id in errors + assert any("Missing table" in e for e in errors[test_id]) + + +def test_check_missing_column(): + test_id = uuid4() + identifiers = {("public", "orders", "nonexistent"): {test_id}} + tables = {("public", "orders")} + columns = {("public", "orders", "amount")} # different column + errors = check_identifiers(identifiers, tables, columns) + assert test_id in errors + assert any("Missing column" in e for e in errors[test_id]) + + +def test_check_table_only_identifier_passes(): + """Identifier with column=None should only check table existence.""" + test_id = uuid4() + identifiers = {("public", "orders", None): {test_id}} + tables = {("public", "orders")} + columns = set() + errors = check_identifiers(identifiers, tables, columns) + assert len(errors) == 0 + + +def test_check_multiple_tests_share_identifier(): + id1, id2 = uuid4(), uuid4() + identifiers = {("public", "missing_table", None): {id1, id2}} + tables = set() + columns = set() + errors = check_identifiers(identifiers, tables, columns) + assert id1 in errors + assert id2 in errors + + +def test_check_error_format_starts_with_deactivated(): + test_id = uuid4() + identifiers = {("public", "orders", "bad_col"): {test_id}} + tables = {("public", "orders")} + columns = set() + errors = check_identifiers(identifiers, tables, columns) + assert errors[test_id][0] == "Deactivated" diff --git a/tests/unit/commands/test_score_cards.py b/tests/unit/commands/test_score_cards.py new file mode 100644 index 00000000..a537eee3 --- /dev/null +++ b/tests/unit/commands/test_score_cards.py @@ -0,0 +1,82 @@ +from uuid import uuid4 + +import pytest + +from testgen.commands.run_refresh_score_cards_results import _score_card_to_results + +pytestmark = pytest.mark.unit + + +def _make_score_card(**overrides): + defaults = { + "id": str(uuid4()), + "project_code": 
"test_project", + "name": "Test Score Card", + "score": 85.5, + "cde_score": 90.0, + "profiling_score": 80.0, + "testing_score": 88.0, + "categories": [], + "history": [], + "definition": None, + } + defaults.update(overrides) + return defaults + + +def test_basic_result_count(): + """Should produce 4 base results (score, cde_score, profiling_score, testing_score).""" + card = _make_score_card() + results = _score_card_to_results(card) + assert len(results) == 4 + + +def test_result_categories(): + card = _make_score_card() + results = _score_card_to_results(card) + categories = [r.category for r in results] + assert categories == ["score", "cde_score", "profiling_score", "testing_score"] + + +def test_result_scores_match_card(): + card = _make_score_card(score=85.5, cde_score=90.0, profiling_score=80.0, testing_score=88.0) + results = _score_card_to_results(card) + assert results[0].score == 85.5 + assert results[1].score == 90.0 + assert results[2].score == 80.0 + assert results[3].score == 88.0 + + +def test_definition_id_set(): + card_id = str(uuid4()) + card = _make_score_card(id=card_id) + results = _score_card_to_results(card) + for result in results: + assert str(result.definition_id) == card_id + + +def test_with_categories(): + """Categories from score card should be appended as extra results.""" + card = _make_score_card(categories=[ + {"label": "completeness", "score": 95.0}, + {"label": "accuracy", "score": 72.0}, + ]) + results = _score_card_to_results(card) + assert len(results) == 6 # 4 base + 2 categories + assert results[4].category == "completeness" + assert results[4].score == 95.0 + assert results[5].category == "accuracy" + assert results[5].score == 72.0 + + +def test_empty_categories(): + card = _make_score_card(categories=[]) + results = _score_card_to_results(card) + assert len(results) == 4 + + +def test_none_score_values(): + card = _make_score_card(score=None, cde_score=None, profiling_score=None, testing_score=None) + results = 
_score_card_to_results(card) + for result in results: + assert result.score is None diff --git a/tests/unit/commands/test_thresholds_prediction.py b/tests/unit/commands/test_thresholds_prediction.py new file mode 100644 index 00000000..a9c476d3 --- /dev/null +++ b/tests/unit/commands/test_thresholds_prediction.py @@ -0,0 +1,93 @@ +import json + +import numpy as np +import pandas as pd +import pytest + +from testgen.commands.test_thresholds_prediction import Z_SCORE_MAP, calculate_prediction_tolerances +from testgen.common.models.test_suite import PredictSensitivity + +pytestmark = pytest.mark.unit + + +def _make_forecast(mean_values: list[float], se_values: list[float]) -> pd.DataFrame: + """Build a minimal forecast DataFrame with 'mean' and 'se' columns.""" + dates = pd.date_range("2025-01-01", periods=len(mean_values), freq="D") + return pd.DataFrame({"mean": mean_values, "se": se_values}, index=dates) + + +def test_normal_calculation_medium_sensitivity(): + forecast = _make_forecast([100.0, 105.0], [10.0, 12.0]) + lower, upper, forecast_json = calculate_prediction_tolerances( + forecast, PredictSensitivity.medium, + ) + # medium: lower z=-1.5, upper z=1.5 + # lower = 100 + (-1.5 * 10) = 85.0 + # upper = 100 + (1.5 * 10) = 115.0 + assert lower == pytest.approx(85.0) + assert upper == pytest.approx(115.0) + assert forecast_json is not None + # Verify it's valid JSON + parsed = json.loads(forecast_json) + assert "mean" in parsed + + +def test_high_sensitivity_tighter_bounds(): + forecast = _make_forecast([100.0], [10.0]) + lower, upper, _ = calculate_prediction_tolerances( + forecast, PredictSensitivity.high, + ) + # high: lower z=-1.0, upper z=1.0 + assert lower == pytest.approx(90.0) + assert upper == pytest.approx(110.0) + + +def test_low_sensitivity_wider_bounds(): + forecast = _make_forecast([100.0], [10.0]) + lower, upper, _ = calculate_prediction_tolerances( + forecast, PredictSensitivity.low, + ) + # low: lower z=-2.0, upper z=2.0 + assert lower == 
pytest.approx(80.0) + assert upper == pytest.approx(120.0) + + +def test_nan_in_forecast_returns_none(): + forecast = _make_forecast([float("nan")], [10.0]) + lower, upper, forecast_json = calculate_prediction_tolerances( + forecast, PredictSensitivity.medium, + ) + assert lower is None + assert upper is None + assert forecast_json is None + + +def test_nan_se_returns_none(): + forecast = _make_forecast([100.0], [float("nan")]) + lower, upper, forecast_json = calculate_prediction_tolerances( + forecast, PredictSensitivity.medium, + ) + assert lower is None + assert upper is None + assert forecast_json is None + + +def test_z_score_columns_added_to_forecast(): + """Verify that the z-score tolerance columns are added to the forecast DataFrame.""" + forecast = _make_forecast([100.0, 105.0], [10.0, 12.0]) + calculate_prediction_tolerances(forecast, PredictSensitivity.medium) + # All z-score columns should be present + for key in Z_SCORE_MAP: + col = f"{key[0]}|{key[1].value}" + assert col in forecast.columns + + +def test_uses_first_forecast_date(): + """Tolerances should be computed from the first row of the forecast.""" + forecast = _make_forecast([100.0, 200.0], [10.0, 50.0]) + lower, upper, _ = calculate_prediction_tolerances( + forecast, PredictSensitivity.medium, + ) + # Should use first row (mean=100, se=10), not second (mean=200, se=50) + assert lower == pytest.approx(85.0) + assert upper == pytest.approx(115.0) From b57e70f1d729d4c0176a3581fb8f7e080b60afea Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 13 Feb 2026 11:25:35 -0300 Subject: [PATCH 04/95] fix: resolve ruff linting issues (import sorting, dict literals, unused import) Co-Authored-By: Claude Opus 4.6 --- testgen/commands/run_profiling.py | 7 +- .../queries/test_execute_tests_query.py | 76 +++++++++---------- .../unit/commands/test_run_test_validation.py | 74 +++++++++--------- .../commands/test_thresholds_prediction.py | 1 - 4 files changed, 81 insertions(+), 77 deletions(-) diff --git 
a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py index adcdedd6..ee297040 100644 --- a/testgen/commands/run_profiling.py +++ b/testgen/commands/run_profiling.py @@ -6,7 +6,12 @@ import testgen.common.process_service as process_service from testgen import settings -from testgen.commands.queries.profiling_query import HygieneIssueType, ProfilingSQL, TableSampling, calculate_sampling_params +from testgen.commands.queries.profiling_query import ( + HygieneIssueType, + ProfilingSQL, + TableSampling, + calculate_sampling_params, +) from testgen.commands.queries.refresh_data_chars_query import ColumnChars from testgen.commands.queries.rollup_scores_query import RollupScoresSQL from testgen.commands.run_refresh_data_chars import run_data_chars_refresh diff --git a/tests/unit/commands/queries/test_execute_tests_query.py b/tests/unit/commands/queries/test_execute_tests_query.py index 0f4bcf4c..a78870e1 100644 --- a/tests/unit/commands/queries/test_execute_tests_query.py +++ b/tests/unit/commands/queries/test_execute_tests_query.py @@ -1,4 +1,4 @@ -from datetime import datetime, UTC +from datetime import UTC, datetime from uuid import uuid4 import pytest @@ -15,43 +15,43 @@ def _make_td(**overrides) -> TestExecutionDef: """Build a minimal TestExecutionDef with sensible defaults.""" - defaults = dict( - id=uuid4(), - test_type="Alpha", - schema_name="public", - table_name="orders", - column_name="amount", - skip_errors=0, - history_calculation="NONE", - custom_query="", - run_type="CAT", - test_scope="column", - template="", - measure="COUNT(*)", - test_operator=">=", - test_condition="100", - baseline_ct="", - baseline_unique_ct="", - baseline_value="", - baseline_value_ct="", - threshold_value="", - baseline_sum="", - baseline_avg="", - baseline_sd="", - lower_tolerance="", - upper_tolerance="", - subset_condition="", - groupby_names="", - having_condition="", - window_date_column="", - window_days="", - match_schema_name="", - match_table_name="", - 
match_column_names="", - match_subset_condition="", - match_groupby_names="", - match_having_condition="", - ) + defaults = { + "id": uuid4(), + "test_type": "Alpha", + "schema_name": "public", + "table_name": "orders", + "column_name": "amount", + "skip_errors": 0, + "history_calculation": "NONE", + "custom_query": "", + "run_type": "CAT", + "test_scope": "column", + "template": "", + "measure": "COUNT(*)", + "test_operator": ">=", + "test_condition": "100", + "baseline_ct": "", + "baseline_unique_ct": "", + "baseline_value": "", + "baseline_value_ct": "", + "threshold_value": "", + "baseline_sum": "", + "baseline_avg": "", + "baseline_sd": "", + "lower_tolerance": "", + "upper_tolerance": "", + "subset_condition": "", + "groupby_names": "", + "having_condition": "", + "window_date_column": "", + "window_days": "", + "match_schema_name": "", + "match_table_name": "", + "match_column_names": "", + "match_subset_condition": "", + "match_groupby_names": "", + "match_having_condition": "", + } defaults.update(overrides) return TestExecutionDef(**defaults) diff --git a/tests/unit/commands/test_run_test_validation.py b/tests/unit/commands/test_run_test_validation.py index d37ce81b..8c51fff4 100644 --- a/tests/unit/commands/test_run_test_validation.py +++ b/tests/unit/commands/test_run_test_validation.py @@ -10,43 +10,43 @@ def _make_td(**overrides) -> TestExecutionDef: """Build a minimal TestExecutionDef with sensible defaults.""" - defaults = dict( - id=uuid4(), - test_type="Alpha", - schema_name="public", - table_name="orders", - column_name="amount", - skip_errors=0, - history_calculation="NONE", - custom_query="", - run_type="CAT", - test_scope="column", - template="", - measure="", - test_operator="=", - test_condition="", - baseline_ct="", - baseline_unique_ct="", - baseline_value="", - baseline_value_ct="", - threshold_value="", - baseline_sum="", - baseline_avg="", - baseline_sd="", - lower_tolerance="", - upper_tolerance="", - subset_condition="", - 
groupby_names="", - having_condition="", - window_date_column="", - window_days="", - match_schema_name="", - match_table_name="", - match_column_names="", - match_subset_condition="", - match_groupby_names="", - match_having_condition="", - ) + defaults = { + "id": uuid4(), + "test_type": "Alpha", + "schema_name": "public", + "table_name": "orders", + "column_name": "amount", + "skip_errors": 0, + "history_calculation": "NONE", + "custom_query": "", + "run_type": "CAT", + "test_scope": "column", + "template": "", + "measure": "", + "test_operator": "=", + "test_condition": "", + "baseline_ct": "", + "baseline_unique_ct": "", + "baseline_value": "", + "baseline_value_ct": "", + "threshold_value": "", + "baseline_sum": "", + "baseline_avg": "", + "baseline_sd": "", + "lower_tolerance": "", + "upper_tolerance": "", + "subset_condition": "", + "groupby_names": "", + "having_condition": "", + "window_date_column": "", + "window_days": "", + "match_schema_name": "", + "match_table_name": "", + "match_column_names": "", + "match_subset_condition": "", + "match_groupby_names": "", + "match_having_condition": "", + } defaults.update(overrides) return TestExecutionDef(**defaults) diff --git a/tests/unit/commands/test_thresholds_prediction.py b/tests/unit/commands/test_thresholds_prediction.py index a9c476d3..8f02af1b 100644 --- a/tests/unit/commands/test_thresholds_prediction.py +++ b/tests/unit/commands/test_thresholds_prediction.py @@ -1,6 +1,5 @@ import json -import numpy as np import pandas as pd import pytest From 002d5da36184e0af649969b70c6e01ac80a36d2f Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Thu, 19 Feb 2026 20:43:23 -0300 Subject: [PATCH 05/95] refactor: remove pydantic and streamlit-pydantic dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both pydantic (v1) and streamlit-pydantic are unused — the only consumer (ui/forms.py) has zero imports across the codebase. 
Removing them clears the path for PR 1 to introduce pydantic v2 as a transitive dep of the MCP SDK.
schema_properties.items(): - if property_name in disabled_fields and not property_schema.get("readOnly"): - property_schema["readOnly"] = True - - return schema - - @classmethod - def get_field_label(cls, field_name: str) -> str: - schema = cls.schema() - schema_properties = schema.get("properties", {}) - field_schema = schema_properties[field_name] - return field_schema.get("st_kwargs_label") or field_schema.get("title") - - def _iter(self, *args, **kwargs) -> Generator[tuple[str, Any], None, None]: - """ - NOTE: can be removed in favor of `@computed_field` if - streamlit-pydantic is ever updated to use pydantic 2.0. - """ - - for dict_key, value in super()._iter(*args, **kwargs): - field_descriptor = self.__fields__.get(dict_key) - is_computed_field = ( - field_descriptor is not None - and isinstance(field_descriptor.field_info, ComputedField) - ) - if is_computed_field: - value = field_descriptor.field_info.get_value(self) - yield dict_key, value - - class Config: - extra = Extra.allow - arbitrary_types_allowed = True - - -def computed_field(default=None): - def decorator(method: Callable) -> ComputedField: - return ComputedField(method, default=default) - return decorator - - -class ComputedField(FieldInfo): - def __init__(self, method: Callable, *args, **kwargs): - super().__init__(*args, **kwargs) - self.func = method - - def get_value(self, instance: type[BaseForm]): - return self.func(instance) - - -class ManualRender: - @property - def input_ui(self): - if not getattr(self, "_input_ui", None): - self._input_ui = InputUI( - self.form_key(), - self, # type: ignore - group_optional_fields="no", # type: ignore - lowercase_labels=False, - ignore_empty_values=False, - ) - return self._input_ui - - def form_key(self): - raise NotImplementedError - - def render_input_ui(self, container: DeltaGenerator, session_state: dict) -> "BaseForm": - raise NotImplementedError - - def render_field(self, field_name: str, container: DeltaGenerator | None = None) -> Any: - 
streamlit_container = container or self.input_ui._streamlit_container - model_property = self.input_ui._schema_properties[field_name] - initial_value = getattr(self, field_name, None) or self.input_ui._get_value(field_name) - is_disabled = field_name in getattr(self, "_disabled_fields", set()) - - if is_disabled: - model_property["readOnly"] = True - - if model_property.get("type") != "boolean" and initial_value not in [None, ""]: - model_property["init_value"] = initial_value - - new_value = self.input_ui._render_property(streamlit_container, field_name, model_property) - self.update_field_value(field_name, new_value) - - return new_value - - def update_field_value(self, field_name: str, value: Any) -> Any: - self.input_ui._store_value(field_name, value) - setattr(self, field_name, value) - return value - - def get_field_value(self, field_name: str, latest: bool = False) -> Any: - if latest: - return st.session_state.get(self.get_field_key(field_name)) - return self.input_ui._get_value(field_name) - - def reset_cache(self) -> None: - for field_name in cast(type[BaseForm], type(self)).__fields__.keys(): - st.session_state.pop(self.get_field_key(field_name), None) - st.session_state.pop(self.form_key() + "-data", None) - - def get_field_key(self, field_name: str) -> Any: - return str(self.input_ui._session_state.run_id) + "-" + str(self.input_ui._key) + "-" + field_name From c12355260015cd30875d277f8c863391625da8ff Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Thu, 19 Feb 2026 21:41:35 -0300 Subject: [PATCH 06/95] feat(mcp): add MCP server foundation with JWT auth and ping tool Add FastMCP server with streamable HTTP transport, JWT authentication reusing existing token schema, and a smoke-test ping tool. Extract shared auth building blocks (JWT signing, password verification, permission checking) from UI into common/auth.py so both the Streamlit UI and MCP server use the same logic. 
- Add mcp[cli], uvicorn, PyJWT, bcrypt to core dependencies - Add MCP_PORT, MCP_HOST, MCP_ENABLED settings - Create common/auth.py with shared JWT and password utilities - Refactor ui/auth.py to use common/auth.py - Create mcp/server.py with FastMCP app and ping tool - Create mcp/auth.py for MCP-specific authenticate/validate flows - Add `testgen run-app mcp` and `testgen mcp-token` CLI commands - Add 17 unit tests (9 common auth + 8 MCP auth) Co-Authored-By: Claude Opus 4.6 --- pyproject.toml | 6 ++ testgen/__main__.py | 36 +++++++++ testgen/common/auth.py | 65 ++++++++++++++++ testgen/mcp/__init__.py | 12 +++ testgen/mcp/auth.py | 32 ++++++++ testgen/mcp/server.py | 75 ++++++++++++++++++ testgen/settings.py | 24 ++++++ testgen/ui/auth.py | 14 ++-- tests/unit/common/test_auth.py | 101 ++++++++++++++++++++++++ tests/unit/mcp/__init__.py | 0 tests/unit/mcp/test_auth.py | 138 +++++++++++++++++++++++++++++++++ 11 files changed, 494 insertions(+), 9 deletions(-) create mode 100644 testgen/common/auth.py create mode 100644 testgen/mcp/__init__.py create mode 100644 testgen/mcp/auth.py create mode 100644 testgen/mcp/server.py create mode 100644 tests/unit/common/test_auth.py create mode 100644 tests/unit/mcp/__init__.py create mode 100644 tests/unit/mcp/test_auth.py diff --git a/pyproject.toml b/pyproject.toml index 7e1ef181..0b99f53d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,12 @@ dependencies = [ "matplotlib==3.9.2", "scipy==1.14.1", "jinja2==3.1.6", + + # MCP server + "mcp[cli]==1.26.0", + "uvicorn==0.41.0", + "PyJWT==2.11.0", + "bcrypt==5.0.0", ] [project.optional-dependencies] diff --git a/testgen/__main__.py b/testgen/__main__.py index 98cf9b2c..e3b129e6 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -56,6 +56,8 @@ LOG = logging.getLogger("testgen") APP_MODULES = ["ui", "scheduler"] +if settings.MCP_ENABLED: + APP_MODULES.append("mcp") VERSION_DATA = version_service.get_version() CHILDREN_POLL_INTERVAL = 10 @@ -72,6 +74,8 @@ class 
CliGroup(click.Group): def invoke(self, ctx: Context): try: super().invoke(ctx) + except click.exceptions.UsageError: + raise except Exception: LOG.exception("There was an unexpected error") @@ -761,6 +765,10 @@ def run_app(module): case "scheduler": run_scheduler() + case "mcp": + from testgen.mcp.server import run_mcp + run_mcp() + case "all": children = [ subprocess.Popen([sys.executable, sys.argv[0], "run-app", m], start_new_session=True) @@ -790,5 +798,33 @@ def term_children(signum, _): +@cli.command("mcp-token", help="Generate a JWT token for MCP server authentication.") +@click.option("--username", required=True, help="TestGen username") +@click.option("--password", required=True, hide_input=True, help="TestGen password") +@with_database_session +def mcp_token(username: str, password: str): + from testgen.mcp import get_server_url + from testgen.mcp.auth import authenticate_user + try: + token = authenticate_user(username, password) + except ValueError as e: + click.secho(str(e), fg="red") + sys.exit(1) + + mcp_url = f"{get_server_url()}/mcp" + + click.echo() + click.echo(token) + click.echo() + click.secho("MCP server URL:", bold=True) + click.echo(f" {mcp_url}") + click.echo() + click.secho("Pass the token as a Bearer header when connecting from any MCP client.", dim=True) + click.echo() + click.secho("Example — Claude Code:", bold=True) + click.echo(f' claude mcp add --transport http testgen {mcp_url} --header "Authorization: Bearer {token}"') + click.echo() + + if __name__ == "__main__": cli() diff --git a/testgen/common/auth.py b/testgen/common/auth.py new file mode 100644 index 00000000..b14cb712 --- /dev/null +++ b/testgen/common/auth.py @@ -0,0 +1,65 @@ +import base64 +import logging +from datetime import UTC, datetime, timedelta + +import bcrypt +import jwt + +from testgen import settings + +LOG = logging.getLogger("testgen") + + +def get_jwt_signing_key() -> bytes: + """Decode the base64-encoded JWT signing key from settings.""" + return 
base64.b64decode(settings.JWT_HASHING_KEY_B64.encode("ascii")) + + +def create_jwt_token(username: str, expiry_days: int = 30) -> str: + """Create a signed JWT token with the standard TestGen payload schema.""" + payload = { + "username": username, + "exp_date": (datetime.now(UTC) + timedelta(days=expiry_days)).timestamp(), + } + return jwt.encode(payload, get_jwt_signing_key(), algorithm="HS256") + + +def decode_jwt_token(token_str: str) -> dict: + """Decode and validate a JWT token. Returns the payload dict. + + Raises ValueError if the token is invalid or expired. + """ + try: + payload = jwt.decode(token_str, get_jwt_signing_key(), algorithms=["HS256"]) + except jwt.InvalidTokenError as e: + raise ValueError(f"Invalid token: {e}") from e + + if payload.get("exp_date", 0) <= datetime.now(UTC).timestamp(): + raise ValueError("Token has expired") + + return payload + + +def verify_password(password: str, hashed_password: str) -> bool: + """Verify a plaintext password against a bcrypt hash. + + Same algorithm as streamlit_authenticator. + """ + return bcrypt.checkpw(password.encode(), hashed_password.encode()) + + +def check_permission(user: object, permission: str) -> bool: + """Check if a user has the given permission. + + Uses the enterprise ROLE_PERMISSION_MATRIX if available, + falls back to open-source (always allowed). 
+ """ + try: + from testgen_enterprise_auth.auth import ROLE_PERMISSION_MATRIX + except Exception: + # Enterprise auth plugin not available or not loadable + # (importing it triggers Streamlit UI code that may fail outside the UI) + return True + else: + allowed = ROLE_PERMISSION_MATRIX.get(user.role, []) + return permission in allowed diff --git a/testgen/mcp/__init__.py b/testgen/mcp/__init__.py new file mode 100644 index 00000000..bf4de795 --- /dev/null +++ b/testgen/mcp/__init__.py @@ -0,0 +1,12 @@ +from testgen import settings +from testgen.common.models.settings import PersistedSetting + + +def get_server_url() -> str: + """Derive the externally-reachable MCP server URL from the persisted BASE_URL.""" + base_url = PersistedSetting.get("BASE_URL", "") + if base_url: + scheme, _, host_port = base_url.partition("://") + host = host_port.split(":")[0] + return f"{scheme}://{host}:{settings.MCP_PORT}" + return f"http://localhost:{settings.MCP_PORT}" diff --git a/testgen/mcp/auth.py b/testgen/mcp/auth.py new file mode 100644 index 00000000..71ce8b20 --- /dev/null +++ b/testgen/mcp/auth.py @@ -0,0 +1,32 @@ +from testgen.common.auth import check_permission, create_jwt_token, decode_jwt_token, verify_password +from testgen.common.models.user import User + +__all__ = ["authenticate_user", "check_permission", "validate_token"] + + +def authenticate_user(username: str, password: str) -> str: + """Verify credentials and return a JWT token.""" + user = User.get(username) + + if user is None: + raise ValueError("Invalid username or password") + + if not verify_password(password, user.password): + raise ValueError("Invalid username or password") + + return create_jwt_token(user.username) + + +def validate_token(token: str) -> User: + """Decode and validate a JWT token, returning the User.""" + payload = decode_jwt_token(token) + + username = payload.get("username") + if not username: + raise ValueError("Token missing username") + + user = User.get(username) + if user is 
None: + raise ValueError(f"User not found: {username}") + + return user diff --git a/testgen/mcp/server.py b/testgen/mcp/server.py new file mode 100644 index 00000000..971ddb73 --- /dev/null +++ b/testgen/mcp/server.py @@ -0,0 +1,75 @@ +import logging + +from mcp.server.auth.provider import AccessToken +from mcp.server.auth.settings import AuthSettings +from mcp.server.fastmcp import FastMCP + +from testgen import settings +from testgen.common import version_service +from testgen.common.auth import decode_jwt_token +from testgen.common.models import with_database_session + +LOG = logging.getLogger("testgen") + +SERVER_INSTRUCTIONS = """\ +You are connected to a TestGen data quality testing server. + +WORKFLOW: +1. ALWAYS start with get_data_inventory to understand the available projects, connections, and table groups. +2. Use the appropriate tools to explore profiling results, test definitions, and test results. +3. When asked about data quality, reference specific test results and profiling anomalies. +4. Provide actionable recommendations based on the data quality findings. + +IMPORTANT: +- Use ISO 8601 format for dates (YYYY-MM-DD). +- UUIDs are used as identifiers for most entities. 
+""" + + +class JWTTokenVerifier: + """Verify JWT Bearer tokens for MCP server authentication.""" + + async def verify_token(self, token: str) -> AccessToken | None: + try: + payload = decode_jwt_token(token) + return AccessToken( + token=token, + client_id=payload["username"], + scopes=[], + expires_at=int(payload["exp_date"]), + ) + except (ValueError, KeyError): + return None + + +@with_database_session +def ping() -> dict: + """Check server connectivity and return version information.""" + version_data = version_service.get_version() + return { + "status": "ok", + "edition": version_data.edition, + "version": version_data.current, + } + + +def run_mcp() -> None: + """Start the MCP server with streamable HTTP transport.""" + from testgen.mcp import get_server_url + server_url = with_database_session(get_server_url)() + + mcp = FastMCP( + "TestGen", + host=settings.MCP_HOST, + port=settings.MCP_PORT, + instructions=SERVER_INSTRUCTIONS, + auth=AuthSettings( + issuer_url=server_url, + resource_server_url=server_url, + ), + token_verifier=JWTTokenVerifier(), + ) + mcp.tool()(ping) + + LOG.info("Starting MCP server on %s:%s (auth issuer: %s)", settings.MCP_HOST, settings.MCP_PORT, server_url) + mcp.run(transport="streamable-http") diff --git a/testgen/settings.py b/testgen/settings.py index cf71768d..93661252 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -500,3 +500,27 @@ """ Email: SMTP password """ + +MCP_PORT: int = int(os.getenv("TG_MCP_PORT", "8510")) +""" +Port for the MCP server. + +from env variable: `TG_MCP_PORT` +defaults to: `8510` +""" + +MCP_HOST: str = os.getenv("TG_MCP_HOST", "0.0.0.0") # noqa: S104 +""" +Host for the MCP server. + +from env variable: `TG_MCP_HOST` +defaults to: `0.0.0.0` +""" + +MCP_ENABLED: bool = os.getenv("TG_MCP_ENABLED", "Yes").lower() in ("yes", "y") +""" +Enable the MCP server when running `testgen run-app all`. 
+ +from env variable: `TG_MCP_ENABLED` +defaults to: `Yes` +""" diff --git a/testgen/ui/auth.py b/testgen/ui/auth.py index 14706465..4e82938b 100644 --- a/testgen/ui/auth.py +++ b/testgen/ui/auth.py @@ -1,13 +1,10 @@ -import base64 import logging -from datetime import UTC, datetime from typing import Literal import extra_streamlit_components as stx -import jwt import streamlit as st -from testgen import settings +from testgen.common.auth import decode_jwt_token, get_jwt_signing_key from testgen.common.mixpanel_service import MixpanelService from testgen.common.models.user import User from testgen.ui.services.javascript_service import execute_javascript @@ -46,8 +43,8 @@ def user_has_permission(self, _permission: Permission) -> bool: def get_jwt_hashing_key(self) -> bytes: try: - return base64.b64decode(settings.JWT_HASHING_KEY_B64.encode("ascii")) - except Exception as e: + return get_jwt_signing_key() + except Exception: st.error( "Error reading the JWT signing key from settings.\n\n Make sure you have a valid " "base64 string assigned to the TG_JWT_HASHING_KEY environment variable." 
@@ -74,9 +71,8 @@ def load_user_session(self) -> None: token = cookies.get(self.jwt_cookie_name) if token is not None: try: - token = jwt.decode(token, self.get_jwt_hashing_key(), algorithms=["HS256"]) - if token["exp_date"] > datetime.now(UTC).timestamp(): - self.user = User.get(token["username"]) + payload = decode_jwt_token(token) + self.user = User.get(payload["username"]) except Exception: LOG.debug("Invalid auth token found on cookies", exc_info=True, stack_info=True) diff --git a/tests/unit/common/test_auth.py b/tests/unit/common/test_auth.py new file mode 100644 index 00000000..f56a13c3 --- /dev/null +++ b/tests/unit/common/test_auth.py @@ -0,0 +1,101 @@ +import base64 +import sys +from datetime import UTC, datetime, timedelta +from unittest.mock import MagicMock, patch + +import bcrypt +import jwt +import pytest + +from testgen.common.auth import ( + check_permission, + create_jwt_token, + decode_jwt_token, + verify_password, +) + +JWT_KEY = base64.b64encode(b"test-secret-key-for-jwt-signing!").decode("ascii") +TEST_PASSWORD = "testpass" # noqa: S105 + + +def _make_token(username="testuser", exp_days=30): + key = base64.b64decode(JWT_KEY.encode("ascii")) + payload = { + "username": username, + "exp_date": (datetime.now(UTC) + timedelta(days=exp_days)).timestamp(), + } + return jwt.encode(payload, key, algorithm="HS256") + + +@patch("testgen.common.auth.settings") +def test_create_jwt_token_creates_valid_token(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + token = create_jwt_token("testuser", expiry_days=7) + + key = base64.b64decode(JWT_KEY.encode("ascii")) + payload = jwt.decode(token, key, algorithms=["HS256"]) + assert payload["username"] == "testuser" + assert payload["exp_date"] > datetime.now(UTC).timestamp() + + +@patch("testgen.common.auth.settings") +def test_decode_jwt_token_decodes_valid_token(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + token = _make_token() + payload = decode_jwt_token(token) + assert 
payload["username"] == "testuser" + + +@patch("testgen.common.auth.settings") +def test_decode_jwt_token_raises_for_expired_token(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + token = _make_token(exp_days=-1) + with pytest.raises(ValueError, match="Token has expired"): + decode_jwt_token(token) + + +@patch("testgen.common.auth.settings") +def test_decode_jwt_token_raises_for_invalid_token(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + with pytest.raises(ValueError, match="Invalid token"): + decode_jwt_token("not-a-valid-token") + + +def test_verify_password_correct(): + hashed = bcrypt.hashpw(TEST_PASSWORD.encode(), bcrypt.gensalt()).decode() + assert verify_password(TEST_PASSWORD, hashed) is True + + +def test_verify_password_wrong(): + hashed = bcrypt.hashpw(TEST_PASSWORD.encode(), bcrypt.gensalt()).decode() + assert verify_password("wrongpass", hashed) is False + + +def test_check_permission_allowed_with_enterprise_plugin(): + mock_matrix = { + "admin": ["administer", "edit", "disposition", "view", "catalog"], + "business": ["view", "catalog"], + } + mock_auth = MagicMock() + mock_auth.ROLE_PERMISSION_MATRIX = mock_matrix + with patch.dict(sys.modules, {"testgen_enterprise_auth": MagicMock(), "testgen_enterprise_auth.auth": mock_auth}): + user = MagicMock(role="admin") + assert check_permission(user, "edit") is True + + +def test_check_permission_denied_with_enterprise_plugin(): + mock_matrix = { + "admin": ["administer", "edit", "disposition", "view", "catalog"], + "business": ["view", "catalog"], + } + mock_auth = MagicMock() + mock_auth.ROLE_PERMISSION_MATRIX = mock_matrix + with patch.dict(sys.modules, {"testgen_enterprise_auth": MagicMock(), "testgen_enterprise_auth.auth": mock_auth}): + user = MagicMock(role="business") + assert check_permission(user, "administer") is False + + +def test_check_permission_falls_back_when_no_plugin(): + with patch.dict(sys.modules, {"testgen_enterprise_auth": None, 
"testgen_enterprise_auth.auth": None}): + user = MagicMock(role="business") + assert check_permission(user, "administer") is True diff --git a/tests/unit/mcp/__init__.py b/tests/unit/mcp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/mcp/test_auth.py b/tests/unit/mcp/test_auth.py new file mode 100644 index 00000000..ab0d4973 --- /dev/null +++ b/tests/unit/mcp/test_auth.py @@ -0,0 +1,138 @@ +import asyncio +import base64 +from datetime import UTC, datetime, timedelta +from unittest.mock import MagicMock, patch + +import bcrypt +import jwt +import pytest + +from testgen.mcp.auth import authenticate_user, validate_token +from testgen.mcp.server import JWTTokenVerifier + +JWT_KEY = base64.b64encode(b"test-secret-key-for-jwt-signing!").decode("ascii") +TEST_PASSWORD = "testpass" # noqa: S105 + + +def _make_user(username="testuser", role="admin"): + hashed = bcrypt.hashpw(TEST_PASSWORD.encode(), bcrypt.gensalt()).decode() + user = MagicMock() + user.username = username + user.password = hashed + user.role = role + return user + + +def _make_token(username="testuser", exp_days=30): + key = base64.b64decode(JWT_KEY.encode("ascii")) + payload = { + "username": username, + "exp_date": (datetime.now(UTC) + timedelta(days=exp_days)).timestamp(), + } + return jwt.encode(payload, key, algorithm="HS256") + + +@patch("testgen.common.auth.settings") +@patch("testgen.mcp.auth.User") +def test_authenticate_user_returns_jwt(mock_user_cls, mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + mock_user_cls.get.return_value = _make_user() + + token = authenticate_user("testuser", TEST_PASSWORD) + + key = base64.b64decode(JWT_KEY.encode("ascii")) + payload = jwt.decode(token, key, algorithms=["HS256"]) + assert payload["username"] == "testuser" + assert payload["exp_date"] > datetime.now(UTC).timestamp() + + +@patch("testgen.common.auth.settings") +@patch("testgen.mcp.auth.User") +def 
test_authenticate_user_raises_for_wrong_password(mock_user_cls, mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + mock_user_cls.get.return_value = _make_user() + + with pytest.raises(ValueError, match="Invalid username or password"): + authenticate_user("testuser", "wrongpass") + + +@patch("testgen.common.auth.settings") +@patch("testgen.mcp.auth.User") +def test_authenticate_user_raises_for_unknown_user(mock_user_cls, mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + mock_user_cls.get.return_value = None + + with pytest.raises(ValueError, match="Invalid username or password"): + authenticate_user("nobody", TEST_PASSWORD) + + +@patch("testgen.common.auth.settings") +@patch("testgen.mcp.auth.User") +def test_validate_token_returns_user(mock_user_cls, mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + expected_user = _make_user() + mock_user_cls.get.return_value = expected_user + + user = validate_token(_make_token()) + + assert user is expected_user + mock_user_cls.get.assert_called_once_with("testuser") + + +@patch("testgen.common.auth.settings") +def test_validate_token_raises_for_expired_token(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + + with pytest.raises(ValueError, match="Token has expired"): + validate_token(_make_token(exp_days=-1)) + + +@patch("testgen.common.auth.settings") +def test_validate_token_raises_for_invalid_token(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + + with pytest.raises(ValueError, match="Invalid token"): + validate_token("not-a-valid-token") + + +@patch("testgen.common.auth.settings") +@patch("testgen.mcp.auth.User") +def test_validate_token_raises_for_missing_user(mock_user_cls, mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + mock_user_cls.get.return_value = None + + with pytest.raises(ValueError, match="User not found"): + validate_token(_make_token()) + + +@patch("testgen.common.auth.settings") +def 
test_token_verifier_returns_access_token_for_valid_jwt(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + verifier = JWTTokenVerifier() + token = _make_token() + + result = asyncio.run(verifier.verify_token(token)) + + assert result is not None + assert result.client_id == "testuser" + assert result.token == token + + +@patch("testgen.common.auth.settings") +def test_token_verifier_returns_none_for_expired_jwt(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + verifier = JWTTokenVerifier() + + result = asyncio.run(verifier.verify_token(_make_token(exp_days=-1))) + + assert result is None + + +@patch("testgen.common.auth.settings") +def test_token_verifier_returns_none_for_invalid_jwt(mock_settings): + mock_settings.JWT_HASHING_KEY_B64 = JWT_KEY + verifier = JWTTokenVerifier() + + result = asyncio.run(verifier.verify_token("garbage")) + + assert result is None From 87f72053f1c1c0eae9dfe3647a3d04d137c12f90 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Tue, 24 Feb 2026 12:21:42 -0300 Subject: [PATCH 07/95] refactor: hide MCP behind feature flag and standardize boolean settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default MCP_ENABLED to "no" until the feature is complete. Standardize all boolean env var checks to accept ("yes", "true") only — drop "y". Co-Authored-By: Claude Opus 4.6 --- testgen/settings.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/testgen/settings.py b/testgen/settings.py index 93661252..8d2b4512 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -1,13 +1,13 @@ import os import typing -IS_DEBUG_LOG_LEVEL: bool = os.getenv("TESTGEN_DEBUG_LOG_LEVEL", "no").lower() == "yes" +IS_DEBUG_LOG_LEVEL: bool = os.getenv("TESTGEN_DEBUG_LOG_LEVEL", "no").lower() in ("yes", "true") """ When set, logs will be at debug level. 
defaults to: `no` """ -IS_DEBUG: bool = os.getenv("TESTGEN_DEBUG", "no").lower() == "yes" +IS_DEBUG: bool = os.getenv("TESTGEN_DEBUG", "no").lower() in ("yes", "true") """ When True invalidates the cache with the bootstrapped application causing the changes to the routing and plugins to take effect on every @@ -17,7 +17,7 @@ defaults to: `True` """ -LOG_TO_FILE: bool = os.getenv("TESTGEN_LOG_TO_FILE", "yes").lower() == "yes" +LOG_TO_FILE: bool = os.getenv("TESTGEN_LOG_TO_FILE", "yes").lower() in ("yes", "true") """ When set, rotating file logs will be generated. defaults to: `True` @@ -266,7 +266,7 @@ defaults to: `environ[DATABASE_PORT]` """ -SKIP_DATABASE_CERTIFICATE_VERIFICATION: bool = os.getenv("TG_TARGET_DB_TRUST_SERVER_CERTIFICATE", "no").lower() == "yes" +SKIP_DATABASE_CERTIFICATE_VERIFICATION: bool = os.getenv("TG_TARGET_DB_TRUST_SERVER_CERTIFICATE", "no").lower() in ("yes", "true") """ When True for supported SQL flavors, set up the SQLAlchemy connection to trust the database server certificate. @@ -372,7 +372,7 @@ from env variable: `OBSERVABILITY_API_KEY` """ -OBSERVABILITY_VERIFY_SSL: bool = os.getenv("TG_EXPORT_TO_OBSERVABILITY_VERIFY_SSL", "yes").lower() in ["yes", "true"] +OBSERVABILITY_VERIFY_SSL: bool = os.getenv("TG_EXPORT_TO_OBSERVABILITY_VERIFY_SSL", "yes").lower() in ("yes", "true") """ When False, exporting events to your instance of Observability will skip SSL verification. @@ -456,7 +456,7 @@ Random ID that uniquely identifies the instance. """ -ANALYTICS_ENABLED: bool = os.getenv("TG_ANALYTICS", "yes").lower() in ("true", "yes") +ANALYTICS_ENABLED: bool = os.getenv("TG_ANALYTICS", "yes").lower() in ("yes", "true") """ Disables sending usage data when set to any value except "true" and "yes". 
Defaults to "yes" """ @@ -517,7 +517,7 @@ defaults to: `0.0.0.0` """ -MCP_ENABLED: bool = os.getenv("TG_MCP_ENABLED", "Yes").lower() in ("yes", "y") +MCP_ENABLED: bool = os.getenv("TG_MCP_ENABLED", "no").lower() in ("yes", "true") """ Enable the MCP server when running `testgen run-app all`. From 0db6fdb0b5a8d097b2dc5d770e881c3f974bf675 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 25 Feb 2026 00:06:10 -0500 Subject: [PATCH 08/95] fix: improve upgrade commands to update revision after each script --- testgen/commands/run_upgrade_db_config.py | 52 ++++++++++++----------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py index 95ec4bc0..712149ba 100644 --- a/testgen/commands/run_upgrade_db_config.py +++ b/testgen/commands/run_upgrade_db_config.py @@ -51,32 +51,36 @@ def _get_upgrade_template_directory(): return "dbupgrade" -def _get_upgrade_scripts(sub_directory: str, params_mapping: dict, mask: str = r"^.*sql$", min_val: str = "") -> tuple[list[tuple[str, dict]], str]: +def _get_upgrade_scripts(sub_directory: str, params_mapping: dict, mask: str = r"^.*sql$", min_val: str = "") -> list[tuple[str, str]]: files = sorted(get_template_files(mask=mask, sub_directory=sub_directory), key=lambda key: str(key)) - max_prefix = "" - queries = [] + scripts = [] for file in files: if file.name > min_val: template = file.read_text("utf-8") query = replace_params(template, params_mapping) - queries.append((query, None)) - max_prefix = file.name[0:4] + scripts.append((file.name[0:4], query)) - if len(queries) == 0: + if not scripts: LOG.debug(f"No sql files were found for the mask {mask} in subdirectory {sub_directory}") - return queries, max_prefix + return scripts -def _execute_upgrade_scripts(params_mapping: dict, lstScripts: list[tuple[str, dict]]): - # Run scripts using admin credentials - execute_db_queries( - lstScripts, - 
user_override=params_mapping["TESTGEN_ADMIN_USER"], - password_override=params_mapping["TESTGEN_ADMIN_PASSWORD"], - user_type="schema_admin", - ) +def _execute_upgrade_scripts(params_mapping: dict, scripts: list[tuple[str, str]]) -> bool: + admin_user = params_mapping["TESTGEN_ADMIN_USER"] + admin_password = params_mapping["TESTGEN_ADMIN_PASSWORD"] + + for revision_prefix, query in scripts: + LOG.info(f"Applying upgrade script {revision_prefix}") + execute_db_queries( + [(query, None)], + user_override=admin_user, + password_override=admin_password, + user_type="schema_admin", + ) + _update_revision_number(params_mapping, revision_prefix) + return True @@ -131,18 +135,17 @@ def run_upgrade_db_config() -> bool: next_revision = _format_revision_prefix(_get_next_revision_prefix(params_mapping)) upgrade_dir = _get_upgrade_template_directory() - queries, max_revision = _get_upgrade_scripts(upgrade_dir, params_mapping, min_val=next_revision) - LOG.info(f"Current revision: {current_revision}. Latest revision: {max_revision or current_revision}. Upgrade scripts: {len(queries)}") - if len(queries) > 0: - has_been_upgraded = _execute_upgrade_scripts(params_mapping, queries) - else: - has_been_upgraded = False + scripts = _get_upgrade_scripts(upgrade_dir, params_mapping, min_val=next_revision) + latest_revision = scripts[-1][0] if scripts else current_revision + LOG.info(f"Current revision: {current_revision}. Latest revision: {latest_revision}. Upgrade scripts: {len(scripts)}") + if scripts: + _execute_upgrade_scripts(params_mapping, scripts) LOG.info("Refreshing static metadata") _refresh_static_metadata(params_mapping) + has_been_upgraded = bool(scripts) if has_been_upgraded: - _update_revision_number(params_mapping, max_revision) LOG.info("Application data was successfully upgraded, and static metadata was refreshed.") else: LOG.info("Database upgrade was not required. 
Static metadata was refreshed.") @@ -155,6 +158,5 @@ def is_db_revision_up_to_date(): strNextPrefix = _format_revision_prefix(_get_next_revision_prefix(params_mapping)) upgrade_dir = _get_upgrade_template_directory() - # Retrieve and execute upgrade scripts, if any - lstQueries, max_prefix = _get_upgrade_scripts(upgrade_dir, params_mapping, min_val=strNextPrefix) - return len(lstQueries) == 0 + scripts = _get_upgrade_scripts(upgrade_dir, params_mapping, min_val=strNextPrefix) + return len(scripts) == 0 From 67749e4fa3da94e56311e041561ebbfd44ce6887 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 25 Feb 2026 13:12:46 -0300 Subject: [PATCH 09/95] refactor: decouple RBAC from enterprise plugin via PluginHook Remove direct import of testgen_enterprise_auth from OS auth code. RBAC is now discovered through the plugin system: plugins declare an RBACProvider subclass on their PluginSpec, and PluginHook resolves it at startup. OS default grants all permissions. Co-Authored-By: Claude Opus 4.6 --- testgen/common/auth.py | 16 ++---- testgen/mcp/server.py | 5 ++ testgen/ui/bootstrap.py | 2 +- testgen/utils/plugins.py | 92 ++++++++++++++++++++++++---------- tests/unit/common/test_auth.py | 49 +++++++++--------- 5 files changed, 99 insertions(+), 65 deletions(-) diff --git a/testgen/common/auth.py b/testgen/common/auth.py index b14cb712..94c83ed0 100644 --- a/testgen/common/auth.py +++ b/testgen/common/auth.py @@ -51,15 +51,9 @@ def verify_password(password: str, hashed_password: str) -> bool: def check_permission(user: object, permission: str) -> bool: """Check if a user has the given permission. - Uses the enterprise ROLE_PERMISSION_MATRIX if available, - falls back to open-source (always allowed). + Uses the RBAC provider registered by installed plugins. + Returns True (all allowed) if no plugin overrides the default. 
""" - try: - from testgen_enterprise_auth.auth import ROLE_PERMISSION_MATRIX - except Exception: - # Enterprise auth plugin not available or not loadable - # (importing it triggers Streamlit UI code that may fail outside the UI) - return True - else: - allowed = ROLE_PERMISSION_MATRIX.get(user.role, []) - return permission in allowed + from testgen.utils.plugins import PluginHook + + return PluginHook.instance().rbac.check_permission(user, permission) diff --git a/testgen/mcp/server.py b/testgen/mcp/server.py index 971ddb73..6134e32f 100644 --- a/testgen/mcp/server.py +++ b/testgen/mcp/server.py @@ -56,6 +56,11 @@ def ping() -> dict: def run_mcp() -> None: """Start the MCP server with streamable HTTP transport.""" from testgen.mcp import get_server_url + from testgen.utils.plugins import discover + + for plugin in discover(): + plugin.load() + server_url = with_database_session(get_server_url)() mcp = FastMCP( diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py index b21cf6a2..14a55790 100644 --- a/testgen/ui/bootstrap.py +++ b/testgen/ui/bootstrap.py @@ -77,7 +77,7 @@ def run(log_level: int = logging.INFO) -> Application: logo_class = plugins.Logo for plugin in installed_plugins: - spec = plugin.load() + spec = plugin.load_streamlit() if spec.page: pages.append(spec.page) diff --git a/testgen/utils/plugins.py b/testgen/utils/plugins.py index 15bb024d..34171900 100644 --- a/testgen/utils/plugins.py +++ b/testgen/utils/plugins.py @@ -1,4 +1,7 @@ +from __future__ import annotations + import dataclasses +import importlib import importlib.metadata import inspect import json @@ -6,6 +9,7 @@ import shutil from collections.abc import Generator from pathlib import Path +from types import ModuleType from typing import ClassVar from testgen.ui.assets import get_asset_path @@ -20,7 +24,7 @@ ui_plugins_entrypoint_prefix = "./plugin_pages" -def discover() -> Generator["Plugin", None, None]: +def discover() -> Generator[Plugin, None, None]: 
ui_plugins_provision_file.touch(exist_ok=True) for package_path, distribution_names in importlib.metadata.packages_distributions().items(): if package_path.startswith(PLUGIN_PREFIX): @@ -98,42 +102,76 @@ def _read_ui_plugin_spec() -> dict: return json.loads(contents.replace("export default ", "")[:-1]) +class RBACProvider: + """Base RBAC provider. OS default: all permissions granted.""" + + @staticmethod + def check_permission(_user: object, _permission: str) -> bool: + return True + + class PluginSpec: + rbac: ClassVar[type[RBACProvider]] = RBACProvider auth: ClassVar[type[Authentication] | None] = None page: ClassVar[type[Page] | None] = None logo: ClassVar[type[Logo] | None] = None component: ClassVar[ComponentSpec | None] = None +class PluginHook: + """Singleton holding resolved plugin values, pre-loaded with defaults.""" + + _instance: PluginHook | None = None + rbac: type[RBACProvider] = RBACProvider + + @classmethod + def instance(cls) -> PluginHook: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + +def _find_plugin_spec(module: ModuleType) -> type[PluginSpec] | None: + """Find the first concrete PluginSpec subclass in a module.""" + for name in dir(module): + cls = getattr(module, name, None) + if inspect.isclass(cls) and issubclass(cls, PluginSpec) and cls is not PluginSpec: + return cls + return None + + @dataclasses.dataclass class Plugin: package: str version: str - def load(self) -> PluginSpec: - plugin_page = None - plugin_auth = None - plugin_logo = None - plugin_component_spec = None - + def load(self) -> type[PluginSpec]: + """Lightweight load: import plugin module and populate PluginHook.""" module = importlib.import_module(self.package) - for property_name in dir(module): - if ((maybe_class := getattr(module, property_name, None)) and inspect.isclass(maybe_class)): - if issubclass(maybe_class, PluginSpec) and maybe_class != PluginSpec: - return maybe_class - - if issubclass(maybe_class, Page): - plugin_page = 
maybe_class - - elif issubclass(maybe_class, Authentication): - plugin_auth = maybe_class - - elif issubclass(maybe_class, Logo): - plugin_logo = maybe_class - - return type("AnyPlugin", (PluginSpec,), { - "page": plugin_page, - "auth": plugin_auth, - "logo": plugin_logo, - "component": plugin_component_spec, - }) + spec = _find_plugin_spec(module) + if spec is not None: + hook = PluginHook.instance() + if spec.rbac is not RBACProvider: + hook.rbac = spec.rbac + return spec or PluginSpec + + def load_streamlit(self) -> type[PluginSpec]: + """Full Streamlit load. Calls load() first, then returns spec for UI access.""" + spec = self.load() + if spec is not PluginSpec: + return spec + + # Fallback: discover UI classes from module (backward compat for plugins without explicit PluginSpec) + _discoverable: dict[type, str] = {Page: "page", Authentication: "auth", Logo: "logo"} + attrs: dict[str, type] = {} + module = importlib.import_module(self.package) + + for name in dir(module): + cls = getattr(module, name, None) + if not inspect.isclass(cls): + continue + for base, attr in _discoverable.items(): + if issubclass(cls, base) and cls is not base: + attrs[attr] = cls + + return type("AnyPlugin", (PluginSpec,), attrs) if attrs else PluginSpec diff --git a/tests/unit/common/test_auth.py b/tests/unit/common/test_auth.py index f56a13c3..87ccc5bb 100644 --- a/tests/unit/common/test_auth.py +++ b/tests/unit/common/test_auth.py @@ -1,7 +1,6 @@ import base64 -import sys from datetime import UTC, datetime, timedelta -from unittest.mock import MagicMock, patch +from unittest.mock import ANY, MagicMock, patch import bcrypt import jwt @@ -71,31 +70,29 @@ def test_verify_password_wrong(): assert verify_password("wrongpass", hashed) is False -def test_check_permission_allowed_with_enterprise_plugin(): - mock_matrix = { - "admin": ["administer", "edit", "disposition", "view", "catalog"], - "business": ["view", "catalog"], - } - mock_auth = MagicMock() - mock_auth.ROLE_PERMISSION_MATRIX 
= mock_matrix - with patch.dict(sys.modules, {"testgen_enterprise_auth": MagicMock(), "testgen_enterprise_auth.auth": mock_auth}): - user = MagicMock(role="admin") - assert check_permission(user, "edit") is True +def test_check_permission_allowed_with_plugin(): + mock_rbac = MagicMock() + mock_rbac.check_permission.return_value = True + mock_hook = MagicMock() + mock_hook.rbac = mock_rbac + with patch("testgen.utils.plugins.PluginHook.instance", return_value=mock_hook): + assert check_permission(MagicMock(role="admin"), "edit") is True + mock_rbac.check_permission.assert_called_once_with(ANY, "edit") -def test_check_permission_denied_with_enterprise_plugin(): - mock_matrix = { - "admin": ["administer", "edit", "disposition", "view", "catalog"], - "business": ["view", "catalog"], - } - mock_auth = MagicMock() - mock_auth.ROLE_PERMISSION_MATRIX = mock_matrix - with patch.dict(sys.modules, {"testgen_enterprise_auth": MagicMock(), "testgen_enterprise_auth.auth": mock_auth}): - user = MagicMock(role="business") - assert check_permission(user, "administer") is False +def test_check_permission_denied_with_plugin(): + mock_rbac = MagicMock() + mock_rbac.check_permission.return_value = False + mock_hook = MagicMock() + mock_hook.rbac = mock_rbac + with patch("testgen.utils.plugins.PluginHook.instance", return_value=mock_hook): + assert check_permission(MagicMock(role="business"), "administer") is False + +def test_check_permission_defaults_without_plugin(): + from testgen.utils.plugins import PluginHook -def test_check_permission_falls_back_when_no_plugin(): - with patch.dict(sys.modules, {"testgen_enterprise_auth": None, "testgen_enterprise_auth.auth": None}): - user = MagicMock(role="business") - assert check_permission(user, "administer") is True + with patch("testgen.utils.plugins.PluginHook.instance") as mock_instance: + hook = PluginHook() + mock_instance.return_value = hook + assert check_permission(MagicMock(role="business"), "administer") is True From 
8f69936065df2a4d6f49317e0506c0d46d370cb6 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Mon, 23 Feb 2026 14:56:53 -0300 Subject: [PATCH 10/95] feat(mcp): add P0 tools, resources, prompts and model extensions - 8 MCP tools: discovery (inventory, projects, suites), test runs (recent runs), test results (results, failure summary, history), reference (test type lookup) - 2 resources: test-types reference table, glossary - 4 prompts: health_check, investigate_failures, table_health, compare_runs - Model extensions: TestResult (select_results, select_failures, select_history), DataTable (new partial model for data_table_chars) - Inventory service with ORM queries and adaptive compact mode - 53 unit tests for all new modules Co-Authored-By: Claude Opus 4.6 --- testgen/common/models/data_table.py | 38 +++++ testgen/common/models/test_definition.py | 4 + testgen/common/models/test_result.py | 79 ++++++++- testgen/mcp/prompts/__init__.py | 0 testgen/mcp/prompts/workflows.py | 85 ++++++++++ testgen/mcp/server.py | 75 ++++++--- testgen/mcp/services/__init__.py | 0 testgen/mcp/services/inventory_service.py | 191 ++++++++++++++++++++++ testgen/mcp/tools/__init__.py | 0 testgen/mcp/tools/discovery.py | 72 ++++++++ testgen/mcp/tools/reference.py | 119 ++++++++++++++ testgen/mcp/tools/test_results.py | 167 +++++++++++++++++++ testgen/mcp/tools/test_runs.py | 65 ++++++++ tests/unit/mcp/test_inventory_service.py | 177 ++++++++++++++++++++ tests/unit/mcp/test_model_data_table.py | 41 +++++ tests/unit/mcp/test_model_test_result.py | 103 ++++++++++++ tests/unit/mcp/test_tools_discovery.py | 83 ++++++++++ tests/unit/mcp/test_tools_reference.py | 88 ++++++++++ tests/unit/mcp/test_tools_test_results.py | 179 ++++++++++++++++++++ tests/unit/mcp/test_tools_test_runs.py | 97 +++++++++++ 20 files changed, 1640 insertions(+), 23 deletions(-) create mode 100644 testgen/common/models/data_table.py create mode 100644 testgen/mcp/prompts/__init__.py create mode 100644 
testgen/mcp/prompts/workflows.py create mode 100644 testgen/mcp/services/__init__.py create mode 100644 testgen/mcp/services/inventory_service.py create mode 100644 testgen/mcp/tools/__init__.py create mode 100644 testgen/mcp/tools/discovery.py create mode 100644 testgen/mcp/tools/reference.py create mode 100644 testgen/mcp/tools/test_results.py create mode 100644 testgen/mcp/tools/test_runs.py create mode 100644 tests/unit/mcp/test_inventory_service.py create mode 100644 tests/unit/mcp/test_model_data_table.py create mode 100644 tests/unit/mcp/test_model_test_result.py create mode 100644 tests/unit/mcp/test_tools_discovery.py create mode 100644 tests/unit/mcp/test_tools_reference.py create mode 100644 tests/unit/mcp/test_tools_test_results.py create mode 100644 tests/unit/mcp/test_tools_test_runs.py diff --git a/testgen/common/models/data_table.py b/testgen/common/models/data_table.py new file mode 100644 index 00000000..bab03a4a --- /dev/null +++ b/testgen/common/models/data_table.py @@ -0,0 +1,38 @@ +from uuid import UUID, uuid4 + +from sqlalchemy import BigInteger, Column, ForeignKey, String, asc, func, select +from sqlalchemy.dialects import postgresql + +from testgen.common.models import get_current_session +from testgen.common.models.entity import Entity + + +class DataTable(Entity): + __tablename__ = "data_table_chars" + + id: UUID = Column("table_id", postgresql.UUID(as_uuid=True), primary_key=True, default=uuid4) + table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("table_groups.id")) + table_name: str = Column(String) + column_ct: int = Column(BigInteger) + + # Unmapped columns: schema_name, functional_table_type, description, + # critical_data_element, data_source, source_system, source_process, + # business_domain, stakeholder_group, transform_level, aggregation_level, + # data_product, add_date, drop_date, last_refresh_date, approx_record_ct, + # record_ct, last_complete_profile_run_id, last_profile_record_ct, + # 
dq_score_profiling, dq_score_testing + + @classmethod + def select_table_names(cls, table_groups_id: UUID, limit: int = 100) -> list[str]: + query = ( + select(cls.table_name) + .where(cls.table_groups_id == table_groups_id) + .order_by(asc(func.lower(cls.table_name))) + .limit(limit) + ) + return list(get_current_session().scalars(query).all()) + + @classmethod + def count_tables(cls, table_groups_id: UUID) -> int: + query = select(func.count()).where(cls.table_groups_id == table_groups_id) + return get_current_session().scalar(query) or 0 diff --git a/testgen/common/models/test_definition.py b/testgen/common/models/test_definition.py index c110695d..1ddb8af5 100644 --- a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -123,6 +123,8 @@ def process_bind_param(self, value: str | None, _dialect) -> str | None: class TestType(Entity): __tablename__ = "test_types" + _get_by = "test_type" + id: str = Column(String) test_type: str = Column(String, primary_key=True, nullable=False) test_name_short: str = Column(String) @@ -149,6 +151,8 @@ class TestType(Entity): usage_notes: str = Column(String) active: str = Column(String) + # Unmapped columns: generation_template, result_visualization, result_visualization_params + _summary_columns = ( *[key for key in TestTypeSummary.__annotations__.keys() if key != "default_test_description"], test_description.label("default_test_description"), diff --git a/testgen/common/models/test_result.py b/testgen/common/models/test_result.py index dd8d9ded..296a39c8 100644 --- a/testgen/common/models/test_result.py +++ b/testgen/common/models/test_result.py @@ -1,8 +1,10 @@ import enum from collections import defaultdict +from datetime import datetime +from typing import Self from uuid import UUID, uuid4 -from sqlalchemy import Boolean, Column, Enum, ForeignKey, Integer, String, or_, select +from sqlalchemy import Boolean, Column, Enum, ForeignKey, Integer, String, desc, func, or_, select from 
sqlalchemy.dialects import postgresql from sqlalchemy.orm import aliased @@ -40,8 +42,81 @@ class TestResult(Entity): status: TestResultStatus = Column("result_status", Enum(TestResultStatus)) message: str = Column("result_message", String) + test_time: datetime = Column(postgresql.TIMESTAMP) result_code: int = Column(Integer) - # Note: not all table columns are implemented by this entity + disposition: str = Column(String) + result_measure: str = Column(String) + threshold_value: str = Column(String) + + # Unmapped columns: result_id, skip_errors, input_parameters, severity, + # result_signal, test_description, table_groups_id, dq_prevalence, + # dq_record_ct, observability_status + + @classmethod + def select_results( + cls, + test_run_id: UUID, + status: TestResultStatus | None = None, + table_name: str | None = None, + test_type: str | None = None, + limit: int = 50, + ) -> list[Self]: + clauses = [ + cls.test_run_id == test_run_id, + func.coalesce(cls.disposition, "Confirmed") == "Confirmed", + ] + if status: + clauses.append(cls.status == status) + if table_name: + clauses.append(cls.table_name == table_name) + if test_type: + clauses.append(cls.test_type == test_type) + query = select(cls).where(*clauses).order_by(cls.status, cls.table_name, cls.column_names).limit(limit) + return get_current_session().scalars(query).all() + + @classmethod + def select_failures( + cls, + test_run_id: UUID, + group_by: str = "test_type", + ) -> list[tuple]: + allowed = {"test_type", "table_name", "column_names"} + if group_by not in allowed: + raise ValueError(f"group_by must be one of {allowed}") + + where = [ + cls.test_run_id == test_run_id, + cls.status.in_([TestResultStatus.Failed, TestResultStatus.Warning]), + func.coalesce(cls.disposition, "Confirmed") == "Confirmed", + ] + + # Column grouping includes table_name for context → (table, column, count) + if group_by == "column_names": + group_cols = (cls.table_name, cls.column_names) + else: + group_cols = (getattr(cls, 
group_by),) + + query = ( + select(*group_cols, func.count().label("failure_count")) + .where(*where) + .group_by(*group_cols) + .order_by(func.count().desc()) + ) + return get_current_session().execute(query).all() + + @classmethod + def select_history( + cls, + test_definition_id: UUID, + limit: int = 20, + ) -> list[Self]: + query = ( + select(cls) + .where(cls.test_definition_id == test_definition_id) + .order_by(desc(cls.test_time)) + .limit(limit) + ) + return get_current_session().scalars(query).all() @classmethod def diff(cls, test_run_id_a: UUID, test_run_id_b: UUID) -> list[TestResultDiffType]: diff --git a/testgen/mcp/prompts/__init__.py b/testgen/mcp/prompts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/testgen/mcp/prompts/workflows.py b/testgen/mcp/prompts/workflows.py new file mode 100644 index 00000000..7fe4c44f --- /dev/null +++ b/testgen/mcp/prompts/workflows.py @@ -0,0 +1,85 @@ +def health_check() -> str: + """Run a data quality health check across all projects and test suites. + + Provides a comprehensive overview of the current data quality status. + """ + return """\ +Please perform a data quality health check: + +1. Call `get_data_inventory()` to get a complete overview of all projects, connections, table groups, and test suites. +2. For each project, call `get_recent_test_runs(project_code='...')` to get the most recent test run. +3. Summarize the overall health: + - Which projects/suites are healthy (all tests passing)? + - Which have failures or warnings? + - Which have not been run recently? +4. Highlight any critical issues that need immediate attention. +5. Provide actionable recommendations for improving data quality. +""" + + +def investigate_failures(test_suite: str | None = None) -> str: + """Investigate test failures to identify root causes and patterns. + + Args: + test_suite: Optional test suite name to focus the investigation on. + """ + suite_filter = f" Focus on the test suite named `{test_suite}`." 
if test_suite else "" + + return f"""\ +Please investigate test failures and identify root causes:{suite_filter} + +1. Call `get_data_inventory()` to understand the project structure. +2. Call `get_recent_test_runs(project_code='...')` to find the most recent run{f" for suite `{test_suite}`" if test_suite else ""}. +3. Call `get_failure_summary(test_run_id='...')` to see failures grouped by test type. +4. For each failure category, call `get_test_type(test_type='...')` to understand what the test checks. +5. Call `get_test_results(test_run_id='...', status='Failed')` to see individual failure details. +6. Analyze the patterns: + - Are failures concentrated in specific tables or columns? + - Do certain test types fail consistently? + - What do the measured values vs thresholds tell us about the root cause? +7. Provide a root cause analysis and recommended remediation steps. +""" + + +def table_health(table_name: str) -> str: + """Assess the data quality health of a specific table across all test suites. + + Args: + table_name: The name of the table to investigate. + """ + return f"""\ +Please assess the data quality health of table `{table_name}`: + +1. Call `get_data_inventory()` to find which table groups and test suites include this table. +2. For each relevant test suite, call `get_recent_test_runs(project_code='...')` to find the latest run. +3. Call `get_test_results(test_run_id='...', table_name='{table_name}')` to get all results for this table. +4. Summarize the table's health: + - Which tests pass and which fail? + - What data quality dimensions are affected? + - Are there patterns in the failures (e.g., specific columns)? +5. Provide recommendations for improving data quality for this table. +""" + + +def compare_runs(test_suite: str | None = None) -> str: + """Compare the two most recent test runs to identify regressions and improvements. + + Args: + test_suite: Optional test suite name to focus the comparison on. 
+ """ + suite_filter = f" for suite `{test_suite}`" if test_suite else "" + + return f"""\ +Please compare the two most recent test runs{suite_filter} to identify regressions and improvements: + +1. Call `get_data_inventory()` to understand the project structure. +2. Call `list_test_suites(project_code='...')` to find suites{suite_filter} and their latest runs. +3. For the most recent completed run, call `get_test_results(test_run_id='...')` to get all results. +4. For the previous run, call `get_test_results(test_run_id='...')` to get all results. +5. Compare the two runs: + - **Regressions:** Tests that passed before but now fail. + - **Improvements:** Tests that failed before but now pass. + - **Persistent failures:** Tests that fail in both runs. + - **Stable passes:** Tests that pass in both runs. +6. Summarize the trend and highlight any concerning regressions. +""" diff --git a/testgen/mcp/server.py b/testgen/mcp/server.py index 6134e32f..4e724c89 100644 --- a/testgen/mcp/server.py +++ b/testgen/mcp/server.py @@ -5,7 +5,6 @@ from mcp.server.fastmcp import FastMCP from testgen import settings -from testgen.common import version_service from testgen.common.auth import decode_jwt_token from testgen.common.models import with_database_session @@ -14,15 +13,36 @@ SERVER_INSTRUCTIONS = """\ You are connected to a TestGen data quality testing server. -WORKFLOW: -1. ALWAYS start with get_data_inventory to understand the available projects, connections, and table groups. -2. Use the appropriate tools to explore profiling results, test definitions, and test results. -3. When asked about data quality, reference specific test results and profiling anomalies. -4. Provide actionable recommendations based on the data quality findings. +## Available Tools (8) -IMPORTANT: -- Use ISO 8601 format for dates (YYYY-MM-DD). -- UUIDs are used as identifiers for most entities. 
+- **get_data_inventory()** — Complete overview of projects, connections, table groups, test suites, and latest run stats. START HERE. +- **list_projects()** — List all project codes and names. +- **list_test_suites(project_code)** — List test suites with run stats for a project. +- **get_recent_test_runs(project_code, test_suite?, limit?)** — Get recent test runs with pass/fail counts (default 5). +- **get_test_results(test_run_id, status?, table_name?, test_type?, limit?)** — Get individual test results with filters. +- **get_test_result_history(test_definition_id, limit?)** — Historical results for a test definition across runs (measure, threshold, status over time). +- **get_failure_summary(test_run_id, group_by?)** — Failures grouped by test_type, table, or column. +- **get_test_type(test_type)** — Detailed info about a test type (what it checks, thresholds, DQ dimension). + +## Resources (2) + +- **testgen://test-types** — Reference table of all active test types. +- **testgen://glossary** — Entity hierarchy, result statuses, DQ dimensions, test scopes. + +## Workflow + +1. ALWAYS start with `get_data_inventory` to understand the landscape. +2. Drill into specific runs with `get_recent_test_runs` and `get_test_results`. +3. DO NOT assume what a test type checks. Look it up in `testgen://test-types` or via `get_test_type`. +4. Use `get_failure_summary` to understand failure patterns, then `get_test_type` for each category. +5. Use `get_test_result_history` to see how a specific test's measure and status changed over time. +6. Reference `testgen://glossary` for definitions of statuses, dimensions, and scopes. + +## Conventions + +- UUIDs are used as identifiers — pass them as strings. +- Dates are in ISO 8601 format. +- Test results with disposition 'Dismissed' or 'Inactive' are excluded from counts by default. 
""" @@ -42,20 +62,14 @@ async def verify_token(self, token: str) -> AccessToken | None: return None -@with_database_session -def ping() -> dict: - """Check server connectivity and return version information.""" - version_data = version_service.get_version() - return { - "status": "ok", - "edition": version_data.edition, - "version": version_data.current, - } - - def run_mcp() -> None: """Start the MCP server with streamable HTTP transport.""" from testgen.mcp import get_server_url + from testgen.mcp.prompts.workflows import compare_runs, health_check, investigate_failures, table_health + from testgen.mcp.tools.discovery import get_data_inventory, list_projects, list_test_suites + from testgen.mcp.tools.reference import get_test_type, glossary_resource, test_types_resource + from testgen.mcp.tools.test_results import get_failure_summary, get_test_result_history, get_test_results + from testgen.mcp.tools.test_runs import get_recent_test_runs from testgen.utils.plugins import discover for plugin in discover(): @@ -74,7 +88,26 @@ def run_mcp() -> None: ), token_verifier=JWTTokenVerifier(), ) - mcp.tool()(ping) + + # Tools (8) + mcp.tool()(get_data_inventory) + mcp.tool()(list_projects) + mcp.tool()(list_test_suites) + mcp.tool()(get_recent_test_runs) + mcp.tool()(get_test_results) + mcp.tool()(get_test_result_history) + mcp.tool()(get_failure_summary) + mcp.tool()(get_test_type) + + # Resources (2) + mcp.resource("testgen://test-types")(test_types_resource) + mcp.resource("testgen://glossary")(glossary_resource) + + # Prompts (4) + mcp.prompt()(health_check) + mcp.prompt()(investigate_failures) + mcp.prompt()(table_health) + mcp.prompt()(compare_runs) LOG.info("Starting MCP server on %s:%s (auth issuer: %s)", settings.MCP_HOST, settings.MCP_PORT, server_url) mcp.run(transport="streamable-http") diff --git a/testgen/mcp/services/__init__.py b/testgen/mcp/services/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/testgen/mcp/services/inventory_service.py b/testgen/mcp/services/inventory_service.py new file mode 100644 index 00000000..6f1551a6 --- /dev/null +++ b/testgen/mcp/services/inventory_service.py @@ -0,0 +1,191 @@ +from sqlalchemy import and_, select + +from testgen.common.models import get_current_session +from testgen.common.models.connection import Connection +from testgen.common.models.data_table import DataTable +from testgen.common.models.project import Project +from testgen.common.models.table_group import TableGroup +from testgen.common.models.test_run import TestRun +from testgen.common.models.test_suite import TestSuite + + +def get_inventory() -> str: + """Build a markdown inventory of all projects, connections, table groups, and test suites.""" + session = get_current_session() + + query = ( + select( + Project.project_code, + Project.project_name, + Connection.connection_id, + Connection.connection_name, + Connection.sql_flavor_code, + TableGroup.id.label("table_group_id"), + TableGroup.table_groups_name, + TableGroup.table_group_schema, + TestSuite.id.label("test_suite_id"), + TestSuite.test_suite, + TestRun.id.label("last_run_id"), + TestRun.test_starttime, + TestRun.status.label("last_run_status"), + TestRun.test_ct, + TestRun.passed_ct, + TestRun.failed_ct, + TestRun.warning_ct, + ) + .outerjoin(Connection, Connection.project_code == Project.project_code) + .outerjoin(TableGroup, TableGroup.connection_id == Connection.connection_id) + .outerjoin( + TestSuite, + and_( + TestSuite.table_groups_id == TableGroup.id, + TestSuite.is_monitor.isnot(True), + ), + ) + .outerjoin(TestRun, TestRun.id == TestSuite.last_complete_test_run_id) + .order_by(Project.project_name, Connection.connection_name, TableGroup.table_groups_name, TestSuite.test_suite) + ) + + rows = session.execute(query).all() + + # Collect table stats per project + project_codes = {row.project_code for row in rows if row.project_code} + stats_by_group = {} + for project_code in 
project_codes: + for stat in TableGroup.select_stats(project_code): + stats_by_group[stat.id] = stat + + # Collect table names per group (first 100) + group_ids = {row.table_group_id for row in rows if row.table_group_id} + tables_by_group: dict = {} + for gid in group_ids: + table_names = DataTable.select_table_names(gid, limit=100) + if table_names: + tables_by_group[gid] = table_names + + # Build nested structure + projects: dict[str, dict] = {} + total_suites = 0 + + for row in rows: + proj = projects.setdefault(row.project_code, { + "name": row.project_name, + "connections": {}, + }) + if row.connection_id is None: + continue + + conn = proj["connections"].setdefault(row.connection_id, { + "name": row.connection_name, + "flavor": row.sql_flavor_code, + "groups": {}, + }) + if row.table_group_id is None: + continue + + group = conn["groups"].setdefault(row.table_group_id, { + "name": row.table_groups_name, + "schema": row.table_group_schema, + "suites": [], + }) + if row.test_suite_id is not None: + total_suites += 1 + group["suites"].append({ + "id": str(row.test_suite_id), + "name": row.test_suite, + "last_run_id": str(row.last_run_id) if row.last_run_id else None, + "last_run_time": str(row.test_starttime) if row.test_starttime else None, + "last_run_status": row.last_run_status, + "test_ct": row.test_ct, + "passed_ct": row.passed_ct, + "failed_ct": row.failed_ct, + "warning_ct": row.warning_ct, + }) + + # Compact mode for large inventories + compact_suites = total_suites > 20 + total_groups = sum( + len(conn["groups"]) + for proj in projects.values() + for conn in proj["connections"].values() + ) + compact_groups = total_groups > 50 + + # Format as Markdown + lines = ["# Data Inventory\n"] + + for project_code, proj in projects.items(): + lines.append(f"## Project: {proj['name']} (`{project_code}`)\n") + + if not proj["connections"]: + lines.append("_No connections configured._\n") + continue + + for _conn_id, conn in proj["connections"].items(): + 
lines.append(f"### Connection: {conn['name']} ({conn['flavor']})\n") + + if not conn["groups"]: + lines.append("_No table groups._\n") + continue + + for group_id, group in conn["groups"].items(): + stat = stats_by_group.get(group_id) + table_ct = stat.table_ct if stat and stat.table_ct else 0 + column_ct = stat.column_ct if stat and stat.column_ct else 0 + group_tables = tables_by_group.get(group_id, []) + + if compact_groups: + lines.append( + f"- **{group['name']}** (schema: `{group['schema']}`, " + f"{table_ct} tables, {column_ct} columns, " + f"{len(group['suites'])} test suites)" + ) + continue + + lines.append( + f"#### Table Group: {group['name']} (schema: `{group['schema']}`, " + f"{table_ct} tables, {column_ct} columns)\n" + ) + + if group_tables: + tables_str = ", ".join(f"`{t}`" for t in group_tables) + if table_ct and table_ct > 100: + tables_str += f", ... ({table_ct - 100} more)" + lines.append(f"Tables: {tables_str}\n") + + if not group["suites"]: + lines.append("_No test suites._\n") + continue + + for suite in group["suites"]: + if compact_suites: + status_icon = "" + if suite["last_run_status"] == "Complete": + if suite["failed_ct"]: + status_icon = " [FAILURES]" + else: + status_icon = " [OK]" + lines.append(f"- **{suite['name']}** (`{suite['id']}`){status_icon}") + else: + lines.append(f"**Test Suite: {suite['name']}** (id: `{suite['id']}`)") + if suite["last_run_id"]: + lines.append(f" - Last run: `{suite['last_run_id']}` ({suite['last_run_status']})") + lines.append(f" - Time: {suite['last_run_time']}") + lines.append( + f" - Results: {suite['test_ct']} tests, " + f"{suite['passed_ct']} passed, " + f"{suite['failed_ct']} failed, " + f"{suite['warning_ct']} warnings" + ) + else: + lines.append(" - _No completed runs._") + lines.append("") + + lines.append("") + + lines.append( + "---\n" + "For test type definitions, read the `testgen://test-types` resource or call `get_test_type`." 
+ ) + + return "\n".join(lines) diff --git a/testgen/mcp/tools/__init__.py b/testgen/mcp/tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/testgen/mcp/tools/discovery.py b/testgen/mcp/tools/discovery.py new file mode 100644 index 00000000..d9bb872d --- /dev/null +++ b/testgen/mcp/tools/discovery.py @@ -0,0 +1,72 @@ +from testgen.common.models import with_database_session +from testgen.common.models.project import Project +from testgen.common.models.test_suite import TestSuite + + +@with_database_session +def get_data_inventory() -> str: + """Get a complete inventory of all projects, connections, table groups, and test suites with their latest run status. + + This is the recommended starting point for understanding the data quality landscape. + Returns a structured markdown overview of the entire TestGen configuration. + """ + from testgen.mcp.services.inventory_service import get_inventory + + return get_inventory() + + +@with_database_session +def list_projects() -> str: + """List all configured projects. + + Returns project codes and names. Use these to scope queries to specific projects. + """ + projects = Project.select_where() + + if not projects: + return "No projects found." + + lines = ["# Projects\n"] + for project in projects: + lines.append(f"- **{project.project_name}** (`{project.project_code}`)") + + return "\n".join(lines) + + +@with_database_session +def list_test_suites(project_code: str) -> str: + """List all test suites for a project with their latest run statistics. + + Args: + project_code: The project code to list test suites for. + """ + summaries = TestSuite.select_summary(project_code) + + if not summaries: + return f"No test suites found for project `{project_code}`." 
+ + lines = [f"# Test Suites for `{project_code}`\n"] + for s in summaries: + lines.append(f"## {s.test_suite} (id: `{s.id}`)") + lines.append(f"- Connection: {s.connection_name}") + lines.append(f"- Table Group: {s.table_groups_name}") + if s.test_suite_description: + lines.append(f"- Description: {s.test_suite_description}") + lines.append(f"- Test definitions: {s.test_ct or 0}") + + if s.latest_run_id: + lines.append(f"- Latest run: `{s.latest_run_id}` ({s.latest_run_start})") + lines.append( + f" - {s.last_run_test_ct or 0} tests: " + f"{s.last_run_passed_ct or 0} passed, " + f"{s.last_run_failed_ct or 0} failed, " + f"{s.last_run_warning_ct or 0} warnings, " + f"{s.last_run_error_ct or 0} errors" + ) + if s.last_run_dismissed_ct: + lines.append(f" - {s.last_run_dismissed_ct} dismissed") + else: + lines.append("- _No completed runs._") + lines.append("") + + return "\n".join(lines) diff --git a/testgen/mcp/tools/reference.py b/testgen/mcp/tools/reference.py new file mode 100644 index 00000000..19954c85 --- /dev/null +++ b/testgen/mcp/tools/reference.py @@ -0,0 +1,119 @@ +from testgen.common.models import with_database_session +from testgen.common.models.test_definition import TestType + + +@with_database_session +def get_test_type(test_type: str) -> str: + """Get detailed information about a specific test type. + + Args: + test_type: The test type code (e.g., 'Alpha_Trunc', 'Unique_Pct'). + """ + tt = TestType.get(test_type) + + if not tt: + return f"Test type `{test_type}` not found." 
+ + lines = [ + f"# Test Type: {tt.test_type}\n", + f"- **Name:** {tt.test_name_short}", + ] + if tt.test_name_long: + lines.append(f"- **Full Name:** {tt.test_name_long}") + if tt.test_description: + lines.append(f"- **Description:** {tt.test_description}") + if tt.measure_uom: + lines.append(f"- **Measure UOM:** {tt.measure_uom}") + if tt.measure_uom_description: + lines.append(f"- **Measure Description:** {tt.measure_uom_description}") + if tt.threshold_description: + lines.append(f"- **Threshold:** {tt.threshold_description}") + if tt.dq_dimension: + lines.append(f"- **DQ Dimension:** {tt.dq_dimension}") + if tt.test_scope: + lines.append(f"- **Scope:** {tt.test_scope}") + if tt.except_message: + lines.append(f"- **Exception Message:** {tt.except_message}") + if tt.usage_notes: + lines.append(f"- **Usage Notes:** {tt.usage_notes}") + + return "\n".join(lines) + + +@with_database_session +def test_types_resource() -> str: + """Reference table of all active test types with their descriptions and DQ dimensions.""" + test_types = TestType.select_where(TestType.active == "Y") + + if not test_types: + return "No active test types found." + + lines = [ + "# TestGen Test Types Reference\n", + "| Test Type | Name | DQ Dimension | Scope | Description |", + "|---|---|---|---|---|", + ] + + for tt in test_types: + desc = (tt.test_description or "")[:80] + lines.append( + f"| {tt.test_type} | {tt.test_name_short or ''} | " + f"{tt.dq_dimension or ''} | {tt.test_scope or ''} | {desc} |" + ) + + return "\n".join(lines) + + +def glossary_resource() -> str: + """Glossary of TestGen concepts, entity hierarchy, result statuses, and DQ dimensions.""" + return """\ +# TestGen Glossary + +## Entity Hierarchy + +- **Project** — Top-level organizational unit. Contains connections and test suites. +- **Connection** — Database connection configuration (host, credentials, flavor). +- **Table Group** — A set of tables within a schema that are profiled and tested together. 
+- **Test Suite** — A collection of test definitions scoped to a table group. +- **Test Definition** — A configured test with parameters, thresholds, and target table/column. +- **Test Run** — An execution of a test suite producing test results. +- **Test Result** — The outcome of a single test definition within a test run. + +## Result Statuses + +- **Passed** — Test passed within acceptable thresholds. +- **Warning** — Test exceeded the warning threshold. +- **Failed** — Test exceeded the failure threshold. Higher severity. +- **Error** — Test could not execute (e.g., SQL error, missing table). +- **Log** — Informational result, not scored. + +## Disposition + +Disposition is a user-assigned review status for test results: +- **Confirmed** (default) — Result is valid and counts toward scoring. +- **Dismissed** — Result reviewed and dismissed (excluded from scoring). +- **Inactive** — Test was deactivated after this result (excluded from scoring). + +## DQ Dimensions + +- **Accuracy** — Data values are correct and reflect real-world truth. +- **Completeness** — Required data is present (no unexpected NULLs or blanks). +- **Consistency** — Data agrees across columns, tables, or systems. +- **Timeliness** — Data is current and updated within expected windows. +- **Uniqueness** — No unintended duplicates exist. +- **Validity** — Data conforms to expected formats, ranges, or patterns. + +## Test Scopes + +- **column** — Tests a single column (e.g., null rate, pattern match). +- **table** — Tests table-level properties (e.g., row count, freshness). +- **referential** — Tests relationships between tables (e.g., foreign key match). +- **custom** — User-defined SQL tests. + +## Monitor Types + +- **Volume_Trend** — Tracks row count changes over time using statistical prediction. +- **Freshness_Trend** — Detects when a table has not been updated as expected. +- **Schema_Drift** — Detects column additions, deletions, or type changes. 
+- **Metric_Trend** — Tracks changes in user-defined metrics over time. +""" diff --git a/testgen/mcp/tools/test_results.py b/testgen/mcp/tools/test_results.py new file mode 100644 index 00000000..64eec2e6 --- /dev/null +++ b/testgen/mcp/tools/test_results.py @@ -0,0 +1,167 @@ +from uuid import UUID + +from testgen.common.models import with_database_session +from testgen.common.models.test_result import TestResult, TestResultStatus + + +def _parse_uuid(value: str, label: str = "ID") -> UUID: + try: + return UUID(value) + except (ValueError, AttributeError) as err: + raise ValueError(f"Invalid {label}: `{value}` is not a valid UUID.") from err + + +def _parse_status(value: str) -> TestResultStatus: + try: + return TestResultStatus(value) + except ValueError as err: + valid = ", ".join(s.value for s in TestResultStatus) + raise ValueError(f"Invalid status `{value}`. Valid values: {valid}") from err + + +@with_database_session +def get_test_results( + test_run_id: str, + status: str | None = None, + table_name: str | None = None, + test_type: str | None = None, + limit: int = 50, +) -> str: + """Get individual test results for a test run, with optional filters. + + Args: + test_run_id: The UUID of the test run. + status: Filter by result status (Passed, Failed, Warning, Error, Log). + table_name: Filter by table name. + test_type: Filter by test type code. + limit: Maximum number of results to return (default 50). 
+ """ + run_uuid = _parse_uuid(test_run_id, "test_run_id") + status_enum = _parse_status(status) if status else None + + results = TestResult.select_results( + test_run_id=run_uuid, + status=status_enum, + table_name=table_name, + test_type=test_type, + limit=limit, + ) + + if not results: + filters = [] + if status: + filters.append(f"status={status}") + if table_name: + filters.append(f"table={table_name}") + if test_type: + filters.append(f"type={test_type}") + filter_str = f" (filters: {', '.join(filters)})" if filters else "" + return f"No test results found for run `{test_run_id}`{filter_str}." + + lines = [f"# Test Results for run `{test_run_id}`\n"] + lines.append(f"Showing {len(results)} result(s).\n") + + for r in results: + status_str = r.status.value if r.status else "Unknown" + lines.append(f"## [{status_str}] {r.test_type} on `{r.table_name}`") + lines.append(f"- Test definition: `{r.test_definition_id}`") + if r.column_names: + lines.append(f"- Column: `{r.column_names}`") + if r.result_measure is not None: + lines.append(f"- Measured value: {r.result_measure}") + if r.threshold_value is not None: + lines.append(f"- Threshold: {r.threshold_value}") + if r.message: + lines.append(f"- Message: {r.message}") + lines.append("") + + return "\n".join(lines) + + +@with_database_session +def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: + """Get a summary of test failures (Failed and Warning) grouped by test type, table name, or column. + + Args: + test_run_id: The UUID of the test run. + group_by: Group failures by 'test_type', 'table', or 'column' (default: 'test_type'). 
+ """ + run_uuid = _parse_uuid(test_run_id, "test_run_id") + + # Map public param names to model field names + model_group_map = {"table": "table_name", "column": "column_names"} + model_group_by = model_group_map.get(group_by, group_by) + failures = TestResult.select_failures(test_run_id=run_uuid, group_by=model_group_by) + + if not failures: + return f"No confirmed failures found for run `{test_run_id}`." + + total = sum(row[-1] for row in failures) + group_label = {"test_type": "Test Type", "table": "Table Name", "column": "Column"}[group_by] + + lines = [ + f"# Failure Summary for run `{test_run_id}`\n", + f"**Total confirmed failures (Failed + Warning):** {total}\n", + f"| {group_label} | Count |", + "|---|---|", + ] + + for row in failures: + count = row[-1] + if group_by == "column": + # Row is (table_name, column_names, count) + table, column = row[0], row[1] + label = f"{table}.{column}" if column else f"{table} (table-level)" + else: + label = row[0] + lines.append(f"| {label} | {count} |") + + if group_by == "test_type": + lines.append( + "\nCheck `testgen://test-types` to understand what each test type checks " + "and `get_test_type(test_type='...')` to fetch more details." + ) + + return "\n".join(lines) + + +@with_database_session +def get_test_result_history( + test_definition_id: str, + limit: int = 20, +) -> str: + """Get the historical results of a specific test definition across runs, showing how measure and status changed over time. + + Args: + test_definition_id: The UUID of the test definition (from get_test_results output). + limit: Maximum number of historical results to return (default 20). + """ + def_uuid = _parse_uuid(test_definition_id, "test_definition_id") + results = TestResult.select_history(test_definition_id=def_uuid, limit=limit) + + if not results: + return f"No historical results found for test definition `{test_definition_id}`." 
+ + first = results[0] + lines = [ + "# Test Result History\n", + f"- **Test Type:** {first.test_type}", + f"- **Table:** `{first.table_name}`", + ] + if first.column_names: + lines.append(f"- **Column:** `{first.column_names}`") + + lines.extend([ + f"\nShowing {len(results)} result(s), newest first.\n", + "| Date | Measure | Threshold | Status |", + "|---|---|---|---|", + ]) + + for r in results: + date_str = str(r.test_time) if r.test_time else "—" + measure = r.result_measure if r.result_measure is not None else "—" + threshold = r.threshold_value if r.threshold_value is not None else "—" + status_str = r.status.value if r.status else "—" + lines.append(f"| {date_str} | {measure} | {threshold} | {status_str} |") + + return "\n".join(lines) diff --git a/testgen/mcp/tools/test_runs.py b/testgen/mcp/tools/test_runs.py new file mode 100644 index 00000000..a0524424 --- /dev/null +++ b/testgen/mcp/tools/test_runs.py @@ -0,0 +1,65 @@ +from testgen.common.models import with_database_session +from testgen.common.models.test_run import TestRun +from testgen.common.models.test_suite import TestSuite + + +@with_database_session +def get_recent_test_runs(project_code: str, test_suite: str | None = None, limit: int = 5) -> str: + """Get recent completed test runs for a project, optionally filtered by test suite name. + + Args: + project_code: The project code to query. + test_suite: Optional test suite name to filter by. + limit: Maximum number of runs to return (default 5). + """ + test_suite_id = None + if test_suite: + suites = TestSuite.select_minimal_where( + TestSuite.project_code == project_code, + TestSuite.test_suite == test_suite, + ) + if not suites: + return f"Test suite `{test_suite}` not found in project `{project_code}`." 
+ test_suite_id = str(suites[0].id) + + summaries = TestRun.select_summary(project_code=project_code, test_suite_id=test_suite_id) + + if not summaries: + scope = f" for suite `{test_suite}`" if test_suite else "" + return f"No completed test runs found in project `{project_code}`{scope}." + + runs = summaries[:limit] + + lines = [f"# Recent Test Runs for `{project_code}`\n"] + if test_suite: + lines[0] = f"# Recent Test Runs for `{project_code}` / `{test_suite}`\n" + lines.append(f"Showing {len(runs)} of {len(summaries)} run(s).\n") + + for run in runs: + passed = run.passed_ct or 0 + failed = run.failed_ct or 0 + warning = run.warning_ct or 0 + errors = run.error_ct or 0 + + status_hint = "" + if failed or warning: + status_hint = f" **[{failed}F/{warning}W]**" + elif run.status == "Complete": + status_hint = " [OK]" + + lines.append(f"## {run.test_suite} — {run.status}{status_hint}") + lines.append(f"- **Run ID:** `{run.test_run_id}`") + lines.append(f"- **Started:** {run.test_starttime} | **Ended:** {run.test_endtime}") + lines.append(f"- **Results:** {run.test_ct or 0} tests — {passed} passed, {failed} failed, {warning} warnings, {errors} errors") + + if run.dismissed_ct: + lines.append(f"- **Dismissed:** {run.dismissed_ct}") + + if run.dq_score_testing is not None: + lines.append(f"- **DQ Score:** {run.dq_score_testing:.1f}") + + lines.append("") + + lines.append("Use `get_test_results(test_run_id='...')` for detailed results of a specific run.") + + return "\n".join(lines) diff --git a/tests/unit/mcp/test_inventory_service.py b/tests/unit/mcp/test_inventory_service.py new file mode 100644 index 00000000..d393c820 --- /dev/null +++ b/tests/unit/mcp/test_inventory_service.py @@ -0,0 +1,177 @@ +from unittest.mock import MagicMock, patch +from uuid import uuid4 + +import pytest + + +@pytest.fixture +def session_mock(): + with patch("testgen.mcp.services.inventory_service.get_current_session") as mock: + yield mock.return_value + + +def 
_make_row(project_code="demo", project_name="Demo", connection_id=1, connection_name="main", + sql_flavor_code="postgresql", table_group_id=None, table_groups_name="core", + table_group_schema="public", test_suite_id=None, test_suite="Quality", + last_run_id=None, test_starttime=None, last_run_status=None, + test_ct=None, passed_ct=None, failed_ct=None, warning_ct=None): + row = MagicMock() + row.project_code = project_code + row.project_name = project_name + row.connection_id = connection_id + row.connection_name = connection_name + row.sql_flavor_code = sql_flavor_code + row.table_group_id = table_group_id or uuid4() + row.table_groups_name = table_groups_name + row.table_group_schema = table_group_schema + row.test_suite_id = test_suite_id or uuid4() + row.test_suite = test_suite + row.last_run_id = last_run_id or uuid4() + row.test_starttime = test_starttime or "2024-01-15T10:00:00" + row.last_run_status = last_run_status or "Complete" + row.test_ct = test_ct if test_ct is not None else 50 + row.passed_ct = passed_ct if passed_ct is not None else 47 + row.failed_ct = failed_ct if failed_ct is not None else 2 + row.warning_ct = warning_ct if warning_ct is not None else 1 + return row + + +@patch("testgen.mcp.services.inventory_service.DataTable") +@patch("testgen.mcp.services.inventory_service.TableGroup") +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_basic(mock_select, mock_tg, mock_dt, session_mock): + tg_id = uuid4() + row = _make_row(table_group_id=tg_id) + session_mock.execute.return_value.all.return_value = [row] + + stat = MagicMock() + stat.id = tg_id + stat.table_ct = 10 + stat.column_ct = 50 + mock_tg.select_stats.return_value = [stat] + mock_dt.select_table_names.return_value = ["customers", "orders", "products"] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory() + + assert "Data Inventory" in result + assert "Demo" in result + assert "main" in result + assert "core" in 
result + assert "Quality" in result + assert "10 tables" in result + assert "`customers`" in result + assert "`orders`" in result + + +@patch("testgen.mcp.services.inventory_service.DataTable") +@patch("testgen.mcp.services.inventory_service.TableGroup") +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_empty(mock_select, mock_tg, mock_dt, session_mock): + session_mock.execute.return_value.all.return_value = [] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory() + + assert "Data Inventory" in result + + +@patch("testgen.mcp.services.inventory_service.DataTable") +@patch("testgen.mcp.services.inventory_service.TableGroup") +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_project_no_connections(mock_select, mock_tg, mock_dt, session_mock): + row = _make_row(connection_id=None) + session_mock.execute.return_value.all.return_value = [row] + mock_tg.select_stats.return_value = [] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory() + + assert "Demo" in result + assert "No connections" in result + + +@patch("testgen.mcp.services.inventory_service.DataTable") +@patch("testgen.mcp.services.inventory_service.TableGroup") +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_includes_test_type_hint(mock_select, mock_tg, mock_dt, session_mock): + session_mock.execute.return_value.all.return_value = [_make_row()] + stat = MagicMock() + stat.id = uuid4() + stat.table_ct = 5 + stat.column_ct = 20 + mock_tg.select_stats.return_value = [stat] + mock_dt.select_table_names.return_value = [] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory() + + assert "test-types" in result + + +@patch("testgen.mcp.services.inventory_service.DataTable") +@patch("testgen.mcp.services.inventory_service.TableGroup") +@patch("testgen.mcp.services.inventory_service.select") +def 
test_get_inventory_compact_suites(mock_select, mock_tg, mock_dt, session_mock): + """When >20 suites, suite output uses compact format (name + status icon only).""" + tg_id = uuid4() + rows = [ + _make_row( + table_group_id=tg_id, + test_suite=f"Suite_{i}", + test_suite_id=uuid4(), + failed_ct=1 if i == 0 else 0, + warning_ct=0, + ) + for i in range(25) + ] + session_mock.execute.return_value.all.return_value = rows + + stat = MagicMock() + stat.id = tg_id + stat.table_ct = 10 + stat.column_ct = 50 + mock_tg.select_stats.return_value = [stat] + mock_dt.select_table_names.return_value = ["t1"] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory() + + # Compact suites: show "[FAILURES]" / "[OK]" badges, no full run details + assert "[FAILURES]" in result + assert "[OK]" in result + # Full format markers should NOT appear + assert "Last run:" not in result + + +@patch("testgen.mcp.services.inventory_service.DataTable") +@patch("testgen.mcp.services.inventory_service.TableGroup") +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_compact_groups(mock_select, mock_tg, mock_dt, session_mock): + """When >50 groups, group output uses single-line compact format.""" + rows = [ + _make_row( + table_group_id=uuid4(), + table_groups_name=f"Group_{i}", + test_suite=f"Suite_{i}", + test_suite_id=uuid4(), + ) + for i in range(55) + ] + session_mock.execute.return_value.all.return_value = rows + + mock_tg.select_stats.return_value = [] + mock_dt.select_table_names.return_value = [] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory() + + # Compact groups: single line with "X test suites", no "#### Table Group:" headers + assert "test suites)" in result + assert "#### Table Group:" not in result diff --git a/tests/unit/mcp/test_model_data_table.py b/tests/unit/mcp/test_model_data_table.py new file mode 100644 index 00000000..0f9f10e2 --- /dev/null +++ 
b/tests/unit/mcp/test_model_data_table.py @@ -0,0 +1,41 @@ +from unittest.mock import patch +from uuid import uuid4 + +from testgen.common.models.data_table import DataTable + + +@patch("testgen.common.models.data_table.get_current_session") +def test_select_table_names_returns_list(session_mock): + session_mock.return_value.scalars.return_value.all.return_value = ["customers", "orders", "products"] + + result = DataTable.select_table_names(table_groups_id=uuid4()) + + assert result == ["customers", "orders", "products"] + session_mock.return_value.scalars.assert_called_once() + + +@patch("testgen.common.models.data_table.get_current_session") +def test_select_table_names_empty(session_mock): + session_mock.return_value.scalars.return_value.all.return_value = [] + + result = DataTable.select_table_names(table_groups_id=uuid4()) + + assert result == [] + + +@patch("testgen.common.models.data_table.get_current_session") +def test_count_tables(session_mock): + session_mock.return_value.scalar.return_value = 42 + + result = DataTable.count_tables(table_groups_id=uuid4()) + + assert result == 42 + + +@patch("testgen.common.models.data_table.get_current_session") +def test_count_tables_none_returns_zero(session_mock): + session_mock.return_value.scalar.return_value = None + + result = DataTable.count_tables(table_groups_id=uuid4()) + + assert result == 0 diff --git a/tests/unit/mcp/test_model_test_result.py b/tests/unit/mcp/test_model_test_result.py new file mode 100644 index 00000000..d32714aa --- /dev/null +++ b/tests/unit/mcp/test_model_test_result.py @@ -0,0 +1,103 @@ +from unittest.mock import MagicMock, patch +from uuid import uuid4 + +import pytest + +from testgen.common.models.test_result import TestResult, TestResultStatus + + +@pytest.fixture +def session_mock(): + with patch("testgen.common.models.test_result.get_current_session") as mock: + yield mock.return_value + + +def test_select_results_basic(session_mock): + mock_results = [MagicMock(spec=TestResult)] 
+ session_mock.scalars.return_value.all.return_value = mock_results + + results = TestResult.select_results(test_run_id=uuid4()) + + assert results == mock_results + session_mock.scalars.assert_called_once() + + +def test_select_results_with_status_filter(session_mock): + session_mock.scalars.return_value.all.return_value = [] + + results = TestResult.select_results(test_run_id=uuid4(), status=TestResultStatus.Failed) + + assert results == [] + + +def test_select_results_with_all_filters(session_mock): + session_mock.scalars.return_value.all.return_value = [] + + results = TestResult.select_results( + test_run_id=uuid4(), + status=TestResultStatus.Passed, + table_name="orders", + test_type="Alpha_Trunc", + limit=10, + ) + + assert results == [] + + +def test_select_failures_by_test_type(session_mock): + session_mock.execute.return_value.all.return_value = [ + ("Alpha_Trunc", 5), + ("Unique_Pct", 3), + ] + + results = TestResult.select_failures(test_run_id=uuid4(), group_by="test_type") + + assert len(results) == 2 + assert results[0] == ("Alpha_Trunc", 5) + + +def test_select_failures_by_table_name(session_mock): + session_mock.execute.return_value.all.return_value = [("orders", 8)] + + results = TestResult.select_failures(test_run_id=uuid4(), group_by="table_name") + + assert results[0] == ("orders", 8) + + +def test_select_failures_by_column_names(session_mock): + session_mock.execute.return_value.all.return_value = [("orders", "customer_name", 4)] + + results = TestResult.select_failures(test_run_id=uuid4(), group_by="column_names") + + assert results[0] == ("orders", "customer_name", 4) + + +def test_select_failures_invalid_group_by(): + with pytest.raises(ValueError, match="group_by must be one of"): + TestResult.select_failures(test_run_id=uuid4(), group_by="invalid_column") + + +def test_select_failures_empty(session_mock): + session_mock.execute.return_value.all.return_value = [] + + results = TestResult.select_failures(test_run_id=uuid4()) + + assert 
results == [] + + +def test_select_history_basic(session_mock): + mock_results = [MagicMock(spec=TestResult), MagicMock(spec=TestResult)] + session_mock.scalars.return_value.all.return_value = mock_results + + results = TestResult.select_history(test_definition_id=uuid4()) + + assert results == mock_results + session_mock.scalars.assert_called_once() + + +def test_select_history_empty(session_mock): + session_mock.scalars.return_value.all.return_value = [] + + results = TestResult.select_history(test_definition_id=uuid4(), limit=10) + + assert results == [] diff --git a/tests/unit/mcp/test_tools_discovery.py b/tests/unit/mcp/test_tools_discovery.py new file mode 100644 index 00000000..a82cffa6 --- /dev/null +++ b/tests/unit/mcp/test_tools_discovery.py @@ -0,0 +1,83 @@ +from unittest.mock import MagicMock, patch +from uuid import uuid4 + + +@patch("testgen.mcp.services.inventory_service.get_inventory") +def test_get_data_inventory_returns_markdown(mock_get_inventory, db_session_mock): + mock_get_inventory.return_value = "# Data Inventory\n\n## Project: Demo" + + from testgen.mcp.tools.discovery import get_data_inventory + + result = get_data_inventory() + + assert "Data Inventory" in result + mock_get_inventory.assert_called_once() + + +@patch("testgen.mcp.tools.discovery.Project") +def test_list_projects_returns_formatted(mock_project, db_session_mock): + proj1 = MagicMock() + proj1.project_name = "Demo Project" + proj1.project_code = "demo" + proj2 = MagicMock() + proj2.project_name = "Staging" + proj2.project_code = "staging" + mock_project.select_where.return_value = [proj1, proj2] + + from testgen.mcp.tools.discovery import list_projects + + result = list_projects() + + assert "Demo Project" in result + assert "`demo`" in result + assert "Staging" in result + + +@patch("testgen.mcp.tools.discovery.Project") +def test_list_projects_empty(mock_project, db_session_mock): + mock_project.select_where.return_value = [] + + from testgen.mcp.tools.discovery import 
list_projects + + result = list_projects() + + assert "No projects found" in result + + +@patch("testgen.mcp.tools.discovery.TestSuite") +def test_list_test_suites_returns_stats(mock_suite, db_session_mock): + summary = MagicMock() + summary.id = uuid4() + summary.test_suite = "Quality Suite" + summary.connection_name = "main_conn" + summary.table_groups_name = "core_tables" + summary.test_suite_description = "Main quality checks" + summary.test_ct = 50 + summary.latest_run_id = uuid4() + summary.latest_run_start = "2024-01-15T10:00:00" + summary.last_run_test_ct = 50 + summary.last_run_passed_ct = 45 + summary.last_run_failed_ct = 3 + summary.last_run_warning_ct = 2 + summary.last_run_error_ct = 0 + summary.last_run_dismissed_ct = 0 + mock_suite.select_summary.return_value = [summary] + + from testgen.mcp.tools.discovery import list_test_suites + + result = list_test_suites("demo") + + assert "Quality Suite" in result + assert "45 passed" in result + assert "3 failed" in result + + +@patch("testgen.mcp.tools.discovery.TestSuite") +def test_list_test_suites_empty(mock_suite, db_session_mock): + mock_suite.select_summary.return_value = [] + + from testgen.mcp.tools.discovery import list_test_suites + + result = list_test_suites("nonexistent") + + assert "No test suites found" in result diff --git a/tests/unit/mcp/test_tools_reference.py b/tests/unit/mcp/test_tools_reference.py new file mode 100644 index 00000000..84d3f9e8 --- /dev/null +++ b/tests/unit/mcp/test_tools_reference.py @@ -0,0 +1,88 @@ +from unittest.mock import MagicMock, patch + + +@patch("testgen.mcp.tools.reference.TestType") +def test_get_test_type_found(mock_tt_cls, db_session_mock): + tt = MagicMock() + tt.test_type = "Alpha_Trunc" + tt.test_name_short = "Alpha Truncation" + tt.test_name_long = "Alphabetic Truncation Test" + tt.test_description = "Checks for truncated alphabetic values" + tt.measure_uom = "Pct" + tt.measure_uom_description = "Percentage of truncated values" + 
tt.threshold_description = "Maximum allowed truncation rate" + tt.dq_dimension = "Accuracy" + tt.test_scope = "column" + tt.except_message = "Alpha truncation detected" + tt.usage_notes = "Best for VARCHAR columns" + mock_tt_cls.get.return_value = tt + + from testgen.mcp.tools.reference import get_test_type + + result = get_test_type("Alpha_Trunc") + + assert "Alpha Truncation" in result + assert "Accuracy" in result + assert "column" in result + assert "truncated" in result.lower() + + +@patch("testgen.mcp.tools.reference.TestType") +def test_get_test_type_not_found(mock_tt_cls, db_session_mock): + mock_tt_cls.get.return_value = None + + from testgen.mcp.tools.reference import get_test_type + + result = get_test_type("Nonexistent_Type") + + assert "not found" in result + + +@patch("testgen.mcp.tools.reference.TestType") +def test_test_types_resource(mock_tt_cls, db_session_mock): + tt1 = MagicMock() + tt1.test_type = "Alpha_Trunc" + tt1.test_name_short = "Alpha Truncation" + tt1.dq_dimension = "Accuracy" + tt1.test_scope = "column" + tt1.test_description = "Checks truncation" + tt2 = MagicMock() + tt2.test_type = "Unique_Pct" + tt2.test_name_short = "Unique Percent" + tt2.dq_dimension = "Uniqueness" + tt2.test_scope = "column" + tt2.test_description = "Checks unique percentage" + mock_tt_cls.select_where.return_value = [tt1, tt2] + + from testgen.mcp.tools.reference import test_types_resource + + result = test_types_resource() + + assert "Alpha_Trunc" in result + assert "Unique_Pct" in result + assert "Accuracy" in result + assert "Uniqueness" in result + + +@patch("testgen.mcp.tools.reference.TestType") +def test_test_types_resource_empty(mock_tt_cls, db_session_mock): + mock_tt_cls.select_where.return_value = [] + + from testgen.mcp.tools.reference import test_types_resource + + result = test_types_resource() + + assert "No active test types" in result + + +def test_glossary_resource(): + from testgen.mcp.tools.reference import glossary_resource + + result = 
glossary_resource() + + assert "Entity Hierarchy" in result + assert "Result Statuses" in result + assert "DQ Dimensions" in result + assert "Test Scopes" in result + assert "Monitor Types" in result + assert "Disposition" in result diff --git a/tests/unit/mcp/test_tools_test_results.py b/tests/unit/mcp/test_tools_test_results.py new file mode 100644 index 00000000..f6325831 --- /dev/null +++ b/tests/unit/mcp/test_tools_test_results.py @@ -0,0 +1,179 @@ +from unittest.mock import MagicMock, patch +from uuid import uuid4 + +import pytest + +from testgen.common.models.test_result import TestResultStatus + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_test_results_basic(mock_result, db_session_mock): + run_id = str(uuid4()) + r1 = MagicMock() + r1.status = TestResultStatus.Failed + r1.test_type = "Alpha_Trunc" + r1.table_name = "orders" + r1.column_names = "customer_name" + r1.result_measure = "15.3" + r1.threshold_value = "10.0" + r1.message = "Truncation detected" + mock_result.select_results.return_value = [r1] + + from testgen.mcp.tools.test_results import get_test_results + + result = get_test_results(run_id) + + assert "Alpha_Trunc" in result + assert "orders" in result + assert "15.3" in result + assert "Truncation detected" in result + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_test_results_empty(mock_result, db_session_mock): + mock_result.select_results.return_value = [] + + from testgen.mcp.tools.test_results import get_test_results + + result = get_test_results(str(uuid4())) + + assert "No test results found" in result + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_test_results_with_filters(mock_result, db_session_mock): + mock_result.select_results.return_value = [] + + from testgen.mcp.tools.test_results import get_test_results + + result = get_test_results(str(uuid4()), status="Failed", table_name="orders", test_type="Alpha_Trunc") + + assert "status=Failed" in result + assert 
"table=orders" in result + + +def test_get_test_results_invalid_uuid(db_session_mock): + from testgen.mcp.tools.test_results import get_test_results + + with pytest.raises(ValueError, match="not a valid UUID"): + get_test_results("not-a-uuid") + + +def test_get_test_results_invalid_status(db_session_mock): + from testgen.mcp.tools.test_results import get_test_results + + with pytest.raises(ValueError, match="Invalid status"): + get_test_results(str(uuid4()), status="BadStatus") + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_failure_summary_by_test_type(mock_result, db_session_mock): + mock_result.select_failures.return_value = [ + ("Alpha_Trunc", 5), + ("Unique_Pct", 3), + ] + + from testgen.mcp.tools.test_results import get_failure_summary + + result = get_failure_summary(str(uuid4())) + + assert "Failed + Warning" in result + assert "8" in result + assert "Alpha_Trunc" in result + assert "get_test_type" in result + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_failure_summary_empty(mock_result, db_session_mock): + mock_result.select_failures.return_value = [] + + from testgen.mcp.tools.test_results import get_failure_summary + + result = get_failure_summary(str(uuid4())) + + assert "No confirmed failures" in result + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_failure_summary_by_table(mock_result, db_session_mock): + mock_result.select_failures.return_value = [("orders", 10)] + + from testgen.mcp.tools.test_results import get_failure_summary + + result = get_failure_summary(str(uuid4()), group_by="table") + + assert "Table Name" in result + assert "orders" in result + assert "get_test_type" not in result + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_failure_summary_by_column(mock_result, db_session_mock): + mock_result.select_failures.return_value = [("orders", "total_value", 34), ("orders", None, 2)] + + from testgen.mcp.tools.test_results import get_failure_summary 
+ + result = get_failure_summary(str(uuid4()), group_by="column") + + assert "Column" in result + assert "orders.total_value" in result + assert "orders (table-level)" in result + assert "get_test_type" not in result + + +def test_get_failure_summary_invalid_uuid(db_session_mock): + from testgen.mcp.tools.test_results import get_failure_summary + + with pytest.raises(ValueError, match="not a valid UUID"): + get_failure_summary("bad-uuid") + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_test_result_history_basic(mock_result, db_session_mock): + def_id = str(uuid4()) + r1 = MagicMock() + r1.test_type = "Unique_Pct" + r1.table_name = "orders" + r1.column_names = "order_id" + r1.test_time = "2024-01-15T10:00:00" + r1.result_measure = "99.5" + r1.threshold_value = "95.0" + r1.status = TestResultStatus.Passed + r2 = MagicMock() + r2.test_type = "Unique_Pct" + r2.table_name = "orders" + r2.column_names = "order_id" + r2.test_time = "2024-01-10T10:00:00" + r2.result_measure = "88.0" + r2.threshold_value = "95.0" + r2.status = TestResultStatus.Failed + mock_result.select_history.return_value = [r1, r2] + + from testgen.mcp.tools.test_results import get_test_result_history + + result = get_test_result_history(def_id) + + assert "Unique_Pct" in result + assert "orders" in result + assert "99.5" in result + assert "88.0" in result + assert "Passed" in result + assert "Failed" in result + + +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_test_result_history_empty(mock_result, db_session_mock): + mock_result.select_history.return_value = [] + + from testgen.mcp.tools.test_results import get_test_result_history + + result = get_test_result_history(str(uuid4())) + + assert "No historical results" in result + + +def test_get_test_result_history_invalid_uuid(db_session_mock): + from testgen.mcp.tools.test_results import get_test_result_history + + with pytest.raises(ValueError, match="not a valid UUID"): + get_test_result_history("bad-uuid") 
diff --git a/tests/unit/mcp/test_tools_test_runs.py b/tests/unit/mcp/test_tools_test_runs.py new file mode 100644 index 00000000..e22b2c44 --- /dev/null +++ b/tests/unit/mcp/test_tools_test_runs.py @@ -0,0 +1,97 @@ +from unittest.mock import MagicMock, patch +from uuid import uuid4 + + +def _make_run_summary(**overrides): + defaults = { + "test_run_id": uuid4(), "test_suite": "Quality Suite", "project_name": "Demo", + "table_groups_name": "core_tables", "status": "Complete", + "test_starttime": "2024-01-15T10:00:00", "test_endtime": "2024-01-15T10:05:00", + "test_ct": 50, "passed_ct": 45, "failed_ct": 3, "warning_ct": 2, "error_ct": 0, + "log_ct": 0, "dismissed_ct": 0, "dq_score_testing": 92.5, + } + defaults.update(overrides) + return MagicMock(**defaults) + + +@patch("testgen.mcp.tools.test_runs.TestRun") +@patch("testgen.mcp.tools.test_runs.TestSuite") +def test_get_recent_test_runs_default_limit(mock_suite, mock_run, db_session_mock): + runs = [_make_run_summary(test_run_id=uuid4()) for _ in range(7)] + mock_run.select_summary.return_value = runs + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("demo") + + assert "Showing 5 of 7" in result + assert "Quality Suite" in result + assert "92.5" in result + mock_run.select_summary.assert_called_once_with(project_code="demo", test_suite_id=None) + + +@patch("testgen.mcp.tools.test_runs.TestRun") +@patch("testgen.mcp.tools.test_runs.TestSuite") +def test_get_recent_test_runs_custom_limit(mock_suite, mock_run, db_session_mock): + runs = [_make_run_summary() for _ in range(3)] + mock_run.select_summary.return_value = runs + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("demo", limit=10) + + assert "Showing 3 of 3" in result + + +@patch("testgen.mcp.tools.test_runs.TestRun") +@patch("testgen.mcp.tools.test_runs.TestSuite") +def test_get_recent_test_runs_with_suite_name(mock_suite, mock_run, db_session_mock): + suite_id = 
uuid4() + suite_minimal = MagicMock() + suite_minimal.id = suite_id + mock_suite.select_minimal_where.return_value = [suite_minimal] + mock_run.select_summary.return_value = [_make_run_summary(test_suite="My Suite")] + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("demo", test_suite="My Suite") + + mock_run.select_summary.assert_called_once_with(project_code="demo", test_suite_id=str(suite_id)) + assert "My Suite" in result + + +@patch("testgen.mcp.tools.test_runs.TestRun") +@patch("testgen.mcp.tools.test_runs.TestSuite") +def test_get_recent_test_runs_suite_not_found(mock_suite, mock_run, db_session_mock): + mock_suite.select_minimal_where.return_value = [] + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("demo", test_suite="Nonexistent") + + assert "not found" in result + mock_run.select_summary.assert_not_called() + + +@patch("testgen.mcp.tools.test_runs.TestRun") +@patch("testgen.mcp.tools.test_runs.TestSuite") +def test_get_recent_test_runs_no_runs(mock_suite, mock_run, db_session_mock): + mock_run.select_summary.return_value = [] + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("demo") + + assert "No completed test runs" in result + + +@patch("testgen.mcp.tools.test_runs.TestRun") +@patch("testgen.mcp.tools.test_runs.TestSuite") +def test_get_recent_test_runs_shows_failure_counts(mock_suite, mock_run, db_session_mock): + mock_run.select_summary.return_value = [_make_run_summary(failed_ct=5, warning_ct=2)] + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("demo") + + assert "5F/2W" in result From c73dfcc1affd01558da6f6b9cabb1dd67916f82c Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 25 Feb 2026 22:17:15 -0300 Subject: [PATCH 11/95] refactor(mcp): rewrite server instructions and handle plugin load errors Simplify MCP instructions to focus on data model and 
navigation. Gracefully skip plugins that fail to load in MCP context (e.g. when Streamlit v2 component registration is unavailable). Co-Authored-By: Claude Opus 4.6 --- testgen/mcp/server.py | 47 ++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/testgen/mcp/server.py b/testgen/mcp/server.py index 4e724c89..c7096eb3 100644 --- a/testgen/mcp/server.py +++ b/testgen/mcp/server.py @@ -11,38 +11,28 @@ LOG = logging.getLogger("testgen") SERVER_INSTRUCTIONS = """\ -You are connected to a TestGen data quality testing server. +TestGen is a data quality platform that profiles databases, generates tests, and monitors tables. -## Available Tools (8) +DATA MODEL -- **get_data_inventory()** — Complete overview of projects, connections, table groups, test suites, and latest run stats. START HERE. -- **list_projects()** — List all project codes and names. -- **list_test_suites(project_code)** — List test suites with run stats for a project. -- **get_recent_test_runs(project_code, test_suite?, limit?)** — Get recent test runs with pass/fail counts (default 5). -- **get_test_results(test_run_id, status?, table_name?, test_type?, limit?)** — Get individual test results with filters. -- **get_test_result_history(test_definition_id, limit?)** — Historical results for a test definition across runs (measure, threshold, status over time). -- **get_failure_summary(test_run_id, group_by?)** — Failures grouped by test_type, table, or column. -- **get_test_type(test_type)** — Detailed info about a test type (what it checks, thresholds, DQ dimension). +Projects contain Connections (to target databases) and Table Groups (sets of tables to profile and test together). +Table Groups contain Test Suites — collections of Test Definitions with configured thresholds. +Test Runs execute a Test Suite and produce Test Results (one per Test Definition). 
+Profiling Runs scan a Table Group and produce column-level statistics and detect data hygiene issues. +Monitors track table health over time: freshness, volume, schema changes, and custom metrics. -## Resources (2) +NAVIGATION -- **testgen://test-types** — Reference table of all active test types. -- **testgen://glossary** — Entity hierarchy, result statuses, DQ dimensions, test scopes. +Tools return entity IDs that feed into other tools. Start with get_data_inventory for broad discovery, then drill +into specific entities. -## Workflow +Test types have specific, non-obvious meanings (e.g., Alpha_Trunc). Do not guess what a test checks. +ALWAYS look them up using either the `testgen://test-types` resource or the `get_test_type()` tool. -1. ALWAYS start with `get_data_inventory` to understand the landscape. -2. Drill into specific runs with `get_recent_test_runs` and `get_test_results`. -3. DO NOT assume what a test type checks. Look at `testgen://test-types` -4. Use `get_failure_summary` to understand failure patterns, then `get_test_type` for each category. -5. Use `get_test_result_history` to see how a specific test's measure and status changed over time. -6. Reference `testgen://glossary` for definitions of statuses, dimensions, and scopes. - -## Conventions - -- UUIDs are used as identifiers — pass them as strings. -- Dates are in ISO 8601 format. -- Test results with disposition 'Dismissed' or 'Inactive' are excluded from counts by default. +CONVENTIONS +- Identifiers are UUIDs passed as strings. +- Dates are ISO 8601 format. +- Test results with disposition Dismissed or Inactive are excluded from counts and scores. 
""" @@ -73,7 +63,10 @@ def run_mcp() -> None: from testgen.utils.plugins import discover for plugin in discover(): - plugin.load() + try: + plugin.load() + except Exception: + LOG.debug("Plugin %s skipped (not loadable in MCP context)", plugin.package) server_url = with_database_session(get_server_url)() From ea6b600a34ad2a8250f67e33d2faa4e44b6f5bb2 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 26 Feb 2026 02:06:24 -0500 Subject: [PATCH 12/95] ci(docker): generate third-party-notices file --- .dockerignore | 1 + deploy/generate_third_party_notices.py | 278 +++++++++++++++++++++++++ deploy/testgen.dockerfile | 7 + pyproject.toml | 1 + 4 files changed, 287 insertions(+) create mode 100644 deploy/generate_third_party_notices.py diff --git a/.dockerignore b/.dockerignore index 9b49ae35..6e8cbf5f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,3 +17,4 @@ venv/ .ruff_cache/ deploy !deploy/install_*.sh +!deploy/generate_third_party_notices.py diff --git a/deploy/generate_third_party_notices.py b/deploy/generate_third_party_notices.py new file mode 100644 index 00000000..8930e591 --- /dev/null +++ b/deploy/generate_third_party_notices.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +"""Generate THIRD-PARTY-NOTICES from installed Python packages. + +Runs pip-licenses to collect metadata, filters out dev/internal packages, +and outputs a formatted notices file with summary table and per-package details. + +Usage: + python generate_third_party_notices.py [--output PATH] +""" + +import argparse +import json +import re +import subprocess +import sys +from datetime import date +from pathlib import Path + +# Packages installed temporarily during Docker build — never in pyproject.toml. +_BUILD_ONLY = {"pip-licenses", "prettytable"} + +# Internal DK packages not discoverable from pyproject.toml structure. +_EXTRA_INTERNAL = {"requests-extensions", "requests_extensions"} + +# Packages whose license is reported as UNKNOWN by pip-licenses (keys are normalized). 
+LICENSE_OVERRIDES = { + "google-crc32c": "Apache-2.0", + "streamlit-camera-input-live": "MIT", + "streamlit-embedcode": "MIT", + "streamlit-keyup": "MIT", + "streamlit-toggle-switch": "MIT", + "streamlit-vertical-slider": "MIT", + "streamlit-faker": "Apache-2.0", +} + + +def _normalize(name: str) -> str: + """Normalize package name per PEP 503 (lowercase, hyphens/underscores/dots → hyphen).""" + return re.sub(r"[-_.]+", "-", name).lower() + + +def _parse_pkg_name(requirement: str) -> str: + """Extract normalized package name from a PEP 508 requirement string.""" + raw = re.split(r"[><=!~\[;@\s]", requirement, maxsplit=1)[0].strip() + return _normalize(raw) + + +def _load_pyproject(path: Path) -> dict: + if sys.version_info >= (3, 11): + import tomllib + else: + import tomli as tomllib # type: ignore[no-redef] + with open(path, "rb") as f: + return tomllib.load(f) + + +def _find_pyprojects(repo_root: Path) -> list[Path]: + """Return pyproject.toml paths for root, submodule, and plugins.""" + candidates = [repo_root / "pyproject.toml", repo_root / "testgen" / "pyproject.toml"] + for plugins_dir in [repo_root / "plugins", repo_root / "testgen" / "plugins"]: + if plugins_dir.is_dir(): + candidates.extend(sorted(plugins_dir.glob("*/pyproject.toml"))) + return [p for p in candidates if p.exists()] + + +def _resolve_transitive(names: set[str]) -> set[str]: + """Expand a set of normalized package names to include all their transitive dependencies.""" + from importlib.metadata import requires, PackageNotFoundError + + resolved: set[str] = set() + queue = list(names) + while queue: + name = queue.pop() + norm = _normalize(name) + if norm in resolved: + continue + resolved.add(norm) + try: + reqs = requires(name) or [] + except PackageNotFoundError: + try: + reqs = requires(norm) or [] + except PackageNotFoundError: + continue + for req in reqs: + if "; extra ==" in req or "; " in req: + continue + dep_name = _parse_pkg_name(req) + if dep_name and dep_name not in resolved: + 
queue.append(dep_name) + return resolved + + +def _build_exclude_sets(repo_root: Path) -> tuple[set[str], set[str]]: + """Read pyproject.toml files to build dev-only and internal package sets.""" + dev_direct: set[str] = set(_BUILD_ONLY) + internal: set[str] = set(_EXTRA_INTERNAL) + + for pyproject_path in _find_pyprojects(repo_root): + data = _load_pyproject(pyproject_path) + + project_name = data.get("project", {}).get("name") + if project_name: + internal.add(project_name) + + for deps in data.get("project", {}).get("optional-dependencies", {}).values(): + for dep in deps: + dev_direct.add(_parse_pkg_name(dep)) + + # Expand dev deps transitively, then subtract anything reachable from the main + # package. This keeps shared deps (e.g. requests, urllib3) in the runtime set. + dev_all = _resolve_transitive(dev_direct) + runtime_all = _resolve_transitive(internal) + dev_only = dev_all - runtime_all + return dev_only, internal + + +def _find_repo_root() -> Path: + """Walk up from this script to find the repo root (contains pyproject.toml with 'testgen' subdir).""" + # Script lives at /testgen/deploy/ or is called from repo root + script_dir = Path(__file__).resolve().parent + for candidate in [script_dir.parent.parent, script_dir.parent, Path.cwd()]: + if (candidate / "pyproject.toml").exists() and (candidate / "testgen" / "pyproject.toml").exists(): + return candidate + # Fallback: just use empty sets (Docker build context may not have root pyproject.toml) + return script_dir + + +def normalize_license(name: str, lic: str) -> str: + if _normalize(name) in LICENSE_OVERRIDES: + return LICENSE_OVERRIDES[_normalize(name)] + if not lic or lic == "UNKNOWN": + return "UNKNOWN" + if "Apache" in lic and len(lic) > 50: + return "Apache-2.0" + return lic + + +def extract_copyright(license_text: str) -> str | None: + if not license_text: + return None + lines: list[str] = [] + seen: set[str] = set() + for line in license_text.split("\n"): + stripped = line.strip() + if 
re.match(r"(?i)copyright\s", stripped) and stripped not in seen: + lines.append(stripped) + seen.add(stripped) + return "\n".join(lines) if lines else None + + +def get_packages() -> list[dict]: + result = subprocess.run( + [ + sys.executable, "-m", "piplicenses", + "--format=json", + "--with-urls", + "--with-license-file", + "--with-notice-file", + "--no-license-path", + ], + capture_output=True, + text=True, + check=True, + ) + return json.loads(result.stdout) + + +def generate(packages: list[dict], dev_only: set[str], internal: set[str]) -> str: + runtime = [ + pkg for pkg in packages + if _normalize(pkg["Name"]) not in internal and _normalize(pkg["Name"]) not in dev_only + ] + runtime.sort(key=lambda p: p["Name"].lower()) + + lines: list[str] = [] + + # Header + lines.append("THIRD-PARTY SOFTWARE NOTICES AND INFORMATION") + lines.append("=" * 60) + lines.append("") + lines.append("DataOps TestGen Enterprise") + lines.append(f"Copyright (c) {date.today().year} DataKitchen, Inc.") + lines.append("") + lines.append("This product includes software developed by third parties.") + lines.append("The following sets forth attribution notices for third-party") + lines.append("software that may be contained in portions of this product.") + lines.append("") + lines.append(f"Generated: {date.today().isoformat()}") + lines.append(f"Runtime dependencies: {len(runtime)}") + lines.append("") + lines.append("") + + # Summary table + lines.append("-" * 60) + lines.append("SUMMARY") + lines.append("-" * 60) + lines.append("") + lines.append(f"{'Package':<40s} {'Version':<16s} {'License'}") + lines.append(f"{'-' * 40} {'-' * 16} {'-' * 30}") + for pkg in runtime: + lic = normalize_license(pkg["Name"], pkg["License"]) + lines.append(f"{pkg['Name']:<40s} {pkg['Version']:<16s} {lic}") + + lines.append("") + lines.append("") + + # Detailed notices + lines.append("-" * 60) + lines.append("DETAILED NOTICES") + lines.append("-" * 60) + + for pkg in runtime: + name = pkg["Name"] + version 
= pkg["Version"] + lic = normalize_license(name, pkg["License"]) + url = pkg.get("URL", "") + license_text = pkg.get("LicenseText", "") + notice_text = pkg.get("NoticeText", "") + + lines.append("") + lines.append("=" * 60) + lines.append(f"{name} {version}") + lines.append(f"License: {lic}") + if url and url != "UNKNOWN": + lines.append(f"URL: {url}") + lines.append("=" * 60) + + copyright_line = extract_copyright(license_text) + if copyright_line: + lines.append("") + lines.append(copyright_line) + + if notice_text and notice_text.strip() and notice_text.strip() != "UNKNOWN": + lines.append("") + lines.append("NOTICE:") + lines.append(notice_text.strip()) + + if license_text and license_text.strip() and license_text.strip() != "UNKNOWN": + text = license_text.strip() + # Abbreviate long Apache 2.0 boilerplate to the standard short form + if len(text) > 3000 and "apache" in text.lower(): + lines.append("") + lines.append("Licensed under the Apache License, Version 2.0.") + lines.append("You may obtain a copy of the License at") + lines.append("") + lines.append(" http://www.apache.org/licenses/LICENSE-2.0") + lines.append("") + lines.append("Unless required by applicable law or agreed to in writing,") + lines.append("software distributed under the License is distributed on an") + lines.append('"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.') + else: + lines.append("") + lines.append(text) + + lines.append("") + return "\n".join(lines) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate THIRD-PARTY-NOTICES") + parser.add_argument("--output", default=None, help="Output file path (default: stdout)") + args = parser.parse_args() + + repo_root = _find_repo_root() + dev_only, internal = _build_exclude_sets(repo_root) + packages = get_packages() + content = generate(packages, dev_only, internal) + + if args.output: + with open(args.output, "w") as f: + f.write(content) + else: + print(content) + + +if __name__ == "__main__": + 
main() diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index 5c4bb933..856804fb 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -15,6 +15,13 @@ RUN apk upgrade # Now install everything COPY . /tmp/dk/ RUN python3 -m pip install --prefix=/dk /tmp/dk + +# Generate third-party license notices from installed packages +RUN pip install pip-licenses \ + && SCRIPT=$(find /tmp/dk -name generate_third_party_notices.py | head -1) \ + && PYTHONPATH=/dk/lib/python3.12/site-packages python3 "$SCRIPT" --output /dk/THIRD-PARTY-NOTICES \ + && pip uninstall -y pip-licenses + RUN rm -Rf /tmp/dk RUN tg-patch-streamlit diff --git a/pyproject.toml b/pyproject.toml index bcea6dd2..9d0fcc74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -211,6 +211,7 @@ exclude = [ "_build", "build", "dist", + "deploy", ] [tool.ruff.lint] From 653362465d55c8260a2406e37749ed3ccc17c0e0 Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 5 Feb 2026 19:02:55 -0400 Subject: [PATCH 13/95] feat(projects): add a project membership to handle user role in a project --- pyproject.toml | 4 + testgen/common/models/project.py | 31 +++- testgen/common/models/project_membership.py | 82 +++++++++ testgen/common/models/test_run.py | 2 + testgen/common/models/user.py | 24 ++- testgen/common/notifications/profiling_run.py | 17 +- testgen/common/notifications/score_drop.py | 4 +- testgen/common/notifications/test_run.py | 6 +- .../030_initialize_new_schema_structure.sql | 25 ++- .../040_populate_new_schema_project.sql | 26 ++- .../dbupgrade/0174_incremental_upgrade.sql | 60 +++++++ testgen/ui/app.py | 8 +- testgen/ui/auth.py | 22 ++- .../frontend/js/components/score_issues.js | 9 +- .../frontend/js/components/select.js | 2 + .../frontend/js/data_profiling/data_issues.js | 4 +- .../js/data_profiling/data_profiling_utils.js | 1 + .../frontend/js/pages/data_catalog.js | 1 + .../frontend/js/pages/profiling_runs.js | 7 +- .../frontend/js/pages/project_dashboard.js | 27 +-- 
.../frontend/js/pages/quality_dashboard.js | 2 +- .../frontend/js/pages/score_details.js | 6 +- .../frontend/js/pages/score_explorer.js | 2 +- .../components/frontend/js/pages/test_runs.js | 5 +- .../frontend/js/pages/test_suites.js | 4 +- .../ui/components/frontend/js/streamlit.js | 6 +- .../components/frontend/standalone/.gitkeep | 0 .../standalone/project_settings/index.js | 164 ++++++++++++++++++ testgen/ui/components/widgets/__init__.py | 6 + testgen/ui/components/widgets/sidebar.py | 2 +- testgen/ui/pdf/hygiene_issue_report.py | 2 +- testgen/ui/pdf/test_result_report.py | 2 +- testgen/ui/queries/profiling_queries.py | 19 +- testgen/ui/static/css/shared.css | 36 ++++ testgen/ui/static/js/components/card.js | 23 ++- .../ui/static/js/components/score_issues.js | 9 +- testgen/ui/static/js/form_validators.js | 21 +++ testgen/ui/static/js/streamlit.js | 6 +- testgen/ui/views/hygiene_issues.py | 7 + testgen/ui/views/profiling_results.py | 7 + testgen/ui/views/project_settings.py | 151 ++++++---------- testgen/ui/views/score_details.py | 7 + testgen/ui/views/score_explorer.py | 8 + testgen/ui/views/test_definitions.py | 10 +- testgen/ui/views/test_results.py | 7 + .../test_profiling_run_notifications.py | 6 +- .../test_score_drop_notifications.py | 2 +- .../test_test_run_notifications.py | 4 +- 48 files changed, 705 insertions(+), 181 deletions(-) create mode 100644 testgen/common/models/project_membership.py create mode 100644 testgen/template/dbupgrade/0174_incremental_upgrade.sql create mode 100644 testgen/ui/components/frontend/standalone/.gitkeep create mode 100644 testgen/ui/components/frontend/standalone/project_settings/index.js diff --git a/pyproject.toml b/pyproject.toml index bd54765c..b3b88fb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -299,3 +299,7 @@ asset_dir = "ui/components/frontend/js" [[tool.streamlit.component.components]] name = "edit_table_monitors" asset_dir = "ui/components/frontend/js" + +[[tool.streamlit.component.components]] +name 
= "project_settings" +asset_dir = "ui/components/frontend/standalone/project_settings" diff --git a/testgen/common/models/project.py b/testgen/common/models/project.py index e39daecd..eedfb13f 100644 --- a/testgen/common/models/project.py +++ b/testgen/common/models/project.py @@ -2,13 +2,15 @@ from uuid import UUID, uuid4 import streamlit as st -from sqlalchemy import Column, String, asc, func, text +from sqlalchemy import Column, String, asc, func, select, text from sqlalchemy.dialects import postgresql from testgen.common.models import get_current_session from testgen.common.models.connection import Connection from testgen.common.models.custom_types import NullIfEmptyString -from testgen.common.models.entity import Entity, EntityMinimal +from testgen.common.models.entity import ENTITY_HASH_FUNCS, Entity, EntityMinimal +from testgen.common.models.project_membership import ProjectMembership +from testgen.common.models.user import User @dataclass @@ -24,6 +26,12 @@ class ProjectSummary(EntityMinimal): can_export_to_observability: bool +@dataclass +class ProjectMember(EntityMinimal): + user: User + membership: ProjectMembership + + class Project(Entity): __tablename__ = "projects" @@ -99,3 +107,22 @@ def cascade_delete(cls, ids: list[str]) -> bool: def clear_cache(cls) -> bool: super().clear_cache() cls.get_summary.clear() + cls.get_project_members.clear() + + @classmethod + @st.cache_data(show_spinner=False, hash_funcs=ENTITY_HASH_FUNCS) + def get_project_members( + cls, + project_code: str, + *filters, + _order_by: tuple = (asc(func.lower(User.username)),), + ) -> list[ProjectMember]: + """Get all users who have access to this project.""" + query = ( + select(User, ProjectMembership) + .join(ProjectMembership, User.id == ProjectMembership.user_id) + .where(ProjectMembership.project_code == project_code, *filters) + .order_by(*_order_by) + ) + rows = get_current_session().execute(query).all() + return [ProjectMember(user=user, membership=membership) for user, 
membership in rows] diff --git a/testgen/common/models/project_membership.py b/testgen/common/models/project_membership.py new file mode 100644 index 00000000..94bcad5e --- /dev/null +++ b/testgen/common/models/project_membership.py @@ -0,0 +1,82 @@ +from datetime import datetime +from typing import Literal, Self +from uuid import UUID, uuid4 + +import streamlit as st +from sqlalchemy import Column, ForeignKey, String, asc, select +from sqlalchemy.dialects import postgresql + +from testgen.common.models import get_current_session +from testgen.common.models.entity import Entity + +RoleType = Literal["admin", "data_quality", "analyst", "business", "catalog"] + + +class ProjectMembership(Entity): + __tablename__ = "project_memberships" + + id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True, default=uuid4) + user_id: UUID = Column( + postgresql.UUID(as_uuid=True), + ForeignKey("auth_users.id", ondelete="CASCADE"), + nullable=False, + ) + project_code: str = Column( + String, + ForeignKey("projects.project_code", ondelete="CASCADE"), + nullable=False, + ) + role: "RoleType" = Column(String, nullable=False) + created_at: datetime = Column(postgresql.TIMESTAMP, default=datetime.utcnow) + + _get_by = "id" + _default_order_by = (asc(project_code),) + + @classmethod + @st.cache_data(show_spinner=False) + def get_by_user_and_project(cls, user_id: UUID, project_code: str) -> Self | None: + """Get a specific membership for a user in a project.""" + query = select(cls).where( + cls.user_id == user_id, + cls.project_code == project_code, + ) + return get_current_session().scalars(query).first() + + @classmethod + @st.cache_data(show_spinner=False) + def get_projects_for_user(cls, user_id: UUID) -> list[str]: + """Get all project codes a user has access to.""" + query = select(cls.project_code).where(cls.user_id == user_id) + return list(get_current_session().scalars(query).all()) + + @classmethod + @st.cache_data(show_spinner=False) + def 
get_memberships_for_user(cls, user_id: UUID) -> list[Self]: + """Get all memberships for a user.""" + return list(cls.select_where(cls.user_id == user_id)) + + @classmethod + @st.cache_data(show_spinner=False) + def get_memberships_for_project(cls, project_code: str) -> list[Self]: + """Get all memberships for a project.""" + return list(cls.select_where(cls.project_code == project_code)) + + @classmethod + def user_has_project_access(cls, user_id: UUID, project_code: str) -> bool: + """Check if a user has any access to a project.""" + membership = cls.get_by_user_and_project(user_id, project_code) + return membership is not None + + @classmethod + def get_user_role_in_project(cls, user_id: UUID, project_code: str) -> "RoleType | None": + """Get the user's role within a specific project.""" + membership = cls.get_by_user_and_project(user_id, project_code) + return membership.role if membership else None + + @classmethod + def clear_cache(cls) -> None: + super().clear_cache() + cls.get_by_user_and_project.clear() + cls.get_projects_for_user.clear() + cls.get_memberships_for_user.clear() + cls.get_memberships_for_project.clear() diff --git a/testgen/common/models/test_run.py b/testgen/common/models/test_run.py index 3709328a..be2348d9 100644 --- a/testgen/common/models/test_run.py +++ b/testgen/common/models/test_run.py @@ -49,6 +49,7 @@ class TestRunSummary(EntityMinimal): test_endtime: datetime table_groups_name: str test_suite: str + project_code: str project_name: str status: TestRunStatus progress: list[ProgressStep] @@ -238,6 +239,7 @@ def select_summary( test_runs.test_endtime, table_groups.table_groups_name, test_suites.test_suite, + test_suites.project_code, projects.project_name, test_runs.status, test_runs.progress, diff --git a/testgen/common/models/user.py b/testgen/common/models/user.py index cc6f57c6..e65bc02b 100644 --- a/testgen/common/models/user.py +++ b/testgen/common/models/user.py @@ -1,16 +1,15 @@ from datetime import UTC, datetime -from typing 
import Literal, Self +from typing import Self from uuid import UUID, uuid4 import streamlit as st -from sqlalchemy import Column, String, asc, func, select, update +from sqlalchemy import Boolean, Column, String, asc, func, select, update from sqlalchemy.dialects import postgresql from testgen.common.models import get_current_session from testgen.common.models.custom_types import NullIfEmptyString from testgen.common.models.entity import Entity - -RoleType = Literal["admin", "data_quality", "analyst", "business", "catalog"] +from testgen.common.models.project_membership import RoleType class User(Entity): @@ -21,7 +20,7 @@ class User(Entity): email: str = Column(NullIfEmptyString) name: str = Column(NullIfEmptyString) password: str = Column(String) - role: RoleType = Column(String) + is_global_admin: bool = Column(Boolean, nullable=False, default=False) latest_login: datetime = Column(postgresql.TIMESTAMP) _get_by = "username" @@ -49,3 +48,18 @@ def save(self, update_latest_login: bool = False) -> None: def get(cls, identifier: str) -> Self | None: query = select(cls).where(func.lower(User.username) == func.lower(identifier)) return get_current_session().scalars(query).first() + + def get_accessible_projects(self) -> list[str]: + """Get all projects this user can access.""" + from testgen.common.models.project_membership import ProjectMembership + return ProjectMembership.get_projects_for_user(self.id) + + def get_role_in_project(self, project_code: str) -> RoleType | None: + """Get this user's role in a specific project.""" + from testgen.common.models.project_membership import ProjectMembership + return ProjectMembership.get_user_role_in_project(self.id, project_code) + + def has_project_access(self, project_code: str) -> bool: + """Check if user has access to a project.""" + from testgen.common.models.project_membership import ProjectMembership + return ProjectMembership.user_has_project_access(self.id, project_code) diff --git 
a/testgen/common/notifications/profiling_run.py b/testgen/common/notifications/profiling_run.py index 91e06092..c1731bac 100644 --- a/testgen/common/notifications/profiling_run.py +++ b/testgen/common/notifications/profiling_run.py @@ -258,7 +258,13 @@ def send_profiling_run_notifications(profiling_run: ProfilingRun, result_list_ct return profiling_run_issues_url = "".join( - (PersistedSetting.get("BASE_URL", ""), "/profiling-runs:hygiene?run_id=", str(profiling_run.id), "&source=email") + ( + PersistedSetting.get("BASE_URL", ""), + "/profiling-runs:hygiene?project_code=", + str(profiling_run.project_code), + "&run_id=", str(profiling_run.id), + "&source=email" + ) ) hygiene_issues_summary = [] @@ -304,7 +310,14 @@ def send_profiling_run_notifications(profiling_run: ProfilingRun, result_list_ct "id": str(profiling_run.id), "issues_url": profiling_run_issues_url, "results_url": "".join( - (PersistedSetting.get("BASE_URL", ""), "/profiling-runs:results?run_id=", str(profiling_run.id), "&source=email") + ( + PersistedSetting.get("BASE_URL", ""), + "/profiling-runs:results?project_code=", + str(profiling_run.project_code), + "&run_id=", + str(profiling_run.id), + "&source=email" + ) ), "start_time": profiling_run.profiling_starttime, "end_time": profiling_run.profiling_endtime, diff --git a/testgen/common/notifications/score_drop.py b/testgen/common/notifications/score_drop.py index 1bf33d87..e16f4bf4 100644 --- a/testgen/common/notifications/score_drop.py +++ b/testgen/common/notifications/score_drop.py @@ -178,7 +178,9 @@ def send_score_drop_notifications(notification_data: list[tuple[ScoreDefinition, "scorecard_url": "".join( ( PersistedSetting.get("BASE_URL", ""), - "/quality-dashboard:score-details?definition_id=", + "/quality-dashboard:score-details?project_code=", + str(definition.project_code), + "&definition_id=", str(definition.id), "&source=email", ) diff --git a/testgen/common/notifications/test_run.py b/testgen/common/notifications/test_run.py index 
8452cd39..36535e6b 100644 --- a/testgen/common/notifications/test_run.py +++ b/testgen/common/notifications/test_run.py @@ -324,9 +324,11 @@ def send_test_run_notifications(test_run: TestRun, result_list_ct=20, result_sta test_run_url = "".join( ( PersistedSetting.get("BASE_URL", ""), - "/test-runs:results?run_id=", + "/test-runs:results?project_code=", + str(tr_summary.project_code), + "&run_id=", str(test_run.id), - "&source=email", + "&source=email" ) ) diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index d55ba76a..b7b0c0a8 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -620,7 +620,7 @@ CREATE TABLE auth_users ( email VARCHAR(256), name VARCHAR(256), password VARCHAR(120), - role VARCHAR(20), + is_global_admin BOOLEAN NOT NULL DEFAULT FALSE, latest_login TIMESTAMP ); @@ -628,6 +628,29 @@ ALTER TABLE auth_users ADD CONSTRAINT unique_username UNIQUE (username); +CREATE TABLE project_memberships ( + id UUID DEFAULT gen_random_uuid() + CONSTRAINT pk_project_memberships_id + PRIMARY KEY, + user_id UUID NOT NULL + CONSTRAINT fk_project_memberships_auth_users + REFERENCES auth_users(id) + ON DELETE CASCADE, + project_code VARCHAR(30) NOT NULL + CONSTRAINT fk_project_memberships_projects + REFERENCES projects(project_code) + ON DELETE CASCADE, + role VARCHAR(20) NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + + CONSTRAINT uq_project_memberships_user_project + UNIQUE (user_id, project_code) +); + +CREATE INDEX ix_pm_user_id ON project_memberships(user_id); +CREATE INDEX ix_pm_project_code ON project_memberships(project_code); +CREATE INDEX ix_pm_role ON project_memberships(role); + CREATE TABLE tg_revision ( component VARCHAR(50) NOT NULL CONSTRAINT tg_revision_component_pk diff --git a/testgen/template/dbsetup/040_populate_new_schema_project.sql 
b/testgen/template/dbsetup/040_populate_new_schema_project.sql index c6c959f5..cbfa8220 100644 --- a/testgen/template/dbsetup/040_populate_new_schema_project.sql +++ b/testgen/template/dbsetup/040_populate_new_schema_project.sql @@ -7,11 +7,21 @@ SELECT '{PROJECT_CODE}' as project_code, '{OBSERVABILITY_API_KEY}' as observability_api_key, '{OBSERVABILITY_API_URL}' as observability_api_url; -INSERT INTO auth_users - (username, email, name, password, role) -SELECT - '{UI_USER_USERNAME}' as username, - '{UI_USER_EMAIL}' as email, - '{UI_USER_NAME}' as name, - '{UI_USER_ENCRYPTED_PASSWORD}' as password, - 'admin' as role; + +WITH inserted_user AS ( + INSERT INTO auth_users + (username, email, name, password) + SELECT + '{UI_USER_USERNAME}' as username, + '{UI_USER_EMAIL}' as email, + '{UI_USER_NAME}' as name, + '{UI_USER_ENCRYPTED_PASSWORD}' as password + RETURNING id +) +INSERT INTO project_memberships + (user_id, project_code, role, created_at) +SELECT id AS user_id, + '{PROJECT_CODE}' AS project_code, + 'admin' AS role, + NOW() AS created_at +FROM inserted_user; diff --git a/testgen/template/dbupgrade/0174_incremental_upgrade.sql b/testgen/template/dbupgrade/0174_incremental_upgrade.sql new file mode 100644 index 00000000..954c7015 --- /dev/null +++ b/testgen/template/dbupgrade/0174_incremental_upgrade.sql @@ -0,0 +1,60 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +-- ============================================================================= +-- Create project_memberships table +-- ============================================================================= + +CREATE TABLE IF NOT EXISTS project_memberships ( + id UUID DEFAULT gen_random_uuid() + CONSTRAINT pk_project_memberships_id + PRIMARY KEY, + user_id UUID NOT NULL + CONSTRAINT fk_project_memberships_auth_users + REFERENCES auth_users(id) + ON DELETE CASCADE, + project_code VARCHAR(30) NOT NULL + CONSTRAINT fk_project_memberships_projects + REFERENCES projects(project_code) + ON DELETE CASCADE, + role VARCHAR(20) 
NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + + CONSTRAINT uq_project_memberships_user_project + UNIQUE (user_id, project_code) +); + +CREATE INDEX IF NOT EXISTS ix_pm_user_id ON project_memberships(user_id); +CREATE INDEX IF NOT EXISTS ix_pm_project_code ON project_memberships(project_code); +CREATE INDEX IF NOT EXISTS ix_pm_role ON project_memberships(role); + +-- ============================================================================= +-- Add is_global_admin column to auth_users +-- ============================================================================= + +ALTER TABLE auth_users ADD COLUMN IF NOT EXISTS is_global_admin BOOLEAN NOT NULL DEFAULT FALSE; + +-- ============================================================================= +-- Set is_global_admin = TRUE for users with role = 'admin' +-- ============================================================================= + +UPDATE auth_users SET is_global_admin = TRUE WHERE role = 'admin'; + +-- ============================================================================= +-- Migrate ALL users to project_memberships +-- Each user gets their current role in every existing project +-- ============================================================================= + +INSERT INTO project_memberships (user_id, project_code, role) +SELECT + u.id AS user_id, + p.project_code AS project_code, + u.role AS role +FROM auth_users u +CROSS JOIN projects p +ON CONFLICT (user_id, project_code) DO NOTHING; + +-- ============================================================================= +-- Drop the role column from auth_users +-- ============================================================================= + +ALTER TABLE auth_users DROP COLUMN IF EXISTS role; diff --git a/testgen/ui/app.py b/testgen/ui/app.py index 9358938c..36d8b9f7 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -23,7 +23,13 @@ def render(log_level: int = logging.INFO): # Collapse when logging out because the sidebar 
takes some time to be removed from the DOM # Collapse for Catalog role since they only have access to one page initial_sidebar_state="collapsed" - if session.auth and (session.auth.logging_out or (session.auth.is_logged_in and not session.auth.user_has_permission("view"))) + if session.auth and ( + session.auth.logging_out + or ( + session.auth.is_logged_in + and not session.auth.user_has_permission("view") + ) + ) else "auto", ) diff --git a/testgen/ui/auth.py b/testgen/ui/auth.py index 4e82938b..73ac1599 100644 --- a/testgen/ui/auth.py +++ b/testgen/ui/auth.py @@ -6,8 +6,10 @@ from testgen.common.auth import decode_jwt_token, get_jwt_signing_key from testgen.common.mixpanel_service import MixpanelService +from testgen.common.models.project_membership import ProjectMembership, RoleType from testgen.common.models.user import User from testgen.ui.services.javascript_service import execute_javascript +from testgen.ui.session import session LOG = logging.getLogger("testgen") @@ -20,6 +22,7 @@ class Authentication: jwt_cookie_expiry_days = 1 user: User | None = None + role: RoleType | None = None # Intermediate state holders because auth cookie changes are not immediate cookies_ready: bool = False @@ -34,11 +37,18 @@ def is_logged_in(self) -> bool: def user_display(self) -> str | None: return (self.user.name or self.user.username) if self.user else None + @property + def current_project(self) -> str | None: + return session.sidebar_project + @property def default_page(self) -> str | None: return "project-dashboard" if self.user else "" - def user_has_permission(self, _permission: Permission) -> bool: + def user_has_permission(self, permission: Permission, /, project_code: str | None = None) -> bool: # noqa: ARG002 + return True + + def user_has_project_access(self, project_code: str) -> bool: # noqa: ARG002 return True def get_jwt_hashing_key(self) -> bytes: @@ -64,7 +74,8 @@ def get_credentials(self): def login_user(self, username: str) -> None: self.user = 
User.get(username) self.user.save(update_latest_login=True) - MixpanelService().send_event("login", include_usage=True, role=self.user.role) + self.load_user_role() + MixpanelService().send_event("login", include_usage=True, role=self.role) def load_user_session(self) -> None: cookies = self._load_cookies() @@ -73,12 +84,19 @@ def load_user_session(self) -> None: try: payload = decode_jwt_token(token) self.user = User.get(payload["username"]) + self.load_user_role() except Exception: LOG.debug("Invalid auth token found on cookies", exc_info=True, stack_info=True) + def load_user_role(self) -> None: + if self.user and self.current_project: + membership = ProjectMembership.get_by_user_and_project(self.user.id, self.current_project) + self.role = membership.role + def end_user_session(self) -> None: self._clear_jwt_cookie() self.user = None + self.role = None def _clear_jwt_cookie(self) -> None: execute_javascript( diff --git a/testgen/ui/components/frontend/js/components/score_issues.js b/testgen/ui/components/frontend/js/components/score_issues.js index 659f8020..0d93b745 100644 --- a/testgen/ui/components/frontend/js/components/score_issues.js +++ b/testgen/ui/components/frontend/js/components/score_issues.js @@ -159,7 +159,7 @@ const IssuesTable = ( category === 'column_name' ? 
span({ class: 'ml-2' }) : ColumnProfilingButton(row.column, row.table, row.table_group_id), - columns.map((columnName) => TableCell(row, columnName)), + columns.map((columnName) => TableCell(row, columnName, score.project_code)), )), () => Paginator({ pageIndex, @@ -253,13 +253,13 @@ const Toolbar = ( * @param {string} column * @returns {} */ -const TableCell = (row, column) => { +const TableCell = (row, column, projectCode) => { const componentByColumn = { column: IssueColumnCell, type: IssueCell, status: StatusCell, detail: DetailCell, - time: TimeCell, + time: (value, row) => TimeCell(value, row, projectCode), }; if (componentByColumn[column]) { @@ -306,7 +306,7 @@ const DetailCell = (value, row) => { ); }; -const TimeCell = (value, row) => { +const TimeCell = (value, row, projectCode) => { return div( { class: 'flex-column', style: `flex: 0 0 ${ISSUES_COLUMNS_SIZES.time}` }, row.issue_type === 'test' @@ -321,6 +321,7 @@ const TimeCell = (value, row) => { table_name: row.table, column_name: row.column, selected: row.id, + project_code: projectCode, }, }), ); diff --git a/testgen/ui/components/frontend/js/components/select.js b/testgen/ui/components/frontend/js/components/select.js index 3e3e658c..b454009b 100644 --- a/testgen/ui/components/frontend/js/components/select.js +++ b/testgen/ui/components/frontend/js/components/select.js @@ -10,6 +10,7 @@ * @property {string?} id * @property {string} label * @property {string?|Array.?} value +* @property {string?} placeholder * @property {Array.} options * @property {boolean} allowNull * @property {Function|null} onChange @@ -168,6 +169,7 @@ const Select = (/** @type {Properties} */ props) => { ? 
input({ id: `tg-select--field--${getRandomId()}`, value: valueLabel.val, + placeholder: props.placeholder, onkeyup: filterOptions, }) : valueLabel.val, diff --git a/testgen/ui/components/frontend/js/data_profiling/data_issues.js b/testgen/ui/components/frontend/js/data_profiling/data_issues.js index 261a2283..ea8a86ef 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_issues.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_issues.js @@ -65,7 +65,7 @@ const PotentialPIICard = (/** @type Properties */ props, /** @type Table | Colum const potentialPII = item.hygiene_issues.filter(({ issue_likelihood }) => issue_likelihood === 'Potential PII'); const linkProps = props.noLinks ? null : { href: 'profiling-runs:hygiene', - params: { run_id: item.profile_run_id, issue_class: 'Potential PII' }, + params: { run_id: item.profile_run_id, issue_class: 'Potential PII', project_code: item.project_code }, }; const noneContent = item.profile_run_id && !item.profiling_error ? 
'No potential PII detected' @@ -101,6 +101,7 @@ const HygieneIssuesCard = (/** @type Properties */ props, /** @type Table | Colu run_id: item.profile_run_id, table_name: item.table_name, column_name: item.column_name, + project_code: item.project_code, }, }; const noneContent = item.profile_run_id && !item.profiling_error @@ -141,6 +142,7 @@ const TestIssuesCard = (/** @type Properties */ props, /** @type Table | Column table_name: item.table_name, column_name: item.column_name, selected: issue.id, + project_code: item.project_code, }, open_new: true, label: formatTimestamp(issue.test_run_date), diff --git a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js index 6c4c9586..829a8a24 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js @@ -227,6 +227,7 @@ const LatestProfilingTime = (/** @type Properties */ props, /** @type Table | Co run_id: item.profile_run_id, table_name: item.table_name, column_name: item.column_name, + project_code: item.project_code, }, open_new: true, label: formatTimestamp(item.profile_run_date), diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index 1e2f4dfb..66808d7d 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -573,6 +573,7 @@ const TestSuitesCard = (/** @type Table | Column */ item) => { test_suite_id: id, table_name: item.table_name, column_name: item.column_name, + project_code: item.project_code, }, open_new: true, label: name, diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js index d166795c..1515fbf4 100644 --- a/testgen/ui/components/frontend/js/pages/profiling_runs.js +++ 
b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -190,7 +190,7 @@ const ProfilingRuns = (/** @type Properties */ props) => { ), ), div( - paginatedRuns.val.map(item => ProfilingRunItem(item, columns, selectedRuns[item.id], userCanEdit)), + paginatedRuns.val.map(item => ProfilingRunItem(item, columns, selectedRuns[item.id], userCanEdit, projectSummary.project_code)), ), ), Paginator({ @@ -280,6 +280,7 @@ const ProfilingRunItem = ( /** @type string[] */ columns, /** @type boolean */ selected, /** @type boolean */ userCanEdit, + /** @type string */ projectCode, ) => { const runningStep = item.progress?.find((item) => item.status === 'Running'); @@ -363,7 +364,7 @@ const ProfilingRunItem = ( item.status === 'Complete' && item.column_ct ? Link({ label: 'View results', href: 'profiling-runs:results', - params: { 'run_id': item.id }, + params: { 'run_id': item.id, 'project_code': projectCode }, underline: true, right_icon: 'chevron_right', }) : null, @@ -381,7 +382,7 @@ const ProfilingRunItem = ( item.anomaly_ct ? Link({ label: `View ${item.anomaly_ct} issues`, href: 'profiling-runs:hygiene', - params: { 'run_id': item.id }, + params: { 'run_id': item.id, 'project_code': projectCode }, underline: true, right_icon: 'chevron_right', style: 'margin-top: 4px;', diff --git a/testgen/ui/components/frontend/js/pages/project_dashboard.js b/testgen/ui/components/frontend/js/pages/project_dashboard.js index 292b0aba..79cb9c02 100644 --- a/testgen/ui/components/frontend/js/pages/project_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/project_dashboard.js @@ -121,8 +121,8 @@ const ProjectDashboard = (/** @type Properties */ props) => { { class: 'flex-column mt-4' }, getValue(filteredTableGroups).map(tableGroup => tableGroup.monitoring_summary - ? TableGroupCardWithMonitor(tableGroup) - : TableGroupCard(tableGroup) + ? 
TableGroupCardWithMonitor(tableGroup, getValue(props.project_summary)?.project_code) + : TableGroupCard(tableGroup, getValue(props.project_summary)?.project_code) ) ) : div( @@ -133,7 +133,7 @@ const ProjectDashboard = (/** @type Properties */ props) => { ); } -const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { +const TableGroupCard = (/** @type TableGroupSummary */ tableGroup, /** @type string */ projectCode) => { const useApprox = tableGroup.record_ct === null || tableGroup.record_ct === undefined; return Card({ @@ -158,12 +158,12 @@ const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { ${formatNumber(useApprox ? tableGroup.approx_data_point_ct : tableGroup.data_point_ct)} data points ${useApprox ? '*' : ''}`, ), - TableGroupTestSuiteSummary(tableGroup.test_suites), + TableGroupTestSuiteSummary(tableGroup.test_suites, projectCode), ), ScoreMetric(tableGroup.dq_score, tableGroup.dq_score_profiling, tableGroup.dq_score_testing), ), hr({ class: 'tg-overview--table-group-divider' }), - TableGroupLatestProfile(tableGroup), + TableGroupLatestProfile(tableGroup, projectCode), useApprox ? 
span({ class: 'text-caption text-right' }, '* Approximate counts based on server statistics') : null, @@ -171,7 +171,7 @@ const TableGroupCard = (/** @type TableGroupSummary */ tableGroup) => { }); }; -const TableGroupCardWithMonitor = (/** @type TableGroupSummary */ tableGroup) => { +const TableGroupCardWithMonitor = (/** @type TableGroupSummary */ tableGroup, /** @type string */ projectCode) => { const useApprox = tableGroup.record_ct === null || tableGroup.record_ct === undefined; return Card({ testId: 'table-group-summary-card', @@ -205,9 +205,9 @@ const TableGroupCardWithMonitor = (/** @type TableGroupSummary */ tableGroup) => ), hr({ class: 'tg-overview--table-group-divider' }), - TableGroupTestSuiteSummary(tableGroup.test_suites), + TableGroupTestSuiteSummary(tableGroup.test_suites, projectCode), hr({ class: 'tg-overview--table-group-divider' }), - TableGroupLatestProfile(tableGroup), + TableGroupLatestProfile(tableGroup, projectCode), useApprox ? span({ class: 'text-caption text-right' }, '* Approximate counts based on server statistics') : null, @@ -215,7 +215,7 @@ const TableGroupCardWithMonitor = (/** @type TableGroupSummary */ tableGroup) => }); }; -const TableGroupLatestProfile = (/** @type TableGroupSummary */ tableGroup) => { +const TableGroupLatestProfile = (/** @type TableGroupSummary */ tableGroup, /** @type string */ projectCode) => { if (!tableGroup.latest_profile_start) { return div( { class: 'mt-1 mb-1 text-secondary' }, @@ -233,7 +233,7 @@ const TableGroupLatestProfile = (/** @type TableGroupSummary */ tableGroup) => { Link({ label: formatTimestamp(tableGroup.latest_profile_start), href: 'profiling-runs:results', - params: { run_id: tableGroup.latest_profile_id }, + params: { run_id: tableGroup.latest_profile_id, project_code: projectCode }, }), daysAgo > staleProfileDays ? 
span({ class: 'text-error' }, `(${daysAgo} days ago)`) @@ -246,6 +246,7 @@ const TableGroupLatestProfile = (/** @type TableGroupSummary */ tableGroup) => { href: 'profiling-runs:hygiene', params: { run_id: tableGroup.latest_profile_id, + project_code: projectCode, }, width: 150, style: 'flex: 0 0 auto;', @@ -264,7 +265,7 @@ const TableGroupLatestProfile = (/** @type TableGroupSummary */ tableGroup) => { ); }; -const TableGroupTestSuiteSummary = (/** @type TestSuiteSummary[] */testSuites) => { +const TableGroupTestSuiteSummary = (/** @type TestSuiteSummary[] */testSuites, /** @type string */ projectCode) => { if (!testSuites?.length) { return div( { class: 'mt-1 mb-1 text-secondary' }, @@ -287,7 +288,7 @@ const TableGroupTestSuiteSummary = (/** @type TestSuiteSummary[] */testSuites) = Link({ label: suite.test_suite, href: 'test-suites:definitions', - params: { test_suite_id: suite.id }, + params: { test_suite_id: suite.id, project_code: projectCode }, }), span({ class: 'text-caption' }, `${suite.test_ct ?? 0} tests`), ), @@ -295,7 +296,7 @@ const TableGroupTestSuiteSummary = (/** @type TestSuiteSummary[] */testSuites) = ? 
Link({ label: formatTimestamp(suite.latest_run_start), href: 'test-runs:results', - params: { run_id: suite.latest_run_id }, + params: { run_id: suite.latest_run_id, project_code: projectCode }, style: 'flex: 1 1 25%;', }) : span({ style: 'flex: 1 1 25%;' }, '--'), diff --git a/testgen/ui/components/frontend/js/pages/quality_dashboard.js b/testgen/ui/components/frontend/js/pages/quality_dashboard.js index 3378f21e..1637942a 100644 --- a/testgen/ui/components/frontend/js/pages/quality_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/quality_dashboard.js @@ -81,7 +81,7 @@ const QualityDashboard = (/** @type {Properties} */ props) => { right_icon: 'chevron_right', href: 'quality-dashboard:score-details', class: 'ml-4', - params: { definition_id: score.id }, + params: { definition_id: score.id, project_code: getValue(props.project_summary)?.project_code }, }), {showHistory: true}, )) diff --git a/testgen/ui/components/frontend/js/pages/score_details.js b/testgen/ui/components/frontend/js/pages/score_details.js index d80e7290..1bffa5c0 100644 --- a/testgen/ui/components/frontend/js/pages/score_details.js +++ b/testgen/ui/components/frontend/js/pages/score_details.js @@ -62,7 +62,7 @@ const ScoreDetails = (/** @type {Properties} */ props) => { return userCanEdit ? 
div( { class: 'flex-row tg-test-suites--card-actions' }, Button({ type: 'icon', icon: 'notifications', tooltip: 'Configure Notifications', onclick: () => emitEvent('EditNotifications', {}) }), - Button({ type: 'icon', icon: 'edit', tooltip: 'Edit Scorecard', onclick: () => emitEvent('LinkClicked', { href: 'quality-dashboard:explorer', params: { definition_id: score.id } }) }), + Button({ type: 'icon', icon: 'edit', tooltip: 'Edit Scorecard', onclick: () => emitEvent('LinkClicked', { href: 'quality-dashboard:explorer', params: { definition_id: score.id, project_code: score.project_code } }) }), Button({ type: 'icon', icon: 'delete', tooltip: 'Delete Scorecard', onclick: () => emitEvent('DeleteScoreRequested', { payload: score.id }) }), ) : ''; }, @@ -86,7 +86,7 @@ const ScoreDetails = (/** @type {Properties} */ props) => { getValue(props.score_type), getValue(props.category), getValue(props.drilldown), - (project_code, name, score_type, category) => emitEvent('LinkClicked', { href: 'quality-dashboard:score-details', params: { definition_id: scoreId, score_type, category } }), + (project_code, name, score_type, category) => emitEvent('LinkClicked', { href: 'quality-dashboard:score-details', params: { definition_id: scoreId, project_code, score_type, category } }), ) : ScoreBreakdown( props.score, @@ -95,7 +95,7 @@ const ScoreDetails = (/** @type {Properties} */ props) => { props.score_type, (project_code, name, score_type, category, drilldown) => emitEvent( 'LinkClicked', - { href: 'quality-dashboard:score-details', params: { definition_id: scoreId, score_type, category, drilldown } + { href: 'quality-dashboard:score-details', params: { definition_id: scoreId, project_code, score_type, category, drilldown } }), ) ); diff --git a/testgen/ui/components/frontend/js/pages/score_explorer.js b/testgen/ui/components/frontend/js/pages/score_explorer.js index 7bd64e02..55efd129 100644 --- a/testgen/ui/components/frontend/js/pages/score_explorer.js +++ 
b/testgen/ui/components/frontend/js/pages/score_explorer.js @@ -435,7 +435,7 @@ const Toolbar = ( let params = {project_code: definition.project_code}; if (!isNew_) { href = 'quality-dashboard:score-details'; - params = {definition_id: definition.id}; + params = {definition_id: definition.id, project_code: definition.project_code}; } return Button({ diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index f05fa5ad..73922155 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -185,7 +185,7 @@ const TestRuns = (/** @type Properties */ props) => { ), ), div( - paginatedRuns.val.map(item => TestRunItem(item, columns, selectedRuns[item.test_run_id], userCanEdit)), + paginatedRuns.val.map(item => TestRunItem(item, columns, selectedRuns[item.test_run_id], userCanEdit, projectSummary.project_code)), ), ), Paginator({ @@ -287,6 +287,7 @@ const TestRunItem = ( /** @type string[] */ columns, /** @type boolean */ selected, /** @type boolean */ userCanEdit, + /** @type string */ projectCode, ) => { const runningStep = item.progress?.find((item) => item.status === 'Running'); @@ -307,7 +308,7 @@ const TestRunItem = ( Link({ label: formatTimestamp(item.test_starttime), href: 'test-runs:results', - params: { 'run_id': item.test_run_id }, + params: { 'run_id': item.test_run_id, 'project_code': projectCode }, underline: true, }), div( diff --git a/testgen/ui/components/frontend/js/pages/test_suites.js b/testgen/ui/components/frontend/js/pages/test_suites.js index a08d4770..9fdd40f2 100644 --- a/testgen/ui/components/frontend/js/pages/test_suites.js +++ b/testgen/ui/components/frontend/js/pages/test_suites.js @@ -140,7 +140,7 @@ const TestSuites = (/** @type Properties */ props) => { { class: 'flex-column' }, Link({ href: 'test-suites:definitions', - params: { test_suite_id: testSuite.id }, + params: { test_suite_id: testSuite.id, 
project_code: projectSummary.project_code }, label: `View ${testSuite.test_ct ?? 0} test definitions`, right_icon: 'chevron_right', right_icon_size: 20, @@ -156,7 +156,7 @@ const TestSuites = (/** @type Properties */ props) => { ? [ Link({ href: 'test-runs:results', - params: { run_id: testSuite.latest_run_id }, + params: { run_id: testSuite.latest_run_id, project_code: projectSummary.project_code }, label: formatTimestamp(testSuite.latest_run_start), class: 'mb-4', }), diff --git a/testgen/ui/components/frontend/js/streamlit.js b/testgen/ui/components/frontend/js/streamlit.js index a30ace8c..5b90454c 100644 --- a/testgen/ui/components/frontend/js/streamlit.js +++ b/testgen/ui/components/frontend/js/streamlit.js @@ -7,14 +7,16 @@ const Streamlit = { enableV2(handler) { this._v2 = true; this._customSendDataHandler = handler; + window.testgen = window.testgen || {}; + window.testgen.isPage = true; }, setFrameHeight(height) { - if (!this._v2) { + if (!this || !this._v2) { sendMessageToStreamlit('streamlit:setFrameHeight', { height: height }); } }, sendData(data) { - if (this._v2) { + if (this && this._v2) { const event = data.event; const triggerData = Object.fromEntries(Object.entries(data).filter(([k, v]) => k !== 'event')); this._customSendDataHandler(event, triggerData); diff --git a/testgen/ui/components/frontend/standalone/.gitkeep b/testgen/ui/components/frontend/standalone/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/testgen/ui/components/frontend/standalone/project_settings/index.js b/testgen/ui/components/frontend/standalone/project_settings/index.js new file mode 100644 index 00000000..fb7d0533 --- /dev/null +++ b/testgen/ui/components/frontend/standalone/project_settings/index.js @@ -0,0 +1,164 @@ +/** + * @import {VanState} from '/app/static/js/van.min.js'; + */ +import van from '/app/static/js/van.min.js'; +import { Streamlit } from '/app/static/js/streamlit.js'; +import { Card } from '/app/static/js/components/card.js'; +import { 
Input } from '/app/static/js/components/input.js'; +import { Button } from '/app/static/js/components/button.js'; +import { required } from '/app/static/js/form_validators.js'; +import { Alert } from '/app/static/js/components/alert.js'; +import { emitEvent, getValue, isEqual } from '/app/static/js/utils.js'; + +const { div, span } = van.tags; + +/** + * @typedef ObsTestResults + * @type {object} + * @property {boolean} successful + * @property {string} message + * @property {string?} details + * + * @typedef Properties + * @type {object} + * @property {VanState} name + * @property {VanState} observability_api_url + * @property {VanState} observability_api_key + * @property {VanState} observability_test_results + * + * @param {Properties} props + */ +const ProjectSettings = (props) => { + const /** @type Properties */ form = { + name: van.state(props.name.rawVal ?? ''), + observability_api_key: van.state(props.observability_api_key.rawVal ?? ''), + observability_api_url: van.state(props.observability_api_url.rawVal ?? 
''), + }; + const formValidity = { + name: van.state(!!form.name.rawVal), + observability_api_key: van.state(true), + observability_api_url: van.state(true), + }; + const saveDisabled = van.derive(() => !formValidity.name.val || !formValidity.observability_api_url.val || !formValidity.observability_api_key.val); + const testObservabilityDisabled = van.derive(() => form.observability_api_url.val.length <= 0 || form.observability_api_key.val.length <= 0); + + return div( + { class: 'flex-column fx-gap-3' }, + div( + { class: 'flex-column fx-gap-1' }, + span({ class: 'body m' }, 'Project Info'), + Card({ + class: 'mb-0', + border: true, + content: div( + { class: 'flex-column fx-gap-3'}, + Input({ + label: 'Project Name', + value: form.name, + validators: [ required ], + onChange: (value, validity) => { + form.name.val = value; + formValidity.name.val = validity.valid; + }, + }), + ), + }), + ), + div( + { class: 'flex-column fx-gap-1' }, + span({ class: 'body m' }, 'Observability Integration'), + Card({ + class: 'mb-0', + border: true, + content: div( + { class: 'flex-column fx-gap-3'}, + Input({ + label: 'API URL', + value: form.observability_api_url, + onChange: (value, validity) => { + form.observability_api_url.val = value; + formValidity.observability_api_url.val = validity.valid; + }, + }), + Input({ + label: 'API Key', + value: form.observability_api_key, + onChange: (value, validity) => { + form.observability_api_key.val = value; + formValidity.observability_api_key.val = validity.valid; + }, + }), + div( + { class: 'flex-row' }, + Button({ + type: 'stroked', + color: 'basic', + label: 'Test Observability Connection', + width: 'auto', + disabled: testObservabilityDisabled, + onclick: () => emitEvent('TestObservabilityClicked', { + payload: { + observability_api_url: form.observability_api_url.rawVal, + observability_api_key: form.observability_api_key.rawVal, + }, + }), + }), + ), + () => { + const results = getValue(props.observability_test_results) ?? 
{}; + return Object.keys(results).length > 0 + ? Alert( + { type: results.successful ? 'success' : 'error' }, + div( + { class: 'flex-column' }, + span(results.message), + results.details ? span(results.details) : '', + ), + ) + : ''; + }, + ), + }), + ), + div( + { class: 'flex-row fx-justify-content-flex-end' }, + Button({ + type: 'stroked', + color: 'primary', + label: 'Save', + width: 'auto', + disabled: saveDisabled, + onclick: () => emitEvent('SaveClicked', { + payload: Object.fromEntries(Object.entries(form).map(([fieldName, value]) => [fieldName, value.rawVal])) + }), + }), + ), + ); +}; + +export default (component) => { + const { data, setStateValue, setTriggerValue, parentElement } = component; + + Streamlit.enableV2(setTriggerValue); + + let componentState = parentElement.state; + if (componentState === undefined) { + componentState = {}; + for (const [ key, value ] of Object.entries(data)) { + componentState[key] = van.state(value); + } + + parentElement.state = componentState; + van.add(parentElement, ProjectSettings(componentState)); + } else { + for (const [ key, value ] of Object.entries(data)) { + if (!isEqual(componentState[key].val, value)) { + componentState[key].val = value; + } + } + } + + return () => { + parentElement.state = null; + }; +}; diff --git a/testgen/ui/components/widgets/__init__.py b/testgen/ui/components/widgets/__init__.py index 63ff76d7..dbe7a776 100644 --- a/testgen/ui/components/widgets/__init__.py +++ b/testgen/ui/components/widgets/__init__.py @@ -53,3 +53,9 @@ js="pages/edit_table_monitors.js", isolate_styles=False, )) + +project_settings = component_v2_wrapped(components_v2.component( + name="dataops-testgen.project_settings", + js="index.js", + isolate_styles=False, +)) diff --git a/testgen/ui/components/widgets/sidebar.py b/testgen/ui/components/widgets/sidebar.py index ec7c0ab2..0a9d52c8 100644 --- a/testgen/ui/components/widgets/sidebar.py +++ b/testgen/ui/components/widgets/sidebar.py @@ -42,7 +42,7 @@ def 
sidebar( "menu": menu.filter_for_current_user().sort_items().unflatten().asdict(), "current_page": current_page, "username": session.auth.user_display, - "role": session.auth.user.role if session.auth.user else None, + "role": session.auth.role or "-", "logout_path": LOGOUT_PATH, "version": version.__dict__, "support_email": support_email, diff --git a/testgen/ui/pdf/hygiene_issue_report.py b/testgen/ui/pdf/hygiene_issue_report.py index df858ec1..1e67ae06 100644 --- a/testgen/ui/pdf/hygiene_issue_report.py +++ b/testgen/ui/pdf/hygiene_issue_report.py @@ -139,7 +139,7 @@ def build_summary_table(document, hi_data): ), ( Paragraph( - f""" + f""" View on TestGen > """, style=PARA_STYLE_LINK, diff --git a/testgen/ui/pdf/test_result_report.py b/testgen/ui/pdf/test_result_report.py index 50f79b55..eab9cee8 100644 --- a/testgen/ui/pdf/test_result_report.py +++ b/testgen/ui/pdf/test_result_report.py @@ -152,7 +152,7 @@ def build_summary_table(document, tr_data): ), ( Paragraph( - f""" + f""" View on TestGen > """, style=PARA_STYLE_LINK, diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index 14f34b13..5f8362e6 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -80,7 +80,7 @@ def get_profiling_results(profiling_run_id: str, table_name: str | None = None, query = f""" SELECT - id::VARCHAR, + profile_results.id::VARCHAR, 'column' AS type, schema_name, table_name, @@ -107,8 +107,11 @@ def get_profiling_results(profiling_run_id: str, table_name: str | None = None, AND table_name = profile_results.table_name AND column_name = profile_results.column_name ) THEN 'Yes' END AS hygiene_issues, - CASE WHEN query_error IS NOT NULL THEN 'Error: ' || query_error ELSE NULL END AS result_details + CASE WHEN query_error IS NOT NULL THEN 'Error: ' || query_error ELSE NULL END AS result_details, + tg.project_code, + tg.connection_id::VARCHAR AS connection_id FROM profile_results + LEFT JOIN 
table_groups tg ON (profile_results.table_groups_id = tg.id) WHERE profile_run_id = :profiling_run_id AND table_name ILIKE :table_name AND column_name ILIKE :column_name @@ -242,16 +245,16 @@ def get_tables_by_condition( -- Profile Run table_chars.last_complete_profile_run_id::VARCHAR AS profile_run_id, profiling_starttime AS profile_run_date, - TRUE AS is_latest_profile + TRUE AS is_latest_profile, + table_groups.project_code, + table_groups.connection_id::VARCHAR AS connection_id FROM data_table_chars table_chars LEFT JOIN profiling_runs ON ( table_chars.last_complete_profile_run_id = profiling_runs.id ) - {""" LEFT JOIN table_groups ON ( table_chars.table_groups_id = table_groups.id ) - """ if include_tags else ""} {""" LEFT JOIN active_test_definitions active_tests ON ( table_chars.table_groups_id = active_tests.table_groups_id @@ -404,16 +407,16 @@ def get_columns_by_condition( column_chars.dq_score_testing, """ if include_scores else ""} table_chars.approx_record_ct, + table_groups.project_code, + table_groups.connection_id::VARCHAR AS connection_id, {COLUMN_PROFILING_FIELDS} FROM data_column_chars column_chars LEFT JOIN data_table_chars table_chars ON ( column_chars.table_id = table_chars.table_id ) - {""" LEFT JOIN table_groups ON ( column_chars.table_groups_id = table_groups.id ) - """ if include_tags else ""} LEFT JOIN profile_results ON ( column_chars.last_complete_profile_run_id = profile_results.profile_run_id AND column_chars.schema_name = profile_results.schema_name @@ -516,7 +519,7 @@ def get_profiling_anomalies( END AS likelihood_order, t.anomaly_description, r.detail, t.suggested_action, r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime, r.profile_run_id::VARCHAR, - tg.table_groups_name, + tg.table_groups_name, tg.project_code, -- These are used in the PDF report dcc.functional_data_type, diff --git a/testgen/ui/static/css/shared.css b/testgen/ui/static/css/shared.css index 8390aafe..28ed9baa 100644 --- 
a/testgen/ui/static/css/shared.css +++ b/testgen/ui/static/css/shared.css @@ -748,3 +748,39 @@ input::-ms-clear { .warning-text { color: var(--orange); } + +/* Base Styles - Using standard system fonts for that Material feel */ +.display, .headline, .title, .body, .label { + margin: 0; + padding: 0; +} + +/* --- Display: For prominent, large-scale text --- */ +.display { font-weight: 400; letter-spacing: -0.25px; } +.display.l { font-size: 57px; line-height: 64px; } +.display.m { font-size: 45px; line-height: 52px; } +.display.s { font-size: 36px; line-height: 44px; } + +/* --- Headline: High-emphasis, shorter text --- */ +.headline { font-weight: 400; } +.headline.l { font-size: 32px; line-height: 40px; } +.headline.m { font-size: 28px; line-height: 36px; } +.headline.s { font-size: 24px; line-height: 32px; } + +/* --- Title: Medium-emphasis, usually for UI headers --- */ +.title { font-weight: 400; } /* Title Large is 400, M/S are 500 */ +.title.l { font-size: 22px; line-height: 28px; } +.title.m { font-size: 16px; line-height: 24px; font-weight: 500; letter-spacing: 0.15px; } +.title.s { font-size: 14px; line-height: 20px; font-weight: 500; letter-spacing: 0.1px; } + +/* --- Body: For extended reading and long-form content --- */ +.body { font-weight: 400; } +.body.l { font-size: 16px; line-height: 24px; letter-spacing: 0.5px; } +.body.m { font-size: 14px; line-height: 20px; letter-spacing: 0.25px; } +.body.s { font-size: 12px; line-height: 16px; letter-spacing: 0.4px; } + +/* --- Label: For small functional text (buttons, captions) --- */ +.label { font-weight: 500; } +.label.l { font-size: 14px; line-height: 20px; letter-spacing: 0.1px; } +.label.m { font-size: 12px; line-height: 16px; letter-spacing: 0.5px; } +.label.s { font-size: 11px; line-height: 16px; letter-spacing: 0.5px; } diff --git a/testgen/ui/static/js/components/card.js b/testgen/ui/static/js/components/card.js index b883b9b7..988d77db 100644 --- a/testgen/ui/static/js/components/card.js +++ 
b/testgen/ui/static/js/components/card.js @@ -9,16 +9,33 @@ * @property {string?} class * @property {string?} testId */ -import { loadStylesheet } from '../utils.js'; +import { loadStylesheet, getValue } from '../utils.js'; import van from '../van.min.js'; const { div, h3 } = van.tags; const Card = (/** @type Properties */ props) => { loadStylesheet('card', stylesheet); - return div( - { class: `tg-card mb-4 ${props.border ? 'tg-card-border' : ''} ${props.class}`, id: props.id ?? '', 'data-testid': props.testId ?? '' }, + { + id: props.id ?? '', + 'data-testid': props.testId ?? '', + class: () => { + const classes = ['tg-card']; + if (getValue(props.border)) { + classes.push('tg-card-border'); + } + + if (!!props.class) { + classes.push(...props.class); + if (!props.class.includes('mb-') && !props.class.includes('m-')) { + classes.push('mb-4'); + } + } + + return classes.join(' '); + }, + }, () => props.title || props.actionContent ? div( diff --git a/testgen/ui/static/js/components/score_issues.js b/testgen/ui/static/js/components/score_issues.js index 659f8020..0d93b745 100644 --- a/testgen/ui/static/js/components/score_issues.js +++ b/testgen/ui/static/js/components/score_issues.js @@ -159,7 +159,7 @@ const IssuesTable = ( category === 'column_name' ? 
span({ class: 'ml-2' }) : ColumnProfilingButton(row.column, row.table, row.table_group_id), - columns.map((columnName) => TableCell(row, columnName)), + columns.map((columnName) => TableCell(row, columnName, score.project_code)), )), () => Paginator({ pageIndex, @@ -253,13 +253,13 @@ const Toolbar = ( * @param {string} column * @returns {} */ -const TableCell = (row, column) => { +const TableCell = (row, column, projectCode) => { const componentByColumn = { column: IssueColumnCell, type: IssueCell, status: StatusCell, detail: DetailCell, - time: TimeCell, + time: (value, row) => TimeCell(value, row, projectCode), }; if (componentByColumn[column]) { @@ -306,7 +306,7 @@ const DetailCell = (value, row) => { ); }; -const TimeCell = (value, row) => { +const TimeCell = (value, row, projectCode) => { return div( { class: 'flex-column', style: `flex: 0 0 ${ISSUES_COLUMNS_SIZES.time}` }, row.issue_type === 'test' @@ -321,6 +321,7 @@ const TimeCell = (value, row) => { table_name: row.table, column_name: row.column, selected: row.id, + project_code: projectCode, }, }), ); diff --git a/testgen/ui/static/js/form_validators.js b/testgen/ui/static/js/form_validators.js index 635b8b6a..58c085bb 100644 --- a/testgen/ui/static/js/form_validators.js +++ b/testgen/ui/static/js/form_validators.js @@ -120,6 +120,26 @@ function sizeLimit(limit) { return validator; } +/** + * @typedef NotInOptions + * @type {object} + * @property {string?} errorMessage + * @property {((v: any) => any)?} formatter + * @property {string} a + * + * @param {any[]} values + * @param {NotInOptions?} options + * @returns {Validator} + */ +function notIn(values, options) { + return (value) => { + if (value && values.includes(!!options?.formatter ? options.formatter(value) : value)) { + return options?.errorMessage ?? 
`Value cannot be any of: ${values.join(', ')}.`; + } + return null; + }; +} + export { maxLength, minLength, @@ -128,4 +148,5 @@ export { required, requiredIf, sizeLimit, + notIn, }; diff --git a/testgen/ui/static/js/streamlit.js b/testgen/ui/static/js/streamlit.js index a30ace8c..5b90454c 100644 --- a/testgen/ui/static/js/streamlit.js +++ b/testgen/ui/static/js/streamlit.js @@ -7,14 +7,16 @@ const Streamlit = { enableV2(handler) { this._v2 = true; this._customSendDataHandler = handler; + window.testgen = window.testgen || {}; + window.testgen.isPage = true; }, setFrameHeight(height) { - if (!this._v2) { + if (!this || !this._v2) { sendMessageToStreamlit('streamlit:setFrameHeight', { height: height }); } }, sendData(data) { - if (this._v2) { + if (this && this._v2) { const event = data.event; const triggerData = Object.fromEntries(Object.entries(data).filter(([k, v]) => k !== 'event')); this._customSendDataHandler(event, triggerData); diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 15e04614..4f4b52fe 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -59,6 +59,13 @@ def render( ) return + if not session.auth.user_has_project_access(run.project_code): + self.router.navigate_with_warning( + "You don't have access to view this resource. Redirecting ...", + "profiling-runs", + ) + return + run_date = date_service.get_timezoned_timestamp(st.session_state, run.profiling_starttime) session.set_sidebar_project(run.project_code) diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index 5a31fa3f..a71dbeb7 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -44,6 +44,13 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | ) return + if not session.auth.user_has_project_access(run.project_code): + self.router.navigate_with_warning( + "You don't have access to view this resource. 
Redirecting ...", + "profiling-runs", + ) + return + run_date = date_service.get_timezoned_timestamp(st.session_state, run.profiling_starttime) session.set_sidebar_project(run.project_code) diff --git a/testgen/ui/views/project_settings.py b/testgen/ui/views/project_settings.py index 08f1af13..aefc66a0 100644 --- a/testgen/ui/views/project_settings.py +++ b/testgen/ui/views/project_settings.py @@ -1,9 +1,8 @@ -import time +import random import typing -from functools import partial +from dataclasses import asdict, dataclass, field import streamlit as st -from streamlit.delta_generator import DeltaGenerator from testgen.commands.run_observability_exporter import test_observability_exporter from testgen.common.models import with_database_session @@ -11,7 +10,7 @@ from testgen.ui.components import widgets as testgen from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page -from testgen.ui.session import session +from testgen.ui.session import session, temp_value PAGE_TITLE = "Project Settings" @@ -41,103 +40,59 @@ def render(self, project_code: str | None = None, **_kwargs) -> None: "manage-projects", ) - testgen.whitespace(1) - self.show_edit_form() - - def show_edit_form(self) -> None: - form_container = st.container() - status_container = st.container() - - with form_container: - with testgen.card(): - name_input = st.text_input( - label="Project Name", - value=self.project.project_name, - max_chars=30, - key="project_settings:keys:project_name", - ) - st.text_input( - label="Observability API URL", - value=self.project.observability_api_url, - key="project_settings:keys:observability_api_url", - ) - st.text_input( - label="Observability API Key", - value=self.project.observability_api_key, - key="project_settings:keys:observability_api_key", - ) - - testgen.whitespace(1) - test_button_column, warning_column, save_button_column = st.columns([.4, .3, .3]) - testgen.flex_row_start(test_button_column) - 
testgen.flex_row_end(save_button_column) - - with test_button_column: - testgen.button( - type_="stroked", - color="basic", - label="Test Observability Connection", - width=250, - on_click=partial(self._display_connection_status, status_container), - key="project-settings:keys:test-connection", - ) - - with warning_column: - if not name_input: - testgen.text("Project name is required", "color: var(--red)") - elif self.existing_names and name_input in self.existing_names: - testgen.text("Project name in use", "color: var(--red)") - - with save_button_column: - testgen.button( - type_="flat", - label="Save", - width=100, - on_click=self.edit_project, - key="project-settings:keys:edit", - ) + get_test_results, set_test_results = temp_value(f"project_settings:{project_code}", default=None) + + def on_observability_connection_test(payload: dict) -> None: + results = self.test_observability_connection(project_code, payload) + set_test_results(asdict(results)) + + return testgen.project_settings( + key="project_settings", + data={ + "name": self.project.project_name, + "observability_api_url": self.project.observability_api_url, + "observability_api_key": self.project.observability_api_key, + "observability_test_results": get_test_results(), + }, + on_TestObservabilityClicked_change=on_observability_connection_test, + on_SaveClicked_change=lambda payload: self.update_project(project_code, payload), + ) @with_database_session - def edit_project(self) -> None: - edited_project = self._get_edited_project() - if edited_project["project_name"] and (not self.existing_names or edited_project["project_name"] not in self.existing_names): - self.project.project_name = edited_project["project_name"] - self.project.observability_api_url = edited_project["observability_api_url"] - self.project.observability_api_key = edited_project["observability_api_key"] - self.project.save() - st.toast("Changes have been saved.") - - def _get_edited_project(self) -> None: - edited_project = { - 
"id": self.project.id, - "project_code": self.project.project_code, - } - # We have to get the input widget values from the session state - # The return values for st.text_input do not reflect the latest user input if the button is clicked without unfocusing the input - # https://discuss.streamlit.io/t/issue-with-modifying-text-using-st-text-input-and-st-button/56619/5 - for key in [ "project_name", "observability_api_url", "observability_api_key" ]: - value = st.session_state.get(f"project_settings:keys:{key}") - edited_project[key] = value.strip() if value else None - return edited_project - - def _display_connection_status(self, status_container: DeltaGenerator) -> None: - single_element_container = status_container.empty() - single_element_container.info("Connecting ...") - + def update_project(self, project_code: str, edited_project: dict) -> None: + existing_names = [ + p.project_name.lower() for p in Project.select_where(Project.project_code != project_code) + ] + new_project_name = edited_project["name"] + if new_project_name.lower() in existing_names: + raise ValueError(f"Another project named {new_project_name} exists") + + self.project.project_name = new_project_name + self.project.observability_api_url = edited_project.get("observability_api_url") + self.project.observability_api_key = edited_project.get("observability_api_key") + self.project.save() + Project.clear_cache() + + def test_observability_connection(self, project_code: str, edited_project: dict) -> "ObservabilityConnectionStatus": try: - project = self._get_edited_project() test_observability_exporter( - project["project_code"], - project["observability_api_url"], - project["observability_api_key"], + project_code, + edited_project.get("observability_api_url"), + edited_project.get("observability_api_key"), ) - single_element_container.success("The connection was successful.") + return ObservabilityConnectionStatus(successful=True, message="The connection was successful.") except Exception 
as e: - with single_element_container.container(): - st.error("Error attempting the connection.") - error_message = e.args[0] - st.caption("Connection Error Details") - with st.container(border=True): - st.markdown(error_message) - - time.sleep(0.1) + error_message = e.args[0] + return ObservabilityConnectionStatus( + successful=False, + message="Error attempting the connection", + details=error_message, + ) + + +@dataclass(frozen=True, slots=True) +class ObservabilityConnectionStatus: + message: str + successful: bool + details: str | None = field(default=None) + _: float = field(default_factory=random.random) diff --git a/testgen/ui/views/score_details.py b/testgen/ui/views/score_details.py index fad8403f..c2458036 100644 --- a/testgen/ui/views/score_details.py +++ b/testgen/ui/views/score_details.py @@ -63,6 +63,13 @@ def render( ) return + if not session.auth.user_has_project_access(score_definition.project_code): + self.router.navigate_with_warning( + "You don't have access to view this resource. Redirecting ...", + "quality-dashboard", + ) + return + session.set_sidebar_project(score_definition.project_code) testgen.page_header( diff --git a/testgen/ui/views/score_explorer.py b/testgen/ui/views/score_explorer.py index 48c3385a..841b156a 100644 --- a/testgen/ui/views/score_explorer.py +++ b/testgen/ui/views/score_explorer.py @@ -74,6 +74,14 @@ def render( return project_code = original_score_definition.project_code + + if not session.auth.user_has_project_access(project_code): + self.router.navigate_with_warning( + "You don't have access to view this resource. 
Redirecting ...", + "quality-dashboard", + ) + return + page_title = "Edit Scorecard" last_breadcrumb = original_score_definition.name diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index c494deed..64b23b34 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -62,6 +62,14 @@ def render( table_group = TableGroup.get_minimal(test_suite.table_groups_id) project_code = table_group.project_code + + if not session.auth.user_has_project_access(project_code): + self.router.navigate_with_warning( + "You don't have access to view this resource. Redirecting ...", + "test-suites", + ) + return + session.set_sidebar_project(project_code) user_can_edit = session.auth.user_has_permission("edit") user_can_disposition = session.auth.user_has_permission("disposition") @@ -600,7 +608,7 @@ def show_test_form( with container: testgen.link( href="profiling-runs:results", - params={"run_id": str(profile_run_id)}, + params={"run_id": str(profile_run_id), "project_code": table_group.project_code}, label=formatted_time, open_new=True, ) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index f6065121..b51bf738 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -73,6 +73,13 @@ def render( ) return + if not session.auth.user_has_project_access(run.project_code): + self.router.navigate_with_warning( + "You don't have access to view this resource. 
Redirecting ...", + "test-runs", + ) + return + run_date = date_service.get_timezoned_timestamp(st.session_state, run.test_starttime) session.set_sidebar_project(run.project_code) diff --git a/tests/unit/common/notifications/test_profiling_run_notifications.py b/tests/unit/common/notifications/test_profiling_run_notifications.py index c9e7ca38..5320b997 100644 --- a/tests/unit/common/notifications/test_profiling_run_notifications.py +++ b/tests/unit/common/notifications/test_profiling_run_notifications.py @@ -100,7 +100,7 @@ def test_send_profiling_run_notification( hi_count_mock, send_mock, ): - profiling_run = ProfilingRun(id="pr-id", table_groups_id="tg-id", status=profiling_run_status) + profiling_run = ProfilingRun(id="pr-id", table_groups_id="tg-id", status=profiling_run_status, project_code="proj") get_prev_mock.return_value = ProfilingRun(id="pr-prev-id") if has_prev_run else None new_count = iter(count()) priorities = ("Definite", "Likely", "Possible", "High", "Moderate") @@ -133,8 +133,8 @@ def test_send_profiling_run_notification( { "profiling_run": { "id": "pr-id", - "issues_url": "http://tg-base-url/profiling-runs:hygiene?run_id=pr-id&source=email", - "results_url": "http://tg-base-url/profiling-runs:results?run_id=pr-id&source=email", + "issues_url": "http://tg-base-url/profiling-runs:hygiene?project_code=proj&run_id=pr-id&source=email", + "results_url": "http://tg-base-url/profiling-runs:results?project_code=proj&run_id=pr-id&source=email", "start_time": None, "end_time": None, "status": profiling_run_status, diff --git a/tests/unit/common/notifications/test_score_drop_notifications.py b/tests/unit/common/notifications/test_score_drop_notifications.py index 26267578..76617976 100644 --- a/tests/unit/common/notifications/test_score_drop_notifications.py +++ b/tests/unit/common/notifications/test_score_drop_notifications.py @@ -171,7 +171,7 @@ def test_send_score_drop_notifications( { "project_name": "Test Project", "definition": score_definition, - 
"scorecard_url": "http://tg-base-url/quality-dashboard:score-details?definition_id=sd-1&source=email", + "scorecard_url": "http://tg-base-url/quality-dashboard:score-details?project_code=test-proj&definition_id=sd-1&source=email", "diff": [ {**expected_total_diff, "notify": total_triggers}, {**expected_cde_diff, "notify": cde_triggers}, diff --git a/tests/unit/common/notifications/test_test_run_notifications.py b/tests/unit/common/notifications/test_test_run_notifications.py index bde2d6ff..06cd75f9 100644 --- a/tests/unit/common/notifications/test_test_run_notifications.py +++ b/tests/unit/common/notifications/test_test_run_notifications.py @@ -151,7 +151,7 @@ def test_send_test_run_notification( else: diff_mock.return_value = create_diff(**diff_mock_args) get_prev_mock.return_value = TestRun(id="tr-prev-id") - summary = object() + summary = Mock(project_code="test_project") select_summary_mock.return_value = [summary] send_test_run_notifications(test_run) @@ -174,7 +174,7 @@ def test_send_test_run_notification( expected_context = { "test_run": summary, - "test_run_url": "http://tg-base-url/test-runs:results?run_id=tr-id&source=email", + "test_run_url": "http://tg-base-url/test-runs:results?project_code=test_project&run_id=tr-id&source=email", "test_run_id": "tr-id", "test_result_summary": ANY, } From 6c232eff0d7ff632187d3278d7776efe2777df78 Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 26 Feb 2026 16:31:29 -0400 Subject: [PATCH 14/95] feat(plugins): support multiple pages per plugin spec PluginSpec.page (single class) replaced with PluginSpec.pages (list), allowing a plugin to register more than one Page in a single spec. Application.global_admin_paths now collects paths whose permission is 'global_admin' so the sidebar can switch into the admin context. 
--- testgen/ui/bootstrap.py | 7 ++++--- testgen/ui/components/utils/component.py | 11 +++++++++-- testgen/utils/plugins.py | 8 ++++++-- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py index 14a55790..52eb2b47 100644 --- a/testgen/ui/bootstrap.py +++ b/testgen/ui/bootstrap.py @@ -50,12 +50,13 @@ class Application(singleton.Singleton): - def __init__(self, auth_class: Authentication, logo: plugins.Logo, router: Router, menu: Menu, logger: logging.Logger) -> None: + def __init__(self, auth_class: Authentication, logo: plugins.Logo, router: Router, menu: Menu, logger: logging.Logger, global_admin_paths: frozenset[str]) -> None: self.auth_class = auth_class self.logo = logo self.router = router self.menu = menu self.logger = logger + self.global_admin_paths = global_admin_paths def run(log_level: int = logging.INFO) -> Application: @@ -79,8 +80,7 @@ def run(log_level: int = logging.INFO) -> Application: for plugin in installed_plugins: spec = plugin.load_streamlit() - if spec.page: - pages.append(spec.page) + pages.extend(spec.pages) if spec.auth: auth_class = spec.auth @@ -104,4 +104,5 @@ def run(log_level: int = logging.INFO) -> Application: ), ), logger=LOG, + global_admin_paths=frozenset(page.path for page in pages if page.permission == "global_admin"), ) diff --git a/testgen/ui/components/utils/component.py b/testgen/ui/components/utils/component.py index 330a42ac..0fa1cd52 100644 --- a/testgen/ui/components/utils/component.py +++ b/testgen/ui/components/utils/component.py @@ -1,5 +1,6 @@ import pathlib from collections.abc import Callable +from functools import partial import streamlit as st from streamlit.components import v1 as components @@ -10,6 +11,11 @@ component_function = components.declare_component("testgen", path=components_dir) +class RefreshableComponentRenderer(ComponentRenderer): + def refresh(self) -> None: + pass + + def component(*, id_, props, key=None, default=None, 
on_change=None): component_props = props if not component_props: @@ -17,7 +23,7 @@ def component(*, id_, props, key=None, default=None, on_change=None): return component_function(id=id_, props=component_props, key=key, default=default, on_change=on_change) -def component_v2_wrapped(renderer: ComponentRenderer) -> ComponentRenderer: +def component_v2_wrapped(renderer: ComponentRenderer) -> RefreshableComponentRenderer: def wrapped_renderer(key: str | None = None, **kwargs) -> BidiComponentResult: on_change_callbacks = { name: fn for name, fn, in kwargs.items() @@ -34,6 +40,7 @@ def wrapped_renderer(key: str | None = None, **kwargs) -> BidiComponentResult: on_change_callbacks[name] = _wrap_handler(key, name, callback) return renderer(**other_kwargs, **on_change_callbacks) + setattr(wrapped_renderer, "refresh", lambda: None) return wrapped_renderer @@ -44,7 +51,7 @@ def _is_change_callback(name: str) -> bool: def _wrap_handler(key: str | None, callback_name: str | None, callback: Callable | None): if key and callback_name and callback: def wrapper(): - component_value = st.session_state[key] or {} + component_value = st.session_state.get(key) or {} trigger_value_name = callback_name.removeprefix("on_").removesuffix("_change") trigger_value = (component_value.get(trigger_value_name) or {}).get("payload") return callback(trigger_value) diff --git a/testgen/utils/plugins.py b/testgen/utils/plugins.py index 34171900..87ea9450 100644 --- a/testgen/utils/plugins.py +++ b/testgen/utils/plugins.py @@ -113,7 +113,7 @@ def check_permission(_user: object, _permission: str) -> bool: class PluginSpec: rbac: ClassVar[type[RBACProvider]] = RBACProvider auth: ClassVar[type[Authentication] | None] = None - page: ClassVar[type[Page] | None] = None + pages: ClassVar[list[type[Page]]] = [] logo: ClassVar[type[Logo] | None] = None component: ClassVar[ComponentSpec | None] = None @@ -147,6 +147,10 @@ class Plugin: def load(self) -> type[PluginSpec]: """Lightweight load: import plugin 
module and populate PluginHook.""" + plugin_pages: list[type[Page]] = [] + plugin_auth = None + plugin_logo = None + module = importlib.import_module(self.package) spec = _find_plugin_spec(module) if spec is not None: @@ -162,7 +166,7 @@ def load_streamlit(self) -> type[PluginSpec]: return spec # Fallback: discover UI classes from module (backward compat for plugins without explicit PluginSpec) - _discoverable: dict[type, str] = {Page: "page", Authentication: "auth", Logo: "logo"} + _discoverable: dict[type, str] = {list[type[Page]]: "pages", Authentication: "auth", Logo: "logo"} attrs: dict[str, type] = {} module = importlib.import_module(self.package) From e65425815346d25bca663046a31e6a981f5c58cc Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 26 Feb 2026 16:31:40 -0400 Subject: [PATCH 15/95] feat(auth): scope sidebar projects to user memberships - Filter the project dropdown to only projects the user is a member of - Call load_user_role on every render so role reflects current project - Introduce 'global_admin' permission; sidebar enters admin-only context on global_admin pages (no project nav, AdminMenuItem items, AdminCTA) - is_global_admin prop lets non-admin users see the admin CTA in project context --- testgen/common/models/project_membership.py | 1 + testgen/ui/app.py | 18 +++- testgen/ui/auth.py | 8 +- testgen/ui/components/widgets/sidebar.py | 8 +- testgen/ui/static/js/sidebar.js | 95 ++++++++++++++++++--- 5 files changed, 113 insertions(+), 17 deletions(-) diff --git a/testgen/common/models/project_membership.py b/testgen/common/models/project_membership.py index 94bcad5e..6ee85f78 100644 --- a/testgen/common/models/project_membership.py +++ b/testgen/common/models/project_membership.py @@ -5,6 +5,7 @@ import streamlit as st from sqlalchemy import Column, ForeignKey, String, asc, select from sqlalchemy.dialects import postgresql +from sqlalchemy.orm import InstrumentedAttribute from testgen.common.models import get_current_session from 
testgen.common.models.entity import Entity diff --git a/testgen/ui/app.py b/testgen/ui/app.py index 36d8b9f7..f8aa4aa2 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -1,4 +1,5 @@ import logging +from urllib.parse import urlparse import streamlit as st @@ -52,17 +53,30 @@ def render(log_level: int = logging.INFO): if not session.auth.is_logged_in and not session.auth.logging_out: session.auth.load_user_session() + if session.auth.is_logged_in: + session.auth.load_user_role() + application.logo.render() if session.auth.is_logged_in and not session.auth.logging_in: + current_page = session.current_page + if not current_page: + try: + current_page = urlparse(st.context.url).path.lstrip("/") + except Exception: + current_page = "" + is_global_context = current_page in application.global_admin_paths with st.sidebar: testgen.sidebar( - projects=Project.select_where(), - current_project=session.sidebar_project, + projects=[] if is_global_context else [ + p for p in Project.select_where() if session.auth.user_has_project_access(p.project_code) + ], + current_project=None if is_global_context else session.sidebar_project, menu=application.menu, current_page=session.current_page, version=version_service.get_version(), support_email=settings.SUPPORT_EMAIL, + global_context=is_global_context, ) application.router.run() diff --git a/testgen/ui/auth.py b/testgen/ui/auth.py index 73ac1599..6803bb82 100644 --- a/testgen/ui/auth.py +++ b/testgen/ui/auth.py @@ -13,7 +13,7 @@ LOG = logging.getLogger("testgen") -Permission = Literal["catalog", "view", "disposition", "edit", "administer"] +Permission = Literal["catalog", "view", "disposition", "edit", "administer", "global_admin"] class Authentication: @@ -46,7 +46,7 @@ def default_page(self) -> str | None: return "project-dashboard" if self.user else "" def user_has_permission(self, permission: Permission, /, project_code: str | None = None) -> bool: # noqa: ARG002 - return True + return True # Dev/open-source: permissive, 
including global_admin def user_has_project_access(self, project_code: str) -> bool: # noqa: ARG002 return True @@ -91,7 +91,9 @@ def load_user_session(self) -> None: def load_user_role(self) -> None: if self.user and self.current_project: membership = ProjectMembership.get_by_user_and_project(self.user.id, self.current_project) - self.role = membership.role + self.role = membership.role if membership else None + else: + self.role = None def end_user_session(self) -> None: self._clear_jwt_cookie() diff --git a/testgen/ui/components/widgets/sidebar.py b/testgen/ui/components/widgets/sidebar.py index 0a9d52c8..1229c8b2 100644 --- a/testgen/ui/components/widgets/sidebar.py +++ b/testgen/ui/components/widgets/sidebar.py @@ -2,6 +2,7 @@ import time from collections.abc import Iterable +from testgen.common.models import with_database_session from testgen.common.models.project import Project from testgen.common.version_service import Version from testgen.ui.components.utils.component import component @@ -23,6 +24,7 @@ def sidebar( current_page: str | None = None, version: Version | None = None, support_email: str | None = None, + global_context: bool = False, ) -> None: """ Testgen custom component to display a styled menu over streamlit's @@ -33,6 +35,7 @@ def sidebar( :param username: username to display at the bottom of the menu :param menu: menu object with all root pages :param current_page: page address to highlight the selected item + :param global_context: when True, renders admin-only sidebar (no project nav) """ component( id_="sidebar", @@ -42,16 +45,19 @@ def sidebar( "menu": menu.filter_for_current_user().sort_items().unflatten().asdict(), "current_page": current_page, "username": session.auth.user_display, - "role": session.auth.role or "-", + "role": "" if global_context else (session.auth.role or "-"), "logout_path": LOGOUT_PATH, "version": version.__dict__, "support_email": support_email, + "global_context": global_context, + "is_global_admin": 
session.auth.user_has_permission("global_admin"), }, key=key, on_change=on_change, ) +@with_database_session def on_change(): # We cannot navigate directly here # because st.switch_page uses st.rerun under the hood diff --git a/testgen/ui/static/js/sidebar.js b/testgen/ui/static/js/sidebar.js index 9382b40f..a5770281 100644 --- a/testgen/ui/static/js/sidebar.js +++ b/testgen/ui/static/js/sidebar.js @@ -5,6 +5,7 @@ * @property {(string|null)} icon * @property {string} label * @property {(string|null)} page + * @property {(string|null)} permission * @property {(Array.|null)} items * * @typedef Version @@ -33,6 +34,8 @@ * @property {string} logout_path * @property {Version} version * @property {string} support_email + * @property {boolean} global_context + * @property {boolean} is_global_admin */ const van = window.top.van; const { a, button, div, i, img, label, option, select, span } = van.tags; @@ -55,29 +58,45 @@ const Sidebar = (/** @type {Properties} */ props) => { {class: 'menu'}, div( {class: 'fx-flex', style: 'overflow-y: auto;'}, + // Project dropdown — hidden in global admin context div( - { class: 'menu--project' }, + { + class: 'menu--project', + style: () => props.global_context?.val ? 'display: none' : '', + }, div({ class: 'caption' }, 'Project'), () => props.projects.val.length > 1 ? ProjectSelect(props.projects, currentProject) : div(currentProject.val?.name ?? '...'), ), () => { - const menuItems = props.menu?.val.items || []; - return div( - {class: 'content'}, - menuItems.map(item => - item.items?.length > 0 - ? 
MenuSection(item, props.current_page, currentProject.val?.code) - : MenuItem(item, props.current_page, currentProject.val?.code)) - ); + const allItems = props.menu?.val.items || []; + if (props.global_context?.val) { + // Global admin context: only show global_admin permission items, flat + const adminItems = allItems.filter(item => !item.items && item.permission === 'global_admin'); + return div( + {class: 'content'}, + adminItems.map(item => AdminMenuItem(item, props.current_page)), + ); + } else { + // Project context: filter out global_admin items (they have no section, appear at root level) + const projectItems = allItems.filter(item => item.items || item.permission !== 'global_admin'); + return div( + {class: 'content'}, + projectItems.map(item => + item.items?.length > 0 + ? MenuSection(item, props.current_page, currentProject.val?.code) + : MenuItem(item, props.current_page, currentProject.val?.code) + ), + ); + } }, ), div( div( { class: 'menu--user' }, span({class: 'menu--username', title: props.username}, props.username), - span({class: 'menu--role'}, props.role.val?.replace('_', ' ')), + span({class: 'menu--role'}, () => props.role.val?.replace('_', ' ')), ), div( { class: 'menu--buttons' }, @@ -100,6 +119,8 @@ const Sidebar = (/** @type {Properties} */ props) => { ) : null, ), ), + // Administration CTA — project context only, global admins only, opens in new tab + AdminCTA({ style: () => (!props.global_context?.val && props.is_global_admin?.val) ? 
'' : 'display: none' }), ); }; @@ -184,6 +205,47 @@ const MenuItem = ( ); }; +// Menu item for global admin context (no project_code in navigation) +const AdminMenuItem = ( + /** @type {MenuItem} */ item, + /** @type {string} */ currentPage, +) => { + const classes = van.derive(() => { + if (isCurrentPage(item.page, currentPage?.val)) { + return 'menu--item active'; + } + return 'menu--item'; + }); + + return a( + { + class: classes, + href: `/${item.page}`, + onclick: (event) => { + event.preventDefault(); + event.stopPropagation(); + emitEvent({ path: item.page, params: {} }); + }, + }, + i({class: 'menu--item--icon material-symbols-rounded'}, item.icon), + span({class: 'menu--item--label'}, item.label), + ); +}; + +// Single CTA shown in project context for global admins — opens admin area in new tab +const AdminCTA = ({ style } = {}) => a( + { + class: 'menu--item menu--admin-cta', + href: '/admin-projects', + target: '_blank', + rel: 'noopener noreferrer', + style, + }, + i({class: 'menu--item--icon material-symbols-rounded'}, 'admin_panel_settings'), + span({class: 'menu--item--label'}, 'Administration'), + i({class: 'menu--admin-cta--icon material-symbols-rounded'}, 'open_in_new'), +); + function emitEvent(/** @type Object */ data) { if (Sidebar.StreamlitInstance) { Sidebar.StreamlitInstance.sendData({ ...data, _id: Math.random() }); // Identify the event so its handler is called once @@ -326,7 +388,6 @@ stylesheet.replace(` .menu .menu--buttons { display: flex; justify-content: space-between; - margin-bottom: 16px; } .menu--buttons a { @@ -365,6 +426,18 @@ button.tg-button:hover { button.tg-button > i:has(+ span:not(.tg-tooltip)) { margin-right: 8px; } + +.menu--admin-cta { + margin-top: 4px; + border-top: 1px solid var(--disabled-text-color); +} + +.menu--admin-cta--icon { + margin-left: auto; + font-size: 16px !important; + line-height: 16px !important; + opacity: 0.6; +} /* ... 
*/ `); From b0c96a32b2adc92700cf5dada32ca609e045037d Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 26 Feb 2026 16:31:51 -0400 Subject: [PATCH 16/95] feat(ui): add Dialog component; fix Portal stacking context Portal now appends to document.body so position:absolute is document-relative, preventing clip by positioned ancestors. Outside- click and mutual-exclusion logic moved into the component itself. Dialog wraps Portal to provide a modal overlay with a title bar and close button, used by the project settings members form. --- .../frontend/js/components/dialog.js | 135 ++++++++++++++++++ .../frontend/js/components/portal.js | 100 ++++++++----- testgen/ui/static/js/components/dialog.js | 135 ++++++++++++++++++ testgen/ui/static/js/components/portal.js | 100 ++++++++----- 4 files changed, 400 insertions(+), 70 deletions(-) create mode 100644 testgen/ui/components/frontend/js/components/dialog.js create mode 100644 testgen/ui/static/js/components/dialog.js diff --git a/testgen/ui/components/frontend/js/components/dialog.js b/testgen/ui/components/frontend/js/components/dialog.js new file mode 100644 index 00000000..6a249fc8 --- /dev/null +++ b/testgen/ui/components/frontend/js/components/dialog.js @@ -0,0 +1,135 @@ +/** + * @typedef DialogProps + * @type {object} + * @property {(string | import('../van.min.js').State)} title - Dialog title + * @property {import('../van.min.js').State} open - Reactive open state + * @property {Function} onClose - Called when the dialog is closed (backdrop click or X button) + * @property {string} [width] - CSS width value, default '30rem' + */ +import van from '../van.min.js'; +import { getValue, loadStylesheet } from '../utils.js'; + +const { button, div, i, span } = van.tags; + +/** + * A dialog component that mimics Streamlit's dialog visual style. + * Opens as a fixed-position overlay covering the full viewport so it + * works from within any V2 component container, regardless of depth. 
+ * + * Usage: + * const open = van.state(false); + * + * Dialog( + * { title: 'Confirm', open, onClose: () => open.val = false }, + * div('Are you sure?'), + * Button({ label: 'Confirm', onclick: () => { doThing(); open.val = false; } }), + * ) + * + * @param {DialogProps} props + * @param {...(Element | string)} children - Content rendered in the dialog body + */ +const Dialog = ({ title, open, onClose, width = '30rem' }, ...children) => { + loadStylesheet('dialog', stylesheet); + + return div( + { + class: 'tg-dialog-overlay', + style: () => open.val ? '' : 'display: none', + onclick: () => onClose(), + }, + div( + { + class: 'tg-dialog', + role: 'dialog', + 'aria-modal': 'true', + tabindex: '-1', + style: () => `width: ${getValue(width)}`, + onclick: (e) => e.stopPropagation(), + }, + div( + { class: 'tg-dialog-header' }, + span({ class: 'tg-dialog-title' }, title), + ), + div({ class: 'tg-dialog-content' }, ...children), + button( + { + class: 'tg-dialog-close', + 'aria-label': 'Close', + onclick: () => onClose(), + }, + i({ class: 'material-symbols-rounded' }, 'close'), + ), + ), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-dialog-overlay { + position: fixed; + inset: 0; + z-index: 1000; + background: rgba(49, 51, 63, 0.5); + display: flex; + align-items: center; + justify-content: center; +} + +.tg-dialog { + position: relative; + background: var(--portal-background, white); + border-radius: 8px; + box-shadow: var(--portal-box-shadow, 0 4px 32px rgba(0, 0, 0, 0.25)); + max-width: calc(100vw - 2rem); + max-height: 80vh; + display: flex; + flex-direction: column; + overflow: hidden; +} + +.tg-dialog-header { + padding: 1.5rem 3.5rem 0.75rem 1.5rem; + font-size: 1.5rem; + font-weight: 600; + line-height: 1.5; + display: flex; + align-items: center; + flex-shrink: 0; +} + +.tg-dialog-content { + padding: 0.75rem 1.5rem 1.5rem; + overflow-y: auto; + font-size: 1rem; + color: var(--primary-text-color); +} + +.tg-dialog-close { + 
position: absolute; + top: 0.75rem; + right: 0.75rem; + display: flex; + align-items: center; + justify-content: center; + width: 2rem; + height: 2rem; + padding: 0; + border: none; + border-radius: 4px; + background: transparent; + cursor: pointer; + color: var(--secondary-text-color); + transition: background 200ms; +} + +.tg-dialog-close:hover { + background: rgba(0, 0, 0, 0.08); +} + +.tg-dialog-close .material-symbols-rounded { + font-size: 18px; + line-height: 18px; +} +`); + +export { Dialog }; diff --git a/testgen/ui/components/frontend/js/components/portal.js b/testgen/ui/components/frontend/js/components/portal.js index 12fa2e70..791994ed 100644 --- a/testgen/ui/components/frontend/js/components/portal.js +++ b/testgen/ui/components/frontend/js/components/portal.js @@ -1,13 +1,14 @@ /** * Container for any floating elements anchored to another element. + * The portal element is appended to document.body so position: absolute + * is document-relative, avoiding issues with positioned ancestors. * * NOTE: Ensure options is an object and turn individual properties into van.state * if dynamic updates are needed. 
- * + * * @typedef Options * @type {object} * @property {string} target - * @property {boolean?} targetRelative * @property {boolean} opened * @property {'left' | 'right'} align * @property {('top' | 'bottom')?} position @@ -17,50 +18,79 @@ import van from '../van.min.js'; import { getValue } from '../utils.js'; -const { div } = van.tags; - const Portal = (/** @type Options */ options, ...args) => { - const { target, targetRelative, align = 'left', position = 'bottom' } = getValue(options); + const { target, align = 'left', position = 'bottom' } = getValue(options); const id = `${target}-portal`; + let portalEl = null; + let outsideClickHandler = null; + + const close = () => { options.opened.val = false; }; + + window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened, close }; - window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened }; + van.derive(() => { + const isOpen = getValue(options.opened); - return () => { - if (!getValue(options.opened)) { - return ''; + if (!isOpen) { + portalEl?.remove(); + portalEl = null; + if (outsideClickHandler) { + document.removeEventListener('click', outsideClickHandler, true); + outsideClickHandler = null; + } + return; + } + + // Close other open portals before opening this one + for (const p of Object.values(window.testgen.portals)) { + if (p.domId !== id && getValue(p.opened)) { + p.close(); + } } const anchor = document.getElementById(target); - return div( - { - id, - class: getValue(options.class) ?? '', - style: `position: absolute; - z-index: 99; - ${position === 'bottom' ? calculateBottomPosition(anchor, align, targetRelative) : calculateTopPosition(anchor, align, targetRelative)} - ${getValue(options.style)}`, - }, - ...args, - ); - }; -}; + if (!anchor) return; -function calculateTopPosition(anchor, align, targetRelative) { - const anchorRect = anchor.getBoundingClientRect(); - const bottom = (targetRelative ? 
anchorRect.height : anchorRect.top); - const left = targetRelative ? 0 : anchorRect.left; - const right = targetRelative ? 0 : (window.innerWidth - anchorRect.right); + const coords = position === 'bottom' + ? calculateBottomPosition(anchor, align) + : calculateTopPosition(anchor, align); - return `min-width: ${anchorRect.width}px; bottom: ${bottom}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; -} + if (!portalEl) { + portalEl = document.createElement('div'); + document.body.appendChild(portalEl); + van.add(portalEl, ...args); + + outsideClickHandler = (event) => { + const anchor = document.getElementById(target); + if (!portalEl?.contains(event.target) && !anchor?.contains(event.target)) { + close(); + } + }; + document.addEventListener('click', outsideClickHandler, true); + } + + portalEl.id = id; + portalEl.className = getValue(options.class) ?? ''; + portalEl.style.cssText = `position: absolute; z-index: 1001; ${coords} ${getValue(options.style) ?? ''}`; + }); -function calculateBottomPosition(anchor, align, targetRelative) { - const anchorRect = anchor.getBoundingClientRect(); - const top = (targetRelative ? 0 : anchorRect.top) + anchorRect.height; - const left = targetRelative ? 0 : anchorRect.left; - const right = targetRelative ? 0 : (window.innerWidth - anchorRect.right); + return ''; +}; + +function calculateBottomPosition(anchor, align) { + const r = anchor.getBoundingClientRect(); + const top = r.top + r.height + window.scrollY; + const left = r.left + window.scrollX; + const right = window.innerWidth - r.right; + return `min-width: ${r.width}px; top: ${top}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; +} - return `min-width: ${anchorRect.width}px; top: ${top}px; ${align === 'left' ? 
`left: ${left}px;` : `right: ${right}px;`}`; +function calculateTopPosition(anchor, align) { + const r = anchor.getBoundingClientRect(); + const bottom = window.innerHeight - r.top + window.scrollY; + const left = r.left + window.scrollX; + const right = window.innerWidth - r.right; + return `min-width: ${r.width}px; bottom: ${bottom}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; } export { Portal }; diff --git a/testgen/ui/static/js/components/dialog.js b/testgen/ui/static/js/components/dialog.js new file mode 100644 index 00000000..838babbe --- /dev/null +++ b/testgen/ui/static/js/components/dialog.js @@ -0,0 +1,135 @@ +/** + * @typedef DialogProps + * @type {object} + * @property {(string | import('../van.min.js').State)} title - Dialog title + * @property {import('../van.min.js').State} open - Reactive open state + * @property {Function} onClose - Called when the dialog is closed (backdrop click or X button) + * @property {string} [width] - CSS width value, default '30rem' + */ +import van from '../van.min.js'; +import { getValue, loadStylesheet } from '../utils.js'; + +const { button, div, i, span } = van.tags; + +/** + * A dialog component that mimics Streamlit's dialog visual style. + * Opens as a fixed-position overlay covering the full viewport so it + * works from within any V2 component container, regardless of depth. + * + * Usage: + * const open = van.state(false); + * + * Dialog( + * { title: 'Confirm', open, onClose: () => open.val = false }, + * div('Are you sure?'), + * Button({ label: 'Confirm', onclick: () => { doThing(); open.val = false; } }), + * ) + * + * @param {DialogProps} props + * @param {...(Element | string)} children - Content rendered in the dialog body + */ +const Dialog = ({ title, open, onClose, width = '30rem' }, ...children) => { + loadStylesheet('dialog', stylesheet); + + return div( + { + class: 'tg-dialog-overlay', + style: () => open.val ? 
'' : 'display: none', + onclick: () => onClose(), + }, + div( + { + class: 'tg-dialog', + role: 'dialog', + 'aria-modal': 'true', + tabindex: '-1', + style: () => `width: ${getValue(width)}`, + onclick: (e) => e.stopPropagation(), + }, + div( + { class: 'tg-dialog-header' }, + span({ class: 'tg-dialog-title' }, title), + ), + div({ class: 'tg-dialog-content' }, ...children), + button( + { + class: 'tg-dialog-close', + 'aria-label': 'Close', + onclick: () => onClose(), + }, + i({ class: 'material-symbols-rounded' }, 'close'), + ), + ), + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.tg-dialog-overlay { + position: fixed; + inset: 0; + z-index: 1000; + background: rgba(49, 51, 63, 0.5); + display: flex; + align-items: center; + justify-content: center; +} + +.tg-dialog { + position: relative; + background: var(--portal-background, white); + border-radius: 8px; + box-shadow: var(--portal-box-shadow, 0 4px 32px rgba(0, 0, 0, 0.25)); + max-width: calc(100vw - 2rem); + max-height: 80vh; + display: flex; + flex-direction: column; + overflow: hidden; +} + +.tg-dialog-header { + padding: 1.5rem 3.5rem 0.75rem 1.5rem; + font-size: 1.5rem; + font-weight: 600; + line-height: 1.5; + display: flex; + align-items: center; + flex-shrink: 0; +} + +.tg-dialog-content { + padding: 0.75rem 1.5rem 1.5rem; + overflow-y: auto; + font-size: 1rem; + color: var(--primary-text-color); +} + +.tg-dialog-close { + position: absolute; + top: 0.75rem; + right: 0.75rem; + display: flex; + align-items: center; + justify-content: center; + width: 3rem; + height: 3rem; + padding: 0; + border: none; + border-radius: 4px; + background: transparent; + cursor: pointer; + color: var(--secondary-text-color); + transition: background 200ms; +} + +.tg-dialog-close:hover { + background: rgba(0, 0, 0, 0.08); +} + +.tg-dialog-close .material-symbols-rounded { + font-size: 24px; + line-height: 24px; +} +`); + +export { Dialog }; diff --git a/testgen/ui/static/js/components/portal.js 
b/testgen/ui/static/js/components/portal.js index 12fa2e70..791994ed 100644 --- a/testgen/ui/static/js/components/portal.js +++ b/testgen/ui/static/js/components/portal.js @@ -1,13 +1,14 @@ /** * Container for any floating elements anchored to another element. + * The portal element is appended to document.body so position: absolute + * is document-relative, avoiding issues with positioned ancestors. * * NOTE: Ensure options is an object and turn individual properties into van.state * if dynamic updates are needed. - * + * * @typedef Options * @type {object} * @property {string} target - * @property {boolean?} targetRelative * @property {boolean} opened * @property {'left' | 'right'} align * @property {('top' | 'bottom')?} position @@ -17,50 +18,79 @@ import van from '../van.min.js'; import { getValue } from '../utils.js'; -const { div } = van.tags; - const Portal = (/** @type Options */ options, ...args) => { - const { target, targetRelative, align = 'left', position = 'bottom' } = getValue(options); + const { target, align = 'left', position = 'bottom' } = getValue(options); const id = `${target}-portal`; + let portalEl = null; + let outsideClickHandler = null; + + const close = () => { options.opened.val = false; }; + + window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened, close }; - window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened }; + van.derive(() => { + const isOpen = getValue(options.opened); - return () => { - if (!getValue(options.opened)) { - return ''; + if (!isOpen) { + portalEl?.remove(); + portalEl = null; + if (outsideClickHandler) { + document.removeEventListener('click', outsideClickHandler, true); + outsideClickHandler = null; + } + return; + } + + // Close other open portals before opening this one + for (const p of Object.values(window.testgen.portals)) { + if (p.domId !== id && getValue(p.opened)) { + p.close(); + } } const anchor = document.getElementById(target); - return div( - 
{ - id, - class: getValue(options.class) ?? '', - style: `position: absolute; - z-index: 99; - ${position === 'bottom' ? calculateBottomPosition(anchor, align, targetRelative) : calculateTopPosition(anchor, align, targetRelative)} - ${getValue(options.style)}`, - }, - ...args, - ); - }; -}; + if (!anchor) return; -function calculateTopPosition(anchor, align, targetRelative) { - const anchorRect = anchor.getBoundingClientRect(); - const bottom = (targetRelative ? anchorRect.height : anchorRect.top); - const left = targetRelative ? 0 : anchorRect.left; - const right = targetRelative ? 0 : (window.innerWidth - anchorRect.right); + const coords = position === 'bottom' + ? calculateBottomPosition(anchor, align) + : calculateTopPosition(anchor, align); - return `min-width: ${anchorRect.width}px; bottom: ${bottom}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; -} + if (!portalEl) { + portalEl = document.createElement('div'); + document.body.appendChild(portalEl); + van.add(portalEl, ...args); + + outsideClickHandler = (event) => { + const anchor = document.getElementById(target); + if (!portalEl?.contains(event.target) && !anchor?.contains(event.target)) { + close(); + } + }; + document.addEventListener('click', outsideClickHandler, true); + } + + portalEl.id = id; + portalEl.className = getValue(options.class) ?? ''; + portalEl.style.cssText = `position: absolute; z-index: 1001; ${coords} ${getValue(options.style) ?? ''}`; + }); -function calculateBottomPosition(anchor, align, targetRelative) { - const anchorRect = anchor.getBoundingClientRect(); - const top = (targetRelative ? 0 : anchorRect.top) + anchorRect.height; - const left = targetRelative ? 0 : anchorRect.left; - const right = targetRelative ? 
0 : (window.innerWidth - anchorRect.right); + return ''; +}; + +function calculateBottomPosition(anchor, align) { + const r = anchor.getBoundingClientRect(); + const top = r.top + r.height + window.scrollY; + const left = r.left + window.scrollX; + const right = window.innerWidth - r.right; + return `min-width: ${r.width}px; top: ${top}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; +} - return `min-width: ${anchorRect.width}px; top: ${top}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; +function calculateTopPosition(anchor, align) { + const r = anchor.getBoundingClientRect(); + const bottom = window.innerHeight - r.top + window.scrollY; + const left = r.left + window.scrollX; + const right = window.innerWidth - r.right; + return `min-width: ${r.width}px; bottom: ${bottom}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; } export { Portal }; From 356ba7029c83189a936730c364d5a2c75412f03f Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 26 Feb 2026 16:31:56 -0400 Subject: [PATCH 17/95] feat(ui): add disabled prop to Toggle; add notIn form validator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toggle accepts a disabled prop that greys out the input and suppresses the clickable cursor, consistent with other form components. notIn validator rejects values found in a provided list, with an optional formatter and custom error message — used for unique-name validation on the project settings form. 
--- .../frontend/js/components/toggle.js | 11 +++++++++- .../components/frontend/js/form_validators.js | 20 +++++++++++++++++++ testgen/ui/static/js/components/toggle.js | 11 +++++++++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/testgen/ui/components/frontend/js/components/toggle.js b/testgen/ui/components/frontend/js/components/toggle.js index 0a635c7c..8d3fdbd4 100644 --- a/testgen/ui/components/frontend/js/components/toggle.js +++ b/testgen/ui/components/frontend/js/components/toggle.js @@ -4,6 +4,7 @@ * @property {string} label * @property {string?} name * @property {boolean?} checked + * @property {boolean?} disabled * @property {string?} style * @property {function(boolean)?} onChange */ @@ -15,14 +16,17 @@ const { input, label } = van.tags; const Toggle = (/** @type Properties */ props) => { loadStylesheet('toggle', stylesheet); + const disabled = props.disabled?.val ?? props.disabled ?? false; + return label( - { class: 'flex-row fx-gap-2 clickable', style: props.style ?? '', 'data-testid': props.name ?? '' }, + { class: `flex-row fx-gap-2 ${disabled ? '' : 'clickable'}`, style: props.style ?? '', 'data-testid': props.name ?? '' }, input({ type: 'checkbox', role: 'switch', class: 'tg-toggle--input clickable', name: props.name ?? '', checked: props.checked, + disabled, onchange: van.derive(() => { const onChange = props.onChange?.val ?? props.onChange; return onChange ? 
(/** @type Event */ event) => onChange(event.target.checked) : null; @@ -84,6 +88,11 @@ stylesheet.replace(` .tg-toggle--input:checked::after { left: 14px; } + +.tg-toggle--input:disabled { + opacity: 0.5; + cursor: not-allowed; +} `); export { Toggle }; diff --git a/testgen/ui/components/frontend/js/form_validators.js b/testgen/ui/components/frontend/js/form_validators.js index 635b8b6a..a0a85d5b 100644 --- a/testgen/ui/components/frontend/js/form_validators.js +++ b/testgen/ui/components/frontend/js/form_validators.js @@ -120,11 +120,31 @@ function sizeLimit(limit) { return validator; } +/** + * @typedef NotInOptions + * @type {object} + * @property {function(any): any} formatter + * @property {string} errorMessage + * + * @param {any[]} values + * @param {NotInOptions?} options + * @returns {Validator} + */ +function notIn(values, options) { + return (value) => { + if (value && values.includes(!!options?.formatter ? options.formatter(value) : value)) { + return options?.errorMessage ?? `Value cannot be any of: ${values.join(', ')}.`; + } + return null; + }; +} + export { maxLength, minLength, numberBetween, noSpaces, + notIn, required, requiredIf, sizeLimit, diff --git a/testgen/ui/static/js/components/toggle.js b/testgen/ui/static/js/components/toggle.js index 0a635c7c..8d3fdbd4 100644 --- a/testgen/ui/static/js/components/toggle.js +++ b/testgen/ui/static/js/components/toggle.js @@ -4,6 +4,7 @@ * @property {string} label * @property {string?} name * @property {boolean?} checked + * @property {boolean?} disabled * @property {string?} style * @property {function(boolean)?} onChange */ @@ -15,14 +16,17 @@ const { input, label } = van.tags; const Toggle = (/** @type Properties */ props) => { loadStylesheet('toggle', stylesheet); + const disabled = props.disabled?.val ?? props.disabled ?? false; + return label( - { class: 'flex-row fx-gap-2 clickable', style: props.style ?? '', 'data-testid': props.name ?? '' }, + { class: `flex-row fx-gap-2 ${disabled ? 
'' : 'clickable'}`, style: props.style ?? '', 'data-testid': props.name ?? '' }, input({ type: 'checkbox', role: 'switch', class: 'tg-toggle--input clickable', name: props.name ?? '', checked: props.checked, + disabled, onchange: van.derive(() => { const onChange = props.onChange?.val ?? props.onChange; return onChange ? (/** @type Event */ event) => onChange(event.target.checked) : null; @@ -84,6 +88,11 @@ stylesheet.replace(` .tg-toggle--input:checked::after { left: 14px; } + +.tg-toggle--input:disabled { + opacity: 0.5; + cursor: not-allowed; +} `); export { Toggle }; From 08a6a8aaf2707f7a5bc130688a768f336675d147 Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 26 Feb 2026 16:32:02 -0400 Subject: [PATCH 18/95] fix(ui): misc JS fixes - Table: use .val when reading dataColumns.length for colspan so it reacts to state updates rather than capturing the initial value - Streamlit: add disableV2() to clean up v2 state on component teardown - data_catalog: import DISABLED_ACTION_TEXT from display_utils - utils.js: improve JSDoc type for getValue() --- testgen/ui/components/frontend/js/pages/data_catalog.js | 2 +- testgen/ui/components/frontend/js/utils.js | 2 +- testgen/ui/static/js/components/table.js | 2 +- testgen/ui/static/js/streamlit.js | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index 66808d7d..438a0687 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -67,7 +67,7 @@ import { getColumnIcon, TABLE_ICON, LatestProfilingTime } from '../data_profilin import { RadioGroup } from '../components/radio_group.js'; import { Checkbox } from '../components/checkbox.js'; import { Select } from '../components/select.js'; -import { capitalize, caseInsensitiveIncludes } from '../display_utils.js'; +import { capitalize, caseInsensitiveIncludes, 
DISABLED_ACTION_TEXT } from '../display_utils.js'; import { TableSizeCard } from '../data_profiling/table_size.js'; import { Card } from '../components/card.js'; import { Button } from '../components/button.js'; diff --git a/testgen/ui/components/frontend/js/utils.js b/testgen/ui/components/frontend/js/utils.js index 5dc5560f..d71d6ece 100644 --- a/testgen/ui/components/frontend/js/utils.js +++ b/testgen/ui/components/frontend/js/utils.js @@ -78,7 +78,7 @@ const stateProto = Object.getPrototypeOf(van.state()); /** * Get value from van.state * @template T - * @param {T} prop + * @param {(import('./van.min.js').VanState | T)} prop * @returns {T} */ function getValue(prop) { // van state or static value diff --git a/testgen/ui/static/js/components/table.js b/testgen/ui/static/js/components/table.js index c3ae90c1..c21ac284 100644 --- a/testgen/ui/static/js/components/table.js +++ b/testgen/ui/static/js/components/table.js @@ -201,7 +201,7 @@ const Table = (options, rows) => { {class: 'tg-table-empty-state-body'}, tr( td( - {colspan: dataColumns.length}, + {colspan: dataColumns.val.length}, options.emptyState, ), ), diff --git a/testgen/ui/static/js/streamlit.js b/testgen/ui/static/js/streamlit.js index 5b90454c..e3039bc5 100644 --- a/testgen/ui/static/js/streamlit.js +++ b/testgen/ui/static/js/streamlit.js @@ -10,6 +10,10 @@ const Streamlit = { window.testgen = window.testgen || {}; window.testgen.isPage = true; }, + disableV2() { + this._v2 = false; + this._customSendDataHandler = null; + }, setFrameHeight(height) { if (!this || !this._v2) { sendMessageToStreamlit('streamlit:setFrameHeight', { height: height }); From 9b4e120fe46dcf96f6f6785d32f24d68bf05c164 Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 26 Feb 2026 16:34:48 -0400 Subject: [PATCH 19/95] fix(ui): mirror utils.js JSDoc type improvement to static copy --- testgen/common/models/project_membership.py | 1 - testgen/ui/app.py | 1 + testgen/ui/components/utils/component.py | 9 +-------- 
testgen/ui/components/widgets/sidebar.py | 3 ++- testgen/ui/static/js/utils.js | 2 +- testgen/utils/plugins.py | 11 +++++------ 6 files changed, 10 insertions(+), 17 deletions(-) diff --git a/testgen/common/models/project_membership.py b/testgen/common/models/project_membership.py index 6ee85f78..94bcad5e 100644 --- a/testgen/common/models/project_membership.py +++ b/testgen/common/models/project_membership.py @@ -5,7 +5,6 @@ import streamlit as st from sqlalchemy import Column, ForeignKey, String, asc, select from sqlalchemy.dialects import postgresql -from sqlalchemy.orm import InstrumentedAttribute from testgen.common.models import get_current_session from testgen.common.models.entity import Entity diff --git a/testgen/ui/app.py b/testgen/ui/app.py index f8aa4aa2..10e0c8ec 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -77,6 +77,7 @@ def render(log_level: int = logging.INFO): version=version_service.get_version(), support_email=settings.SUPPORT_EMAIL, global_context=is_global_context, + is_global_admin=session.auth.user_has_permission("global_admin") and bool(application.global_admin_paths), ) application.router.run() diff --git a/testgen/ui/components/utils/component.py b/testgen/ui/components/utils/component.py index 0fa1cd52..9a25502b 100644 --- a/testgen/ui/components/utils/component.py +++ b/testgen/ui/components/utils/component.py @@ -1,6 +1,5 @@ import pathlib from collections.abc import Callable -from functools import partial import streamlit as st from streamlit.components import v1 as components @@ -11,11 +10,6 @@ component_function = components.declare_component("testgen", path=components_dir) -class RefreshableComponentRenderer(ComponentRenderer): - def refresh(self) -> None: - pass - - def component(*, id_, props, key=None, default=None, on_change=None): component_props = props if not component_props: @@ -23,7 +17,7 @@ def component(*, id_, props, key=None, default=None, on_change=None): return component_function(id=id_, 
props=component_props, key=key, default=default, on_change=on_change) -def component_v2_wrapped(renderer: ComponentRenderer) -> RefreshableComponentRenderer: +def component_v2_wrapped(renderer: ComponentRenderer) -> ComponentRenderer: def wrapped_renderer(key: str | None = None, **kwargs) -> BidiComponentResult: on_change_callbacks = { name: fn for name, fn, in kwargs.items() @@ -40,7 +34,6 @@ def wrapped_renderer(key: str | None = None, **kwargs) -> BidiComponentResult: on_change_callbacks[name] = _wrap_handler(key, name, callback) return renderer(**other_kwargs, **on_change_callbacks) - setattr(wrapped_renderer, "refresh", lambda: None) return wrapped_renderer diff --git a/testgen/ui/components/widgets/sidebar.py b/testgen/ui/components/widgets/sidebar.py index 1229c8b2..a0739364 100644 --- a/testgen/ui/components/widgets/sidebar.py +++ b/testgen/ui/components/widgets/sidebar.py @@ -25,6 +25,7 @@ def sidebar( version: Version | None = None, support_email: str | None = None, global_context: bool = False, + is_global_admin: bool = False, ) -> None: """ Testgen custom component to display a styled menu over streamlit's @@ -50,7 +51,7 @@ def sidebar( "version": version.__dict__, "support_email": support_email, "global_context": global_context, - "is_global_admin": session.auth.user_has_permission("global_admin"), + "is_global_admin": is_global_admin, }, key=key, on_change=on_change, diff --git a/testgen/ui/static/js/utils.js b/testgen/ui/static/js/utils.js index 5dc5560f..d71d6ece 100644 --- a/testgen/ui/static/js/utils.js +++ b/testgen/ui/static/js/utils.js @@ -78,7 +78,7 @@ const stateProto = Object.getPrototypeOf(van.state()); /** * Get value from van.state * @template T - * @param {T} prop + * @param {(import('./van.min.js').VanState | T)} prop * @returns {T} */ function getValue(prop) { // van state or static value diff --git a/testgen/utils/plugins.py b/testgen/utils/plugins.py index 87ea9450..acaea286 100644 --- a/testgen/utils/plugins.py +++ 
b/testgen/utils/plugins.py @@ -147,10 +147,6 @@ class Plugin: def load(self) -> type[PluginSpec]: """Lightweight load: import plugin module and populate PluginHook.""" - plugin_pages: list[type[Page]] = [] - plugin_auth = None - plugin_logo = None - module = importlib.import_module(self.package) spec = _find_plugin_spec(module) if spec is not None: @@ -166,7 +162,7 @@ def load_streamlit(self) -> type[PluginSpec]: return spec # Fallback: discover UI classes from module (backward compat for plugins without explicit PluginSpec) - _discoverable: dict[type, str] = {list[type[Page]]: "pages", Authentication: "auth", Logo: "logo"} + _discoverable: dict[type, str] = {Page: "page", Authentication: "auth", Logo: "logo"} attrs: dict[str, type] = {} module = importlib.import_module(self.package) @@ -176,6 +172,9 @@ def load_streamlit(self) -> type[PluginSpec]: continue for base, attr in _discoverable.items(): if issubclass(cls, base) and cls is not base: - attrs[attr] = cls + if attr == "page": + attrs.setdefault("pages", []).append(cls) + else: + attrs[attr] = cls return type("AnyPlugin", (PluginSpec,), attrs) if attrs else PluginSpec From c3451b4276d7c3f0e2b2e846de6d270b6e3e3b18 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 27 Feb 2026 11:19:07 -0300 Subject: [PATCH 20/95] Apply 16 suggestion(s) to 2 file(s) Co-authored-by: Aarthy Adityan --- testgen/mcp/prompts/workflows.py | 4 +-- testgen/mcp/tools/reference.py | 44 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/testgen/mcp/prompts/workflows.py b/testgen/mcp/prompts/workflows.py index 7fe4c44f..4d7d53a1 100644 --- a/testgen/mcp/prompts/workflows.py +++ b/testgen/mcp/prompts/workflows.py @@ -79,7 +79,7 @@ def compare_runs(test_suite: str | None = None) -> str: 5. Compare the two runs: - **Regressions:** Tests that passed before but now fail. - **Improvements:** Tests that failed before but now pass. - - **Persistent failures:** Tests that fail in both runs. 
- - **Stable passes:** Tests that pass in both runs. + - **Persistent failures:** Tests that failed in both runs. + - **Stable passes:** Tests that passed in both runs. 6. Summarize the trend and highlight any concerning regressions. """ diff --git a/testgen/mcp/tools/reference.py b/testgen/mcp/tools/reference.py index 19954c85..f05f1ade 100644 --- a/testgen/mcp/tools/reference.py +++ b/testgen/mcp/tools/reference.py @@ -23,13 +23,13 @@ def get_test_type(test_type: str) -> str: if tt.test_description: lines.append(f"- **Description:** {tt.test_description}") if tt.measure_uom: - lines.append(f"- **Measure UOM:** {tt.measure_uom}") + lines.append(f"- **Unit of Measure:** {tt.measure_uom}") if tt.measure_uom_description: lines.append(f"- **Measure Description:** {tt.measure_uom_description}") if tt.threshold_description: lines.append(f"- **Threshold:** {tt.threshold_description}") if tt.dq_dimension: - lines.append(f"- **DQ Dimension:** {tt.dq_dimension}") + lines.append(f"- **Quality Dimension:** {tt.dq_dimension}") if tt.test_scope: lines.append(f"- **Scope:** {tt.test_scope}") if tt.except_message: @@ -42,15 +42,15 @@ def get_test_type(test_type: str) -> str: @with_database_session def test_types_resource() -> str: - """Reference table of all active test types with their descriptions and DQ dimensions.""" + """Reference table of all test types with their descriptions and data quality dimensions.""" test_types = TestType.select_where(TestType.active == "Y") if not test_types: - return "No active test types found." + return "No test types found." 
lines = [ "# TestGen Test Types Reference\n", - "| Test Type | Name | DQ Dimension | Scope | Description |", + "| Test Type | Name | Quality Dimension | Scope | Description |", "|---|---|---|---|---|", ] @@ -65,36 +65,36 @@ def test_types_resource() -> str: def glossary_resource() -> str: - """Glossary of TestGen concepts, entity hierarchy, result statuses, and DQ dimensions.""" + """Glossary of TestGen concepts, entity hierarchy, result statuses, and quality dimensions.""" return """\ # TestGen Glossary ## Entity Hierarchy -- **Project** — Top-level organizational unit. Contains connections and test suites. -- **Connection** — Database connection configuration (host, credentials, flavor). -- **Table Group** — A set of tables within a schema that are profiled and tested together. +- **Project** — Top-level organizational unit. Contains connections and table groups. +- **Connection** — Database connection configuration (database type, host, credentials). +- **Table Group** — A set of tables within a schema that are profiled, tested, and monitored together. - **Test Suite** — A collection of test definitions scoped to a table group. - **Test Definition** — A configured test with parameters, thresholds, and target table/column. - **Test Run** — An execution of a test suite producing test results. - **Test Result** — The outcome of a single test definition within a test run. +- **Monitor** — Tracks a table-level pattern (freshness, volume, schema, or a custom metric) over time and detects anomalies. +## Test Result Statuses -## Result Statuses - -- **Passed** — Test passed within acceptable thresholds. -- **Warning** — Test exceeded the failure threshold. -- **Failed** — Test exceeded the failure threshold. Higher severity. -- **Error** — Test could not execute (e.g., SQL error, missing table). -- **Log** — Informational result, not scored. +- **Passed** — Data meets test criteria. 
+- **Warning** — Data does not meet test criteria, but test severity is set to warn rather than fail. +- **Failed** — Data does not meet test criteria. +- **Error** — Test could not execute (e.g., missing table or permission issue). +- **Log** — Informational result recorded for reference. ## Disposition Disposition is a user-assigned review status for test results: - **Confirmed** (default) — Result is valid and counts toward scoring. - **Dismissed** — Result reviewed and dismissed (excluded from scoring). -- **Inactive** — Test was deactivated after this result (excluded from scoring). +- **Muted** — Test was deactivated after this result (excluded from scoring). -## DQ Dimensions +## Data Quality Dimensions - **Accuracy** — Data values are correct and reflect real-world truth. - **Completeness** — Required data is present (no unexpected NULLs or blanks). @@ -112,8 +112,8 @@ def glossary_resource() -> str: ## Monitor Types -- **Volume_Trend** — Tracks row count changes over time using statistical prediction. -- **Freshness_Trend** — Detects when a table has not been updated as expected. -- **Schema_Drift** — Detects column additions, deletions, or type changes. -- **Metric_Trend** — Tracks changes in user-defined metrics over time. +- **Freshness** — Detects when tables are not updated on their expected schedule. +- **Volume** — Tracks row count changes and alerts on unexpected spikes or drops. +- **Schema** — Detects column additions, deletions, or type changes. +- **Metric** — Tracks user-defined metrics for anomalies. 
""" From e2e95df1af8ea981d3b39306f51362fabdcdeca6 Mon Sep 17 00:00:00 2001 From: Luis Date: Mon, 2 Mar 2026 16:09:46 -0400 Subject: [PATCH 21/95] fix(ui): visual inconsistencies and navigation bug --- .../040_populate_new_schema_project.sql | 5 +- testgen/ui/app.py | 15 +-- .../frontend/js/components/dialog.js | 1 - .../frontend/js/components/portal.js | 100 ++++++------------ testgen/ui/static/js/components/button.js | 16 ++- .../ui/static/js/components/crontab_input.js | 2 +- testgen/ui/static/js/components/dialog.js | 1 - testgen/ui/static/js/components/input.js | 2 +- testgen/ui/static/js/components/select.js | 4 +- testgen/ui/static/js/components/tooltip.js | 66 +++++++++++- testgen/ui/static/js/streamlit.js | 8 +- 11 files changed, 119 insertions(+), 101 deletions(-) diff --git a/testgen/template/dbsetup/040_populate_new_schema_project.sql b/testgen/template/dbsetup/040_populate_new_schema_project.sql index cbfa8220..36f6a30c 100644 --- a/testgen/template/dbsetup/040_populate_new_schema_project.sql +++ b/testgen/template/dbsetup/040_populate_new_schema_project.sql @@ -10,12 +10,13 @@ SELECT '{PROJECT_CODE}' as project_code, WITH inserted_user AS ( INSERT INTO auth_users - (username, email, name, password) + (username, email, name, password, is_global_admin) SELECT '{UI_USER_USERNAME}' as username, '{UI_USER_EMAIL}' as email, '{UI_USER_NAME}' as name, - '{UI_USER_ENCRYPTED_PASSWORD}' as password + '{UI_USER_ENCRYPTED_PASSWORD}' as password, + true as is_global_admin RETURNING id ) INSERT INTO project_memberships diff --git a/testgen/ui/app.py b/testgen/ui/app.py index 10e0c8ec..3257fe22 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -22,16 +22,9 @@ def render(log_level: int = logging.INFO): page_icon=get_asset_path("favicon.ico"), layout="wide", # Collapse when logging out because the sidebar takes some time to be removed from the DOM - # Collapse for Catalog role since they only have access to one page initial_sidebar_state="collapsed" - if 
session.auth and ( - session.auth.logging_out - or ( - session.auth.is_logged_in - and not session.auth.user_has_permission("view") - ) - ) - else "auto", + if session.auth and session.auth.logging_out + else "auto", ) application = get_application(log_level=log_level) @@ -53,12 +46,12 @@ def render(log_level: int = logging.INFO): if not session.auth.is_logged_in and not session.auth.logging_out: session.auth.load_user_session() - if session.auth.is_logged_in: + if session.auth.is_logged_in and not session.auth.logging_out: session.auth.load_user_role() application.logo.render() - if session.auth.is_logged_in and not session.auth.logging_in: + if session.auth.is_logged_in and not session.auth.logging_in and not session.auth.logging_out: current_page = session.current_page if not current_page: try: diff --git a/testgen/ui/components/frontend/js/components/dialog.js b/testgen/ui/components/frontend/js/components/dialog.js index 6a249fc8..788a85eb 100644 --- a/testgen/ui/components/frontend/js/components/dialog.js +++ b/testgen/ui/components/frontend/js/components/dialog.js @@ -100,7 +100,6 @@ stylesheet.replace(` .tg-dialog-content { padding: 0.75rem 1.5rem 1.5rem; overflow-y: auto; - font-size: 1rem; color: var(--primary-text-color); } diff --git a/testgen/ui/components/frontend/js/components/portal.js b/testgen/ui/components/frontend/js/components/portal.js index 791994ed..12fa2e70 100644 --- a/testgen/ui/components/frontend/js/components/portal.js +++ b/testgen/ui/components/frontend/js/components/portal.js @@ -1,14 +1,13 @@ /** * Container for any floating elements anchored to another element. - * The portal element is appended to document.body so position: absolute - * is document-relative, avoiding issues with positioned ancestors. * * NOTE: Ensure options is an object and turn individual properties into van.state * if dynamic updates are needed. 
- * + * * @typedef Options * @type {object} * @property {string} target + * @property {boolean?} targetRelative * @property {boolean} opened * @property {'left' | 'right'} align * @property {('top' | 'bottom')?} position @@ -18,79 +17,50 @@ import van from '../van.min.js'; import { getValue } from '../utils.js'; +const { div } = van.tags; + const Portal = (/** @type Options */ options, ...args) => { - const { target, align = 'left', position = 'bottom' } = getValue(options); + const { target, targetRelative, align = 'left', position = 'bottom' } = getValue(options); const id = `${target}-portal`; - let portalEl = null; - let outsideClickHandler = null; - - const close = () => { options.opened.val = false; }; - - window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened, close }; - van.derive(() => { - const isOpen = getValue(options.opened); + window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened }; - if (!isOpen) { - portalEl?.remove(); - portalEl = null; - if (outsideClickHandler) { - document.removeEventListener('click', outsideClickHandler, true); - outsideClickHandler = null; - } - return; - } - - // Close other open portals before opening this one - for (const p of Object.values(window.testgen.portals)) { - if (p.domId !== id && getValue(p.opened)) { - p.close(); - } + return () => { + if (!getValue(options.opened)) { + return ''; } const anchor = document.getElementById(target); - if (!anchor) return; - - const coords = position === 'bottom' - ? 
calculateBottomPosition(anchor, align) - : calculateTopPosition(anchor, align); - - if (!portalEl) { - portalEl = document.createElement('div'); - document.body.appendChild(portalEl); - van.add(portalEl, ...args); - - outsideClickHandler = (event) => { - const anchor = document.getElementById(target); - if (!portalEl?.contains(event.target) && !anchor?.contains(event.target)) { - close(); - } - }; - document.addEventListener('click', outsideClickHandler, true); - } - - portalEl.id = id; - portalEl.className = getValue(options.class) ?? ''; - portalEl.style.cssText = `position: absolute; z-index: 1001; ${coords} ${getValue(options.style) ?? ''}`; - }); - - return ''; + return div( + { + id, + class: getValue(options.class) ?? '', + style: `position: absolute; + z-index: 99; + ${position === 'bottom' ? calculateBottomPosition(anchor, align, targetRelative) : calculateTopPosition(anchor, align, targetRelative)} + ${getValue(options.style)}`, + }, + ...args, + ); + }; }; -function calculateBottomPosition(anchor, align) { - const r = anchor.getBoundingClientRect(); - const top = r.top + r.height + window.scrollY; - const left = r.left + window.scrollX; - const right = window.innerWidth - r.right; - return `min-width: ${r.width}px; top: ${top}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; +function calculateTopPosition(anchor, align, targetRelative) { + const anchorRect = anchor.getBoundingClientRect(); + const bottom = (targetRelative ? anchorRect.height : anchorRect.top); + const left = targetRelative ? 0 : anchorRect.left; + const right = targetRelative ? 0 : (window.innerWidth - anchorRect.right); + + return `min-width: ${anchorRect.width}px; bottom: ${bottom}px; ${align === 'left' ? 
`left: ${left}px;` : `right: ${right}px;`}`; } -function calculateTopPosition(anchor, align) { - const r = anchor.getBoundingClientRect(); - const bottom = window.innerHeight - r.top + window.scrollY; - const left = r.left + window.scrollX; - const right = window.innerWidth - r.right; - return `min-width: ${r.width}px; bottom: ${bottom}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; +function calculateBottomPosition(anchor, align, targetRelative) { + const anchorRect = anchor.getBoundingClientRect(); + const top = (targetRelative ? 0 : anchorRect.top) + anchorRect.height; + const left = targetRelative ? 0 : anchorRect.left; + const right = targetRelative ? 0 : (window.innerWidth - anchorRect.right); + + return `min-width: ${anchorRect.width}px; top: ${top}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; } export { Portal }; diff --git a/testgen/ui/static/js/components/button.js b/testgen/ui/static/js/components/button.js index c78f2173..487aa1a0 100644 --- a/testgen/ui/static/js/components/button.js +++ b/testgen/ui/static/js/components/button.js @@ -18,7 +18,7 @@ import { emitEvent, enforceElementWidth, getValue, loadStylesheet } from '../utils.js'; import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; -import { Tooltip } from './tooltip.js'; +import { withTooltip } from './tooltip.js'; const { button, i, span } = van.tags; const BUTTON_TYPE = { @@ -52,24 +52,16 @@ const Button = (/** @type Properties */ props) => { } const onClickHandler = props.onclick || (() => emitEvent('ButtonClicked')); - const showTooltip = van.state(false); - return button( + const buttonEl = button( { id: getValue(props.id) ?? undefined, class: () => `tg-button tg-${getValue(props.type)}-button tg-${getValue(props.color) ?? 'basic'}-button ${getValue(props.type) !== 'icon' && isIconOnly ? 'tg-icon-button' : ''}`, style: () => `width: ${isIconOnly ? '' : (width ?? 
'100%')}; ${getValue(props.style)}`, onclick: onClickHandler, disabled: props.disabled, - onmouseenter: props.tooltip ? (() => showTooltip.val = true) : undefined, - onmouseleave: props.tooltip ? (() => showTooltip.val = false) : undefined, 'data-testid': getValue(props.testId) ?? '', }, - () => window.testgen.isPage && getValue(props.tooltip) ? Tooltip({ - text: props.tooltip, - show: showTooltip, - position: props.tooltipPosition, - }) : '', span({class: 'tg-button-focus-state-indicator'}, ''), props.icon ? i({ class: 'material-symbols-rounded', @@ -77,6 +69,10 @@ const Button = (/** @type Properties */ props) => { }, props.icon) : undefined, !isIconOnly ? span(props.label) : undefined, ); + + return getValue(props.tooltip) + ? withTooltip(buttonEl, { text: props.tooltip, position: props.tooltipPosition }) + : buttonEl; }; const stylesheet = new CSSStyleSheet(); diff --git a/testgen/ui/static/js/components/crontab_input.js b/testgen/ui/static/js/components/crontab_input.js index 5f0fc190..2701209b 100644 --- a/testgen/ui/static/js/components/crontab_input.js +++ b/testgen/ui/static/js/components/crontab_input.js @@ -86,7 +86,7 @@ const CrontabInput = (/** @type Options */ props) => { }), ), Portal( - {target: domId.val, targetRelative: true, align: 'right', style: 'width: 500px;', opened}, + {target: domId.val, align: 'right', style: 'width: 500px;', opened}, () => CrontabEditorPortal( { onChange: onEditorChange, diff --git a/testgen/ui/static/js/components/dialog.js b/testgen/ui/static/js/components/dialog.js index 838babbe..0bcdbd1b 100644 --- a/testgen/ui/static/js/components/dialog.js +++ b/testgen/ui/static/js/components/dialog.js @@ -100,7 +100,6 @@ stylesheet.replace(` .tg-dialog-content { padding: 0.75rem 1.5rem 1.5rem; overflow-y: auto; - font-size: 1rem; color: var(--primary-text-color); } diff --git a/testgen/ui/static/js/components/input.js b/testgen/ui/static/js/components/input.js index 130aba5c..b50efd1c 100644 --- 
a/testgen/ui/static/js/components/input.js +++ b/testgen/ui/static/js/components/input.js @@ -194,7 +194,7 @@ const Input = (/** @type Properties */ props) => { ? small({ class: 'tg-input--error' }, firstError) : '', Portal( - { target: domId.val, targetRelative: true, opened: autocompleteOpened }, + { target: domId.val, opened: autocompleteOpened }, () => div( { class: 'tg-input--options-wrapper' }, autocompleteOptions.val?.map(option => diff --git a/testgen/ui/static/js/components/select.js b/testgen/ui/static/js/components/select.js index 3e3e658c..efc055c4 100644 --- a/testgen/ui/static/js/components/select.js +++ b/testgen/ui/static/js/components/select.js @@ -185,7 +185,7 @@ const Select = (/** @type {Properties} */ props) => { ), Portal( - {target: domId.val, targetRelative: true, position: props.portalPosition?.val ?? props?.portalPosition, opened}, + {target: domId.val, position: props.portalPosition?.val ?? props?.portalPosition, opened}, () => div( { class: () => `tg-select--options-wrapper mt-1 ${getValue(props.portalClass) ?? ''}`, @@ -288,7 +288,7 @@ const MultiSelect = (props) => { ), Portal( - {target: domId.val, targetRelative: true, position: props.portalPosition?.val ?? props?.portalPosition, opened}, + {target: domId.val, position: props.portalPosition?.val ?? props?.portalPosition, opened}, () => div( { class: () => `tg-select--options-wrapper mt-1 ${getValue(props.portalClass) ?? 
''}`, diff --git a/testgen/ui/static/js/components/tooltip.js b/testgen/ui/static/js/components/tooltip.js index e3b23a39..77af4b9b 100644 --- a/testgen/ui/static/js/components/tooltip.js +++ b/testgen/ui/static/js/components/tooltip.js @@ -32,13 +32,61 @@ const Tooltip = (/** @type Properties */ props) => { ); }; +const computeTooltipStyle = (rect, position) => { + const cx = rect.left + rect.width / 2; + const cy = rect.top + rect.height / 2; + const gap = 5; + + const variants = { + 'top': { left: cx, top: rect.top, transform: `translateX(-50%) translateY(calc(-100% - ${gap}px))` }, + 'top-left': { left: cx + 20, top: rect.top, transform: `translateX(-100%) translateY(calc(-100% - ${gap}px))` }, + 'top-right': { left: cx - 20, top: rect.top, transform: `translateY(calc(-100% - ${gap}px))` }, + 'bottom': { left: cx, top: rect.bottom, transform: `translateX(-50%) translateY(${gap}px)` }, + 'bottom-left': { left: cx + 20, top: rect.bottom, transform: `translateX(-100%) translateY(${gap}px)` }, + 'bottom-right': { left: cx - 20, top: rect.bottom, transform: `translateY(${gap}px)` }, + 'right': { left: rect.right, top: cy, transform: `translateX(${gap}px) translateY(-50%)` }, + 'left': { left: rect.left, top: cy, transform: `translateX(calc(-100% - ${gap}px)) translateY(-50%)` }, + }; + + const { left, top, transform } = variants[position] || variants['top']; + return `position: fixed; left: ${left}px; top: ${top}px; bottom: auto; right: auto; transform: ${transform};`; +}; + const withTooltip = (/** @type HTMLElement */ component, /** @type Properties */ tooltipProps) => { + loadStylesheet('tooltip', stylesheet); + const showTooltip = van.state(false); - const tooltip = Tooltip({ ...tooltipProps, show: showTooltip }); + const positionStyle = van.state(''); - component.onmouseenter = () => showTooltip.val = true; - component.onmouseleave = () => showTooltip.val = false; - component.appendChild(tooltip); + const tooltipEl = span( + { + class: () => `tg-tooltip portal 
${getValue(tooltipProps.position) || defaultPosition} ${showTooltip.val ? '' : 'hidden'}`, + style: () => `opacity: ${showTooltip.val ? 1 : 0}; pointer-events: none; max-width: ${getValue(tooltipProps.width) || '400'}px; ${positionStyle.val}${getValue(tooltipProps.style) ?? ''}`, + }, + tooltipProps.text, + div({ class: 'tg-tooltip--triangle' }), + ); + + van.add(document.body, tooltipEl); + + requestAnimationFrame(() => { + if (!component.isConnected) return; + const observer = new MutationObserver(() => { + if (!component.isConnected) { + tooltipEl.remove(); + observer.disconnect(); + } + }); + observer.observe(document.body, { childList: true, subtree: true }); + }); + + component.addEventListener('mouseenter', () => { + positionStyle.val = computeTooltipStyle(component.getBoundingClientRect(), getValue(tooltipProps.position) || defaultPosition); + showTooltip.val = true; + }); + component.addEventListener('mouseleave', () => { + showTooltip.val = false; + }); return component; }; @@ -60,6 +108,16 @@ stylesheet.replace(` transition: opacity 0.3s; } +.tg-tooltip.portal { + position: fixed; + z-index: 9999; + top: unset; + bottom: unset; + left: unset; + right: unset; + transform: unset; +} + .tg-tooltip--triangle { width: 0; height: 0; diff --git a/testgen/ui/static/js/streamlit.js b/testgen/ui/static/js/streamlit.js index e3039bc5..2b1c7995 100644 --- a/testgen/ui/static/js/streamlit.js +++ b/testgen/ui/static/js/streamlit.js @@ -10,9 +10,11 @@ const Streamlit = { window.testgen = window.testgen || {}; window.testgen.isPage = true; }, - disableV2() { - this._v2 = false; - this._customSendDataHandler = null; + disableV2(handler) { + if (this._customSendDataHandler === handler) { + this._v2 = false; + this._customSendDataHandler = null; + } }, setFrameHeight(height) { if (!this || !this._v2) { From 2761ffa53c1146904fac3500248004810a8a84ee Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Mon, 2 Mar 2026 17:31:04 -0300 Subject: [PATCH 22/95] feat(mcp): polish 
inspector output and add validation - Remove description truncation in test types resource - Add project_code validation in list_test_suites - Restructure test run headings (## suite / ### run) - Add test type code attribute to test results - Split failure summary into Test Type | Test Name | Severity | Count - Add status to test_type GROUP BY in select_failures model - Fix RUF012 ClassVar annotation on TestRunSummary Co-Authored-By: Claude Opus 4.6 --- testgen/common/models/data_table.py | 3 +- testgen/common/models/test_result.py | 7 +- testgen/common/models/test_run.py | 11 ++- testgen/mcp/prompts/workflows.py | 15 ++-- testgen/mcp/server.py | 23 +++++- testgen/mcp/services/inventory_service.py | 83 ++------------------ testgen/mcp/tools/discovery.py | 44 ++++++++++- testgen/mcp/tools/reference.py | 29 +++---- testgen/mcp/tools/test_results.py | 54 ++++++++++--- testgen/mcp/tools/test_runs.py | 35 +++++---- tests/unit/mcp/test_inventory_service.py | 92 ++--------------------- tests/unit/mcp/test_model_test_result.py | 6 +- tests/unit/mcp/test_tools_discovery.py | 9 +++ tests/unit/mcp/test_tools_reference.py | 6 +- tests/unit/mcp/test_tools_test_results.py | 42 +++++++++-- tests/unit/mcp/test_tools_test_runs.py | 41 +++++++++- 16 files changed, 266 insertions(+), 234 deletions(-) diff --git a/testgen/common/models/data_table.py b/testgen/common/models/data_table.py index bab03a4a..5c387633 100644 --- a/testgen/common/models/data_table.py +++ b/testgen/common/models/data_table.py @@ -23,11 +23,12 @@ class DataTable(Entity): # dq_score_profiling, dq_score_testing @classmethod - def select_table_names(cls, table_groups_id: UUID, limit: int = 100) -> list[str]: + def select_table_names(cls, table_groups_id: UUID, limit: int = 100, offset: int = 0) -> list[str]: query = ( select(cls.table_name) .where(cls.table_groups_id == table_groups_id) .order_by(asc(func.lower(cls.table_name))) + .offset(offset) .limit(limit) ) return 
list(get_current_session().scalars(query).all()) diff --git a/testgen/common/models/test_result.py b/testgen/common/models/test_result.py index 296a39c8..6c9e4b97 100644 --- a/testgen/common/models/test_result.py +++ b/testgen/common/models/test_result.py @@ -60,6 +60,7 @@ def select_results( table_name: str | None = None, test_type: str | None = None, limit: int = 50, + offset: int = 0, ) -> list[Self]: clauses = [ cls.test_run_id == test_run_id, @@ -71,7 +72,7 @@ def select_results( clauses.append(cls.table_name == table_name) if test_type: clauses.append(cls.test_type == test_type) - query = select(cls).where(*clauses).order_by(cls.status, cls.table_name, cls.column_names).limit(limit) + query = select(cls).where(*clauses).order_by(cls.status, cls.table_name, cls.column_names).offset(offset).limit(limit) return get_current_session().scalars(query).all() @classmethod @@ -93,6 +94,8 @@ def select_failures( # Column grouping includes table_name for context → (table, column, count) if group_by == "column_names": group_cols = (cls.table_name, cls.column_names) + elif group_by == "test_type": + group_cols = (cls.test_type, cls.status) else: group_cols = (getattr(cls, group_by),) @@ -109,11 +112,13 @@ def select_history( cls, test_definition_id: UUID, limit: int = 20, + offset: int = 0, ) -> list[Self]: query = ( select(cls) .where(cls.test_definition_id == test_definition_id) .order_by(desc(cls.test_time)) + .offset(offset) .limit(limit) ) return get_current_session().scalars(query).all() diff --git a/testgen/common/models/test_run.py b/testgen/common/models/test_run.py index 3709328a..382b7acd 100644 --- a/testgen/common/models/test_run.py +++ b/testgen/common/models/test_run.py @@ -1,7 +1,7 @@ from collections.abc import Iterable from dataclasses import dataclass from datetime import UTC, datetime -from typing import Literal, NamedTuple, Self, TypedDict +from typing import ClassVar, Literal, NamedTuple, Self, TypedDict from uuid import UUID, uuid4 import streamlit 
as st @@ -63,6 +63,15 @@ class TestRunSummary(EntityMinimal): dismissed_ct: int dq_score_testing: float + STATUS_LABEL: ClassVar[dict[str, str]] = { + "Complete": "Completed", + "Cancelled": "Canceled", + } + + @property + def status_label(self) -> str: + return self.STATUS_LABEL.get(self.status, self.status) + @dataclass class TestRunMonitorSummary(EntityMinimal): diff --git a/testgen/mcp/prompts/workflows.py b/testgen/mcp/prompts/workflows.py index 4d7d53a1..03ae15de 100644 --- a/testgen/mcp/prompts/workflows.py +++ b/testgen/mcp/prompts/workflows.py @@ -7,7 +7,7 @@ def health_check() -> str: Please perform a data quality health check: 1. Call `get_data_inventory()` to get a complete overview of all projects, connections, table groups, and test suites. -2. For each project, call `get_recent_test_runs(project_code='...')` to get the most recent test run. +2. For each project, call `get_recent_test_runs(...)` to get the latest test runs across all suites. 3. Summarize the overall health: - Which projects/suites are healthy (all tests passing)? - Which have failures or warnings? @@ -29,7 +29,7 @@ def investigate_failures(test_suite: str | None = None) -> str: Please investigate test failures and identify root causes:{suite_filter} 1. Call `get_data_inventory()` to understand the project structure. -2. Call `get_recent_test_runs(project_code='...')` to find the most recent run{f" for suite `{test_suite}`" if test_suite else ""}. +2. Call `get_recent_test_runs(...)` to find the latest run per suite{f" for suite `{test_suite}`" if test_suite else ""}. 3. Call `get_failure_summary(test_run_id='...')` to see failures grouped by test type. 4. For each failure category, call `get_test_type(test_type='...')` to understand what the test checks. 5. Call `get_test_results(test_run_id='...', status='Failed')` to see individual failure details. 
@@ -50,14 +50,15 @@ def table_health(table_name: str) -> str: return f"""\ Please assess the data quality health of table `{table_name}`: -1. Call `get_data_inventory()` to find which table groups and test suites include this table. -2. For each relevant test suite, call `get_recent_test_runs(project_code='...')` to find the latest run. -3. Call `get_test_results(test_run_id='...', table_name='{table_name}')` to get all results for this table. -4. Summarize the table's health: +1. Call `get_data_inventory()` to discover all table groups. +2. For each table group, call `list_tables(table_group_id='...')` to check if it contains `{table_name}`. +3. For each relevant test suite, call `get_recent_test_runs(...)` to find the latest run. +4. Call `get_test_results(test_run_id='...', table_name='{table_name}')` to get all results for this table. +5. Summarize the table's health: - Which tests pass and which fail? - What data quality dimensions are affected? - Are there patterns in the failures (e.g., specific columns)? -5. Provide recommendations for improving data quality for this table. +6. Provide recommendations for improving data quality for this table. """ diff --git a/testgen/mcp/server.py b/testgen/mcp/server.py index c7096eb3..d537d841 100644 --- a/testgen/mcp/server.py +++ b/testgen/mcp/server.py @@ -32,7 +32,6 @@ CONVENTIONS - Identifiers are UUIDs passed as strings. - Dates are ISO 8601 format. -- Test results with disposition Dismissed or Inactive are excluded from counts and scores. 
""" @@ -56,7 +55,7 @@ def run_mcp() -> None: """Start the MCP server with streamable HTTP transport.""" from testgen.mcp import get_server_url from testgen.mcp.prompts.workflows import compare_runs, health_check, investigate_failures, table_health - from testgen.mcp.tools.discovery import get_data_inventory, list_projects, list_test_suites + from testgen.mcp.tools.discovery import get_data_inventory, list_projects, list_tables, list_test_suites from testgen.mcp.tools.reference import get_test_type, glossary_resource, test_types_resource from testgen.mcp.tools.test_results import get_failure_summary, get_test_result_history, get_test_results from testgen.mcp.tools.test_runs import get_recent_test_runs @@ -82,9 +81,10 @@ def run_mcp() -> None: token_verifier=JWTTokenVerifier(), ) - # Tools (8) + # Tools (9) mcp.tool()(get_data_inventory) mcp.tool()(list_projects) + mcp.tool()(list_tables) mcp.tool()(list_test_suites) mcp.tool()(get_recent_test_runs) mcp.tool()(get_test_results) @@ -103,4 +103,19 @@ def run_mcp() -> None: mcp.prompt()(compare_runs) LOG.info("Starting MCP server on %s:%s (auth issuer: %s)", settings.MCP_HOST, settings.MCP_PORT, server_url) - mcp.run(transport="streamable-http") + + if settings.IS_DEBUG: + import uvicorn + from starlette.middleware.cors import CORSMiddleware + + app = mcp.streamable_http_app() + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], + expose_headers=["Mcp-Session-Id"], + ) + uvicorn.run(app, host=settings.MCP_HOST, port=settings.MCP_PORT) + else: + mcp.run(transport="streamable-http") diff --git a/testgen/mcp/services/inventory_service.py b/testgen/mcp/services/inventory_service.py index 6f1551a6..d74562da 100644 --- a/testgen/mcp/services/inventory_service.py +++ b/testgen/mcp/services/inventory_service.py @@ -2,10 +2,8 @@ from testgen.common.models import get_current_session from testgen.common.models.connection import Connection -from 
testgen.common.models.data_table import DataTable from testgen.common.models.project import Project from testgen.common.models.table_group import TableGroup -from testgen.common.models.test_run import TestRun from testgen.common.models.test_suite import TestSuite @@ -19,19 +17,11 @@ def get_inventory() -> str: Project.project_name, Connection.connection_id, Connection.connection_name, - Connection.sql_flavor_code, TableGroup.id.label("table_group_id"), TableGroup.table_groups_name, TableGroup.table_group_schema, TestSuite.id.label("test_suite_id"), TestSuite.test_suite, - TestRun.id.label("last_run_id"), - TestRun.test_starttime, - TestRun.status.label("last_run_status"), - TestRun.test_ct, - TestRun.passed_ct, - TestRun.failed_ct, - TestRun.warning_ct, ) .outerjoin(Connection, Connection.project_code == Project.project_code) .outerjoin(TableGroup, TableGroup.connection_id == Connection.connection_id) @@ -42,30 +32,14 @@ def get_inventory() -> str: TestSuite.is_monitor.isnot(True), ), ) - .outerjoin(TestRun, TestRun.id == TestSuite.last_complete_test_run_id) .order_by(Project.project_name, Connection.connection_name, TableGroup.table_groups_name, TestSuite.test_suite) ) rows = session.execute(query).all() - # Collect table stats per project - project_codes = {row.project_code for row in rows if row.project_code} - stats_by_group = {} - for project_code in project_codes: - for stat in TableGroup.select_stats(project_code): - stats_by_group[stat.id] = stat - - # Collect table names per group (first 100) - group_ids = {row.table_group_id for row in rows if row.table_group_id} - tables_by_group: dict = {} - for gid in group_ids: - table_names = DataTable.select_table_names(gid, limit=100) - if table_names: - tables_by_group[gid] = table_names - # Build nested structure projects: dict[str, dict] = {} - total_suites = 0 + total_groups = 0 for row in rows: proj = projects.setdefault(row.project_code, { @@ -77,7 +51,6 @@ def get_inventory() -> str: conn = 
proj["connections"].setdefault(row.connection_id, { "name": row.connection_name, - "flavor": row.sql_flavor_code, "groups": {}, }) if row.table_group_id is None: @@ -89,21 +62,11 @@ def get_inventory() -> str: "suites": [], }) if row.test_suite_id is not None: - total_suites += 1 group["suites"].append({ "id": str(row.test_suite_id), "name": row.test_suite, - "last_run_id": str(row.last_run_id) if row.last_run_id else None, - "last_run_time": str(row.test_starttime) if row.test_starttime else None, - "last_run_status": row.last_run_status, - "test_ct": row.test_ct, - "passed_ct": row.passed_ct, - "failed_ct": row.failed_ct, - "warning_ct": row.warning_ct, }) - # Compact mode for large inventories - compact_suites = total_suites > 20 total_groups = sum( len(conn["groups"]) for proj in projects.values() @@ -122,70 +85,38 @@ def get_inventory() -> str: continue for _conn_id, conn in proj["connections"].items(): - lines.append(f"### Connection: {conn['name']} ({conn['flavor']})\n") + lines.append(f"### Connection: {conn['name']}\n") if not conn["groups"]: lines.append("_No table groups._\n") continue for group_id, group in conn["groups"].items(): - stat = stats_by_group.get(group_id) - table_ct = stat.table_ct if stat and stat.table_ct else 0 - column_ct = stat.column_ct if stat and stat.column_ct else 0 - group_tables = tables_by_group.get(group_id, []) - if compact_groups: lines.append( f"- **{group['name']}** (schema: `{group['schema']}`, " - f"{table_ct} tables, {column_ct} columns, " f"{len(group['suites'])} test suites)" ) continue lines.append( - f"#### Table Group: {group['name']} (schema: `{group['schema']}`, " - f"{table_ct} tables, {column_ct} columns)\n" + f"#### Table Group: {group['name']} (id: `{group_id}`, schema: `{group['schema']}`)\n" ) - if group_tables: - tables_str = ", ".join(f"`{t}`" for t in group_tables) - if table_ct and table_ct > 100: - tables_str += f", ... 
({table_ct - 100} more)" - lines.append(f"Tables: {tables_str}\n") - if not group["suites"]: lines.append("_No test suites._\n") continue for suite in group["suites"]: - if compact_suites: - status_icon = "" - if suite["last_run_status"] == "Complete": - if suite["failed_ct"]: - status_icon = " [FAILURES]" - else: - status_icon = " [OK]" - lines.append(f"- **{suite['name']}** (`{suite['id']}`){status_icon}") - else: - lines.append(f"**Test Suite: {suite['name']}** (id: `{suite['id']}`)") - if suite["last_run_id"]: - lines.append(f" - Last run: `{suite['last_run_id']}` ({suite['last_run_status']})") - lines.append(f" - Time: {suite['last_run_time']}") - lines.append( - f" - Results: {suite['test_ct']} tests, " - f"{suite['passed_ct']} passed, " - f"{suite['failed_ct']} failed, " - f"{suite['warning_ct']} warnings" - ) - else: - lines.append(" - _No completed runs._") - lines.append("") + lines.append(f"- **{suite['name']}** (id: `{suite['id']}`)") + lines.append("") lines.append("") lines.append( "---\n" - "For test type definitions, read the `testgen://test-types` resource or call `get_test_type`." + "Use `list_tables(table_group_id='...')` to see tables in a group.\n" + "Use `list_test_suites(project_code='...')` for suite details and latest run stats." ) return "\n".join(lines) diff --git a/testgen/mcp/tools/discovery.py b/testgen/mcp/tools/discovery.py index d9bb872d..853374fd 100644 --- a/testgen/mcp/tools/discovery.py +++ b/testgen/mcp/tools/discovery.py @@ -1,11 +1,14 @@ +from uuid import UUID + from testgen.common.models import with_database_session +from testgen.common.models.data_table import DataTable from testgen.common.models.project import Project from testgen.common.models.test_suite import TestSuite @with_database_session def get_data_inventory() -> str: - """Get a complete inventory of all projects, connections, table groups, and test suites with their latest run status. 
+ """Get a structural inventory of all projects, connections, table groups, and test suites. This is the recommended starting point for understanding the data quality landscape. Returns a structured markdown overview of the entire TestGen configuration. @@ -40,6 +43,9 @@ def list_test_suites(project_code: str) -> str: Args: project_code: The project code to list test suites for. """ + if not project_code: + return "Missing required parameter `project_code`." + summaries = TestSuite.select_summary(project_code) if not summaries: @@ -70,3 +76,39 @@ def list_test_suites(project_code: str) -> str: lines.append("") return "\n".join(lines) + + +@with_database_session +def list_tables(table_group_id: str, limit: int = 200, page: int = 1) -> str: + """List tables in a table group. + + Args: + table_group_id: The table group UUID. + limit: Maximum number of tables per page (default 200). + page: Page number, starting from 1 (default 1). + """ + try: + group_uuid = UUID(table_group_id) + except (ValueError, AttributeError) as err: + raise ValueError(f"Invalid table_group_id: `{table_group_id}` is not a valid UUID.") from err + + offset = (page - 1) * limit + table_names = DataTable.select_table_names(group_uuid, limit=limit, offset=offset) + total = DataTable.count_tables(group_uuid) + + if not table_names: + if page > 1: + return f"No tables on page {page} (total: {total})." + return f"No tables found for table group `{table_group_id}`." + + lines = [f"# Tables in Table Group `{table_group_id}`\n"] + lines.append(f"Total tables: {total}. Showing {len(table_names)} (page {page}).\n") + + for name in table_names: + lines.append(f"- `{name}`") + + total_pages = (total + limit - 1) // limit + if page < total_pages: + lines.append(f"\n_Page {page} of {total_pages}. 
Use `page={page + 1}` for more._") + + return "\n".join(lines) diff --git a/testgen/mcp/tools/reference.py b/testgen/mcp/tools/reference.py index f05f1ade..4c66f1ab 100644 --- a/testgen/mcp/tools/reference.py +++ b/testgen/mcp/tools/reference.py @@ -15,7 +15,7 @@ def get_test_type(test_type: str) -> str: return f"Test type `{test_type}` not found." lines = [ - f"# Test Type: {tt.test_type}\n", + f"# {tt.test_name_short} (`{tt.test_type}`)\n", f"- **Name:** {tt.test_name_short}", ] if tt.test_name_long: @@ -55,7 +55,7 @@ def test_types_resource() -> str: ] for tt in test_types: - desc = (tt.test_description or "")[:80] + desc = tt.test_description or "" lines.append( f"| {tt.test_type} | {tt.test_name_short or ''} | " f"{tt.dq_dimension or ''} | {tt.test_scope or ''} | {desc} |" @@ -71,21 +71,21 @@ def glossary_resource() -> str: ## Entity Hierarchy -- **Project** — Top-level organizational unit. Contains connections and table groups. -- **Connection** — Database connection configuration (database type, host, credentials). -- **Table Group** — A set of tables within a schema that are profiled, tested, and monitored together. +- **Project** — Top-level organizational unit. +- **Connection** — Database connection configuration (host, credentials). +- **Table Group** — A set of tables within a schema that are profiled and tested together. - **Test Suite** — A collection of test definitions scoped to a table group. - **Test Definition** — A configured test with parameters, thresholds, and target table/column. - **Test Run** — An execution of a test suite producing test results. - **Test Result** — The outcome of a single test definition within a test run. -- **Monitor** — Tracks a table-level pattern (freshness, volume, schema, or a custom metric) over time and detects anomalies. + ## Test Result Statuses -- **Passed** — Data meets test criteria. -- **Warning** — Data does not meet test criteria, but test severity is set to warn rather than fail. 
-- **Failed** — Data does not meet test criteria. -- **Error** — Test could not execute (e.g., missing table or permission issue). -- **Log** — Informational result recorded for reference. +- **Passed** — Test passed within acceptable thresholds. +- **Warning** — Test exceeded its threshold. Severity configured as Warning. +- **Failed** — Test exceeded its threshold. Severity configured as Fail. +- **Error** — Test could not execute (e.g., SQL error, missing table). +- **Log** — Informational result, not scored. ## Disposition @@ -109,11 +109,4 @@ def glossary_resource() -> str: - **table** — Tests table-level properties (e.g., row count, freshness). - **referential** — Tests relationships between tables (e.g., foreign key match). - **custom** — User-defined SQL tests. - -## Monitor Types - -- **Freshness** — Detects when tables are not updated on their expected schedule. -- **Volume** — Tracks row count changes and alerts on unexpected spikes or drops. -- **Schema** — Detects column additions, deletions, or type changes. -- **Metric** — Tracks user-defined metrics for anomalies. """ diff --git a/testgen/mcp/tools/test_results.py b/testgen/mcp/tools/test_results.py index 64eec2e6..cdd3360d 100644 --- a/testgen/mcp/tools/test_results.py +++ b/testgen/mcp/tools/test_results.py @@ -1,6 +1,7 @@ from uuid import UUID from testgen.common.models import with_database_session +from testgen.common.models.test_definition import TestType from testgen.common.models.test_result import TestResult, TestResultStatus @@ -26,6 +27,7 @@ def get_test_results( table_name: str | None = None, test_type: str | None = None, limit: int = 50, + page: int = 1, ) -> str: """Get individual test results for a test run, with optional filters. @@ -34,10 +36,12 @@ def get_test_results( status: Filter by result status (Passed, Failed, Warning, Error, Log). table_name: Filter by table name. test_type: Filter by test type code. - limit: Maximum number of results to return (default 50). 
+ limit: Maximum number of results per page (default 50). + page: Page number, starting from 1 (default 1). """ run_uuid = _parse_uuid(test_run_id, "test_run_id") status_enum = _parse_status(status) if status else None + offset = (page - 1) * limit results = TestResult.select_results( test_run_id=run_uuid, @@ -45,6 +49,7 @@ def get_test_results( table_name=table_name, test_type=test_type, limit=limit, + offset=offset, ) if not results: @@ -58,12 +63,16 @@ def get_test_results( filter_str = f" (filters: {', '.join(filters)})" if filters else "" return f"No test results found for run `{test_run_id}`{filter_str}." + type_names = {tt.test_type: tt.test_name_short for tt in TestType.select_where(TestType.active == "Y")} + lines = [f"# Test Results for run `{test_run_id}`\n"] - lines.append(f"Showing {len(results)} result(s).\n") + lines.append(f"Showing {len(results)} result(s) (page {page}).\n") for r in results: status_str = r.status.value if r.status else "Unknown" - lines.append(f"## [{status_str}] {r.test_type} on `{r.table_name}`") + test_name = type_names.get(r.test_type, r.test_type) + lines.append(f"## [{status_str}] {test_name} on `{r.table_name}`") + lines.append(f"- **Test Type:** `{r.test_type}`") lines.append(f"- Test definition: `{r.test_definition_id}`") if r.column_names: lines.append(f"- Column: `{r.column_names}`") @@ -97,24 +106,39 @@ def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: return f"No confirmed failures found for run `{test_run_id}`." 
total = sum(row[-1] for row in failures) - group_label = {"test_type": "Test Type", "table": "Table Name", "column": "Column"}[group_by] + + if group_by == "test_type": + type_names = {tt.test_type: tt.test_name_short for tt in TestType.select_where(TestType.active == "Y")} lines = [ f"# Failure Summary for run `{test_run_id}`\n", f"**Total confirmed failures (Failed + Warning):** {total}\n", - f"| {group_label} | Count |", - "|---|---|", ] + if group_by == "test_type": + lines.append("| Test Type | Test Name | Severity | Count |") + lines.append("|---|---|---|---|") + else: + group_label = {"table": "Table Name", "column": "Column"}[group_by] + lines.append(f"| {group_label} | Count |") + lines.append("|---|---|") + for row in failures: count = row[-1] if group_by == "column": # Row is (table_name, column_names, count) table, column = row[0], row[1] label = f"{table}.{column}" if column else f"{table} (table-level)" + lines.append(f"| {label} | {count} |") + elif group_by == "test_type": + # Row is (test_type, status, count) + code = row[0] + status = row[1] + name = type_names.get(code, code) + severity = status.value if status else "Unknown" + lines.append(f"| {code} | `{name}` | {severity} | {count} |") else: - label = row[0] - lines.append(f"| {label} | {count} |") + lines.append(f"| {row[0]} | {count} |") if group_by == "test_type": lines.append( @@ -129,30 +153,36 @@ def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: def get_test_result_history( test_definition_id: str, limit: int = 20, + page: int = 1, ) -> str: """Get the historical results of a specific test definition across runs, showing how measure and status changed over time. Args: test_definition_id: The UUID of the test definition (from get_test_results output). - limit: Maximum number of historical results to return (default 20). + limit: Maximum number of historical results per page (default 20). + page: Page number, starting from 1 (default 1). 
""" def_uuid = _parse_uuid(test_definition_id, "test_definition_id") - results = TestResult.select_history(test_definition_id=def_uuid, limit=limit) + offset = (page - 1) * limit + results = TestResult.select_history(test_definition_id=def_uuid, limit=limit, offset=offset) if not results: return f"No historical results found for test definition `{test_definition_id}`." + type_names = {tt.test_type: tt.test_name_short for tt in TestType.select_where(TestType.active == "Y")} + first = results[0] + test_name = type_names.get(first.test_type, first.test_type) lines = [ "# Test Result History\n", - f"- **Test Type:** {first.test_type}", + f"- **Test Type:** {test_name} (`{first.test_type}`)", f"- **Table:** `{first.table_name}`", ] if first.column_names: lines.append(f"- **Column:** `{first.column_names}`") lines.extend([ - f"\nShowing {len(results)} result(s), newest first.\n", + f"\nShowing {len(results)} result(s), newest first (page {page}).\n", "| Date | Measure | Threshold | Status |", "|---|---|---|---|", ]) diff --git a/testgen/mcp/tools/test_runs.py b/testgen/mcp/tools/test_runs.py index a0524424..ef5b4869 100644 --- a/testgen/mcp/tools/test_runs.py +++ b/testgen/mcp/tools/test_runs.py @@ -4,14 +4,17 @@ @with_database_session -def get_recent_test_runs(project_code: str, test_suite: str | None = None, limit: int = 5) -> str: - """Get recent completed test runs for a project, optionally filtered by test suite name. +def get_recent_test_runs(project_code: str, test_suite: str | None = None, limit: int = 1) -> str: + """Get the latest test runs for each test suite in a project, optionally filtered by test suite name. Args: project_code: The project code to query. test_suite: Optional test suite name to filter by. - limit: Maximum number of runs to return (default 5). + limit: Maximum runs per test suite (default 1). """ + if not project_code: + return "Missing required parameter `project_code`." 
+ test_suite_id = None if test_suite: suites = TestSuite.select_minimal_where( @@ -28,26 +31,32 @@ def get_recent_test_runs(project_code: str, test_suite: str | None = None, limit scope = f" for suite `{test_suite}`" if test_suite else "" return f"No completed test runs found in project `{project_code}`{scope}." - runs = summaries[:limit] + # Take the first `limit` runs per suite (summaries are ordered by test_starttime DESC) + seen: dict[str, int] = {} + runs = [] + for s in summaries: + count = seen.get(s.test_suite, 0) + if count < limit: + runs.append(s) + seen[s.test_suite] = count + 1 lines = [f"# Recent Test Runs for `{project_code}`\n"] if test_suite: lines[0] = f"# Recent Test Runs for `{project_code}` / `{test_suite}`\n" - lines.append(f"Showing {len(runs)} of {len(summaries)} run(s).\n") + lines.append(f"Showing {len(runs)} run(s) ({limit} per suite).\n") + current_suite = None for run in runs: + if run.test_suite != current_suite: + current_suite = run.test_suite + lines.append(f"## {current_suite}\n") + passed = run.passed_ct or 0 failed = run.failed_ct or 0 warning = run.warning_ct or 0 errors = run.error_ct or 0 - status_hint = "" - if failed or warning: - status_hint = f" **[{failed}F/{warning}W]**" - elif run.status == "Complete": - status_hint = " [OK]" - - lines.append(f"## {run.test_suite} — {run.status}{status_hint}") + lines.append(f"### {run.test_starttime} — {run.status_label}") lines.append(f"- **Run ID:** `{run.test_run_id}`") lines.append(f"- **Started:** {run.test_starttime} | **Ended:** {run.test_endtime}") lines.append(f"- **Results:** {run.test_ct or 0} tests — {passed} passed, {failed} failed, {warning} warnings, {errors} errors") @@ -56,7 +65,7 @@ def get_recent_test_runs(project_code: str, test_suite: str | None = None, limit lines.append(f"- **Dismissed:** {run.dismissed_ct}") if run.dq_score_testing is not None: - lines.append(f"- **DQ Score:** {run.dq_score_testing:.1f}") + lines.append(f"- **Testing Score:** 
{run.dq_score_testing:.1f}") lines.append("") diff --git a/tests/unit/mcp/test_inventory_service.py b/tests/unit/mcp/test_inventory_service.py index d393c820..413814e5 100644 --- a/tests/unit/mcp/test_inventory_service.py +++ b/tests/unit/mcp/test_inventory_service.py @@ -11,46 +11,27 @@ def session_mock(): def _make_row(project_code="demo", project_name="Demo", connection_id=1, connection_name="main", - sql_flavor_code="postgresql", table_group_id=None, table_groups_name="core", - table_group_schema="public", test_suite_id=None, test_suite="Quality", - last_run_id=None, test_starttime=None, last_run_status=None, - test_ct=None, passed_ct=None, failed_ct=None, warning_ct=None): + table_group_id=None, table_groups_name="core", + table_group_schema="public", test_suite_id=None, test_suite="Quality"): row = MagicMock() row.project_code = project_code row.project_name = project_name row.connection_id = connection_id row.connection_name = connection_name - row.sql_flavor_code = sql_flavor_code row.table_group_id = table_group_id or uuid4() row.table_groups_name = table_groups_name row.table_group_schema = table_group_schema row.test_suite_id = test_suite_id or uuid4() row.test_suite = test_suite - row.last_run_id = last_run_id or uuid4() - row.test_starttime = test_starttime or "2024-01-15T10:00:00" - row.last_run_status = last_run_status or "Complete" - row.test_ct = test_ct if test_ct is not None else 50 - row.passed_ct = passed_ct if passed_ct is not None else 47 - row.failed_ct = failed_ct if failed_ct is not None else 2 - row.warning_ct = warning_ct if warning_ct is not None else 1 return row -@patch("testgen.mcp.services.inventory_service.DataTable") -@patch("testgen.mcp.services.inventory_service.TableGroup") @patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_basic(mock_select, mock_tg, mock_dt, session_mock): +def test_get_inventory_basic(mock_select, session_mock): tg_id = uuid4() row = _make_row(table_group_id=tg_id) 
session_mock.execute.return_value.all.return_value = [row] - stat = MagicMock() - stat.id = tg_id - stat.table_ct = 10 - stat.column_ct = 50 - mock_tg.select_stats.return_value = [stat] - mock_dt.select_table_names.return_value = ["customers", "orders", "products"] - from testgen.mcp.services.inventory_service import get_inventory result = get_inventory() @@ -60,15 +41,10 @@ def test_get_inventory_basic(mock_select, mock_tg, mock_dt, session_mock): assert "main" in result assert "core" in result assert "Quality" in result - assert "10 tables" in result - assert "`customers`" in result - assert "`orders`" in result -@patch("testgen.mcp.services.inventory_service.DataTable") -@patch("testgen.mcp.services.inventory_service.TableGroup") @patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_empty(mock_select, mock_tg, mock_dt, session_mock): +def test_get_inventory_empty(mock_select, session_mock): session_mock.execute.return_value.all.return_value = [] from testgen.mcp.services.inventory_service import get_inventory @@ -78,13 +54,10 @@ def test_get_inventory_empty(mock_select, mock_tg, mock_dt, session_mock): assert "Data Inventory" in result -@patch("testgen.mcp.services.inventory_service.DataTable") -@patch("testgen.mcp.services.inventory_service.TableGroup") @patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_project_no_connections(mock_select, mock_tg, mock_dt, session_mock): +def test_get_inventory_project_no_connections(mock_select, session_mock): row = _make_row(connection_id=None) session_mock.execute.return_value.all.return_value = [row] - mock_tg.select_stats.return_value = [] from testgen.mcp.services.inventory_service import get_inventory @@ -94,65 +67,19 @@ def test_get_inventory_project_no_connections(mock_select, mock_tg, mock_dt, ses assert "No connections" in result -@patch("testgen.mcp.services.inventory_service.DataTable") -@patch("testgen.mcp.services.inventory_service.TableGroup") 
@patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_includes_test_type_hint(mock_select, mock_tg, mock_dt, session_mock): +def test_get_inventory_includes_list_tables_hint(mock_select, session_mock): session_mock.execute.return_value.all.return_value = [_make_row()] - stat = MagicMock() - stat.id = uuid4() - stat.table_ct = 5 - stat.column_ct = 20 - mock_tg.select_stats.return_value = [stat] - mock_dt.select_table_names.return_value = [] from testgen.mcp.services.inventory_service import get_inventory result = get_inventory() - assert "test-types" in result + assert "list_tables" in result -@patch("testgen.mcp.services.inventory_service.DataTable") -@patch("testgen.mcp.services.inventory_service.TableGroup") @patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_compact_suites(mock_select, mock_tg, mock_dt, session_mock): - """When >20 suites, suite output uses compact format (name + status icon only).""" - tg_id = uuid4() - rows = [ - _make_row( - table_group_id=tg_id, - test_suite=f"Suite_{i}", - test_suite_id=uuid4(), - failed_ct=1 if i == 0 else 0, - warning_ct=0, - ) - for i in range(25) - ] - session_mock.execute.return_value.all.return_value = rows - - stat = MagicMock() - stat.id = tg_id - stat.table_ct = 10 - stat.column_ct = 50 - mock_tg.select_stats.return_value = [stat] - mock_dt.select_table_names.return_value = ["t1"] - - from testgen.mcp.services.inventory_service import get_inventory - - result = get_inventory() - - # Compact suites: show "[FAILURES]" / "[OK]" badges, no full run details - assert "[FAILURES]" in result - assert "[OK]" in result - # Full format markers should NOT appear - assert "Last run:" not in result - - -@patch("testgen.mcp.services.inventory_service.DataTable") -@patch("testgen.mcp.services.inventory_service.TableGroup") -@patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_compact_groups(mock_select, mock_tg, mock_dt, session_mock): +def 
test_get_inventory_compact_groups(mock_select, session_mock): """When >50 groups, group output uses single-line compact format.""" rows = [ _make_row( @@ -165,9 +92,6 @@ def test_get_inventory_compact_groups(mock_select, mock_tg, mock_dt, session_moc ] session_mock.execute.return_value.all.return_value = rows - mock_tg.select_stats.return_value = [] - mock_dt.select_table_names.return_value = [] - from testgen.mcp.services.inventory_service import get_inventory result = get_inventory() diff --git a/tests/unit/mcp/test_model_test_result.py b/tests/unit/mcp/test_model_test_result.py index d32714aa..f04949b4 100644 --- a/tests/unit/mcp/test_model_test_result.py +++ b/tests/unit/mcp/test_model_test_result.py @@ -46,14 +46,14 @@ def test_select_results_with_all_filters(session_mock): def test_select_failures_by_test_type(session_mock): session_mock.execute.return_value.all.return_value = [ - ("Alpha_Trunc", 5), - ("Unique_Pct", 3), + ("Alpha_Trunc", TestResultStatus.Failed, 5), + ("Unique_Pct", TestResultStatus.Warning, 3), ] results = TestResult.select_failures(test_run_id=uuid4(), group_by="test_type") assert len(results) == 2 - assert results[0] == ("Alpha_Trunc", 5) + assert results[0] == ("Alpha_Trunc", TestResultStatus.Failed, 5) def test_select_failures_by_table_name(session_mock): diff --git a/tests/unit/mcp/test_tools_discovery.py b/tests/unit/mcp/test_tools_discovery.py index a82cffa6..6b2f0f2b 100644 --- a/tests/unit/mcp/test_tools_discovery.py +++ b/tests/unit/mcp/test_tools_discovery.py @@ -81,3 +81,12 @@ def test_list_test_suites_empty(mock_suite, db_session_mock): result = list_test_suites("nonexistent") assert "No test suites found" in result + + +def test_list_test_suites_empty_project_code(db_session_mock): + from testgen.mcp.tools.discovery import list_test_suites + + result = list_test_suites("") + + assert "Missing required parameter" in result + assert "project_code" in result diff --git a/tests/unit/mcp/test_tools_reference.py 
b/tests/unit/mcp/test_tools_reference.py index 84d3f9e8..308380bd 100644 --- a/tests/unit/mcp/test_tools_reference.py +++ b/tests/unit/mcp/test_tools_reference.py @@ -72,7 +72,7 @@ def test_test_types_resource_empty(mock_tt_cls, db_session_mock): result = test_types_resource() - assert "No active test types" in result + assert "No test types found" in result def test_glossary_resource(): @@ -82,7 +82,7 @@ def test_glossary_resource(): assert "Entity Hierarchy" in result assert "Result Statuses" in result - assert "DQ Dimensions" in result + assert "Data Quality Dimensions" in result assert "Test Scopes" in result - assert "Monitor Types" in result assert "Disposition" in result + assert "Monitor Types" not in result diff --git a/tests/unit/mcp/test_tools_test_results.py b/tests/unit/mcp/test_tools_test_results.py index f6325831..cb4c5d4a 100644 --- a/tests/unit/mcp/test_tools_test_results.py +++ b/tests/unit/mcp/test_tools_test_results.py @@ -6,12 +6,14 @@ from testgen.common.models.test_result import TestResultStatus +@patch("testgen.mcp.tools.test_results.TestType") @patch("testgen.mcp.tools.test_results.TestResult") -def test_get_test_results_basic(mock_result, db_session_mock): +def test_get_test_results_basic(mock_result, mock_tt_cls, db_session_mock): run_id = str(uuid4()) r1 = MagicMock() r1.status = TestResultStatus.Failed r1.test_type = "Alpha_Trunc" + r1.test_definition_id = uuid4() r1.table_name = "orders" r1.column_names = "customer_name" r1.result_measure = "15.3" @@ -19,11 +21,17 @@ def test_get_test_results_basic(mock_result, db_session_mock): r1.message = "Truncation detected" mock_result.select_results.return_value = [r1] + tt = MagicMock() + tt.test_type = "Alpha_Trunc" + tt.test_name_short = "Alpha Truncation" + mock_tt_cls.select_where.return_value = [tt] + from testgen.mcp.tools.test_results import get_test_results result = get_test_results(run_id) - assert "Alpha_Trunc" in result + assert "Alpha Truncation" in result + assert "`Alpha_Trunc`" 
in result assert "orders" in result assert "15.3" in result assert "Truncation detected" in result @@ -66,12 +74,20 @@ def test_get_test_results_invalid_status(db_session_mock): get_test_results(str(uuid4()), status="BadStatus") +@patch("testgen.mcp.tools.test_results.TestType") @patch("testgen.mcp.tools.test_results.TestResult") -def test_get_failure_summary_by_test_type(mock_result, db_session_mock): +def test_get_failure_summary_by_test_type(mock_result, mock_tt_cls, db_session_mock): mock_result.select_failures.return_value = [ - ("Alpha_Trunc", 5), - ("Unique_Pct", 3), + ("Alpha_Trunc", TestResultStatus.Failed, 5), + ("Unique_Pct", TestResultStatus.Warning, 3), ] + tt1 = MagicMock() + tt1.test_type = "Alpha_Trunc" + tt1.test_name_short = "Alpha Truncation" + tt2 = MagicMock() + tt2.test_type = "Unique_Pct" + tt2.test_name_short = "Unique Percent" + mock_tt_cls.select_where.return_value = [tt1, tt2] from testgen.mcp.tools.test_results import get_failure_summary @@ -79,7 +95,12 @@ def test_get_failure_summary_by_test_type(mock_result, db_session_mock): assert "Failed + Warning" in result assert "8" in result + assert "Alpha Truncation" in result assert "Alpha_Trunc" in result + assert "Test Name" in result + assert "Severity" in result + assert "Failed" in result + assert "Warning" in result assert "get_test_type" in result @@ -128,8 +149,9 @@ def test_get_failure_summary_invalid_uuid(db_session_mock): get_failure_summary("bad-uuid") +@patch("testgen.mcp.tools.test_results.TestType") @patch("testgen.mcp.tools.test_results.TestResult") -def test_get_test_result_history_basic(mock_result, db_session_mock): +def test_get_test_result_history_basic(mock_result, mock_tt_cls, db_session_mock): def_id = str(uuid4()) r1 = MagicMock() r1.test_type = "Unique_Pct" @@ -149,11 +171,17 @@ def test_get_test_result_history_basic(mock_result, db_session_mock): r2.status = TestResultStatus.Failed mock_result.select_history.return_value = [r1, r2] + tt = MagicMock() + tt.test_type 
= "Unique_Pct" + tt.test_name_short = "Unique Percent" + mock_tt_cls.select_where.return_value = [tt] + from testgen.mcp.tools.test_results import get_test_result_history result = get_test_result_history(def_id) - assert "Unique_Pct" in result + assert "Unique Percent" in result + assert "`Unique_Pct`" in result assert "orders" in result assert "99.5" in result assert "88.0" in result diff --git a/tests/unit/mcp/test_tools_test_runs.py b/tests/unit/mcp/test_tools_test_runs.py index e22b2c44..061783fe 100644 --- a/tests/unit/mcp/test_tools_test_runs.py +++ b/tests/unit/mcp/test_tools_test_runs.py @@ -17,6 +17,7 @@ def _make_run_summary(**overrides): @patch("testgen.mcp.tools.test_runs.TestRun") @patch("testgen.mcp.tools.test_runs.TestSuite") def test_get_recent_test_runs_default_limit(mock_suite, mock_run, db_session_mock): + """Default limit=1 returns one run per suite.""" runs = [_make_run_summary(test_run_id=uuid4()) for _ in range(7)] mock_run.select_summary.return_value = runs @@ -24,7 +25,8 @@ def test_get_recent_test_runs_default_limit(mock_suite, mock_run, db_session_moc result = get_recent_test_runs("demo") - assert "Showing 5 of 7" in result + # All 7 runs have test_suite="Quality Suite", so only 1 should appear + assert "1 run(s)" in result assert "Quality Suite" in result assert "92.5" in result mock_run.select_summary.assert_called_once_with(project_code="demo", test_suite_id=None) @@ -33,6 +35,7 @@ def test_get_recent_test_runs_default_limit(mock_suite, mock_run, db_session_moc @patch("testgen.mcp.tools.test_runs.TestRun") @patch("testgen.mcp.tools.test_runs.TestSuite") def test_get_recent_test_runs_custom_limit(mock_suite, mock_run, db_session_mock): + """Custom limit returns up to N runs per suite.""" runs = [_make_run_summary() for _ in range(3)] mock_run.select_summary.return_value = runs @@ -40,7 +43,29 @@ def test_get_recent_test_runs_custom_limit(mock_suite, mock_run, db_session_mock result = get_recent_test_runs("demo", limit=10) - assert 
"Showing 3 of 3" in result + assert "3 run(s)" in result + + +@patch("testgen.mcp.tools.test_runs.TestRun") +@patch("testgen.mcp.tools.test_runs.TestSuite") +def test_get_recent_test_runs_per_suite_grouping(mock_suite, mock_run, db_session_mock): + """With multiple suites, returns limit runs per suite.""" + runs = [ + _make_run_summary(test_suite="Suite A", test_run_id=uuid4()), + _make_run_summary(test_suite="Suite A", test_run_id=uuid4()), + _make_run_summary(test_suite="Suite B", test_run_id=uuid4()), + _make_run_summary(test_suite="Suite B", test_run_id=uuid4()), + ] + mock_run.select_summary.return_value = runs + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("demo") + + # limit=1 (default), so 1 per suite = 2 total + assert "2 run(s)" in result + assert "Suite A" in result + assert "Suite B" in result @patch("testgen.mcp.tools.test_runs.TestRun") @@ -94,4 +119,14 @@ def test_get_recent_test_runs_shows_failure_counts(mock_suite, mock_run, db_sess result = get_recent_test_runs("demo") - assert "5F/2W" in result + assert "5 failed" in result + assert "2 warnings" in result + + +def test_get_recent_test_runs_empty_project_code(db_session_mock): + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("") + + assert "Missing required parameter" in result + assert "project_code" in result From 3100ae7a28d61f047e0ca1450bef7a2852bd94c5 Mon Sep 17 00:00:00 2001 From: Luis Date: Mon, 2 Mar 2026 17:17:17 -0400 Subject: [PATCH 23/95] fix(ui): hide portal when main content scrolls --- testgen/ui/static/js/components/portal.js | 36 +++++++++++++++-------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/testgen/ui/static/js/components/portal.js b/testgen/ui/static/js/components/portal.js index 791994ed..b1683dd3 100644 --- a/testgen/ui/static/js/components/portal.js +++ b/testgen/ui/static/js/components/portal.js @@ -51,9 +51,10 @@ const Portal = (/** @type Options */ 
options, ...args) => { const anchor = document.getElementById(target); if (!anchor) return; + const fixed = hasFixedAncestor(anchor); const coords = position === 'bottom' - ? calculateBottomPosition(anchor, align) - : calculateTopPosition(anchor, align); + ? calculateBottomPosition(anchor, align, fixed) + : calculateTopPosition(anchor, align, fixed); if (!portalEl) { portalEl = document.createElement('div'); @@ -71,26 +72,37 @@ const Portal = (/** @type Options */ options, ...args) => { portalEl.id = id; portalEl.className = getValue(options.class) ?? ''; - portalEl.style.cssText = `position: absolute; z-index: 1001; ${coords} ${getValue(options.style) ?? ''}`; + portalEl.style.cssText = `position: ${fixed ? 'fixed' : 'absolute'}; z-index: 1001; ${coords} ${getValue(options.style) ?? ''}`; }); return ''; }; -function calculateBottomPosition(anchor, align) { +function hasFixedAncestor(el) { + let node = el.parentElement; + while (node && node !== document.body) { + if (getComputedStyle(node).position === 'fixed') return true; + node = node.parentElement; + } + return false; +} + +function calculateBottomPosition(anchor, align, fixed = false) { const r = anchor.getBoundingClientRect(); - const top = r.top + r.height + window.scrollY; - const left = r.left + window.scrollX; + const top = fixed ? r.bottom : r.bottom + window.scrollY; + const left = fixed ? r.left : r.left + window.scrollX; const right = window.innerWidth - r.right; - return `min-width: ${r.width}px; top: ${top}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; + const constrain = fixed ? `max-height: calc(100vh - ${r.bottom}px - 8px); overflow-y: auto;` : ''; + return `min-width: ${r.width}px; top: ${top}px; ${constrain} ${align === 'left' ? 
`left: ${left}px;` : `right: ${right}px;`}`; } -function calculateTopPosition(anchor, align) { +function calculateTopPosition(anchor, align, fixed = false) { const r = anchor.getBoundingClientRect(); - const bottom = window.innerHeight - r.top + window.scrollY; - const left = r.left + window.scrollX; - const right = window.innerWidth - r.right; - return `min-width: ${r.width}px; bottom: ${bottom}px; ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; + const bottom = fixed ? window.innerHeight - r.top : window.innerHeight - r.top + window.scrollY; + const left = fixed ? r.left : r.left + window.scrollX; + const right = window.innerWidth - r.right; + const constrain = fixed ? `max-height: calc(${r.top}px - 8px); overflow-y: auto;` : ''; + return `min-width: ${r.width}px; bottom: ${bottom}px; ${constrain} ${align === 'left' ? `left: ${left}px;` : `right: ${right}px;`}`; } export { Portal }; From f1d29e1c7c1bba6c8476f802e6b6e5833b4a0d9c Mon Sep 17 00:00:00 2001 From: Luis Date: Mon, 2 Mar 2026 22:50:06 -0400 Subject: [PATCH 24/95] fix(navigation): add project/permission checks and redirects --- testgen/ui/app.py | 40 ++++++++++--------- testgen/ui/auth.py | 5 +-- .../standalone/project_settings/index.js | 1 + testgen/ui/components/widgets/sidebar.py | 5 ++- testgen/ui/navigation/page.py | 24 +++++++++-- testgen/ui/navigation/router.py | 9 +++-- testgen/ui/static/css/style.css | 4 ++ testgen/ui/static/js/components/table.js | 3 +- 8 files changed, 60 insertions(+), 31 deletions(-) diff --git a/testgen/ui/app.py b/testgen/ui/app.py index 3257fe22..b5b4fbcd 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -21,9 +21,9 @@ def render(log_level: int = logging.INFO): page_title="TestGen", page_icon=get_asset_path("favicon.ico"), layout="wide", - # Collapse when logging out because the sidebar takes some time to be removed from the DOM + # Collapse when logging out or on the no-project page (no sidebar content on either) initial_sidebar_state="collapsed" 
- if session.auth and session.auth.logging_out + if (session.auth and session.auth.logging_out) or session.current_page == "no-project" else "auto", ) @@ -39,9 +39,12 @@ def render(log_level: int = logging.INFO): set_locale() - session.sidebar_project = ( - session.page_args_pending_router and session.page_args_pending_router.get("project_code") - ) or st.query_params.get("project_code", session.sidebar_project) + if session.auth.logging_out: + session.sidebar_project = None + else: + session.sidebar_project = ( + session.page_args_pending_router and session.page_args_pending_router.get("project_code") + ) or st.query_params.get("project_code", session.sidebar_project) if not session.auth.is_logged_in and not session.auth.logging_out: session.auth.load_user_session() @@ -59,19 +62,20 @@ def render(log_level: int = logging.INFO): except Exception: current_page = "" is_global_context = current_page in application.global_admin_paths - with st.sidebar: - testgen.sidebar( - projects=[] if is_global_context else [ - p for p in Project.select_where() if session.auth.user_has_project_access(p.project_code) - ], - current_project=None if is_global_context else session.sidebar_project, - menu=application.menu, - current_page=session.current_page, - version=version_service.get_version(), - support_email=settings.SUPPORT_EMAIL, - global_context=is_global_context, - is_global_admin=session.auth.user_has_permission("global_admin") and bool(application.global_admin_paths), - ) + if current_page != "no-project": + with st.sidebar: + testgen.sidebar( + projects=[] if is_global_context else [ + p for p in Project.select_where() if session.auth.user_has_project_access(p.project_code) + ], + current_project=None if is_global_context else session.sidebar_project, + menu=application.menu, + current_page=session.current_page, + version=version_service.get_version(), + support_email=settings.SUPPORT_EMAIL, + global_context=is_global_context, + 
is_global_admin=session.auth.user_has_permission("global_admin") and bool(application.global_admin_paths), + ) application.router.run() diff --git a/testgen/ui/auth.py b/testgen/ui/auth.py index 6803bb82..b3830100 100644 --- a/testgen/ui/auth.py +++ b/testgen/ui/auth.py @@ -41,12 +41,11 @@ def user_display(self) -> str | None: def current_project(self) -> str | None: return session.sidebar_project - @property - def default_page(self) -> str | None: + def get_default_page(self, _project_code: str | None = None) -> str: return "project-dashboard" if self.user else "" def user_has_permission(self, permission: Permission, /, project_code: str | None = None) -> bool: # noqa: ARG002 - return True # Dev/open-source: permissive, including global_admin + return True def user_has_project_access(self, project_code: str) -> bool: # noqa: ARG002 return True diff --git a/testgen/ui/components/frontend/standalone/project_settings/index.js b/testgen/ui/components/frontend/standalone/project_settings/index.js index fb7d0533..fa88c954 100644 --- a/testgen/ui/components/frontend/standalone/project_settings/index.js +++ b/testgen/ui/components/frontend/standalone/project_settings/index.js @@ -159,6 +159,7 @@ export default (component) => { } return () => { + Streamlit.disableV2(setTriggerValue); parentElement.state = null; }; }; diff --git a/testgen/ui/components/widgets/sidebar.py b/testgen/ui/components/widgets/sidebar.py index a0739364..f847dac0 100644 --- a/testgen/ui/components/widgets/sidebar.py +++ b/testgen/ui/components/widgets/sidebar.py @@ -87,7 +87,8 @@ def on_change(): # (even though it works fine locally) time.sleep(0.3) else: + query_params = event_data.get("params", {}) Router().queue_navigation( - to=event_data.get("path") or session.auth.default_page, - with_args=event_data.get("params", {}), + to=event_data.get("path") or session.auth.get_default_page(project_code=query_params.get("project_code")), + with_args=query_params, ) diff --git 
a/testgen/ui/navigation/page.py b/testgen/ui/navigation/page.py index d80bee29..11e93f06 100644 --- a/testgen/ui/navigation/page.py +++ b/testgen/ui/navigation/page.py @@ -8,7 +8,6 @@ from streamlit.runtime.state.query_params_proxy import QueryParamsProxy import testgen.ui.navigation.router -from testgen.common.models.project import Project from testgen.ui.auth import Permission from testgen.ui.navigation.menu import MenuItem from testgen.ui.session import session @@ -33,20 +32,37 @@ def __init__(self, router: testgen.ui.navigation.router.Router) -> None: def _navigate(self) -> None: self.router.navigate_to_pending() + + is_admin_page = self.permission == "global_admin" + requested_project = st.query_params.get("project_code") + if not is_admin_page and session.auth.user and requested_project and not session.auth.user_has_project_access(requested_project): + default_page = session.auth.get_default_page() + project_codes = session.auth.user.get_accessible_projects() + return self.router.navigate_with_warning( + "You do not have access to this project or it does not exist. 
Redirecting ...", + to=default_page, + with_args={"project_code": project_codes[0] if project_codes else None}, + ) + + sidebar_project = session.sidebar_project + if not sidebar_project and session.auth.user: + project_codes = [requested_project] if requested_project else session.auth.user.get_accessible_projects() + sidebar_project = project_codes[0] if project_codes else None + session.sidebar_project = sidebar_project + permission_guard = lambda: session.auth.user_has_permission(self.permission) if self.permission else True for guard in [ permission_guard, *(self.can_activate or []) ]: can_activate = guard() if can_activate != True: - session.sidebar_project = session.sidebar_project or Project.select_where()[0].project_code - if type(can_activate) == str: return self.router.navigate(to=can_activate, with_args={ "project_code": session.sidebar_project }) session.page_pending_login = self.path session.page_args_pending_login = st.query_params.to_dict() - default_page = session.auth.default_page or "" + default_page = session.auth.get_default_page(project_code=session.sidebar_project) with_args = { "project_code": session.sidebar_project } if default_page else {} + return self.router.navigate(to=default_page, with_args=with_args) self.render(**self._query_params_to_kwargs(st.query_params)) diff --git a/testgen/ui/navigation/router.py b/testgen/ui/navigation/router.py index bb6ae98d..eaa43a52 100644 --- a/testgen/ui/navigation/router.py +++ b/testgen/ui/navigation/router.py @@ -8,7 +8,6 @@ import testgen.ui.navigation.page from testgen.common.mixpanel_service import MixpanelService -from testgen.common.models.project import Project from testgen.common.models.settings import PersistedSetting from testgen.ui.session import session from testgen.utils.singleton import Singleton @@ -54,7 +53,7 @@ def run(self) -> None: if session.auth.logging_in: session.auth.logging_in = False - pending_route = session.page_pending_login or session.auth.default_page or "" + 
pending_route = session.page_pending_login or session.auth.get_default_page(project_code=session.sidebar_project) pending_args = ( (session.page_args_pending_login or {}) if session.page_pending_login @@ -133,7 +132,11 @@ def navigate(self, /, to: str, with_args: dict = {}) -> None: # noqa: B006 def navigate_with_warning(self, warning: str, to: str, with_args: dict = {}) -> None: # noqa: B006 st.warning(warning) time.sleep(3) - session.sidebar_project = session.sidebar_project or Project.select_where()[0].project_code + sidebar_project = session.sidebar_project + if session.auth.user and not sidebar_project: + project_codes = session.auth.user.get_accessible_projects() + sidebar_project = project_codes[0] if project_codes else None + session.sidebar_project = sidebar_project self.navigate(to, {"project_code": session.sidebar_project, **with_args}) def set_query_params(self, with_args: dict) -> None: diff --git a/testgen/ui/static/css/style.css b/testgen/ui/static/css/style.css index 2637dbd5..05f5768c 100644 --- a/testgen/ui/static/css/style.css +++ b/testgen/ui/static/css/style.css @@ -113,6 +113,10 @@ section.stSidebar > [data-testid="stSidebarContent"] { [data-testid="stSidebarUserContent"] { display: none; } + +.stAppViewContainer:has(.tg-no-project) > .stSidebar { + display: none; +} /* */ /* Main content */ diff --git a/testgen/ui/static/js/components/table.js b/testgen/ui/static/js/components/table.js index c21ac284..58185fb2 100644 --- a/testgen/ui/static/js/components/table.js +++ b/testgen/ui/static/js/components/table.js @@ -43,6 +43,7 @@ * @property {string?} class * @property {((row: any, index: number) => string)?} rowClass * @property {string?} height + * @property {string?} maxHeight * @property {string?} width * @property {boolean?} highDensity * @property {boolean?} dynamicWidth @@ -157,7 +158,7 @@ const Table = (options, rows) => { return div( { class: () => `tg-table flex-column border border-radius-1 ${getValue(options.highDensity) ? 
'tg-table-high-density' : ''} ${getValue(options.dynamicWidth) ? 'tg-table-dynamic-width' : ''} ${options.onRowsSelected ? 'tg-table-hoverable' : ''}`, - style: () => `height: ${getValue(options.height) ? getValue(options.height) + 'px' : defaultHeight};`, + style: () => `height: ${getValue(options.height) ? getValue(options.height) : defaultHeight}; ${getValue(options.maxHeight) ? 'max-height: ' + getValue(options.maxHeight) + ';' : ''}`, }, options.header, div( From 62e490b43d5b874779403fee91c7acd0e6da1ee4 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Tue, 17 Feb 2026 22:58:33 -0300 Subject: [PATCH 25/95] feat(catalog): add CSV import/export for metadata Add a dialog to import table and column metadata (description, CDE, tags) from CSV files, with preview, validation, and inline status indicators. Also add a "Metadata CSV" export option. Co-Authored-By: Claude Opus 4.6 --- testgen/ui/components/frontend/js/main.js | 1 + .../frontend/js/pages/data_catalog.js | 32 +- .../js/pages/import_metadata_dialog.js | 213 ++++++++++ .../components/widgets/testgen_component.py | 1 + testgen/ui/views/data_catalog.py | 154 +++++-- .../views/dialogs/import_metadata_dialog.py | 401 ++++++++++++++++++ tests/unit/ui/__init__.py | 0 tests/unit/ui/conftest.py | 7 + tests/unit/ui/test_import_metadata.py | 324 ++++++++++++++ 9 files changed, 1096 insertions(+), 37 deletions(-) create mode 100644 testgen/ui/components/frontend/js/pages/import_metadata_dialog.js create mode 100644 testgen/ui/views/dialogs/import_metadata_dialog.py create mode 100644 tests/unit/ui/__init__.py create mode 100644 tests/unit/ui/conftest.py create mode 100644 tests/unit/ui/test_import_metadata.py diff --git a/testgen/ui/components/frontend/js/main.js b/testgen/ui/components/frontend/js/main.js index 8819548e..c5f06d10 100644 --- a/testgen/ui/components/frontend/js/main.js +++ b/testgen/ui/components/frontend/js/main.js @@ -47,6 +47,7 @@ const componentLoaders = { test_results_chart: () => 
import('./pages/test_results_chart.js').then(m => m.TestResultsChart), schema_changes_list: () => import('./components/schema_changes_list.js').then(m => m.SchemaChangesList), edit_monitor_settings: () => import('./pages/edit_monitor_settings.js').then(m => m.EditMonitorSettings), + import_metadata_dialog: () => import('./pages/import_metadata_dialog.js').then(m => m.ImportMetadataDialog), }; const TestGenComponent = async (/** @type {string} */ id, /** @type {object} */ props) => { diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index 1e2f4dfb..f6fddcd4 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -208,7 +208,22 @@ const DataCatalog = (/** @type Properties */ props) => { testId: 'table-group-filter', onChange: (value) => emitEvent('TableGroupSelected', {payload: value}), }), - ExportOptions(treeNodes, multiSelectedItems), + div( + { class: 'flex-row fx-gap-2' }, + userCanEdit + ? Button({ + icon: 'upload', + type: 'stroked', + label: 'Import', + tooltip: 'Import metadata from CSV', + tooltipPosition: 'left', + width: 'fit-content', + style: 'background: var(--button-generic-background-color);', + onclick: () => emitEvent('ImportClicked', {}), + }) + : null, + ExportOptions(treeNodes, multiSelectedItems, userCanEdit), + ), ), () => treeNodes.val.length ? 
div( @@ -322,7 +337,7 @@ const DataCatalog = (/** @type Properties */ props) => { : ConditionalEmptyState(projectSummary, userCanEdit, userCanNavigate); }; -const ExportOptions = (/** @type TreeNode[] */ treeNodes, /** @type SelectedNode[] */ selectedNodes) => { +const ExportOptions = (/** @type TreeNode[] */ treeNodes, /** @type SelectedNode[] */ selectedNodes, /** @type boolean */ userCanEdit) => { const exportOptionsDomId = `data-catalog-export-${getRandomId()}`; const exportOptionsOpened = van.state(false); @@ -399,6 +414,19 @@ const ExportOptions = (/** @type TreeNode[] */ treeNodes, /** @type SelectedNode 'Selected columns', ) : null, + userCanEdit + ? div( + { + class: 'tg-dh--export-option', + style: 'border-top: var(--button-stroked-border);', + onclick: () => { + emitEvent('ExportCsvClicked', {}); + exportOptionsOpened.val = false; + }, + }, + 'Metadata CSV', + ) + : null, ), ), ]; diff --git a/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js b/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js new file mode 100644 index 00000000..9d30d74b --- /dev/null +++ b/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js @@ -0,0 +1,213 @@ +/** + * @typedef Properties + * @type {object} + * @property {object|null} preview + * @property {object|null} result + */ +import van from '../van.min.js'; +import { Streamlit } from '../streamlit.js'; +import { emitEvent, getValue, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; +import { RadioGroup } from '../components/radio_group.js'; +import { FileInput } from '../components/file_input.js'; +import { Button } from '../components/button.js'; +import { Alert } from '../components/alert.js'; +import { Table } from '../components/table.js'; +import { capitalize } from '../display_utils.js'; + +const { div, i, span } = van.tags; + +const ImportMetadataDialog = (/** @type Properties */ props) => { + loadStylesheet('import-metadata-dialog', 
stylesheet); + Streamlit.setFrameHeight(1); + window.testgen.isPage = true; + + const wrapperId = 'import-metadata-wrapper'; + resizeFrameHeightToElement(wrapperId); + resizeFrameHeightOnDOMChange(wrapperId); + + const blankBehavior = van.state('keep'); + const fileValue = van.state(null); + + return div( + { id: wrapperId, class: 'flex-column fx-gap-4' }, + RadioGroup({ + label: 'When import value is blank', + options: [ + { label: 'Keep existing values', value: 'keep' }, + { label: 'Clear existing values', value: 'clear' }, + ], + value: blankBehavior, + onChange: (value) => blankBehavior.val = value, + layout: 'vertical', + }), + FileInput({ + name: 'csv_file', + label: 'Drop CSV file here or click to browse', + value: fileValue, + onChange: (value) => { + fileValue.val = value; + if (value?.content) { + emitEvent('FileUploaded', { + payload: { + content: value.content, + blank_behavior: blankBehavior.val, + }, + }); + } else { + emitEvent('FileCleared', {}); + } + }, + }), + () => { + const result = getValue(props.result); + if (result) { + return Alert( + { type: result.success ? 'success' : 'error', icon: result.success ? 'check_circle' : 'error' }, + span(result.message), + ); + } + + const preview = getValue(props.preview); + if (!preview) { + return ''; + } + + const hasError = !!preview.error; + const totalMatched = hasError ? 0 : (preview.table_count || 0) + (preview.column_count || 0); + const hasMatches = totalMatched > 0; + + return div( + { class: 'flex-column fx-gap-3' }, + hasError + ? '' + : span( + { class: 'text-secondary' }, + `Summary: ${preview.table_count || 0} table(s), ${preview.column_count || 0} column(s) matched`, + ), + hasError + ? 
Alert({ type: 'error', icon: 'error' }, span(preview.error)) + : PreviewTable(preview), + div( + { class: 'flex-row fx-justify-content-flex-end' }, + Button({ + type: 'stroked', + color: 'primary', + label: 'Import Metadata', + icon: 'upload', + width: 'auto', + disabled: !hasMatches, + onclick: () => emitEvent('ImportConfirmed', {}), + }), + ), + ); + }, + ); +}; + +const STATUS_ICONS = { + ok: 'check_circle', + warning: 'warning', + error: 'error', + unmatched: 'block', +}; + +const PreviewTable = (preview) => { + const metadataColumns = preview.metadata_columns || []; + const previewRows = preview.preview_rows || []; + + const columns = [ + { name: '_status_icon', label: '', width: 32 }, + { name: 'table_name', label: 'Table', width: 150 }, + { name: 'column_name', label: 'Column', width: 150 }, + ...metadataColumns.map(col => ({ + name: col, + label: col === 'critical_data_element' ? 'CDE' : capitalize(col.replaceAll('_', ' ')), + width: col === 'description' ? 200 : 120, + })), + ]; + + const rows = previewRows.map(row => { + const status = row._status || 'ok'; + const icon = STATUS_ICONS[status] || STATUS_ICONS.ok; + const truncatedFields = row._truncated_fields || []; + + const tableRow = { + _status: status, + _status_icon: i( + { + class: `material-symbols-rounded import-status-${status}`, + style: 'font-size: 16px; cursor: default', + title: row._status_detail || '', + }, + icon, + ), + table_name: row.table_name ?? '', + column_name: row.column_name ?? '', + }; + + for (const col of metadataColumns) { + let val = row[col] ?? 
''; + if (truncatedFields.includes(col) && val) { + val += '\u2026'; + } + tableRow[col] = val; + } + + return tableRow; + }); + + return Table( + { + columns, + height: Math.min(300, 40 + rows.length * 40), + highDensity: true, + rowClass: (row) => { + if (row._status === 'unmatched') return 'import-row-unmatched'; + if (row._status === 'error') return 'import-row-error'; + if (row._status === 'warning') return 'import-row-warning'; + return ''; + }, + }, + rows, + ); +}; + +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.import-status-ok { + color: var(--primary-color); +} + +.import-status-warning { + color: var(--orange); +} + +.import-status-error { + color: var(--error-color); +} + +.import-status-unmatched { + color: var(--disabled-text-color); +} + +.import-row-unmatched > td { + background-color: rgba(0, 0, 0, 0.03); + color: var(--disabled-text-color); +} + +.import-row-error > td { + background-color: color-mix(in srgb, var(--error-color) 5%, transparent); +} + +.import-row-warning > td { + background-color: color-mix(in srgb, var(--orange) 8%, transparent); +} + +@media (prefers-color-scheme: dark) { + .import-row-unmatched > td { + background-color: rgba(255, 255, 255, 0.03); + } +} +`); + +export { ImportMetadataDialog }; diff --git a/testgen/ui/components/widgets/testgen_component.py b/testgen/ui/components/widgets/testgen_component.py index 8161a0b7..93dbe523 100644 --- a/testgen/ui/components/widgets/testgen_component.py +++ b/testgen/ui/components/widgets/testgen_component.py @@ -22,6 +22,7 @@ "table_group_wizard", "help_menu", "notification_settings", + "import_metadata_dialog", ] diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index c06c8b96..00881dd1 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -37,6 +37,7 @@ from testgen.ui.session import session, temp_value from testgen.ui.views.dialogs.column_history_dialog import column_history_dialog from 
testgen.ui.views.dialogs.data_preview_dialog import data_preview_dialog +from testgen.ui.views.dialogs.import_metadata_dialog import open_import_metadata_dialog from testgen.ui.views.dialogs.run_profiling_dialog import run_profiling_dialog from testgen.ui.views.dialogs.table_create_script_dialog import table_create_script_dialog from testgen.utils import friendly_score, is_uuid4, make_json_safe, score @@ -54,13 +55,15 @@ class DataCatalogPage(Page): ] menu_item = MenuItem(icon=PAGE_ICON, label=PAGE_TITLE, section="Data Profiling", order=0) - def render(self, project_code: str, table_group_id: str | None = None, selected: str | None = None, **_kwargs) -> None: + def render( + self, project_code: str, table_group_id: str | None = None, selected: str | None = None, **_kwargs + ) -> None: testgen.page_header( PAGE_TITLE, "data-catalog", ) - _, loading_column = st.columns([.4, .6]) + _, loading_column = st.columns([0.4, 0.6]) spinner_container = loading_column.container(key="data_catalog:spinner") with spinner_container: @@ -74,7 +77,7 @@ def render(self, project_code: str, table_group_id: str | None = None, selected: user_can_navigate = session.auth.user_has_permission("view") table_groups = TableGroup.select_minimal_where(TableGroup.project_code == project_code) - if not table_group_id or table_group_id not in [ str(item.id) for item in table_groups ]: + if not table_group_id or table_group_id not in [str(item.id) for item in table_groups]: table_group_id = str(table_groups[0].id) if table_groups else None on_table_group_selected(table_group_id) @@ -89,7 +92,7 @@ def render(self, project_code: str, table_group_id: str | None = None, selected: selected_item["connection_id"] = str(selected_table_group.connection_id) else: on_item_selected(None) - + testgen_component( "data_catalog", props={ @@ -99,7 +102,8 @@ def render(self, project_code: str, table_group_id: str | None = None, selected: "value": str(table_group.id), "label": table_group.table_groups_name, "selected": 
table_group_id == str(table_group.id), - } for table_group in table_groups + } + for table_group in table_groups ], "columns": json.dumps(make_json_safe(columns)) if columns else None, "selected_item": json.dumps(make_json_safe(selected_item)) if selected_item else None, @@ -114,7 +118,9 @@ def render(self, project_code: str, table_group_id: str | None = None, selected: "RunProfilingClicked": lambda _: run_profiling_dialog( project_code=project_code, table_group_id=selected_table_group.id, - ) if selected_table_group else None, + ) + if selected_table_group + else None, "TableGroupSelected": on_table_group_selected, "ItemSelected": on_item_selected, "ExportClicked": lambda items: download_dialog( @@ -140,32 +146,41 @@ def render(self, project_code: str, table_group_id: str | None = None, selected: item["column_name"], item["add_date"], ), + "ImportClicked": lambda _: open_import_metadata_dialog(str(selected_table_group.id)) + if selected_table_group + else None, + "ExportCsvClicked": lambda _: export_metadata_csv(selected_table_group) + if selected_table_group + else None, }, - event_handlers={ "TagsChanged": partial(on_tags_changed, spinner_container) }, + event_handlers={"TagsChanged": partial(on_tags_changed, spinner_container)}, ) def on_table_group_selected(table_group_id: str | None) -> None: - Router().set_query_params({ "table_group_id": table_group_id }) + Router().set_query_params({"table_group_id": table_group_id}) def on_item_selected(item_id: str | None) -> None: - Router().set_query_params({ "selected": item_id }) + Router().set_query_params({"selected": item_id}) class ExportItem(typing.TypedDict): id: str type: typing.Literal["table", "column"] -def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: TableGroupMinimal, items: list[ExportItem] | None) -> None: + +def get_excel_report_data( + update_progress: PROGRESS_UPDATE_TYPE, table_group: TableGroupMinimal, items: list[ExportItem] | None +) -> None: if items: table_data = 
get_tables_by_id( - table_ids=[ item["id"] for item in items if item["type"] == "table" ], + table_ids=[item["id"] for item in items if item["type"] == "table"], include_tags=True, include_active_tests=True, ) column_data = get_columns_by_id( - column_ids=[ item["id"] for item in items if item["type"] == "column" ], + column_ids=[item["id"] for item in items if item["type"] == "column"], include_tags=True, include_active_tests=True, ) @@ -180,10 +195,13 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: Ta include_tags=True, include_active_tests=True, ) - data = pd.DataFrame(table_data + column_data) - data = data.sort_values(by=["table_name", "ordinal_position"], na_position="first", key=lambda x: x.str.lower() if x.dtype == "object" else x) + data = data.sort_values( + by=["table_name", "ordinal_position"], + na_position="first", + key=lambda x: x.str.lower() if x.dtype == "object" else x, + ) for key in ["datatype_suggestion"]: data[key] = data[key].apply(lambda val: val.lower() if not pd.isna(val) else None) @@ -192,11 +210,18 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: Ta data[key] = data[key].apply(lambda val: round(val, 2) if not pd.isna(val) else None) for key in ["min_date", "max_date", "add_date", "last_mod_date", "drop_date"]: - data[key] = data[key].apply( - lambda val: val.strftime("%b %-d %Y, %-I:%M %p") if not pd.isna(val) else None - ) - - for key in ["data_source", "source_system", "source_process", "business_domain", "stakeholder_group", "transform_level", "aggregation_level", "data_product"]: + data[key] = data[key].apply(lambda val: val.strftime("%b %-d %Y, %-I:%M %p") if not pd.isna(val) else None) + + for key in [ + "data_source", + "source_system", + "source_process", + "business_domain", + "stakeholder_group", + "transform_level", + "aggregation_level", + "data_product", + ]: data[key] = data.apply( lambda row: row[key] or row[f"table_{key}"] or row.get(f"table_group_{key}"), 
axis=1, @@ -206,16 +231,18 @@ def get_excel_report_data(update_progress: PROGRESS_UPDATE_TYPE, table_group: Ta data["general_type"] = data["general_type"].apply(lambda val: type_map.get(val)) data["critical_data_element"] = data.apply( - lambda row: "Yes" if row["critical_data_element"] == True or row["table_critical_data_element"] == True else None, + lambda row: "Yes" + if row["critical_data_element"] == True or row["table_critical_data_element"] == True + else None, axis=1, ) data["top_freq_values"] = data["top_freq_values"].apply( - lambda val: "\n".join([ f"{part.split(" | ")[1]} | {part.split(" | ")[0]}" for part in val[2:].split("\n| ") ]) + lambda val: "\n".join([f"{part.split(" | ")[1]} | {part.split(" | ")[0]}" for part in val[2:].split("\n| ")]) if not pd.isna(val) else None ) data["top_patterns"] = data["top_patterns"].apply( - lambda val: "".join([ f"{part}{'\n' if index % 2 else ' | '}" for index, part in enumerate(val.split(" | ")) ]) + lambda val: "".join([f"{part}{'\n' if index % 2 else ' | '}" for index, part in enumerate(val.split(" | "))]) if not pd.isna(val) else None ) @@ -304,7 +331,7 @@ def remove_table_dialog(item: dict) -> None: st.html(f"Are you sure you want to remove the table {item['table_name']} from the data catalog?") st.warning("This action cannot be undone.") - _, button_column = st.columns([.85, .15]) + _, button_column = st.columns([0.85, 0.15]) with button_column: testgen.button( label="Remove", @@ -326,7 +353,7 @@ def remove_table_dialog(item: dict) -> None: st.success("Table has been removed.") time.sleep(1) - for func in [ get_table_group_columns, get_tag_values ]: + for func in [get_table_group_columns, get_tag_values]: func.clear() st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() st.rerun() @@ -337,14 +364,14 @@ def on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DA attributes.extend(TAG_FIELDS) tags = payload["tags"] - set_attributes = [ f"{key} = NULLIF(:{key}, 
'')" for key in attributes if key in tags ] - params = { key: tags.get(key) or "" for key in attributes if key in tags } + set_attributes = [f"{key} = NULLIF(:{key}, '')" for key in attributes if key in tags] + params = {key: tags.get(key) or "" for key in attributes if key in tags} if "critical_data_element" in tags: set_attributes.append("critical_data_element = :critical_data_element") params.update({"critical_data_element": tags.get("critical_data_element")}) - params["table_ids"] = [ item["id"] for item in payload["items"] if item["type"] == "table" ] - params["column_ids"] = [ item["id"] for item in payload["items"] if item["type"] == "column" ] + params["table_ids"] = [item["id"] for item in payload["items"] if item["type"] == "table"] + params["column_ids"] = [item["id"] for item in payload["items"] if item["type"] == "column"] with spinner_container: with st.spinner("Saving tags"): @@ -378,17 +405,70 @@ def on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DA params, ) - for func in [ get_table_group_columns, get_table_by_id, get_column_by_id, get_tag_values ]: + for func in [get_table_group_columns, get_table_by_id, get_column_by_id, get_tag_values]: func.clear() st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() st.rerun() +def export_metadata_csv(table_group: TableGroupMinimal) -> None: + def _get_csv_data(update_progress: PROGRESS_UPDATE_TYPE) -> FILE_DATA_TYPE: + table_data = fetch_all_from_db( + f""" + SELECT table_name, '' AS column_name, + description, + critical_data_element, + {", ".join(TAG_FIELDS)} + FROM data_table_chars + WHERE table_groups_id = :table_group_id + ORDER BY LOWER(table_name) + """, + {"table_group_id": str(table_group.id)}, + ) + + column_data = fetch_all_from_db( + f""" + SELECT c.table_name, c.column_name, + c.description, + c.critical_data_element, + {", ".join([ f"c.{tag}" for tag in TAG_FIELDS ])} + FROM data_column_chars c + LEFT JOIN data_table_chars t ON 
(c.table_id = t.table_id) + WHERE c.table_groups_id = :table_group_id + ORDER BY LOWER(c.table_name), c.ordinal_position + """, + {"table_group_id": str(table_group.id)}, + ) + + rows = [] + for row in list(table_data) + list(column_data): + csv_row = { + "Table": row["table_name"], + "Column": row["column_name"], + "Description": row["description"] or "", + "Critical Data Element": "Yes" if row["critical_data_element"] is True else "No" if row["critical_data_element"] is False else "", + } + for tag in TAG_FIELDS: + header = tag.replace("_", " ").title() + csv_row[header] = row[tag] or "" + rows.append(csv_row) + + df = pd.DataFrame(rows) + csv_content = df.to_csv(index=False) + update_progress(1.0) + return "Data Catalog Metadata.csv", "text/csv", csv_content + + download_dialog( + dialog_title="Download Metadata CSV", + file_content_func=_get_csv_data, + ) + + @st.cache_data(show_spinner=False) def get_table_group_columns(table_group_id: str) -> list[dict]: if not is_uuid4(table_group_id): return [] - + query = f""" SELECT CONCAT('column_', column_chars.column_id) AS column_id, CONCAT('table_', table_chars.table_id) AS table_id, @@ -424,7 +504,7 @@ def get_table_group_columns(table_group_id: str) -> list[dict]: params = {"table_group_id": table_group_id} results = fetch_all_from_db(query, params) - return [ dict(row) for row in results ] + return [dict(row) for row in results] def get_selected_item(selected: str, table_group_id: str) -> dict | None: @@ -445,8 +525,12 @@ def get_selected_item(selected: str, table_group_id: str) -> dict | None: item["dq_score_profiling"] = friendly_score(item["dq_score_profiling"]) item["dq_score_testing"] = friendly_score(item["dq_score_testing"]) item["hygiene_issues"] = get_hygiene_issues(item["profile_run_id"], item["table_name"], item.get("column_name")) - item["test_issues"] = get_latest_test_issues(item["table_group_id"], item["table_name"], item.get("column_name")) - item["test_suites"] = 
get_related_test_suites(item["table_group_id"], item["table_name"], item.get("column_name")) + item["test_issues"] = get_latest_test_issues( + item["table_group_id"], item["table_name"], item.get("column_name") + ) + item["test_suites"] = get_related_test_suites( + item["table_group_id"], item["table_name"], item.get("column_name") + ) return item @@ -491,7 +575,7 @@ def get_latest_test_issues(table_group_id: str, table_name: str, column_name: st } results = fetch_all_from_db(query, params) - return [ dict(row) for row in results ] + return [dict(row) for row in results] @st.cache_data(show_spinner=False) @@ -518,7 +602,7 @@ def get_related_test_suites(table_group_id: str, table_name: str, column_name: s } results = fetch_all_from_db(query, params) - return [ dict(row) for row in results ] + return [dict(row) for row in results] @st.cache_data(show_spinner=False) diff --git a/testgen/ui/views/dialogs/import_metadata_dialog.py b/testgen/ui/views/dialogs/import_metadata_dialog.py new file mode 100644 index 00000000..2312feb7 --- /dev/null +++ b/testgen/ui/views/dialogs/import_metadata_dialog.py @@ -0,0 +1,401 @@ +import base64 +import io +import logging +import time +from datetime import datetime + +import pandas as pd +import streamlit as st + +from testgen.common.models import with_database_session +from testgen.ui.components.widgets.testgen_component import testgen_component +from testgen.ui.queries.profiling_queries import TAG_FIELDS +from testgen.ui.services.database_service import execute_db_query, fetch_all_from_db +from testgen.ui.session import temp_value + +LOG = logging.getLogger("testgen") + +HEADER_MAP = { + "table": "table_name", + "column": "column_name", + "description": "description", + "critical data element": "critical_data_element", + "cde": "critical_data_element", + "data source": "data_source", + "source system": "source_system", + "source process": "source_process", + "business domain": "business_domain", + "stakeholder group": 
"stakeholder_group", + "transform level": "transform_level", + "aggregation level": "aggregation_level", + "data product": "data_product", +} + +METADATA_COLUMNS = ["description", "critical_data_element", *TAG_FIELDS] + +CDE_TRUE_VALUES = {"yes", "y", "true", "1"} +CDE_FALSE_VALUES = {"no", "n", "false", "0"} + +TAG_MAX_LENGTH = 40 +DESCRIPTION_MAX_LENGTH = 1000 + + +def parse_import_csv(content: str, table_group_id: str, blank_behavior: str) -> dict: + parsed = _parse_csv(content) + if "error" in parsed: + return parsed + + return _match_and_validate(parsed["df"], parsed["duplicate_rows"], table_group_id, blank_behavior) + + +def _parse_csv(content: str) -> dict: + try: + raw_bytes = base64.b64decode(content.split(",")[1]) + df = pd.read_csv(io.BytesIO(raw_bytes), dtype=str, keep_default_na=False) + except Exception as e: + LOG.warning("CSV parse error: %s", e) + return {"error": f"Could not parse CSV file: {e}"} + + # Normalize headers + normalized_columns = {} + for col in df.columns: + key = col.strip().lower().replace("_", " ") + mapped = HEADER_MAP.get(key) + if mapped: + normalized_columns[col] = mapped + + if "table_name" not in normalized_columns.values(): + return {"error": "CSV must contain a 'Table' column."} + + df = df.rename(columns=normalized_columns) + # Keep only recognized columns + recognized = [c for c in df.columns if c in ("table_name", "column_name", *METADATA_COLUMNS)] + df = df[recognized] + + if df.empty: + return {"error": "CSV file is empty."} + + # Strip whitespace from all string fields + for col in df.columns: + df[col] = df[col].str.strip() + + # Deduplicate: last occurrence wins, mark earlier duplicates + has_column_name = "column_name" in df.columns + if not has_column_name: + df["column_name"] = "" + dedup_cols = ["table_name", "column_name"] if has_column_name else ["table_name"] + is_last = ~df.duplicated(subset=dedup_cols, keep="last") + duplicate_rows = df[~is_last] + df = df[is_last] + + return {"df": df, "duplicate_rows": 
duplicate_rows} + + +def _match_and_validate( + df: pd.DataFrame, duplicate_rows: pd.DataFrame, table_group_id: str, blank_behavior: str +) -> dict: + # Query existing tables and columns in this table group + existing_tables = fetch_all_from_db( + """ + SELECT table_id::VARCHAR, table_name + FROM data_table_chars + WHERE table_groups_id = :table_group_id + """, + {"table_group_id": table_group_id}, + ) + table_lookup = {row["table_name"]: row["table_id"] for row in existing_tables} + + existing_columns = fetch_all_from_db( + """ + SELECT column_id::VARCHAR, table_name, column_name + FROM data_column_chars + WHERE table_groups_id = :table_group_id + """, + {"table_group_id": table_group_id}, + ) + column_lookup = {(row["table_name"], row["column_name"]): row["column_id"] for row in existing_columns} + + table_rows = [] + column_rows = [] + preview_rows = [] + + for _, dup_row in duplicate_rows.iterrows(): + preview_rows.append({ + "table_name": dup_row["table_name"], + "column_name": dup_row.get("column_name", ""), + "_status": "unmatched", + "_status_detail": "Duplicate row \u2014 last occurrence will be used", + "_truncated_fields": [], + }) + + for _, row in df.iterrows(): + table_name = row["table_name"] + column_name = row.get("column_name", "") + + if not table_name: + continue + + is_table_row = not column_name + preview_row = {"table_name": table_name, "column_name": column_name or ""} + + if is_table_row: + table_id = table_lookup.get(table_name) + if not table_id: + preview_row["_status"] = "unmatched" + preview_row["_status_detail"] = "Table not found in catalog" + preview_rows.append(preview_row) + continue + + fields, bad_cde = _extract_metadata_fields(row, blank_behavior) + fields, truncated = _truncate_fields(fields) + if fields: + table_rows.append({"table_id": table_id, "table_name": table_name, **fields}) + + preview_row.update(fields) + _set_row_status(preview_row, bad_cde, truncated) + preview_rows.append(preview_row) + else: + column_id = 
column_lookup.get((table_name, column_name)) + if not column_id: + preview_row["_status"] = "unmatched" + preview_row["_status_detail"] = ( + "Table not found in catalog" if table_name not in table_lookup else "Column not found in catalog" + ) + preview_rows.append(preview_row) + continue + + fields, bad_cde = _extract_metadata_fields(row, blank_behavior) + fields, truncated = _truncate_fields(fields) + if fields: + column_rows.append( + {"column_id": column_id, "table_name": table_name, "column_name": column_name, **fields} + ) + + preview_row.update(fields) + _set_row_status(preview_row, bad_cde, truncated) + preview_rows.append(preview_row) + + # Determine which metadata columns are present in the CSV + metadata_columns = [c for c in METADATA_COLUMNS if c in df.columns] + + return { + "table_rows": table_rows, + "column_rows": column_rows, + "preview_rows": preview_rows, + "metadata_columns": metadata_columns, + "blank_behavior": blank_behavior, + } + + +def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, int]: + fields = {} + bad_cde = 0 + for col in METADATA_COLUMNS: + if col not in row.index: + continue + + value = row[col] + + if col == "critical_data_element": + if value.lower() in CDE_TRUE_VALUES: + fields[col] = True + elif value.lower() in CDE_FALSE_VALUES: + fields[col] = False + elif not value: + if blank_behavior == "clear": + fields[col] = None + # "keep" → skip this field + else: + # Unrecognized value — skip (don't set field at all) + bad_cde = 1 + else: + if value: + fields[col] = value + elif blank_behavior == "clear": + fields[col] = "" + # "keep" with blank value → skip this field + + return fields, bad_cde + + +def _truncate_fields(fields: dict) -> tuple[dict, list[str]]: + truncated = [] + for key, value in fields.items(): + if not isinstance(value, str): + continue + max_len = DESCRIPTION_MAX_LENGTH if key == "description" else TAG_MAX_LENGTH + if len(value) > max_len: + fields[key] = value[:max_len] + 
truncated.append(key) + return fields, truncated + + +def _set_row_status(preview_row: dict, bad_cde: int, truncated: list[str]) -> None: + issues = [] + if bad_cde: + issues.append("Unrecognized CDE value (expected Yes/No) — skipped") + if truncated: + issues.append(f"Value(s) truncated: {', '.join(truncated)}") + + if bad_cde: + preview_row["_status"] = "error" + elif truncated: + preview_row["_status"] = "warning" + else: + preview_row["_status"] = "ok" + preview_row["_status_detail"] = "\n".join(issues) + preview_row["_truncated_fields"] = truncated + + +def apply_metadata_import(preview: dict) -> dict: + table_count = 0 + column_count = 0 + + for row in preview.get("table_rows", []): + set_clauses, params = _build_update_params(row, preview["metadata_columns"]) + if not set_clauses: + continue + params["table_id"] = row["table_id"] + execute_db_query( + f"UPDATE data_table_chars SET {', '.join(set_clauses)} WHERE table_id = CAST(:table_id AS UUID)", + params, + ) + table_count += 1 + + for row in preview.get("column_rows", []): + set_clauses, params = _build_update_params(row, preview["metadata_columns"]) + if not set_clauses: + continue + params["column_id"] = row["column_id"] + execute_db_query( + f"UPDATE data_column_chars SET {', '.join(set_clauses)} WHERE column_id = CAST(:column_id AS UUID)", + params, + ) + column_count += 1 + + return {"table_count": table_count, "column_count": column_count} + + +def _build_update_params(row: dict, metadata_columns: list[str]) -> tuple[list[str], dict]: + set_clauses = [] + params = {} + + for col in metadata_columns: + if col not in row: + continue + + value = row[col] + if col == "critical_data_element": + set_clauses.append("critical_data_element = :critical_data_element") + params["critical_data_element"] = value + else: + set_clauses.append(f"{col} = NULLIF(:{col}, '')") + params[col] = value if value is not None else "" + + return set_clauses, params + + +PREVIEW_SESSION_KEY = "import_metadata:preview" + + +def 
open_import_metadata_dialog(table_group_id: str) -> None: + """Clear stale preview state before opening the dialog.""" + st.session_state.pop(PREVIEW_SESSION_KEY, None) + import_metadata_dialog(table_group_id) + + +@st.dialog(title="Import Metadata", width="large") +@with_database_session +def import_metadata_dialog(table_group_id: str) -> None: + should_import, set_should_import = temp_value("import_metadata:import") + + def on_file_uploaded(payload: dict) -> None: + content = payload["content"] + blank_behavior = payload["blank_behavior"] + preview = parse_import_csv(content, table_group_id, blank_behavior) + st.session_state[PREVIEW_SESSION_KEY] = preview + + def on_file_cleared(_payload: dict) -> None: + st.session_state.pop(PREVIEW_SESSION_KEY, None) + + # Preview persists in session state (not temp_value) so it survives across reruns + preview = st.session_state.get(PREVIEW_SESSION_KEY) + + result = None + if should_import() and preview and not preview.get("error"): + try: + counts = apply_metadata_import(preview) + + # Clear caches + from testgen.ui.queries.profiling_queries import get_column_by_id, get_table_by_id + from testgen.ui.views.data_catalog import get_table_group_columns, get_tag_values + + for func in [get_table_group_columns, get_table_by_id, get_column_by_id, get_tag_values]: + func.clear() + st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() + + result = { + "success": True, + "message": f"Metadata imported: {counts['table_count']} table(s), {counts['column_count']} column(s) updated.", + } + except Exception: + LOG.exception("Metadata import failed") + result = { + "success": False, + "message": "Import failed due to an unexpected error. 
Please try again.", + } + + st.session_state.pop(PREVIEW_SESSION_KEY, None) + + # Build preview data for JS display + preview_props = None + if preview: + if preview.get("error"): + preview_props = {"error": preview["error"]} + else: + preview_props = _build_preview_props(preview) + + testgen_component( + "import_metadata_dialog", + props={ + "preview": preview_props, + "result": result, + }, + on_change_handlers={ + "FileUploaded": on_file_uploaded, + "FileCleared": on_file_cleared, + "ImportConfirmed": lambda _: set_should_import(True), + }, + ) + + if result and result["success"]: + time.sleep(2) + st.rerun() + + +def _build_preview_props(preview: dict) -> dict: + formatted_rows = [] + metadata_columns = preview.get("metadata_columns", []) + + for row in preview.get("preview_rows", []): + formatted_row = { + "table_name": row["table_name"], + "column_name": row["column_name"], + "_status": row.get("_status", "ok"), + "_status_detail": row.get("_status_detail", ""), + "_truncated_fields": row.get("_truncated_fields", []), + } + for col in metadata_columns: + if col in row: + val = row[col] + formatted_row[col] = ( + "Yes" if val is True else "No" if val is False else ("" if val is None else str(val)) + ) + formatted_rows.append(formatted_row) + + return { + "table_count": len(preview.get("table_rows", [])), + "column_count": len(preview.get("column_rows", [])), + "metadata_columns": metadata_columns, + "preview_rows": formatted_rows, + } diff --git a/tests/unit/ui/__init__.py b/tests/unit/ui/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/ui/conftest.py b/tests/unit/ui/conftest.py new file mode 100644 index 00000000..1aa05727 --- /dev/null +++ b/tests/unit/ui/conftest.py @@ -0,0 +1,7 @@ +import sys +from unittest.mock import MagicMock + +# Mock the Streamlit component registration that fails outside a running Streamlit app. 
+# The testgen_component module triggers component registration at import time, which +# requires a Streamlit runtime. We mock it so pure-logic tests can import freely. +sys.modules.setdefault("testgen.ui.components.widgets.testgen_component", MagicMock()) diff --git a/tests/unit/ui/test_import_metadata.py b/tests/unit/ui/test_import_metadata.py new file mode 100644 index 00000000..5ac9a8d2 --- /dev/null +++ b/tests/unit/ui/test_import_metadata.py @@ -0,0 +1,324 @@ +import base64 + +import pandas as pd +import pytest + +from testgen.ui.views.dialogs.import_metadata_dialog import ( + DESCRIPTION_MAX_LENGTH, + TAG_MAX_LENGTH, + _build_preview_props, + _extract_metadata_fields, + _parse_csv, + _set_row_status, + _truncate_fields, +) + +pytestmark = pytest.mark.unit + + +def _make_base64_csv(csv_text: str) -> str: + encoded = base64.b64encode(csv_text.encode()).decode() + return f"data:text/csv;base64,{encoded}" + + +def _make_series(data: dict) -> pd.Series: + return pd.Series(data) + + +# --- _parse_csv --- + + +def test_parse_csv_basic_table_and_column(): + content = _make_base64_csv("Table,Column,Description\nmy_table,,Table desc\nmy_table,col1,Col desc\n") + result = _parse_csv(content) + assert "error" not in result + df = result["df"] + assert len(df) == 2 + assert list(df["table_name"]) == ["my_table", "my_table"] + assert list(df["column_name"]) == ["", "col1"] + + +def test_parse_csv_missing_table_column(): + content = _make_base64_csv("Column,Description\ncol1,desc\n") + result = _parse_csv(content) + assert result["error"] == "CSV must contain a 'Table' column." + + +def test_parse_csv_empty(): + content = _make_base64_csv("Table,Column\n") + result = _parse_csv(content) + assert result["error"] == "CSV file is empty." 
+ + +def test_parse_csv_invalid_base64(): + result = _parse_csv("data:text/csv;base64,!!!invalid!!!") + assert "error" in result + assert "Could not parse CSV file" in result["error"] + + +def test_parse_csv_header_normalization_underscores(): + content = _make_base64_csv("Table,Critical_Data_Element\nmy_table,Yes\n") + result = _parse_csv(content) + assert "error" not in result + assert "critical_data_element" in result["df"].columns + + +def test_parse_csv_header_normalization_spaces(): + content = _make_base64_csv("Table,Critical Data Element\nmy_table,Yes\n") + result = _parse_csv(content) + assert "error" not in result + assert "critical_data_element" in result["df"].columns + + +def test_parse_csv_header_cde_alias(): + content = _make_base64_csv("Table,CDE\nmy_table,Yes\n") + result = _parse_csv(content) + assert "error" not in result + assert "critical_data_element" in result["df"].columns + + +def test_parse_csv_header_case_insensitive(): + content = _make_base64_csv("TABLE,DESCRIPTION\nmy_table,desc\n") + result = _parse_csv(content) + assert "error" not in result + assert "description" in result["df"].columns + + +def test_parse_csv_extra_columns_ignored(): + content = _make_base64_csv("Table,Description,UnknownCol\nmy_table,desc,ignored\n") + result = _parse_csv(content) + assert "error" not in result + assert "UnknownCol" not in result["df"].columns + + +def test_parse_csv_whitespace_stripped(): + content = _make_base64_csv("Table,Description\n my_table , desc \n") + result = _parse_csv(content) + df = result["df"] + assert df.iloc[0]["table_name"] == "my_table" + assert df.iloc[0]["description"] == "desc" + + +def test_parse_csv_duplicate_table_rows(): + content = _make_base64_csv("Table,Description\nmy_table,first\nmy_table,second\n") + result = _parse_csv(content) + assert len(result["duplicate_rows"]) == 1 + assert len(result["df"]) == 1 + assert result["df"].iloc[0]["description"] == "second" + + +def test_parse_csv_duplicate_column_rows(): + 
content = _make_base64_csv("Table,Column,Description\nt,c,first\nt,c,second\n") + result = _parse_csv(content) + assert len(result["duplicate_rows"]) == 1 + assert result["df"].iloc[0]["description"] == "second" + + +def test_parse_csv_no_column_header_adds_empty(): + content = _make_base64_csv("Table,Description\nmy_table,desc\n") + result = _parse_csv(content) + assert "column_name" in result["df"].columns + assert result["df"].iloc[0]["column_name"] == "" + + +# --- _extract_metadata_fields --- + + +@pytest.mark.parametrize("val", ["Yes", "yes", "Y", "y", "True", "true", "1"]) +def test_extract_cde_true_values(val): + fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": val}), "keep") + assert fields["critical_data_element"] is True + assert bad_cde == 0 + + +@pytest.mark.parametrize("val", ["No", "no", "N", "n", "False", "false", "0"]) +def test_extract_cde_false_values(val): + fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": val}), "keep") + assert fields["critical_data_element"] is False + assert bad_cde == 0 + + +def test_extract_cde_blank_keep(): + fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": ""}), "keep") + assert "critical_data_element" not in fields + assert bad_cde == 0 + + +def test_extract_cde_blank_clear(): + fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": ""}), "clear") + assert fields["critical_data_element"] is None + assert bad_cde == 0 + + +def test_extract_cde_unrecognized(): + fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": "Maybe"}), "keep") + assert "critical_data_element" not in fields + assert bad_cde == 1 + + +def test_extract_text_field_with_value(): + fields, _ = _extract_metadata_fields(_make_series({"description": "test desc"}), "keep") + assert fields["description"] == "test desc" + + +def test_extract_text_field_blank_keep(): + fields, _ = 
_extract_metadata_fields(_make_series({"description": ""}), "keep") + assert "description" not in fields + + +def test_extract_text_field_blank_clear(): + fields, _ = _extract_metadata_fields(_make_series({"description": ""}), "clear") + assert fields["description"] == "" + + +def test_extract_missing_column_skipped(): + fields, _ = _extract_metadata_fields(_make_series({"description": "test"}), "keep") + assert "data_source" not in fields + + +def test_extract_tag_field_with_value(): + fields, _ = _extract_metadata_fields(_make_series({"data_source": "ERP"}), "keep") + assert fields["data_source"] == "ERP" + + +# --- _truncate_fields --- + + +def test_truncate_no_truncation_needed(): + fields = {"description": "short", "data_source": "ERP"} + result, truncated = _truncate_fields(fields) + assert truncated == [] + assert result["description"] == "short" + + +def test_truncate_tag_at_max(): + fields = {"data_source": "x" * (TAG_MAX_LENGTH + 10)} + result, truncated = _truncate_fields(fields) + assert truncated == ["data_source"] + assert len(result["data_source"]) == TAG_MAX_LENGTH + + +def test_truncate_description_at_max(): + fields = {"description": "x" * (DESCRIPTION_MAX_LENGTH + 10)} + result, truncated = _truncate_fields(fields) + assert truncated == ["description"] + assert len(result["description"]) == DESCRIPTION_MAX_LENGTH + + +def test_truncate_boolean_fields_skipped(): + fields = {"critical_data_element": True} + result, truncated = _truncate_fields(fields) + assert truncated == [] + assert result["critical_data_element"] is True + + +def test_truncate_multiple_fields(): + fields = {"data_source": "x" * 50, "source_system": "y" * 50} + _, truncated = _truncate_fields(fields) + assert "data_source" in truncated + assert "source_system" in truncated + + +# --- _set_row_status --- + + +def test_set_row_status_ok(): + row = {} + _set_row_status(row, bad_cde=0, truncated=[]) + assert row["_status"] == "ok" + assert row["_status_detail"] == "" + assert 
row["_truncated_fields"] == [] + + +def test_set_row_status_error_bad_cde(): + row = {} + _set_row_status(row, bad_cde=1, truncated=[]) + assert row["_status"] == "error" + assert "Unrecognized CDE" in row["_status_detail"] + + +def test_set_row_status_warning_truncated(): + row = {} + _set_row_status(row, bad_cde=0, truncated=["data_source"]) + assert row["_status"] == "warning" + assert "truncated" in row["_status_detail"] + assert "data_source" in row["_status_detail"] + + +def test_set_row_status_error_precedence(): + row = {} + _set_row_status(row, bad_cde=1, truncated=["data_source"]) + assert row["_status"] == "error" + assert "CDE" in row["_status_detail"] + assert "truncated" in row["_status_detail"] + + +# --- _build_preview_props --- + + +def test_preview_props_basic(): + preview = { + "table_rows": [{"table_id": "1", "table_name": "t1", "description": "desc"}], + "column_rows": [], + "preview_rows": [ + {"table_name": "t1", "column_name": "", "description": "desc", "_status": "ok", "_status_detail": "", "_truncated_fields": []}, + ], + "metadata_columns": ["description"], + } + result = _build_preview_props(preview) + assert result["table_count"] == 1 + assert result["column_count"] == 0 + assert len(result["preview_rows"]) == 1 + assert result["preview_rows"][0]["description"] == "desc" + + +def test_preview_props_cde_true(): + preview = { + "table_rows": [{"table_id": "1", "table_name": "t", "critical_data_element": True}], + "column_rows": [], + "preview_rows": [ + {"table_name": "t", "column_name": "", "critical_data_element": True, "_status": "ok", "_status_detail": "", "_truncated_fields": []}, + ], + "metadata_columns": ["critical_data_element"], + } + result = _build_preview_props(preview) + assert result["preview_rows"][0]["critical_data_element"] == "Yes" + + +def test_preview_props_cde_false(): + preview = { + "table_rows": [{"table_id": "1", "table_name": "t", "critical_data_element": False}], + "column_rows": [], + "preview_rows": [ + 
{"table_name": "t", "column_name": "", "critical_data_element": False, "_status": "ok", "_status_detail": "", "_truncated_fields": []}, + ], + "metadata_columns": ["critical_data_element"], + } + result = _build_preview_props(preview) + assert result["preview_rows"][0]["critical_data_element"] == "No" + + +def test_preview_props_cde_none(): + preview = { + "table_rows": [], + "column_rows": [], + "preview_rows": [ + {"table_name": "t", "column_name": "", "critical_data_element": None, "_status": "ok", "_status_detail": "", "_truncated_fields": []}, + ], + "metadata_columns": ["critical_data_element"], + } + result = _build_preview_props(preview) + assert result["preview_rows"][0]["critical_data_element"] == "" + + +def test_preview_props_unmatched_preserved(): + preview = { + "table_rows": [], + "column_rows": [], + "preview_rows": [ + {"table_name": "fake", "column_name": "", "_status": "unmatched", "_status_detail": "Table not found", "_truncated_fields": []}, + ], + "metadata_columns": ["description"], + } + result = _build_preview_props(preview) + assert result["preview_rows"][0]["_status"] == "unmatched" From f4e876a4cd6fb2b40faeecbe666b66b567e0b180 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Tue, 3 Mar 2026 16:01:55 -0300 Subject: [PATCH 26/95] feat(mcp): enforce project-level permission scoping on all tools MCP tools now filter data based on the authenticated user's project memberships. Global admins see everything; scoped users only see entities belonging to their accessible projects. Inaccessible entities return the same "not found" message to avoid leaking existence. 
Co-Authored-By: Claude Opus 4.6 --- testgen/common/models/data_table.py | 26 +- testgen/common/models/test_result.py | 36 +- testgen/mcp/permissions.py | 157 ++++++++ testgen/mcp/server.py | 43 +- testgen/mcp/services/inventory_service.py | 31 +- testgen/mcp/tools/discovery.py | 28 +- testgen/mcp/tools/test_results.py | 16 +- testgen/mcp/tools/test_runs.py | 5 + testgen/utils/plugins.py | 20 +- tests/unit/mcp/conftest.py | 26 ++ tests/unit/mcp/test_inventory_service.py | 69 ++++ tests/unit/mcp/test_permissions.py | 459 ++++++++++++++++++++++ tests/unit/mcp/test_tools_discovery.py | 154 ++++++++ tests/unit/mcp/test_tools_test_results.py | 65 +++ tests/unit/mcp/test_tools_test_runs.py | 41 ++ 15 files changed, 1129 insertions(+), 47 deletions(-) create mode 100644 testgen/mcp/permissions.py create mode 100644 tests/unit/mcp/conftest.py create mode 100644 tests/unit/mcp/test_permissions.py diff --git a/testgen/common/models/data_table.py b/testgen/common/models/data_table.py index 5c387633..4cfa814d 100644 --- a/testgen/common/models/data_table.py +++ b/testgen/common/models/data_table.py @@ -5,6 +5,7 @@ from testgen.common.models import get_current_session from testgen.common.models.entity import Entity +from testgen.common.models.table_group import TableGroup class DataTable(Entity): @@ -23,17 +24,22 @@ class DataTable(Entity): # dq_score_profiling, dq_score_testing @classmethod - def select_table_names(cls, table_groups_id: UUID, limit: int = 100, offset: int = 0) -> list[str]: - query = ( - select(cls.table_name) - .where(cls.table_groups_id == table_groups_id) - .order_by(asc(func.lower(cls.table_name))) - .offset(offset) - .limit(limit) - ) + def select_table_names( + cls, table_groups_id: UUID, project_codes: list[str] | None = None, limit: int = 100, offset: int = 0, + ) -> list[str]: + query = select(cls.table_name).where(cls.table_groups_id == table_groups_id) + if project_codes is not None: + query = query.join(TableGroup, cls.table_groups_id == 
TableGroup.id).where( + TableGroup.project_code.in_(project_codes) + ) + query = query.order_by(asc(func.lower(cls.table_name))).offset(offset).limit(limit) return list(get_current_session().scalars(query).all()) @classmethod - def count_tables(cls, table_groups_id: UUID) -> int: - query = select(func.count()).where(cls.table_groups_id == table_groups_id) + def count_tables(cls, table_groups_id: UUID, project_codes: list[str] | None = None) -> int: + query = select(func.count()).select_from(cls).where(cls.table_groups_id == table_groups_id) + if project_codes is not None: + query = query.join(TableGroup, cls.table_groups_id == TableGroup.id).where( + TableGroup.project_code.in_(project_codes) + ) return get_current_session().scalar(query) or 0 diff --git a/testgen/common/models/test_result.py b/testgen/common/models/test_result.py index 6c9e4b97..8e517900 100644 --- a/testgen/common/models/test_result.py +++ b/testgen/common/models/test_result.py @@ -10,6 +10,7 @@ from testgen.common.models import get_current_session from testgen.common.models.entity import Entity +from testgen.common.models.test_suite import TestSuite class TestResultStatus(enum.Enum): @@ -59,6 +60,7 @@ def select_results( status: TestResultStatus | None = None, table_name: str | None = None, test_type: str | None = None, + project_codes: list[str] | None = None, limit: int = 50, offset: int = 0, ) -> list[Self]: @@ -72,13 +74,19 @@ def select_results( clauses.append(cls.table_name == table_name) if test_type: clauses.append(cls.test_type == test_type) - query = select(cls).where(*clauses).order_by(cls.status, cls.table_name, cls.column_names).offset(offset).limit(limit) + query = select(cls).where(*clauses) + if project_codes is not None: + query = query.join(TestSuite, cls.test_suite_id == TestSuite.id).where( + TestSuite.project_code.in_(project_codes) + ) + query = query.order_by(cls.status, cls.table_name, cls.column_names).offset(offset).limit(limit) return 
get_current_session().scalars(query).all() @classmethod def select_failures( cls, test_run_id: UUID, + project_codes: list[str] | None = None, group_by: str = "test_type", ) -> list[tuple]: allowed = {"test_type", "table_name", "column_names"} @@ -99,28 +107,28 @@ def select_failures( else: group_cols = (getattr(cls, group_by),) - query = ( - select(*group_cols, func.count().label("failure_count")) - .where(*where) - .group_by(*group_cols) - .order_by(func.count().desc()) - ) + query = select(*group_cols, func.count().label("failure_count")).where(*where) + if project_codes is not None: + query = query.join(TestSuite, cls.test_suite_id == TestSuite.id).where( + TestSuite.project_code.in_(project_codes) + ) + query = query.group_by(*group_cols).order_by(func.count().desc()) return get_current_session().execute(query).all() @classmethod def select_history( cls, test_definition_id: UUID, + project_codes: list[str] | None = None, limit: int = 20, offset: int = 0, ) -> list[Self]: - query = ( - select(cls) - .where(cls.test_definition_id == test_definition_id) - .order_by(desc(cls.test_time)) - .offset(offset) - .limit(limit) - ) + query = select(cls).where(cls.test_definition_id == test_definition_id) + if project_codes is not None: + query = query.join(TestSuite, cls.test_suite_id == TestSuite.id).where( + TestSuite.project_code.in_(project_codes) + ) + query = query.order_by(desc(cls.test_time)).offset(offset).limit(limit) return get_current_session().scalars(query).all() @classmethod diff --git a/testgen/mcp/permissions.py b/testgen/mcp/permissions.py new file mode 100644 index 00000000..a6218331 --- /dev/null +++ b/testgen/mcp/permissions.py @@ -0,0 +1,157 @@ +"""MCP permission enforcement — project-level and role-based access filtering.""" + +import contextvars +import functools +from collections.abc import Callable +from dataclasses import dataclass + +from testgen.common.models.project_membership import ProjectMembership +from testgen.common.models.user import 
User +from testgen.utils.plugins import PluginHook + +_NOT_SET = object() + +_mcp_username: contextvars.ContextVar[str | None] = contextvars.ContextVar("mcp_username", default=None) +_mcp_project_access: contextvars.ContextVar["ProjectAccess | object"] = contextvars.ContextVar( + "mcp_project_access", default=_NOT_SET +) + + +class MCPPermissionDenied(Exception): + """Raised by ProjectAccess when access is denied. Caught by the decorator.""" + + +@dataclass(frozen=True, slots=True) +class ProjectAccess: + is_unrestricted: bool + memberships: dict[str, str] + permission: str + allowed_codes: frozenset[str] + + def verify_access(self, project_code: str, not_found: str) -> None: + """Raise MCPPermissionDenied if user can't access this project. + + - Admin: always passes (no-op). + - Has access: passes. + - Has membership but wrong role: raises with denial message. + - No membership: raises with not_found (hides project existence). + """ + if self.is_unrestricted or project_code in self.allowed_codes: + return + if project_code in self.memberships: + raise MCPPermissionDenied( + "Your role on this project does not include the necessary permission for this operation." + ) + raise MCPPermissionDenied(not_found) + + def has_access(self, project_code: str) -> bool: + """For filtering lists — no exception, just a bool.""" + return self.is_unrestricted or project_code in self.allowed_codes + + @property + def query_codes(self) -> list[str] | None: + """Project codes for SQL WHERE. None = no filter (admin).""" + return None if self.is_unrestricted else list(self.allowed_codes) + + def query_codes_for(self, permission: str) -> list[str] | None: + """Project codes for a different permission (e.g. 
'view' inside a 'catalog' tool).""" + if self.is_unrestricted: + return None + if permission == self.permission: + return list(self.allowed_codes) + allowed_roles = PluginHook.instance().rbac.get_roles_with_permission(permission) + return [code for code, role in self.memberships.items() if role in allowed_roles] + + +def set_mcp_username(username: str | None) -> None: + """Store the authenticated username (called by JWTTokenVerifier).""" + _mcp_username.set(username) + + +def get_current_mcp_user() -> User: + """Get the authenticated User for the current MCP request. + + Must be called within @with_database_session scope. + """ + username = _mcp_username.get() + if not username: + raise RuntimeError("No authenticated user in MCP context") + user = User.get(username) + if user is None: + raise ValueError(f"Authenticated user not found: {username}") + return user + + +def _compute_project_access(user: User, permission: str) -> ProjectAccess: + """Build a ProjectAccess for the given user and permission.""" + if user.is_global_admin: + return ProjectAccess( + is_unrestricted=True, + memberships={}, + permission=permission, + allowed_codes=frozenset(), + ) + + allowed_roles = PluginHook.instance().rbac.get_roles_with_permission(permission) + memberships_list = ProjectMembership.get_memberships_for_user(user.id) + memberships = {m.project_code: m.role for m in memberships_list} + allowed_codes = frozenset(code for code, role in memberships.items() if role in allowed_roles) + + return ProjectAccess( + is_unrestricted=False, + memberships=memberships, + permission=permission, + allowed_codes=allowed_codes, + ) + + +def get_project_access() -> ProjectAccess: + """Retrieve the ProjectAccess computed by @mcp_permission for the current request. + + Raises RuntimeError if called without @mcp_permission — prevents silent + admin-level access when a developer forgets to add the decorator. 
+ """ + value = _mcp_project_access.get() + if value is _NOT_SET: + raise RuntimeError( + "get_project_access() called without @mcp_permission — add the decorator to this tool" + ) + return value # type: ignore[return-value] + + +def resolve_project_access(permission: str) -> ProjectAccess: + """Compute a ProjectAccess for a specific permission, using the current MCP user.""" + user = get_current_mcp_user() + return _compute_project_access(user, permission) + + +def mcp_permission(permission: str) -> Callable: + """Decorator that enforces role-based project filtering for MCP tools. + + Resolves the authenticated user, computes a ProjectAccess for the given + permission, and stores it in a ContextVar. The tool retrieves the value + via ``get_project_access()``. + + If the user has no projects with the required permission, returns an + early denial message. Catches MCPPermissionDenied raised by tool code + and returns str(e) as the tool response. + """ + + def decorator(fn: Callable) -> Callable: + @functools.wraps(fn) + def wrapper(*args, **kwargs): + user = get_current_mcp_user() + access = _compute_project_access(user, permission) + if not access.is_unrestricted and not access.allowed_codes: + return "Your role does not include the necessary permission for this operation on any project." 
+ tok = _mcp_project_access.set(access) + try: + return fn(*args, **kwargs) + except MCPPermissionDenied as e: + return str(e) + finally: + _mcp_project_access.reset(tok) + + return wrapper + + return decorator diff --git a/testgen/mcp/server.py b/testgen/mcp/server.py index d537d841..56f7d81c 100644 --- a/testgen/mcp/server.py +++ b/testgen/mcp/server.py @@ -7,6 +7,7 @@ from testgen import settings from testgen.common.auth import decode_jwt_token from testgen.common.models import with_database_session +from testgen.mcp.permissions import set_mcp_username LOG = logging.getLogger("testgen") @@ -41,6 +42,7 @@ class JWTTokenVerifier: async def verify_token(self, token: str) -> AccessToken | None: try: payload = decode_jwt_token(token) + set_mcp_username(payload["username"]) return AccessToken( token=token, client_id=payload["username"], @@ -51,6 +53,30 @@ async def verify_token(self, token: str) -> AccessToken | None: return None +# Uvicorn log config: strip default handlers so logs propagate to the testgen logger. 
+_UVICORN_LOG_CONFIG: dict = { + "version": 1, + "disable_existing_loggers": False, + "loggers": { + "uvicorn": {"handlers": [], "propagate": True}, + "uvicorn.access": {"handlers": [], "propagate": True}, + "uvicorn.error": {"handlers": [], "propagate": True}, + }, +} + + +def _configure_mcp_logging() -> None: + """Route FastMCP and uvicorn logs through the testgen logger.""" + testgen_logger = logging.getLogger("testgen") + + # FastMCP.__init__ calls basicConfig() which adds a RichHandler to the root logger — remove it + logging.getLogger().handlers.clear() + + # Reparent top-level third-party loggers so they (and their children) propagate through testgen's handler + for name in ("mcp", "uvicorn"): + logging.getLogger(name).parent = testgen_logger + + def run_mcp() -> None: """Start the MCP server with streamable HTTP transport.""" from testgen.mcp import get_server_url @@ -62,10 +88,7 @@ def run_mcp() -> None: from testgen.utils.plugins import discover for plugin in discover(): - try: - plugin.load() - except Exception: - LOG.debug("Plugin %s skipped (not loadable in MCP context)", plugin.package) + plugin.load() server_url = with_database_session(get_server_url)() @@ -80,6 +103,7 @@ def run_mcp() -> None: ), token_verifier=JWTTokenVerifier(), ) + _configure_mcp_logging() # Tools (9) mcp.tool()(get_data_inventory) @@ -104,11 +128,13 @@ def run_mcp() -> None: LOG.info("Starting MCP server on %s:%s (auth issuer: %s)", settings.MCP_HOST, settings.MCP_PORT, server_url) + import uvicorn + + app = mcp.streamable_http_app() + if settings.IS_DEBUG: - import uvicorn from starlette.middleware.cors import CORSMiddleware - app = mcp.streamable_http_app() app.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -116,6 +142,5 @@ def run_mcp() -> None: allow_headers=["*"], expose_headers=["Mcp-Session-Id"], ) - uvicorn.run(app, host=settings.MCP_HOST, port=settings.MCP_PORT) - else: - mcp.run(transport="streamable-http") + + uvicorn.run(app, host=settings.MCP_HOST, 
port=settings.MCP_PORT, log_config=_UVICORN_LOG_CONFIG) diff --git a/testgen/mcp/services/inventory_service.py b/testgen/mcp/services/inventory_service.py index d74562da..d6ca6f84 100644 --- a/testgen/mcp/services/inventory_service.py +++ b/testgen/mcp/services/inventory_service.py @@ -7,8 +7,17 @@ from testgen.common.models.test_suite import TestSuite -def get_inventory() -> str: - """Build a markdown inventory of all projects, connections, table groups, and test suites.""" +def get_inventory( + project_codes: list[str] | None = None, + view_project_codes: list[str] | None = None, +) -> str: + """Build a markdown inventory of all projects, connections, table groups, and test suites. + + Args: + project_codes: Projects the user can see (None = all). + view_project_codes: Projects where the user has 'view' permission (None = all). + When set, suites are hidden for projects not in this list. + """ session = get_current_session() query = ( @@ -32,7 +41,13 @@ def get_inventory() -> str: TestSuite.is_monitor.isnot(True), ), ) - .order_by(Project.project_name, Connection.connection_name, TableGroup.table_groups_name, TestSuite.test_suite) + ) + + if project_codes is not None: + query = query.where(Project.project_code.in_(project_codes)) + + query = query.order_by( + Project.project_name, Connection.connection_name, TableGroup.table_groups_name, TestSuite.test_suite, ) rows = session.execute(query).all() @@ -78,6 +93,7 @@ def get_inventory() -> str: lines = ["# Data Inventory\n"] for project_code, proj in projects.items(): + can_view_suites = view_project_codes is None or project_code in view_project_codes lines.append(f"## Project: {proj['name']} (`{project_code}`)\n") if not proj["connections"]: @@ -103,6 +119,15 @@ def get_inventory() -> str: f"#### Table Group: {group['name']} (id: `{group_id}`, schema: `{group['schema']}`)\n" ) + if not can_view_suites: + if group["suites"]: + lines.append( + f"_{len(group['suites'])} test suite(s) — requires `view` permission._\n" 
+ ) + else: + lines.append("_No test suites._\n") + continue + if not group["suites"]: lines.append("_No test suites._\n") continue diff --git a/testgen/mcp/tools/discovery.py b/testgen/mcp/tools/discovery.py index 853374fd..cbadeff3 100644 --- a/testgen/mcp/tools/discovery.py +++ b/testgen/mcp/tools/discovery.py @@ -4,27 +4,33 @@ from testgen.common.models.data_table import DataTable from testgen.common.models.project import Project from testgen.common.models.test_suite import TestSuite +from testgen.mcp.permissions import get_project_access, mcp_permission @with_database_session +@mcp_permission("catalog") def get_data_inventory() -> str: - """Get a structural inventory of all projects, connections, table groups, and test suites. + """Get a structural inventory of all projects, connections, table groups, and test suites + accessible to the authenticated user. This is the recommended starting point for understanding the data quality landscape. - Returns a structured markdown overview of the entire TestGen configuration. + Returns a structured markdown overview of the TestGen configuration. """ from testgen.mcp.services.inventory_service import get_inventory - return get_inventory() + access = get_project_access() + return get_inventory(project_codes=access.query_codes, view_project_codes=access.query_codes_for("view")) @with_database_session +@mcp_permission("catalog") def list_projects() -> str: - """List all configured projects. + """List all projects the authenticated user has access to. Returns project codes and names. Use these to scope queries to specific projects. """ - projects = Project.select_where() + access = get_project_access() + projects = [p for p in Project.select_where() if access.has_access(p.project_code)] if not projects: return "No projects found." 
@@ -37,6 +43,7 @@ def list_projects() -> str: @with_database_session +@mcp_permission("view") def list_test_suites(project_code: str) -> str: """List all test suites for a project with their latest run statistics. @@ -46,6 +53,9 @@ def list_test_suites(project_code: str) -> str: if not project_code: return "Missing required parameter `project_code`." + access = get_project_access() + access.verify_access(project_code, not_found=f"No test suites found for project `{project_code}`.") + summaries = TestSuite.select_summary(project_code) if not summaries: @@ -79,6 +89,7 @@ def list_test_suites(project_code: str) -> str: @with_database_session +@mcp_permission("catalog") def list_tables(table_group_id: str, limit: int = 200, page: int = 1) -> str: """List tables in a table group. @@ -92,9 +103,12 @@ def list_tables(table_group_id: str, limit: int = 200, page: int = 1) -> str: except (ValueError, AttributeError) as err: raise ValueError(f"Invalid table_group_id: `{table_group_id}` is not a valid UUID.") from err + access = get_project_access() + project_codes = access.query_codes + offset = (page - 1) * limit - table_names = DataTable.select_table_names(group_uuid, limit=limit, offset=offset) - total = DataTable.count_tables(group_uuid) + table_names = DataTable.select_table_names(group_uuid, limit=limit, offset=offset, project_codes=project_codes) + total = DataTable.count_tables(group_uuid, project_codes=project_codes) if not table_names: if page > 1: diff --git a/testgen/mcp/tools/test_results.py b/testgen/mcp/tools/test_results.py index cdd3360d..117ddd2f 100644 --- a/testgen/mcp/tools/test_results.py +++ b/testgen/mcp/tools/test_results.py @@ -3,6 +3,7 @@ from testgen.common.models import with_database_session from testgen.common.models.test_definition import TestType from testgen.common.models.test_result import TestResult, TestResultStatus +from testgen.mcp.permissions import get_project_access, mcp_permission def _parse_uuid(value: str, label: str = "ID") -> 
UUID: @@ -21,6 +22,7 @@ def _parse_status(value: str) -> TestResultStatus: @with_database_session +@mcp_permission("view") def get_test_results( test_run_id: str, status: str | None = None, @@ -43,6 +45,8 @@ def get_test_results( status_enum = _parse_status(status) if status else None offset = (page - 1) * limit + access = get_project_access() + results = TestResult.select_results( test_run_id=run_uuid, status=status_enum, @@ -50,6 +54,7 @@ def get_test_results( test_type=test_type, limit=limit, offset=offset, + project_codes=access.query_codes, ) if not results: @@ -88,6 +93,7 @@ def get_test_results( @with_database_session +@mcp_permission("view") def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: """Get a summary of test failures (Failed and Warning) grouped by test type, table name, or column. @@ -97,10 +103,12 @@ def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: """ run_uuid = _parse_uuid(test_run_id, "test_run_id") + access = get_project_access() + # Map public param names to model field names model_group_map = {"table": "table_name", "column": "column_names"} model_group_by = model_group_map.get(group_by, group_by) - failures = TestResult.select_failures(test_run_id=run_uuid, group_by=model_group_by) + failures = TestResult.select_failures(test_run_id=run_uuid, group_by=model_group_by, project_codes=access.query_codes) if not failures: return f"No confirmed failures found for run `{test_run_id}`." 
@@ -150,6 +158,7 @@ def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: @with_database_session +@mcp_permission("view") def get_test_result_history( test_definition_id: str, limit: int = 20, @@ -164,7 +173,10 @@ def get_test_result_history( """ def_uuid = _parse_uuid(test_definition_id, "test_definition_id") offset = (page - 1) * limit - results = TestResult.select_history(test_definition_id=def_uuid, limit=limit, offset=offset) + + access = get_project_access() + + results = TestResult.select_history(test_definition_id=def_uuid, limit=limit, offset=offset, project_codes=access.query_codes) if not results: return f"No historical results found for test definition `{test_definition_id}`." diff --git a/testgen/mcp/tools/test_runs.py b/testgen/mcp/tools/test_runs.py index ef5b4869..36cff652 100644 --- a/testgen/mcp/tools/test_runs.py +++ b/testgen/mcp/tools/test_runs.py @@ -1,9 +1,11 @@ from testgen.common.models import with_database_session from testgen.common.models.test_run import TestRun from testgen.common.models.test_suite import TestSuite +from testgen.mcp.permissions import get_project_access, mcp_permission @with_database_session +@mcp_permission("view") def get_recent_test_runs(project_code: str, test_suite: str | None = None, limit: int = 1) -> str: """Get the latest test runs for each test suite in a project, optionally filtered by test suite name. @@ -15,6 +17,9 @@ def get_recent_test_runs(project_code: str, test_suite: str | None = None, limit if not project_code: return "Missing required parameter `project_code`." 
+ access = get_project_access() + access.verify_access(project_code, not_found=f"No completed test runs found in project `{project_code}`.") + test_suite_id = None if test_suite: suites = TestSuite.select_minimal_where( diff --git a/testgen/utils/plugins.py b/testgen/utils/plugins.py index acaea286..1863d03e 100644 --- a/testgen/utils/plugins.py +++ b/testgen/utils/plugins.py @@ -10,7 +10,7 @@ from collections.abc import Generator from pathlib import Path from types import ModuleType -from typing import ClassVar +from typing import ClassVar, get_args from testgen.ui.assets import get_asset_path from testgen.ui.auth import Authentication @@ -109,6 +109,13 @@ class RBACProvider: def check_permission(_user: object, _permission: str) -> bool: return True + @staticmethod + def get_roles_with_permission(_permission: str) -> list[str]: + """Return roles that have the given permission. OS default: all roles.""" + from testgen.common.models.project_membership import RoleType + + return list(get_args(RoleType)) + class PluginSpec: rbac: ClassVar[type[RBACProvider]] = RBACProvider @@ -117,6 +124,14 @@ class PluginSpec: logo: ClassVar[type[Logo] | None] = None component: ClassVar[ComponentSpec | None] = None + @classmethod + def configure_ui(cls) -> None: + """Populate UI-related class attributes (pages, auth, logo, component). + + Override this in plugins to defer Streamlit-dependent imports until Streamlit + is actually running. Called by ``Plugin.load_streamlit()``, never by ``Plugin.load()``. + """ + class PluginHook: """Singleton holding resolved plugin values, pre-loaded with defaults.""" @@ -156,8 +171,9 @@ def load(self) -> type[PluginSpec]: return spec or PluginSpec def load_streamlit(self) -> type[PluginSpec]: - """Full Streamlit load. Calls load() first, then returns spec for UI access.""" + """Full Streamlit load. 
Calls load() first, then configure_ui() for UI attributes.""" spec = self.load() + spec.configure_ui() if spec is not PluginSpec: return spec diff --git a/tests/unit/mcp/conftest.py b/tests/unit/mcp/conftest.py new file mode 100644 index 00000000..46541d59 --- /dev/null +++ b/tests/unit/mcp/conftest.py @@ -0,0 +1,26 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from testgen.mcp.permissions import set_mcp_username + + +@pytest.fixture(autouse=True) +def mcp_user(): + """Set up an authenticated MCP user for all tool tests. + + Patches User.get to return a global admin by default (no filtering). + The @mcp_permission decorator calls get_current_mcp_user() which uses + User.get, then get_allowed_project_codes() which returns None for + global admins — so the ContextVar is set to None (no project filtering). + + Individual tests can patch get_allowed_project_codes to simulate + scoped access. + """ + set_mcp_username("test_user") + user = MagicMock() + user.is_global_admin = True + with patch("testgen.mcp.permissions.User") as mock_user_cls: + mock_user_cls.get.return_value = user + yield user + set_mcp_username(None) diff --git a/tests/unit/mcp/test_inventory_service.py b/tests/unit/mcp/test_inventory_service.py index 413814e5..8ea34436 100644 --- a/tests/unit/mcp/test_inventory_service.py +++ b/tests/unit/mcp/test_inventory_service.py @@ -99,3 +99,72 @@ def test_get_inventory_compact_groups(mock_select, session_mock): # Compact groups: single line with "X test suites", no "#### Table Group:" headers assert "test suites)" in result assert "#### Table Group:" not in result + + +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_hides_suites_without_view_permission(mock_select, session_mock): + """Suites are hidden for projects where user lacks view permission.""" + tg_id = uuid4() + suite_id = uuid4() + row = _make_row(table_group_id=tg_id, test_suite_id=suite_id, test_suite="Secret Suite") + 
session_mock.execute.return_value.all.return_value = [row] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory(project_codes=["demo"], view_project_codes=[]) + + assert "Demo" in result + assert "Secret Suite" not in result + assert str(suite_id) not in result + assert "requires `view` permission" in result + assert "1 test suite(s)" in result + + +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_shows_suites_with_view_permission(mock_select, session_mock): + """Suites are shown for projects where user has view permission.""" + tg_id = uuid4() + suite_id = uuid4() + row = _make_row(table_group_id=tg_id, test_suite_id=suite_id, test_suite="Visible Suite") + session_mock.execute.return_value.all.return_value = [row] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory(project_codes=["demo"], view_project_codes=["demo"]) + + assert "Visible Suite" in result + assert str(suite_id) in result + assert "requires `view` permission" not in result + + +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_view_none_shows_all_suites(mock_select, session_mock): + """When view_project_codes is None (global admin), all suites shown.""" + tg_id = uuid4() + suite_id = uuid4() + row = _make_row(table_group_id=tg_id, test_suite_id=suite_id, test_suite="Admin Suite") + session_mock.execute.return_value.all.return_value = [row] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory(project_codes=None, view_project_codes=None) + + assert "Admin Suite" in result + assert "requires `view` permission" not in result + + +@patch("testgen.mcp.services.inventory_service.select") +def test_get_inventory_no_suites_without_view_shows_no_suites(mock_select, session_mock): + """When group has no suites and user lacks view, shows 'No test suites'.""" + tg_id = uuid4() + row = _make_row(table_group_id=tg_id, 
test_suite_id=None, test_suite=None) + # Remove the suite from the row + row.test_suite_id = None + session_mock.execute.return_value.all.return_value = [row] + + from testgen.mcp.services.inventory_service import get_inventory + + result = get_inventory(project_codes=["demo"], view_project_codes=[]) + + assert "No test suites" in result + assert "requires `view` permission" not in result diff --git a/tests/unit/mcp/test_permissions.py b/tests/unit/mcp/test_permissions.py new file mode 100644 index 00000000..0639f2a1 --- /dev/null +++ b/tests/unit/mcp/test_permissions.py @@ -0,0 +1,459 @@ +from unittest.mock import MagicMock, patch +from uuid import uuid4 + +import pytest + +from testgen.mcp.permissions import ( + _NOT_SET, + MCPPermissionDenied, + ProjectAccess, + _compute_project_access, + _mcp_project_access, + get_current_mcp_user, + get_project_access, + mcp_permission, + resolve_project_access, + set_mcp_username, +) + + +@pytest.fixture(autouse=True) +def _reset_contextvars(): + set_mcp_username(None) + tok = _mcp_project_access.set(_NOT_SET) + yield + set_mcp_username(None) + _mcp_project_access.reset(tok) + + +# --- get_current_mcp_user --- + + +def test_get_current_mcp_user_raises_when_no_username(): + with pytest.raises(RuntimeError, match="No authenticated user"): + get_current_mcp_user() + + +@patch("testgen.mcp.permissions.User") +def test_get_current_mcp_user_raises_when_user_not_found(mock_user): + mock_user.get.return_value = None + set_mcp_username("ghost") + + with pytest.raises(ValueError, match="Authenticated user not found: ghost"): + get_current_mcp_user() + + +@patch("testgen.mcp.permissions.User") +def test_get_current_mcp_user_returns_user(mock_user): + user = MagicMock() + mock_user.get.return_value = user + set_mcp_username("admin") + + result = get_current_mcp_user() + + assert result is user + mock_user.get.assert_called_once_with("admin") + + +# --- _compute_project_access --- + + +def test_compute_project_access_global_admin(): + 
user = MagicMock() + user.is_global_admin = True + + result = _compute_project_access(user, "view") + + assert result.is_unrestricted is True + assert result.memberships == {} + assert result.permission == "view" + assert result.allowed_codes == frozenset() + + +@patch("testgen.mcp.permissions.ProjectMembership") +@patch("testgen.mcp.permissions.PluginHook") +def test_compute_project_access_os_default_all_roles_allowed(mock_hook, mock_membership): + """OS default: get_roles_with_permission returns all roles — all memberships returned.""" + user = MagicMock() + user.is_global_admin = False + user.id = uuid4() + + mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ + "admin", "data_quality", "analyst", "business", "catalog", + ] + + m1 = MagicMock() + m1.project_code = "proj_a" + m1.role = "admin" + m2 = MagicMock() + m2.project_code = "proj_b" + m2.role = "catalog" + mock_membership.get_memberships_for_user.return_value = [m1, m2] + + result = _compute_project_access(user, "view") + + assert result.is_unrestricted is False + assert result.memberships == {"proj_a": "admin", "proj_b": "catalog"} + assert result.allowed_codes == frozenset(["proj_a", "proj_b"]) + + +@patch("testgen.mcp.permissions.ProjectMembership") +@patch("testgen.mcp.permissions.PluginHook") +def test_compute_project_access_filters_by_role(mock_hook, mock_membership): + """Enterprise: only memberships with allowed roles are returned.""" + user = MagicMock() + user.is_global_admin = False + user.id = uuid4() + + # "view" permission: admin, data_quality, analyst, business — NOT catalog + mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ + "admin", "data_quality", "analyst", "business", + ] + + m1 = MagicMock() + m1.project_code = "proj_a" + m1.role = "admin" + m2 = MagicMock() + m2.project_code = "proj_b" + m2.role = "catalog" + mock_membership.get_memberships_for_user.return_value = [m1, m2] + + result = _compute_project_access(user, "view") + 
+ assert result.allowed_codes == frozenset(["proj_a"]) + assert result.memberships == {"proj_a": "admin", "proj_b": "catalog"} + + +@patch("testgen.mcp.permissions.ProjectMembership") +@patch("testgen.mcp.permissions.PluginHook") +def test_compute_project_access_catalog_user_with_catalog_permission(mock_hook, mock_membership): + """Catalog user calling catalog-permission tool gets their projects.""" + user = MagicMock() + user.is_global_admin = False + user.id = uuid4() + + mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ + "admin", "data_quality", "analyst", "business", "catalog", + ] + + m1 = MagicMock() + m1.project_code = "proj_a" + m1.role = "catalog" + mock_membership.get_memberships_for_user.return_value = [m1] + + result = _compute_project_access(user, "catalog") + + assert result.allowed_codes == frozenset(["proj_a"]) + + +@patch("testgen.mcp.permissions.ProjectMembership") +@patch("testgen.mcp.permissions.PluginHook") +def test_compute_project_access_catalog_user_with_view_permission_gets_empty(mock_hook, mock_membership): + """Catalog user calling view-permission tool gets empty allowed set.""" + user = MagicMock() + user.is_global_admin = False + user.id = uuid4() + + # "view" excludes catalog role + mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ + "admin", "data_quality", "analyst", "business", + ] + + m1 = MagicMock() + m1.project_code = "proj_a" + m1.role = "catalog" + mock_membership.get_memberships_for_user.return_value = [m1] + + result = _compute_project_access(user, "view") + + assert result.allowed_codes == frozenset() + + +# --- ProjectAccess.verify_access --- + + +def test_verify_access_admin_always_passes(): + access = ProjectAccess(is_unrestricted=True, memberships={}, permission="view", allowed_codes=frozenset()) + access.verify_access("any_project", not_found="not found") + + +def test_verify_access_allowed_passes(): + access = ProjectAccess( + is_unrestricted=False, + 
memberships={"proj_a": "admin"}, + permission="view", + allowed_codes=frozenset(["proj_a"]), + ) + access.verify_access("proj_a", not_found="not found") + + +def test_verify_access_membership_but_wrong_role_raises(): + access = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "admin", "proj_b": "catalog"}, + permission="view", + allowed_codes=frozenset(["proj_a"]), + ) + with pytest.raises(MCPPermissionDenied, match="necessary permission"): + access.verify_access("proj_b", not_found="not found") + + +def test_verify_access_no_membership_raises_not_found(): + access = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "admin"}, + permission="view", + allowed_codes=frozenset(["proj_a"]), + ) + with pytest.raises(MCPPermissionDenied, match="not found"): + access.verify_access("secret", not_found="not found") + + +# --- ProjectAccess.has_access --- + + +def test_has_access_admin(): + access = ProjectAccess(is_unrestricted=True, memberships={}, permission="view", allowed_codes=frozenset()) + assert access.has_access("anything") is True + + +def test_has_access_allowed(): + access = ProjectAccess( + is_unrestricted=False, memberships={"proj_a": "admin"}, permission="view", allowed_codes=frozenset(["proj_a"]), + ) + assert access.has_access("proj_a") is True + assert access.has_access("proj_b") is False + + +# --- ProjectAccess.query_codes --- + + +def test_query_codes_admin(): + access = ProjectAccess(is_unrestricted=True, memberships={}, permission="view", allowed_codes=frozenset()) + assert access.query_codes is None + + +def test_query_codes_scoped(): + access = ProjectAccess( + is_unrestricted=False, memberships={"proj_a": "admin"}, permission="view", allowed_codes=frozenset(["proj_a"]), + ) + assert access.query_codes == ["proj_a"] + + +# --- ProjectAccess.query_codes_for --- + + +@patch("testgen.mcp.permissions.PluginHook") +def test_query_codes_for_admin(mock_hook): + access = ProjectAccess(is_unrestricted=True, memberships={}, 
permission="catalog", allowed_codes=frozenset()) + assert access.query_codes_for("view") is None + + +def test_query_codes_for_same_permission(): + access = ProjectAccess( + is_unrestricted=False, memberships={"proj_a": "admin"}, permission="view", allowed_codes=frozenset(["proj_a"]), + ) + assert access.query_codes_for("view") == ["proj_a"] + + +@patch("testgen.mcp.permissions.PluginHook") +def test_query_codes_for_different_permission(mock_hook): + mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = ["admin"] + access = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "admin", "proj_b": "catalog"}, + permission="catalog", + allowed_codes=frozenset(["proj_a", "proj_b"]), + ) + result = access.query_codes_for("view") + assert result == ["proj_a"] + + +# --- get_project_access --- + + +def test_get_project_access_raises_without_decorator(): + with pytest.raises(RuntimeError, match="add the decorator"): + get_project_access() + + +def test_get_project_access_returns_set_value(): + access = ProjectAccess(is_unrestricted=True, memberships={}, permission="view", allowed_codes=frozenset()) + token = _mcp_project_access.set(access) + try: + assert get_project_access() is access + finally: + _mcp_project_access.reset(token) + + +# --- resolve_project_access --- + + +@patch("testgen.mcp.permissions.User") +def test_resolve_project_access_global_admin(mock_user): + user = MagicMock() + user.is_global_admin = True + mock_user.get.return_value = user + set_mcp_username("admin") + + result = resolve_project_access("view") + + assert result.is_unrestricted is True + + +@patch("testgen.mcp.permissions.ProjectMembership") +@patch("testgen.mcp.permissions.PluginHook") +@patch("testgen.mcp.permissions.User") +def test_resolve_project_access_scoped_user(mock_user, mock_hook, mock_membership): + user = MagicMock() + user.is_global_admin = False + user.id = uuid4() + mock_user.get.return_value = user + set_mcp_username("scoped") + + 
mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = ["admin"] + + m1 = MagicMock() + m1.project_code = "proj_a" + m1.role = "admin" + mock_membership.get_memberships_for_user.return_value = [m1] + + result = resolve_project_access("view") + + assert result.allowed_codes == frozenset(["proj_a"]) + + +# --- mcp_permission decorator --- + + +@patch("testgen.mcp.permissions.User") +def test_mcp_permission_sets_contextvar_for_global_admin(mock_user): + user = MagicMock() + user.is_global_admin = True + mock_user.get.return_value = user + set_mcp_username("admin") + + captured = {} + + @mcp_permission("view") + def tool_fn(): + access = get_project_access() + captured["access"] = access + return "ok" + + result = tool_fn() + + assert result == "ok" + assert captured["access"].is_unrestricted is True + assert captured["access"].query_codes is None + + +@patch("testgen.mcp.permissions.ProjectMembership") +@patch("testgen.mcp.permissions.PluginHook") +@patch("testgen.mcp.permissions.User") +def test_mcp_permission_sets_contextvar_for_scoped_user(mock_user, mock_hook, mock_membership): + user = MagicMock() + user.is_global_admin = False + user.id = uuid4() + mock_user.get.return_value = user + set_mcp_username("scoped") + + mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ + "admin", "data_quality", "analyst", "business", "catalog", + ] + + m1 = MagicMock() + m1.project_code = "proj_x" + m1.role = "admin" + mock_membership.get_memberships_for_user.return_value = [m1] + + captured = {} + + @mcp_permission("view") + def tool_fn(): + access = get_project_access() + captured["access"] = access + return "ok" + + result = tool_fn() + + assert result == "ok" + assert captured["access"].allowed_codes == frozenset(["proj_x"]) + assert captured["access"].memberships == {"proj_x": "admin"} + + +@patch("testgen.mcp.permissions.ProjectMembership") +@patch("testgen.mcp.permissions.PluginHook") +@patch("testgen.mcp.permissions.User") 
+def test_mcp_permission_early_return_when_no_allowed_codes(mock_user, mock_hook, mock_membership): + """Decorator returns early if user has no projects with the required permission.""" + user = MagicMock() + user.is_global_admin = False + user.id = uuid4() + mock_user.get.return_value = user + set_mcp_username("scoped") + + # "view" excludes catalog role + mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = ["admin"] + + m1 = MagicMock() + m1.project_code = "proj_a" + m1.role = "catalog" + mock_membership.get_memberships_for_user.return_value = [m1] + + @mcp_permission("view") + def tool_fn(): + raise AssertionError("Should not be called") + + result = tool_fn() + + assert "permission" in result + assert "role" in result.lower() + + +@patch("testgen.mcp.permissions.User") +def test_mcp_permission_catches_mcp_permission_denied(mock_user): + """Decorator catches MCPPermissionDenied and returns str(e).""" + user = MagicMock() + user.is_global_admin = True + mock_user.get.return_value = user + set_mcp_username("admin") + + @mcp_permission("view") + def tool_fn(): + raise MCPPermissionDenied("Access denied for testing") + + result = tool_fn() + + assert result == "Access denied for testing" + + +@patch("testgen.mcp.permissions.User") +def test_mcp_permission_resets_contextvar_after_call(mock_user): + user = MagicMock() + user.is_global_admin = True + mock_user.get.return_value = user + set_mcp_username("admin") + + @mcp_permission("view") + def tool_fn(): + return "ok" + + tool_fn() + + assert _mcp_project_access.get() is _NOT_SET + + +@patch("testgen.mcp.permissions.User") +def test_mcp_permission_preserves_function_metadata(mock_user): + user = MagicMock() + user.is_global_admin = True + mock_user.get.return_value = user + + @mcp_permission("view") + def my_tool(x: int, y: str = "default") -> str: + """Tool docstring.""" + return f"{x}-{y}" + + assert my_tool.__name__ == "my_tool" + assert my_tool.__doc__ == "Tool docstring." 
diff --git a/tests/unit/mcp/test_tools_discovery.py b/tests/unit/mcp/test_tools_discovery.py index 6b2f0f2b..ddd55947 100644 --- a/tests/unit/mcp/test_tools_discovery.py +++ b/tests/unit/mcp/test_tools_discovery.py @@ -1,6 +1,8 @@ from unittest.mock import MagicMock, patch from uuid import uuid4 +from testgen.mcp.permissions import ProjectAccess + @patch("testgen.mcp.services.inventory_service.get_inventory") def test_get_data_inventory_returns_markdown(mock_get_inventory, db_session_mock): @@ -14,6 +16,67 @@ def test_get_data_inventory_returns_markdown(mock_get_inventory, db_session_mock mock_get_inventory.assert_called_once() +@patch("testgen.mcp.services.inventory_service.get_inventory") +@patch("testgen.mcp.permissions._compute_project_access") +def test_get_data_inventory_passes_project_codes_for_scoped_user( + mock_compute, mock_get_inventory, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "catalog"}, + permission="catalog", + allowed_codes=frozenset(["proj_a"]), + ) + mock_get_inventory.return_value = "# Data Inventory" + + from testgen.mcp.tools.discovery import get_data_inventory + + get_data_inventory() + + # query_codes_for("view") calls PluginHook — but catalog != view, so it goes through the branch. + # For this test, we verify the call was made with list form of allowed_codes for project_codes. 
+ call_kwargs = mock_get_inventory.call_args.kwargs + assert call_kwargs["project_codes"] == ["proj_a"] + + +@patch("testgen.mcp.services.inventory_service.get_inventory") +@patch("testgen.mcp.permissions.PluginHook") +@patch("testgen.mcp.permissions._compute_project_access") +def test_get_data_inventory_view_codes_for_scoped_user( + mock_compute, mock_hook, mock_get_inventory, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "catalog", "proj_b": "admin"}, + permission="catalog", + allowed_codes=frozenset(["proj_a", "proj_b"]), + ) + # "view" allows admin but not catalog + mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = ["admin"] + mock_get_inventory.return_value = "# Data Inventory" + + from testgen.mcp.tools.discovery import get_data_inventory + + get_data_inventory() + + call_kwargs = mock_get_inventory.call_args.kwargs + assert call_kwargs["view_project_codes"] == ["proj_b"] + + +@patch("testgen.mcp.services.inventory_service.get_inventory") +def test_get_data_inventory_passes_none_for_global_admin(mock_get_inventory, db_session_mock, mcp_user): + mcp_user.is_global_admin = True + mock_get_inventory.return_value = "# Data Inventory" + + from testgen.mcp.tools.discovery import get_data_inventory + + get_data_inventory() + + mock_get_inventory.assert_called_once_with(project_codes=None, view_project_codes=None) + + @patch("testgen.mcp.tools.discovery.Project") def test_list_projects_returns_formatted(mock_project, db_session_mock): proj1 = MagicMock() @@ -44,6 +107,33 @@ def test_list_projects_empty(mock_project, db_session_mock): assert "No projects found" in result +@patch("testgen.mcp.tools.discovery.Project") +@patch("testgen.mcp.permissions._compute_project_access") +def test_list_projects_filters_for_scoped_user(mock_compute, mock_project, db_session_mock, mcp_user): + mcp_user.is_global_admin = False + 
mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"demo": "admin"}, + permission="catalog", + allowed_codes=frozenset(["demo"]), + ) + + proj1 = MagicMock() + proj1.project_name = "Demo Project" + proj1.project_code = "demo" + proj2 = MagicMock() + proj2.project_name = "Secret" + proj2.project_code = "secret" + mock_project.select_where.return_value = [proj1, proj2] + + from testgen.mcp.tools.discovery import list_projects + + result = list_projects() + + assert "Demo Project" in result + assert "Secret" not in result + + @patch("testgen.mcp.tools.discovery.TestSuite") def test_list_test_suites_returns_stats(mock_suite, db_session_mock): summary = MagicMock() @@ -90,3 +180,67 @@ def test_list_test_suites_empty_project_code(db_session_mock): assert "Missing required parameter" in result assert "project_code" in result + + +@patch("testgen.mcp.permissions._compute_project_access") +def test_list_test_suites_returns_not_found_for_inaccessible_project( + mock_compute, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"other_project": "admin"}, + permission="view", + allowed_codes=frozenset(["other_project"]), + ) + + from testgen.mcp.tools.discovery import list_test_suites + + result = list_test_suites("secret_project") + + assert "No test suites found for project `secret_project`" in result + + +@patch("testgen.mcp.permissions._compute_project_access") +def test_list_test_suites_returns_denial_for_insufficient_permission( + mock_compute, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"other_project": "admin", "secret_project": "catalog"}, + permission="view", + allowed_codes=frozenset(["other_project"]), + ) + + from testgen.mcp.tools.discovery import list_test_suites + + result = list_test_suites("secret_project") + + assert 
"necessary permission" in result + assert "role" in result.lower() + + +@patch("testgen.mcp.tools.discovery.DataTable") +@patch("testgen.mcp.permissions._compute_project_access") +def test_list_tables_returns_not_found_for_inaccessible_group( + mock_compute, mock_dt, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "admin"}, + permission="catalog", + allowed_codes=frozenset(["proj_a"]), + ) + mock_dt.select_table_names.return_value = [] + mock_dt.count_tables.return_value = 0 + + from testgen.mcp.tools.discovery import list_tables + + result = list_tables(str(uuid4())) + + assert "No tables found" in result + mock_dt.select_table_names.assert_called_once() + call_kwargs = mock_dt.select_table_names.call_args + assert call_kwargs.kwargs["project_codes"] == ["proj_a"] diff --git a/tests/unit/mcp/test_tools_test_results.py b/tests/unit/mcp/test_tools_test_results.py index cb4c5d4a..ee1aa1d0 100644 --- a/tests/unit/mcp/test_tools_test_results.py +++ b/tests/unit/mcp/test_tools_test_results.py @@ -4,6 +4,7 @@ import pytest from testgen.common.models.test_result import TestResultStatus +from testgen.mcp.permissions import ProjectAccess @patch("testgen.mcp.tools.test_results.TestType") @@ -74,6 +75,26 @@ def test_get_test_results_invalid_status(db_session_mock): get_test_results(str(uuid4()), status="BadStatus") +@patch("testgen.mcp.tools.test_results.TestResult") +@patch("testgen.mcp.permissions._compute_project_access") +def test_get_test_results_passes_project_codes(mock_compute, mock_result, db_session_mock, mcp_user): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "admin"}, + permission="view", + allowed_codes=frozenset(["proj_a"]), + ) + mock_result.select_results.return_value = [] + + from testgen.mcp.tools.test_results import get_test_results + + 
get_test_results(str(uuid4())) + + call_kwargs = mock_result.select_results.call_args.kwargs + assert call_kwargs["project_codes"] == ["proj_a"] + + @patch("testgen.mcp.tools.test_results.TestType") @patch("testgen.mcp.tools.test_results.TestResult") def test_get_failure_summary_by_test_type(mock_result, mock_tt_cls, db_session_mock): @@ -149,6 +170,28 @@ def test_get_failure_summary_invalid_uuid(db_session_mock): get_failure_summary("bad-uuid") +@patch("testgen.mcp.tools.test_results.TestResult") +@patch("testgen.mcp.permissions._compute_project_access") +def test_get_failure_summary_passes_project_codes( + mock_compute, mock_result, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "admin"}, + permission="view", + allowed_codes=frozenset(["proj_a"]), + ) + mock_result.select_failures.return_value = [] + + from testgen.mcp.tools.test_results import get_failure_summary + + get_failure_summary(str(uuid4())) + + call_kwargs = mock_result.select_failures.call_args.kwargs + assert call_kwargs["project_codes"] == ["proj_a"] + + @patch("testgen.mcp.tools.test_results.TestType") @patch("testgen.mcp.tools.test_results.TestResult") def test_get_test_result_history_basic(mock_result, mock_tt_cls, db_session_mock): @@ -205,3 +248,25 @@ def test_get_test_result_history_invalid_uuid(db_session_mock): with pytest.raises(ValueError, match="not a valid UUID"): get_test_result_history("bad-uuid") + + +@patch("testgen.mcp.tools.test_results.TestResult") +@patch("testgen.mcp.permissions._compute_project_access") +def test_get_test_result_history_passes_project_codes( + mock_compute, mock_result, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"proj_a": "admin"}, + permission="view", + allowed_codes=frozenset(["proj_a"]), + ) + mock_result.select_history.return_value = [] + 
+ from testgen.mcp.tools.test_results import get_test_result_history + + get_test_result_history(str(uuid4())) + + call_kwargs = mock_result.select_history.call_args.kwargs + assert call_kwargs["project_codes"] == ["proj_a"] diff --git a/tests/unit/mcp/test_tools_test_runs.py b/tests/unit/mcp/test_tools_test_runs.py index 061783fe..c9a0c35f 100644 --- a/tests/unit/mcp/test_tools_test_runs.py +++ b/tests/unit/mcp/test_tools_test_runs.py @@ -1,6 +1,8 @@ from unittest.mock import MagicMock, patch from uuid import uuid4 +from testgen.mcp.permissions import ProjectAccess + def _make_run_summary(**overrides): defaults = { @@ -130,3 +132,42 @@ def test_get_recent_test_runs_empty_project_code(db_session_mock): assert "Missing required parameter" in result assert "project_code" in result + + +@patch("testgen.mcp.permissions._compute_project_access") +def test_get_recent_test_runs_returns_not_found_for_inaccessible_project( + mock_compute, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"other_project": "admin"}, + permission="view", + allowed_codes=frozenset(["other_project"]), + ) + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("secret_project") + + assert "No completed test runs found in project `secret_project`" in result + + +@patch("testgen.mcp.permissions._compute_project_access") +def test_get_recent_test_runs_returns_denial_for_insufficient_permission( + mock_compute, db_session_mock, mcp_user, +): + mcp_user.is_global_admin = False + mock_compute.return_value = ProjectAccess( + is_unrestricted=False, + memberships={"other_project": "admin", "secret_project": "catalog"}, + permission="view", + allowed_codes=frozenset(["other_project"]), + ) + + from testgen.mcp.tools.test_runs import get_recent_test_runs + + result = get_recent_test_runs("secret_project") + + assert "necessary permission" in result + assert "role" 
in result.lower() From cb1c9dca11086f8c57c1dd3531251cd97205da78 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 4 Mar 2026 17:12:28 -0300 Subject: [PATCH 27/95] fix(ui): address MR review feedback for CSV metadata import (TG-988) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove userCanEdit gate on Metadata CSV export (all users can export) - Fix undefinedB file size display when no size limit validator present - Add 2MB file size limit to CSV upload - Update labels: FileInput → "Upload metadata CSV file", RadioGroup → "When CSV values are blank" - Add help tooltips to FileInput and RadioGroup components (group-level) - Reorder dialog: file upload first, blank behavior second; RadioGroup layout → default - Replace title attr with withTooltip on preview table status icons - Fix summary text: proper pluralization, matched row counts, skipped row count - Align Python success message with preview counts (use matched rows, not update rows) - Change bad_cde from int to bool Co-Authored-By: Claude Opus 4.6 --- .../frontend/js/components/file_input.js | 16 ++++- .../frontend/js/components/radio_group.js | 9 ++- .../frontend/js/pages/data_catalog.js | 22 +++--- .../js/pages/import_metadata_dialog.js | 69 ++++++++++++------- .../views/dialogs/import_metadata_dialog.py | 32 ++++++--- tests/unit/ui/test_import_metadata.py | 4 ++ 6 files changed, 105 insertions(+), 47 deletions(-) diff --git a/testgen/ui/components/frontend/js/components/file_input.js b/testgen/ui/components/frontend/js/components/file_input.js index 5b49f503..77738aa0 100644 --- a/testgen/ui/components/frontend/js/components/file_input.js +++ b/testgen/ui/components/frontend/js/components/file_input.js @@ -15,14 +15,16 @@ * @property {string} name * @property {string} value * @property {string?} class + * @property {string?} help * @property {Array?} validators * @property {function(FileValue?, InputState)?} onChange - * + * */ import van from 
'../van.min.js'; import { checkIsRequired, getRandomId, getValue, loadStylesheet } from "../utils.js"; import { Icon } from './icon.js'; import { Button } from './button.js'; +import { withTooltip } from './tooltip.js'; import { humanReadableSize } from '../display_utils.js'; const { div, input, label, span } = van.tags; @@ -112,12 +114,18 @@ const FileInput = (options) => { return div( { class: cssClass }, - label( + div( { class: 'tg-file-uploader--label text-caption flex-row fx-gap-1' }, options.label, () => isRequired.val ? span({ class: 'text-error' }, '*') : '', + () => getValue(options.help) + ? withTooltip( + Icon({ size: 16, classes: 'text-disabled' }, 'help'), + { text: options.help, position: 'bottom', width: 200 } + ) + : null, ), div( { class: () => `tg-file-uploader--dropzone flex-column clickable ${fileOver.val ? 'on-dragover' : ''}` }, @@ -177,7 +185,9 @@ const FileSelectionDropZone = (placeholder, sizeLimit) => { div( { class: 'flex-column fx-gap-1' }, span({}, placeholder), - span({ class: 'text-secondary text-caption' }, `Limit ${humanReadableSize(sizeLimit)} per file`), + sizeLimit + ? span({ class: 'text-secondary text-caption' }, `Limit ${humanReadableSize(sizeLimit)} per file`) + : null, ), ); }; diff --git a/testgen/ui/components/frontend/js/components/radio_group.js b/testgen/ui/components/frontend/js/components/radio_group.js index 4f8b0008..9ddaba78 100644 --- a/testgen/ui/components/frontend/js/components/radio_group.js +++ b/testgen/ui/components/frontend/js/components/radio_group.js @@ -8,6 +8,7 @@ * @typedef Properties * @type {object} * @property {string} label + * @property {string?} help * @property {Option[]} options * @property {string | number | boolean | null} value * @property {function(string | number | boolean | null)?} onChange @@ -30,8 +31,14 @@ const RadioGroup = (/** @type Properties */ props) => { return div( { class: () => `tg-radio-group--wrapper ${layout}`, style: () => `width: ${props.width ? 
getValue(props.width) + 'px' : 'auto'}` }, div( - { class: 'text-caption tg-radio-group--label' }, + { class: 'text-caption tg-radio-group--label flex-row fx-gap-1' }, props.label, + () => getValue(props.help) + ? withTooltip( + Icon({ size: 16, classes: 'text-disabled' }, 'help'), + { text: props.help, position: 'top', width: 200 } + ) + : null, ), () => div( { class: 'tg-radio-group' }, diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index 7b430d03..6d25e80f 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -414,19 +414,17 @@ const ExportOptions = (/** @type TreeNode[] */ treeNodes, /** @type SelectedNode 'Selected columns', ) : null, - userCanEdit - ? div( - { - class: 'tg-dh--export-option', - style: 'border-top: var(--button-stroked-border);', - onclick: () => { - emitEvent('ExportCsvClicked', {}); - exportOptionsOpened.val = false; - }, + div( + { + class: 'tg-dh--export-option', + style: 'border-top: var(--button-stroked-border);', + onclick: () => { + emitEvent('ExportCsvClicked', {}); + exportOptionsOpened.val = false; }, - 'Metadata CSV', - ) - : null, + }, + 'Metadata CSV', + ), ), ), ]; diff --git a/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js b/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js index 9d30d74b..5ca5259c 100644 --- a/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js +++ b/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js @@ -13,6 +13,10 @@ import { Button } from '../components/button.js'; import { Alert } from '../components/alert.js'; import { Table } from '../components/table.js'; import { capitalize } from '../display_utils.js'; +import { withTooltip } from '../components/tooltip.js'; +import { sizeLimit } from '../form_validators.js'; + +const CSV_SIZE_LIMIT = 2 * 1024 * 1024; // 2 MB const { div, i, span } 
= van.tags; @@ -30,19 +34,11 @@ const ImportMetadataDialog = (/** @type Properties */ props) => { return div( { id: wrapperId, class: 'flex-column fx-gap-4' }, - RadioGroup({ - label: 'When import value is blank', - options: [ - { label: 'Keep existing values', value: 'keep' }, - { label: 'Clear existing values', value: 'clear' }, - ], - value: blankBehavior, - onChange: (value) => blankBehavior.val = value, - layout: 'vertical', - }), FileInput({ name: 'csv_file', - label: 'Drop CSV file here or click to browse', + label: 'Upload metadata CSV file', + help: 'Use the Export menu on the Data Catalog page to download the current metadata as a CSV template.', + validators: [sizeLimit(CSV_SIZE_LIMIT)], value: fileValue, onChange: (value) => { fileValue.val = value; @@ -58,6 +54,17 @@ const ImportMetadataDialog = (/** @type Properties */ props) => { } }, }), + RadioGroup({ + label: 'When CSV values are blank', + help: 'Controls whether blank cells in the CSV overwrite existing metadata or leave it unchanged.', + options: [ + { label: 'Keep existing values', value: 'keep' }, + { label: 'Clear existing values', value: 'clear' }, + ], + value: blankBehavior, + onChange: (value) => blankBehavior.val = value, + layout: 'default', + }), () => { const result = getValue(props.result); if (result) { @@ -73,8 +80,21 @@ const ImportMetadataDialog = (/** @type Properties */ props) => { } const hasError = !!preview.error; - const totalMatched = hasError ? 0 : (preview.table_count || 0) + (preview.column_count || 0); - const hasMatches = totalMatched > 0; + const tableCount = hasError ? 0 : (preview.table_count || 0); + const columnCount = hasError ? 0 : (preview.column_count || 0); + const skippedCount = hasError ? 0 : (preview.skipped_count || 0); + const hasMatches = tableCount + columnCount > 0; + + const plural = (n, word) => `${n} ${n === 1 ? word : word + 's'}`; + const importedParts = [ + tableCount ? plural(tableCount, 'table') : '', + columnCount ? 
plural(columnCount, 'column') : '', + ].filter(Boolean); + const importedText = importedParts.length + ? `Metadata for ${importedParts.join(', ')} will be imported` + : 'No metadata will be imported'; + const skippedText = skippedCount ? `${plural(skippedCount, 'row')} skipped` : ''; + const summaryText = [importedText, skippedText].filter(Boolean).join(' | '); return div( { class: 'flex-column fx-gap-3' }, @@ -82,7 +102,7 @@ const ImportMetadataDialog = (/** @type Properties */ props) => { ? '' : span( { class: 'text-secondary' }, - `Summary: ${preview.table_count || 0} table(s), ${preview.column_count || 0} column(s) matched`, + summaryText, ), hasError ? Alert({ type: 'error', icon: 'error' }, span(preview.error)) @@ -116,7 +136,7 @@ const PreviewTable = (preview) => { const previewRows = preview.preview_rows || []; const columns = [ - { name: '_status_icon', label: '', width: 32 }, + { name: '_status_icon', label: '', width: 32, overflow: 'visible' }, { name: 'table_name', label: 'Table', width: 150 }, { name: 'column_name', label: 'Column', width: 150 }, ...metadataColumns.map(col => ({ @@ -131,16 +151,19 @@ const PreviewTable = (preview) => { const icon = STATUS_ICONS[status] || STATUS_ICONS.ok; const truncatedFields = row._truncated_fields || []; + const statusIcon = i( + { + class: `material-symbols-rounded import-status-${status}`, + style: 'font-size: 16px; cursor: default; overflow: visible; position: relative', + }, + icon, + ); + const tableRow = { _status: status, - _status_icon: i( - { - class: `material-symbols-rounded import-status-${status}`, - style: 'font-size: 16px; cursor: default', - title: row._status_detail || '', - }, - icon, - ), + _status_icon: row._status_detail + ? withTooltip(statusIcon, { text: row._status_detail, position: 'right', width: 200 }) + : statusIcon, table_name: row.table_name ?? '', column_name: row.column_name ?? 
'', }; diff --git a/testgen/ui/views/dialogs/import_metadata_dialog.py b/testgen/ui/views/dialogs/import_metadata_dialog.py index 2312feb7..6d779a24 100644 --- a/testgen/ui/views/dialogs/import_metadata_dialog.py +++ b/testgen/ui/views/dialogs/import_metadata_dialog.py @@ -178,18 +178,26 @@ def _match_and_validate( # Determine which metadata columns are present in the CSV metadata_columns = [c for c in METADATA_COLUMNS if c in df.columns] + # Count matched vs skipped rows from preview + matched_tables = sum(1 for r in preview_rows if not r.get("column_name") and r.get("_status") != "unmatched") + matched_columns = sum(1 for r in preview_rows if r.get("column_name") and r.get("_status") != "unmatched") + skipped = sum(1 for r in preview_rows if r.get("_status") == "unmatched") + return { "table_rows": table_rows, "column_rows": column_rows, "preview_rows": preview_rows, "metadata_columns": metadata_columns, "blank_behavior": blank_behavior, + "matched_tables": matched_tables, + "matched_columns": matched_columns, + "skipped_count": skipped, } -def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, int]: +def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, bool]: fields = {} - bad_cde = 0 + bad_cde = False for col in METADATA_COLUMNS: if col not in row.index: continue @@ -207,7 +215,7 @@ def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, # "keep" → skip this field else: # Unrecognized value — skip (don't set field at all) - bad_cde = 1 + bad_cde = True else: if value: fields[col] = value @@ -235,7 +243,7 @@ def _set_row_status(preview_row: dict, bad_cde: int, truncated: list[str]) -> No if bad_cde: issues.append("Unrecognized CDE value (expected Yes/No) — skipped") if truncated: - issues.append(f"Value(s) truncated: {', '.join(truncated)}") + issues.append(f"Values truncated: {', '.join(truncated)}") if bad_cde: preview_row["_status"] = "error" @@ -324,7 +332,7 @@ def 
on_file_cleared(_payload: dict) -> None: result = None if should_import() and preview and not preview.get("error"): try: - counts = apply_metadata_import(preview) + apply_metadata_import(preview) # Clear caches from testgen.ui.queries.profiling_queries import get_column_by_id, get_table_by_id @@ -334,9 +342,16 @@ def on_file_cleared(_payload: dict) -> None: func.clear() st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() + parts = [] + if tc := preview.get("matched_tables", 0): + parts.append(f"{tc} {'table' if tc == 1 else 'tables'}") + if cc := preview.get("matched_columns", 0): + parts.append(f"{cc} {'column' if cc == 1 else 'columns'}") + summary = f"Metadata for {', '.join(parts)} imported." if parts else "No metadata was imported." + result = { "success": True, - "message": f"Metadata imported: {counts['table_count']} table(s), {counts['column_count']} column(s) updated.", + "message": summary, } except Exception: LOG.exception("Metadata import failed") @@ -394,8 +409,9 @@ def _build_preview_props(preview: dict) -> dict: formatted_rows.append(formatted_row) return { - "table_count": len(preview.get("table_rows", [])), - "column_count": len(preview.get("column_rows", [])), + "table_count": preview.get("matched_tables", 0), + "column_count": preview.get("matched_columns", 0), + "skipped_count": preview.get("skipped_count", 0), "metadata_columns": metadata_columns, "preview_rows": formatted_rows, } diff --git a/tests/unit/ui/test_import_metadata.py b/tests/unit/ui/test_import_metadata.py index 5ac9a8d2..75c85f65 100644 --- a/tests/unit/ui/test_import_metadata.py +++ b/tests/unit/ui/test_import_metadata.py @@ -264,10 +264,14 @@ def test_preview_props_basic(): {"table_name": "t1", "column_name": "", "description": "desc", "_status": "ok", "_status_detail": "", "_truncated_fields": []}, ], "metadata_columns": ["description"], + "matched_tables": 1, + "matched_columns": 0, + "skipped_count": 0, } result = _build_preview_props(preview) 
assert result["table_count"] == 1 assert result["column_count"] == 0 + assert result["skipped_count"] == 0 assert len(result["preview_rows"]) == 1 assert result["preview_rows"][0]["description"] == "desc" From cacab093c9c89ba870349649af70f6c557f35899 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 4 Mar 2026 20:17:15 -0300 Subject: [PATCH 28/95] refactor(mcp): hide internal test_type codes from user-facing output Replace internal codes (e.g. Alpha_Trunc) with short names (e.g. Alpha Truncation) in all MCP tool output. Accept short names as input for test_type filters. Fix result titles to distinguish column-level (on `column` in `table`) from table-level tests. Co-Authored-By: Claude Opus 4.6 --- testgen/mcp/tools/reference.py | 16 +++---- testgen/mcp/tools/test_results.py | 35 +++++++++++----- tests/unit/mcp/test_tools_reference.py | 15 ++++--- tests/unit/mcp/test_tools_test_results.py | 51 +++++++++++++++++++---- 4 files changed, 83 insertions(+), 34 deletions(-) diff --git a/testgen/mcp/tools/reference.py b/testgen/mcp/tools/reference.py index 4c66f1ab..3d25f899 100644 --- a/testgen/mcp/tools/reference.py +++ b/testgen/mcp/tools/reference.py @@ -7,16 +7,16 @@ def get_test_type(test_type: str) -> str: """Get detailed information about a specific test type. Args: - test_type: The test type code (e.g., 'Alpha_Trunc', 'Unique_Pct'). + test_type: The test type (e.g., 'Alpha Truncation', 'Unique Percent'). """ - tt = TestType.get(test_type) + matches = TestType.select_where(TestType.test_name_short == test_type) + tt = matches[0] if matches else None if not tt: - return f"Test type `{test_type}` not found." + return f"Test type `{test_type}` not found. Use `testgen://test-types` to see available types." 
lines = [ - f"# {tt.test_name_short} (`{tt.test_type}`)\n", - f"- **Name:** {tt.test_name_short}", + f"# {tt.test_name_short}\n", ] if tt.test_name_long: lines.append(f"- **Full Name:** {tt.test_name_long}") @@ -50,14 +50,14 @@ def test_types_resource() -> str: lines = [ "# TestGen Test Types Reference\n", - "| Test Type | Name | Quality Dimension | Scope | Description |", - "|---|---|---|---|---|", + "| Test Type | Quality Dimension | Scope | Description |", + "|---|---|---|---|", ] for tt in test_types: desc = tt.test_description or "" lines.append( - f"| {tt.test_type} | {tt.test_name_short or ''} | " + f"| {tt.test_name_short or ''} | " f"{tt.dq_dimension or ''} | {tt.test_scope or ''} | {desc} |" ) diff --git a/testgen/mcp/tools/test_results.py b/testgen/mcp/tools/test_results.py index 117ddd2f..2c35df9d 100644 --- a/testgen/mcp/tools/test_results.py +++ b/testgen/mcp/tools/test_results.py @@ -21,6 +21,14 @@ def _parse_status(value: str) -> TestResultStatus: raise ValueError(f"Invalid status `{value}`. Valid values: {valid}") from err +def _resolve_test_type(short_name: str) -> str: + """Resolve a test type short name to its internal code.""" + matches = TestType.select_where(TestType.test_name_short == short_name) + if not matches: + raise ValueError(f"Unknown test type: `{short_name}`. Use the testgen://test-types resource to see available types.") + return matches[0].test_type + + @with_database_session @mcp_permission("view") def get_test_results( @@ -37,7 +45,7 @@ def get_test_results( test_run_id: The UUID of the test run. status: Filter by result status (Passed, Failed, Warning, Error, Log). table_name: Filter by table name. - test_type: Filter by test type code. + test_type: Filter by test type (e.g. 'Alpha Truncation', 'Unique Percent'). limit: Maximum number of results per page (default 50). page: Page number, starting from 1 (default 1). 
""" @@ -45,13 +53,15 @@ def get_test_results( status_enum = _parse_status(status) if status else None offset = (page - 1) * limit + test_type_code = _resolve_test_type(test_type) if test_type else None + access = get_project_access() results = TestResult.select_results( test_run_id=run_uuid, status=status_enum, table_name=table_name, - test_type=test_type, + test_type=test_type_code, limit=limit, offset=offset, project_codes=access.query_codes, @@ -76,8 +86,11 @@ def get_test_results( for r in results: status_str = r.status.value if r.status else "Unknown" test_name = type_names.get(r.test_type, r.test_type) - lines.append(f"## [{status_str}] {test_name} on `{r.table_name}`") - lines.append(f"- **Test Type:** `{r.test_type}`") + if r.column_names: + title = f"## [{status_str}] {test_name} on `{r.column_names}` in `{r.table_name}`" + else: + title = f"## [{status_str}] {test_name} on `{r.table_name}`" + lines.append(title) lines.append(f"- Test definition: `{r.test_definition_id}`") if r.column_names: lines.append(f"- Column: `{r.column_names}`") @@ -124,8 +137,8 @@ def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: ] if group_by == "test_type": - lines.append("| Test Type | Test Name | Severity | Count |") - lines.append("|---|---|---|---|") + lines.append("| Test Type | Severity | Count |") + lines.append("|---|---|---|") else: group_label = {"table": "Table Name", "column": "Column"}[group_by] lines.append(f"| {group_label} | Count |") @@ -136,7 +149,7 @@ def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: if group_by == "column": # Row is (table_name, column_names, count) table, column = row[0], row[1] - label = f"{table}.{column}" if column else f"{table} (table-level)" + label = f"`{column}` in `{table}`" if column else f"`{table}` (table-level)" lines.append(f"| {label} | {count} |") elif group_by == "test_type": # Row is (test_type, status, count) @@ -144,14 +157,14 @@ def 
get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: status = row[1] name = type_names.get(code, code) severity = status.value if status else "Unknown" - lines.append(f"| {code} | `{name}` | {severity} | {count} |") + lines.append(f"| {name} | {severity} | {count} |") else: - lines.append(f"| {row[0]} | {count} |") + lines.append(f"| `{row[0]}` | {count} |") if group_by == "test_type": lines.append( "\nCheck `testgen://test-types` to understand what each test type checks " - "and `get_test_type(test_type='...')` to fetch more details." + "and `get_test_type(test_type='Alpha Truncation')` to fetch more details." ) return "\n".join(lines) @@ -187,7 +200,7 @@ def get_test_result_history( test_name = type_names.get(first.test_type, first.test_type) lines = [ "# Test Result History\n", - f"- **Test Type:** {test_name} (`{first.test_type}`)", + f"- **Test Type:** {test_name}", f"- **Table:** `{first.table_name}`", ] if first.column_names: diff --git a/tests/unit/mcp/test_tools_reference.py b/tests/unit/mcp/test_tools_reference.py index 308380bd..bbfcdead 100644 --- a/tests/unit/mcp/test_tools_reference.py +++ b/tests/unit/mcp/test_tools_reference.py @@ -15,13 +15,14 @@ def test_get_test_type_found(mock_tt_cls, db_session_mock): tt.test_scope = "column" tt.except_message = "Alpha truncation detected" tt.usage_notes = "Best for VARCHAR columns" - mock_tt_cls.get.return_value = tt + mock_tt_cls.select_where.return_value = [tt] from testgen.mcp.tools.reference import get_test_type - result = get_test_type("Alpha_Trunc") + result = get_test_type("Alpha Truncation") assert "Alpha Truncation" in result + assert "Alpha_Trunc" not in result assert "Accuracy" in result assert "column" in result assert "truncated" in result.lower() @@ -29,11 +30,11 @@ def test_get_test_type_found(mock_tt_cls, db_session_mock): @patch("testgen.mcp.tools.reference.TestType") def test_get_test_type_not_found(mock_tt_cls, db_session_mock): - mock_tt_cls.get.return_value = None + 
mock_tt_cls.select_where.return_value = [] from testgen.mcp.tools.reference import get_test_type - result = get_test_type("Nonexistent_Type") + result = get_test_type("Nonexistent Type") assert "not found" in result @@ -58,8 +59,10 @@ def test_test_types_resource(mock_tt_cls, db_session_mock): result = test_types_resource() - assert "Alpha_Trunc" in result - assert "Unique_Pct" in result + assert "Alpha Truncation" in result + assert "Unique Percent" in result + assert "Alpha_Trunc" not in result + assert "Unique_Pct" not in result assert "Accuracy" in result assert "Uniqueness" in result diff --git a/tests/unit/mcp/test_tools_test_results.py b/tests/unit/mcp/test_tools_test_results.py index ee1aa1d0..cf3ed91c 100644 --- a/tests/unit/mcp/test_tools_test_results.py +++ b/tests/unit/mcp/test_tools_test_results.py @@ -32,12 +32,40 @@ def test_get_test_results_basic(mock_result, mock_tt_cls, db_session_mock): result = get_test_results(run_id) assert "Alpha Truncation" in result - assert "`Alpha_Trunc`" in result - assert "orders" in result + assert "Alpha_Trunc" not in result + assert "on `customer_name` in `orders`" in result assert "15.3" in result assert "Truncation detected" in result +@patch("testgen.mcp.tools.test_results.TestType") +@patch("testgen.mcp.tools.test_results.TestResult") +def test_get_test_results_table_level_title(mock_result, mock_tt_cls, db_session_mock): + run_id = str(uuid4()) + r1 = MagicMock() + r1.status = TestResultStatus.Passed + r1.test_type = "Row_Ct" + r1.test_definition_id = uuid4() + r1.table_name = "orders" + r1.column_names = None + r1.result_measure = "1000" + r1.threshold_value = "500" + r1.message = None + mock_result.select_results.return_value = [r1] + + tt = MagicMock() + tt.test_type = "Row_Ct" + tt.test_name_short = "Row Count" + mock_tt_cls.select_where.return_value = [tt] + + from testgen.mcp.tools.test_results import get_test_results + + result = get_test_results(run_id) + + assert "Row Count on `orders`" in result + 
assert "` in `" not in result + + @patch("testgen.mcp.tools.test_results.TestResult") def test_get_test_results_empty(mock_result, db_session_mock): mock_result.select_results.return_value = [] @@ -49,16 +77,22 @@ def test_get_test_results_empty(mock_result, db_session_mock): assert "No test results found" in result +@patch("testgen.mcp.tools.test_results.TestType") @patch("testgen.mcp.tools.test_results.TestResult") -def test_get_test_results_with_filters(mock_result, db_session_mock): +def test_get_test_results_with_filters(mock_result, mock_tt_cls, db_session_mock): + tt = MagicMock() + tt.test_type = "Alpha_Trunc" + tt.test_name_short = "Alpha Truncation" + mock_tt_cls.select_where.return_value = [tt] mock_result.select_results.return_value = [] from testgen.mcp.tools.test_results import get_test_results - result = get_test_results(str(uuid4()), status="Failed", table_name="orders", test_type="Alpha_Trunc") + result = get_test_results(str(uuid4()), status="Failed", table_name="orders", test_type="Alpha Truncation") assert "status=Failed" in result assert "table=orders" in result + assert "type=Alpha Truncation" in result def test_get_test_results_invalid_uuid(db_session_mock): @@ -117,8 +151,7 @@ def test_get_failure_summary_by_test_type(mock_result, mock_tt_cls, db_session_m assert "Failed + Warning" in result assert "8" in result assert "Alpha Truncation" in result - assert "Alpha_Trunc" in result - assert "Test Name" in result + assert "Alpha_Trunc" not in result assert "Severity" in result assert "Failed" in result assert "Warning" in result @@ -158,8 +191,8 @@ def test_get_failure_summary_by_column(mock_result, db_session_mock): result = get_failure_summary(str(uuid4()), group_by="column") assert "Column" in result - assert "orders.total_value" in result - assert "orders (table-level)" in result + assert "`total_value` in `orders`" in result + assert "`orders` (table-level)" in result assert "get_test_type" not in result @@ -224,7 +257,7 @@ def 
test_get_test_result_history_basic(mock_result, mock_tt_cls, db_session_mock result = get_test_result_history(def_id) assert "Unique Percent" in result - assert "`Unique_Pct`" in result + assert "Unique_Pct" not in result assert "orders" in result assert "99.5" in result assert "88.0" in result From e694400de9369244c1f179fb31ece53a5004377d Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 4 Mar 2026 20:47:09 -0300 Subject: [PATCH 29/95] fix(ui): skip entire row on CDE error, not just the field (TG-988) Error rows (unrecognized CDE value) are now fully excluded from import instead of importing other fields. Preview and success counts treat error rows as skipped alongside unmatched rows. Co-Authored-By: Claude Opus 4.6 --- testgen/ui/views/dialogs/import_metadata_dialog.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/testgen/ui/views/dialogs/import_metadata_dialog.py b/testgen/ui/views/dialogs/import_metadata_dialog.py index 6d779a24..92d9a2dc 100644 --- a/testgen/ui/views/dialogs/import_metadata_dialog.py +++ b/testgen/ui/views/dialogs/import_metadata_dialog.py @@ -148,7 +148,7 @@ def _match_and_validate( fields, bad_cde = _extract_metadata_fields(row, blank_behavior) fields, truncated = _truncate_fields(fields) - if fields: + if fields and not bad_cde: table_rows.append({"table_id": table_id, "table_name": table_name, **fields}) preview_row.update(fields) @@ -166,7 +166,7 @@ def _match_and_validate( fields, bad_cde = _extract_metadata_fields(row, blank_behavior) fields, truncated = _truncate_fields(fields) - if fields: + if fields and not bad_cde: column_rows.append( {"column_id": column_id, "table_name": table_name, "column_name": column_name, **fields} ) @@ -179,9 +179,11 @@ def _match_and_validate( metadata_columns = [c for c in METADATA_COLUMNS if c in df.columns] # Count matched vs skipped rows from preview - matched_tables = sum(1 for r in preview_rows if not r.get("column_name") and r.get("_status") != "unmatched") - 
matched_columns = sum(1 for r in preview_rows if r.get("column_name") and r.get("_status") != "unmatched") - skipped = sum(1 for r in preview_rows if r.get("_status") == "unmatched") + # "ok" and "warning" rows will be imported; "error" and "unmatched" rows are skipped + _importable = {"ok", "warning"} + matched_tables = sum(1 for r in preview_rows if not r.get("column_name") and r.get("_status") in _importable) + matched_columns = sum(1 for r in preview_rows if r.get("column_name") and r.get("_status") in _importable) + skipped = sum(1 for r in preview_rows if r.get("_status") not in _importable) return { "table_rows": table_rows, @@ -238,7 +240,7 @@ def _truncate_fields(fields: dict) -> tuple[dict, list[str]]: return fields, truncated -def _set_row_status(preview_row: dict, bad_cde: int, truncated: list[str]) -> None: +def _set_row_status(preview_row: dict, bad_cde: bool, truncated: list[str]) -> None: issues = [] if bad_cde: issues.append("Unrecognized CDE value (expected Yes/No) — skipped") From 99e8781aac20e8e7056dd16a4b8a091cdb57cf6c Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Thu, 5 Mar 2026 14:52:34 -0300 Subject: [PATCH 30/95] refactor(mcp): replace global admin bypass with role-based permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove is_global_admin special case from MCP permissions — access is now determined solely by project memberships and roles. Rename ProjectAccess → ProjectPermissions with allowed_codes as a computed property. Hide connection names from catalog users in inventory while keeping table groups visible. Update reference tool wording. 
Co-Authored-By: Claude Opus 4.6 --- testgen/mcp/permissions.py | 97 ++---- testgen/mcp/services/inventory_service.py | 44 ++- testgen/mcp/tools/discovery.py | 21 +- testgen/mcp/tools/reference.py | 10 +- testgen/mcp/tools/test_results.py | 16 +- testgen/mcp/tools/test_runs.py | 6 +- tests/unit/mcp/conftest.py | 37 ++- tests/unit/mcp/test_inventory_service.py | 66 ++-- tests/unit/mcp/test_permissions.py | 377 ++++++---------------- tests/unit/mcp/test_tools_discovery.py | 89 ++--- tests/unit/mcp/test_tools_test_results.py | 35 +- tests/unit/mcp/test_tools_test_runs.py | 24 +- 12 files changed, 272 insertions(+), 550 deletions(-) diff --git a/testgen/mcp/permissions.py b/testgen/mcp/permissions.py index a6218331..ed45a645 100644 --- a/testgen/mcp/permissions.py +++ b/testgen/mcp/permissions.py @@ -12,31 +12,42 @@ _NOT_SET = object() _mcp_username: contextvars.ContextVar[str | None] = contextvars.ContextVar("mcp_username", default=None) -_mcp_project_access: contextvars.ContextVar["ProjectAccess | object"] = contextvars.ContextVar( - "mcp_project_access", default=_NOT_SET +_mcp_project_permissions: contextvars.ContextVar["ProjectPermissions | object"] = contextvars.ContextVar( + "mcp_project_permissions", default=_NOT_SET ) class MCPPermissionDenied(Exception): - """Raised by ProjectAccess when access is denied. Caught by the decorator.""" + """Raised by ProjectPermissions when access is denied. 
Caught by the decorator.""" @dataclass(frozen=True, slots=True) -class ProjectAccess: - is_unrestricted: bool - memberships: dict[str, str] +class ProjectPermissions: + memberships: dict[str, str] # {project_code: role} permission: str - allowed_codes: frozenset[str] + + def codes_allowed_to(self, permission: str) -> list[str]: + """Project codes where the user's role includes the given permission.""" + allowed_roles = PluginHook.instance().rbac.get_roles_with_permission(permission) + return [code for code, role in self.memberships.items() if role in allowed_roles] + + @property + def allowed_codes(self) -> list[str]: + """Project codes for the decorator's permission.""" + return self.codes_allowed_to(self.permission) + + def has_access(self, project_code: str) -> bool: + """For filtering lists — no exception, just a bool.""" + return project_code in self.allowed_codes def verify_access(self, project_code: str, not_found: str) -> None: """Raise MCPPermissionDenied if user can't access this project. - - Admin: always passes (no-op). - Has access: passes. - Has membership but wrong role: raises with denial message. - No membership: raises with not_found (hides project existence). """ - if self.is_unrestricted or project_code in self.allowed_codes: + if project_code in self.allowed_codes: return if project_code in self.memberships: raise MCPPermissionDenied( @@ -44,24 +55,6 @@ def verify_access(self, project_code: str, not_found: str) -> None: ) raise MCPPermissionDenied(not_found) - def has_access(self, project_code: str) -> bool: - """For filtering lists — no exception, just a bool.""" - return self.is_unrestricted or project_code in self.allowed_codes - - @property - def query_codes(self) -> list[str] | None: - """Project codes for SQL WHERE. None = no filter (admin).""" - return None if self.is_unrestricted else list(self.allowed_codes) - - def query_codes_for(self, permission: str) -> list[str] | None: - """Project codes for a different permission (e.g. 
'view' inside a 'catalog' tool).""" - if self.is_unrestricted: - return None - if permission == self.permission: - return list(self.allowed_codes) - allowed_roles = PluginHook.instance().rbac.get_roles_with_permission(permission) - return [code for code, role in self.memberships.items() if role in allowed_roles] - def set_mcp_username(username: str | None) -> None: """Store the authenticated username (called by JWTTokenVerifier).""" @@ -82,55 +75,35 @@ def get_current_mcp_user() -> User: return user -def _compute_project_access(user: User, permission: str) -> ProjectAccess: - """Build a ProjectAccess for the given user and permission.""" - if user.is_global_admin: - return ProjectAccess( - is_unrestricted=True, - memberships={}, - permission=permission, - allowed_codes=frozenset(), - ) - - allowed_roles = PluginHook.instance().rbac.get_roles_with_permission(permission) +def _compute_project_permissions(user: User, permission: str) -> ProjectPermissions: + """Build a ProjectPermissions for the given user and permission.""" memberships_list = ProjectMembership.get_memberships_for_user(user.id) - memberships = {m.project_code: m.role for m in memberships_list} - allowed_codes = frozenset(code for code, role in memberships.items() if role in allowed_roles) - - return ProjectAccess( - is_unrestricted=False, - memberships=memberships, + return ProjectPermissions( + memberships={m.project_code: m.role for m in memberships_list}, permission=permission, - allowed_codes=allowed_codes, ) -def get_project_access() -> ProjectAccess: - """Retrieve the ProjectAccess computed by @mcp_permission for the current request. +def get_project_permissions() -> "ProjectPermissions": + """Retrieve the ProjectPermissions computed by @mcp_permission for the current request. Raises RuntimeError if called without @mcp_permission — prevents silent - admin-level access when a developer forgets to add the decorator. + unfiltered access when a developer forgets to add the decorator. 
""" - value = _mcp_project_access.get() + value = _mcp_project_permissions.get() if value is _NOT_SET: raise RuntimeError( - "get_project_access() called without @mcp_permission — add the decorator to this tool" + "get_project_permissions() called without @mcp_permission — add the decorator to this tool" ) return value # type: ignore[return-value] -def resolve_project_access(permission: str) -> ProjectAccess: - """Compute a ProjectAccess for a specific permission, using the current MCP user.""" - user = get_current_mcp_user() - return _compute_project_access(user, permission) - - def mcp_permission(permission: str) -> Callable: """Decorator that enforces role-based project filtering for MCP tools. - Resolves the authenticated user, computes a ProjectAccess for the given + Resolves the authenticated user, computes a ProjectPermissions for the given permission, and stores it in a ContextVar. The tool retrieves the value - via ``get_project_access()``. + via ``get_project_permissions()``. If the user has no projects with the required permission, returns an early denial message. Catches MCPPermissionDenied raised by tool code @@ -141,16 +114,16 @@ def decorator(fn: Callable) -> Callable: @functools.wraps(fn) def wrapper(*args, **kwargs): user = get_current_mcp_user() - access = _compute_project_access(user, permission) - if not access.is_unrestricted and not access.allowed_codes: + perms = _compute_project_permissions(user, permission) + if not perms.allowed_codes: return "Your role does not include the necessary permission for this operation on any project." 
- tok = _mcp_project_access.set(access) + tok = _mcp_project_permissions.set(perms) try: return fn(*args, **kwargs) except MCPPermissionDenied as e: return str(e) finally: - _mcp_project_access.reset(tok) + _mcp_project_permissions.reset(tok) return wrapper diff --git a/testgen/mcp/services/inventory_service.py b/testgen/mcp/services/inventory_service.py index d6ca6f84..55d40045 100644 --- a/testgen/mcp/services/inventory_service.py +++ b/testgen/mcp/services/inventory_service.py @@ -8,15 +8,16 @@ def get_inventory( - project_codes: list[str] | None = None, - view_project_codes: list[str] | None = None, + project_codes: list[str], + view_project_codes: list[str], ) -> str: """Build a markdown inventory of all projects, connections, table groups, and test suites. Args: - project_codes: Projects the user can see (None = all). - view_project_codes: Projects where the user has 'view' permission (None = all). - When set, suites are hidden for projects not in this list. + project_codes: Projects the user can see (based on decorator permission). + view_project_codes: Projects where the user has 'view' permission. + Connection names and test suites are only shown for these projects. + Table groups are always shown so catalog users can browse tables. 
""" session = get_current_session() @@ -43,8 +44,7 @@ def get_inventory( ) ) - if project_codes is not None: - query = query.where(Project.project_code.in_(project_codes)) + query = query.where(Project.project_code.in_(project_codes)) query = query.order_by( Project.project_name, Connection.connection_name, TableGroup.table_groups_name, TestSuite.test_suite, @@ -89,29 +89,36 @@ def get_inventory( ) compact_groups = total_groups > 50 + view_codes_set = set(view_project_codes) + # Format as Markdown lines = ["# Data Inventory\n"] for project_code, proj in projects.items(): - can_view_suites = view_project_codes is None or project_code in view_project_codes + can_view = project_code in view_codes_set lines.append(f"## Project: {proj['name']} (`{project_code}`)\n") if not proj["connections"]: - lines.append("_No connections configured._\n") + if can_view: + lines.append("_No connections configured._\n") + else: + lines.append("_No table groups._\n") continue for _conn_id, conn in proj["connections"].items(): - lines.append(f"### Connection: {conn['name']}\n") + if can_view: + lines.append(f"### Connection: {conn['name']}\n") if not conn["groups"]: - lines.append("_No table groups._\n") + if can_view: + lines.append("_No table groups._\n") continue for group_id, group in conn["groups"].items(): - if compact_groups: + if compact_groups or not can_view: lines.append( - f"- **{group['name']}** (schema: `{group['schema']}`, " - f"{len(group['suites'])} test suites)" + f"- **{group['name']}**: id: `{group_id}`, schema: `{group['schema']}`, " + f"test suites: {len(group['suites'])}" ) continue @@ -119,15 +126,6 @@ def get_inventory( f"#### Table Group: {group['name']} (id: `{group_id}`, schema: `{group['schema']}`)\n" ) - if not can_view_suites: - if group["suites"]: - lines.append( - f"_{len(group['suites'])} test suite(s) — requires `view` permission._\n" - ) - else: - lines.append("_No test suites._\n") - continue - if not group["suites"]: lines.append("_No test 
suites._\n") continue diff --git a/testgen/mcp/tools/discovery.py b/testgen/mcp/tools/discovery.py index cbadeff3..360c2ac4 100644 --- a/testgen/mcp/tools/discovery.py +++ b/testgen/mcp/tools/discovery.py @@ -4,7 +4,7 @@ from testgen.common.models.data_table import DataTable from testgen.common.models.project import Project from testgen.common.models.test_suite import TestSuite -from testgen.mcp.permissions import get_project_access, mcp_permission +from testgen.mcp.permissions import get_project_permissions, mcp_permission @with_database_session @@ -18,8 +18,11 @@ def get_data_inventory() -> str: """ from testgen.mcp.services.inventory_service import get_inventory - access = get_project_access() - return get_inventory(project_codes=access.query_codes, view_project_codes=access.query_codes_for("view")) + perms = get_project_permissions() + return get_inventory( + project_codes=perms.allowed_codes, + view_project_codes=perms.codes_allowed_to("view"), + ) @with_database_session @@ -29,8 +32,8 @@ def list_projects() -> str: Returns project codes and names. Use these to scope queries to specific projects. """ - access = get_project_access() - projects = [p for p in Project.select_where() if access.has_access(p.project_code)] + perms = get_project_permissions() + projects = [p for p in Project.select_where() if perms.has_access(p.project_code)] if not projects: return "No projects found." @@ -53,8 +56,8 @@ def list_test_suites(project_code: str) -> str: if not project_code: return "Missing required parameter `project_code`." 
- access = get_project_access() - access.verify_access(project_code, not_found=f"No test suites found for project `{project_code}`.") + perms = get_project_permissions() + perms.verify_access(project_code, not_found=f"No test suites found for project `{project_code}`.") summaries = TestSuite.select_summary(project_code) @@ -103,8 +106,8 @@ def list_tables(table_group_id: str, limit: int = 200, page: int = 1) -> str: except (ValueError, AttributeError) as err: raise ValueError(f"Invalid table_group_id: `{table_group_id}` is not a valid UUID.") from err - access = get_project_access() - project_codes = access.query_codes + perms = get_project_permissions() + project_codes = perms.allowed_codes offset = (page - 1) * limit table_names = DataTable.select_table_names(group_uuid, limit=limit, offset=offset, project_codes=project_codes) diff --git a/testgen/mcp/tools/reference.py b/testgen/mcp/tools/reference.py index 3d25f899..9887effa 100644 --- a/testgen/mcp/tools/reference.py +++ b/testgen/mcp/tools/reference.py @@ -81,11 +81,11 @@ def glossary_resource() -> str: ## Test Result Statuses -- **Passed** — Test passed within acceptable thresholds. -- **Warning** — Test exceeded its threshold. Severity configured as Warning. -- **Failed** — Test exceeded its threshold. Severity configured as Fail. -- **Error** — Test could not execute (e.g., SQL error, missing table). -- **Log** — Informational result, not scored. +- **Passed** — Data meets test criteria. +- **Warning** — Data does not meet test criteria. Severity configured as Warning. +- **Failed** — Data does not meet test criteria. Severity configured as Fail. +- **Error** — Test could not execute (e.g., missing table or permission issue). +- **Log** — Informational result recorded for reference. 
## Disposition diff --git a/testgen/mcp/tools/test_results.py b/testgen/mcp/tools/test_results.py index 2c35df9d..9eff3e22 100644 --- a/testgen/mcp/tools/test_results.py +++ b/testgen/mcp/tools/test_results.py @@ -3,7 +3,7 @@ from testgen.common.models import with_database_session from testgen.common.models.test_definition import TestType from testgen.common.models.test_result import TestResult, TestResultStatus -from testgen.mcp.permissions import get_project_access, mcp_permission +from testgen.mcp.permissions import get_project_permissions, mcp_permission def _parse_uuid(value: str, label: str = "ID") -> UUID: @@ -55,7 +55,7 @@ def get_test_results( test_type_code = _resolve_test_type(test_type) if test_type else None - access = get_project_access() + perms = get_project_permissions() results = TestResult.select_results( test_run_id=run_uuid, @@ -64,7 +64,7 @@ def get_test_results( test_type=test_type_code, limit=limit, offset=offset, - project_codes=access.query_codes, + project_codes=perms.allowed_codes, ) if not results: @@ -116,12 +116,12 @@ def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: """ run_uuid = _parse_uuid(test_run_id, "test_run_id") - access = get_project_access() + perms = get_project_permissions() # Map public param names to model field names model_group_map = {"table": "table_name", "column": "column_names"} model_group_by = model_group_map.get(group_by, group_by) - failures = TestResult.select_failures(test_run_id=run_uuid, group_by=model_group_by, project_codes=access.query_codes) + failures = TestResult.select_failures(test_run_id=run_uuid, group_by=model_group_by, project_codes=perms.allowed_codes) if not failures: return f"No confirmed failures found for run `{test_run_id}`." 
@@ -164,7 +164,7 @@ def get_failure_summary(test_run_id: str, group_by: str = "test_type") -> str: if group_by == "test_type": lines.append( "\nCheck `testgen://test-types` to understand what each test type checks " - "and `get_test_type(test_type='Alpha Truncation')` to fetch more details." + "and `get_test_type(test_type='...')` to fetch more details." ) return "\n".join(lines) @@ -187,9 +187,9 @@ def get_test_result_history( def_uuid = _parse_uuid(test_definition_id, "test_definition_id") offset = (page - 1) * limit - access = get_project_access() + perms = get_project_permissions() - results = TestResult.select_history(test_definition_id=def_uuid, limit=limit, offset=offset, project_codes=access.query_codes) + results = TestResult.select_history(test_definition_id=def_uuid, limit=limit, offset=offset, project_codes=perms.allowed_codes) if not results: return f"No historical results found for test definition `{test_definition_id}`." diff --git a/testgen/mcp/tools/test_runs.py b/testgen/mcp/tools/test_runs.py index 36cff652..26053832 100644 --- a/testgen/mcp/tools/test_runs.py +++ b/testgen/mcp/tools/test_runs.py @@ -1,7 +1,7 @@ from testgen.common.models import with_database_session from testgen.common.models.test_run import TestRun from testgen.common.models.test_suite import TestSuite -from testgen.mcp.permissions import get_project_access, mcp_permission +from testgen.mcp.permissions import get_project_permissions, mcp_permission @with_database_session @@ -17,8 +17,8 @@ def get_recent_test_runs(project_code: str, test_suite: str | None = None, limit if not project_code: return "Missing required parameter `project_code`." 
- access = get_project_access() - access.verify_access(project_code, not_found=f"No completed test runs found in project `{project_code}`.") + perms = get_project_permissions() + perms.verify_access(project_code, not_found=f"No completed test runs found in project `{project_code}`.") test_suite_id = None if test_suite: diff --git a/tests/unit/mcp/conftest.py b/tests/unit/mcp/conftest.py index 46541d59..dd706d27 100644 --- a/tests/unit/mcp/conftest.py +++ b/tests/unit/mcp/conftest.py @@ -1,26 +1,47 @@ from unittest.mock import MagicMock, patch +from uuid import uuid4 import pytest from testgen.mcp.permissions import set_mcp_username +# Fictional role matrix for tests. role_a has full access, role_c is restricted. +TEST_PERM_MATRIX = { + "view": ["role_a", "role_b"], + "catalog": ["role_a", "role_b", "role_c"], +} + + +def _test_roles_with_permission(permission): + return TEST_PERM_MATRIX.get(permission, []) + @pytest.fixture(autouse=True) def mcp_user(): """Set up an authenticated MCP user for all tool tests. - Patches User.get to return a global admin by default (no filtering). - The @mcp_permission decorator calls get_current_mcp_user() which uses - User.get, then get_allowed_project_codes() which returns None for - global admins — so the ContextVar is set to None (no project filtering). + Default: user has 'role_a' on 'demo' project (full access). + The @mcp_permission decorator passes for any permission. - Individual tests can patch get_allowed_project_codes to simulate - scoped access. + Tests needing scoped access patch _compute_project_permissions directly. 
""" set_mcp_username("test_user") user = MagicMock() - user.is_global_admin = True - with patch("testgen.mcp.permissions.User") as mock_user_cls: + user.id = uuid4() + + membership = MagicMock() + membership.project_code = "demo" + membership.role = "role_a" + + with ( + patch("testgen.mcp.permissions.User") as mock_user_cls, + patch("testgen.mcp.permissions.ProjectMembership") as mock_membership, + patch("testgen.mcp.permissions.PluginHook") as mock_hook, + ): mock_user_cls.get.return_value = user + mock_membership.get_memberships_for_user.return_value = [membership] + mock_hook.instance.return_value.rbac.get_roles_with_permission.side_effect = ( + _test_roles_with_permission + ) yield user set_mcp_username(None) diff --git a/tests/unit/mcp/test_inventory_service.py b/tests/unit/mcp/test_inventory_service.py index 8ea34436..715ef476 100644 --- a/tests/unit/mcp/test_inventory_service.py +++ b/tests/unit/mcp/test_inventory_service.py @@ -34,7 +34,7 @@ def test_get_inventory_basic(mock_select, session_mock): from testgen.mcp.services.inventory_service import get_inventory - result = get_inventory() + result = get_inventory(project_codes=["demo"], view_project_codes=["demo"]) assert "Data Inventory" in result assert "Demo" in result @@ -49,7 +49,7 @@ def test_get_inventory_empty(mock_select, session_mock): from testgen.mcp.services.inventory_service import get_inventory - result = get_inventory() + result = get_inventory(project_codes=["demo"], view_project_codes=["demo"]) assert "Data Inventory" in result @@ -61,7 +61,7 @@ def test_get_inventory_project_no_connections(mock_select, session_mock): from testgen.mcp.services.inventory_service import get_inventory - result = get_inventory() + result = get_inventory(project_codes=["demo"], view_project_codes=["demo"]) assert "Demo" in result assert "No connections" in result @@ -73,7 +73,7 @@ def test_get_inventory_includes_list_tables_hint(mock_select, session_mock): from testgen.mcp.services.inventory_service import 
get_inventory - result = get_inventory() + result = get_inventory(project_codes=["demo"], view_project_codes=["demo"]) assert "list_tables" in result @@ -94,16 +94,16 @@ def test_get_inventory_compact_groups(mock_select, session_mock): from testgen.mcp.services.inventory_service import get_inventory - result = get_inventory() + result = get_inventory(project_codes=["demo"], view_project_codes=["demo"]) - # Compact groups: single line with "X test suites", no "#### Table Group:" headers - assert "test suites)" in result + # Compact groups: single line with "test suites: N", no "#### Table Group:" headers + assert "test suites:" in result assert "#### Table Group:" not in result @patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_hides_suites_without_view_permission(mock_select, session_mock): - """Suites are hidden for projects where user lacks view permission.""" +def test_get_inventory_without_view_hides_connections_and_suites(mock_select, session_mock): + """Without view permission: connection names hidden, table groups shown in compact format, suites hidden.""" tg_id = uuid4() suite_id = uuid4() row = _make_row(table_group_id=tg_id, test_suite_id=suite_id, test_suite="Secret Suite") @@ -114,15 +114,17 @@ def test_get_inventory_hides_suites_without_view_permission(mock_select, session result = get_inventory(project_codes=["demo"], view_project_codes=[]) assert "Demo" in result - assert "Secret Suite" not in result - assert str(suite_id) not in result - assert "requires `view` permission" in result - assert "1 test suite(s)" in result + assert "main" not in result # connection name hidden + assert "core" in result # table group still shown + assert str(tg_id) in result # table group id still shown + assert "Secret Suite" not in result # suite name hidden + assert str(suite_id) not in result # suite id hidden + assert "test suites: 1" in result # suite count shown @patch("testgen.mcp.services.inventory_service.select") -def 
test_get_inventory_shows_suites_with_view_permission(mock_select, session_mock): - """Suites are shown for projects where user has view permission.""" +def test_get_inventory_with_view_shows_all_details(mock_select, session_mock): + """With view permission: connections, table groups, and suites all shown.""" tg_id = uuid4() suite_id = uuid4() row = _make_row(table_group_id=tg_id, test_suite_id=suite_id, test_suite="Visible Suite") @@ -132,39 +134,7 @@ def test_get_inventory_shows_suites_with_view_permission(mock_select, session_mo result = get_inventory(project_codes=["demo"], view_project_codes=["demo"]) + assert "main" in result # connection name shown assert "Visible Suite" in result assert str(suite_id) in result assert "requires `view` permission" not in result - - -@patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_view_none_shows_all_suites(mock_select, session_mock): - """When view_project_codes is None (global admin), all suites shown.""" - tg_id = uuid4() - suite_id = uuid4() - row = _make_row(table_group_id=tg_id, test_suite_id=suite_id, test_suite="Admin Suite") - session_mock.execute.return_value.all.return_value = [row] - - from testgen.mcp.services.inventory_service import get_inventory - - result = get_inventory(project_codes=None, view_project_codes=None) - - assert "Admin Suite" in result - assert "requires `view` permission" not in result - - -@patch("testgen.mcp.services.inventory_service.select") -def test_get_inventory_no_suites_without_view_shows_no_suites(mock_select, session_mock): - """When group has no suites and user lacks view, shows 'No test suites'.""" - tg_id = uuid4() - row = _make_row(table_group_id=tg_id, test_suite_id=None, test_suite=None) - # Remove the suite from the row - row.test_suite_id = None - session_mock.execute.return_value.all.return_value = [row] - - from testgen.mcp.services.inventory_service import get_inventory - - result = get_inventory(project_codes=["demo"], view_project_codes=[]) 
- - assert "No test suites" in result - assert "requires `view` permission" not in result diff --git a/tests/unit/mcp/test_permissions.py b/tests/unit/mcp/test_permissions.py index 0639f2a1..da980825 100644 --- a/tests/unit/mcp/test_permissions.py +++ b/tests/unit/mcp/test_permissions.py @@ -6,13 +6,12 @@ from testgen.mcp.permissions import ( _NOT_SET, MCPPermissionDenied, - ProjectAccess, - _compute_project_access, - _mcp_project_access, + ProjectPermissions, + _compute_project_permissions, + _mcp_project_permissions, get_current_mcp_user, - get_project_access, + get_project_permissions, mcp_permission, - resolve_project_access, set_mcp_username, ) @@ -20,10 +19,10 @@ @pytest.fixture(autouse=True) def _reset_contextvars(): set_mcp_username(None) - tok = _mcp_project_access.set(_NOT_SET) + tok = _mcp_project_permissions.set(_NOT_SET) yield set_mcp_username(None) - _mcp_project_access.reset(tok) + _mcp_project_permissions.reset(tok) # --- get_current_mcp_user --- @@ -55,350 +54,169 @@ def test_get_current_mcp_user_returns_user(mock_user): mock_user.get.assert_called_once_with("admin") -# --- _compute_project_access --- - - -def test_compute_project_access_global_admin(): - user = MagicMock() - user.is_global_admin = True - - result = _compute_project_access(user, "view") - - assert result.is_unrestricted is True - assert result.memberships == {} - assert result.permission == "view" - assert result.allowed_codes == frozenset() +# --- _compute_project_permissions --- @patch("testgen.mcp.permissions.ProjectMembership") -@patch("testgen.mcp.permissions.PluginHook") -def test_compute_project_access_os_default_all_roles_allowed(mock_hook, mock_membership): - """OS default: get_roles_with_permission returns all roles — all memberships returned.""" +def test_compute_project_permissions_returns_memberships(mock_membership): user = MagicMock() - user.is_global_admin = False user.id = uuid4() - mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ - 
"admin", "data_quality", "analyst", "business", "catalog", - ] - m1 = MagicMock() m1.project_code = "proj_a" - m1.role = "admin" + m1.role = "role_a" m2 = MagicMock() m2.project_code = "proj_b" - m2.role = "catalog" + m2.role = "role_c" mock_membership.get_memberships_for_user.return_value = [m1, m2] - result = _compute_project_access(user, "view") + result = _compute_project_permissions(user, "view") - assert result.is_unrestricted is False - assert result.memberships == {"proj_a": "admin", "proj_b": "catalog"} - assert result.allowed_codes == frozenset(["proj_a", "proj_b"]) + assert result.memberships == {"proj_a": "role_a", "proj_b": "role_c"} + assert result.permission == "view" + mock_membership.get_memberships_for_user.assert_called_once_with(user.id) @patch("testgen.mcp.permissions.ProjectMembership") -@patch("testgen.mcp.permissions.PluginHook") -def test_compute_project_access_filters_by_role(mock_hook, mock_membership): - """Enterprise: only memberships with allowed roles are returned.""" +def test_compute_project_permissions_no_memberships(mock_membership): user = MagicMock() - user.is_global_admin = False user.id = uuid4() + mock_membership.get_memberships_for_user.return_value = [] - # "view" permission: admin, data_quality, analyst, business — NOT catalog - mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ - "admin", "data_quality", "analyst", "business", - ] - - m1 = MagicMock() - m1.project_code = "proj_a" - m1.role = "admin" - m2 = MagicMock() - m2.project_code = "proj_b" - m2.role = "catalog" - mock_membership.get_memberships_for_user.return_value = [m1, m2] - - result = _compute_project_access(user, "view") - - assert result.allowed_codes == frozenset(["proj_a"]) - assert result.memberships == {"proj_a": "admin", "proj_b": "catalog"} + result = _compute_project_permissions(user, "view") + assert result.memberships == {} + assert result.permission == "view" -@patch("testgen.mcp.permissions.ProjectMembership") 
-@patch("testgen.mcp.permissions.PluginHook") -def test_compute_project_access_catalog_user_with_catalog_permission(mock_hook, mock_membership): - """Catalog user calling catalog-permission tool gets their projects.""" - user = MagicMock() - user.is_global_admin = False - user.id = uuid4() - - mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ - "admin", "data_quality", "analyst", "business", "catalog", - ] - m1 = MagicMock() - m1.project_code = "proj_a" - m1.role = "catalog" - mock_membership.get_memberships_for_user.return_value = [m1] +# --- ProjectPermissions.codes_allowed_to --- +# These rely on the conftest's PluginHook mock (TEST_PERM_MATRIX). - result = _compute_project_access(user, "catalog") - assert result.allowed_codes == frozenset(["proj_a"]) +def test_codes_allowed_to_filters_by_role(): + perms = ProjectPermissions( + memberships={"proj_a": "role_a", "proj_b": "role_c"}, + permission="catalog", + ) + # "view" includes role_a but not role_c + result = perms.codes_allowed_to("view") + assert result == ["proj_a"] -@patch("testgen.mcp.permissions.ProjectMembership") -@patch("testgen.mcp.permissions.PluginHook") -def test_compute_project_access_catalog_user_with_view_permission_gets_empty(mock_hook, mock_membership): - """Catalog user calling view-permission tool gets empty allowed set.""" - user = MagicMock() - user.is_global_admin = False - user.id = uuid4() +def test_codes_allowed_to_all_matching(): + perms = ProjectPermissions( + memberships={"proj_a": "role_a", "proj_b": "role_b"}, + permission="catalog", + ) + # "catalog" includes all roles + result = perms.codes_allowed_to("catalog") + assert sorted(result) == ["proj_a", "proj_b"] - # "view" excludes catalog role - mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ - "admin", "data_quality", "analyst", "business", - ] - m1 = MagicMock() - m1.project_code = "proj_a" - m1.role = "catalog" - mock_membership.get_memberships_for_user.return_value 
= [m1] +def test_codes_allowed_to_none_matching(): + perms = ProjectPermissions( + memberships={"proj_a": "role_c"}, + permission="catalog", + ) + # "view" excludes role_c + result = perms.codes_allowed_to("view") + assert result == [] - result = _compute_project_access(user, "view") - assert result.allowed_codes == frozenset() +# --- ProjectPermissions.allowed_codes --- -# --- ProjectAccess.verify_access --- +def test_allowed_codes_uses_decorator_permission(): + perms = ProjectPermissions( + memberships={"proj_a": "role_a", "proj_b": "role_c"}, + permission="view", + ) + # "view" includes role_a but not role_c + assert perms.allowed_codes == ["proj_a"] -def test_verify_access_admin_always_passes(): - access = ProjectAccess(is_unrestricted=True, memberships={}, permission="view", allowed_codes=frozenset()) - access.verify_access("any_project", not_found="not found") +# --- ProjectPermissions.verify_access --- def test_verify_access_allowed_passes(): - access = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "admin"}, - permission="view", - allowed_codes=frozenset(["proj_a"]), - ) - access.verify_access("proj_a", not_found="not found") + perms = ProjectPermissions(memberships={"proj_a": "role_a"}, permission="view") + perms.verify_access("proj_a", not_found="not found") def test_verify_access_membership_but_wrong_role_raises(): - access = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "admin", "proj_b": "catalog"}, + perms = ProjectPermissions( + memberships={"proj_a": "role_a", "proj_b": "role_c"}, permission="view", - allowed_codes=frozenset(["proj_a"]), ) with pytest.raises(MCPPermissionDenied, match="necessary permission"): - access.verify_access("proj_b", not_found="not found") + perms.verify_access("proj_b", not_found="not found") def test_verify_access_no_membership_raises_not_found(): - access = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "admin"}, + perms = ProjectPermissions( + memberships={"proj_a": 
"role_a"}, permission="view", - allowed_codes=frozenset(["proj_a"]), ) with pytest.raises(MCPPermissionDenied, match="not found"): - access.verify_access("secret", not_found="not found") - - -# --- ProjectAccess.has_access --- - + perms.verify_access("secret", not_found="not found") -def test_has_access_admin(): - access = ProjectAccess(is_unrestricted=True, memberships={}, permission="view", allowed_codes=frozenset()) - assert access.has_access("anything") is True - -def test_has_access_allowed(): - access = ProjectAccess( - is_unrestricted=False, memberships={"proj_a": "admin"}, permission="view", allowed_codes=frozenset(["proj_a"]), - ) - assert access.has_access("proj_a") is True - assert access.has_access("proj_b") is False +# --- ProjectPermissions.has_access --- -# --- ProjectAccess.query_codes --- +def test_has_access(): + perms = ProjectPermissions(memberships={"proj_a": "role_a"}, permission="view") + assert perms.has_access("proj_a") is True + assert perms.has_access("proj_b") is False -def test_query_codes_admin(): - access = ProjectAccess(is_unrestricted=True, memberships={}, permission="view", allowed_codes=frozenset()) - assert access.query_codes is None +# --- get_project_permissions --- -def test_query_codes_scoped(): - access = ProjectAccess( - is_unrestricted=False, memberships={"proj_a": "admin"}, permission="view", allowed_codes=frozenset(["proj_a"]), - ) - assert access.query_codes == ["proj_a"] - - -# --- ProjectAccess.query_codes_for --- - - -@patch("testgen.mcp.permissions.PluginHook") -def test_query_codes_for_admin(mock_hook): - access = ProjectAccess(is_unrestricted=True, memberships={}, permission="catalog", allowed_codes=frozenset()) - assert access.query_codes_for("view") is None - - -def test_query_codes_for_same_permission(): - access = ProjectAccess( - is_unrestricted=False, memberships={"proj_a": "admin"}, permission="view", allowed_codes=frozenset(["proj_a"]), - ) - assert access.query_codes_for("view") == ["proj_a"] - - 
-@patch("testgen.mcp.permissions.PluginHook") -def test_query_codes_for_different_permission(mock_hook): - mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = ["admin"] - access = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "admin", "proj_b": "catalog"}, - permission="catalog", - allowed_codes=frozenset(["proj_a", "proj_b"]), - ) - result = access.query_codes_for("view") - assert result == ["proj_a"] - - -# --- get_project_access --- - - -def test_get_project_access_raises_without_decorator(): +def test_get_project_permissions_raises_without_decorator(): with pytest.raises(RuntimeError, match="add the decorator"): - get_project_access() + get_project_permissions() -def test_get_project_access_returns_set_value(): - access = ProjectAccess(is_unrestricted=True, memberships={}, permission="view", allowed_codes=frozenset()) - token = _mcp_project_access.set(access) +def test_get_project_permissions_returns_set_value(): + perms = ProjectPermissions(memberships={}, permission="view") + token = _mcp_project_permissions.set(perms) try: - assert get_project_access() is access + assert get_project_permissions() is perms finally: - _mcp_project_access.reset(token) - - -# --- resolve_project_access --- - - -@patch("testgen.mcp.permissions.User") -def test_resolve_project_access_global_admin(mock_user): - user = MagicMock() - user.is_global_admin = True - mock_user.get.return_value = user - set_mcp_username("admin") - - result = resolve_project_access("view") - - assert result.is_unrestricted is True - - -@patch("testgen.mcp.permissions.ProjectMembership") -@patch("testgen.mcp.permissions.PluginHook") -@patch("testgen.mcp.permissions.User") -def test_resolve_project_access_scoped_user(mock_user, mock_hook, mock_membership): - user = MagicMock() - user.is_global_admin = False - user.id = uuid4() - mock_user.get.return_value = user - set_mcp_username("scoped") - - 
mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = ["admin"] - - m1 = MagicMock() - m1.project_code = "proj_a" - m1.role = "admin" - mock_membership.get_memberships_for_user.return_value = [m1] - - result = resolve_project_access("view") - - assert result.allowed_codes == frozenset(["proj_a"]) + _mcp_project_permissions.reset(token) # --- mcp_permission decorator --- +# These rely on conftest's mocks (User, ProjectMembership, PluginHook). -@patch("testgen.mcp.permissions.User") -def test_mcp_permission_sets_contextvar_for_global_admin(mock_user): - user = MagicMock() - user.is_global_admin = True - mock_user.get.return_value = user - set_mcp_username("admin") +def test_mcp_permission_sets_contextvar(): + set_mcp_username("test") captured = {} @mcp_permission("view") def tool_fn(): - access = get_project_access() - captured["access"] = access + perms = get_project_permissions() + captured["perms"] = perms return "ok" result = tool_fn() assert result == "ok" - assert captured["access"].is_unrestricted is True - assert captured["access"].query_codes is None + assert "demo" in captured["perms"].allowed_codes + assert captured["perms"].memberships == {"demo": "role_a"} @patch("testgen.mcp.permissions.ProjectMembership") -@patch("testgen.mcp.permissions.PluginHook") -@patch("testgen.mcp.permissions.User") -def test_mcp_permission_sets_contextvar_for_scoped_user(mock_user, mock_hook, mock_membership): - user = MagicMock() - user.is_global_admin = False - user.id = uuid4() - mock_user.get.return_value = user - set_mcp_username("scoped") - - mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = [ - "admin", "data_quality", "analyst", "business", "catalog", - ] - - m1 = MagicMock() - m1.project_code = "proj_x" - m1.role = "admin" - mock_membership.get_memberships_for_user.return_value = [m1] - - captured = {} - - @mcp_permission("view") - def tool_fn(): - access = get_project_access() - captured["access"] = access - return 
"ok" - - result = tool_fn() - - assert result == "ok" - assert captured["access"].allowed_codes == frozenset(["proj_x"]) - assert captured["access"].memberships == {"proj_x": "admin"} - - -@patch("testgen.mcp.permissions.ProjectMembership") -@patch("testgen.mcp.permissions.PluginHook") -@patch("testgen.mcp.permissions.User") -def test_mcp_permission_early_return_when_no_allowed_codes(mock_user, mock_hook, mock_membership): +def test_mcp_permission_early_return_when_no_allowed_codes(mock_membership): """Decorator returns early if user has no projects with the required permission.""" - user = MagicMock() - user.is_global_admin = False - user.id = uuid4() - mock_user.get.return_value = user - set_mcp_username("scoped") - - # "view" excludes catalog role - mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = ["admin"] + set_mcp_username("test") m1 = MagicMock() m1.project_code = "proj_a" - m1.role = "catalog" + m1.role = "role_c" mock_membership.get_memberships_for_user.return_value = [m1] @mcp_permission("view") @@ -411,13 +229,9 @@ def tool_fn(): assert "role" in result.lower() -@patch("testgen.mcp.permissions.User") -def test_mcp_permission_catches_mcp_permission_denied(mock_user): +def test_mcp_permission_catches_mcp_permission_denied(): """Decorator catches MCPPermissionDenied and returns str(e).""" - user = MagicMock() - user.is_global_admin = True - mock_user.get.return_value = user - set_mcp_username("admin") + set_mcp_username("test") @mcp_permission("view") def tool_fn(): @@ -428,12 +242,8 @@ def tool_fn(): assert result == "Access denied for testing" -@patch("testgen.mcp.permissions.User") -def test_mcp_permission_resets_contextvar_after_call(mock_user): - user = MagicMock() - user.is_global_admin = True - mock_user.get.return_value = user - set_mcp_username("admin") +def test_mcp_permission_resets_contextvar_after_call(): + set_mcp_username("test") @mcp_permission("view") def tool_fn(): @@ -441,15 +251,10 @@ def tool_fn(): tool_fn() 
- assert _mcp_project_access.get() is _NOT_SET + assert _mcp_project_permissions.get() is _NOT_SET -@patch("testgen.mcp.permissions.User") -def test_mcp_permission_preserves_function_metadata(mock_user): - user = MagicMock() - user.is_global_admin = True - mock_user.get.return_value = user - +def test_mcp_permission_preserves_function_metadata(): @mcp_permission("view") def my_tool(x: int, y: str = "default") -> str: """Tool docstring.""" diff --git a/tests/unit/mcp/test_tools_discovery.py b/tests/unit/mcp/test_tools_discovery.py index ddd55947..48438cb6 100644 --- a/tests/unit/mcp/test_tools_discovery.py +++ b/tests/unit/mcp/test_tools_discovery.py @@ -1,7 +1,7 @@ from unittest.mock import MagicMock, patch from uuid import uuid4 -from testgen.mcp.permissions import ProjectAccess +from testgen.mcp.permissions import ProjectPermissions @patch("testgen.mcp.services.inventory_service.get_inventory") @@ -17,16 +17,13 @@ def test_get_data_inventory_returns_markdown(mock_get_inventory, db_session_mock @patch("testgen.mcp.services.inventory_service.get_inventory") -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_get_data_inventory_passes_project_codes_for_scoped_user( - mock_compute, mock_get_inventory, db_session_mock, mcp_user, + mock_compute, mock_get_inventory, db_session_mock, ): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "catalog"}, + mock_compute.return_value = ProjectPermissions( + memberships={"proj_a": "role_c"}, permission="catalog", - allowed_codes=frozenset(["proj_a"]), ) mock_get_inventory.return_value = "# Data Inventory" @@ -34,27 +31,19 @@ def test_get_data_inventory_passes_project_codes_for_scoped_user( get_data_inventory() - # query_codes_for("view") calls PluginHook — but catalog != view, so it goes through the branch. 
- # For this test, we verify the call was made with list form of allowed_codes for project_codes. call_kwargs = mock_get_inventory.call_args.kwargs assert call_kwargs["project_codes"] == ["proj_a"] @patch("testgen.mcp.services.inventory_service.get_inventory") -@patch("testgen.mcp.permissions.PluginHook") -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_get_data_inventory_view_codes_for_scoped_user( - mock_compute, mock_hook, mock_get_inventory, db_session_mock, mcp_user, + mock_compute, mock_get_inventory, db_session_mock, ): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "catalog", "proj_b": "admin"}, + mock_compute.return_value = ProjectPermissions( + memberships={"proj_a": "role_c", "proj_b": "role_a"}, permission="catalog", - allowed_codes=frozenset(["proj_a", "proj_b"]), ) - # "view" allows admin but not catalog - mock_hook.instance.return_value.rbac.get_roles_with_permission.return_value = ["admin"] mock_get_inventory.return_value = "# Data Inventory" from testgen.mcp.tools.discovery import get_data_inventory @@ -62,21 +51,10 @@ def test_get_data_inventory_view_codes_for_scoped_user( get_data_inventory() call_kwargs = mock_get_inventory.call_args.kwargs + # "view" includes role_a but not role_c assert call_kwargs["view_project_codes"] == ["proj_b"] -@patch("testgen.mcp.services.inventory_service.get_inventory") -def test_get_data_inventory_passes_none_for_global_admin(mock_get_inventory, db_session_mock, mcp_user): - mcp_user.is_global_admin = True - mock_get_inventory.return_value = "# Data Inventory" - - from testgen.mcp.tools.discovery import get_data_inventory - - get_data_inventory() - - mock_get_inventory.assert_called_once_with(project_codes=None, view_project_codes=None) - - @patch("testgen.mcp.tools.discovery.Project") def test_list_projects_returns_formatted(mock_project, 
db_session_mock): proj1 = MagicMock() @@ -93,7 +71,8 @@ def test_list_projects_returns_formatted(mock_project, db_session_mock): assert "Demo Project" in result assert "`demo`" in result - assert "Staging" in result + # "staging" is not in conftest's default memberships, so filtered out + assert "Staging" not in result @patch("testgen.mcp.tools.discovery.Project") @@ -108,14 +87,11 @@ def test_list_projects_empty(mock_project, db_session_mock): @patch("testgen.mcp.tools.discovery.Project") -@patch("testgen.mcp.permissions._compute_project_access") -def test_list_projects_filters_for_scoped_user(mock_compute, mock_project, db_session_mock, mcp_user): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"demo": "admin"}, +@patch("testgen.mcp.permissions._compute_project_permissions") +def test_list_projects_filters_for_scoped_user(mock_compute, mock_project, db_session_mock): + mock_compute.return_value = ProjectPermissions( + memberships={"demo": "role_a"}, permission="catalog", - allowed_codes=frozenset(["demo"]), ) proj1 = MagicMock() @@ -182,16 +158,13 @@ def test_list_test_suites_empty_project_code(db_session_mock): assert "project_code" in result -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_list_test_suites_returns_not_found_for_inaccessible_project( - mock_compute, db_session_mock, mcp_user, + mock_compute, db_session_mock, ): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"other_project": "admin"}, + mock_compute.return_value = ProjectPermissions( + memberships={"other_project": "role_a"}, permission="view", - allowed_codes=frozenset(["other_project"]), ) from testgen.mcp.tools.discovery import list_test_suites @@ -201,16 +174,13 @@ def test_list_test_suites_returns_not_found_for_inaccessible_project( assert "No test suites found for 
project `secret_project`" in result -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_list_test_suites_returns_denial_for_insufficient_permission( - mock_compute, db_session_mock, mcp_user, + mock_compute, db_session_mock, ): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"other_project": "admin", "secret_project": "catalog"}, + mock_compute.return_value = ProjectPermissions( + memberships={"other_project": "role_a", "secret_project": "role_c"}, permission="view", - allowed_codes=frozenset(["other_project"]), ) from testgen.mcp.tools.discovery import list_test_suites @@ -222,16 +192,13 @@ def test_list_test_suites_returns_denial_for_insufficient_permission( @patch("testgen.mcp.tools.discovery.DataTable") -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_list_tables_returns_not_found_for_inaccessible_group( - mock_compute, mock_dt, db_session_mock, mcp_user, + mock_compute, mock_dt, db_session_mock, ): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "admin"}, + mock_compute.return_value = ProjectPermissions( + memberships={"proj_a": "role_a"}, permission="catalog", - allowed_codes=frozenset(["proj_a"]), ) mock_dt.select_table_names.return_value = [] mock_dt.count_tables.return_value = 0 diff --git a/tests/unit/mcp/test_tools_test_results.py b/tests/unit/mcp/test_tools_test_results.py index cf3ed91c..b1b6cea2 100644 --- a/tests/unit/mcp/test_tools_test_results.py +++ b/tests/unit/mcp/test_tools_test_results.py @@ -4,7 +4,7 @@ import pytest from testgen.common.models.test_result import TestResultStatus -from testgen.mcp.permissions import ProjectAccess +from testgen.mcp.permissions import ProjectPermissions 
@patch("testgen.mcp.tools.test_results.TestType") @@ -110,14 +110,11 @@ def test_get_test_results_invalid_status(db_session_mock): @patch("testgen.mcp.tools.test_results.TestResult") -@patch("testgen.mcp.permissions._compute_project_access") -def test_get_test_results_passes_project_codes(mock_compute, mock_result, db_session_mock, mcp_user): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "admin"}, +@patch("testgen.mcp.permissions._compute_project_permissions") +def test_get_test_results_passes_project_codes(mock_compute, mock_result, db_session_mock): + mock_compute.return_value = ProjectPermissions( + memberships={"proj_a": "role_a"}, permission="view", - allowed_codes=frozenset(["proj_a"]), ) mock_result.select_results.return_value = [] @@ -204,16 +201,13 @@ def test_get_failure_summary_invalid_uuid(db_session_mock): @patch("testgen.mcp.tools.test_results.TestResult") -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_get_failure_summary_passes_project_codes( - mock_compute, mock_result, db_session_mock, mcp_user, + mock_compute, mock_result, db_session_mock, ): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "admin"}, + mock_compute.return_value = ProjectPermissions( + memberships={"proj_a": "role_a"}, permission="view", - allowed_codes=frozenset(["proj_a"]), ) mock_result.select_failures.return_value = [] @@ -284,16 +278,13 @@ def test_get_test_result_history_invalid_uuid(db_session_mock): @patch("testgen.mcp.tools.test_results.TestResult") -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_get_test_result_history_passes_project_codes( - mock_compute, mock_result, db_session_mock, mcp_user, + mock_compute, mock_result, db_session_mock, ): 
- mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"proj_a": "admin"}, + mock_compute.return_value = ProjectPermissions( + memberships={"proj_a": "role_a"}, permission="view", - allowed_codes=frozenset(["proj_a"]), ) mock_result.select_history.return_value = [] diff --git a/tests/unit/mcp/test_tools_test_runs.py b/tests/unit/mcp/test_tools_test_runs.py index c9a0c35f..1cbb7b99 100644 --- a/tests/unit/mcp/test_tools_test_runs.py +++ b/tests/unit/mcp/test_tools_test_runs.py @@ -1,7 +1,7 @@ from unittest.mock import MagicMock, patch from uuid import uuid4 -from testgen.mcp.permissions import ProjectAccess +from testgen.mcp.permissions import ProjectPermissions def _make_run_summary(**overrides): @@ -134,16 +134,13 @@ def test_get_recent_test_runs_empty_project_code(db_session_mock): assert "project_code" in result -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_get_recent_test_runs_returns_not_found_for_inaccessible_project( - mock_compute, db_session_mock, mcp_user, + mock_compute, db_session_mock, ): - mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"other_project": "admin"}, + mock_compute.return_value = ProjectPermissions( + memberships={"other_project": "role_a"}, permission="view", - allowed_codes=frozenset(["other_project"]), ) from testgen.mcp.tools.test_runs import get_recent_test_runs @@ -153,16 +150,13 @@ def test_get_recent_test_runs_returns_not_found_for_inaccessible_project( assert "No completed test runs found in project `secret_project`" in result -@patch("testgen.mcp.permissions._compute_project_access") +@patch("testgen.mcp.permissions._compute_project_permissions") def test_get_recent_test_runs_returns_denial_for_insufficient_permission( - mock_compute, db_session_mock, mcp_user, + mock_compute, db_session_mock, ): - 
mcp_user.is_global_admin = False - mock_compute.return_value = ProjectAccess( - is_unrestricted=False, - memberships={"other_project": "admin", "secret_project": "catalog"}, + mock_compute.return_value = ProjectPermissions( + memberships={"other_project": "role_a", "secret_project": "role_c"}, permission="view", - allowed_codes=frozenset(["other_project"]), ) from testgen.mcp.tools.test_runs import get_recent_test_runs From e3b05818db9d27eae953d1041f68fbc78a322c1d Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 6 Mar 2026 01:02:54 -0300 Subject: [PATCH 31/95] fix(ui): fix auth base class kwarg, review column spacing, and grants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix Authentication.get_default_page() parameter name mismatch (_project_code → project_code) that caused TypeError when enterprise auth plugin was not loaded - Add space between note icon and count in test results review column; use middle-dot separator between review items for clarity - Add test_definition_notes to execute role grants in 075 Co-Authored-By: Claude Opus 4.6 --- testgen/template/dbsetup/075_grant_role_rights.sql | 3 ++- testgen/ui/auth.py | 2 +- testgen/ui/views/test_results.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/testgen/template/dbsetup/075_grant_role_rights.sql b/testgen/template/dbsetup/075_grant_role_rights.sql index df1d6dea..af100289 100644 --- a/testgen/template/dbsetup/075_grant_role_rights.sql +++ b/testgen/template/dbsetup/075_grant_role_rights.sql @@ -41,7 +41,8 @@ GRANT SELECT, INSERT, DELETE, UPDATE ON {SCHEMA_NAME}.score_history_latest_runs, {SCHEMA_NAME}.job_schedules, {SCHEMA_NAME}.settings, - {SCHEMA_NAME}.notification_settings + {SCHEMA_NAME}.notification_settings, + {SCHEMA_NAME}.test_definition_notes TO testgen_execute_role; diff --git a/testgen/ui/auth.py b/testgen/ui/auth.py index b3830100..1ae89c2d 100644 --- a/testgen/ui/auth.py +++ b/testgen/ui/auth.py @@ -41,7 +41,7 @@ 
def user_display(self) -> str | None: def current_project(self) -> str | None: return session.sidebar_project - def get_default_page(self, _project_code: str | None = None) -> str: + def get_default_page(self, project_code: str | None = None) -> str: # noqa: ARG002 return "project-dashboard" if self.user else "" def user_has_permission(self, permission: Permission, /, project_code: str | None = None) -> bool: # noqa: ARG002 diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 1f126114..8c06962b 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -241,8 +241,8 @@ def build_review_column(row): if row["flagged"]: parts.append("🚩") if row.get("notes_count", 0) > 0: - parts.append(f"📝{row['notes_count']}") - return " ".join(parts) + parts.append(f"📝 {row['notes_count']}") + return " ¡ ".join(parts) df["review"] = df.apply(build_review_column, axis=1) From 1f2a5f85f8b1c86f89e69f143b5b76503ab1cf95 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 6 Mar 2026 10:37:15 -0300 Subject: [PATCH 32/95] fix(ui): address MR review feedback for notes dialog (TG-976) - Replace custom CSS with shared.css utility classes (tdn-label, tdn-notes-list, tdn-note-header, tdn-note-author) - Use formatTimestamp from display_utils.js for note timestamps - Restructure note actions: hide edit button during delete confirmation - Improve empty state with utility classes and descriptive text - Shorten dialog title to "Test Notes" - Remove st.rerun() from handlers to fix fragment rerun warnings - Fix grid not updating after notes dialog: clear test results cache on add/delete, use on_dismiss="rerun" - Guard against empty DataFrame crash in review column Co-Authored-By: Claude Opus 4.6 --- .../js/pages/test_definition_notes.js | 134 +++++++----------- .../dialogs/test_definition_notes_dialog.py | 8 +- testgen/ui/views/test_results.py | 2 +- 3 files changed, 57 insertions(+), 87 deletions(-) diff --git 
a/testgen/ui/components/frontend/js/pages/test_definition_notes.js b/testgen/ui/components/frontend/js/pages/test_definition_notes.js index bf9983b3..91cc9f48 100644 --- a/testgen/ui/components/frontend/js/pages/test_definition_notes.js +++ b/testgen/ui/components/frontend/js/pages/test_definition_notes.js @@ -19,20 +19,11 @@ import { Icon } from '../components/icon.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getValue, loadStylesheet } from '../utils.js'; import { ExpansionPanel } from '../components/expansion_panel.js'; +import { formatTimestamp } from '../display_utils.js'; const minHeight = 400; const { div, span, textarea, p } = van.tags; -/** - * @param {string?} isoString - * @returns {string} - */ -function formatDate(isoString) { - if (!isoString) return ''; - const date = new Date(isoString); - return Intl.DateTimeFormat('en-US', { dateStyle: 'medium', timeStyle: 'short' }).format(date); -} - /** * @param {Properties} props * @returns @@ -64,22 +55,42 @@ const TestDefinitionNotes = (props) => { return div( { class: () => `tdn-note ${isEdit.val && editNoteId.val === note.id ? 'tdn-editing' : ''}` }, div( - { class: 'tdn-note-header' }, - span({ class: 'tdn-note-author' }, `@${note.created_by}`), + { class: 'flex-row fx-gap-2' }, + span({ class: 'text-bold text-small' }, `@${note.created_by}`), span({ class: 'tdn-note-separator' }, '\u2014'), span({ class: 'tdn-note-date' }, - formatDate(note.created_at), + formatTimestamp(new Date(note.created_at), true), note.updated_at ? ' (edited)' : '', ), isOwner ? div( { class: 'tdn-note-actions' }, - () => isEdit.val && editNoteId.val === note.id - ? 
div( - { class: 'flex-row fx-gap-1 fx-align-center' }, - Icon({ size: 18, classes: 'tdn-editing-indicator' }, 'edit'), - span({ class: 'tdn-editing-indicator text-caption' }, 'Editing'), - ) - : div( + () => { + if (isEdit.val && editNoteId.val === note.id) { + return div( + { class: 'flex-row fx-gap-1 fx-align-center' }, + Icon({ size: 18, classes: 'tdn-editing-indicator' }, 'edit'), + span({ class: 'tdn-editing-indicator text-caption' }, 'Editing'), + ); + } + if (confirmingDelete.val) { + return div( + { class: 'flex-row fx-gap-1 fx-align-center' }, + span({ class: 'text-caption' }, 'Delete?'), + Button({ + label: 'Yes', + type: 'stroked', + color: 'warn', + onclick: () => emitEvent('NoteDeleted', { payload: { id: note.id } }), + }), + Button({ + label: 'No', + type: 'stroked', + color: 'basic', + onclick: () => { confirmingDelete.val = false; }, + }), + ); + } + return div( { class: 'flex-row fx-gap-1' }, Button({ type: 'icon', @@ -91,31 +102,15 @@ const TestDefinitionNotes = (props) => { noteText.val = note.detail; }, }), - () => confirmingDelete.val - ? 
div( - { class: 'flex-row fx-gap-1 fx-align-center' }, - span({ class: 'text-caption' }, 'Delete?'), - Button({ - label: 'Yes', - type: 'stroked', - color: 'warn', - onclick: () => emitEvent('NoteDeleted', { payload: { id: note.id } }), - }), - Button({ - label: 'No', - type: 'stroked', - color: 'basic', - onclick: () => { confirmingDelete.val = false; }, - }), - ) - : Button({ - type: 'icon', - icon: 'delete', - tooltip: 'Delete note', - tooltipPosition: 'top-left', - onclick: () => { confirmingDelete.val = true; }, - }), - ), + Button({ + type: 'icon', + icon: 'delete', + tooltip: 'Delete note', + tooltipPosition: 'top-left', + onclick: () => { confirmingDelete.val = true; }, + }), + ); + }, ) : null, ), p({ class: 'tdn-note-detail' }, note.detail), @@ -127,7 +122,7 @@ const TestDefinitionNotes = (props) => { () => { const label = getValue(props.test_label); return div( - { class: 'tdn-label' }, + { class: 'flex-row fx-flex-wrap fx-gap-1' }, span({ class: 'text-secondary' }, 'Table: '), span(label.table), span({ class: 'tdn-separator' }, '|'), span({ class: 'text-secondary' }, 'Column: '), span(label.column), @@ -188,12 +183,13 @@ const TestDefinitionNotes = (props) => { return notes.length > 0 ? div( - { class: 'tdn-notes-list' }, + { class: 'flex-column fx-gap-2' }, ...notes.map(note => NoteItem(note, currentUser)), ) : div( - { class: 'tdn-empty-state text-secondary' }, - 'No notes yet. 
Add one above.', + { class: 'flex-column fx-gap-2 fx-align-flex-center mt-7 text-secondary' }, + span({ class: 'text-large' }, 'No notes yet'), + span('Document context, decisions, or issues related to this test definition.'), ); }, ); @@ -201,13 +197,6 @@ const TestDefinitionNotes = (props) => { const stylesheet = new CSSStyleSheet(); stylesheet.replace(` -.tdn-label { - font-size: 14px; - display: flex; - flex-wrap: wrap; - align-items: center; - gap: 4px; -} .tdn-separator { color: var(--disabled-text-color); margin: 0 4px; @@ -234,36 +223,22 @@ stylesheet.replace(` font-style: italic; color: var(--disabled-text-color); } -.tdn-notes-list { - display: flex; - flex-direction: column; - gap: 4px; -} .tdn-note { - padding: 12px; + padding: 4px 12px 12px; border-radius: 8px; - background-color: var(--dk-card-background); - border: 1px solid var(--dk-card-border-color, rgba(0,0,0,0.06)); - transition: background-color 0.2s; + background-color: var(--app-background-color); +} +@media (prefers-color-scheme: dark) { + .tdn-note { + background-color: var(--dk-card-background); + } } .tdn-note.tdn-editing { background-color: var(--select-hover-background); } -.tdn-note-header { - display: flex; - flex-direction: row; - align-items: center; - gap: 6px; - margin-bottom: 6px; -} .tdn-editing-indicator { color: var(--purple); } -.tdn-note-author { - font-weight: 600; - font-size: 13px; - color: var(--primary-text-color); -} .tdn-note-separator { color: var(--disabled-text-color); font-size: 12px; @@ -286,11 +261,6 @@ stylesheet.replace(` color: var(--primary-text-color); white-space: pre-wrap; } -.tdn-empty-state { - text-align: center; - padding: 24px 0; - font-style: italic; -} `); export { TestDefinitionNotes }; diff --git a/testgen/ui/views/dialogs/test_definition_notes_dialog.py b/testgen/ui/views/dialogs/test_definition_notes_dialog.py index f6686e26..26a269c6 100644 --- a/testgen/ui/views/dialogs/test_definition_notes_dialog.py +++ 
b/testgen/ui/views/dialogs/test_definition_notes_dialog.py @@ -3,10 +3,11 @@ from testgen.common.models import with_database_session from testgen.common.models.test_definition import TestDefinitionNote from testgen.ui.components import widgets as testgen +from testgen.ui.queries import test_result_queries from testgen.ui.session import session -@st.dialog(title="Test Definition Notes") +@st.dialog(title="Test Notes", on_dismiss="rerun") @with_database_session def test_definition_notes_dialog(test_definition_id: str, test_label: dict) -> None: current_user = session.auth.user.username if session.auth.user else "unknown" @@ -14,15 +15,14 @@ def test_definition_notes_dialog(test_definition_id: str, test_label: dict) -> N def on_note_added(payload: dict) -> None: TestDefinitionNote.add_note(test_definition_id, payload["text"], current_user) - st.rerun() + test_result_queries.get_test_results.clear() def on_note_updated(payload: dict) -> None: TestDefinitionNote.update_note(payload["id"], payload["text"]) - st.rerun() def on_note_deleted(payload: dict) -> None: TestDefinitionNote.delete_note(payload["id"]) - st.rerun() + test_result_queries.get_test_results.clear() testgen.testgen_component( "test_definition_notes", diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 8c06962b..99b035c9 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -244,7 +244,7 @@ def build_review_column(row): parts.append(f"📝 {row['notes_count']}") return " ¡ ".join(parts) - df["review"] = df.apply(build_review_column, axis=1) + df["review"] = df.apply(build_review_column, axis=1) if not df.empty else "" test_suite = TestSuite.get_minimal(run.test_suite_id) table_group = TableGroup.get_minimal(test_suite.table_groups_id) From cffe34c31dd52fb66feb5dc7270d823b02cb48b6 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 5 Mar 2026 22:13:00 -0500 Subject: [PATCH 33/95] feat: support oauth for databricks --- pyproject.toml | 1 
+ .../flavor/databricks_flavor_service.py | 51 +++++++- .../frontend/js/components/connection_form.js | 114 +++++++++++++----- .../frontend/js/components/input.js | 4 +- .../js/components/table_group_test.js | 2 +- .../static/js/components/connection_form.js | 114 +++++++++++++----- testgen/ui/static/js/components/input.js | 4 +- .../static/js/components/table_group_test.js | 2 +- 8 files changed, 217 insertions(+), 75 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b3b88fb7..f11aba4d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "click==8.1.3", "sqlalchemy==1.4.46", "databricks-sql-connector==2.9.3", + "databricks-sdk>=0.20.0", "snowflake-sqlalchemy==1.6.1", "sqlalchemy-bigquery==1.14.1", "pyodbc==5.0.0", diff --git a/testgen/common/database/flavor/databricks_flavor_service.py b/testgen/common/database/flavor/databricks_flavor_service.py index b9b339ef..1595213c 100644 --- a/testgen/common/database/flavor/databricks_flavor_service.py +++ b/testgen/common/database/flavor/databricks_flavor_service.py @@ -9,11 +9,56 @@ class DatabricksFlavorService(FlavorService): escaped_single_quote = "\\'" varchar_type = "STRING" + def get_pre_connection_queries(self) -> list[tuple[str, dict | None]]: + if self.dbname: + return [(f"USE CATALOG `{self.dbname}`", None)] + return [] + + def get_connect_args(self) -> dict: + args = {} + if self.dbname: + args["catalog"] = self.dbname + if self.connect_by_key: + args["credentials_provider"] = self._get_oauth_credentials_provider() + return args + def get_connection_string_head(self): - return f"{self.flavor}://{self.username}:{quote_plus(self.password)}@" + if self.connect_by_key: + return f"{self.flavor}://oauth:@" + return f"{self.flavor}://token:{quote_plus(self.password)}@" def get_connection_string_from_fields(self): + if self.connect_by_key: + return ( + f"{self.flavor}://oauth:@{self.host}:{self.port}/{self.dbname}" + f"?http_path={self.http_path}&catalog={self.dbname}" + ) 
return ( - f"{self.flavor}://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}/{self.dbname}" - f"?http_path={self.http_path}" + f"{self.flavor}://token:{quote_plus(self.password)}@{self.host}:{self.port}/{self.dbname}" + f"?http_path={self.http_path}&catalog={self.dbname}" + ) + + def _get_oauth_credentials_provider(self): + from databricks.sdk.core import Config, oauth_service_principal + + config = Config( + host=f"https://{self.host}", + client_id=self.username, + client_secret=self.password, ) + # oauth_service_principal(config) returns an OAuthCredentialsProvider, + # which is callable: provider() -> Dict[str, str] (auth headers). + # + # The SQL connector's ExternalAuthProvider expects a CredentialsProvider + # with two levels: credentials_provider() -> HeaderFactory, then + # HeaderFactory() -> Dict[str, str]. Wrap to bridge the interface. + oauth_provider = oauth_service_principal(config) + + class _CredentialsProvider: + def auth_type(self): + return "oauth" + + def __call__(self): + return oauth_provider + + return _CredentialsProvider() diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index 011e425a..0c0b3cfa 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -766,10 +766,11 @@ const DatabricksForm = ( ) => { const isValid = van.state(true); const connectByUrl = van.state(connection.rawVal?.connect_by_url ?? false); + const useOAuth = van.state(connection.rawVal?.connect_by_key ?? false); const connectionHost = van.state(connection.rawVal?.project_host ?? ''); const connectionPort = van.state(connection.rawVal?.project_port || defaultPorts[flavor.flavor]); const connectionHttpPath = van.state(connection.rawVal?.http_path ?? ''); - const connectionDatabase = van.state(connection.rawVal?.project_db ?? 
''); + const connectionCatalog = van.state(connection.rawVal?.project_db ?? ''); const connectionUsername = van.state(connection.rawVal?.project_user ?? ''); const connectionPassword = van.state(connection.rawVal?.project_pw_encrypted ?? ''); const connectionUrl = van.state(connection.rawVal?.url ?? ''); @@ -780,13 +781,13 @@ const DatabricksForm = ( onChange({ project_host: connectionHost.val, project_port: connectionPort.val, - project_db: connectionDatabase.val, - project_user: connectionUsername.val, + project_db: connectionCatalog.val, + project_user: useOAuth.val ? connectionUsername.val : 'token', project_pw_encrypted: connectionPassword.val, http_path: connectionHttpPath.val, connect_by_url: connectByUrl.val, url: connectByUrl.val ? connectionUrl.val : connectionUrl.rawVal, - connect_by_key: false, + connect_by_key: useOAuth.val, }, isValid.val); }); @@ -803,7 +804,7 @@ const DatabricksForm = ( { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, Caption({content: 'Server', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), - RadioGroup({ + () => useOAuth.val ? 
div() : RadioGroup({ label: 'Connect by', options: [ { @@ -868,16 +869,17 @@ const DatabricksForm = ( }, validators: [ requiredIf(() => !connectByUrl.val), - maxLength(50), + maxLength(200), ], }), Input({ name: 'db_name', - label: 'Database', - value: connectionDatabase, + label: 'Catalog', + value: connectionCatalog, + value: connectionCatalog, disabled: connectByUrl, onChange: (value, state) => { - connectionDatabase.val = value; + connectionCatalog.val = value; validityPerField['db_name'] = state.valid; isValid.val = Object.values(validityPerField).every(v => v); }, @@ -906,38 +908,84 @@ const DatabricksForm = ( }), ), ), - div( { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, Caption({content: 'Authentication', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), - Input({ - name: 'db_user', - label: 'Username', - value: connectionUsername, - onChange: (value, state) => { - connectionUsername.val = value; - validityPerField['db_user'] = state.valid; - isValid.val = Object.values(validityPerField).every(v => v); - }, - validators: [ - required, - maxLength(50), + RadioGroup({ + label: 'Authentication method', + options: [ + {label: 'Access Token (PAT)', value: false}, + {label: 'Service Principal (OAuth)', value: true}, ], - }), - Input({ - name: 'password', - label: 'Password', - value: connectionPassword, - type: 'password', - passwordSuggestions: false, - placeholder: (originalConnection?.connection_id && originalConnection?.project_pw_encrypted) ? 
secretsPlaceholder : '', - onChange: (value, state) => { - connectionPassword.val = value; - validityPerField['password'] = state.valid; + value: useOAuth, + onChange: (value) => { + useOAuth.val = value; + connectionPassword.val = ''; + delete validityPerField['password']; + if (value) { + connectByUrl.val = false; + delete validityPerField['db_user']; + } isValid.val = Object.values(validityPerField).every(v => v); }, + layout: 'inline', }), + + () => { + if (useOAuth.val) { + return div( + { class: 'flex-column fx-gap-3' }, + Input({ + name: 'db_user', + label: 'Client ID', + value: connectionUsername, + onChange: (value, state) => { + connectionUsername.val = value; + validityPerField['db_user'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + required, + maxLength(100), + ], + }), + Input({ + name: 'password', + label: 'Client Secret', + value: connectionPassword, + type: 'password', + passwordSuggestions: false, + placeholder: (originalConnection?.connection_id && originalConnection?.project_pw_encrypted) ? secretsPlaceholder : '', + onChange: (value, state) => { + connectionPassword.val = value; + validityPerField['password'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + requiredIf(() => !originalConnection?.connection_id || !originalConnection?.project_pw_encrypted), + ], + }), + ); + } + + return Input({ + name: 'password', + label: 'Access Token', + value: connectionPassword, + type: 'password', + passwordSuggestions: false, + placeholder: (originalConnection?.connection_id && originalConnection?.project_pw_encrypted) ? 
secretsPlaceholder : '', + onChange: (value, state) => { + connectionPassword.val = value; + validityPerField['password'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + requiredIf(() => !originalConnection?.connection_id || !originalConnection?.project_pw_encrypted), + ], + }); + }, ), ); }; diff --git a/testgen/ui/components/frontend/js/components/input.js b/testgen/ui/components/frontend/js/components/input.js index 130aba5c..da3b93fc 100644 --- a/testgen/ui/components/frontend/js/components/input.js +++ b/testgen/ui/components/frontend/js/components/input.js @@ -132,7 +132,7 @@ const Input = (/** @type Properties */ props) => { props.prefix, ) : undefined, - input({ + () => input({ value, name: props.name ?? '', type: inputType, @@ -173,7 +173,7 @@ const Input = (/** @type Properties */ props) => { style: `top: ${((getValue(props.height) || defaultHeight) - addonIconSize) / 2}px`, onclick: () => inputType.val = passwordFieldTypeSwitch[inputType.val], }, - inputType.val === 'password' ? 'visibility' : 'visibility_off', + () => inputType.val === 'password' ? 'visibility' : 'visibility_off', ) : '', showClearable diff --git a/testgen/ui/components/frontend/js/components/table_group_test.js b/testgen/ui/components/frontend/js/components/table_group_test.js index ff987f06..94aa4898 100644 --- a/testgen/ui/components/frontend/js/components/table_group_test.js +++ b/testgen/ui/components/frontend/js/components/table_group_test.js @@ -111,7 +111,7 @@ const TableGroupTest = (preview, options) => { ), ) : div( - { class: 'flex-row fx-justify-center', style: 'height: 50px; font-size: 16px;'}, + { class: 'flex-row fx-justify-center p-3', style: 'min-height: 50px; font-size: 14px;'}, tableGroupPreview.message ?? 'No tables found.' 
), ), diff --git a/testgen/ui/static/js/components/connection_form.js b/testgen/ui/static/js/components/connection_form.js index 011e425a..0c0b3cfa 100644 --- a/testgen/ui/static/js/components/connection_form.js +++ b/testgen/ui/static/js/components/connection_form.js @@ -766,10 +766,11 @@ const DatabricksForm = ( ) => { const isValid = van.state(true); const connectByUrl = van.state(connection.rawVal?.connect_by_url ?? false); + const useOAuth = van.state(connection.rawVal?.connect_by_key ?? false); const connectionHost = van.state(connection.rawVal?.project_host ?? ''); const connectionPort = van.state(connection.rawVal?.project_port || defaultPorts[flavor.flavor]); const connectionHttpPath = van.state(connection.rawVal?.http_path ?? ''); - const connectionDatabase = van.state(connection.rawVal?.project_db ?? ''); + const connectionCatalog = van.state(connection.rawVal?.project_db ?? ''); const connectionUsername = van.state(connection.rawVal?.project_user ?? ''); const connectionPassword = van.state(connection.rawVal?.project_pw_encrypted ?? ''); const connectionUrl = van.state(connection.rawVal?.url ?? ''); @@ -780,13 +781,13 @@ const DatabricksForm = ( onChange({ project_host: connectionHost.val, project_port: connectionPort.val, - project_db: connectionDatabase.val, - project_user: connectionUsername.val, + project_db: connectionCatalog.val, + project_user: useOAuth.val ? connectionUsername.val : 'token', project_pw_encrypted: connectionPassword.val, http_path: connectionHttpPath.val, connect_by_url: connectByUrl.val, url: connectByUrl.val ? 
connectionUrl.val : connectionUrl.rawVal, - connect_by_key: false, + connect_by_key: useOAuth.val, }, isValid.val); }); @@ -803,7 +804,7 @@ const DatabricksForm = ( { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, Caption({content: 'Server', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), - RadioGroup({ + () => useOAuth.val ? div() : RadioGroup({ label: 'Connect by', options: [ { @@ -868,16 +869,17 @@ const DatabricksForm = ( }, validators: [ requiredIf(() => !connectByUrl.val), - maxLength(50), + maxLength(200), ], }), Input({ name: 'db_name', - label: 'Database', - value: connectionDatabase, + label: 'Catalog', + value: connectionCatalog, + value: connectionCatalog, disabled: connectByUrl, onChange: (value, state) => { - connectionDatabase.val = value; + connectionCatalog.val = value; validityPerField['db_name'] = state.valid; isValid.val = Object.values(validityPerField).every(v => v); }, @@ -906,38 +908,84 @@ const DatabricksForm = ( }), ), ), - div( { class: 'flex-column border border-radius-1 p-3 mt-1 fx-gap-1', style: 'position: relative;' }, Caption({content: 'Authentication', style: 'position: absolute; top: -10px; background: var(--app-background-color); padding: 0px 8px;' }), - Input({ - name: 'db_user', - label: 'Username', - value: connectionUsername, - onChange: (value, state) => { - connectionUsername.val = value; - validityPerField['db_user'] = state.valid; - isValid.val = Object.values(validityPerField).every(v => v); - }, - validators: [ - required, - maxLength(50), + RadioGroup({ + label: 'Authentication method', + options: [ + {label: 'Access Token (PAT)', value: false}, + {label: 'Service Principal (OAuth)', value: true}, ], - }), - Input({ - name: 'password', - label: 'Password', - value: connectionPassword, - type: 'password', - passwordSuggestions: false, - placeholder: (originalConnection?.connection_id && 
originalConnection?.project_pw_encrypted) ? secretsPlaceholder : '', - onChange: (value, state) => { - connectionPassword.val = value; - validityPerField['password'] = state.valid; + value: useOAuth, + onChange: (value) => { + useOAuth.val = value; + connectionPassword.val = ''; + delete validityPerField['password']; + if (value) { + connectByUrl.val = false; + delete validityPerField['db_user']; + } isValid.val = Object.values(validityPerField).every(v => v); }, + layout: 'inline', }), + + () => { + if (useOAuth.val) { + return div( + { class: 'flex-column fx-gap-3' }, + Input({ + name: 'db_user', + label: 'Client ID', + value: connectionUsername, + onChange: (value, state) => { + connectionUsername.val = value; + validityPerField['db_user'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + required, + maxLength(100), + ], + }), + Input({ + name: 'password', + label: 'Client Secret', + value: connectionPassword, + type: 'password', + passwordSuggestions: false, + placeholder: (originalConnection?.connection_id && originalConnection?.project_pw_encrypted) ? secretsPlaceholder : '', + onChange: (value, state) => { + connectionPassword.val = value; + validityPerField['password'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + requiredIf(() => !originalConnection?.connection_id || !originalConnection?.project_pw_encrypted), + ], + }), + ); + } + + return Input({ + name: 'password', + label: 'Access Token', + value: connectionPassword, + type: 'password', + passwordSuggestions: false, + placeholder: (originalConnection?.connection_id && originalConnection?.project_pw_encrypted) ? 
secretsPlaceholder : '', + onChange: (value, state) => { + connectionPassword.val = value; + validityPerField['password'] = state.valid; + isValid.val = Object.values(validityPerField).every(v => v); + }, + validators: [ + requiredIf(() => !originalConnection?.connection_id || !originalConnection?.project_pw_encrypted), + ], + }); + }, ), ); }; diff --git a/testgen/ui/static/js/components/input.js b/testgen/ui/static/js/components/input.js index b50efd1c..1efb0924 100644 --- a/testgen/ui/static/js/components/input.js +++ b/testgen/ui/static/js/components/input.js @@ -132,7 +132,7 @@ const Input = (/** @type Properties */ props) => { props.prefix, ) : undefined, - input({ + () => input({ value, name: props.name ?? '', type: inputType, @@ -173,7 +173,7 @@ const Input = (/** @type Properties */ props) => { style: `top: ${((getValue(props.height) || defaultHeight) - addonIconSize) / 2}px`, onclick: () => inputType.val = passwordFieldTypeSwitch[inputType.val], }, - inputType.val === 'password' ? 'visibility' : 'visibility_off', + () => inputType.val === 'password' ? 'visibility' : 'visibility_off', ) : '', showClearable diff --git a/testgen/ui/static/js/components/table_group_test.js b/testgen/ui/static/js/components/table_group_test.js index ff987f06..94aa4898 100644 --- a/testgen/ui/static/js/components/table_group_test.js +++ b/testgen/ui/static/js/components/table_group_test.js @@ -111,7 +111,7 @@ const TableGroupTest = (preview, options) => { ), ) : div( - { class: 'flex-row fx-justify-center', style: 'height: 50px; font-size: 16px;'}, + { class: 'flex-row fx-justify-center p-3', style: 'min-height: 50px; font-size: 14px;'}, tableGroupPreview.message ?? 'No tables found.' 
), ), From f50b454d9351454cf029f6d01a13546f26514ed1 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Sun, 8 Feb 2026 22:18:32 -0500 Subject: [PATCH 34/95] refactor(profiling): replace yaml files with TG- conditional SQL templates --- testgen/commands/queries/profiling_query.py | 68 ++-- ...query.yaml => project_profiling_query.sql} | 278 +++++++-------- ...query.yaml => project_profiling_query.sql} | 283 ++++++++-------- ...query.yaml => project_profiling_query.sql} | 318 ++++++++++-------- ...query.yaml => project_profiling_query.sql} | 254 +++++++------- .../profiling/project_profiling_query.sql | 205 +++++++++++ .../profiling/project_profiling_query.yaml | 204 ----------- .../profiling/project_profiling_query.sql | 205 +++++++++++ .../profiling/project_profiling_query.yaml | 204 ----------- .../profiling/project_profiling_query.sql | 221 ++++++++++++ .../profiling/project_profiling_query.yaml | 211 ------------ 11 files changed, 1245 insertions(+), 1206 deletions(-) rename testgen/template/flavors/bigquery/profiling/{project_profiling_query.yaml => project_profiling_query.sql} (50%) rename testgen/template/flavors/databricks/profiling/{project_profiling_query.yaml => project_profiling_query.sql} (67%) rename testgen/template/flavors/mssql/profiling/{project_profiling_query.yaml => project_profiling_query.sql} (57%) rename testgen/template/flavors/postgresql/profiling/{project_profiling_query.yaml => project_profiling_query.sql} (67%) create mode 100644 testgen/template/flavors/redshift/profiling/project_profiling_query.sql delete mode 100644 testgen/template/flavors/redshift/profiling/project_profiling_query.yaml create mode 100644 testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql delete mode 100644 testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml create mode 100644 testgen/template/flavors/snowflake/profiling/project_profiling_query.sql delete mode 100644 
testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index 4f67fde6..b69c7063 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -2,7 +2,7 @@ from uuid import UUID from testgen.commands.queries.refresh_data_chars_query import ColumnChars -from testgen.common import read_template_sql_file, read_template_yaml_file +from testgen.common import read_template_sql_file from testgen.common.database.database_service import process_conditionals, replace_params from testgen.common.models.connection import Connection from testgen.common.models.profiling_run import ProfilingRun @@ -94,7 +94,6 @@ def __init__(self, connection: Connection, table_group: TableGroup, profiling_ru self.profiling_run = profiling_run self.run_date = profiling_run.profiling_starttime self.flavor = connection.sql_flavor - self._profiling_template: dict = None def _get_params(self, column_chars: ColumnChars | None = None, table_sampling: TableSampling | None = None) -> dict: params = { @@ -150,14 +149,6 @@ def _get_query( return query, params - def _get_profiling_template(self) -> dict: - if not self._profiling_template: - self._profiling_template = read_template_yaml_file( - "project_profiling_query.yaml", - sub_directory=f"flavors/{self.flavor}/profiling", - ) - return self._profiling_template - def get_frequency_analysis_columns(self) -> tuple[str, dict]: # Runs on App database return self._get_query("secondary_profiling_columns.sql") @@ -228,42 +219,33 @@ def update_hygiene_issue_prevalence(self, issue_type: HygieneIssueType) -> tuple def run_column_profiling(self, column_chars: ColumnChars, table_sampling: TableSampling | None = None) -> tuple[str, dict]: # Runs on Target database - template = self._get_profiling_template() general_type = column_chars.general_type + do_sample = bool(table_sampling) - query = "" - query 
+= template["01_sampling" if table_sampling else "01_else"] - query += template["01_all"] - query += template["02_X" if general_type == "X" else "02_else"] - query += template["03_ADN" if general_type in ["A", "D", "N"] else "03_else"] - - if general_type == "A": - query += template["04_A"] - elif general_type == "N": - query += template["04_N"] - else: - query += template["04_else"] - - query += template["05_A" if general_type == "A" else "05_else"] - query += template["06_A" if general_type == "A" else "06_else"] - query += template["08_N" if general_type == "N" else "08_else"] - query += template["10_N_dec" if general_type == "N" and column_chars.is_decimal == True else "10_else"] - query += template["11_D" if general_type == "D" else "11_else"] - query += template["12_B" if general_type == "B" else "12_else"] - query += template["14_A" if general_type == "A" else "14_else"] - query += template["16_all"] - query += template["98_all"] - - if general_type == "N": - query += template["99_N_sampling" if table_sampling else "99_N"] - else: - query += template["99_else"] - - params = self._get_params(column_chars, table_sampling) - query = replace_params(query, params) - query = replace_templated_functions(query, self.flavor) + extra_params = { + "do_sample": do_sample, + "is_type_A": general_type == "A", + "is_type_N": general_type == "N", + "is_type_D": general_type == "D", + "is_type_B": general_type == "B", + "is_type_ADN": general_type in ("A", "D", "N"), + "is_type_X": general_type == "X", + "is_A_sampling": general_type == "A" and do_sample, + "is_A_no_sampling": general_type == "A" and not do_sample, + "is_N_decimal": general_type == "N" and column_chars.is_decimal, + "is_N_sampling": general_type == "N" and do_sample, + "is_N_no_sampling": general_type == "N" and not do_sample, + "is_not_A": general_type != "A", + "is_not_A_not_N": general_type not in ("A", "N"), + } - return query, params + return self._get_query( + "project_profiling_query.sql", + 
f"flavors/{self.flavor}/profiling", + extra_params=extra_params, + column_chars=column_chars, + table_sampling=table_sampling, + ) def get_profiling_errors(self, column_errors: list[tuple[ColumnChars, str]]) -> list[list[str | UUID | int]]: return [ diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml b/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql similarity index 50% rename from testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml rename to testgen/template/flavors/bigquery/profiling/project_profiling_query.sql index 0a9c6350..ece95159 100644 --- a/testgen/template/flavors/bigquery/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql @@ -1,18 +1,14 @@ ---- -01_sampling: | - WITH target_table AS ( - SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` WHERE RAND() * 100 < {SAMPLE_PERCENT_CALC} - ) - SELECT -01_else: | - WITH target_table AS ( - SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` - ) - SELECT -01_all: | - {CONNECTION_ID} as connection_id, - '{PROJECT_CODE}' as project_code, - '{TABLE_GROUPS_ID}' as table_groups_id, +WITH target_table AS ( +-- TG-IF do_sample + SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` WHERE RAND() * 100 < {SAMPLE_PERCENT_CALC} +-- TG-ELSE + SELECT * FROM `{DATA_SCHEMA}.{DATA_TABLE}` +-- TG-ENDIF +) +SELECT + {CONNECTION_ID} AS connection_id, + '{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, '{DATA_SCHEMA}' AS schema_name, '{RUN_DATE}' AS run_date, '{DATA_TABLE}' AS table_name, @@ -22,44 +18,44 @@ '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, - -02_X: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS null_value_ct, -02_else: | - COUNT(`{COL_NAME}`) AS value_ct, - COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, - SUM(IF(`{COL_NAME}` IS NULL, 1, 0)) AS 
null_value_ct, - -03_ADN: MIN(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS min_length, +-- TG-IF is_type_ADN + MIN(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS min_length, MAX(LENGTH(CAST(`{COL_NAME}` AS STRING))) AS max_length, AVG(NULLIF(LENGTH(CAST(`{COL_NAME}` AS STRING)), 0)) AS avg_length, -03_else: NULL as min_length, - NULL as max_length, - NULL as avg_length, - -04_A: SUM( +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + SUM( CASE WHEN REGEXP_CONTAINS(TRIM(CAST(`{COL_NAME}` AS STRING)), r'^0(\.0*)?$') THEN 1 ELSE 0 END ) AS zero_value_ct, -04_N: CAST(SUM(1 - ABS(SIGN(CAST(`{COL_NAME}` AS NUMERIC)))) AS INT64) AS zero_value_ct, -04_else: NULL as zero_value_ct, - -05_A: | +-- TG-ENDIF +-- TG-IF is_type_N + CAST(SUM(1 - ABS(SIGN(CAST(`{COL_NAME}` AS NUMERIC)))) AS INT64) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A COUNT( DISTINCT UPPER( REGEXP_REPLACE(CAST(`{COL_NAME}` AS STRING), r"[ '\.,-]", "") ) - ) as distinct_std_value_ct, + ) AS distinct_std_value_ct, SUM(CASE WHEN `{COL_NAME}` = '' THEN 1 ELSE 0 END) AS zero_length_ct, SUM(CASE WHEN `{COL_NAME}` BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END) AS lead_space_ct, SUM( CASE WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) LIKE '"%"' - OR LOWER(CAST(`{COL_NAME}` AS STRING)) LIKE "'%'" THEN 1 + OR LOWER(CAST(`{COL_NAME}` AS STRING)) LIKE "'%'" THEN 1 ELSE 0 END ) AS quoted_value_ct, @@ -71,8 +67,7 @@ ) AS includes_digit_ct, SUM( CASE - WHEN CAST(`{COL_NAME}` AS STRING) IN ('.', '?', ' ') THEN 1 - WHEN REGEXP_CONTAINS(LOWER(CAST(`{COL_NAME}` AS STRING)), r'^\s*[-09xz]{2,}\s*$') THEN 1 + WHEN REGEXP_CONTAINS(LOWER(CAST(`{COL_NAME}` AS STRING)), r'^(\.{1,}|-{1,}|\?{1,}|\s{1,}|0{2,}|9{2,}|x{2,}|z{2,})$') THEN 1 WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(CAST(`{COL_NAME}` AS STRING)) IN ('(blank)','(error)','(missing)','(tbd)', @@ -82,60 +77,61 @@ ELSE 0 END ) AS filled_value_ct, - LEFT(MIN(NULLIF(`{COL_NAME}`, '')), 100) AS min_text, - LEFT(MAX(NULLIF(`{COL_NAME}`, '')), 100) AS max_text, - SUM( CASE WHEN `{COL_NAME}` = UPPER(`{COL_NAME}`) AND `{COL_NAME}` <> LOWER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS upper_case_ct, - SUM( CASE WHEN `{COL_NAME}` = LOWER(`{COL_NAME}`) AND `{COL_NAME}` <> UPPER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS lower_case_ct, - SUM( CASE WHEN `{COL_NAME}` = UPPER(`{COL_NAME}`) AND `{COL_NAME}` = LOWER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS non_alpha_ct, + LEFT(MIN(NULLIF(`{COL_NAME}`, '')), 100) AS min_text, + LEFT(MAX(NULLIF(`{COL_NAME}`, '')), 100) AS max_text, + SUM(CASE WHEN `{COL_NAME}` = UPPER(`{COL_NAME}`) AND `{COL_NAME}` <> LOWER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS upper_case_ct, + SUM(CASE WHEN `{COL_NAME}` = LOWER(`{COL_NAME}`) AND `{COL_NAME}` <> UPPER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS lower_case_ct, + SUM(CASE WHEN `{COL_NAME}` = UPPER(`{COL_NAME}`) AND `{COL_NAME}` = LOWER(`{COL_NAME}`) THEN 1 ELSE 0 END) AS non_alpha_ct, COUNTIF( TRANSLATE( CAST(`{COL_NAME}` AS STRING), CODE_POINTS_TO_STRING([160, 8201, 8203, 8204, 8205, 8206, 8207, 8239, 12288, 65279]), REPEAT('X', 10) ) <> 
CAST(`{COL_NAME}` AS STRING) - ) as non_printing_ct, - SUM(<%IS_NUM;LEFT(`{COL_NAME}`, 31)%>) AS numeric_ct, - SUM(<%IS_DATE;LEFT(`{COL_NAME}`, 26)%>) AS date_ct, + ) AS non_printing_ct, + SUM(<%IS_NUM;LEFT(`{COL_NAME}`, 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT(`{COL_NAME}`, 26)%>) AS date_ct, CASE - WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'STREET_ADDR' - WHEN SAFE_DIVIDE(SUM(CASE WHEN `{COL_NAME}` IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'STATE_USA' - WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'PHONE_USA' - WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'EMAIL' - WHEN SAFE_DIVIDE(SUM(CASE WHEN TRANSLATE(`{COL_NAME}`, '012345678', '999999999') IN ('99999', '999999999', '99999-9999') - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'ZIP_USA' - WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[\w\s\-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'FILE_NAME' - WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([0-9]{4}[- ]){3}[0-9]{4}$') - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'CREDIT_CARD' - WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') - AND NOT 
REGEXP_CONTAINS(`{COL_NAME}`, r'\s(and|but|or|yet)\s') - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'DELIMITED_DATA' - WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$') - AND CAST(SUBSTR(`{COL_NAME}`, 1, 3) AS INT64) NOT BETWEEN 734 AND 749 - AND SUBSTR(`{COL_NAME}`, 1, 3) <> '666' - THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'SSN' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'STREET_ADDR' + WHEN SAFE_DIVIDE(SUM(CASE WHEN `{COL_NAME}` IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'STATE_USA' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'PHONE_USA' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'EMAIL' + WHEN SAFE_DIVIDE(SUM(CASE WHEN TRANSLATE(`{COL_NAME}`, '012345678', '999999999') IN ('99999', '999999999', '99999-9999') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'ZIP_USA' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[\w\s\-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'FILE_NAME' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([0-9]{4}[- ]){3}[0-9]{4}$') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'CREDIT_CARD' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, 
r'^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') + AND NOT REGEXP_CONTAINS(`{COL_NAME}`, r'\s(and|but|or|yet)\s') + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'DELIMITED_DATA' + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$') + AND CAST(SUBSTR(`{COL_NAME}`, 1, 3) AS INT64) NOT BETWEEN 734 AND 749 + AND SUBSTR(`{COL_NAME}`, 1, 3) <> '666' + THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'SSN' END AS std_pattern_match, -05_else: NULL as distinct_std_value_ct, - NULL as zero_length_ct, - NULL as lead_space_ct, - NULL as quoted_value_ct, - NULL as includes_digit_ct, - NULL as filled_value_ct, - NULL as min_text, - NULL as max_text, - NULL as upper_case_ct, - NULL as lower_case_ct, - NULL as non_alpha_ct, - NULL as non_printing_ct, - NULL as numeric_ct, - NULL as date_ct, - NULL as std_pattern_match, - -06_A: | +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, + NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, +-- TG-ENDIF +-- TG-IF is_type_A ( SELECT LEFT(STRING_AGG(val, ' | ' ORDER BY ct DESC), 1000) AS top_patterns FROM ( @@ -163,10 +159,12 @@ LIMIT 5 ) ) ps - ) as top_patterns, -06_else: NULL as top_patterns, - -08_N: MIN(`{COL_NAME}`) AS min_value, + ) AS top_patterns, +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF +-- TG-IF is_type_N + MIN(`{COL_NAME}`) AS min_value, MIN(CASE WHEN `{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0, MAX(`{COL_NAME}`) AS max_value, AVG(CAST(`{COL_NAME}` AS FLOAT64)) AS avg_value, @@ -174,21 +172,24 @@ MIN(pct_25) AS percentile_25, MIN(pct_50) AS percentile_50, MIN(pct_75) AS percentile_75, -08_else: NULL as min_value, - NULL as min_value_over_0, - NULL as 
max_value, - NULL as avg_value, - NULL as stdev_value, - NULL as percentile_25, - NULL as percentile_50, - NULL as percentile_75, - -10_N_dec: SUM(COALESCE(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5), 0)) as fractional_sum, -10_else: NULL as fractional_sum, - -11_D: | +-- TG-ELSE + NULL AS min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(COALESCE(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5), 0)) AS fractional_sum, +-- TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D MIN(`{COL_NAME}`) AS min_date, -- Other flavors have a minimum threshold of 0001-01-01, but BigQuery doesn't make it easy to to the same - MAX(`{COL_NAME}`) as max_date, + MAX(`{COL_NAME}`) AS max_date, COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 12 THEN 1 END) AS before_1yr_date_ct, COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 60 THEN 1 END) AS before_5yr_date_ct, COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH) > 240 THEN 1 END) AS before_20yr_date_ct, @@ -197,27 +198,30 @@ COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY) BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, COUNT(CASE WHEN SAFE_CAST(DATE(`{COL_NAME}`) AS DATE) > SAFE_CAST(DATE('{RUN_DATE}') AS DATE) THEN 1 END) AS future_date_ct, COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), SAFE_CAST(DATE('{RUN_DATE}') AS DATE), MONTH) > 240 THEN 1 END) AS distant_future_date_ct, - COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY)) AS date_days_present, - COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS 
DATE), WEEK)) AS date_weeks_present, + COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY)) AS date_days_present, + COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), WEEK)) AS date_weeks_present, COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH)) AS date_months_present, -11_else: NULL as min_date, - NULL as max_date, - NULL as before_1yr_date_ct, - NULL as before_5yr_date_ct, - NULL as before_20yr_date_ct, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, NULL AS before_100yr_date_ct, - NULL as within_1yr_date_ct, - NULL as within_1mo_date_ct, - NULL as future_date_ct, - NULL as distant_future_date_ct, - NULL as date_days_present, - NULL as date_weeks_present, - NULL as date_months_present, - -12_B: SUM(CAST(`{COL_NAME}` AS INT64)) AS boolean_true_ct, -12_else: NULL as boolean_true_ct, - -14_A: | + NULL AS within_1yr_date_ct, + NULL AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST(`{COL_NAME}` AS INT64)) AS boolean_true_ct, +-- TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_type_A ( SELECT COUNT(DISTINCT REGEXP_REPLACE( @@ -229,29 +233,29 @@ )) AS pattern_ct FROM `target_table` WHERE `{COL_NAME}` > ' ' - ) as distinct_pattern_ct, + ) AS distinct_pattern_ct, SUM(CAST(SIGN(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', ''))) AS INT64)) AS embedded_space_ct, AVG(CAST(LENGTH(TRIM(`{COL_NAME}`)) - LENGTH(REPLACE(TRIM(`{COL_NAME}`), ' ', '')) AS FLOAT64)) AS avg_embedded_spaces, -14_else: NULL as distinct_pattern_ct, - NULL as embedded_space_ct, - NULL as avg_embedded_spaces, - -16_all: " '{PROFILE_RUN_ID}' as 
profile_run_id " - -98_all: ' FROM target_table' - -99_N: | - , - (SELECT - PERCENTILE_CONT(`{COL_NAME}`, 0.25) OVER() AS pct_25, - PERCENTILE_CONT(`{COL_NAME}`, 0.50) OVER() AS pct_50, - PERCENTILE_CONT(`{COL_NAME}`, 0.75) OVER() AS pct_75 - FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile -99_N_sampling: | +-- TG-ELSE + NULL AS distinct_pattern_ct, + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id + FROM target_table +-- TG-IF is_N_sampling , (SELECT APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(25)] AS pct_25, APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(50)] AS pct_50, APPROX_QUANTILES(`{COL_NAME}`, 100)[OFFSET(75)] AS pct_75 FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile -99_else: ; +-- TG-ENDIF +-- TG-IF is_N_no_sampling + , + (SELECT + PERCENTILE_CONT(`{COL_NAME}`, 0.25) OVER() AS pct_25, + PERCENTILE_CONT(`{COL_NAME}`, 0.50) OVER() AS pct_50, + PERCENTILE_CONT(`{COL_NAME}`, 0.75) OVER() AS pct_75 + FROM `{DATA_SCHEMA}.{DATA_TABLE}` LIMIT 1) pctile +-- TG-ENDIF diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml b/testgen/template/flavors/databricks/profiling/project_profiling_query.sql similarity index 67% rename from testgen/template/flavors/databricks/profiling/project_profiling_query.yaml rename to testgen/template/flavors/databricks/profiling/project_profiling_query.sql index 2fc9350d..407901ba 100644 --- a/testgen/template/flavors/databricks/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/databricks/profiling/project_profiling_query.sql @@ -1,18 +1,14 @@ ---- -01_sampling: | - WITH target_table AS ( - SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) - ) - SELECT -01_else: | - WITH target_table AS ( - SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` - ) - SELECT -01_all: | - {CONNECTION_ID} as connection_id, - '{PROJECT_CODE}' as project_code, - '{TABLE_GROUPS_ID}' as table_groups_id, +WITH 
target_table AS ( +-- TG-IF do_sample + SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) +-- TG-ELSE + SELECT * FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` +-- TG-ENDIF +) +SELECT + {CONNECTION_ID} AS connection_id, + '{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, '{DATA_SCHEMA}' AS schema_name, '{RUN_DATE}' AS run_date, '{DATA_TABLE}' AS table_name, @@ -22,49 +18,54 @@ '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, - -02_X: | COUNT(`{COL_NAME}`) AS value_ct, COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -02_else: | - COUNT(`{COL_NAME}`) AS value_ct, - COUNT(DISTINCT `{COL_NAME}`) AS distinct_value_ct, - SUM(CASE WHEN `{COL_NAME}` IS NULL THEN 1 ELSE 0 END) AS null_value_ct, - -03_ADN: MIN(LEN(`{COL_NAME}`)) AS min_length, - MAX(LEN(`{COL_NAME}`)) AS max_length, +-- TG-IF is_type_ADN + MIN(LEN(`{COL_NAME}`)) AS min_length, + MAX(LEN(`{COL_NAME}`)) AS max_length, AVG(CAST(NULLIF(LEN(`{COL_NAME}`), 0) AS FLOAT)) AS avg_length, -03_else: NULL as min_length, - NULL as max_length, - NULL as avg_length, - -04_A: SUM(CASE +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + SUM(CASE WHEN LTRIM(RTRIM(`{COL_NAME}`)) RLIKE '0([.]0*)' THEN 1 ELSE 0 - END) AS zero_value_ct, -04_N: CAST(SUM( 1 - ABS(SIGN(`{COL_NAME}`)))AS BIGINT ) AS zero_value_ct, -04_else: NULL as zero_value_ct, - -05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-',REPEAT(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, + END) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_N + CAST(SUM( 1 - ABS(SIGN(`{COL_NAME}`)))AS BIGINT ) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(DISTINCT UPPER(REPLACE(TRANSLATE(`{COL_NAME}`,' '''',.-',REPEAT(' ', LEN(' '''',.-'))),' ',''))) 
AS distinct_std_value_ct, SUM(CASE WHEN `{COL_NAME}` = '' THEN 1 - ELSE 0 - END) AS zero_length_ct, - SUM( CASE - WHEN `{COL_NAME}` BETWEEN ' !' AND '!' THEN 1 - ELSE 0 - END ) AS lead_space_ct, - SUM( CASE WHEN `{COL_NAME}` LIKE '"%"' OR `{COL_NAME}` LIKE '\'%\'' THEN 1 ELSE 0 END ) as quoted_value_ct, - SUM( CASE WHEN `{COL_NAME}` RLIKE '[0-9]' THEN 1 ELSE 0 END ) as includes_digit_ct, - SUM( CASE - WHEN `{COL_NAME}` IN ('.', '?') OR `{COL_NAME}` RLIKE '^\s+$' THEN 1 + ELSE 0 + END) AS zero_length_ct, + SUM(CASE + WHEN `{COL_NAME}` BETWEEN ' !' AND '!' THEN 1 + ELSE 0 + END) AS lead_space_ct, + SUM(CASE WHEN `{COL_NAME}` LIKE '"%"' OR `{COL_NAME}` LIKE '\'%\'' THEN 1 ELSE 0 END) AS quoted_value_ct, + SUM(CASE WHEN `{COL_NAME}` RLIKE '[0-9]' THEN 1 ELSE 0 END) AS includes_digit_ct, + SUM(CASE + WHEN LEN(`{COL_NAME}`) > 0 + AND ((LEN(REPLACE(`{COL_NAME}`, '.', ''))= 0 ) + OR (LEN(REPLACE(`{COL_NAME}`, '-', ''))= 0 ) + OR (LEN(REPLACE(`{COL_NAME}`, '?', ''))= 0 ) + OR (LEN(REPLACE(`{COL_NAME}`, ' ', ''))= 0 ) + ) THEN 1 WHEN LEN(`{COL_NAME}`) > 1 - AND ( LOWER(`{COL_NAME}`) LIKE '%..%' OR LOWER(`{COL_NAME}`) RLIKE '--' - OR (LEN(REPLACE(`{COL_NAME}`, '0', ''))= 0 ) + AND ((LEN(REPLACE(`{COL_NAME}`, '0', ''))= 0 ) OR (LEN(REPLACE(`{COL_NAME}`, '9', ''))= 0 ) OR (LEN(REPLACE(LOWER(`{COL_NAME}`), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER(`{COL_NAME}`), 'z', ''))= 0 ) - ) THEN 1 + ) THEN 1 WHEN LOWER(`{COL_NAME}`) IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER(`{COL_NAME}`) IN ('(blank)','(error)','(missing)','(tbd)', @@ -72,9 +73,9 @@ WHEN LOWER(`{COL_NAME}`) IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 ELSE 0 - END ) AS filled_value_ct, - LEFT(MIN(NULLIF(`{COL_NAME}`, '')), 100) AS min_text, - LEFT(MAX(NULLIF(`{COL_NAME}`, '')), 100) AS max_text, + END) AS filled_value_ct, + LEFT(MIN(NULLIF(`{COL_NAME}`, '')), 100) AS min_text, + LEFT(MAX(NULLIF(`{COL_NAME}`, '')), 100) AS 
max_text, SUM(CASE WHEN TRANSLATE(`{COL_NAME}`, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = `{COL_NAME}` THEN 0 WHEN TRANSLATE(`{COL_NAME}`, 'abcdefghijklmnopqrstuvwxyz', ' ') = `{COL_NAME}` THEN 1 @@ -89,12 +90,12 @@ WHEN TRANSLATE(`{COL_NAME}`, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = `{COL_NAME}` THEN 1 ELSE 0 END) AS non_alpha_ct, - COUNT( CASE WHEN TRANSLATE(`{COL_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COL_NAME}` THEN 1 END) as non_printing_ct, - SUM(<%IS_NUM;LEFT(`{COL_NAME}`, 31)%>) AS numeric_ct, - SUM(<%IS_DATE;LEFT(`{COL_NAME}`, 26)%>) AS date_ct, + COUNT(CASE WHEN TRANSLATE(`{COL_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COL_NAME}` THEN 1 END) AS non_printing_ct, + SUM(<%IS_NUM;LEFT(`{COL_NAME}`, 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT(`{COL_NAME}`, 26)%>) AS date_ct, CASE WHEN CAST(SUM( CASE WHEN UPPER(`{COL_NAME}`) RLIKE '[1-9]{1,5} [A-Z]+ .*' - THEN 1 END ) as FLOAT) /CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.8 THEN 'STREET_ADDR' + THEN 1 END ) AS FLOAT) /CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.8 THEN 'STREET_ADDR' WHEN CAST(SUM(CASE WHEN `{COL_NAME}` IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'STATE_USA' WHEN CAST(SUM( CASE WHEN `{COL_NAME}` RLIKE '\\+1\\s*\\(?\\d{3}\\)?[-. ]*\\d{3}[-. 
]*\\d{4}' @@ -123,30 +124,32 @@ OR `{COL_NAME}` LIKE '% but %' OR `{COL_NAME}` LIKE '% or %' OR `{COL_NAME}` LIKE '% yet %' ) - AND COALESCE(CAST(LEN(`{COL_NAME}`) - LEN(REPLACE(`{COL_NAME}`, ',', '')) as FLOAT) - / CAST(NULLIF(LEN(`{COL_NAME}`) - LEN(REPLACE(`{COL_NAME}`, ' ', '')), 0) as FLOAT), 1) > 0.6 + AND COALESCE(CAST(LEN(`{COL_NAME}`) - LEN(REPLACE(`{COL_NAME}`, ',', '')) AS FLOAT) + / CAST(NULLIF(LEN(`{COL_NAME}`) - LEN(REPLACE(`{COL_NAME}`, ' ', '')), 0) AS FLOAT), 1) > 0.6 THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.8 THEN 'DELIMITED_DATA' WHEN CAST(SUM ( CASE WHEN `{COL_NAME}` RLIKE '[0-8][0-9][0-9][- ][0-9][0-9][- ][0-9][0-9][0-9][0-9]' AND LEFT(`{COL_NAME}`, 3) NOT BETWEEN '734' AND '749' AND LEFT(`{COL_NAME}`, 3) <> '666' THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'SSN' - END as std_pattern_match, -05_else: NULL as distinct_std_value_ct, - NULL as zero_length_ct, - NULL as lead_space_ct, - NULL as quoted_value_ct, - NULL as includes_digit_ct, - NULL as filled_value_ct, - NULL as min_text, - NULL as max_text, - NULL as upper_case_ct, - NULL as lower_case_ct, - NULL as non_alpha_ct, - NULL as non_printing_ct, - NULL as numeric_ct, - NULL as date_ct, - NULL as std_pattern_match, - -06_A: (SELECT CONCAT_WS(' | ', collect_list(ct_pattern)) + END AS std_pattern_match, +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, + NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT CONCAT_WS(' | ', collect_list(ct_pattern)) FROM ( SELECT TRANSLATE( @@ -165,33 +168,39 @@ ORDER BY ct DESC LIMIT 5 )) AS top_patterns, -06_else: NULL as top_patterns, - -08_N: MIN(`{COL_NAME}`) AS min_value, - MIN(CASE WHEN 
`{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0, - MAX(`{COL_NAME}`) AS max_value, +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF +-- TG-IF is_type_N + MIN(`{COL_NAME}`) AS min_value, + MIN(CASE WHEN `{COL_NAME}` > 0 THEN `{COL_NAME}` ELSE NULL END) AS min_value_over_0, + MAX(`{COL_NAME}`) AS max_value, AVG(CAST(`{COL_NAME}` AS FLOAT)) AS avg_value, STDDEV_SAMP(CAST(`{COL_NAME}` AS FLOAT)) AS stdev_value, - MIN(pct_25) as percentile_25, - MIN(pct_50) as percentile_50, - MIN(pct_75) as percentile_75, -08_else: NULL as min_value, - NULL as min_value_over_0, - NULL as max_value, - NULL as avg_value, - NULL as stdev_value, - NULL as percentile_25, - NULL as percentile_50, - NULL as percentile_75, - -10_N_dec: SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) as fractional_sum, -10_else: NULL as fractional_sum, - -11_D: CASE + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +-- TG-ELSE + NULL AS min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(ROUND(ABS(MOD(`{COL_NAME}`, 1)), 5)) AS fractional_sum, +-- TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D + CASE WHEN MIN(`{COL_NAME}`) IS NULL THEN NULL - ELSE CASE WHEN MIN(`{COL_NAME}`) >= CAST('0001-01-01' as date) THEN MIN(`{COL_NAME}`) ELSE CAST('0001-01-01' as date) END - END as min_date, - MAX(`{COL_NAME}`) as max_date, + ELSE CASE WHEN MIN(`{COL_NAME}`) >= CAST('0001-01-01' AS date) THEN MIN(`{COL_NAME}`) ELSE CAST('0001-01-01' AS date) END + END AS min_date, + MAX(`{COL_NAME}`) AS max_date, SUM(CASE WHEN <%DATEDIFF_MONTH; `{COL_NAME}`; '{RUN_DATE}'::TIMESTAMP%> > 12 THEN 1 ELSE 0 @@ -223,53 +232,57 @@ WHEN <%DATEDIFF_MONTH; `{COL_NAME}`; '{RUN_DATE}'::TIMESTAMP%> > 240 THEN 1 ELSE 0 END) AS distant_future_date_ct, - COUNT(DISTINCT <%DATEDIFF_DAY; `{COL_NAME}`; 
'{RUN_DATE}'::DATE%>) as date_days_present, - COUNT(DISTINCT <%DATEDIFF_WEEK; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) as date_weeks_present, - COUNT(DISTINCT <%DATEDIFF_MONTH; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) as date_months_present, -11_else: NULL as min_date, - NULL as max_date, - NULL as before_1yr_date_ct, - NULL as before_5yr_date_ct, - NULL as before_20yr_date_ct, + COUNT(DISTINCT <%DATEDIFF_DAY; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) AS date_days_present, + COUNT(DISTINCT <%DATEDIFF_WEEK; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) AS date_weeks_present, + COUNT(DISTINCT <%DATEDIFF_MONTH; `{COL_NAME}`; '{RUN_DATE}'::DATE%>) AS date_months_present, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, NULL AS before_100yr_date_ct, - NULL as within_1yr_date_ct, - NULL as within_1mo_date_ct, - NULL as future_date_ct, - NULL as distant_future_date_ct, - NULL as date_days_present, - NULL as date_weeks_present, - NULL as date_months_present, - -12_B: SUM(CAST(`{COL_NAME}` AS INTEGER)) AS boolean_true_ct, -12_else: NULL as boolean_true_ct, - -14_A: ( SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`, + NULL AS within_1yr_date_ct, + NULL AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST(`{COL_NAME}` AS INTEGER)) AS boolean_true_ct, +-- TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT COUNT(DISTINCT TRANSLATE(`{COL_NAME}`, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) ) AS pattern_ct FROM target_table - WHERE `{COL_NAME}` > ' ' ) AS distinct_pattern_ct, + WHERE `{COL_NAME}` > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ',''))) AS BIGINT)) AS embedded_space_ct, - 
AVG(CAST(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ','')) AS FLOAT)) AS avg_embedded_spaces, -14_else: NULL as distinct_pattern_ct, - NULL as embedded_space_ct, - NULL as avg_embedded_spaces, - -16_all: " '{PROFILE_RUN_ID}' as profile_run_id" - -98_all: ' FROM target_table' - -99_N: | - , (SELECT + AVG(CAST(LEN(TRIM(`{COL_NAME}`)) - LEN(REPLACE(TRIM(`{COL_NAME}`),' ','')) AS FLOAT)) AS avg_embedded_spaces, +-- TG-ELSE + NULL AS distinct_pattern_ct, + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id + FROM target_table +-- TG-IF is_N_sampling + , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` LIMIT 1) pctile -99_N_sampling: | - , (SELECT + FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile +-- TG-ENDIF +-- TG-IF is_N_no_sampling + , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY `{COL_NAME}`) OVER () AS pct_75 - FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) LIMIT 1 ) pctile -99_else: ' ' + FROM `{DATA_SCHEMA}`.`{DATA_TABLE}` LIMIT 1) pctile +-- TG-ENDIF diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query.sql similarity index 57% rename from testgen/template/flavors/mssql/profiling/project_profiling_query.yaml rename to testgen/template/flavors/mssql/profiling/project_profiling_query.sql index 77ec98c8..89c5c42b 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query.yaml +++ 
b/testgen/template/flavors/mssql/profiling/project_profiling_query.sql @@ -1,18 +1,14 @@ ---- -01_sampling: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK) - ) - SELECT -01_else: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) - ) - SELECT -01_all: | - {CONNECTION_ID} as connection_id, - '{PROJECT_CODE}' as project_code, - '{TABLE_GROUPS_ID}' as table_groups_id, +WITH target_table AS ( +-- TG-IF do_sample + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK) +-- TG-ELSE + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK) +-- TG-ENDIF +) +SELECT + {CONNECTION_ID} AS connection_id, + '{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, '{DATA_SCHEMA}' AS schema_name, '{RUN_DATE}' AS run_date, '{DATA_TABLE}' AS table_name, @@ -22,49 +18,61 @@ '{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, - -02_X: | +-- TG-IF is_type_X COUNT(CASE WHEN "{COL_NAME}" IS NOT NULL THEN 1 END) AS value_ct, NULL AS distinct_value_ct, - SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -02_else: | +-- TG-ELSE COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, +-- TG-ENDIF SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, - -03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, - MAX(LEN("{COL_NAME}")) AS max_length, +-- TG-IF is_type_ADN + MIN(LEN("{COL_NAME}")) AS min_length, + MAX(LEN("{COL_NAME}")) AS max_length, AVG(CAST(NULLIF(LEN("{COL_NAME}"), 0) AS FLOAT)) AS avg_length, -03_else: NULL as min_length, - NULL as max_length, - NULL as avg_length, - -04_A: SUM(CASE +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + SUM(CASE WHEN LTRIM(RTRIM("{COL_NAME}")) LIKE '0([.]0*)' THEN 1 ELSE 0 - END) AS zero_value_ct, 
-04_N: CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")))AS BIGINT ) AS zero_value_ct, -04_else: NULL as zero_value_ct, - -05_A: COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ',''))) as distinct_std_value_ct, + END) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_N + CAST(SUM( 1 - ABS(SIGN("{COL_NAME}")))AS BIGINT ) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(DISTINCT UPPER(REPLACE(TRANSLATE("{COL_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ',''))) AS distinct_std_value_ct, SUM(CASE WHEN "{COL_NAME}" = '' THEN 1 - ELSE 0 - END) AS zero_length_ct, - SUM( CASE - WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 - ELSE 0 - END ) AS lead_space_ct, - SUM( CASE WHEN "{COL_NAME}" LIKE '"%"' OR "{COL_NAME}" LIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct, - SUM( CASE WHEN "{COL_NAME}" LIKE '%[0-9]%' THEN 1 ELSE 0 END ) as includes_digit_ct, - SUM( CASE - WHEN "{COL_NAME}" IN ('.', '?') OR "{COL_NAME}" LIKE ' ' THEN 1 + ELSE 0 + END) AS zero_length_ct, + SUM(CASE + WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 + ELSE 0 + END) AS lead_space_ct, + SUM(CASE WHEN "{COL_NAME}" LIKE '"%"' OR "{COL_NAME}" LIKE '''%''' THEN 1 ELSE 0 END ) AS quoted_value_ct, + SUM(CASE WHEN "{COL_NAME}" LIKE '%[0-9]%' THEN 1 ELSE 0 END ) AS includes_digit_ct, + SUM(CASE + WHEN LEN("{COL_NAME}") > 0 + AND ((LEN(REPLACE("{COL_NAME}", '.', ''))= 0 ) + OR (LEN(REPLACE("{COL_NAME}", '-', ''))= 0 ) + OR (LEN(REPLACE("{COL_NAME}", '?', ''))= 0 ) + ) THEN 1 + WHEN DATALENGTH("{COL_NAME}") > 0 + AND LEN(LTRIM(RTRIM("{COL_NAME}")))= 0 + THEN 1 WHEN LEN("{COL_NAME}") > 1 - AND ( LOWER("{COL_NAME}") LIKE '%..%' OR LOWER("{COL_NAME}") LIKE '%--%' - OR (LEN(REPLACE("{COL_NAME}", '0', ''))= 0 ) + AND ((LEN(REPLACE("{COL_NAME}", '0', ''))= 0 ) OR (LEN(REPLACE("{COL_NAME}", '9', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COL_NAME}"), 'x', ''))= 0 ) OR (LEN(REPLACE(LOWER("{COL_NAME}"), 'z', ''))= 0 ) - ) THEN 1 + ) THEN 1 WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', @@ -72,40 +80,40 @@ WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 ELSE 0 - END ) AS filled_value_ct, - LEFT(MIN(NULLIF("{COL_NAME}", '') COLLATE Latin1_General_BIN ), 100) AS min_text, - LEFT(MAX(NULLIF("{COL_NAME}", '') COLLATE Latin1_General_BIN ), 100) AS max_text, + END) AS filled_value_ct, + LEFT(MIN(NULLIF("{COL_NAME}", '') COLLATE Latin1_General_BIN ), 100) AS min_text, + LEFT(MAX(NULLIF("{COL_NAME}", '') COLLATE Latin1_General_BIN ), 100) AS max_text, SUM(CASE - WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 0 - WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1 + WHEN "{COL_NAME}" COLLATE Latin1_General_BIN = UPPER("{COL_NAME}") + AND "{COL_NAME}" COLLATE Latin1_General_BIN <> LOWER("{COL_NAME}") THEN 1 ELSE 0 END) AS upper_case_ct, SUM(CASE - 
WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 0 - WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', ' ') = "{COL_NAME}" THEN 1 + WHEN "{COL_NAME}" COLLATE Latin1_General_BIN = LOWER("{COL_NAME}") + AND "{COL_NAME}" COLLATE Latin1_General_BIN <> UPPER("{COL_NAME}") THEN 1 ELSE 0 END) AS lower_case_ct, SUM(CASE WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1 ELSE 0 END) AS non_alpha_ct, - COUNT( CASE WHEN TRANSLATE("{COL_NAME}", NCHAR(160), 'X') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, - SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, - SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, + COUNT(CASE WHEN TRANSLATE("{COL_NAME}", NCHAR(160), 'X') <> "{COL_NAME}" THEN 1 END) AS non_printing_ct, + SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, CASE - WHEN CAST(SUM( CASE WHEN UPPER("{COL_NAME}") LIKE '[1-9]% [A-Z]% %' + WHEN CAST(SUM(CASE WHEN UPPER("{COL_NAME}") LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COL_NAME}") BETWEEN 2 and 6 THEN 1 - END ) as FLOAT) /CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'STREET_ADDR' + END) AS FLOAT) /CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'STREET_ADDR' WHEN CAST(SUM(CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'STATE_USA' - WHEN CAST(SUM( CASE WHEN ("{COL_NAME}" LIKE '[+]1%[0-9][0-9][0-9]%[-. ][0-9][0-9][0-9]%[0-9][0-9][0-9][0-9,0-9,0-9,0-9,0-9,0-9]' AND "{COL_NAME}" NOT LIKE '%[^0-9+()-]%') + WHEN CAST(SUM(CASE WHEN ("{COL_NAME}" LIKE '[+]1%[0-9][0-9][0-9]%[-. 
][0-9][0-9][0-9]%[0-9][0-9][0-9][0-9,0-9,0-9,0-9,0-9,0-9]' AND "{COL_NAME}" NOT LIKE '%[^0-9+()-]%') OR ("{COL_NAME}" LIKE '[+]1%[0-9][0-9][0-9][-. ][0-9][0-9][0-9][-. ][0-9][0-9][0-9][0-9]' AND "{COL_NAME}" NOT LIKE '%[^0-9+-]%') THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'PHONE_USA' - WHEN CAST(SUM( CASE WHEN "{COL_NAME}" LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' + WHEN CAST(SUM(CASE WHEN "{COL_NAME}" LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'EMAIL' - WHEN CAST(SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') + WHEN CAST(SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'ZIP_USA' - WHEN CAST(SUM( CASE WHEN "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS NOT LIKE ' %' + WHEN CAST(SUM(CASE WHEN "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS NOT LIKE ' %' AND "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS LIKE '[a-z0-9 _-]%' AND ("{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS LIKE '%.txt' OR "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS LIKE '%.csv' @@ -115,9 +123,9 @@ OR "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS LIKE '%.pdf' OR "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS LIKE '%.xlsx') THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'FILE_NAME' - WHEN CAST(SUM( CASE WHEN "{COL_NAME}" LIKE '[0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9]' + WHEN CAST(SUM(CASE WHEN "{COL_NAME}" LIKE '[0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9]' THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'CREDIT_CARD' - WHEN CAST(SUM( CASE WHEN ( "{COL_NAME}" LIKE '%,%,%,%' + WHEN CAST(SUM(CASE WHEN ( "{COL_NAME}" LIKE '%,%,%,%' OR 
"{COL_NAME}" LIKE '%|%|%|%' OR "{COL_NAME}" LIKE '%^%^%^%' OR "{COL_NAME}" LIKE '%' + CHAR(9) + '%' + CHAR(9) + '%' + CHAR(9) + '%' ) @@ -125,37 +133,39 @@ OR "{COL_NAME}" LIKE '% but %' OR "{COL_NAME}" LIKE '% or %' OR "{COL_NAME}" LIKE '% yet %' ) - AND ISNULL(CAST(LEN("{COL_NAME}") - LEN(REPLACE("{COL_NAME}", ',', '')) as FLOAT) - / CAST(NULLIF(LEN("{COL_NAME}") - LEN(REPLACE("{COL_NAME}", ' ', '')), 0) as FLOAT), 1) > 0.6 + AND ISNULL(CAST(LEN("{COL_NAME}") - LEN(REPLACE("{COL_NAME}", ',', '')) AS FLOAT) + / CAST(NULLIF(LEN("{COL_NAME}") - LEN(REPLACE("{COL_NAME}", ' ', '')), 0) AS FLOAT), 1) > 0.6 THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'DELIMITED_DATA' - WHEN CAST(SUM ( CASE WHEN "{COL_NAME}" LIKE '[0-8][0-9][0-9][- ][0-9][0-9][- ][0-9][0-9][0-9][0-9]' + WHEN CAST(SUM (CASE WHEN "{COL_NAME}" LIKE '[0-8][0-9][0-9][- ][0-9][0-9][- ][0-9][0-9][0-9][0-9]' AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'SSN' - END as std_pattern_match, -05_else: NULL as distinct_std_value_ct, - NULL as zero_length_ct, - NULL as lead_space_ct, - NULL as quoted_value_ct, - NULL as includes_digit_ct, - NULL as filled_value_ct, - NULL as min_text, - NULL as max_text, - NULL as upper_case_ct, - NULL as lower_case_ct, - NULL as non_alpha_ct, - NULL as non_printing_ct, - NULL as numeric_ct, - NULL as date_ct, - NULL as std_pattern_match, - -06_A: ( SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + END AS std_pattern_match, +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, + NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, 
+-- TG-ENDIF +-- TG-IF is_type_A + (SELECT LEFT(STRING_AGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) + ' | ' + pattern AS pattern, COUNT(*) AS ct - FROM ( SELECT TRANSLATE("{COL_NAME}" COLLATE Latin1_General_BIN, + FROM (SELECT TRANSLATE("{COL_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) - AS pattern + AS pattern FROM target_table WHERE "{COL_NAME}" > ' ' AND ((SELECT MAX(LEN("{COL_NAME}")) FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH})) p @@ -163,33 +173,39 @@ HAVING pattern > ' ' ORDER BY COUNT(*) DESC ) ps) AS top_patterns, -06_else: NULL as top_patterns, - -08_N: MIN("{COL_NAME}") AS min_value, - MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, - MAX("{COL_NAME}") AS max_value, - AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, - STDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, - MIN(pct_25) as percentile_25, - MIN(pct_50) as percentile_50, - MIN(pct_75) as percentile_75, -08_else: NULL as min_value, - NULL as min_value_over_0, - NULL as max_value, - NULL as avg_value, - NULL as stdev_value, - NULL as percentile_25, - NULL as percentile_50, - NULL as percentile_75, - -10_N_dec: SUM(ROUND(ABS(("{COL_NAME}" % 1)), 5)) as fractional_sum, -10_else: NULL as fractional_sum, - -11_D: CASE +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF +-- TG-IF is_type_N + MIN("{COL_NAME}") AS min_value, + MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MAX("{COL_NAME}") AS max_value, + AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, + STDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +-- TG-ELSE + NULL AS min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + 
NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(ROUND(ABS(("{COL_NAME}" % 1)), 5)) AS fractional_sum, +-- TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D + CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL - ELSE CASE WHEN MIN("{COL_NAME}") >= CAST('0001-01-01' as date) THEN MIN("{COL_NAME}") ELSE CAST('0001-01-01' as date) END - END as min_date, - MAX("{COL_NAME}") as max_date, + ELSE CASE WHEN MIN("{COL_NAME}") >= CAST('0001-01-01' AS date) THEN MIN("{COL_NAME}") ELSE CAST('0001-01-01' AS date) END + END AS min_date, + MAX("{COL_NAME}") AS max_date, SUM(CASE WHEN DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 ELSE 0 @@ -221,53 +237,57 @@ WHEN DATEDIFF(month, '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 ELSE 0 END) AS distant_future_date_ct, - COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, - COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, - COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, -11_else: NULL as min_date, - NULL as max_date, - NULL as before_1yr_date_ct, - NULL as before_5yr_date_ct, - NULL as before_20yr_date_ct, + COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) AS date_days_present, + COUNT(DISTINCT DATEDIFF(week, DATEADD(day, -1, "{COL_NAME}"), DATEADD(day, -1, CAST('{RUN_DATE}' AS DATE)) ) ) AS date_weeks_present, + COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) AS date_months_present, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, NULL AS before_100yr_date_ct, - NULL as within_1yr_date_ct, - NULL as within_1mo_date_ct, - NULL as future_date_ct, - NULL as distant_future_date_ct, - NULL as date_days_present, - NULL as date_weeks_present, - NULL as date_months_present, - -12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, 
-12_else: NULL as boolean_true_ct, - -14_A: ( SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" COLLATE Latin1_General_BIN, + NULL AS within_1yr_date_ct, + NULL AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +-- TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT COUNT(DISTINCT TRANSLATE("{COL_NAME}" COLLATE Latin1_General_BIN, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', 'aaaaaaaaaaaaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAANNNNNNNNNN' ) ) AS pattern_ct FROM target_table - WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, + WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(CAST(SIGN(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ',''))) AS BIGINT)) AS embedded_space_ct, - AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces, -14_else: NULL as distinct_pattern_ct, - NULL as embedded_space_ct, - NULL as avg_embedded_spaces, - -16_all: " '{PROFILE_RUN_ID}' as profile_run_id" - -98_all: ' FROM target_table ' - -99_N: | - , (SELECT TOP 1 + AVG(CAST(LEN(RTRIM(LTRIM("{COL_NAME}"))) - LEN(REPLACE(RTRIM(LTRIM("{COL_NAME}")),' ','')) AS FLOAT)) AS avg_embedded_spaces, +-- TG-ELSE + NULL AS distinct_pattern_ct, + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id + FROM target_table +-- TG-IF is_N_sampling + , (SELECT TOP 1 PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) pctile -99_N_sampling: | - , (SELECT TOP 1 + FROM 
"{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile +-- TG-ENDIF +-- TG-IF is_N_no_sampling + , (SELECT TOP 1 PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE ({SAMPLE_PERCENT_CALC} PERCENT) WITH (NOLOCK)) pctile -99_else: ' ' + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WITH (NOLOCK)) pctile +-- TG-ENDIF diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query.sql similarity index 67% rename from testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml rename to testgen/template/flavors/postgresql/profiling/project_profiling_query.sql index 67156d77..8ad3a999 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query.sql @@ -1,18 +1,14 @@ ---- -01_sampling: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) - ) - SELECT -01_else: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - ) - SELECT -01_all: | - {CONNECTION_ID} as connection_id, - '{PROJECT_CODE}' as project_code, - '{TABLE_GROUPS_ID}' as table_groups_id, +WITH target_table AS ( +-- TG-IF do_sample + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) +-- TG-ELSE + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-ENDIF +) +SELECT + {CONNECTION_ID} AS connection_id, + '{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, '{DATA_SCHEMA}' AS schema_name, '{RUN_DATE}' AS run_date, '{DATA_TABLE}' AS table_name, @@ -22,43 +18,43 @@ 
'{DB_DATA_TYPE}' AS db_data_type, '{COL_GEN_TYPE}' AS general_type, COUNT(*) AS record_ct, - -02_X: | COUNT("{COL_NAME}") AS value_ct, COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, -02_else: | - COUNT("{COL_NAME}") AS value_ct, - COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, - SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, - -03_ADN: MIN(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS min_length, - MAX(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS max_length, +-- TG-IF is_type_ADN + MIN(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS min_length, + MAX(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS max_length, AVG(NULLIF(LENGTH(CAST("{COL_NAME}" AS TEXT)), 0)::FLOAT) AS avg_length, -03_else: NULL as min_length, - NULL as max_length, - NULL as avg_length, - -04_A: SUM(CASE +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + SUM(CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 ELSE 0 - END) AS zero_value_ct, -04_N: SUM( 1 - ABS(SIGN("{COL_NAME}"::NUMERIC)) )::BIGINT AS zero_value_ct, -04_else: NULL as zero_value_ct, - -05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, + END) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_N + SUM( 1 - ABS(SIGN("{COL_NAME}"::NUMERIC)) )::BIGINT AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) AS distinct_std_value_ct, SUM(CASE WHEN "{COL_NAME}" = '' THEN 1 ELSE 0 - END) AS zero_length_ct, + END) AS zero_length_ct, SUM( CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 - END ) AS lead_space_ct, - SUM( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END ) as quoted_value_ct, - SUM( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 ELSE 0 END ) as includes_digit_ct, + END ) AS lead_space_ct, + SUM( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END ) AS quoted_value_ct, + SUM( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 ELSE 0 END ) AS includes_digit_ct, SUM( CASE - WHEN "{COL_NAME}" IN ('.', '?', ' ') THEN 1 - WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 + WHEN LOWER("{COL_NAME}") SIMILAR TO '(.{1,}|-{1,}|\?{1,}|\s{1,}|0{2,}|9{2,}|x{2,}|z{2,})' THEN 1 WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', @@ -66,9 +62,9 @@ WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 ELSE 0 - END ) AS filled_value_ct, - LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, - LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, + END ) AS filled_value_ct, + LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, + LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, SUM(CASE WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 0 WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 @@ -83,9 +79,9 @@ WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', '') = "{COL_NAME}" THEN 1 ELSE 0 END) AS non_alpha_ct, - COUNT( CASE WHEN TRANSLATE("{COL_NAME}", E'\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, - SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, - SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, + COUNT( CASE WHEN TRANSLATE("{COL_NAME}", 
E'\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) AS non_printing_ct, + SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, CASE WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR' @@ -107,24 +103,26 @@ WHEN SUM ( CASE WHEN "{COL_NAME}" SIMILAR TO '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$' AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' - END as std_pattern_match, -05_else: NULL as distinct_std_value_ct, - NULL as zero_length_ct, - NULL as lead_space_ct, - NULL as quoted_value_ct, - NULL as includes_digit_ct, - NULL as filled_value_ct, - NULL as min_text, - NULL as max_text, - NULL as upper_case_ct, - NULL as lower_case_ct, - NULL as non_alpha_ct, - NULL as non_printing_ct, - NULL as numeric_ct, - NULL as date_ct, - NULL as std_pattern_match, - -06_A: ( SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DESC) , 1000) AS concat_pats + END AS std_pattern_match, +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, + NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT LEFT(STRING_AGG(pattern, ' | ' ORDER BY ct DESC) , 1000) AS concat_pats FROM ( SELECT CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, COUNT(*) AS ct @@ -140,33 +138,39 @@ ORDER BY COUNT(*) DESC LIMIT 5 ) ps) AS top_patterns, -06_else: NULL as top_patterns, - -08_N: MIN("{COL_NAME}") AS min_value, - MIN(CASE WHEN 
"{COL_NAME}"::NUMERIC > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, - MAX("{COL_NAME}") AS max_value, - AVG(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS avg_value, - STDDEV(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS stdev_value, - MIN(pct_25) as percentile_25, - MIN(pct_50) as percentile_50, - MIN(pct_75) as percentile_75, -08_else: NULL as min_value, - NULL as min_value_over_0, - NULL as max_value, - NULL as avg_value, - NULL as stdev_value, - NULL as percentile_25, - NULL as percentile_50, - NULL as percentile_75, - -10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -10_else: NULL as fractional_sum, - -11_D: CASE +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF +-- TG-IF is_type_N + MIN("{COL_NAME}") AS min_value, + MIN(CASE WHEN "{COL_NAME}"::NUMERIC > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MAX("{COL_NAME}") AS max_value, + AVG(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS avg_value, + STDDEV(CAST("{COL_NAME}"::NUMERIC AS FLOAT)) AS stdev_value, + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +-- TG-ELSE + NULL AS min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) AS fractional_sum, +-- TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D + CASE WHEN MIN("{COL_NAME}") IS NULL THEN NULL ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') - END as min_date, - MAX("{COL_NAME}") as max_date, + END AS min_date, + MAX("{COL_NAME}") AS max_date, SUM(CASE WHEN <%DATEDIFF_MONTH;"{COL_NAME}";'{RUN_DATE}'%> > 12 THEN 1 ELSE 0 @@ -198,27 +202,31 @@ WHEN <%DATEDIFF_MONTH;'{RUN_DATE}';"{COL_NAME}"%> > 240 THEN 1 ELSE 0 END) AS distant_future_date_ct, - COUNT(DISTINCT <%DATEDIFF_DAY;"{COL_NAME}";'{RUN_DATE}'%>) as date_days_present, - COUNT(DISTINCT 
<%DATEDIFF_WEEK;"{COL_NAME}";'{RUN_DATE}'%>) as date_weeks_present, - COUNT(DISTINCT <%DATEDIFF_MONTH;"{COL_NAME}";'{RUN_DATE}'%>) as date_months_present, -11_else: NULL as min_date, - NULL as max_date, - NULL as before_1yr_date_ct, - NULL as before_5yr_date_ct, - NULL as before_20yr_date_ct, + COUNT(DISTINCT <%DATEDIFF_DAY;"{COL_NAME}";'{RUN_DATE}'%>) AS date_days_present, + COUNT(DISTINCT <%DATEDIFF_WEEK;"{COL_NAME}";'{RUN_DATE}'%>) AS date_weeks_present, + COUNT(DISTINCT <%DATEDIFF_MONTH;"{COL_NAME}";'{RUN_DATE}'%>) AS date_months_present, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, NULL AS before_100yr_date_ct, - NULL as within_1yr_date_ct, - NULL as within_1mo_date_ct, - NULL as future_date_ct, - NULL as distant_future_date_ct, - NULL as date_days_present, - NULL as date_weeks_present, - NULL as date_months_present, - -12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, -12_else: NULL as boolean_true_ct, - -14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( + NULL AS within_1yr_date_ct, + NULL AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +-- TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( "{COL_NAME}", '[a-z]', 'a', 'g'), '[A-Z]', 'A', 'g'), '[0-9]', 'N', 'g') @@ -226,25 +234,25 @@ FROM target_table WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g')))::BIGINT) AS embedded_space_ct, - AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces, -14_else: NULL as 
distinct_pattern_ct, - NULL as embedded_space_ct, - NULL as avg_embedded_spaces, - -16_all: " '{PROFILE_RUN_ID}' as profile_run_id" - -98_all: ' FROM target_table ' - -99_N: | + AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REGEXP_REPLACE(TRIM("{COL_NAME}"), ' ', '', 'g'))::FLOAT) AS avg_embedded_spaces, +-- TG-ELSE + NULL AS distinct_pattern_ct, + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id + FROM target_table +-- TG-IF is_N_sampling , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile -99_N_sampling: | + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile +-- TG-ENDIF +-- TG-IF is_N_no_sampling , (SELECT PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_25, PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_50, PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}"::NUMERIC) AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI ({SAMPLE_PERCENT_CALC}) REPEATABLE (64) LIMIT 1) pctile -99_else: ' ' + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile +-- TG-ENDIF diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query.sql b/testgen/template/flavors/redshift/profiling/project_profiling_query.sql new file mode 100644 index 00000000..db97da0b --- /dev/null +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query.sql @@ -0,0 +1,205 @@ +WITH target_table AS ( +-- TG-IF do_sample + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} +-- TG-ELSE + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-ENDIF +) +SELECT + {CONNECTION_ID} AS connection_id, + 
'{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, + '{DATA_SCHEMA}' AS schema_name, + '{RUN_DATE}' AS run_date, + '{DATA_TABLE}' AS table_name, + {COL_POS} AS position, + '{COL_NAME_SANITIZED}' AS column_name, + '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, + '{COL_GEN_TYPE}' AS general_type, + COUNT(*) AS record_ct, + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, +-- TG-IF is_type_ADN + MIN(LEN("{COL_NAME}")) AS min_length, + MAX(LEN("{COL_NAME}")) AS max_length, + AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_N + SUM(1 - ABS(SIGN("{COL_NAME}")))::BIGINT AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) AS distinct_std_value_ct, + COUNT(CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, + COUNT(CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 END) AS lead_space_ct, + COUNT(CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END) AS quoted_value_ct, + COUNT(CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END) AS includes_digit_ct, + COUNT(CASE + WHEN LOWER("{COL_NAME}") SIMILAR TO '(.{1,}|-{1,}|\\?{1,}|\\s{1,}|0{2,}|9{2,}|x{2,}|z{2,})' THEN 1 + WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', + 'n/a','#na','none','null','unknown') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', + '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', + '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 + END) AS filled_value_ct, + LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, + LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, + COUNT(CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, + COUNT(CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, + COUNT(CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, + COUNT(CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) AS non_printing_ct, + SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, + CASE + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR' + WHEN SUM(CASE WHEN "{COL_NAME}" IN 
('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'STATE_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'PHONE_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' + WHEN SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|(c|t|p)sv|dat|doc|docx|json|pdf|xlsx|xml)$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' + AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'DELIMITED_DATA' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$' + AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' + AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' + END AS std_pattern_match, +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, 
+ NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + FROM (SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, + COUNT(*) AS ct + FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') AS pattern + FROM target_table + WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + GROUP BY pattern + HAVING pattern > ' ' + ORDER BY COUNT(*) DESC) AS ps) AS top_patterns, +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF +-- TG-IF is_type_N + MIN("{COL_NAME}") AS min_value, + MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MAX("{COL_NAME}") AS max_value, + AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, + STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +-- TG-ELSE + NULL AS min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) AS fractional_sum, +-- TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D + CASE + WHEN MIN("{COL_NAME}") IS NULL THEN NULL + ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') + END AS min_date, + MAX("{COL_NAME}") AS max_date, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', 
"{COL_NAME}"::DATE, '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, + COUNT(CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, + COUNT(CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, + COUNT(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}"::DATE) > 240 THEN 1 END) AS distant_future_date_ct, + COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_days_present, + COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_weeks_present, + COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_months_present, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, + NULL AS before_100yr_date_ct, + NULL AS within_1yr_date_ct, + NULL AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +-- TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') + ) AS pattern_ct + FROM target_table + WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, + SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, + AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, +-- TG-ELSE + NULL AS distinct_pattern_ct, + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id + FROM target_table +-- TG-IF is_type_N + , (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS 
pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile +-- TG-ENDIF diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml deleted file mode 100644 index 1055ecd1..00000000 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query.yaml +++ /dev/null @@ -1,204 +0,0 @@ ---- -01_sampling: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} - ) - SELECT -01_else: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - ) - SELECT -01_all: | - {CONNECTION_ID} as connection_id, - '{PROJECT_CODE}' as project_code, - '{TABLE_GROUPS_ID}' as table_groups_id, - '{DATA_SCHEMA}' AS schema_name, - '{RUN_DATE}' AS run_date, - '{DATA_TABLE}' AS table_name, - {COL_POS} AS position, - '{COL_NAME_SANITIZED}' AS column_name, - '{COL_TYPE}' AS column_type, - '{DB_DATA_TYPE}' AS db_data_type, - '{COL_GEN_TYPE}' AS general_type, - COUNT(*) AS record_ct, - -02_X: | - COUNT("{COL_NAME}") AS value_ct, - COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, - SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -02_else: | - COUNT("{COL_NAME}") AS value_ct, - COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, - SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, - -03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, - MAX(LEN("{COL_NAME}")) AS max_length, - AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -03_else: NULL as min_length, - NULL as max_length, - NULL as avg_length, - -04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, -04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -04_else: NULL as zero_value_ct, - -05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) 
as distinct_std_value_ct, - COUNT( CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, - COUNT( CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 END ) AS lead_space_ct, - COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, - COUNT( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END ) as includes_digit_ct, - COUNT( CASE - WHEN LENGTH("{COL_NAME}") > 0 AND "{COL_NAME}" IN ('.', '?', ' ') THEN 1 - WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 - WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', - 'n/a','#na','none','null','unknown') THEN 1 - WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', - '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 - WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', - '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 - END ) AS filled_value_ct, - LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, - LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, - COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, - COUNT( CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, - COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, - COUNT( CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, - SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, - SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, - CASE - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR' - WHEN SUM( CASE WHEN 
"{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'STATE_USA' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'PHONE_USA' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' - WHEN SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|(c|t|p)sv|dat|doc|docx|json|pdf|xlsx|xml)$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' - AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'DELIMITED_DATA' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$' - AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' - AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' - END as std_pattern_match, -05_else: NULL as distinct_std_value_ct, - NULL as zero_length_ct, - NULL as lead_space_ct, - NULL as quoted_value_ct, - NULL as includes_digit_ct, - NULL as filled_value_ct, - NULL as min_text, - NULL as max_text, - NULL as upper_case_ct, - NULL as lower_case_ct, - 
NULL as non_alpha_ct, - NULL as non_printing_ct, - NULL as numeric_ct, - NULL as date_ct, - NULL as std_pattern_match, - -06_A: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats - FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, - COUNT(*) AS ct - FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( - "{COL_NAME}", '[a-z]', 'a'), - '[A-Z]', 'A'), - '[0-9]', 'N') AS pattern - FROM target_table - WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p - GROUP BY pattern - HAVING pattern > ' ' - ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -06_else: NULL as top_patterns, - -08_N: MIN("{COL_NAME}") AS min_value, - MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, - MAX("{COL_NAME}") AS max_value, - AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, - STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, - MIN(pct_25) as percentile_25, - MIN(pct_50) as percentile_50, - MIN(pct_75) as percentile_75, -08_else: NULL as min_value, - NULL as min_value_over_0, - NULL as max_value, - NULL as avg_value, - NULL as stdev_value, - NULL as percentile_25, - NULL as percentile_50, - NULL as percentile_75, - -10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -10_else: NULL as fractional_sum, - -11_D: CASE - WHEN MIN("{COL_NAME}") IS NULL THEN NULL - ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') - END as min_date, - MAX("{COL_NAME}") as max_date, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, - COUNT( 
CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, - COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, - COUNT( CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}"::DATE) > 240 THEN 1 END) AS distant_future_date_ct, - COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_days_present, - COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_weeks_present, - COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) as date_months_present, -11_else: NULL as min_date, - NULL as max_date, - NULL as before_1yr_date_ct, - NULL as before_5yr_date_ct, - NULL as before_20yr_date_ct, - NULL AS before_100yr_date_ct, - NULL as within_1yr_date_ct, - NULL as within_1mo_date_ct, - NULL as future_date_ct, - NULL as distant_future_date_ct, - NULL as date_days_present, - NULL as date_weeks_present, - NULL as date_months_present, - -12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, -12_else: NULL as boolean_true_ct, - -14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( - "{COL_NAME}", '[a-z]', 'a'), - '[A-Z]', 'A'), - '[0-9]', 'N') - ) AS pattern_ct - FROM target_table - WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, - AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, -14_else: NULL as distinct_pattern_ct, - NULL as embedded_space_ct, - NULL as avg_embedded_spaces, - -16_all: " '{PROFILE_RUN_ID}' as profile_run_id" - -98_all: ' FROM target_table' - -99_N: | - , (SELECT - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, - PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, - PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY 
"{COL_NAME}") OVER () AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile -99_N_sampling: | - , (SELECT - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, - PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, - PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile -99_else: ' ' diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql new file mode 100644 index 00000000..db97da0b --- /dev/null +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql @@ -0,0 +1,205 @@ +WITH target_table AS ( +-- TG-IF do_sample + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} +-- TG-ELSE + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-ENDIF +) +SELECT + {CONNECTION_ID} AS connection_id, + '{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, + '{DATA_SCHEMA}' AS schema_name, + '{RUN_DATE}' AS run_date, + '{DATA_TABLE}' AS table_name, + {COL_POS} AS position, + '{COL_NAME_SANITIZED}' AS column_name, + '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, + '{COL_GEN_TYPE}' AS general_type, + COUNT(*) AS record_ct, + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, +-- TG-IF is_type_ADN + MIN(LEN("{COL_NAME}")) AS min_length, + MAX(LEN("{COL_NAME}")) AS max_length, + AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_N + SUM(1 - ABS(SIGN("{COL_NAME}")))::BIGINT AS zero_value_ct, +-- TG-ENDIF +-- TG-IF 
is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) AS distinct_std_value_ct, + COUNT(CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, + COUNT(CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 END) AS lead_space_ct, + COUNT(CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END) AS quoted_value_ct, + COUNT(CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END) AS includes_digit_ct, + COUNT(CASE + WHEN LOWER("{COL_NAME}") SIMILAR TO '(.{1,}|-{1,}|\\?{1,}|\\s{1,}|0{2,}|9{2,}|x{2,}|z{2,})' THEN 1 + WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', + 'n/a','#na','none','null','unknown') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', + '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', + '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 + END) AS filled_value_ct, + LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, + LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, + COUNT(CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, + COUNT(CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, + COUNT(CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, + COUNT(CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) AS non_printing_ct, + SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, + CASE + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' + THEN 1 
END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR' + WHEN SUM(CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'STATE_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'PHONE_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' + WHEN SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|(c|t|p)sv|dat|doc|docx|json|pdf|xlsx|xml)$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' + AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s' + THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'DELIMITED_DATA' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$' + AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' + AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' + END AS std_pattern_match, +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS 
min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, + NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + FROM (SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, + COUNT(*) AS ct + FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') AS pattern + FROM target_table + WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + GROUP BY pattern + HAVING pattern > ' ' + ORDER BY COUNT(*) DESC) AS ps) AS top_patterns, +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF +-- TG-IF is_type_N + MIN("{COL_NAME}") AS min_value, + MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MAX("{COL_NAME}") AS max_value, + AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, + STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +-- TG-ELSE + NULL AS min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) AS fractional_sum, +-- TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D + CASE + WHEN MIN("{COL_NAME}") IS NULL THEN NULL + ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') + END AS min_date, + MAX("{COL_NAME}") AS max_date, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', 
"{COL_NAME}"::DATE, '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}"::DATE, '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, + COUNT(CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, + COUNT(CASE WHEN DATEDIFF('DAY', "{COL_NAME}"::DATE, '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, + COUNT(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}"::DATE) > 240 THEN 1 END) AS distant_future_date_ct, + COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_days_present, + COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_weeks_present, + COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_months_present, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, + NULL AS before_100yr_date_ct, + NULL AS within_1yr_date_ct, + NULL AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +-- TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') + ) AS pattern_ct + FROM target_table + WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, + SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, + AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, +-- TG-ELSE + NULL AS distinct_pattern_ct, + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id + FROM 
target_table +-- TG-IF is_type_N + , (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile +-- TG-ENDIF diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml deleted file mode 100644 index 0e0b6401..00000000 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.yaml +++ /dev/null @@ -1,204 +0,0 @@ ---- -01_sampling: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE RAND() <= 1.0 / {PROFILE_SAMPLE_RATIO} - ) - SELECT -01_else: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - ) - SELECT -01_all: | - {CONNECTION_ID} as connection_id, - '{PROJECT_CODE}' as project_code, - '{TABLE_GROUPS_ID}' as table_groups_id, - '{DATA_SCHEMA}' AS schema_name, - '{RUN_DATE}' AS run_date, - '{DATA_TABLE}' AS table_name, - {COL_POS} AS position, - '{COL_NAME_SANITIZED}' AS column_name, - '{COL_TYPE}' AS column_type, - '{DB_DATA_TYPE}' AS db_data_type, - '{COL_GEN_TYPE}' AS general_type, - COUNT(*) AS record_ct, - -02_X: | - COUNT("{COL_NAME}") AS value_ct, - COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, - SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -02_else: | - COUNT("{COL_NAME}") AS value_ct, - COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, - SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, - -03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, - MAX(LEN("{COL_NAME}")) AS max_length, - AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -03_else: NULL as min_length, - NULL as max_length, - NULL as avg_length, - -04_A: COUNT( CASE WHEN TRIM("{COL_NAME}") ~ '^0(\.0*)?$' THEN 1 END) AS zero_value_ct, -04_N: SUM( 1 - 
ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -04_else: NULL as zero_value_ct, - -05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, - COUNT( CASE WHEN "{COL_NAME}" = '' THEN 1 END) AS zero_length_ct, - COUNT( CASE WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 END ) AS lead_space_ct, - COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct, - COUNT( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END ) as includes_digit_ct, - COUNT( CASE - WHEN LENGTH("{COL_NAME}") > 0 AND "{COL_NAME}" IN ('.', '?', ' ') THEN 1 - WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1 - WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', - 'n/a','#na','none','null','unknown') THEN 1 - WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', - '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 - WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', - '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 - END ) AS filled_value_ct, - LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, - LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, - COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, - COUNT( CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, - COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, - COUNT( CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, - SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, - SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, - CASE - WHEN SUM( CASE WHEN "{COL_NAME}" ~ 
'^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'STREET_ADDR' - WHEN SUM( CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'STATE_USA' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'PHONE_USA' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' - WHEN SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|(c|t|p)sv|dat|doc|docx|json|pdf|xlsx|xml)$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' - AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s' - THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'DELIMITED_DATA' - WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$' - AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' - AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' - END as std_pattern_match, -05_else: NULL as distinct_std_value_ct, - NULL as zero_length_ct, - NULL 
as lead_space_ct, - NULL as quoted_value_ct, - NULL as includes_digit_ct, - NULL as filled_value_ct, - NULL as min_text, - NULL as max_text, - NULL as upper_case_ct, - NULL as lower_case_ct, - NULL as non_alpha_ct, - NULL as non_printing_ct, - NULL as numeric_ct, - NULL as date_ct, - NULL as std_pattern_match, - -06_A: (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats - FROM ( SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, - COUNT(*) AS ct - FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( - "{COL_NAME}", '[a-z]', 'a'), - '[A-Z]', 'A'), - '[0-9]', 'N') AS pattern - FROM target_table - WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p - GROUP BY pattern - HAVING pattern > ' ' - ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -06_else: NULL as top_patterns, - -08_N: MIN("{COL_NAME}") AS min_value, - MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, - MAX("{COL_NAME}") AS max_value, - AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, - STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, - MIN(pct_25) as percentile_25, - MIN(pct_50) as percentile_50, - MIN(pct_75) as percentile_75, -08_else: NULL as min_value, - NULL as min_value_over_0, - NULL as max_value, - NULL as avg_value, - NULL as stdev_value, - NULL as percentile_25, - NULL as percentile_50, - NULL as percentile_75, - -10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -10_else: NULL as fractional_sum, - -11_D: CASE - WHEN MIN("{COL_NAME}") IS NULL THEN NULL - ELSE GREATEST(MIN("{COL_NAME}"), '0001-01-01') - END as min_date, - MAX("{COL_NAME}") as max_date, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", 
'{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, - COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, - COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, - COUNT( CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 END) AS distant_future_date_ct, - COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, - COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, - COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, -11_else: NULL as min_date, - NULL as max_date, - NULL as before_1yr_date_ct, - NULL as before_5yr_date_ct, - NULL as before_20yr_date_ct, - NULL AS before_100yr_date_ct, - NULL as within_1yr_date_ct, - NULL as within_1mo_date_ct, - NULL as future_date_ct, - NULL as distant_future_date_ct, - NULL as date_days_present, - NULL as date_weeks_present, - NULL as date_months_present, - -12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, -12_else: NULL as boolean_true_ct, - -14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( - "{COL_NAME}", '[a-z]', 'a'), - '[A-Z]', 'A'), - '[0-9]', 'N') - ) AS pattern_ct - FROM target_table - WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"), ' '))::BIGINT) AS embedded_space_ct, - AVG(REGEXP_COUNT(TRIM("{COL_NAME}"), ' ')::FLOAT) AS avg_embedded_spaces, -14_else: NULL as distinct_pattern_ct, - NULL as embedded_space_ct, - NULL as avg_embedded_spaces, - -16_all: " '{PROFILE_RUN_ID}' as profile_run_id" - -98_all: ' FROM target_table' - -99_N: | - , (SELECT - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, - 
PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, - PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile -99_N_sampling: | - , (SELECT - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, - PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, - PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile -99_else: ' ' diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql b/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql new file mode 100644 index 00000000..56e4ae1f --- /dev/null +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql @@ -0,0 +1,221 @@ +WITH target_table AS ( +-- TG-IF do_sample + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) +-- TG-ELSE + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-ENDIF +) +SELECT + {CONNECTION_ID} AS connection_id, + '{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, + '{DATA_SCHEMA}' AS schema_name, + '{RUN_DATE}' AS run_date, + '{DATA_TABLE}' AS table_name, + {COL_POS} AS position, + '{COL_NAME_SANITIZED}' AS column_name, + '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, + '{COL_GEN_TYPE}' AS general_type, + COUNT(*) AS record_ct, + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, + SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, +-- TG-IF is_type_ADN + MIN(LEN("{COL_NAME}")) AS min_length, + MAX(LEN("{COL_NAME}")) AS max_length, + AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(CASE + WHEN REGEXP_LIKE(TRIM("{COL_NAME}"::VARCHAR), '^0(\.0*)?$') THEN 1 + END) AS zero_value_ct, +-- TG-ENDIF 
+-- TG-IF is_type_N + SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) AS distinct_std_value_ct, + COUNT(CASE + WHEN "{COL_NAME}" = '' THEN 1 + END) AS zero_length_ct, + COUNT( CASE + WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 + END) AS lead_space_ct, + COUNT(CASE WHEN "{COL_NAME}"::VARCHAR ILIKE '"%"' OR "{COL_NAME}"::VARCHAR ILIKE '''%''' THEN 1 END) AS quoted_value_ct, + COUNT(CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '.*[0-9].*') THEN 1 END) AS includes_digit_ct, + COUNT(CASE + WHEN LOWER("{COL_NAME}"::VARCHAR) REGEXP '\\.{1,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '-{1,}' + OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '\\?{1,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '\\s{1,}' + OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '9{2,}' + OR LOWER("{COL_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 + WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', + 'n/a','#na','none','null','unknown') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', + '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', + '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 + END) AS filled_value_ct, + LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, + LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, + COUNT(CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, + COUNT(CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, + COUNT(CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, + COUNT(CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || 
CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) AS non_printing_ct, + SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, + SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, + CASE + WHEN CAST(SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') + THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'STREET_ADDR' + WHEN CAST(SUM(CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'STATE_USA' + WHEN CAST(SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') + THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'PHONE_USA' + WHEN CAST(SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') + THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'EMAIL' + WHEN CAST(SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') + THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'ZIP_USA' + WHEN CAST(SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[\\w\\s\-]+\\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') + THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'FILE_NAME' + WHEN CAST(SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([0-9]{4}[- ]?){3}[0-9]{4}$') + THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'CREDIT_CARD' + WHEN CAST(SUM(CASE WHEN 
REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') + AND NOT REGEXP_LIKE("{COL_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') + THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'DELIMITED_DATA' + WHEN SUM (CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$') + AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' + AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' + END AS std_pattern_match, +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, + NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats + FROM ( + SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, + COUNT(*) AS ct + FROM (SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( + "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') AS pattern + FROM target_table + WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + GROUP BY pattern + HAVING pattern > ' ' + ORDER BY COUNT(*) DESC) AS ps) AS top_patterns, +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF +-- TG-IF is_type_N + MIN("{COL_NAME}") AS min_value, + MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MAX("{COL_NAME}") AS max_value, + AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, + STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +-- TG-ELSE + NULL AS 
min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) AS fractional_sum, +-- TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D + GREATEST(MIN("{COL_NAME}"), '0001-01-01') AS min_date, + MAX("{COL_NAME}") AS max_date, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, + COUNT(CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, + COUNT(CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, + COUNT(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, + COUNT(CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 END) AS distant_future_date_ct, + COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}')) AS date_days_present, + COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}')) AS date_weeks_present, + COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}')) AS date_months_present, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, + NULL AS before_100yr_date_ct, + NULL AS within_1yr_date_ct, + NULL AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +-- 
TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT COUNT(DISTINCT REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE( + "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') + ) AS pattern_ct + FROM target_table + WHERE "{COL_NAME}" > ' ') AS distinct_pattern_ct, + SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct, + AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces, +-- TG-ELSE + NULL AS distinct_pattern_ct, + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id + FROM target_table +-- TG-IF is_N_sampling + , + (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile +-- TG-ENDIF +-- TG-IF is_N_no_sampling + , + (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile +-- TG-ENDIF diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml deleted file mode 100644 index 5c04fce8..00000000 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query.yaml +++ /dev/null @@ -1,211 +0,0 @@ ---- -01_sampling: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) - ) - SELECT -01_else: | - WITH target_table AS ( - SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" - ) - SELECT -01_all: | - {CONNECTION_ID} as connection_id, - '{PROJECT_CODE}' as 
project_code, - '{TABLE_GROUPS_ID}' as table_groups_id, - '{DATA_SCHEMA}' AS schema_name, - '{RUN_DATE}' AS run_date, - '{DATA_TABLE}' AS table_name, - {COL_POS} AS position, - '{COL_NAME_SANITIZED}' AS column_name, - '{COL_TYPE}' AS column_type, - '{DB_DATA_TYPE}' AS db_data_type, - '{COL_GEN_TYPE}' AS general_type, - COUNT(*) AS record_ct, - -02_X: | - COUNT("{COL_NAME}") AS value_ct, - COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, - SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, -02_else: | - COUNT("{COL_NAME}") AS value_ct, - COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, - SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, - -03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, - MAX(LEN("{COL_NAME}")) AS max_length, - AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, -03_else: NULL as min_length, - NULL as max_length, - NULL as avg_length, - -04_A: COUNT(CASE - WHEN REGEXP_LIKE(TRIM("{COL_NAME}"::VARCHAR), '^0(\.0*)?$') THEN 1 - END) AS zero_value_ct, -04_N: SUM( 1 - ABS(SIGN("{COL_NAME}")) )::BIGINT AS zero_value_ct, -04_else: NULL as zero_value_ct, - -05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) as distinct_std_value_ct, - COUNT(CASE - WHEN "{COL_NAME}" = '' THEN 1 - END) AS zero_length_ct, - COUNT( CASE - WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 - END ) AS lead_space_ct, - COUNT( CASE WHEN "{COL_NAME}"::VARCHAR ILIKE '"%"' OR "{COL_NAME}"::VARCHAR ILIKE '''%''' THEN 1 END ) as quoted_value_ct, - COUNT( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '.*[0-9].*') THEN 1 END ) as includes_digit_ct, - COUNT( CASE - WHEN "{COL_NAME}" IN ('.', '?', ' ') THEN 1 - WHEN LOWER("{COL_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP '9{2,}' - OR LOWER("{COL_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COL_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 - WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', - 'n/a','#na','none','null','unknown') THEN 1 - WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', - '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 - WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', - '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 - END ) AS filled_value_ct, - LEFT(MIN(NULLIF("{COL_NAME}", '')), 100) AS min_text, - LEFT(MAX(NULLIF("{COL_NAME}", '')), 100) AS max_text, - COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 END) AS upper_case_ct, - COUNT( CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 END) AS lower_case_ct, - COUNT( CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 END) AS non_alpha_ct, - COUNT( CASE WHEN TRANSLATE("{COL_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) as non_printing_ct, - SUM(<%IS_NUM;LEFT("{COL_NAME}", 31)%>) AS numeric_ct, - SUM(<%IS_DATE;LEFT("{COL_NAME}", 26)%>) AS date_ct, - CASE - WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') - THEN 1 END) AS FLOAT) / 
CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'STREET_ADDR' - WHEN CAST(SUM(CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') - THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'STATE_USA' - WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') - THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'PHONE_USA' - WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') - THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'EMAIL' - WHEN CAST(SUM( CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') - THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'ZIP_USA' - WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[\\w\\s\-]+\\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') - THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'FILE_NAME' - WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([0-9]{4}[- ]?){3}[0-9]{4}$') - THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'CREDIT_CARD' - WHEN CAST(SUM( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') - AND NOT REGEXP_LIKE("{COL_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') - THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'DELIMITED_DATA' - WHEN SUM ( CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$') - AND LEFT("{COL_NAME}", 3) NOT BETWEEN '734' AND '749' - AND LEFT("{COL_NAME}", 3) <> '666' THEN 1 
END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'SSN' - END as std_pattern_match, -05_else: NULL as distinct_std_value_ct, - NULL as zero_length_ct, - NULL as lead_space_ct, - NULL as quoted_value_ct, - NULL as includes_digit_ct, - NULL as filled_value_ct, - NULL as min_text, - NULL as max_text, - NULL as upper_case_ct, - NULL as lower_case_ct, - NULL as non_alpha_ct, - NULL as non_printing_ct, - NULL as numeric_ct, - NULL as date_ct, - NULL as std_pattern_match, - -06_A: ( SELECT LEFT(LISTAGG(pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1000) AS concat_pats - FROM ( - SELECT TOP 5 CAST(COUNT(*) AS VARCHAR(40)) || ' | ' || pattern AS pattern, - COUNT(*) AS ct - FROM ( SELECT REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( - "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), - '[A-Z]', 'A'), - '[0-9]', 'N') AS pattern - FROM target_table - WHERE "{COL_NAME}" > ' ' AND (SELECT MAX(LEN("{COL_NAME}")) - FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p - GROUP BY pattern - HAVING pattern > ' ' - ORDER BY COUNT(*) DESC) as ps) AS top_patterns, -06_else: NULL as top_patterns, - -08_N: MIN("{COL_NAME}") AS min_value, - MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, - MAX("{COL_NAME}") AS max_value, - AVG(CAST("{COL_NAME}" AS FLOAT)) AS avg_value, - STDDEV(CAST("{COL_NAME}" AS FLOAT)) AS stdev_value, - MIN(pct_25) as percentile_25, - MIN(pct_50) as percentile_50, - MIN(pct_75) as percentile_75, -08_else: NULL as min_value, - NULL as min_value_over_0, - NULL as max_value, - NULL as avg_value, - NULL as stdev_value, - NULL as percentile_25, - NULL as percentile_50, - NULL as percentile_75, - -10_N_dec: SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) as fractional_sum, -10_else: NULL as fractional_sum, - -11_D: GREATEST(MIN("{COL_NAME}"), '0001-01-01') as min_date, - MAX("{COL_NAME}") as max_date, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 12 THEN 1 END) AS before_1yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", 
'{RUN_DATE}') > 60 THEN 1 END) AS before_5yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 240 THEN 1 END) AS before_20yr_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', "{COL_NAME}", '{RUN_DATE}') > 1200 THEN 1 END) AS before_100yr_date_ct, - COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 365 THEN 1 END) AS within_1yr_date_ct, - COUNT( CASE WHEN DATEDIFF('DAY', "{COL_NAME}", '{RUN_DATE}') BETWEEN 0 AND 30 THEN 1 END) AS within_1mo_date_ct, - COUNT( CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, - COUNT( CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 240 THEN 1 END) AS distant_future_date_ct, - COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}' ) ) as date_days_present, - COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}' ) ) as date_weeks_present, - COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}' ) ) as date_months_present, -11_else: NULL as min_date, - NULL as max_date, - NULL as before_1yr_date_ct, - NULL as before_5yr_date_ct, - NULL as before_20yr_date_ct, - NULL AS before_100yr_date_ct, - NULL as within_1yr_date_ct, - NULL as within_1mo_date_ct, - NULL as future_date_ct, - NULL as distant_future_date_ct, - NULL as date_days_present, - NULL as date_weeks_present, - NULL as date_months_present, - -12_B: SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, -12_else: NULL as boolean_true_ct, - -14_A: ( SELECT COUNT(DISTINCT REGEXP_REPLACE( REGEXP_REPLACE( REGEXP_REPLACE( - "{COL_NAME}"::VARCHAR, '[a-z]', 'a'), - '[A-Z]', 'A'), - '[0-9]', 'N') - ) AS pattern_ct - FROM target_table - WHERE "{COL_NAME}" > ' ' ) AS distinct_pattern_ct, - SUM(SIGN(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' '))::BIGINT) AS embedded_space_ct, - AVG(REGEXP_COUNT(TRIM("{COL_NAME}"::VARCHAR), ' ')::FLOAT) AS avg_embedded_spaces, -14_else: NULL as distinct_pattern_ct, - NULL as embedded_space_ct, - NULL as avg_embedded_spaces, - -16_all: " '{PROFILE_RUN_ID}' as profile_run_id " - 
-98_all: ' FROM target_table ' - -99_N: | - , - (SELECT - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, - PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, - PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1) pctile -99_N_sampling: | - , - (SELECT - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_25, - PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_50, - PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") OVER () AS pct_75 - FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_SIZE} rows) LIMIT 1 ) pctile -99_else: ; From e13a7f5bce7882a38755bca3725b587c5c609e4f Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Sun, 8 Feb 2026 22:21:57 -0500 Subject: [PATCH 35/95] fix: discrepancies between flavors in hygiene issues and test types --- .../020_create_standard_functions_sprocs.sql | 2 +- ...ile_anomaly_types_Inconsistent_Casing.yaml | 4 ++-- .../test_types_Outlier_Pct_Below.yaml | 16 ++++++------- .../test_types_Weekly_Rec_Ct.yaml | 16 ++++++------- .../dbupgrade/0176_incremental_upgrade.sql | 24 +++++++++++++++++++ .../profiling/templated_functions.yaml | 3 +-- .../profiling/functional_datatype.sql | 3 +++ 7 files changed, 47 insertions(+), 21 deletions(-) create mode 100644 testgen/template/dbupgrade/0176_incremental_upgrade.sql diff --git a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql index d2285833..013343f0 100644 --- a/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql +++ b/testgen/template/dbsetup/020_create_standard_functions_sprocs.sql @@ -10,7 +10,7 @@ $$ WHEN UPPER(difftype) IN ('DAY', 'DD') THEN DATE_PART('day', seconddate - firstdate) WHEN UPPER(difftype) IN ('WEEK','WK') - THEN TRUNC(DATE_PART('day', seconddate - firstdate)/7) + THEN (DATE_TRUNC('week', 
seconddate)::DATE - DATE_TRUNC('week', firstdate)::DATE) / 7 WHEN UPPER(difftype) IN ('MON', 'MM') THEN 12 * (DATE_PART('year', seconddate) - DATE_PART('year', firstdate)) + (DATE_PART('month', seconddate) - DATE_PART('month', firstdate)) diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml index 6443d845..2578c101 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml @@ -61,11 +61,11 @@ profile_anomaly_types: lookup_type: null lookup_query: |- SELECT TOP {LIMIT_2} 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" + WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" COLLATE Latin1_General_BIN GROUP BY "{COLUMN_NAME}" UNION ALL SELECT TOP {LIMIT_2} 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") + WHERE "{COLUMN_NAME}" COLLATE Latin1_General_BIN <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" COLLATE Latin1_General_BIN <> LOWER("{COLUMN_NAME}") GROUP BY "{COLUMN_NAME}" error_type: Profile Anomaly - id: '1259' diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml index 0fd3341a..7d192436 100644 --- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml @@ -112,9 +112,9 @@ test_types: sql_flavor: bigquery lookup_type: null lookup_query: |- - SELECT ({BASELINE_AVG} + (2 * {BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count + SELECT ({BASELINE_AVG} - (2 * 
{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` - WHERE CAST(`{COLUMN_NAME}` AS FLOAT64) < ({BASELINE_AVG} + (2 * {BASELINE_SD})) + WHERE CAST(`{COLUMN_NAME}` AS FLOAT64) < ({BASELINE_AVG} - (2 * {BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC; error_type: Test Results @@ -124,7 +124,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC; + SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE `{COLUMN_NAME}` :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC; error_type: Test Results - id: '1159' test_id: '1025' @@ -132,7 +132,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS FLOAT) < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS FLOAT) < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results - id: '1102' test_id: '1025' @@ -140,7 +140,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} 
+ (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results - id: '1020' test_id: '1025' @@ -148,7 +148,7 @@ test_types: sql_flavor: redshift lookup_type: null lookup_query: |- - SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results - id: '1420' test_id: '1025' @@ -156,7 +156,7 @@ test_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results - id: '1216' test_id: '1025' @@ -164,6 +164,6 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE 
"{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; + SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml index 1aff7bb4..10c98cc6 100644 --- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml @@ -49,7 +49,7 @@ test_types: test_type: Weekly_Rec_Ct sql_flavor: databricks measure: |- - CAST(<%DATEDIFF_WEEK;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%> + 1 - COUNT(DISTINCT DATE_TRUNC('week', {COLUMN_NAME})) AS INT) + MAX(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -57,7 +57,7 @@ test_types: test_type: Weekly_Rec_Ct sql_flavor: mssql measure: |- - MAX(DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME})) - MIN(DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, CAST('1800-01-01' AS DATE), {COLUMN_NAME})) + MAX(DATEDIFF(week, CAST('1800-01-01' AS DATE), DATEADD(day, -1, {COLUMN_NAME}))) - MIN(DATEDIFF(week, CAST('1800-01-01' AS DATE), DATEADD(day, -1, {COLUMN_NAME})))+1 - COUNT(DISTINCT DATEDIFF(week, CAST('1800-01-01' AS DATE), DATEADD(day, -1, {COLUMN_NAME}))) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -163,20 +163,20 @@ test_types: All_Nums as (select row_number() over(order by C) as Number from Pass4), tally as (SELECT Number FROM All_Nums WHERE Number <= 45000), - 
date_range as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE) AS min_period, - CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) AS max_period, + date_range as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, DATEADD(day, -1, MIN("{COLUMN_NAME}"))), 0) AS DATE) AS min_period, + CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, DATEADD(day, -1, MAX("{COLUMN_NAME}"))), 0) AS DATE) AS max_period, DATEDIFF(WEEK, - CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MIN("{COLUMN_NAME}")), 0) AS DATE), - CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, MAX("{COLUMN_NAME}")), 0) AS DATE) ) + 1 as period_ct + CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, DATEADD(day, -1, MIN("{COLUMN_NAME}"))), 0) AS DATE), + CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, DATEADD(day, -1, MAX("{COLUMN_NAME}"))), 0) AS DATE) ) + 1 as period_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" ), check_periods as ( SELECT d.min_period, d.max_period, t.number, DATEADD(WEEK, -(t.number - 1), d.max_period) AS check_period FROM date_range d INNER JOIN tally t ON (d.period_ct >= t.number) ), - data_by_period as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) as data_period, COUNT(*) as record_ct + data_by_period as (SELECT CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, DATEADD(day, -1, "{COLUMN_NAME}")), 0) AS DATE) as data_period, COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" - GROUP BY CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, "{COLUMN_NAME}"), 0) AS DATE) ), + GROUP BY CAST(DATEADD(WEEK, DATEDIFF(WEEK, 0, DATEADD(day, -1, "{COLUMN_NAME}")), 0) AS DATE) ), data_by_prd_with_prior_next as (SELECT check_period, RANK() OVER (ORDER BY check_period DESC) as ranked, ISNULL(d.record_ct, 0) as record_ct, diff --git a/testgen/template/dbupgrade/0176_incremental_upgrade.sql b/testgen/template/dbupgrade/0176_incremental_upgrade.sql new file mode 100644 index 00000000..0c219e4b --- /dev/null +++ b/testgen/template/dbupgrade/0176_incremental_upgrade.sql @@ -0,0 +1,24 @@ +SET SEARCH_PATH TO 
{SCHEMA_NAME}; + +CREATE OR REPLACE FUNCTION {SCHEMA_NAME}.datediff(difftype character varying, firstdate timestamp without time zone, seconddate timestamp without time zone) returns bigint + language plpgsql +as +$$ + BEGIN + RETURN + CASE + WHEN UPPER(difftype) IN ('DAY', 'DD') + THEN DATE_PART('day', seconddate - firstdate) + WHEN UPPER(difftype) IN ('WEEK','WK') + THEN (DATE_TRUNC('week', seconddate)::DATE - DATE_TRUNC('week', firstdate)::DATE) / 7 + WHEN UPPER(difftype) IN ('MON', 'MM') + THEN 12 * (DATE_PART('year', seconddate) - DATE_PART('year', firstdate)) + + (DATE_PART('month', seconddate) - DATE_PART('month', firstdate)) + WHEN UPPER(difftype) IN ('QUARTER', 'QTR') + THEN 4 * (DATE_PART('year', seconddate) - DATE_PART('year', firstdate)) + + (DATE_PART('qtr', seconddate) - DATE_PART('month', firstdate)) + WHEN UPPER(difftype) IN ('YEAR', 'YY') + THEN DATE_PART('year', seconddate) - DATE_PART('year', firstdate) + END; + END; +$$; diff --git a/testgen/template/flavors/postgresql/profiling/templated_functions.yaml b/testgen/template/flavors/postgresql/profiling/templated_functions.yaml index cf9d8541..b447289d 100644 --- a/testgen/template/flavors/postgresql/profiling/templated_functions.yaml +++ b/testgen/template/flavors/postgresql/profiling/templated_functions.yaml @@ -1,6 +1,6 @@ DATEDIFF_DAY: DATE({$2}) - DATE({$1}) -DATEDIFF_WEEK: (DATE({$2}) - DATE({$1})) / 7 +DATEDIFF_WEEK: (DATE_TRUNC('week', DATE({$2}))::DATE - DATE_TRUNC('week', DATE({$1}))::DATE) / 7 DATEDIFF_MONTH: (DATE_PART('year', {$2}::TIMESTAMP) - DATE_PART('year', {$1}::TIMESTAMP)) * 12 + (DATE_PART('month', {$2}::TIMESTAMP) - DATE_PART('month', {$1}::TIMESTAMP)) @@ -106,4 +106,3 @@ IS_DATE: CASE END ELSE 0 END - diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index e3c66599..7b7832d2 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -491,6 +491,9 
@@ SET functional_data_type = WHEN general_type='N' AND ( column_type ILIKE '%int%' OR + (SPLIT_PART(column_type, ',', 2) > '' + AND RTRIM(SPLIT_PART(column_type, ',', 2), ' )') = '0') + OR (RTRIM(SPLIT_PART(column_type, ',', 2), ')') > '0' AND fractional_sum = 0) -- 0 implies integer; null is float or non-numeric ) THEN 'Measurement Discrete' From 085e45dbc5dc3667d2a2892934de5db930890331 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Sun, 8 Feb 2026 22:22:18 -0500 Subject: [PATCH 36/95] feat: add support for Oracle 12c+ --- pyproject.toml | 1 + .../queries/refresh_data_chars_query.py | 22 +- .../common/database/flavor/flavor_service.py | 6 +- .../database/flavor/mssql_flavor_service.py | 4 +- .../database/flavor/oracle_flavor_service.py | 34 ++ testgen/common/models/connection.py | 2 +- ..._anomaly_types_Boolean_Value_Mismatch.yaml | 8 + ...anomaly_types_Char_Column_Date_Values.yaml | 8 + ...omaly_types_Char_Column_Number_Values.yaml | 8 + ...anomaly_types_Column_Pattern_Mismatch.yaml | 8 + ...anomaly_types_Delimited_Data_Embedded.yaml | 8 + ...ile_anomaly_types_Inconsistent_Casing.yaml | 8 + ...rofile_anomaly_types_Invalid_Zip3_USA.yaml | 8 + ...profile_anomaly_types_Invalid_Zip_USA.yaml | 8 + .../profile_anomaly_types_Leading_Spaces.yaml | 8 + ...le_anomaly_types_Multiple_Types_Major.yaml | 8 + ...le_anomaly_types_Multiple_Types_Minor.yaml | 8 + .../profile_anomaly_types_No_Values.yaml | 8 + ..._anomaly_types_Non_Alpha_Name_Address.yaml | 8 + ...anomaly_types_Non_Alpha_Prefixed_Name.yaml | 8 + ...file_anomaly_types_Non_Printing_Chars.yaml | 8 + ...ile_anomaly_types_Non_Standard_Blanks.yaml | 8 + ...le_anomaly_types_Potential_Duplicates.yaml | 8 + .../profile_anomaly_types_Potential_PII.yaml | 8 + .../profile_anomaly_types_Quoted_Values.yaml | 8 + ...rofile_anomaly_types_Recency_One_Year.yaml | 8 + ...file_anomaly_types_Recency_Six_Months.yaml | 8 + ...nomaly_types_Small_Divergent_Value_Ct.yaml | 8 + ..._anomaly_types_Small_Missing_Value_Ct.yaml | 8 + 
..._anomaly_types_Small_Numeric_Value_Ct.yaml | 8 + ...maly_types_Standardized_Value_Matches.yaml | 8 + .../profile_anomaly_types_Suggested_Type.yaml | 8 + ..._anomaly_types_Table_Pattern_Mismatch.yaml | 8 + ...ofile_anomaly_types_Unexpected_Emails.yaml | 8 + ...le_anomaly_types_Unexpected_US_States.yaml | 8 + ...le_anomaly_types_Unlikely_Date_Values.yaml | 8 + ...le_anomaly_types_Variant_Coded_Values.yaml | 8 + .../test_types_Aggregate_Balance.yaml | 69 ++++ .../test_types_Aggregate_Balance_Percent.yaml | 71 ++++ .../test_types_Aggregate_Balance_Range.yaml | 71 ++++ .../test_types_Aggregate_Minimum.yaml | 69 ++++ .../test_types_Alpha_Trunc.yaml | 16 + .../test_types_Avg_Shift.yaml | 16 + .../dbsetup_test_types/test_types_CUSTOM.yaml | 34 ++ .../test_types_Combo_Match.yaml | 61 +++ .../test_types_Condition_Flag.yaml | 16 + .../test_types_Constant.yaml | 16 + .../test_types_Daily_Record_Ct.yaml | 16 + .../test_types_Dec_Trunc.yaml | 16 + .../test_types_Distinct_Date_Ct.yaml | 16 + .../test_types_Distinct_Value_Ct.yaml | 16 + .../test_types_Distribution_Shift.yaml | 78 ++++ .../test_types_Dupe_Rows.yaml | 47 +++ .../test_types_Email_Format.yaml | 16 + .../test_types_Freshness_Trend.yaml | 46 +++ .../test_types_Future_Date.yaml | 16 + .../test_types_Future_Date_1Y.yaml | 16 + .../test_types_Incr_Avg_Shift.yaml | 16 + .../test_types_LOV_All.yaml | 16 + .../test_types_LOV_Match.yaml | 16 + .../test_types_Metric_Trend.yaml | 19 + .../test_types_Min_Date.yaml | 16 + .../test_types_Min_Val.yaml | 16 + .../test_types_Missing_Pct.yaml | 16 + .../test_types_Monthly_Rec_Ct.yaml | 16 + .../test_types_Outlier_Pct_Above.yaml | 16 + .../test_types_Outlier_Pct_Below.yaml | 16 + .../test_types_Pattern_Match.yaml | 16 + .../test_types_Recency.yaml | 16 + .../test_types_Required.yaml | 16 + .../dbsetup_test_types/test_types_Row_Ct.yaml | 16 + .../test_types_Row_Ct_Pct.yaml | 16 + .../test_types_Schema_Drift.yaml | 153 +++++--- .../test_types_Street_Addr_Pattern.yaml | 16 + 
.../test_types_Table_Freshness.yaml | 60 ++- .../test_types_Timeframe_Combo_Gain.yaml | 61 +++ .../test_types_Timeframe_Combo_Match.yaml | 86 ++++ .../test_types_US_State.yaml | 16 + .../dbsetup_test_types/test_types_Unique.yaml | 16 + .../test_types_Unique_Pct.yaml | 16 + .../test_types_Valid_Characters.yaml | 16 + .../test_types_Valid_Month.yaml | 8 + .../test_types_Valid_US_Zip.yaml | 16 + .../test_types_Valid_US_Zip3.yaml | 16 + .../test_types_Variability_Decrease.yaml | 16 + .../test_types_Variability_Increase.yaml | 16 + .../test_types_Volume_Trend.yaml | 19 + .../test_types_Weekly_Rec_Ct.yaml | 16 + .../oracle/data_chars/get_schema_ddf.sql | 40 ++ .../oracle/gen_query_tests/gen_Dupe_Rows.sql | 55 +++ .../gen_query_tests/gen_Freshness_Trend.sql | 193 +++++++++ .../gen_query_tests/gen_Table_Freshness.sql | 181 +++++++++ .../profiling/project_profiling_query.sql | 367 ++++++++++++++++++ .../project_secondary_profiling_query.sql | 41 ++ .../oracle/profiling/templated_functions.yaml | 108 ++++++ .../validate_tests/get_target_identifiers.sql | 5 + testgen/ui/assets/flavors/oracle.svg | 58 +++ .../frontend/js/components/connection_form.js | 18 +- testgen/ui/services/database_service.py | 6 +- .../static/js/components/connection_form.js | 18 +- testgen/ui/views/connections.py | 10 +- .../ui/views/dialogs/data_preview_dialog.py | 7 +- 102 files changed, 2849 insertions(+), 87 deletions(-) create mode 100644 testgen/common/database/flavor/oracle_flavor_service.py create mode 100644 testgen/template/flavors/oracle/data_chars/get_schema_ddf.sql create mode 100644 testgen/template/flavors/oracle/gen_query_tests/gen_Dupe_Rows.sql create mode 100644 testgen/template/flavors/oracle/gen_query_tests/gen_Freshness_Trend.sql create mode 100644 testgen/template/flavors/oracle/gen_query_tests/gen_Table_Freshness.sql create mode 100644 testgen/template/flavors/oracle/profiling/project_profiling_query.sql create mode 100644 
testgen/template/flavors/oracle/profiling/project_secondary_profiling_query.sql create mode 100644 testgen/template/flavors/oracle/profiling/templated_functions.yaml create mode 100644 testgen/template/flavors/oracle/validate_tests/get_target_identifiers.sql create mode 100644 testgen/ui/assets/flavors/oracle.svg diff --git a/pyproject.toml b/pyproject.toml index f11aba4d..39aba9f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "databricks-sdk>=0.20.0", "snowflake-sqlalchemy==1.6.1", "sqlalchemy-bigquery==1.14.1", + "oracledb==3.4.0", "pyodbc==5.0.0", "psycopg2-binary==2.9.9", "pycryptodome==3.21", diff --git a/testgen/commands/queries/refresh_data_chars_query.py b/testgen/commands/queries/refresh_data_chars_query.py index 1df6e994..9964a2d4 100644 --- a/testgen/commands/queries/refresh_data_chars_query.py +++ b/testgen/commands/queries/refresh_data_chars_query.py @@ -99,7 +99,7 @@ def _get_table_criteria(self) -> str: """ return table_criteria - + def get_schema_ddf(self) -> tuple[str, dict]: # Runs on Target database return self._get_query( @@ -107,7 +107,7 @@ def get_schema_ddf(self) -> tuple[str, dict]: f"flavors/{self.flavor}/data_chars", extra_params={"TABLE_CRITERIA": self._get_table_criteria()}, ) - + def get_row_counts(self, table_names: Iterable[str]) -> list[tuple[str, None]]: # Runs on Target database schema = self.table_group.table_group_schema @@ -118,18 +118,20 @@ def get_row_counts(self, table_names: Iterable[str]) -> list[tuple[str, None]]: ] chunked_queries = chunk_queries(count_queries, " UNION ALL ", self.connection.max_query_chars) return [ (query, None) for query in chunked_queries ] - + def verify_access(self, table_name: str) -> tuple[str, None]: # Runs on Target database schema = self.table_group.table_group_schema quote = self.flavor_service.quote_character - query = ( - f"SELECT 1 FROM {quote}{schema}{quote}.{quote}{table_name}{quote} LIMIT 1" - if not self.flavor_service.use_top - else f"SELECT TOP 1 
* FROM {quote}{schema}{quote}.{quote}{table_name}{quote}" - ) + table_ref = f"{quote}{schema}{quote}.{quote}{table_name}{quote}" + if (row_limiting := self.flavor_service.row_limiting_clause) == "top": + query = f"SELECT TOP 1 * FROM {table_ref}" + elif row_limiting == "fetch": + query = f"SELECT 1 FROM {table_ref} FETCH FIRST 1 ROWS ONLY" + else: + query = f"SELECT 1 FROM {table_ref} LIMIT 1" return (query, None) - + def get_staging_data_chars(self, data_chars: list[ColumnChars], run_date: datetime) -> list[list[str | bool | int]]: return [ [ @@ -147,7 +149,7 @@ def get_staging_data_chars(self, data_chars: list[ColumnChars], run_date: dateti ] for column in data_chars ] - + def update_data_chars(self, run_date: datetime) -> list[tuple[str, dict]]: # Runs on App database params = {"RUN_DATE": to_sql_timestamp(run_date)} diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index bb253595..af8e8d87 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ b/testgen/common/database/flavor/flavor_service.py @@ -4,7 +4,8 @@ from testgen.common.encrypt import DecryptText -SQLFlavor = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "postgresql", "databricks"] +SQLFlavor = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "postgresql", "databricks", "bigquery", "oracle"] +RowLimitingClause = Literal["limit", "top", "fetch"] class ConnectionParams(TypedDict): @@ -34,8 +35,9 @@ class FlavorService: escape_clause = "" varchar_type = "VARCHAR(1000)" ddf_table_ref = "table_name" - use_top = False + row_limiting_clause: RowLimitingClause = "limit" default_uppercase = False + test_query = "SELECT 1" def init(self, connection_params: ConnectionParams): self.url = connection_params.get("url") or "" diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index 088c11e9..b066eac7 100644 --- 
a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -10,7 +10,7 @@ class MssqlFlavorService(FlavorService): concat_operator = "+" escaped_underscore = "[_]" - use_top = True + row_limiting_clause = "top" def get_connection_string_head(self): return f"mssql+pyodbc://{self.username}:{quote_plus(self.password)}@" @@ -44,7 +44,7 @@ def get_pre_connection_queries(self): ("SET ANSI_DEFAULTS ON;", None), ("SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;", None), ] - + def get_connect_args(self): connect_args = super().get_connect_args() if settings.SKIP_DATABASE_CERTIFICATE_VERIFICATION: diff --git a/testgen/common/database/flavor/oracle_flavor_service.py b/testgen/common/database/flavor/oracle_flavor_service.py new file mode 100644 index 00000000..9c3c7932 --- /dev/null +++ b/testgen/common/database/flavor/oracle_flavor_service.py @@ -0,0 +1,34 @@ +import sys +from urllib.parse import quote_plus + +import oracledb + +from testgen.common.database.flavor.flavor_service import FlavorService + +# https://stackoverflow.com/a/74105559 +oracledb.version = "8.3.0" +sys.modules["cx_Oracle"] = oracledb + + +class OracleFlavorService(FlavorService): + + escaped_underscore = "\\_" + escape_clause = "ESCAPE '\\'" + varchar_type = "VARCHAR2(1000)" + default_uppercase = True + row_limiting_clause = "fetch" + test_query = "SELECT 1 FROM DUAL" + + def get_connection_string_head(self): + return f"oracle://{self.username}:{quote_plus(self.password)}@" + + def get_connection_string_from_fields(self): + return f"oracle://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}?service_name={self.dbname}" + + def get_pre_connection_queries(self): + return [ + ("ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD HH24:MI:SS'", None), + ] + + def get_connect_args(self) -> dict: + return {} diff --git a/testgen/common/models/connection.py b/testgen/common/models/connection.py index 1b5a96f5..9436cd43 100644 --- 
a/testgen/common/models/connection.py +++ b/testgen/common/models/connection.py @@ -27,7 +27,7 @@ from testgen.common.models.table_group import TableGroup from testgen.utils import is_uuid4 -SQLFlavorCode = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "azure_mssql", "synapse_mssql", "postgresql", "databricks"] +SQLFlavorCode = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "azure_mssql", "synapse_mssql", "postgresql", "databricks", "bigquery", "oracle"] @dataclass diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml index fc3bd2e8..02ee6923 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml @@ -83,3 +83,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1514' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml index d7690240..cbbb4248 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml @@ -95,3 +95,11 @@ profile_anomaly_types: lookup_query: |- SELECT A.* FROM (SELECT DISTINCT 
'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly + - id: '1511' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) B ORDER BY data_type, count DESC + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml index 9c600bac..fe3e71d2 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml @@ -95,3 +95,11 @@ profile_anomaly_types: lookup_query: |- SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, 
"{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly + - id: '1510' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) B ORDER BY data_type, count DESC + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml index 7bdd0df6..4650458c 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml @@ -124,3 +124,11 @@ profile_anomaly_types: lookup_query: |- SELECT A.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 4)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT 
trim(split_part('{DETAIL_EXPRESSION}', '|', 6)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 8)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT trim(split_part('{DETAIL_EXPRESSION}', '|', 10)) AS top_pattern) b WHERE REGEXP_REPLACE(REGEXP_REPLACE( REGEXP_REPLACE( "{COLUMN_NAME}"::VARCHAR, '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC; error_type: Profile Anomaly + - id: '1506' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(REGEXP_SUBSTR('{DETAIL_EXPRESSION}', '[^|]+', 1, 4)) AS top_pattern FROM DUAL) b WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE("{COLUMN_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_4} ROWS ONLY) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(REGEXP_SUBSTR('{DETAIL_EXPRESSION}', '[^|]+', 1, 6)) AS top_pattern FROM DUAL) b 
WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE("{COLUMN_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_4} ROWS ONLY) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(REGEXP_SUBSTR('{DETAIL_EXPRESSION}', '[^|]+', 1, 8)) AS top_pattern FROM DUAL) b WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE("{COLUMN_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_4} ROWS ONLY) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(REGEXP_SUBSTR('{DETAIL_EXPRESSION}', '[^|]+', 1, 10)) AS top_pattern FROM DUAL) b WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE("{COLUMN_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_4} ROWS ONLY) D ORDER BY top_pattern DESC, count DESC + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml index f2a2adec..18acb363 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml @@ -78,3 +78,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '.*\\s(and|but|or|yet)\\s.*') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC 
LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1524' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}", '^([^,|' || CHR(9) || ']{1,20}[,|' || CHR(9) || ']){2,}[^,|' || CHR(9) || ']{0,20}([,|' || CHR(9) || ']{0,1}[^,|' || CHR(9) || ']{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}", '[[:space:]](and|but|or|yet)[[:space:]]') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml index 2578c101..0062feef 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml @@ -124,3 +124,11 @@ profile_anomaly_types: WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) error_type: Profile Anomaly + - id: '1526' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * FROM (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT_2} ROWS ONLY) UNION ALL SELECT * FROM (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT_2} ROWS ONLY) + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml index 876661df..1210ae17 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml @@ -81,3 +81,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1523' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml index 400424a9..abdaa03b 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml @@ -77,3 +77,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1502' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', 
'999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml index 4231f420..8c95c91e 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml @@ -77,3 +77,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1508' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' 
THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml index 9f3e805e..fb5b4679 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml @@ -91,3 +91,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1504' + test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, table_name, CASE WHEN data_type LIKE 'TIMESTAMP%%' THEN LOWER(data_type) WHEN data_type = 'DATE' THEN 'date' WHEN data_type = 'VARCHAR2' THEN 'varchar2(' || data_length || ')' WHEN data_type = 'CHAR' THEN 'char(' || data_length || ')' WHEN 
data_type = 'NUMBER' AND data_precision IS NULL THEN 'number' WHEN data_type = 'NUMBER' AND data_scale = 0 THEN 'number(' || data_precision || ')' WHEN data_type = 'NUMBER' THEN 'number(' || data_precision || ',' || data_scale || ')' ELSE data_type END AS data_type FROM all_tab_columns WHERE owner = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml index 1ddee506..ce6cc7e2 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml @@ -91,3 +91,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE 'timestamp%' THEN lower(data_type) WHEN data_type ILIKE 'date' THEN lower(data_type) WHEN data_type ILIKE 'boolean' THEN 'boolean' WHEN data_type = 'TEXT' THEN 'varchar(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type ILIKE 'char%' THEN 'char(' || CAST(character_maximum_length AS VARCHAR) || ')' WHEN data_type = 'NUMBER' AND numeric_precision = 38 AND numeric_scale = 0 THEN 'bigint' WHEN data_type ILIKE 'num%' THEN 'numeric(' || CAST(numeric_precision AS VARCHAR) || ',' || CAST(numeric_scale AS VARCHAR) || ')' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND tables.table_type = 'BASE TABLE' ORDER BY data_type, table_name LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1503' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: oracle + 
lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, table_name, CASE WHEN data_type LIKE 'TIMESTAMP%%' THEN LOWER(data_type) WHEN data_type = 'DATE' THEN 'date' WHEN data_type = 'VARCHAR2' THEN 'varchar2(' || data_length || ')' WHEN data_type = 'CHAR' THEN 'char(' || data_length || ')' WHEN data_type = 'NUMBER' AND data_precision IS NULL THEN 'number' WHEN data_type = 'NUMBER' AND data_scale = 0 THEN 'number(' || data_precision || ')' WHEN data_type = 'NUMBER' THEN 'number(' || data_precision || ',' || data_scale || ')' ELSE data_type END AS data_type FROM all_tab_columns WHERE owner = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml index 87d80e61..bfcdb14a 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml @@ -79,3 +79,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1505' + test_id: '1006' + test_type: No_Values + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml index 3cfd99ef..b70fa38e 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml +++ 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml @@ -91,3 +91,11 @@ profile_anomaly_types: WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" > '' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly + - id: '1527' + test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml index dbaa2631..1fc026d5 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml @@ -92,3 +92,11 @@ profile_anomaly_types: WHERE "{COLUMN_NAME}" < 'A' AND LEFT("{COLUMN_NAME}", 1) NOT IN ('"', ' ') AND RIGHT("{COLUMN_NAME}", 1) <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly + - id: '1528' + test_id: '1030' + test_type: Non_Alpha_Prefixed_Name + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < 'A' AND SUBSTR("{COLUMN_NAME}", 1, 1) NOT IN ('"', ' ') AND SUBSTR("{COLUMN_NAME}", -1, 1) <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml index a6118bed..6ddaa705 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml @@ -144,3 +144,11 @@ profile_anomaly_types: WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly + - id: '1529' + test_id: '1031' + test_type: Non_Printing_Chars + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", UNISTR('\00A0'), '\x160'), UNISTR('\2009'), '\x8201'), UNISTR('\200B'), '\x8203'), UNISTR('\200C'), '\x8204'), UNISTR('\200D'), '\x8205'), UNISTR('\200E'), '\x8206'), UNISTR('\200F'), '\x8207'), UNISTR('\202F'), '\x8239'), UNISTR('\3000'), '\x12288'), UNISTR('\FEFF'), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", 'X' || UNISTR('\00A0') || UNISTR('\2009') || UNISTR('\200B') || UNISTR('\200C') || UNISTR('\200D') || UNISTR('\200E') || UNISTR('\200F') || UNISTR('\202F') || UNISTR('\3000') || UNISTR('\FEFF'), 'XXXXXXXXXXX') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml index 839c9fc8..c34fefd2 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml +++ 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml @@ -90,3 +90,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1501' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN REGEXP_LIKE(LOWER("{COLUMN_NAME}"), '(-{2,}|0{2,}|9{2,}|x{2,}|z{2,})') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git 
a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml index 005957b5..28c551c4 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml @@ -79,3 +79,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1515' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml index 7efb6ed9..17b8a837 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml @@ -77,3 +77,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1530' + test_id: '1100' + test_type: Potential_PII + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git 
a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml index 74a91f06..7e62bcce 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml @@ -78,3 +78,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" ILIKE '"%"' OR "{COLUMN_NAME}" ILIKE '''%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1509' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%%"' OR "{COLUMN_NAME}" LIKE '''%%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml index f6b3b36f..1509bc0a 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml @@ -72,3 +72,11 @@ profile_anomaly_types: lookup_query: |- created_in_ui error_type: Profile Anomaly + - id: '1518' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: oracle + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml index 
7f13ef99..35dd0dac 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml @@ -72,3 +72,11 @@ profile_anomaly_types: lookup_query: |- created_in_ui error_type: Profile Anomaly + - id: '1519' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: oracle + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml index bd121c7c..899661a6 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml @@ -70,3 +70,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1513' + test_id: '1014' + test_type: Small_Divergent_Value_Ct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml index 381c26c1..2646aca6 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml @@ -73,3 +73,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count 
FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '-{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '0{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP '9{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'x{2,}' OR LOWER("{COLUMN_NAME}"::VARCHAR) REGEXP 'z{2,}' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" = '' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1512' + test_id: '1013' + test_type: Small_Missing_Value_Ct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN REGEXP_LIKE(LOWER("{COLUMN_NAME}"), '(-{2,}|0{2,}|9{2,}|x{2,}|z{2,})') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml index 
3b7f394e..76a199b4 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml @@ -92,3 +92,11 @@ profile_anomaly_types: lookup_query: |- SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) AS B ORDER BY data_type, count DESC; error_type: Profile Anomaly + - id: '1522' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) B ORDER BY data_type, count DESC + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml index 4f7b457b..7abea8b3 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml @@ -87,3 +87,11 @@ profile_anomaly_types: 
lookup_query: |- WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1516' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: oracle + lookup_type: null + lookup_query: |- + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", 'X '',.-', 'X')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") AS cnt FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", 'X '',.-', 'X')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", 'X '',.-', 'X')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml index 0016e44d..96aeb280 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml @@ -78,3 +78,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" 
ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1500' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml index 8771cd40..60cdb242 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml @@ -88,3 +88,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = '{TARGET_SCHEMA}' AND columns.column_name = '{COLUMN_NAME}' AND UPPER(tables.table_type) = 'BASE TABLE' ORDER BY table_name LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1507' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT column_name, table_name FROM all_tab_columns WHERE owner = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY table_name FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml index 1c5bbf16..20bf64e9 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml @@ -77,3 
+77,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1521' + test_id: '1022' + test_type: Unexpected_Emails + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml index 68e6e2e1..b30b37d1 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml @@ -79,3 +79,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1520' + test_id: '1021' + test_type: Unexpected_US_States + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml index ea033f96..02abf507 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml @@ -81,3 +81,11 @@ 
profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", '{PROFILE_RUN_DATE}' :: DATE AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < '1900-01-01'::DATE) OR ("{COLUMN_NAME}" > '{PROFILE_RUN_DATE}' :: DATE + INTERVAL '30 year' ) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1517' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", TO_DATE('{PROFILE_RUN_DATE}', 'YYYY-MM-DD') AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < TO_DATE('1900-01-01', 'YYYY-MM-DD')) OR ("{COLUMN_NAME}" > ADD_MONTHS(TO_DATE('{PROFILE_RUN_DATE}', 'YYYY-MM-DD'), 360)) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml index 7ba71123..82c43fae 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml @@ -81,3 +81,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE lower("{COLUMN_NAME}") IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING('{DETAIL_EXPRESSION}', POSITION(':', '{DETAIL_EXPRESSION}') + 2), '|'))) ) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Profile Anomaly + - id: '1525' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") IN (SELECT 
TRIM(REGEXP_SUBSTR(SUBSTR('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '[^|]+', 1, LEVEL)) FROM DUAL CONNECT BY LEVEL <= REGEXP_COUNT(SUBSTR('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '[^|]+')) GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml index 3fe5b288..10bc3ffc 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml @@ -215,6 +215,31 @@ test_types: ORDER BY {GROUPBY_NAMES} LIMIT {LIMIT}; error_type: Test Results + - id: '8500' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES} + FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: - id: '2506' test_type: Aggregate_Balance @@ -559,3 +584,47 @@ test_types: WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL); + - id: '8006' + test_type: Aggregate_Balance + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + 
'{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total + OR (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml index f5fc0618..80c9cd43 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml @@ -229,6 +229,33 @@ test_types: ORDER BY {GROUPBY_NAMES} LIMIT {LIMIT}; error_type: Test Results + - id: '8504' + test_id: 
'1504' + test_type: Aggregate_Balance_Percent + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES} + FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: - id: '2509' test_type: Aggregate_Balance_Percent @@ -573,3 +600,47 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)); + - id: '8009' + test_type: Aggregate_Balance_Percent + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + 
WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml index 9d594da4..141ddf0b 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml @@ -229,6 +229,33 @@ test_types: ORDER BY {GROUPBY_NAMES} LIMIT {LIMIT}; error_type: Test Results + - id: '8505' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS 
match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES} + FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: - id: '2510' test_type: Aggregate_Balance_Range @@ -573,3 +600,47 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}); + - id: '8010' + test_type: Aggregate_Balance_Range + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml index 676052a2..92bac6db 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml @@ -215,6 +215,31 @@ test_types: ORDER BY {GROUPBY_NAMES} LIMIT {LIMIT}; error_type: Test Results + - id: '8501' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE 
total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES} + FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: - id: '2502' test_type: Aggregate_Minimum @@ -559,3 +584,47 @@ test_types: WHERE total < match_total -- OR (total IS NOT NULL AND match_total IS NULL) -- New categories OR (total IS NULL AND match_total IS NOT NULL); -- Dropped categories + - id: '8002' + test_type: Aggregate_Minimum + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total + -- OR (total IS NOT NULL AND match_total IS NULL) -- New categories + OR (total IS NULL AND match_total IS NOT NULL) diff --git a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml index aa070119..898b9305 100644 --- a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml @@ -100,6 +100,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8001' + test_type: Alpha_Trunc + sql_flavor: oracle + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1364' test_id: '1004' @@ -164,4 +172,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}" , LEN("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LEN("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LEN("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results + - id: '8001' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", 
LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml index 367c833c..42f26e3f 100644 --- a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>=' test_condition: |- {THRESHOLD_VALUE} + - id: '8002' + test_type: Avg_Shift + sql_flavor: oracle + measure: |- + ABS( (AVG(CAST({COLUMN_NAME} AS NUMBER)) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})-1)*POWER(STDDEV({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}-1) * POWER({BASELINE_SD},2)) /NULLIF(COUNT({COLUMN_NAME}) + {BASELINE_VALUE_CT}, 0) )) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1365' test_id: '1005' @@ -159,4 +167,12 @@ test_types: lookup_query: |- SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; error_type: Test Results + - id: '8002' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT AVG(CAST("{COLUMN_NAME}" AS NUMBER)) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml index fbfa7fa1..940a5f01 100644 --- a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml +++ b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml @@ -313,3 +313,37 @@ test_types: FROM ( {CUSTOM_QUERY} ) TEST; + - id: 
'8004' + test_type: CUSTOM + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + CASE + WHEN '{COLUMN_NAME_NO_QUOTES}' IS NULL THEN NULL + ELSE '{COLUMN_NAME_NO_QUOTES}' + END as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + /* TODO: 'custom_query= {CUSTOM_QUERY_ESCAPED}' as input_parameters, */ + 'Skip_Errors={SKIP_ERRORS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( + {CUSTOM_QUERY} + ) TEST diff --git a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml index f9dffc4d..39dbdc70 100644 --- a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml @@ -192,6 +192,28 @@ test_types: ORDER BY {COLUMN_NAME_NO_QUOTES} LIMIT {LIMIT}; error_type: Test Results + - id: '8502' + test_id: '1502' + test_type: Combo_Match + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + MINUS + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test + ORDER BY {COLUMN_NAME_NO_QUOTES} + FETCH 
FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: - id: '2501' test_type: Combo_Match @@ -503,3 +525,42 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test; + - id: '8001' + test_type: Combo_Match + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + MINUS + SELECT {MATCH_GROUPBY_NAMES} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml index 11125999..ae1f4725 100644 --- a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8003' + test_type: Condition_Flag + sql_flavor: oracle + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1366' test_id: '1006' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT}; error_type: Test Results + - id: '8006' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Constant.yaml b/testgen/template/dbsetup_test_types/test_types_Constant.yaml index 2bdd1a04..c3800a9e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Constant.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Constant.yaml @@ -100,6 +100,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8004' + test_type: Constant + sql_flavor: oracle + measure: |- 
+ SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1367' test_id: '1007' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results + - id: '8004' + test_id: '1007' + test_type: Constant + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml index 389bf0af..26833cd5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -104,6 +104,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8005' + test_type: Daily_Record_Ct + sql_flavor: oracle + measure: |- + <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT TRUNC({COLUMN_NAME})) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1368' test_id: '1009' @@ -232,4 +240,12 @@ test_types: lookup_query: |- WITH RECURSIVE daterange(all_dates) AS (SELECT MIN("{COLUMN_NAME}") :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(DAY, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT MAX("{COLUMN_NAME}") :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT "{COLUMN_NAME}" :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY 
"{COLUMN_NAME}" :: DATE ) SELECT p.missing_period, p.prior_available_date, e.period_count as prior_available_date_count, p.next_available_date, f.period_count as next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results + - id: '8009' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + WITH daterange AS (SELECT (SELECT MIN(TRUNC("{COLUMN_NAME}")) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") + LEVEL - 1 AS all_dates FROM DUAL CONNECT BY LEVEL <= (SELECT MAX(TRUNC("{COLUMN_NAME}")) - MIN(TRUNC("{COLUMN_NAME}")) + 1 FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")), existing_periods AS (SELECT DISTINCT TRUNC("{COLUMN_NAME}") AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY TRUNC("{COLUMN_NAME}")) SELECT p.missing_period, p.prior_available_date, e.period_count AS prior_available_date_count, p.next_available_date, f.period_count AS next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON 
(p.next_available_date = f.period) ORDER BY p.missing_period FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml index 02fe0dda..9aedf7dc 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml @@ -101,6 +101,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8006' + test_type: Dec_Trunc + sql_flavor: oracle + measure: |- + SUM(ROUND(ABS(MOD({COLUMN_NAME}, 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1369' test_id: '1011' @@ -166,4 +174,12 @@ test_types: lookup_query: |- SELECT DISTINCT LENGTH(SPLIT_PART("{COLUMN_NAME}" :: TEXT, '.', 2)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY decimal_scale LIMIT {LIMIT}; error_type: Test Results + - id: '8006' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT LENGTH(SUBSTR(TO_CHAR("{COLUMN_NAME}"), INSTR(TO_CHAR("{COLUMN_NAME}"), '.') + 1)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE INSTR(TO_CHAR("{COLUMN_NAME}"), '.') > 0 GROUP BY LENGTH(SUBSTR(TO_CHAR("{COLUMN_NAME}"), INSTR(TO_CHAR("{COLUMN_NAME}"), '.') + 1)) FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml index 54be295e..7ae9b335 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml @@ -101,6 +101,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8007' + test_type: Distinct_Date_Ct + 
sql_flavor: oracle + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1370' test_id: '1012' @@ -163,4 +171,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results + - id: '8012' + test_id: '1012' + test_type: Distinct_Date_Ct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml index 150289ab..501d9f17 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml @@ -100,6 +100,14 @@ test_types: test_operator: <> test_condition: |- {THRESHOLD_VALUE} + - id: '8008' + test_type: Distinct_Value_Ct + sql_flavor: oracle + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1371' test_id: '1013' @@ -162,4 +170,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT}; error_type: Test Results + - id: '8008' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY 
"{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml index b44fcd2d..221563c4 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml @@ -217,6 +217,33 @@ test_types: ORDER BY COALESCE(l.category, o.category) LIMIT {LIMIT}; error_type: Test Results + - id: '8503' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: oracle + lookup_type: null + lookup_query: |- + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + CAST(COUNT(*) AS NUMBER) / CAST(SUM(COUNT(*)) OVER () AS NUMBER) AS pct_of_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + CAST(COUNT(*) AS NUMBER) / CAST(SUM(COUNT(*)) OVER () AS NUMBER) AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ) + SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) + ORDER BY COALESCE(l.category, o.category) + FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: - id: '2503' test_type: Distribution_Shift @@ -590,3 +617,54 @@ test_types: SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) as js_divergence FROM dataset ) rslt; + - id: '8003' + test_type: Distribution_Shift + sql_flavor: oracle + template: |- + -- Relative Entropy: measured by Jensen-Shannon Divergence + -- Smoothed and normalized version of KL divergence, + -- with scores between 0 (identical) and 1 
(maximally different), + -- when using the base-2 logarithm. Formula is: + -- 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m) + -- Log base 2 of x = LN(x)/LN(2) + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + CAST(COUNT(*) AS NUMBER) / CAST(SUM(COUNT(*)) OVER () AS NUMBER) AS pct_of_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + CAST(COUNT(*) AS NUMBER) / CAST(SUM(COUNT(*)) OVER () AS NUMBER) AS pct_of_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ), + dataset + AS ( SELECT COALESCE(l.category, o.category) AS category, + COALESCE(o.pct_of_total, 0.0000001) AS old_pct, + COALESCE(l.pct_of_total, 0.0000001) AS new_pct, + (COALESCE(o.pct_of_total, 0.0000001) + + COALESCE(l.pct_of_total, 0.0000001))/2.0 AS avg_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) ) + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + -- '{GROUPBY_NAMES}' as column_names, + '{THRESHOLD_VALUE}' as threshold_value, + NULL as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN js_divergence > {THRESHOLD_VALUE} THEN 0 ELSE 1 END as result_code, + 'Divergence Level: ' || CAST(js_divergence AS {VARCHAR_TYPE}) || ', Threshold: {THRESHOLD_VALUE}.' 
as result_message, + js_divergence as result_measure + FROM ( + SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) as js_divergence + FROM dataset ) rslt diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml index 480988a5..16065d9e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml @@ -136,6 +136,20 @@ test_types: ORDER BY {GROUPBY_NAMES} LIMIT {LIMIT}; error_type: Test Results + - id: '8510' + test_id: '1510' + test_type: Dupe_Rows + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES} + FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: - id: '2511' test_type: Dupe_Rows @@ -403,3 +417,36 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ) test; + - id: '8011' + test_type: Dupe_Rows + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' duplicate row(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' 
+ END AS result_message, + COALESCE(SUM(record_ct), 0) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml index 1ec48c42..64e9611e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml @@ -100,6 +100,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8009' + test_type: Email_Format + sql_flavor: oracle + measure: |- + SUM(CASE WHEN NOT REGEXP_LIKE({COLUMN_NAME}, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1372' test_id: '1014' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results + - id: '8009' + test_id: '1014' + test_type: Email_Format + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE("{COLUMN_NAME}", '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml index 0cfeecf7..fbf0307b 100644 --- a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml +++ 
b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml @@ -389,3 +389,49 @@ test_types: ELSE COALESCE(interval_minutes::VARCHAR, 'Unknown') END AS result_signal FROM test_data; + - id: '2817' + test_type: Freshness_Trend + sql_flavor: oracle + template: |- + WITH test_data AS ( + SELECT + {CUSTOM_QUERY} AS fingerprint, + ROUND((CAST(TO_TIMESTAMP('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') AS DATE) - + CAST(TO_TIMESTAMP('{BASELINE_SUM}', 'YYYY-MM-DD HH24:MI:SS') AS DATE)) * 24 * 60) AS interval_minutes + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + ) + SELECT '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{SCHEMA_NAME}' AS schema_name, + '{TABLE_NAME}' AS table_name, + '{COLUMN_NAME_NO_QUOTES}' AS column_names, + '{SKIP_ERRORS}' AS threshold_value, + {SKIP_ERRORS} AS skip_errors, + '{INPUT_PARAMETERS}' AS input_parameters, + fingerprint AS result_measure, + CASE + -- Training mode: tolerances not yet calculated + WHEN {LOWER_TOLERANCE} IS NULL OR {UPPER_TOLERANCE} IS NULL THEN -1 + -- No change to table, and we're beyond time range: LATE + WHEN fingerprint = '{BASELINE_VALUE}' AND interval_minutes > {UPPER_TOLERANCE} THEN 0 + -- Table changed outside time range: UNEXPECTED + WHEN fingerprint <> '{BASELINE_VALUE}' + AND NOT interval_minutes BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} THEN 0 + ELSE 1 + END AS result_code, + CASE + -- No change to table, and we're beyond time range: LATE + WHEN fingerprint = '{BASELINE_VALUE}' AND interval_minutes > {UPPER_TOLERANCE} + THEN 'Table unchanged beyond expected schedule' + -- Table changed outside time range: UNEXPECTED + WHEN fingerprint <> '{BASELINE_VALUE}' + AND NOT interval_minutes BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} + THEN 'Table changed outside of expected schedule' + ELSE 'Interval since last update: ' || COALESCE(TO_CHAR(interval_minutes), 
'Unknown') + END AS result_message, + COALESCE(TO_CHAR(interval_minutes), 'Unknown') AS result_signal + FROM test_data diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml index 646cc9c0..87669e7a 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml @@ -99,6 +99,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8010' + test_type: Future_Date + sql_flavor: oracle + measure: |- + SUM(CASE WHEN TRUNC({COLUMN_NAME}) >= TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1373' test_id: '1015' @@ -160,4 +168,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - '{TEST_DATE}'::DATE)) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results + - id: '8010' + test_id: '1015' + test_type: Future_Date + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRUNC("{COLUMN_NAME}") >= TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml index 7f55192c..8361d66e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml @@ -100,6 +100,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8011' + test_type: Future_Date_1Y + sql_flavor: oracle + measure: |- + 
SUM(CASE WHEN TRUNC({COLUMN_NAME}) >= TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') + 365 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1374' test_id: '1016' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE GREATEST(0, SIGN("{COLUMN_NAME}"::DATE - ('{TEST_DATE}'::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results + - id: '8016' + test_id: '1016' + test_type: Future_Date_1Y + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRUNC("{COLUMN_NAME}") >= TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') + 365 GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml index 94655ff8..4003c354 100644 --- a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>=' test_condition: |- {THRESHOLD_VALUE} + - id: '8012' + test_type: Incr_Avg_Shift + sql_flavor: oracle + measure: |- + NVL(ABS(({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME}) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD}), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1375' test_id: '1017' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT AVG("{COLUMN_NAME}" :: FLOAT) AS current_average, SUM("{COLUMN_NAME}" ::FLOAT) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}" )::FLOAT, 0) as current_value_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; error_type: Test Results + - id: '8012' + test_id: '1017' + test_type: 
Incr_Avg_Shift + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT AVG(CAST("{COLUMN_NAME}" AS NUMBER)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS NUMBER)) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}"), 0) as current_value_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml index 85665563..03898e5d 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml @@ -98,6 +98,14 @@ test_types: test_operator: <> test_condition: |- {THRESHOLD_VALUE} + - id: '8013' + test_type: LOV_All + sql_flavor: oracle + measure: |- + LISTAGG(DISTINCT {COLUMN_NAME}, '|') WITHIN GROUP (ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1376' test_id: '1018' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results + - id: '8013' + test_id: '1018' + test_type: LOV_All + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") HAVING LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml index fed0b3ec..feb83f0e 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml +++ 
b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml @@ -204,6 +204,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8014' + test_type: LOV_Match + sql_flavor: oracle + measure: |- + SUM(CASE WHEN {COLUMN_NAME} IS NOT NULL AND {COLUMN_NAME} NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1377' test_id: '1019' @@ -265,4 +273,12 @@ test_types: lookup_query: |- SELECT DISTINCT NULLIF("{COLUMN_NAME}", '') AS "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results + - id: '8014' + test_id: '1019' + test_type: LOV_Match + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL AND "{COLUMN_NAME}" NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml index 545d25f6..86e3437f 100644 --- a/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml @@ -88,6 +88,14 @@ test_types: test_operator: NOT BETWEEN test_condition: |- {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} + - id: '8016' + test_type: Metric_Trend + sql_flavor: oracle + measure: |- + {CUSTOM_QUERY} + test_operator: NOT BETWEEN + test_condition: |- + {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} target_data_lookups: - id: '1484' test_id: '1514' @@ -166,4 +174,15 @@ test_types: {UPPER_TOLERANCE} AS upper_bound FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; error_type: Test Results + - id: '8514' + test_id: '1514' + test_type: Metric_Trend + sql_flavor: oracle + 
lookup_type: null + lookup_query: |- + SELECT {CUSTOM_QUERY} AS current_count, + {LOWER_TOLERANCE} AS lower_bound, + {UPPER_TOLERANCE} AS upper_bound + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml index 01dbf230..80950917 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml @@ -100,6 +100,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8015' + test_type: Min_Date + sql_flavor: oracle + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < TO_DATE('{BASELINE_VALUE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1378' test_id: '1020' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: DATE < '{BASELINE_VALUE}' :: DATE GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results + - id: '8015' + test_id: '1020' + test_type: Min_Date + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < TO_DATE('{BASELINE_VALUE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml index bfac4c70..96b6ff1d 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml @@ -100,6 +100,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8016' + test_type: Min_Val + 
sql_flavor: oracle + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1379' test_id: '1021' @@ -160,4 +168,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT}; error_type: Test Results + - id: '8016' + test_id: '1021' + test_type: Min_Val + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml index 3bc7069a..82e70601 100644 --- a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>=' test_condition: |- {THRESHOLD_VALUE} + - id: '8017' + test_type: Missing_Pct + sql_flavor: oracle + measure: |- + ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS NUMBER) / CAST({BASELINE_CT} AS NUMBER))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS NUMBER) / CAST(NULLIF(COUNT(*), 0) AS NUMBER)))) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1380' test_id: '1022' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL OR "{COLUMN_NAME}" :: VARCHAR(255) = '' LIMIT {LIMIT}; error_type: Test Results + - id: '8017' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml index 4ce0fc6a..d126e843 100644 --- a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8018' + test_type: Monthly_Rec_Ct + sql_flavor: oracle + measure: |- + (MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1381' test_id: '1023' @@ -228,4 +236,12 @@ test_types: lookup_query: |- WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('month', MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT DATEADD(MONTH, 1, d.all_dates) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('month', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS (SELECT DISTINCT DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('month',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_month, e.period_count as prior_available_month_count, p.next_available_month, f.period_count as next_available_month_count FROM (SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period 
LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results + - id: '8023' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + WITH daterange AS (SELECT ADD_MONTHS((SELECT TRUNC(MIN("{COLUMN_NAME}"), 'MM') FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), LEVEL - 1) AS all_dates FROM DUAL CONNECT BY LEVEL <= MONTHS_BETWEEN((SELECT TRUNC(MAX("{COLUMN_NAME}"), 'MM') FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), (SELECT TRUNC(MIN("{COLUMN_NAME}"), 'MM') FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + 1), existing_periods AS (SELECT DISTINCT TRUNC("{COLUMN_NAME}", 'MM') AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY TRUNC("{COLUMN_NAME}", 'MM')) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml 
b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml index be6ad5eb..ff0ab45e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml @@ -105,6 +105,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8019' + test_type: Outlier_Pct_Above + sql_flavor: oracle + measure: |- + CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS NUMBER) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS NUMBER) / NULLIF(COUNT({COLUMN_NAME}), 0) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1382' test_id: '1024' @@ -166,4 +174,12 @@ test_types: lookup_query: |- SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results + - id: '8019' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS NUMBER) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml index 7d192436..469a36fc 100644 --- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml @@ -105,6 +105,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8020' + test_type: Outlier_Pct_Below + sql_flavor: oracle + measure: |- + CAST(SUM(CASE 
WHEN CAST({COLUMN_NAME} AS NUMBER) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS NUMBER) / NULLIF(COUNT({COLUMN_NAME}), 0) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1383' test_id: '1025' @@ -166,4 +174,12 @@ test_types: lookup_query: |- SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" :: FLOAT < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC; error_type: Test Results + - id: '8020' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS NUMBER) < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml index 6fd1f981..cb32a132 100644 --- a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml @@ -100,6 +100,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8021' + test_type: Pattern_Match + sql_flavor: oracle + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CASE WHEN REGEXP_LIKE(NULLIF(TO_CHAR({COLUMN_NAME}), ''), '{BASELINE_VALUE}') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1384' test_id: '1026' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE(NULLIF("{COLUMN_NAME}"::VARCHAR, ''),'{BASELINE_VALUE}') != 1 GROUP BY 
"{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results + - id: '8021' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE(NULLIF(TO_CHAR("{COLUMN_NAME}"), ''), '{BASELINE_VALUE}') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml index c69df2e2..eb1b0ba4 100644 --- a/testgen/template/dbsetup_test_types/test_types_Recency.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8022' + test_type: Recency + sql_flavor: oracle + measure: |- + <%DATEDIFF_DAY;MAX({COLUMN_NAME});TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1385' test_id: '1028' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results + - id: '8022' + test_id: '1028' + test_type: Recency + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE <%DATEDIFF_DAY;col;TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > {THRESHOLD_VALUE} FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Required.yaml 
b/testgen/template/dbsetup_test_types/test_types_Required.yaml index fcb3200b..bb2cedb6 100644 --- a/testgen/template/dbsetup_test_types/test_types_Required.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Required.yaml @@ -99,6 +99,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8023' + test_type: Required + sql_flavor: oracle + measure: |- + COUNT(*) - COUNT({COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1386' test_id: '1030' @@ -159,4 +167,12 @@ test_types: lookup_query: |- SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT}; error_type: Test Results + - id: '8023' + test_id: '1030' + test_type: Required + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml index 47c71112..7be82ea1 100644 --- a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml @@ -98,6 +98,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8024' + test_type: Row_Ct + sql_flavor: oracle + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1387' test_id: '1031' @@ -162,4 +170,12 @@ test_types: lookup_query: |- WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(100 *(current_count - {THRESHOLD_VALUE}) :: FLOAT / {THRESHOLD_VALUE} :: FLOAT,2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE}; error_type: Test Results + - id: '8024' + test_id: '1031' + test_type: Row_Ct + sql_flavor: oracle + lookup_type: null + 
lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) / {THRESHOLD_VALUE}, 2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml index 08209512..7eb8b421 100644 --- a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml @@ -99,6 +99,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8025' + test_type: Row_Ct_Pct + sql_flavor: oracle + measure: |- + ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT}) / {BASELINE_CT}, 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1388' test_id: '1032' @@ -162,4 +170,12 @@ test_types: lookup_query: |- WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) :: FLOAT / {BASELINE_CT} :: FLOAT,2)) AS row_count_pct_difference FROM cte; error_type: Test Results + - id: '8025' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) / {BASELINE_CT}, 2)) AS row_count_pct_difference FROM cte + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Schema_Drift.yaml b/testgen/template/dbsetup_test_types/test_types_Schema_Drift.yaml index d1ea92cf..48cc710b 100644 --- a/testgen/template/dbsetup_test_types/test_types_Schema_Drift.yaml +++ 
b/testgen/template/dbsetup_test_types/test_types_Schema_Drift.yaml @@ -46,7 +46,7 @@ test_types: AND id <> '{TEST_RUN_ID}'::UUID ), table_changes AS ( - SELECT + SELECT dsl.table_name, MAX(prev_test.last_run_time) as window_start, MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, @@ -61,7 +61,7 @@ test_types: AND dsl.change_date > prev_test.last_run_time GROUP BY dsl.table_name ) - SELECT + SELECT '{TEST_TYPE}' AS test_type, '{TEST_DEFINITION_ID}' AS test_definition_id, '{TEST_SUITE_ID}' AS test_suite_id, @@ -70,14 +70,14 @@ test_types: '{SCHEMA_NAME}' AS schema_name, table_name, '{INPUT_PARAMETERS}' AS input_parameters, - (CASE + (CASE WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'D' - ELSE 'M' + ELSE 'M' END) - || '|' || column_adds - || '|' || column_drops - || '|' || column_mods + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods || '|' || window_start::TEXT AS result_signal, 0 AS result_code, @@ -101,7 +101,7 @@ test_types: AND id <> '{TEST_RUN_ID}'::UUID ), table_changes AS ( - SELECT + SELECT dsl.table_name, MAX(prev_test.last_run_time) as window_start, MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, @@ -116,7 +116,7 @@ test_types: AND dsl.change_date > prev_test.last_run_time GROUP BY dsl.table_name ) - SELECT + SELECT '{TEST_TYPE}' AS test_type, '{TEST_DEFINITION_ID}' AS test_definition_id, '{TEST_SUITE_ID}' AS test_suite_id, @@ -125,14 +125,14 @@ test_types: '{SCHEMA_NAME}' AS schema_name, table_name, '{INPUT_PARAMETERS}' AS input_parameters, - (CASE + (CASE WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 
'D' - ELSE 'M' + ELSE 'M' END) - || '|' || column_adds - || '|' || column_drops - || '|' || column_mods + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods || '|' || window_start::TEXT AS result_signal, 0 AS result_code, @@ -156,7 +156,7 @@ test_types: AND id <> '{TEST_RUN_ID}'::UUID ), table_changes AS ( - SELECT + SELECT dsl.table_name, MAX(prev_test.last_run_time) as window_start, MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, @@ -171,7 +171,7 @@ test_types: AND dsl.change_date > prev_test.last_run_time GROUP BY dsl.table_name ) - SELECT + SELECT '{TEST_TYPE}' AS test_type, '{TEST_DEFINITION_ID}' AS test_definition_id, '{TEST_SUITE_ID}' AS test_suite_id, @@ -180,14 +180,14 @@ test_types: '{SCHEMA_NAME}' AS schema_name, table_name, '{INPUT_PARAMETERS}' AS input_parameters, - (CASE + (CASE WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'D' - ELSE 'M' + ELSE 'M' END) - || '|' || column_adds - || '|' || column_drops - || '|' || column_mods + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods || '|' || window_start::TEXT AS result_signal, 0 AS result_code, @@ -211,7 +211,7 @@ test_types: AND id <> '{TEST_RUN_ID}'::UUID ), table_changes AS ( - SELECT + SELECT dsl.table_name, MAX(prev_test.last_run_time) as window_start, MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, @@ -226,7 +226,7 @@ test_types: AND dsl.change_date > prev_test.last_run_time GROUP BY dsl.table_name ) - SELECT + SELECT '{TEST_TYPE}' AS test_type, '{TEST_DEFINITION_ID}' AS test_definition_id, '{TEST_SUITE_ID}' AS test_suite_id, @@ -235,14 +235,14 @@ test_types: '{SCHEMA_NAME}' AS schema_name, table_name, '{INPUT_PARAMETERS}' AS input_parameters, - (CASE + (CASE WHEN last_add_date IS 
NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'D' - ELSE 'M' + ELSE 'M' END) - || '|' || column_adds - || '|' || column_drops - || '|' || column_mods + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods || '|' || window_start::TEXT AS result_signal, 0 AS result_code, @@ -266,7 +266,7 @@ test_types: AND id <> '{TEST_RUN_ID}'::UUID ), table_changes AS ( - SELECT + SELECT dsl.table_name, MAX(prev_test.last_run_time) as window_start, MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, @@ -281,7 +281,7 @@ test_types: AND dsl.change_date > prev_test.last_run_time GROUP BY dsl.table_name ) - SELECT + SELECT '{TEST_TYPE}' AS test_type, '{TEST_DEFINITION_ID}' AS test_definition_id, '{TEST_SUITE_ID}' AS test_suite_id, @@ -290,14 +290,14 @@ test_types: '{SCHEMA_NAME}' AS schema_name, table_name, '{INPUT_PARAMETERS}' AS input_parameters, - (CASE + (CASE WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'D' - ELSE 'M' + ELSE 'M' END) - || '|' || column_adds - || '|' || column_drops - || '|' || column_mods + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods || '|' || window_start::TEXT AS result_signal, 0 AS result_code, @@ -321,7 +321,7 @@ test_types: AND id <> '{TEST_RUN_ID}'::UUID ), table_changes AS ( - SELECT + SELECT dsl.table_name, MAX(prev_test.last_run_time) as window_start, MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, @@ -336,7 +336,7 @@ test_types: AND dsl.change_date > prev_test.last_run_time GROUP BY dsl.table_name ) - SELECT + SELECT '{TEST_TYPE}' AS test_type, '{TEST_DEFINITION_ID}' AS test_definition_id, 
'{TEST_SUITE_ID}' AS test_suite_id, @@ -345,14 +345,14 @@ test_types: '{SCHEMA_NAME}' AS schema_name, table_name, '{INPUT_PARAMETERS}' AS input_parameters, - (CASE + (CASE WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'D' - ELSE 'M' + ELSE 'M' END) - || '|' || column_adds - || '|' || column_drops - || '|' || column_mods + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods || '|' || window_start::TEXT AS result_signal, 0 AS result_code, @@ -376,7 +376,7 @@ test_types: AND id <> '{TEST_RUN_ID}'::UUID ), table_changes AS ( - SELECT + SELECT dsl.table_name, MAX(prev_test.last_run_time) as window_start, MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, @@ -391,7 +391,7 @@ test_types: AND dsl.change_date > prev_test.last_run_time GROUP BY dsl.table_name ) - SELECT + SELECT '{TEST_TYPE}' AS test_type, '{TEST_DEFINITION_ID}' AS test_definition_id, '{TEST_SUITE_ID}' AS test_suite_id, @@ -400,14 +400,69 @@ test_types: '{SCHEMA_NAME}' AS schema_name, table_name, '{INPUT_PARAMETERS}' AS input_parameters, - (CASE + (CASE WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'D' - ELSE 'M' + ELSE 'M' END) - || '|' || column_adds - || '|' || column_drops - || '|' || column_mods + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods + || '|' || window_start::TEXT + AS result_signal, + 0 AS result_code, + CASE WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'Table added. ' ELSE '' END + || CASE WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'Table dropped. 
' ELSE '' END + || CASE WHEN column_adds > 0 THEN column_adds || ' columns added. ' ELSE '' END + || CASE WHEN column_drops > 0 THEN column_drops || ' columns dropped. ' ELSE '' END + || CASE WHEN column_mods > 0 THEN column_mods || ' columns modified. ' ELSE '' END + AS result_message, + column_adds + column_drops + column_mods AS result_measure + FROM table_changes; + - id: '8014' + test_type: Schema_Drift + sql_flavor: oracle + template: |- + WITH prev_test AS ( + SELECT MAX(test_starttime) AS last_run_time + FROM {APP_SCHEMA_NAME}.test_runs + WHERE test_suite_id = '{TEST_SUITE_ID}'::UUID + -- Ignore current run + AND id <> '{TEST_RUN_ID}'::UUID + ), + table_changes AS ( + SELECT + dsl.table_name, + MAX(prev_test.last_run_time) as window_start, + MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, + MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'D' THEN dsl.change_date ELSE NULL END) as last_drop_date, + COUNT(*) FILTER (WHERE dsl.column_id IS NOT NULL AND dsl.change = 'A') AS column_adds, + COUNT(*) FILTER (WHERE dsl.column_id IS NOT NULL AND dsl.change = 'D') AS column_drops, + COUNT(*) FILTER (WHERE dsl.column_id IS NOT NULL AND dsl.change = 'M') AS column_mods + FROM {APP_SCHEMA_NAME}.data_structure_log dsl + CROSS JOIN prev_test + WHERE dsl.table_groups_id = '{TABLE_GROUPS_ID}'::UUID + -- if no previous tests, this comparison yields null and nothing is counted + AND dsl.change_date > prev_test.last_run_time + GROUP BY dsl.table_name + ) + SELECT + '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{SCHEMA_NAME}' AS schema_name, + table_name, + '{INPUT_PARAMETERS}' AS input_parameters, + (CASE + WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' + WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR 
last_drop_date > last_add_date) THEN 'D' + ELSE 'M' + END) + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods || '|' || window_start::TEXT AS result_signal, 0 AS result_code, diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml index c5f9a5c6..65bf8123 100644 --- a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml @@ -101,6 +101,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8026' + test_type: Street_Addr_Pattern + sql_flavor: oracle + measure: |- + 100.0*SUM(CASE WHEN REGEXP_LIKE(TO_CHAR({COLUMN_NAME}), '^[0-9]{1,5}[a-zA-Z]?[[:space:]][[:alnum:]_]{1,5}\.?[[:space:]]?[[:alnum:]_]*[[:space:]]?[[:alnum:]_]*[[:space:]][a-zA-Z]{1,6}\.?[[:space:]]?[0-9]{0,5}[A-Z]{0,1}$') THEN 1 ELSE 0 END) / NULLIF(COUNT({COLUMN_NAME}), 0) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1389' test_id: '1033' @@ -163,4 +171,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$') != 1 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results + - id: '8033' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE(TO_CHAR("{COLUMN_NAME}"), '^[0-9]{1,5}[a-zA-Z]?[[:space:]][[:alnum:]_]{1,5}\.?[[:space:]]?[[:alnum:]_]*[[:space:]]?[[:alnum:]_]*[[:space:]][a-zA-Z]{1,6}\.?[[:space:]]?[0-9]{0,5}[A-Z]{0,1}$') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test 
Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml index ed3e6340..74ccca3c 100644 --- a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml @@ -55,9 +55,9 @@ test_types: {SKIP_ERRORS} AS skip_errors, '{INPUT_PARAMETERS}' AS input_parameters, fingerprint AS result_signal, - CASE + CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 - ELSE 1 + ELSE 1 END AS result_code, CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' @@ -88,9 +88,9 @@ test_types: {SKIP_ERRORS} as skip_errors, '{INPUT_PARAMETERS}' as input_parameters, fingerprint as result_signal, - CASE + CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 - ELSE 1 + ELSE 1 END AS result_code, CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' @@ -120,9 +120,9 @@ test_types: {SKIP_ERRORS} as skip_errors, '{INPUT_PARAMETERS}' as input_parameters, fingerprint as result_signal, - CASE + CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 - ELSE 1 + ELSE 1 END AS result_code, CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' @@ -152,9 +152,9 @@ test_types: {SKIP_ERRORS} as skip_errors, '{INPUT_PARAMETERS}' as input_parameters, fingerprint as result_signal, - CASE + CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 - ELSE 1 + ELSE 1 END AS result_code, CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' 
@@ -184,9 +184,9 @@ test_types: {SKIP_ERRORS} as skip_errors, '{INPUT_PARAMETERS}' as input_parameters, fingerprint as result_signal, - CASE + CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 - ELSE 1 + ELSE 1 END AS result_code, CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' @@ -216,9 +216,9 @@ test_types: {SKIP_ERRORS} as skip_errors, '{INPUT_PARAMETERS}' as input_parameters, fingerprint as result_signal, - CASE + CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 - ELSE 1 + ELSE 1 END AS result_code, CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' @@ -248,9 +248,9 @@ test_types: {SKIP_ERRORS} as skip_errors, '{INPUT_PARAMETERS}' as input_parameters, fingerprint as result_signal, - CASE + CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 - ELSE 1 + ELSE 1 END AS result_code, CASE WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' @@ -264,3 +264,35 @@ test_types: FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test; + - id: '8012' + test_type: Table_Freshness + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + fingerprint as result_signal, + CASE + WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 + ELSE 1 + END AS result_code, + CASE + WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' 
+ ELSE 'Table change detected.' + END AS result_message, + CASE + WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 + ELSE 1 + END AS result_measure + FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml index d4d1152b..da32668e 100644 --- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml @@ -158,6 +158,26 @@ test_types: GROUP BY {COLUMN_NAME_NO_QUOTES} LIMIT {LIMIT}; error_type: Test Results + - id: '8508' + test_id: '1508' + test_type: Timeframe_Combo_Gain + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + MINUS + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: - id: '2507' test_type: Timeframe_Combo_Gain @@ -479,3 +499,44 @@ test_types: AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) GROUP BY {COLUMN_NAME_NO_QUOTES} ) test; + - id: '8007' + test_type: Timeframe_Combo_Gain + sql_flavor: oracle + template: |- + SELECT 
'{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS VARCHAR2(20)) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + MINUS + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml index 24b17cc4..6737e1f3 100644 --- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml @@ -273,6 +273,38 @@ test_types: LIMIT {LIMIT_2} ) error_type: Test Results + - id: '8509' + test_id: '1509' + test_type: Timeframe_Combo_Match + sql_flavor: oracle + lookup_type: null + lookup_query: |- + 
SELECT * FROM ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + MINUS + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) WHERE ROWNUM <= {LIMIT_2} + UNION ALL + SELECT * FROM ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + MINUS + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) WHERE ROWNUM <= {LIMIT_2} + error_type: Test Results test_templates: - id: '2508' test_type: Timeframe_Combo_Match @@ -706,3 +738,57 @@ test_types: AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE})) ) ) test; + - id: '8008' + test_type: Timeframe_Combo_Match + sql_flavor: oracle + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as 
table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS VARCHAR2(20)) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} + MINUS + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} + MINUS + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} + ) + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_US_State.yaml 
b/testgen/template/dbsetup_test_types/test_types_US_State.yaml index d663db1f..22bd616e 100644 --- a/testgen/template/dbsetup_test_types/test_types_US_State.yaml +++ b/testgen/template/dbsetup_test_types/test_types_US_State.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8027' + test_type: US_State + sql_flavor: oracle + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1390' test_id: '1036' @@ -162,4 +170,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NULLIF("{COLUMN_NAME}", '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results + - id: '8036' + test_id: '1036' + test_type: US_State + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL AND "{COLUMN_NAME}" NOT IN 
('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique.yaml b/testgen/template/dbsetup_test_types/test_types_Unique.yaml index a084f307..4e300c00 100644 --- a/testgen/template/dbsetup_test_types/test_types_Unique.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Unique.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8028' + test_type: Unique + sql_flavor: oracle + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1391' test_id: '1034' @@ -163,4 +171,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results + - id: '8028' + test_id: '1034' + test_type: Unique + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml index 4f79e0dd..8229f185 100644 --- a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>=' test_condition: |- 
{THRESHOLD_VALUE} + - id: '8029' + test_type: Unique_Pct + sql_flavor: oracle + measure: |- + ABS(2.0 * ASIN(SQRT(CAST({BASELINE_UNIQUE_CT} AS NUMBER) / CAST({BASELINE_VALUE_CT} AS NUMBER))) - 2 * ASIN(SQRT(CAST(COUNT(DISTINCT {COLUMN_NAME}) AS NUMBER) / CAST(NULLIF(COUNT({COLUMN_NAME}), 0) AS NUMBER)))) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1392' test_id: '1035' @@ -162,4 +170,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT}; error_type: Test Results + - id: '8029' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml index e2e2f9ce..e037726b 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8036' + test_type: Valid_Characters + sql_flavor: oracle + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME}, 'X' || UNISTR('\00A0') || UNISTR('\200B') || UNISTR('\FEFF') || UNISTR('\202F') || UNISTR('\2009') || UNISTR('\3000') || UNISTR('\200C'), 'XXXXXXXX') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1397' test_id: '1043' @@ -166,4 +174,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS 
record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHAR(160) || CHAR(8203) || CHAR(65279) || CHAR(8239) || CHAR(8201) || CHAR(12288) || CHAR(8204), 'XXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results + - id: '8043' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", 'X' || UNISTR('\00A0') || UNISTR('\200B') || UNISTR('\FEFF') || UNISTR('\202F') || UNISTR('\2009') || UNISTR('\3000') || UNISTR('\200C'), 'XXXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml index 07dd037f..79328926 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml @@ -100,5 +100,13 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8033' + test_type: Valid_Month + sql_flavor: oracle + measure: |- + SUM(CASE WHEN {COLUMN_NAME} IS NOT NULL AND {COLUMN_NAME} NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: [] test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml index 29e12359..b00f3a68 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml 
@@ -99,6 +99,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8034' + test_type: Valid_US_Zip + sql_flavor: oracle + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1398' test_id: '1044' @@ -161,4 +169,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results + - id: '8044' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml index f2611807..d0a91272 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml @@ -100,6 +100,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8035' + test_type: Valid_US_Zip3 + sql_flavor: oracle + measure: |- + SUM(CASE WHEN TRANSLATE({COLUMN_NAME},'012345678','999999999') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1399' test_id: '1045' @@ -162,4 +170,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE 
TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT}; error_type: Test Results + - id: '8045' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml index 6cab00de..9970df89 100644 --- a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml @@ -105,6 +105,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8032' + test_type: Variability_Decrease + sql_flavor: oracle + measure: |- + 100.0*STDDEV(CAST({COLUMN_NAME} AS NUMBER))/CAST({BASELINE_SD} AS NUMBER) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1395' test_id: '1041' @@ -163,4 +171,12 @@ test_types: lookup_query: |- SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; error_type: Test Results + - id: '8032' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS NUMBER)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml index e05a1234..5adc92b0 100644 --- 
a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8031' + test_type: Variability_Increase + sql_flavor: oracle + measure: |- + 100.0*STDDEV(CAST({COLUMN_NAME} AS NUMBER))/CAST({BASELINE_SD} AS NUMBER) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1394' test_id: '1040' @@ -167,4 +175,12 @@ test_types: lookup_query: |- SELECT STDDEV(CAST("{COLUMN_NAME}" AS FLOAT)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; error_type: Test Results + - id: '8031' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS NUMBER)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml index 3bc15367..a2b2d4aa 100644 --- a/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml @@ -89,6 +89,14 @@ test_types: test_operator: NOT BETWEEN test_condition: |- {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} + - id: '2815' + test_type: Volume_Trend + sql_flavor: oracle + measure: |- + {CUSTOM_QUERY} + test_operator: NOT BETWEEN + test_condition: |- + {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} target_data_lookups: - id: '1477' test_id: '1513' @@ -167,4 +175,15 @@ test_types: {UPPER_TOLERANCE} AS upper_bound FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"; error_type: Test Results + - id: '8015' + test_id: '1513' + test_type: Volume_Trend + sql_flavor: oracle + lookup_type: null + lookup_query: |- + SELECT {CUSTOM_QUERY} AS current_count, + {LOWER_TOLERANCE} AS lower_bound, + 
{UPPER_TOLERANCE} AS upper_bound + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml index 10c98cc6..62eb1975 100644 --- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml @@ -101,6 +101,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8030' + test_type: Weekly_Rec_Ct + sql_flavor: oracle + measure: |- + MAX(<%DATEDIFF_WEEK;DATE '1800-01-01';{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;DATE '1800-01-01';{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;DATE '1800-01-01';{COLUMN_NAME}%>) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1393' test_id: '1037' @@ -228,4 +236,12 @@ test_types: lookup_query: |- WITH RECURSIVE daterange(all_dates) AS (SELECT DATE_TRUNC('week',MIN("{COLUMN_NAME}")) :: DATE AS all_dates FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" UNION ALL SELECT (d.all_dates + INTERVAL '1 week' ) :: DATE AS all_dates FROM daterange d WHERE d.all_dates < (SELECT DATE_TRUNC('week', MAX("{COLUMN_NAME}")) :: DATE FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") ), existing_periods AS ( SELECT DISTINCT DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE AS period, COUNT(1) as period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY DATE_TRUNC('week',"{COLUMN_NAME}") :: DATE ) SELECT p.missing_period, p.prior_available_week, e.period_count as prior_available_week_count, p.next_available_week, f.period_count as next_available_week_count FROM( SELECT d.all_dates as missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS 
NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates ) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period LIMIT {LIMIT}; error_type: Test Results + - id: '8037' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: oracle + lookup_type: null + lookup_query: |- + WITH daterange AS (SELECT (SELECT TRUNC(MIN("{COLUMN_NAME}"), 'IW') FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") + (LEVEL - 1) * 7 AS all_dates FROM DUAL CONNECT BY LEVEL <= CEIL((TRUNC((SELECT MAX("{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) - TRUNC((SELECT MIN("{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"))) / 7) + 1), existing_periods AS (SELECT DISTINCT TRUNC("{COLUMN_NAME}", 'IW') AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY TRUNC("{COLUMN_NAME}", 'IW')) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period FETCH FIRST {LIMIT} ROWS ONLY + error_type: Test Results test_templates: [] diff --git a/testgen/template/flavors/oracle/data_chars/get_schema_ddf.sql b/testgen/template/flavors/oracle/data_chars/get_schema_ddf.sql new file mode 100644 index 00000000..d4f4f578 --- /dev/null +++ b/testgen/template/flavors/oracle/data_chars/get_schema_ddf.sql @@ -0,0 +1,40 @@ 
+SELECT + c.owner AS schema_name, + c.table_name, + c.column_name, + CASE + WHEN c.data_type IN ('VARCHAR2', 'NVARCHAR2', 'CHAR', 'NCHAR') THEN 'char(' || c.data_length || ')' + WHEN c.data_type = 'NUMBER' AND c.data_precision IS NOT NULL AND c.data_scale = 0 THEN 'bigint' + WHEN c.data_type = 'NUMBER' AND c.data_precision IS NOT NULL THEN 'numeric(' || c.data_precision || ',' || c.data_scale || ')' + WHEN c.data_type = 'NUMBER' THEN 'int' + WHEN c.data_type IN ('FLOAT', 'BINARY_FLOAT', 'BINARY_DOUBLE') THEN 'numeric' + WHEN c.data_type LIKE 'TIMESTAMP%' THEN 'timestamp' + ELSE LOWER(c.data_type) + END AS column_type, + CASE + WHEN c.data_type IN ('VARCHAR2', 'NVARCHAR2', 'CHAR', 'NCHAR') THEN c.data_type || '(' || c.data_length || ')' + WHEN c.data_type = 'NUMBER' AND c.data_precision IS NOT NULL THEN 'NUMBER(' || c.data_precision || ',' || c.data_scale || ')' + WHEN c.data_type = 'FLOAT' THEN 'FLOAT(' || c.data_precision || ')' + ELSE c.data_type + END AS db_data_type, + c.column_id AS ordinal_position, + CASE + WHEN c.data_type IN ('VARCHAR2', 'NVARCHAR2', 'CHAR', 'NCHAR') + THEN 'A' + WHEN c.data_type = 'BOOLEAN' + THEN 'B' + WHEN c.data_type = 'DATE' OR c.data_type LIKE 'TIMESTAMP%' + THEN 'D' + WHEN c.data_type IN ('NUMBER', 'FLOAT', 'BINARY_FLOAT', 'BINARY_DOUBLE') + THEN 'N' + ELSE 'X' + END AS general_type, + CASE + WHEN c.data_type = 'NUMBER' AND c.data_scale > 0 THEN 1 + ELSE 0 + END AS is_decimal, + t.num_rows AS approx_record_ct +FROM all_tab_columns c +LEFT JOIN all_tables t ON c.owner = t.owner AND c.table_name = t.table_name +WHERE c.owner = '{DATA_SCHEMA}' {TABLE_CRITERIA} +ORDER BY c.owner, c.table_name, c.column_id diff --git a/testgen/template/flavors/oracle/gen_query_tests/gen_Dupe_Rows.sql b/testgen/template/flavors/oracle/gen_query_tests/gen_Dupe_Rows.sql new file mode 100644 index 00000000..1aa96960 --- /dev/null +++ b/testgen/template/flavors/oracle/gen_query_tests/gen_Dupe_Rows.sql @@ -0,0 +1,55 @@ +WITH latest_run AS ( + -- Latest 
complete profiling run before as-of-date + SELECT MAX(run_date) AS last_run_date + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID + AND run_date::DATE <= :AS_OF_DATE ::DATE +), +selected_tables AS ( + SELECT profile_run_id, schema_name, table_name, + STRING_AGG(:QUOTE || column_name || :QUOTE, ', ' ORDER BY position) AS groupby_names + FROM profile_results p + INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID + -- Skip X types - Oracle does not allow grouping by types like BLOB, RAW, BFILE, CLOB, NCLOB, LONG + AND general_type <> 'X' + GROUP BY profile_run_id, schema_name, table_name +) +INSERT INTO test_definitions ( + table_groups_id, test_suite_id, test_type, + schema_name, table_name, + test_active, last_auto_gen_date, profiling_as_of_date, profile_run_id, + groupby_names, skip_errors +) +SELECT + :TABLE_GROUPS_ID ::UUID AS table_groups_id, + :TEST_SUITE_ID ::UUID AS test_suite_id, + 'Dupe_Rows' AS test_type, + s.schema_name, + s.table_name, + 'Y' AS test_active, + :RUN_DATE ::TIMESTAMP AS last_auto_gen_date, + :AS_OF_DATE ::TIMESTAMP AS profiling_as_of_date, + s.profile_run_id, + s.groupby_names, + 0 AS skip_errors +FROM selected_tables s + -- Only insert if test type is active +WHERE EXISTS (SELECT 1 FROM test_types WHERE test_type = 'Dupe_Rows' AND active = 'Y') + -- Only insert if test type is included in generation set + AND EXISTS (SELECT 1 FROM generation_sets WHERE test_type = 'Dupe_Rows' AND generation_set = :GENERATION_SET) + +-- Match "uix_td_autogen_table" unique index exactly +ON CONFLICT (test_suite_id, test_type, schema_name, table_name) +WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NOT NULL + AND column_name IS NULL + +-- Update tests if they already exist +DO UPDATE SET + test_active = EXCLUDED.test_active, + last_auto_gen_date = EXCLUDED.last_auto_gen_date, + groupby_names = EXCLUDED.groupby_names, + skip_errors = EXCLUDED.skip_errors +-- Ignore 
locked tests +WHERE test_definitions.lock_refresh = 'N'; diff --git a/testgen/template/flavors/oracle/gen_query_tests/gen_Freshness_Trend.sql b/testgen/template/flavors/oracle/gen_query_tests/gen_Freshness_Trend.sql new file mode 100644 index 00000000..d22e79d6 --- /dev/null +++ b/testgen/template/flavors/oracle/gen_query_tests/gen_Freshness_Trend.sql @@ -0,0 +1,193 @@ +WITH latest_run AS ( + -- Latest complete profiling run before as-of-date + SELECT MAX(run_date) AS last_run_date + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID + AND run_date::DATE <= :AS_OF_DATE ::DATE +), +latest_results AS ( + -- Column results for latest run + SELECT p.profile_run_id, p.schema_name, p.table_name, p.column_name, + p.functional_data_type, p.general_type, + p.distinct_value_ct, p.record_ct, p.null_value_ct, + p.max_value, p.min_value, p.avg_value, p.stdev_value + FROM profile_results p + INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + INNER JOIN data_table_chars dtc ON ( + dtc.table_groups_id = p.table_groups_id + AND dtc.schema_name = p.schema_name + AND dtc.table_name = p.table_name + -- Ignore dropped tables + AND dtc.drop_date IS NULL + ) + WHERE p.table_groups_id = :TABLE_GROUPS_ID ::UUID +), +-- IDs - TOP 2 +id_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1 + WHEN functional_data_type = 'ID-Secondary' THEN 2 + ELSE 3 + END, distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'ID%' +), +-- Process Date - TOP 1 +process_date_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN 
column_name ILIKE '%mod%' THEN 1 + WHEN column_name ILIKE '%up%' THEN 1 + WHEN column_name ILIKE '%cr%' THEN 2 + WHEN column_name ILIKE '%in%' THEN 2 + END, distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'process%' +), +-- Transaction Date - TOP 1 +tran_date_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND (functional_data_type ILIKE 'transactional date%' + OR functional_data_type ILIKE 'period%' + OR functional_data_type = 'timestamp') +), +-- Numeric Measures +numeric_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, + -- Weighted score + ( + 0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) + + 0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) + + 0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1))) + ) AS change_detection_score + FROM latest_results + WHERE general_type = 'N' + AND ( + functional_data_type ILIKE 'Measure%' + OR functional_data_type IN ('Sequence', 'Constant') + ) +), +numeric_cols_ranked AS ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY change_detection_score DESC, column_name + ) AS rank + FROM numeric_cols + WHERE change_detection_score IS NOT NULL +), +combined AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order + FROM id_cols + WHERE rank <= 2 + UNION ALL + SELECT profile_run_id, schema_name, table_name, 
column_name, + 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order + FROM process_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order + FROM tran_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order + FROM numeric_cols_ranked + WHERE rank = 1 +), +selected_tables AS ( + SELECT profile_run_id, schema_name, table_name, + STRING_AGG(column_name, ',' ORDER BY element_type, fingerprint_order, column_name) AS column_names, + 'TO_CHAR(COUNT(*)) || ''|'' || ' || + STRING_AGG( + REPLACE( + CASE + WHEN general_type = 'D' THEN 'TO_CHAR(MIN(@@@)) || ''|'' || TO_CHAR(MAX(@@@)) || ''|'' || TO_CHAR(COUNT(DISTINCT @@@))' + WHEN general_type = 'A' THEN 'TO_CHAR(MIN(@@@)) || ''|'' || TO_CHAR(MAX(@@@)) || ''|'' || TO_CHAR(COUNT(DISTINCT @@@)) || ''|'' || TO_CHAR(SUM(LENGTH(@@@)))' + WHEN general_type = 'N' THEN 'TO_CHAR(COUNT(@@@)) || ''|'' || + TO_CHAR(COUNT(DISTINCT MOD(CAST(CAST(COALESCE(@@@,0) AS NUMBER(38,6)) * 1000000 AS NUMBER(38,0)), 1000003))) || ''|'' || + COALESCE(TO_CHAR(CAST(MIN(@@@) AS NUMBER(38,6))), '''') || ''|'' || + COALESCE(TO_CHAR(CAST(MAX(@@@) AS NUMBER(38,6))), '''') || ''|'' || + COALESCE(TO_CHAR(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) * 1000000 AS NUMBER), 1000000007)), 0), 1000000007)), '''') || ''|'' || + COALESCE(TO_CHAR(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) * 1000000 AS NUMBER), 1000000009)), 0), 1000000009)), '''')' + END, + '@@@', '"' || column_name || '"' + ), + ' || ''|'' || ' + ORDER BY element_type, fingerprint_order, column_name + ) AS fingerprint + FROM combined + GROUP BY profile_run_id, schema_name, table_name +) +-- Insert tests for selected tables +INSERT INTO test_definitions ( + table_groups_id, test_suite_id, test_type, + schema_name, table_name, groupby_names, + 
test_active, last_auto_gen_date, profiling_as_of_date, profile_run_id, + history_calculation, history_lookback, custom_query +) +SELECT + :TABLE_GROUPS_ID ::UUID AS table_groups_id, + :TEST_SUITE_ID ::UUID AS test_suite_id, + 'Freshness_Trend' AS test_type, + s.schema_name, + s.table_name, + s.column_names AS groupby_names, + 'Y' AS test_active, + :RUN_DATE ::TIMESTAMP AS last_auto_gen_date, + :AS_OF_DATE ::TIMESTAMP AS profiling_as_of_date, + s.profile_run_id, + 'PREDICT' AS history_calculation, + NULL AS history_lookback, + s.fingerprint AS custom_query +FROM selected_tables s + -- Only insert if test type is active +WHERE EXISTS (SELECT 1 FROM test_types WHERE test_type = 'Freshness_Trend' AND active = 'Y') + -- Only insert if test type is included in generation set + AND EXISTS (SELECT 1 FROM generation_sets WHERE test_type = 'Freshness_Trend' AND generation_set = :GENERATION_SET) + +-- Match "uix_td_autogen_table" unique index exactly +ON CONFLICT (test_suite_id, test_type, schema_name, table_name) +WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NOT NULL + AND column_name IS NULL + +-- Update tests if they already exist +DO UPDATE SET + groupby_names = EXCLUDED.groupby_names, + test_active = EXCLUDED.test_active, + last_auto_gen_date = EXCLUDED.last_auto_gen_date, + profiling_as_of_date = EXCLUDED.profiling_as_of_date, + profile_run_id = EXCLUDED.profile_run_id, + history_calculation = EXCLUDED.history_calculation, + history_lookback = EXCLUDED.history_lookback, + custom_query = EXCLUDED.custom_query +-- Ignore locked tests +WHERE test_definitions.lock_refresh = 'N' + -- Don't update existing tests in "insert" mode + AND NOT COALESCE(:INSERT_ONLY, FALSE); diff --git a/testgen/template/flavors/oracle/gen_query_tests/gen_Table_Freshness.sql b/testgen/template/flavors/oracle/gen_query_tests/gen_Table_Freshness.sql new file mode 100644 index 00000000..29690379 --- /dev/null +++ b/testgen/template/flavors/oracle/gen_query_tests/gen_Table_Freshness.sql @@ 
-0,0 +1,181 @@ +WITH latest_run AS ( + -- Latest complete profiling run before as-of-date + SELECT MAX(run_date) AS last_run_date + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID + AND run_date::DATE <= :AS_OF_DATE ::DATE +), +latest_results AS ( + -- Column results for latest run + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, + distinct_value_ct, record_ct, null_value_ct, + max_value, min_value, avg_value, stdev_value + FROM profile_results p + INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID +), +-- IDs - TOP 2 +id_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1 + WHEN functional_data_type = 'ID-Secondary' THEN 2 + ELSE 3 + END, distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'ID%' +), +-- Process Date - TOP 1 +process_date_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN column_name ILIKE '%mod%' THEN 1 + WHEN column_name ILIKE '%up%' THEN 1 + WHEN column_name ILIKE '%cr%' THEN 2 + WHEN column_name ILIKE '%in%' THEN 2 + END, distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'process%' +), +-- Transaction Date - TOP 1 +tran_date_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY distinct_value_ct DESC, column_name + ) AS 
rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND (functional_data_type ILIKE 'transactional date%' + OR functional_data_type ILIKE 'period%' + OR functional_data_type = 'timestamp') +), +-- Numeric Measures +numeric_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, + -- Weighted score + ( + 0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) + + 0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) + + 0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1))) + ) AS change_detection_score + FROM latest_results + WHERE general_type = 'N' + AND ( + functional_data_type ILIKE 'Measure%' + OR functional_data_type IN ('Sequence', 'Constant') + ) +), +numeric_cols_ranked AS ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY change_detection_score DESC, column_name + ) AS rank + FROM numeric_cols + WHERE change_detection_score IS NOT NULL +), +combined AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order + FROM id_cols + WHERE rank <= 2 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order + FROM process_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order + FROM tran_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order + FROM numeric_cols_ranked + WHERE rank = 1 +), +selected_tables AS ( + SELECT profile_run_id, schema_name, table_name, + 'TO_CHAR(COUNT(*)) 
|| ''|'' || ' || + STRING_AGG( + REPLACE( + CASE + WHEN general_type = 'D' THEN 'TO_CHAR(MIN(@@@)) || ''|'' || TO_CHAR(MAX(@@@)) || ''|'' || TO_CHAR(COUNT(DISTINCT @@@))' + WHEN general_type = 'A' THEN 'TO_CHAR(MIN(@@@)) || ''|'' || TO_CHAR(MAX(@@@)) || ''|'' || TO_CHAR(COUNT(DISTINCT @@@)) || ''|'' || TO_CHAR(SUM(LENGTH(@@@)))' + WHEN general_type = 'N' THEN 'TO_CHAR(COUNT(@@@)) || ''|'' || + TO_CHAR(COUNT(DISTINCT MOD(CAST(CAST(COALESCE(@@@,0) AS NUMBER(38,6)) * 1000000 AS NUMBER(38,0)), 1000003))) || ''|'' || + COALESCE(TO_CHAR(CAST(MIN(@@@) AS NUMBER(38,6))), '''') || ''|'' || + COALESCE(TO_CHAR(CAST(MAX(@@@) AS NUMBER(38,6))), '''') || ''|'' || + COALESCE(TO_CHAR(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) * 1000000 AS NUMBER), 1000000007)), 0), 1000000007)), '''') || ''|'' || + COALESCE(TO_CHAR(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) * 1000000 AS NUMBER), 1000000009)), 0), 1000000009)), '''')' + END, + '@@@', '"' || column_name || '"' + ), + ' || ''|'' || ' + ORDER BY element_type, fingerprint_order, column_name + ) AS fingerprint + FROM combined + GROUP BY profile_run_id, schema_name, table_name +) +-- Insert tests for selected tables +INSERT INTO test_definitions ( + table_groups_id, test_suite_id, test_type, + schema_name, table_name, + test_active, last_auto_gen_date, profiling_as_of_date, profile_run_id, + history_calculation, history_lookback, custom_query +) +SELECT + :TABLE_GROUPS_ID ::UUID AS table_groups_id, + :TEST_SUITE_ID ::UUID AS test_suite_id, + 'Table_Freshness' AS test_type, + s.schema_name, + s.table_name, + 'Y' AS test_active, + :RUN_DATE ::TIMESTAMP AS last_auto_gen_date, + :AS_OF_DATE ::TIMESTAMP AS profiling_as_of_date, + s.profile_run_id, + 'Value' AS history_calculation, + 1 AS history_lookback, + s.fingerprint AS custom_query +FROM selected_tables s + -- Only insert if test type is active +WHERE EXISTS (SELECT 1 FROM test_types WHERE test_type = 'Table_Freshness' AND active = 'Y') + -- Only insert if test type is included 
in generation set + AND EXISTS (SELECT 1 FROM generation_sets WHERE test_type = 'Table_Freshness' AND generation_set = :GENERATION_SET) + +-- Match "uix_td_autogen_table" unique index exactly +ON CONFLICT (test_suite_id, test_type, schema_name, table_name) +WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NOT NULL + AND column_name IS NULL + +-- Update tests if they already exist +DO UPDATE SET + test_active = EXCLUDED.test_active, + last_auto_gen_date = EXCLUDED.last_auto_gen_date, + profiling_as_of_date = EXCLUDED.profiling_as_of_date, + profile_run_id = EXCLUDED.profile_run_id, + history_calculation = EXCLUDED.history_calculation, + history_lookback = EXCLUDED.history_lookback, + custom_query = EXCLUDED.custom_query +-- Ignore locked tests +WHERE test_definitions.lock_refresh = 'N'; diff --git a/testgen/template/flavors/oracle/profiling/project_profiling_query.sql b/testgen/template/flavors/oracle/profiling/project_profiling_query.sql new file mode 100644 index 00000000..a7240103 --- /dev/null +++ b/testgen/template/flavors/oracle/profiling/project_profiling_query.sql @@ -0,0 +1,367 @@ +SELECT + main.connection_id, + main.project_code, + main.table_groups_id, + main.schema_name, + main.run_date, + main.table_name, + main.position, + main.column_name, + main.column_type, + main.db_data_type, + main.general_type, + main.record_ct, + main.value_ct, + main.distinct_value_ct, + main.null_value_ct, + main.min_length, + main.max_length, + main.avg_length, + main.zero_value_ct, + main.distinct_std_value_ct, + main.zero_length_ct, + main.lead_space_ct, + main.quoted_value_ct, + main.includes_digit_ct, + main.filled_value_ct, + main.min_text, + main.max_text, + main.upper_case_ct, + main.lower_case_ct, + main.non_alpha_ct, + main.non_printing_ct, + main.numeric_ct, + main.date_ct, + main.std_pattern_match, +-- TG-IF is_type_A + patterns.top_patterns, +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF + main.min_value, + main.min_value_over_0, + main.max_value, + 
main.avg_value, + main.stdev_value, + main.percentile_25, + main.percentile_50, + main.percentile_75, + main.fractional_sum, + main.min_date, + main.max_date, + main.before_1yr_date_ct, + main.before_5yr_date_ct, + main.before_20yr_date_ct, + main.before_100yr_date_ct, + main.within_1yr_date_ct, + main.within_1mo_date_ct, + main.future_date_ct, + main.distant_future_date_ct, + main.date_days_present, + main.date_weeks_present, + main.date_months_present, + main.boolean_true_ct, +-- TG-IF is_type_A + patterns.distinct_pattern_ct, +-- TG-ELSE + NULL AS distinct_pattern_ct, +-- TG-ENDIF + main.embedded_space_ct, + main.avg_embedded_spaces, + main.profile_run_id +FROM ( + SELECT + {CONNECTION_ID} AS connection_id, + '{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, + '{DATA_SCHEMA}' AS schema_name, + '{RUN_DATE}' AS run_date, + '{DATA_TABLE}' AS table_name, + {COL_POS} AS position, + '{COL_NAME_SANITIZED}' AS column_name, + '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, + '{COL_GEN_TYPE}' AS general_type, + COUNT(*) AS record_ct, +-- TG-IF is_type_X + COUNT(CASE WHEN "{COL_NAME}" IS NOT NULL THEN 1 END) AS value_ct, + NULL AS distinct_value_ct, +-- TG-ELSE + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, +-- TG-ENDIF + SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, +-- TG-IF is_type_ADN + MIN(LENGTH(TO_CHAR("{COL_NAME}"))) AS min_length, + MAX(LENGTH(TO_CHAR("{COL_NAME}"))) AS max_length, + AVG(NULLIF(LENGTH(TO_CHAR("{COL_NAME}")), 0)) AS avg_length, +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + SUM(CASE + WHEN REGEXP_LIKE(TRIM("{COL_NAME}"), '^0(\.0*)?$') THEN 1 ELSE 0 + END) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_N + SUM(1 - ABS(SIGN("{COL_NAME}"))) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(DISTINCT 
UPPER(TRANSLATE("{COL_NAME}", 'X '',.-', 'X'))) AS distinct_std_value_ct, + 0 AS zero_length_ct, + SUM(CASE + WHEN "{COL_NAME}" BETWEEN ' !' AND '!' THEN 1 + ELSE 0 + END) AS lead_space_ct, + SUM(CASE WHEN "{COL_NAME}" LIKE '"%"' OR "{COL_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) AS quoted_value_ct, + SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '[0-9]') THEN 1 ELSE 0 END) AS includes_digit_ct, + SUM(CASE + WHEN REGEXP_LIKE(LOWER("{COL_NAME}"), '^(\.{1,}|-{1,}|\?{1,}|[[:space:]]{1,}|0{2,}|9{2,}|x{2,}|z{2,})$') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', + 'n/a','#na','none','null','unknown') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', + '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', + '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 + ELSE 0 + END) AS filled_value_ct, + SUBSTR(MIN(CASE WHEN "{COL_NAME}" IS NOT NULL THEN "{COL_NAME}" END), 1, 100) AS min_text, + SUBSTR(MAX(CASE WHEN "{COL_NAME}" IS NOT NULL THEN "{COL_NAME}" END), 1, 100) AS max_text, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'abcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS upper_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 0 + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', ' ') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS lower_case_ct, + SUM(CASE + WHEN TRANSLATE("{COL_NAME}", 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', ' ') = "{COL_NAME}" THEN 1 + ELSE 0 + END) AS non_alpha_ct, + COUNT(CASE WHEN TRANSLATE("{COL_NAME}", + 'X' || UNISTR('\00A0') || UNISTR('\2009') || UNISTR('\200B') || UNISTR('\200C') || UNISTR('\200D') || UNISTR('\200E') || UNISTR('\200F') || UNISTR('\202F') || UNISTR('\3000') || UNISTR('\FEFF'), + 
'XXXXXXXXXXX') <> "{COL_NAME}" THEN 1 END) AS non_printing_ct, + SUM(<%IS_NUM;SUBSTR("{COL_NAME}", 1, 31)%>) AS numeric_ct, + SUM(<%IS_DATE;SUBSTR("{COL_NAME}", 1, 26)%>) AS date_ct, + CASE + WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^[0-9]{1,5}[a-zA-Z]?[[:space:]][[:alnum:]_]{1,5}\.?[[:space:]]?[[:alnum:]_]*[[:space:]]?[[:alnum:]_]*[[:space:]][a-zA-Z]{1,6}\.?[[:space:]]?[0-9]{0,5}[A-Z]?$') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'STREET_ADDR' + WHEN SUM(CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'STATE_USA' + WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^(\+1|1)?[ .-]?(\([2-9][0-9]{2}\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'PHONE_USA' + WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'EMAIL' + WHEN SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'ZIP_USA' + WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^[[:alnum:]_[:space:]-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'FILE_NAME' + WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^([0-9]{4}[- ]){3}[0-9]{4}$') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'CREDIT_CARD' + WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^([^,|' || CHR(9) || ']{1,20}[,|' || CHR(9) || ']){2,}[^,|' || CHR(9) || ']{0,20}([,|' || CHR(9) || ']?[^,|' || CHR(9) || ']{0,20})*$') + 
AND NOT REGEXP_LIKE("{COL_NAME}", '[[:space:]](and|but|or|yet)[[:space:]]') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'DELIMITED_DATA' + WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$') + AND SUBSTR("{COL_NAME}", 1, 3) NOT BETWEEN '734' AND '749' + AND SUBSTR("{COL_NAME}", 1, 3) <> '666' THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'SSN' + END AS std_pattern_match, +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, + NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, +-- TG-ENDIF +-- TG-IF is_type_N + MIN("{COL_NAME}") AS min_value, + MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MAX("{COL_NAME}") AS max_value, + AVG(CAST("{COL_NAME}" AS NUMBER)) AS avg_value, + STDDEV(CAST("{COL_NAME}" AS NUMBER)) AS stdev_value, + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +-- TG-ELSE + NULL AS min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) AS fractional_sum, +-- TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D + CASE + WHEN MIN("{COL_NAME}") IS NULL THEN NULL + ELSE GREATEST(MIN("{COL_NAME}"), TO_DATE('0001-01-01', 'YYYY-MM-DD')) + END AS min_date, + MAX("{COL_NAME}") AS max_date, + SUM(CASE + WHEN <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > 12 THEN 1 + ELSE 0 + END) AS before_1yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD 
HH24:MI:SS')%> > 60 THEN 1 + ELSE 0 + END) AS before_5yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > 240 THEN 1 + ELSE 0 + END) AS before_20yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > 1200 THEN 1 + ELSE 0 + END) AS before_100yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_DAY;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> BETWEEN 0 AND 365 THEN 1 + ELSE 0 + END) AS within_1yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_DAY;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> BETWEEN 0 AND 30 THEN 1 + ELSE 0 + END) AS within_1mo_date_ct, + SUM(CASE + WHEN "{COL_NAME}" > TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 + END) AS future_date_ct, + SUM(CASE + WHEN <%DATEDIFF_MONTH;TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS');"{COL_NAME}"%> > 240 THEN 1 + ELSE 0 + END) AS distant_future_date_ct, + COUNT(DISTINCT <%DATEDIFF_DAY;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) AS date_days_present, + COUNT(DISTINCT <%DATEDIFF_WEEK;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) AS date_weeks_present, + COUNT(DISTINCT <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) AS date_months_present, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, + NULL AS before_100yr_date_ct, + NULL AS within_1yr_date_ct, + NULL AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST("{COL_NAME}" AS NUMBER)) AS boolean_true_ct, +-- TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_A_sampling + SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REPLACE(TRIM("{COL_NAME}"), ' ', '')))) AS embedded_space_ct, + 
AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REPLACE(TRIM("{COL_NAME}"), ' ', ''))) AS avg_embedded_spaces, +-- TG-ENDIF +-- TG-IF is_A_no_sampling + SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REPLACE(TRIM("{COL_NAME}"), ' ', '')))) AS embedded_space_ct, + AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REPLACE(TRIM("{COL_NAME}"), ' ', ''))) AS avg_embedded_spaces, +-- TG-ENDIF +-- TG-IF is_not_A + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id +-- TG-IF do_sample + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_PERCENT_CALC}) +-- TG-ELSE + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-ENDIF +-- TG-IF is_N_sampling + , (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_PERCENT_CALC}) WHERE ROWNUM <= 1000000) pctile +-- TG-ENDIF +-- TG-IF is_N_no_sampling + , (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" WHERE ROWNUM <= 1000000) pctile +-- TG-ENDIF +) main +-- TG-IF is_A_sampling +CROSS JOIN ( + SELECT + (SELECT SUBSTR(LISTAGG(formatted_pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) + FROM ( + SELECT TO_CHAR(COUNT(*)) || ' | ' || pattern AS formatted_pattern, + COUNT(*) AS ct + FROM (SELECT REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') AS pattern + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_PERCENT_CALC}) + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_PERCENT_CALC})) BETWEEN 3 and 
{MAX_PATTERN_LENGTH}) p + GROUP BY pattern + HAVING pattern > ' ' + ORDER BY COUNT(*) DESC + FETCH FIRST 5 ROWS ONLY + ) ps) AS top_patterns, + (SELECT COUNT(DISTINCT REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') + ) + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" SAMPLE ({SAMPLE_PERCENT_CALC}) + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ') AS distinct_pattern_ct + FROM DUAL +) patterns +-- TG-ENDIF +-- TG-IF is_A_no_sampling +CROSS JOIN ( + SELECT + (SELECT SUBSTR(LISTAGG(formatted_pattern, ' | ') WITHIN GROUP (ORDER BY ct DESC), 1, 1000) + FROM ( + SELECT TO_CHAR(COUNT(*)) || ' | ' || pattern AS formatted_pattern, + COUNT(*) AS ct + FROM (SELECT REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') AS pattern + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) + FROM "{DATA_SCHEMA}"."{DATA_TABLE}") BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + GROUP BY pattern + HAVING pattern > ' ' + ORDER BY COUNT(*) DESC + FETCH FIRST 5 ROWS ONLY + ) ps) AS top_patterns, + (SELECT COUNT(DISTINCT REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE( + "{COL_NAME}", '[a-z]', 'a'), + '[A-Z]', 'A'), + '[0-9]', 'N') + ) + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ') AS distinct_pattern_ct + FROM DUAL +) patterns +-- TG-ENDIF diff --git a/testgen/template/flavors/oracle/profiling/project_secondary_profiling_query.sql b/testgen/template/flavors/oracle/profiling/project_secondary_profiling_query.sql new file mode 100644 index 00000000..4e67b07d --- /dev/null +++ b/testgen/template/flavors/oracle/profiling/project_secondary_profiling_query.sql @@ -0,0 +1,41 @@ +-- Get Freqs for selected columns +WITH ranked_vals AS ( + SELECT "{COL_NAME}", + COUNT(*) AS ct, + ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn + FROM 
"{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + SAMPLE ({SAMPLE_PERCENT_CALC}) +-- TG-ENDIF + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ' + GROUP BY "{COL_NAME}" +), +consol_vals AS ( + SELECT COALESCE(CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || TO_CHAR(ct) + ELSE NULL + END, '| Other Values (' || TO_CHAR(COUNT(DISTINCT "{COL_NAME}")) || ') | ' || TO_CHAR(SUM(ct))) AS val, + MIN(rn) as min_rn + FROM ranked_vals + GROUP BY CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || TO_CHAR(ct) + ELSE NULL + END +), +hash_val AS ( + SELECT RAWTOHEX(STANDARD_HASH(LISTAGG("{COL_NAME}", '|') WITHIN GROUP (ORDER BY "{COL_NAME}"), 'MD5')) as hash_result + FROM (SELECT DISTINCT "{COL_NAME}" + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + SAMPLE ({SAMPLE_PERCENT_CALC}) +-- TG-ENDIF + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ') +) +SELECT '{PROJECT_CODE}' as project_code, + '{DATA_SCHEMA}' as schema_name, + '{RUN_DATE}' as run_date, + '{DATA_TABLE}' as table_name, + '{COL_NAME}' as column_name, + REPLACE(LISTAGG(val, '^#^') WITHIN GROUP (ORDER BY min_rn), '^#^', CHR(10)) AS top_freq_values, + MAX(h.hash_result) as distinct_value_hash + FROM consol_vals + CROSS JOIN hash_val h + GROUP BY h.hash_result diff --git a/testgen/template/flavors/oracle/profiling/templated_functions.yaml b/testgen/template/flavors/oracle/profiling/templated_functions.yaml new file mode 100644 index 00000000..c2febea1 --- /dev/null +++ b/testgen/template/flavors/oracle/profiling/templated_functions.yaml @@ -0,0 +1,108 @@ +DATEDIFF_DAY: TRUNC({$2}) - TRUNC({$1}) + +DATEDIFF_WEEK: (TRUNC({$2}, 'IW') - TRUNC({$1}, 'IW')) / 7 + +DATEDIFF_MONTH: FLOOR(MONTHS_BETWEEN(TRUNC({$2}, 'MM'), TRUNC({$1}, 'MM'))) + +DATEDIFF_QUARTER: FLOOR(MONTHS_BETWEEN(TRUNC({$2}, 'MM'), TRUNC({$1}, 'MM')) / 3) + +DATEDIFF_YEAR: EXTRACT(YEAR FROM {$2}) - EXTRACT(YEAR FROM {$1}) + +IS_NUM: CASE + WHEN REGEXP_LIKE({$1}, 
'^[[:space:]]*[+-]?\$?[[:space:]]*[0-9]+(,[0-9]{3})*(\.[0-9]*)?[%]?[[:space:]]*$') THEN 1 + ELSE 0 + END + +IS_DATE: CASE + /* YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS */ + WHEN REGEXP_LIKE({$1}, '^([0-9]{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])[[:space:]](2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])([[:space:]][0-9]{6})?$') + THEN CASE + WHEN TO_NUMBER(SUBSTR({$1}, 1, 4)) BETWEEN 1800 AND 2200 + AND ( + ( SUBSTR({$1}, 6, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND TO_NUMBER(SUBSTR({$1}, 9, 2)) BETWEEN 1 AND 31 ) + OR ( SUBSTR({$1}, 6, 2) IN ('04', '06', '09') + AND TO_NUMBER(SUBSTR({$1}, 9, 2)) BETWEEN 1 AND 30 ) + OR ( SUBSTR({$1}, 6, 2) = '02' + AND TO_NUMBER(SUBSTR({$1}, 9, 2)) BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* YYYYMMDDHHMMSSSSSS or YYYYMMDDHHMM */ + WHEN REGEXP_LIKE({$1}, '^([0-9]{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$') + OR REGEXP_LIKE({$1}, '^([0-9]{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$') + THEN CASE + WHEN TO_NUMBER(SUBSTR({$1}, 1, 4)) BETWEEN 1800 AND 2200 + AND ( + ( SUBSTR({$1}, 5, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND TO_NUMBER(SUBSTR({$1}, 7, 2)) BETWEEN 1 AND 31 ) + OR ( SUBSTR({$1}, 5, 2) IN ('04', '06', '09') + AND TO_NUMBER(SUBSTR({$1}, 7, 2)) BETWEEN 1 AND 30 ) + OR ( SUBSTR({$1}, 5, 2) = '02' + AND TO_NUMBER(SUBSTR({$1}, 7, 2)) BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* Exclude anything else long */ + WHEN LENGTH({$1}) > 11 THEN 0 + /* YYYY-MMM/MM-DD */ + WHEN REGEXP_LIKE(REGEXP_REPLACE(UPPER({$1}), '(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', '12'), + '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]') + THEN CASE + WHEN TO_NUMBER(REGEXP_SUBSTR({$1}, '^[^-]+')) BETWEEN 1800 AND 2200 + AND ( + ( UPPER(REGEXP_SUBSTR({$1}, '[^-]+', 1, 2)) IN ('01', '03', '05', '07', '08', + '1', '3', '5', '7', '8', '10', '12', + 'JAN', 'MAR', 'MAY', 'JUL', 'AUG', + 'OCT', 'DEC') + AND TO_NUMBER(REGEXP_SUBSTR({$1}, 
'[^-]+$')) BETWEEN 1 AND 31 ) + OR ( UPPER(REGEXP_SUBSTR({$1}, '[^-]+', 1, 2)) IN ('04', '06', '09', '4', '6', '9', '11', + 'APR', 'JUN', 'SEP', 'NOV') + AND TO_NUMBER(REGEXP_SUBSTR({$1}, '[^-]+$')) BETWEEN 1 AND 30 ) + OR ( UPPER(REGEXP_SUBSTR({$1}, '[^-]+', 1, 2)) IN ('02', '2', 'FEB') + AND TO_NUMBER(REGEXP_SUBSTR({$1}, '[^-]+$')) BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* MM/-DD/-YY/YYYY */ + WHEN REGEXP_LIKE(REPLACE({$1}, '-', '/'), '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$') + OR REGEXP_LIKE(REPLACE({$1}, '-', '/'), '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$') + THEN + CASE + WHEN TO_NUMBER(REGEXP_SUBSTR(REPLACE({$1}, '-', '/'), '[^/]+', 1, 1)) BETWEEN 1 AND 12 + AND ( + ( TO_NUMBER(REGEXP_SUBSTR(REPLACE({$1}, '-', '/'), '[^/]+', 1, 1)) IN (1, 3, 5, 7, 8, 10, 12) + AND TO_NUMBER(REGEXP_SUBSTR(REPLACE({$1}, '-', '/'), '[^/]+', 1, 2)) BETWEEN 1 AND 31 ) + OR ( TO_NUMBER(REGEXP_SUBSTR(REPLACE({$1}, '-', '/'), '[^/]+', 1, 1)) IN (4, 6, 9, 11) + AND TO_NUMBER(REGEXP_SUBSTR(REPLACE({$1}, '-', '/'), '[^/]+', 1, 2)) BETWEEN 1 AND 30 ) + OR ( TO_NUMBER(REGEXP_SUBSTR(REPLACE({$1}, '-', '/'), '[^/]+', 1, 1)) = 2 + AND TO_NUMBER(REGEXP_SUBSTR(REPLACE({$1}, '-', '/'), '[^/]+', 1, 2)) BETWEEN 1 AND 29) + ) + AND + TO_NUMBER('20' || SUBSTR(REGEXP_SUBSTR(REPLACE({$1}, '-', '/'), '[^/]+$'), -2)) BETWEEN 1800 AND 2200 + THEN 1 + ELSE 0 + END + /* DD-MMM-YYYY */ + WHEN REGEXP_LIKE(UPPER({$1}), '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]') + THEN + CASE + WHEN TO_NUMBER(REGEXP_SUBSTR({$1}, '[^-]+$')) BETWEEN 1800 AND 2200 + AND ( + ( UPPER(REGEXP_SUBSTR({$1}, '[^-]+', 1, 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC') + AND TO_NUMBER(REGEXP_SUBSTR({$1}, '^[^-]+')) BETWEEN 1 AND 31 ) + OR ( UPPER(REGEXP_SUBSTR({$1}, '[^-]+', 1, 2)) IN ('APR', 'JUN', 'SEP', 'NOV') + AND TO_NUMBER(REGEXP_SUBSTR({$1}, '^[^-]+')) BETWEEN 1 AND 30 ) + OR ( UPPER(REGEXP_SUBSTR({$1}, '[^-]+', 1, 2)) = 'FEB' + AND TO_NUMBER(REGEXP_SUBSTR({$1}, 
'^[^-]+')) BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + ELSE 0 + END diff --git a/testgen/template/flavors/oracle/validate_tests/get_target_identifiers.sql b/testgen/template/flavors/oracle/validate_tests/get_target_identifiers.sql new file mode 100644 index 00000000..2685239c --- /dev/null +++ b/testgen/template/flavors/oracle/validate_tests/get_target_identifiers.sql @@ -0,0 +1,5 @@ +SELECT owner AS schema_name, + table_name, + column_name +FROM all_tab_columns +WHERE owner IN ({TEST_SCHEMAS}) diff --git a/testgen/ui/assets/flavors/oracle.svg b/testgen/ui/assets/flavors/oracle.svg new file mode 100644 index 00000000..eef50c67 --- /dev/null +++ b/testgen/ui/assets/flavors/oracle.svg @@ -0,0 +1,58 @@ + + + + + +Oracle + + +Oracle diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index 0c0b3cfa..1ccf7867 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -83,6 +83,7 @@ const defaultPorts = { postgresql: '5432', snowflake: '443', databricks: '443', + oracle: '1521', }; /** @@ -234,6 +235,17 @@ const ConnectionForm = (props, saveButton) => { connection, dynamicConnectionUrl, ), + oracle: () => OracleForm( + updatedConnection, + getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), + (formValue, isValid) => { + updatedConnection.val = {...updatedConnection.val, ...formValue}; + setFieldValidity('oracle_form', isValid); + }, + connection, + dynamicConnectionUrl, + { dbNameLabel: 'Service Name' }, + ), bigquery: () => BigqueryForm( updatedConnection, getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), @@ -380,6 +392,7 @@ const ConnectionForm = (props, saveButton) => { * @param {(params: Partial, isValid: boolean) => void} onChange * @param {Connection?} originalConnection * @param {VanState} dynamicConnectionUrl + * @param {{dbNameLabel: 
string}?} options * @returns {HTMLElement} */ const RedshiftForm = ( @@ -388,6 +401,7 @@ const RedshiftForm = ( onChange, originalConnection, dynamicConnectionUrl, + options, ) => { const isValid = van.state(true); const connectByUrl = van.state(connection.rawVal.connect_by_url ?? false); @@ -479,7 +493,7 @@ const RedshiftForm = ( ), Input({ name: 'db_name', - label: 'Database', + label: options?.dbNameLabel || 'Database', value: connectionDatabase, disabled: connectByUrl, onChange: (value, state) => { @@ -552,6 +566,8 @@ const RedshiftSpectrumForm = RedshiftForm; const PostgresqlForm = RedshiftForm; +const OracleForm = RedshiftForm; + const AzureMSSQLForm = ( connection, flavor, diff --git a/testgen/ui/services/database_service.py b/testgen/ui/services/database_service.py index cf5c7280..d9e8b6ec 100644 --- a/testgen/ui/services/database_service.py +++ b/testgen/ui/services/database_service.py @@ -63,6 +63,8 @@ def fetch_from_target_db(connection: Connection, query: str, params: dict | None **flavor_service.get_engine_args(), ) - with engine.connect() as connection: - cursor: CursorResult = connection.execute(text(query), params) + with engine.connect() as conn: + for pre_query, pre_params in flavor_service.get_pre_connection_queries(): + conn.execute(text(pre_query), pre_params) + cursor: CursorResult = conn.execute(text(query), params) return cursor.fetchall() diff --git a/testgen/ui/static/js/components/connection_form.js b/testgen/ui/static/js/components/connection_form.js index 0c0b3cfa..1ccf7867 100644 --- a/testgen/ui/static/js/components/connection_form.js +++ b/testgen/ui/static/js/components/connection_form.js @@ -83,6 +83,7 @@ const defaultPorts = { postgresql: '5432', snowflake: '443', databricks: '443', + oracle: '1521', }; /** @@ -234,6 +235,17 @@ const ConnectionForm = (props, saveButton) => { connection, dynamicConnectionUrl, ), + oracle: () => OracleForm( + updatedConnection, + getValue(props.flavors).find(f => f.value === 
connectionFlavor.rawVal), + (formValue, isValid) => { + updatedConnection.val = {...updatedConnection.val, ...formValue}; + setFieldValidity('oracle_form', isValid); + }, + connection, + dynamicConnectionUrl, + { dbNameLabel: 'Service Name' }, + ), bigquery: () => BigqueryForm( updatedConnection, getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), @@ -380,6 +392,7 @@ const ConnectionForm = (props, saveButton) => { * @param {(params: Partial, isValid: boolean) => void} onChange * @param {Connection?} originalConnection * @param {VanState} dynamicConnectionUrl + * @param {{dbNameLabel: string}?} options * @returns {HTMLElement} */ const RedshiftForm = ( @@ -388,6 +401,7 @@ const RedshiftForm = ( onChange, originalConnection, dynamicConnectionUrl, + options, ) => { const isValid = van.state(true); const connectByUrl = van.state(connection.rawVal.connect_by_url ?? false); @@ -479,7 +493,7 @@ const RedshiftForm = ( ), Input({ name: 'db_name', - label: 'Database', + label: options?.dbNameLabel || 'Database', value: connectionDatabase, disabled: connectByUrl, onChange: (value, state) => { @@ -552,6 +566,8 @@ const RedshiftSpectrumForm = RedshiftForm; const PostgresqlForm = RedshiftForm; +const OracleForm = RedshiftForm; + const AzureMSSQLForm = ( connection, flavor, diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index 0c69e992..670cac38 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -238,8 +238,8 @@ def _format_connection(self, connection: Connection, should_test: bool = False) def test_connection(self, connection: Connection) -> "ConnectionStatus": empty_cache() try: - sql_query = "select 1;" - results = db.fetch_from_target_db(connection, sql_query) + flavor_service = get_flavor_service(connection.sql_flavor) + results = db.fetch_from_target_db(connection, flavor_service.test_query) connection_successful = len(results) == 1 and results[0][0] == 1 if not connection_successful: @@ 
-601,6 +601,12 @@ class ConnectionFlavor: flavor="mssql", icon=get_asset_data_url("flavors/mssql.svg"), ), + ConnectionFlavor( + label="Oracle", + value="oracle", + flavor="oracle", + icon=get_asset_data_url("flavors/oracle.svg"), + ), ConnectionFlavor( label="PostgreSQL", value="postgresql", diff --git a/testgen/ui/views/dialogs/data_preview_dialog.py b/testgen/ui/views/dialogs/data_preview_dialog.py index 8a65b006..c72b2223 100644 --- a/testgen/ui/views/dialogs/data_preview_dialog.py +++ b/testgen/ui/views/dialogs/data_preview_dialog.py @@ -47,14 +47,15 @@ def get_preview_data( if connection: flavor_service = get_flavor_service(connection.sql_flavor) - use_top = flavor_service.use_top + row_limiting = flavor_service.row_limiting_clause quote = flavor_service.quote_character query = f""" SELECT DISTINCT - {"TOP 100" if use_top else ""} + {"TOP 100" if row_limiting == "top" else ""} {f"{quote}{column_name}{quote}" if column_name else "*"} FROM {quote}{schema_name}{quote}.{quote}{table_name}{quote} - {"LIMIT 100" if not use_top else ""} + {"LIMIT 100" if row_limiting == "limit" else ""} + {"FETCH FIRST 100 ROWS ONLY" if row_limiting == "fetch" else ""} """ try: From 0e6be256f0c79ade4310d535ad1a4a1795843c4f Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 9 Feb 2026 18:35:17 -0500 Subject: [PATCH 37/95] fix(sql server): make Dupe Rows test case sensitive --- .../template/dbsetup_test_types/test_types_Dupe_Rows.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml index 16065d9e..0c3b50e4 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml @@ -76,7 +76,7 @@ test_types: SELECT TOP {LIMIT} {GROUPBY_NAMES}, COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} + GROUP BY 
{GROUPBY_NAMES}, BINARY_CHECKSUM({GROUPBY_NAMES}) HAVING COUNT(*) > 1 ORDER BY {GROUPBY_NAMES} error_type: Test Results @@ -262,7 +262,8 @@ test_types: FROM ( SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} + -- Checksum is added because string column grouping is case insensitive + GROUP BY {GROUPBY_NAMES}, BINARY_CHECKSUM({GROUPBY_NAMES}) HAVING COUNT(*) > 1 ) test; - id: '2311' From 911f81aab1e1e9d63dcac8039b9231a0b7bb43dd Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 9 Feb 2026 20:03:38 -0500 Subject: [PATCH 38/95] fix: discrepancies in Weekly Record Count test --- .../test_types_Weekly_Rec_Ct.yaml | 14 +++++++------- .../bigquery/profiling/project_profiling_query.sql | 2 +- .../databricks/profiling/templated_functions.yaml | 2 +- .../redshift/profiling/project_profiling_query.sql | 2 +- .../redshift/profiling/templated_functions.yaml | 1 + .../profiling/project_profiling_query.sql | 2 +- .../profiling/templated_functions.yaml | 1 + .../profiling/project_profiling_query.sql | 2 +- .../snowflake/profiling/templated_functions.yaml | 1 + 9 files changed, 15 insertions(+), 12 deletions(-) diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml index 62eb1975..063189b8 100644 --- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml @@ -41,7 +41,7 @@ test_types: test_type: Weekly_Rec_Ct sql_flavor: bigquery measure: |- - DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), WEEK), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), WEEK), WEEK) + 1 - COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, WEEK)) + DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), ISOWEEK), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), ISOWEEK), ISOWEEK) + 1 - 
COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, ISOWEEK)) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -73,7 +73,7 @@ test_types: test_type: Weekly_Rec_Ct sql_flavor: redshift measure: |- - MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) + MAX(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -81,7 +81,7 @@ test_types: test_type: Weekly_Rec_Ct sql_flavor: redshift_spectrum measure: |- - MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) + MAX(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -89,7 +89,7 @@ test_types: test_type: Weekly_Rec_Ct sql_flavor: snowflake measure: |- - MAX(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) - MIN(DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME}))+1 - COUNT(DISTINCT DATEDIFF(week, '1800-01-01'::DATE, {COLUMN_NAME})) + MAX(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;'1800-01-01'::DATE;{COLUMN_NAME}%>) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -120,14 +120,14 @@ test_types: SELECT week_start AS all_dates FROM UNNEST( GENERATE_DATE_ARRAY( - DATE_TRUNC((SELECT MIN(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), WEEK), - DATE_TRUNC((SELECT MAX(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), WEEK), + DATE_TRUNC((SELECT 
MIN(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), ISOWEEK), + DATE_TRUNC((SELECT MAX(CAST(`{COLUMN_NAME}` AS DATE)) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), ISOWEEK), INTERVAL 7 DAY ) ) AS week_start ), existing_periods AS ( - SELECT DISTINCT DATE_TRUNC(CAST(`{COLUMN_NAME}` AS DATE), WEEK) AS period, COUNT(1) AS period_count + SELECT DISTINCT DATE_TRUNC(CAST(`{COLUMN_NAME}` AS DATE), ISOWEEK) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY period ), diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql b/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql index ece95159..b77044a6 100644 --- a/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql @@ -199,7 +199,7 @@ SELECT COUNT(CASE WHEN SAFE_CAST(DATE(`{COL_NAME}`) AS DATE) > SAFE_CAST(DATE('{RUN_DATE}') AS DATE) THEN 1 END) AS future_date_ct, COUNT(CASE WHEN DATE_DIFF(SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), SAFE_CAST(DATE('{RUN_DATE}') AS DATE), MONTH) > 240 THEN 1 END) AS distant_future_date_ct, COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), DAY)) AS date_days_present, - COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), WEEK)) AS date_weeks_present, + COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), ISOWEEK)) AS date_weeks_present, COUNT(DISTINCT DATE_DIFF(SAFE_CAST(DATE('{RUN_DATE}') AS DATE), SAFE_CAST(DATE(`{COL_NAME}`) AS DATE), MONTH)) AS date_months_present, -- TG-ELSE NULL AS min_date, diff --git a/testgen/template/flavors/databricks/profiling/templated_functions.yaml b/testgen/template/flavors/databricks/profiling/templated_functions.yaml index a7706e26..24cd7fd5 100644 --- 
a/testgen/template/flavors/databricks/profiling/templated_functions.yaml +++ b/testgen/template/flavors/databricks/profiling/templated_functions.yaml @@ -18,6 +18,6 @@ IS_DATE: CASE DATEDIFF_MONTH: (YEAR({$2}) * 12 + MONTH({$2}) - YEAR({$1}) * 12 - MONTH({$1})) -DATEDIFF_WEEK: CAST(DATEDIFF(DATE_TRUNC('week', {$2} + INTERVAL 1 DAY), DATE_TRUNC('week', {$1} + INTERVAL 1 DAY)) / 7 AS INT) +DATEDIFF_WEEK: CAST(DATEDIFF(DATE_TRUNC('week', {$2}), DATE_TRUNC('week', {$1})) / 7 AS INT) DATEDIFF_DAY: EXTRACT(DAY FROM DATE({$2}) - DATE({$1})) diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query.sql b/testgen/template/flavors/redshift/profiling/project_profiling_query.sql index db97da0b..4c48dd5d 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query.sql @@ -157,7 +157,7 @@ SELECT COUNT(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, COUNT(CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}"::DATE) > 240 THEN 1 END) AS distant_future_date_ct, COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_days_present, - COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_weeks_present, + COUNT(DISTINCT <%DATEDIFF_WEEK;"{COL_NAME}";'{RUN_DATE}'%>) AS date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_months_present, -- TG-ELSE NULL AS min_date, diff --git a/testgen/template/flavors/redshift/profiling/templated_functions.yaml b/testgen/template/flavors/redshift/profiling/templated_functions.yaml index 4953e254..dffaa4f8 100644 --- a/testgen/template/flavors/redshift/profiling/templated_functions.yaml +++ b/testgen/template/flavors/redshift/profiling/templated_functions.yaml @@ -99,3 +99,4 @@ IS_DATE: CASE ELSE 0 END +DATEDIFF_WEEK: (DATE_TRUNC('week', {$2}::DATE)::DATE - DATE_TRUNC('week', {$1}::DATE)::DATE) / 7 diff --git 
a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql index db97da0b..4c48dd5d 100644 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql @@ -157,7 +157,7 @@ SELECT COUNT(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, COUNT(CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}"::DATE) > 240 THEN 1 END) AS distant_future_date_ct, COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_days_present, - COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_weeks_present, + COUNT(DISTINCT <%DATEDIFF_WEEK;"{COL_NAME}";'{RUN_DATE}'%>) AS date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}"::DATE, '{RUN_DATE}' ) ) AS date_months_present, -- TG-ELSE NULL AS min_date, diff --git a/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml b/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml index 4953e254..dffaa4f8 100644 --- a/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml +++ b/testgen/template/flavors/redshift_spectrum/profiling/templated_functions.yaml @@ -99,3 +99,4 @@ IS_DATE: CASE ELSE 0 END +DATEDIFF_WEEK: (DATE_TRUNC('week', {$2}::DATE)::DATE - DATE_TRUNC('week', {$1}::DATE)::DATE) / 7 diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql b/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql index 56e4ae1f..6874d062 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql @@ -164,7 +164,7 @@ SELECT COUNT(CASE WHEN "{COL_NAME}" > '{RUN_DATE}' THEN 1 END) AS future_date_ct, COUNT(CASE WHEN DATEDIFF('MON', '{RUN_DATE}', "{COL_NAME}") > 
240 THEN 1 END) AS distant_future_date_ct, COUNT(DISTINCT DATEDIFF(day, "{COL_NAME}", '{RUN_DATE}')) AS date_days_present, - COUNT(DISTINCT DATEDIFF(week, "{COL_NAME}", '{RUN_DATE}')) AS date_weeks_present, + COUNT(DISTINCT <%DATEDIFF_WEEK;"{COL_NAME}";'{RUN_DATE}'%>) AS date_weeks_present, COUNT(DISTINCT DATEDIFF(month, "{COL_NAME}", '{RUN_DATE}')) AS date_months_present, -- TG-ELSE NULL AS min_date, diff --git a/testgen/template/flavors/snowflake/profiling/templated_functions.yaml b/testgen/template/flavors/snowflake/profiling/templated_functions.yaml index 1afbdea3..c9316784 100644 --- a/testgen/template/flavors/snowflake/profiling/templated_functions.yaml +++ b/testgen/template/flavors/snowflake/profiling/templated_functions.yaml @@ -53,3 +53,4 @@ IS_DATE: CASE ELSE 0 END +DATEDIFF_WEEK: (DATEADD('day', 1 - DAYOFWEEKISO({$2}::DATE), {$2}::DATE)::DATE - DATEADD('day', 1 - DAYOFWEEKISO({$1}::DATE), {$1}::DATE)::DATE) / 7 From 4b993b5dd46d16581440860d6fc170f2d0e6de21 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 13 Feb 2026 18:15:14 -0500 Subject: [PATCH 39/95] feat: add support for SAP HANA --- pyproject.toml | 2 + .../common/database/flavor/flavor_service.py | 2 +- .../flavor/sap_hana_flavor_service.py | 22 ++ testgen/common/models/connection.py | 2 +- ..._anomaly_types_Boolean_Value_Mismatch.yaml | 8 + ...anomaly_types_Char_Column_Date_Values.yaml | 8 + ...omaly_types_Char_Column_Number_Values.yaml | 8 + ...anomaly_types_Column_Pattern_Mismatch.yaml | 8 + ...anomaly_types_Delimited_Data_Embedded.yaml | 8 + ...ile_anomaly_types_Inconsistent_Casing.yaml | 8 + ...rofile_anomaly_types_Invalid_Zip3_USA.yaml | 8 + ...profile_anomaly_types_Invalid_Zip_USA.yaml | 8 + .../profile_anomaly_types_Leading_Spaces.yaml | 8 + ...le_anomaly_types_Multiple_Types_Major.yaml | 8 + ...le_anomaly_types_Multiple_Types_Minor.yaml | 8 + .../profile_anomaly_types_No_Values.yaml | 8 + ..._anomaly_types_Non_Alpha_Name_Address.yaml | 8 + 
...anomaly_types_Non_Alpha_Prefixed_Name.yaml | 8 + ...file_anomaly_types_Non_Printing_Chars.yaml | 8 + ...ile_anomaly_types_Non_Standard_Blanks.yaml | 8 + ...le_anomaly_types_Potential_Duplicates.yaml | 8 + .../profile_anomaly_types_Potential_PII.yaml | 8 + .../profile_anomaly_types_Quoted_Values.yaml | 8 + ...rofile_anomaly_types_Recency_One_Year.yaml | 8 + ...file_anomaly_types_Recency_Six_Months.yaml | 8 + ...nomaly_types_Small_Divergent_Value_Ct.yaml | 8 + ..._anomaly_types_Small_Missing_Value_Ct.yaml | 8 + ..._anomaly_types_Small_Numeric_Value_Ct.yaml | 8 + ...maly_types_Standardized_Value_Matches.yaml | 8 + .../profile_anomaly_types_Suggested_Type.yaml | 8 + ..._anomaly_types_Table_Pattern_Mismatch.yaml | 8 + ...ofile_anomaly_types_Unexpected_Emails.yaml | 8 + ...le_anomaly_types_Unexpected_US_States.yaml | 8 + ...le_anomaly_types_Unlikely_Date_Values.yaml | 8 + ...le_anomaly_types_Variant_Coded_Values.yaml | 8 + .../test_types_Aggregate_Balance.yaml | 69 +++++ .../test_types_Aggregate_Balance_Percent.yaml | 71 +++++ .../test_types_Aggregate_Balance_Range.yaml | 71 +++++ .../test_types_Aggregate_Minimum.yaml | 69 +++++ .../test_types_Alpha_Trunc.yaml | 16 ++ .../test_types_Avg_Shift.yaml | 16 ++ .../dbsetup_test_types/test_types_CUSTOM.yaml | 34 +++ .../test_types_Combo_Match.yaml | 61 +++++ .../test_types_Condition_Flag.yaml | 16 ++ .../test_types_Constant.yaml | 16 ++ .../test_types_Daily_Record_Ct.yaml | 16 ++ .../test_types_Dec_Trunc.yaml | 16 ++ .../test_types_Distinct_Date_Ct.yaml | 16 ++ .../test_types_Distinct_Value_Ct.yaml | 16 ++ .../test_types_Distribution_Shift.yaml | 78 ++++++ .../test_types_Dupe_Rows.yaml | 47 ++++ .../test_types_Email_Format.yaml | 16 ++ .../test_types_Freshness_Trend.yaml | 85 ++++-- .../test_types_Future_Date.yaml | 16 ++ .../test_types_Future_Date_1Y.yaml | 16 ++ .../test_types_Incr_Avg_Shift.yaml | 16 ++ .../test_types_LOV_All.yaml | 16 ++ .../test_types_LOV_Match.yaml | 16 ++ .../test_types_Metric_Trend.yaml | 19 ++ 
.../test_types_Min_Date.yaml | 16 ++ .../test_types_Min_Val.yaml | 16 ++ .../test_types_Missing_Pct.yaml | 16 ++ .../test_types_Monthly_Rec_Ct.yaml | 16 ++ .../test_types_Outlier_Pct_Above.yaml | 16 ++ .../test_types_Outlier_Pct_Below.yaml | 16 ++ .../test_types_Pattern_Match.yaml | 16 ++ .../test_types_Recency.yaml | 16 ++ .../test_types_Required.yaml | 16 ++ .../dbsetup_test_types/test_types_Row_Ct.yaml | 16 ++ .../test_types_Row_Ct_Pct.yaml | 16 ++ .../test_types_Schema_Drift.yaml | 55 ++++ .../test_types_Street_Addr_Pattern.yaml | 16 ++ .../test_types_Table_Freshness.yaml | 32 +++ .../test_types_Timeframe_Combo_Gain.yaml | 61 +++++ .../test_types_Timeframe_Combo_Match.yaml | 88 ++++++ .../test_types_US_State.yaml | 16 ++ .../dbsetup_test_types/test_types_Unique.yaml | 16 ++ .../test_types_Unique_Pct.yaml | 16 ++ .../test_types_Valid_Characters.yaml | 16 ++ .../test_types_Valid_Month.yaml | 8 + .../test_types_Valid_US_Zip.yaml | 16 ++ .../test_types_Valid_US_Zip3.yaml | 16 ++ .../test_types_Variability_Decrease.yaml | 16 ++ .../test_types_Variability_Increase.yaml | 16 ++ .../test_types_Volume_Trend.yaml | 19 ++ .../test_types_Weekly_Rec_Ct.yaml | 16 ++ .../sap_hana/data_chars/get_schema_ddf.sql | 41 +++ .../gen_query_tests/gen_Dupe_Rows.sql | 55 ++++ .../gen_query_tests/gen_Freshness_Trend.sql | 193 ++++++++++++++ .../gen_query_tests/gen_Table_Freshness.sql | 181 +++++++++++++ .../profiling/project_profiling_query.sql | 251 ++++++++++++++++++ .../project_secondary_profiling_query.sql | 36 +++ .../profiling/templated_functions.yaml | 108 ++++++++ .../validate_tests/get_target_identifiers.sql | 5 + testgen/ui/assets/flavors/sap_hana.svg | 66 +++++ .../frontend/js/components/connection_form.js | 11 + .../static/js/components/connection_form.js | 11 + testgen/ui/views/connections.py | 6 + 98 files changed, 2650 insertions(+), 17 deletions(-) create mode 100644 testgen/common/database/flavor/sap_hana_flavor_service.py create mode 100644 
testgen/template/flavors/sap_hana/data_chars/get_schema_ddf.sql create mode 100644 testgen/template/flavors/sap_hana/gen_query_tests/gen_Dupe_Rows.sql create mode 100644 testgen/template/flavors/sap_hana/gen_query_tests/gen_Freshness_Trend.sql create mode 100644 testgen/template/flavors/sap_hana/gen_query_tests/gen_Table_Freshness.sql create mode 100644 testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql create mode 100644 testgen/template/flavors/sap_hana/profiling/project_secondary_profiling_query.sql create mode 100644 testgen/template/flavors/sap_hana/profiling/templated_functions.yaml create mode 100644 testgen/template/flavors/sap_hana/validate_tests/get_target_identifiers.sql create mode 100644 testgen/ui/assets/flavors/sap_hana.svg diff --git a/pyproject.toml b/pyproject.toml index 39aba9f9..6033df77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,8 @@ dependencies = [ "snowflake-sqlalchemy==1.6.1", "sqlalchemy-bigquery==1.14.1", "oracledb==3.4.0", + "hdbcli==2.23.26", + "sqlalchemy-hana==2.1.0", "pyodbc==5.0.0", "psycopg2-binary==2.9.9", "pycryptodome==3.21", diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index af8e8d87..4b1625f1 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ b/testgen/common/database/flavor/flavor_service.py @@ -4,7 +4,7 @@ from testgen.common.encrypt import DecryptText -SQLFlavor = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "postgresql", "databricks", "bigquery", "oracle"] +SQLFlavor = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "postgresql", "databricks", "bigquery", "oracle", "sap_hana"] RowLimitingClause = Literal["limit", "top", "fetch"] diff --git a/testgen/common/database/flavor/sap_hana_flavor_service.py b/testgen/common/database/flavor/sap_hana_flavor_service.py new file mode 100644 index 00000000..2a855e51 --- /dev/null +++ 
b/testgen/common/database/flavor/sap_hana_flavor_service.py @@ -0,0 +1,22 @@ +from urllib.parse import quote_plus + +from testgen.common.database.flavor.flavor_service import FlavorService + + +class SapHanaFlavorService(FlavorService): + + varchar_type = "NVARCHAR(1000)" + default_uppercase = True + test_query = "SELECT 1 FROM DUMMY" + + def get_connection_string_head(self): + return f"hana+hdbcli://{self.username}:{quote_plus(self.password)}@" + + def get_connection_string_from_fields(self): + url = f"hana+hdbcli://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}/" + if self.dbname: + url += f"?databaseName={self.dbname}" + return url + + def get_connect_args(self) -> dict: + return {} diff --git a/testgen/common/models/connection.py b/testgen/common/models/connection.py index 9436cd43..3d138396 100644 --- a/testgen/common/models/connection.py +++ b/testgen/common/models/connection.py @@ -27,7 +27,7 @@ from testgen.common.models.table_group import TableGroup from testgen.utils import is_uuid4 -SQLFlavorCode = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "azure_mssql", "synapse_mssql", "postgresql", "databricks", "bigquery", "oracle"] +SQLFlavorCode = Literal["redshift", "redshift_spectrum", "snowflake", "mssql", "azure_mssql", "synapse_mssql", "postgresql", "databricks", "bigquery", "oracle", "sap_hana"] @dataclass diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml index 02ee6923..23ccefa0 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml @@ -91,3 +91,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST 
{LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1514' + test_id: '1015' + test_type: Boolean_Value_Mismatch + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml index cbbb4248..7e25517d 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Date_Values.yaml @@ -103,3 +103,11 @@ profile_anomaly_types: lookup_query: |- SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) B ORDER BY data_type, count DESC error_type: Profile Anomaly + - id: '1511' + test_id: '1012' + test_type: Char_Column_Date_Values + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Date' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_DATE;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT 
{LIMIT_2}) B ORDER BY data_type, count DESC + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml index fe3e71d2..d5d5ce14 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Char_Column_Number_Values.yaml @@ -103,3 +103,11 @@ profile_anomaly_types: lookup_query: |- SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) B ORDER BY data_type, count DESC error_type: Profile Anomaly + - id: '1510' + test_id: '1011' + test_type: Char_Column_Number_Values + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) B ORDER BY data_type, count DESC + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml 
index 4650458c..00e37271 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Column_Pattern_Mismatch.yaml @@ -132,3 +132,11 @@ profile_anomaly_types: lookup_query: |- SELECT A.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(REGEXP_SUBSTR('{DETAIL_EXPRESSION}', '[^|]+', 1, 4)) AS top_pattern FROM DUAL) b WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE("{COLUMN_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_4} ROWS ONLY) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(REGEXP_SUBSTR('{DETAIL_EXPRESSION}', '[^|]+', 1, 6)) AS top_pattern FROM DUAL) b WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE("{COLUMN_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_4} ROWS ONLY) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(REGEXP_SUBSTR('{DETAIL_EXPRESSION}', '[^|]+', 1, 8)) AS top_pattern FROM DUAL) b WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE("{COLUMN_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_4} ROWS ONLY) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(REGEXP_SUBSTR('{DETAIL_EXPRESSION}', '[^|]+', 1, 10)) AS top_pattern FROM DUAL) b WHERE REGEXP_REPLACE(REGEXP_REPLACE(REGEXP_REPLACE("{COLUMN_NAME}", '[a-z]', 'a'), '[A-Z]', 'A'), '[0-9]', 'N') = b.top_pattern GROUP BY 
b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_4} ROWS ONLY) D ORDER BY top_pattern DESC, count DESC error_type: Profile Anomaly + - id: '1506' + test_id: '1007' + test_type: Column_Pattern_Mismatch + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(SUBSTR_REGEXPR('[^|]+' IN '{DETAIL_EXPRESSION}' OCCURRENCE 4)) AS top_pattern FROM DUMMY) b WHERE REPLACE_REGEXPR('[0-9]' IN REPLACE_REGEXPR('[A-Z]' IN REPLACE_REGEXPR('[a-z]' IN "{COLUMN_NAME}" WITH 'a') WITH 'A') WITH 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(SUBSTR_REGEXPR('[^|]+' IN '{DETAIL_EXPRESSION}' OCCURRENCE 6)) AS top_pattern FROM DUMMY) b WHERE REPLACE_REGEXPR('[0-9]' IN REPLACE_REGEXPR('[A-Z]' IN REPLACE_REGEXPR('[a-z]' IN "{COLUMN_NAME}" WITH 'a') WITH 'A') WITH 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) B UNION ALL SELECT C.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(SUBSTR_REGEXPR('[^|]+' IN '{DETAIL_EXPRESSION}' OCCURRENCE 8)) AS top_pattern FROM DUMMY) b WHERE REPLACE_REGEXPR('[0-9]' IN REPLACE_REGEXPR('[A-Z]' IN REPLACE_REGEXPR('[a-z]' IN "{COLUMN_NAME}" WITH 'a') WITH 'A') WITH 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) C UNION ALL SELECT D.* FROM (SELECT DISTINCT b.top_pattern, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT TRIM(SUBSTR_REGEXPR('[^|]+' IN '{DETAIL_EXPRESSION}' OCCURRENCE 10)) AS top_pattern FROM DUMMY) b WHERE REPLACE_REGEXPR('[0-9]' IN REPLACE_REGEXPR('[A-Z]' IN REPLACE_REGEXPR('[a-z]' IN 
"{COLUMN_NAME}" WITH 'a') WITH 'A') WITH 'N') = b.top_pattern GROUP BY b.top_pattern, "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_4}) D ORDER BY top_pattern DESC, count DESC + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml index 18acb363..7728798b 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml @@ -86,3 +86,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}", '^([^,|' || CHR(9) || ']{1,20}[,|' || CHR(9) || ']){2,}[^,|' || CHR(9) || ']{0,20}([,|' || CHR(9) || ']{0,1}[^,|' || CHR(9) || ']{0,20})*$') AND NOT REGEXP_LIKE("{COLUMN_NAME}", '[[:space:]](and|but|or|yet)[[:space:]]') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1524' + test_id: '1025' + test_type: Delimited_Data_Embedded + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" LIKE_REGEXPR '^([^,|' || NCHAR(9) || ']{1,20}[,|' || NCHAR(9) || ']){2,}[^,|' || NCHAR(9) || ']{0,20}([,|' || NCHAR(9) || ']{0,1}[^,|' || NCHAR(9) || ']{0,20})*$' AND NOT "{COLUMN_NAME}" LIKE_REGEXPR '[[:space:]](and|but|or|yet)[[:space:]]' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml index 0062feef..c6f5e139 100644 --- 
a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Inconsistent_Casing.yaml @@ -132,3 +132,11 @@ profile_anomaly_types: lookup_query: |- SELECT * FROM (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT_2} ROWS ONLY) UNION ALL SELECT * FROM (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT_2} ROWS ONLY) error_type: Profile Anomaly + - id: '1526' + test_id: '1028' + test_type: Inconsistent_Casing + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * FROM (SELECT 'Upper Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") = "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) UNION ALL SELECT * FROM (SELECT 'Mixed Case' as casing, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" <> LOWER("{COLUMN_NAME}") GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT_2}) + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml index 1210ae17..8f8215c0 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip3_USA.yaml @@ -89,3 +89,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> 
'999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1523' + test_id: '1024' + test_type: Invalid_Zip3_USA + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REPLACE_REGEXPR('[0-9]' IN "{COLUMN_NAME}" WITH '9') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY count DESC, "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml index abdaa03b..a4aeaa62 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Invalid_Zip_USA.yaml @@ -85,3 +85,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1502' + test_id: '1003' + test_type: Invalid_Zip_USA + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REPLACE_REGEXPR('[0-9]' IN "{COLUMN_NAME}" WITH '9') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml index 8c95c91e..3f74cb98 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml +++ 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Leading_Spaces.yaml @@ -85,3 +85,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1508' + test_id: '1009' + test_type: Leading_Spaces + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" BETWEEN ' !' AND '!' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml index fb5b4679..ddb6b8d6 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Major.yaml @@ -99,3 +99,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT column_name, table_name, CASE WHEN data_type LIKE 'TIMESTAMP%%' THEN LOWER(data_type) WHEN data_type = 'DATE' THEN 'date' WHEN data_type = 'VARCHAR2' THEN 'varchar2(' || data_length || ')' WHEN data_type = 'CHAR' THEN 'char(' || data_length || ')' WHEN data_type = 'NUMBER' AND data_precision IS NULL THEN 'number' WHEN data_type = 'NUMBER' AND data_scale = 0 THEN 'number(' || data_precision || ')' WHEN data_type = 'NUMBER' THEN 'number(' || data_precision || ',' || data_scale || ')' ELSE data_type END AS data_type FROM all_tab_columns WHERE owner = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY data_type, table_name FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1504' + 
test_id: '1005' + test_type: Multiple_Types_Major + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT COLUMN_NAME, TABLE_NAME, CASE WHEN DATA_TYPE_NAME LIKE 'TIMESTAMP%%' THEN LOWER(DATA_TYPE_NAME) WHEN DATA_TYPE_NAME = 'DATE' THEN 'date' WHEN DATA_TYPE_NAME IN ('NVARCHAR', 'VARCHAR') THEN LOWER(DATA_TYPE_NAME) || '(' || LENGTH || ')' WHEN DATA_TYPE_NAME = 'CHAR' THEN 'char(' || LENGTH || ')' WHEN DATA_TYPE_NAME = 'DECIMAL' AND SCALE = 0 THEN 'decimal(' || LENGTH || ')' WHEN DATA_TYPE_NAME = 'DECIMAL' THEN 'decimal(' || LENGTH || ',' || SCALE || ')' WHEN DATA_TYPE_NAME IN ('INTEGER', 'BIGINT', 'SMALLINT', 'TINYINT') THEN LOWER(DATA_TYPE_NAME) ELSE LOWER(DATA_TYPE_NAME) END AS data_type FROM SYS.TABLE_COLUMNS WHERE SCHEMA_NAME = '{TARGET_SCHEMA}' AND COLUMN_NAME = '{COLUMN_NAME}' ORDER BY data_type, TABLE_NAME LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml index ce6cc7e2..17df28f3 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Multiple_Types_Minor.yaml @@ -99,3 +99,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT column_name, table_name, CASE WHEN data_type LIKE 'TIMESTAMP%%' THEN LOWER(data_type) WHEN data_type = 'DATE' THEN 'date' WHEN data_type = 'VARCHAR2' THEN 'varchar2(' || data_length || ')' WHEN data_type = 'CHAR' THEN 'char(' || data_length || ')' WHEN data_type = 'NUMBER' AND data_precision IS NULL THEN 'number' WHEN data_type = 'NUMBER' AND data_scale = 0 THEN 'number(' || data_precision || ')' WHEN data_type = 'NUMBER' THEN 'number(' || data_precision || ',' || data_scale || ')' ELSE data_type END AS data_type FROM all_tab_columns WHERE owner = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY 
data_type, table_name FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1503' + test_id: '1004' + test_type: Multiple_Types_Minor + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT COLUMN_NAME, TABLE_NAME, CASE WHEN DATA_TYPE_NAME LIKE 'TIMESTAMP%%' THEN LOWER(DATA_TYPE_NAME) WHEN DATA_TYPE_NAME = 'DATE' THEN 'date' WHEN DATA_TYPE_NAME IN ('NVARCHAR', 'VARCHAR') THEN LOWER(DATA_TYPE_NAME) || '(' || LENGTH || ')' WHEN DATA_TYPE_NAME = 'CHAR' THEN 'char(' || LENGTH || ')' WHEN DATA_TYPE_NAME = 'DECIMAL' AND SCALE = 0 THEN 'decimal(' || LENGTH || ')' WHEN DATA_TYPE_NAME = 'DECIMAL' THEN 'decimal(' || LENGTH || ',' || SCALE || ')' WHEN DATA_TYPE_NAME IN ('INTEGER', 'BIGINT', 'SMALLINT', 'TINYINT') THEN LOWER(DATA_TYPE_NAME) ELSE LOWER(DATA_TYPE_NAME) END AS data_type FROM SYS.TABLE_COLUMNS WHERE SCHEMA_NAME = '{TARGET_SCHEMA}' AND COLUMN_NAME = '{COLUMN_NAME}' ORDER BY data_type, TABLE_NAME LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml index bfcdb14a..0580df8c 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_No_Values.yaml @@ -87,3 +87,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1505' + test_id: '1006' + test_type: No_Values + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git 
a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml index b70fa38e..47297f76 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Name_Address.yaml @@ -99,3 +99,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1527' + test_id: '1029' + test_type: Non_Alpha_Name_Address + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" = UPPER("{COLUMN_NAME}") AND "{COLUMN_NAME}" = LOWER("{COLUMN_NAME}") AND "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml index 1fc026d5..f556a4bd 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml @@ -100,3 +100,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < 'A' AND SUBSTR("{COLUMN_NAME}", 1, 1) NOT IN ('"', ' ') AND SUBSTR("{COLUMN_NAME}", -1, 1) <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1528' + test_id: '1030' + 
test_type: Non_Alpha_Prefixed_Name + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < 'A' AND SUBSTR("{COLUMN_NAME}", 1, 1) NOT IN ('"', ' ') AND SUBSTR("{COLUMN_NAME}", -1, 1) <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml index 6ddaa705..c922d9d8 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml @@ -152,3 +152,11 @@ profile_anomaly_types: lookup_query: |- SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", UNISTR('\00A0'), '\x160'), UNISTR('\2009'), '\x8201'), UNISTR('\200B'), '\x8203'), UNISTR('\200C'), '\x8204'), UNISTR('\200D'), '\x8205'), UNISTR('\200E'), '\x8206'), UNISTR('\200F'), '\x8207'), UNISTR('\202F'), '\x8239'), UNISTR('\3000'), '\x12288'), UNISTR('\FEFF'), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", 'X' || UNISTR('\00A0') || UNISTR('\2009') || UNISTR('\200B') || UNISTR('\200C') || UNISTR('\200D') || UNISTR('\200E') || UNISTR('\200F') || UNISTR('\202F') || UNISTR('\3000') || UNISTR('\FEFF'), 'XXXXXXXXXXX') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1529' + test_id: '1031' + test_type: Non_Printing_Chars + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", NCHAR(160), '\x160'), NCHAR(8201), '\x8201'), NCHAR(8203), 
'\x8203'), NCHAR(8204), '\x8204'), NCHAR(8205), '\x8205'), NCHAR(8206), '\x8206'), NCHAR(8207), '\x8207'), NCHAR(8239), '\x8239'), NCHAR(12288), '\x12288'), NCHAR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", NCHAR(160), ''), NCHAR(8201), ''), NCHAR(8203), ''), NCHAR(8204), ''), NCHAR(8205), ''), NCHAR(8206), ''), NCHAR(8207), ''), NCHAR(8239), ''), NCHAR(12288), ''), NCHAR(65279), '') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml index c34fefd2..b68be96d 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Standard_Blanks.yaml @@ -98,3 +98,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN REGEXP_LIKE(LOWER("{COLUMN_NAME}"), '(-{2,}|0{2,}|9{2,}|x{2,}|z{2,})') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1501' + test_id: '1002' + test_type: Non_Standard_Blanks + sql_flavor: sap_hana + lookup_type: null + 
lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") LIKE_REGEXPR '(-{2,}|0{2,}|9{2,}|x{2,}|z{2,})' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml index 28c551c4..b135f21a 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_Duplicates.yaml @@ -87,3 +87,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1515' + test_id: '1016' + test_type: Potential_Duplicates + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml index 17b8a837..e5742a68 100644 --- 
a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Potential_PII.yaml @@ -85,3 +85,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1530' + test_id: '1100' + test_type: Potential_PII + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml index 7e62bcce..7c91fc79 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Quoted_Values.yaml @@ -86,3 +86,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%%"' OR "{COLUMN_NAME}" LIKE '''%%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1509' + test_id: '1010' + test_type: Quoted_Values + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" LIKE '"%%"' OR "{COLUMN_NAME}" LIKE '''%%''' THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml index 1509bc0a..7415a7bd 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml @@ -80,3 +80,11 @@ profile_anomaly_types: lookup_query: |- created_in_ui error_type: Profile Anomaly + - id: '1518' + test_id: '1019' + test_type: Recency_One_Year + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml index 35dd0dac..4a31eb12 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml @@ -80,3 +80,11 @@ profile_anomaly_types: lookup_query: |- created_in_ui error_type: Profile Anomaly + - id: '1519' + test_id: '1020' + test_type: Recency_Six_Months + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + created_in_ui + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml index 899661a6..39841b8e 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Divergent_Value_Ct.yaml @@ -78,3 +78,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1513' + test_id: '1014' + test_type: Small Divergent Value Ct + 
sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml index 2646aca6..5a0d5ac8 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Missing_Value_Ct.yaml @@ -81,3 +81,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN REGEXP_LIKE(LOWER("{COLUMN_NAME}"), '(-{2,}|0{2,}|9{2,}|x{2,}|z{2,})') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1512' + test_id: '1013' + test_type: Small Missing Value Ct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE (CASE WHEN "{COLUMN_NAME}" IN ('.', '?', ' ') THEN 1 WHEN LOWER("{COLUMN_NAME}") LIKE_REGEXPR '(-{2,}|0{2,}|9{2,}|x{2,}|z{2,})' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('blank','error','missing','tbd', 'n/a','#na','none','null','unknown') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN 
('(blank)','(error)','(missing)','(tbd)', '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml index 76a199b4..b205e34d 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Small_Numeric_Value_Ct.yaml @@ -100,3 +100,11 @@ profile_anomaly_types: lookup_query: |- SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC FETCH FIRST {LIMIT_2} ROWS ONLY) B ORDER BY data_type, count DESC error_type: Profile Anomaly + - id: '1522' + test_id: '1023' + test_type: Small_Numeric_Value_Ct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT A.* FROM (SELECT DISTINCT 'Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> = 1 GROUP BY "{COLUMN_NAME}" ORDER BY count DESC LIMIT {LIMIT_2}) A UNION ALL SELECT B.* FROM (SELECT DISTINCT 'Non-Numeric' as data_type, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE <%IS_NUM;"{COLUMN_NAME}"%> != 1 GROUP BY "{COLUMN_NAME}" ORDER BY count 
DESC LIMIT {LIMIT_2}) B ORDER BY data_type, count DESC + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml index 7abea8b3..7210d2b8 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml @@ -95,3 +95,11 @@ profile_anomaly_types: lookup_query: |- WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", 'X '',.-', 'X')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") AS cnt FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", 'X '',.-', 'X')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", 'X '',.-', 'X')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1516' + test_id: '1017' + test_type: Standardized_Value_Matches + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + WITH CTE AS ( SELECT DISTINCT UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") AS cnt FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(a."{COLUMN_NAME}", ' ', ''), '''', 
''), ',', ''), '.', ''), '-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml index 96aeb280..551391eb 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Suggested_Type.yaml @@ -86,3 +86,11 @@ profile_anomaly_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1500' + test_id: '1001' + test_type: Suggested_Type + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml index 60cdb242..0a917305 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Table_Pattern_Mismatch.yaml @@ -96,3 +96,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT column_name, table_name FROM all_tab_columns WHERE owner = '{TARGET_SCHEMA}' AND column_name = '{COLUMN_NAME}' ORDER BY table_name FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1507' + test_id: '1008' + test_type: Table_Pattern_Mismatch + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT COLUMN_NAME, TABLE_NAME FROM 
SYS.TABLE_COLUMNS WHERE SCHEMA_NAME = '{TARGET_SCHEMA}' AND COLUMN_NAME = '{COLUMN_NAME}' ORDER BY TABLE_NAME LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml index 20bf64e9..7dde6180 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml @@ -85,3 +85,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1521' + test_id: '1022' + test_type: Unexpected Emails + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml index b30b37d1..1cdcf0bf 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml @@ -87,3 +87,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1520' + test_id: '1021' + test_type: Unexpected US States + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml index 02abf507..2b30f4e2 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml @@ -89,3 +89,11 @@ profile_anomaly_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", TO_DATE('{PROFILE_RUN_DATE}', 'YYYY-MM-DD') AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < TO_DATE('1900-01-01', 'YYYY-MM-DD')) OR ("{COLUMN_NAME}" > ADD_MONTHS(TO_DATE('{PROFILE_RUN_DATE}', 'YYYY-MM-DD'), 360)) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1517' + test_id: '1018' + test_type: Unlikely_Date_Values + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", TO_DATE('{PROFILE_RUN_DATE}', 'YYYY-MM-DD') AS profile_run_date, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a WHERE ("{COLUMN_NAME}" < TO_DATE('1900-01-01', 'YYYY-MM-DD')) OR ("{COLUMN_NAME}" > ADD_MONTHS(TO_DATE('{PROFILE_RUN_DATE}', 'YYYY-MM-DD'), 360)) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml index 82c43fae..a5b8519f 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Variant_Coded_Values.yaml @@ -89,3 +89,11 @@ profile_anomaly_types: 
lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") IN (SELECT TRIM(REGEXP_SUBSTR(SUBSTR('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '[^|]+', 1, LEVEL)) FROM DUAL CONNECT BY LEVEL <= REGEXP_COUNT(SUBSTR('{DETAIL_EXPRESSION}', INSTR('{DETAIL_EXPRESSION}', ':') + 2), '[^|]+')) GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly + - id: '1525' + test_id: '1027' + test_type: Variant_Coded_Values + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + WITH val_list(token, remaining) AS ( SELECT CASE WHEN LOCATE(SUBSTR('{DETAIL_EXPRESSION}', LOCATE('{DETAIL_EXPRESSION}', ':') + 2), '|') > 0 THEN TRIM(SUBSTR(SUBSTR('{DETAIL_EXPRESSION}', LOCATE('{DETAIL_EXPRESSION}', ':') + 2), 1, LOCATE(SUBSTR('{DETAIL_EXPRESSION}', LOCATE('{DETAIL_EXPRESSION}', ':') + 2), '|') - 1)) ELSE TRIM(SUBSTR('{DETAIL_EXPRESSION}', LOCATE('{DETAIL_EXPRESSION}', ':') + 2)) END AS token, CASE WHEN LOCATE(SUBSTR('{DETAIL_EXPRESSION}', LOCATE('{DETAIL_EXPRESSION}', ':') + 2), '|') > 0 THEN SUBSTR(SUBSTR('{DETAIL_EXPRESSION}', LOCATE('{DETAIL_EXPRESSION}', ':') + 2), LOCATE(SUBSTR('{DETAIL_EXPRESSION}', LOCATE('{DETAIL_EXPRESSION}', ':') + 2), '|') + 1) ELSE '' END AS remaining FROM DUMMY UNION ALL SELECT CASE WHEN LOCATE(remaining, '|') > 0 THEN TRIM(SUBSTR(remaining, 1, LOCATE(remaining, '|') - 1)) ELSE TRIM(remaining) END AS token, CASE WHEN LOCATE(remaining, '|') > 0 THEN SUBSTR(remaining, LOCATE(remaining, '|') + 1) ELSE '' END AS remaining FROM val_list WHERE LENGTH(remaining) > 0 ) SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOWER("{COLUMN_NAME}") IN (SELECT token FROM val_list) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml index 10bc3ffc..83e0ec45 
100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance.yaml @@ -240,6 +240,31 @@ test_types: ORDER BY {GROUPBY_NAMES} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8500' + test_id: '1500' + test_type: Aggregate_Balance + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT} + error_type: Test Results test_templates: - id: '2506' test_type: Aggregate_Balance @@ -628,3 +653,47 @@ test_types: WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) + - id: '8006' + test_type: Aggregate_Balance + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + 
CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total + OR (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml index 80c9cd43..59b127bb 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Percent.yaml @@ -256,6 +256,33 @@ test_types: ORDER BY {GROUPBY_NAMES} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8504' + test_id: '1504' + test_type: Aggregate_Balance_Percent + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS 
match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT} + error_type: Test Results test_templates: - id: '2509' test_type: Aggregate_Balance_Percent @@ -644,3 +671,47 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) + - id: '8009' + test_type: Aggregate_Balance_Percent + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total * (1 + {LOWER_TOLERANCE}/100.0) AND match_total * (1 + {UPPER_TOLERANCE}/100.0)) diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml index 141ddf0b..c868d3cd 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Balance_Range.yaml @@ -256,6 +256,33 @@ test_types: ORDER BY {GROUPBY_NAMES} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8505' + test_id: '1505' + test_type: Aggregate_Balance_Range + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + 
{MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT} + error_type: Test Results test_templates: - id: '2510' test_type: Aggregate_Balance_Range @@ -644,3 +671,47 @@ test_types: WHERE (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) + - id: '8010' + test_type: Aggregate_Balance_Range + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL) + OR (total NOT BETWEEN match_total + {LOWER_TOLERANCE} AND match_total + {UPPER_TOLERANCE}) diff --git a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml index 92bac6db..49e1b39a 100644 --- a/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Aggregate_Minimum.yaml @@ -240,6 +240,31 @@ test_types: ORDER BY {GROUPBY_NAMES} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8501' + test_id: '1501' + test_type: Aggregate_Minimum + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY 
{GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT} + error_type: Test Results test_templates: - id: '2502' test_type: Aggregate_Minimum @@ -628,3 +653,47 @@ test_types: WHERE total < match_total -- OR (total IS NOT NULL AND match_total IS NULL) -- New categories OR (total IS NULL AND match_total IS NOT NULL) + - id: '8002' + test_type: Aggregate_Minimum + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total + -- OR (total IS NOT NULL AND match_total IS NULL) -- New categories + OR (total IS NULL AND match_total IS NOT NULL) diff --git a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml index 898b9305..23d43989 100644 --- a/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Alpha_Trunc.yaml @@ -108,6 +108,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8001' + test_type: Alpha_Trunc + sql_flavor: sap_hana + measure: |- + MAX(LENGTH({COLUMN_NAME})) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1364' test_id: '1004' @@ -180,4 +188,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8001' + test_id: '1004' + test_type: Alpha_Trunc + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT 
DISTINCT "{COLUMN_NAME}", LENGTH("{COLUMN_NAME}") as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}", (SELECT MAX(LENGTH("{COLUMN_NAME}")) as max_length FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE LENGTH("{COLUMN_NAME}") = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml index 42f26e3f..a224d3b6 100644 --- a/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Avg_Shift.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>=' test_condition: |- {THRESHOLD_VALUE} + - id: '8002' + test_type: Avg_Shift + sql_flavor: sap_hana + measure: |- + ABS( (AVG(CAST({COLUMN_NAME} AS DECIMAL)) - {BASELINE_AVG}) / SQRT(((COUNT({COLUMN_NAME})-1)*POWER(STDDEV({COLUMN_NAME}),2) + ({BASELINE_VALUE_CT}-1) * POWER({BASELINE_SD},2)) /NULLIF(COUNT({COLUMN_NAME}) + {BASELINE_VALUE_CT}, 0) )) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1365' test_id: '1005' @@ -175,4 +183,12 @@ test_types: lookup_query: |- SELECT AVG(CAST("{COLUMN_NAME}" AS NUMBER)) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" error_type: Test Results + - id: '8002' + test_id: '1005' + test_type: Avg_Shift + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT AVG(CAST("{COLUMN_NAME}" AS DECIMAL)) AS current_average FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml index 940a5f01..4122df48 100644 --- a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml +++ b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml @@ -347,3 +347,37 @@ test_types: FROM ( 
{CUSTOM_QUERY} ) TEST + - id: '8004' + test_type: CUSTOM + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + CASE + WHEN '{COLUMN_NAME_NO_QUOTES}' IS NULL THEN NULL + ELSE '{COLUMN_NAME_NO_QUOTES}' + END as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + /* TODO: 'custom_query= {CUSTOM_QUERY_ESCAPED}' as input_parameters, */ + 'Skip_Errors={SKIP_ERRORS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( + {CUSTOM_QUERY} + ) TEST diff --git a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml index 39dbdc70..18bdde5d 100644 --- a/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Combo_Match.yaml @@ -214,6 +214,28 @@ test_types: ORDER BY {COLUMN_NAME_NO_QUOTES} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8502' + test_id: '1502' + test_type: Combo_Match + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) 
test + ORDER BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT} + error_type: Test Results test_templates: - id: '2501' test_type: Combo_Match @@ -564,3 +586,42 @@ test_types: GROUP BY {MATCH_GROUPBY_NAMES} {MATCH_HAVING_CONDITION} ) test + - id: '8001' + test_type: Combo_Match + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{MATCH_TABLE_NAME}{QUOTE} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml index ae1f4725..110b2226 100644 --- a/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Condition_Flag.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8003' + test_type: Condition_Flag + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN {CUSTOM_QUERY} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1366' test_id: '1006' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8006' + test_id: '1006' + test_type: Condition_Flag + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE {CUSTOM_QUERY} LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Constant.yaml b/testgen/template/dbsetup_test_types/test_types_Constant.yaml index c3800a9e..7141bcfa 100644 --- a/testgen/template/dbsetup_test_types/test_types_Constant.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Constant.yaml @@ -108,6 +108,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8004' + test_type: Constant + sql_flavor: sap_hana + 
measure: |- + SUM(CASE WHEN {COLUMN_NAME} <> {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1367' test_id: '1007' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8004' + test_id: '1007' + test_type: Constant + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" <> {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml index 26833cd5..c5fe688b 100644 --- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -112,6 +112,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8005' + test_type: Daily_Record_Ct + sql_flavor: sap_hana + measure: |- + <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT CAST({COLUMN_NAME} AS DATE)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1368' test_id: '1009' @@ -248,4 +256,12 @@ test_types: lookup_query: |- WITH daterange AS (SELECT (SELECT MIN(TRUNC("{COLUMN_NAME}")) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") + LEVEL - 1 AS all_dates FROM DUAL CONNECT BY LEVEL <= (SELECT MAX(TRUNC("{COLUMN_NAME}")) - MIN(TRUNC("{COLUMN_NAME}")) + 1 FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")), existing_periods AS (SELECT DISTINCT TRUNC("{COLUMN_NAME}") AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY TRUNC("{COLUMN_NAME}")) SELECT 
p.missing_period, p.prior_available_date, e.period_count AS prior_available_date_count, p.next_available_date, f.period_count AS next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8009' + test_id: '1009' + test_type: Daily_Record_Ct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + WITH Pass0 AS (SELECT 1 C FROM DUMMY UNION ALL SELECT 1 FROM DUMMY), Pass1 AS (SELECT 1 C FROM Pass0 A, Pass0 B), Pass2 AS (SELECT 1 C FROM Pass1 A, Pass1 B), Pass3 AS (SELECT 1 C FROM Pass2 A, Pass2 B), Pass4 AS (SELECT 1 C FROM Pass3 A, Pass3 B), nums AS (SELECT ROW_NUMBER() OVER (ORDER BY C) - 1 AS rn FROM Pass4), bounds AS (SELECT MIN(CAST("{COLUMN_NAME}" AS DATE)) AS min_date, MAX(CAST("{COLUMN_NAME}" AS DATE)) AS max_date FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), daterange AS (SELECT ADD_DAYS(b.min_date, n.rn) AS all_dates FROM bounds b, nums n WHERE ADD_DAYS(b.min_date, n.rn) <= b.max_date), existing_periods AS (SELECT DISTINCT CAST("{COLUMN_NAME}" AS DATE) AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY CAST("{COLUMN_NAME}" AS DATE)) SELECT p.missing_period, p.prior_available_date, e.period_count AS prior_available_date_count, p.next_available_date, f.period_count AS next_available_date_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_date, MIN(c.period) AS next_available_date FROM daterange d LEFT JOIN 
existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_date = e.period) LEFT JOIN existing_periods f ON (p.next_available_date = f.period) ORDER BY p.missing_period LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml index 9aedf7dc..7b40daa7 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml @@ -109,6 +109,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8006' + test_type: Dec_Trunc + sql_flavor: sap_hana + measure: |- + SUM(ROUND(ABS(MOD({COLUMN_NAME}, 1)), 5))+1 + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1369' test_id: '1011' @@ -182,4 +190,12 @@ test_types: lookup_query: |- SELECT DISTINCT LENGTH(SUBSTR(TO_CHAR("{COLUMN_NAME}"), INSTR(TO_CHAR("{COLUMN_NAME}"), '.') + 1)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE INSTR(TO_CHAR("{COLUMN_NAME}"), '.') > 0 GROUP BY LENGTH(SUBSTR(TO_CHAR("{COLUMN_NAME}"), INSTR(TO_CHAR("{COLUMN_NAME}"), '.') + 1)) FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8006' + test_id: '1011' + test_type: Dec_Trunc + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT LENGTH(SUBSTR(TO_VARCHAR("{COLUMN_NAME}"), LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') + 1)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') > 0 GROUP BY LENGTH(SUBSTR(TO_VARCHAR("{COLUMN_NAME}"), LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') + 1)) LIMIT {LIMIT} + error_type: Test 
Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml index 7ae9b335..1a9d8c82 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Date_Ct.yaml @@ -109,6 +109,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8007' + test_type: Distinct_Date_Ct + sql_flavor: sap_hana + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1370' test_id: '1012' @@ -179,4 +187,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8012' + test_id: '1012' + test_type: Distinct_Date_Ct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml index 501d9f17..ea1195ec 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distinct_Value_Ct.yaml @@ -108,6 +108,14 @@ test_types: test_operator: <> test_condition: |- {THRESHOLD_VALUE} + - id: '8008' + test_type: Distinct_Value_Ct + sql_flavor: sap_hana + measure: |- + COUNT(DISTINCT {COLUMN_NAME}) + test_operator: <> + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1371' test_id: '1013' @@ -178,4 +186,12 @@ test_types: 
lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8008' + test_id: '1013' + test_type: Distinct_Value_Ct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml index 221563c4..d02569c6 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml @@ -244,6 +244,33 @@ test_types: ORDER BY COALESCE(l.category, o.category) FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8503' + test_id: '1503' + test_type: Distribution_Shift + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + CAST(COUNT(*) AS DECIMAL) / CAST(SUM(COUNT(*)) OVER () AS DECIMAL) AS pct_of_total + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + CAST(COUNT(*) AS DECIMAL) / CAST(SUM(COUNT(*)) OVER () AS DECIMAL) AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ) + SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) + ORDER BY COALESCE(l.category, o.category) + LIMIT {LIMIT} + error_type: Test 
Results test_templates: - id: '2503' test_type: Distribution_Shift @@ -668,3 +695,54 @@ test_types: SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) as js_divergence FROM dataset ) rslt + - id: '8003' + test_type: Distribution_Shift + sql_flavor: sap_hana + template: |- + -- Relative Entropy: measured by Jensen-Shannon Divergence + -- Smoothed and normalized version of KL divergence, + -- with scores between 0 (identical) and 1 (maximally different), + -- when using the base-2 logarithm. Formula is: + -- 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m) + -- Log base 2 of x = LN(x)/LN(2) + WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + CAST(COUNT(*) AS DECIMAL) / CAST(SUM(COUNT(*)) OVER () AS DECIMAL) AS pct_of_total + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), + older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + CAST(COUNT(*) AS DECIMAL) / CAST(SUM(COUNT(*)) OVER () AS DECIMAL) AS pct_of_total + FROM {QUOTE}{MATCH_SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ), + dataset + AS ( SELECT COALESCE(l.category, o.category) AS category, + COALESCE(o.pct_of_total, 0.0000001) AS old_pct, + COALESCE(l.pct_of_total, 0.0000001) AS new_pct, + (COALESCE(o.pct_of_total, 0.0000001) + + COALESCE(l.pct_of_total, 0.0000001))/2.0 AS avg_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) ) + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + -- '{GROUPBY_NAMES}' as column_names, + '{THRESHOLD_VALUE}' as threshold_value, + NULL as skip_errors, + '{INPUT_PARAMETERS}' as 
input_parameters, + NULL as result_signal, + CASE WHEN js_divergence > {THRESHOLD_VALUE} THEN 0 ELSE 1 END as result_code, + 'Divergence Level: ' || CAST(js_divergence AS {VARCHAR_TYPE}) || ', Threshold: {THRESHOLD_VALUE}.' as result_message, + js_divergence as result_measure + FROM ( + SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) as js_divergence + FROM dataset ) rslt diff --git a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml index 0c3b50e4..138abb10 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dupe_Rows.yaml @@ -150,6 +150,20 @@ test_types: ORDER BY {GROUPBY_NAMES} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8510' + test_id: '1510' + test_type: Dupe_Rows + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ORDER BY {GROUPBY_NAMES} + LIMIT {LIMIT} + error_type: Test Results test_templates: - id: '2511' test_type: Dupe_Rows @@ -451,3 +465,36 @@ test_types: GROUP BY {GROUPBY_NAMES} HAVING COUNT(*) > 1 ) test + - id: '8011' + test_type: Dupe_Rows + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS {VARCHAR_TYPE}) 
|| ' duplicate row(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COALESCE(SUM(record_ct), 0) as result_measure + FROM ( SELECT {GROUPBY_NAMES}, COUNT(*) as record_ct + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + HAVING COUNT(*) > 1 + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml index 64e9611e..7b51af47 100644 --- a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml @@ -108,6 +108,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8009' + test_type: Email_Format + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN NOT {COLUMN_NAME} LIKE_REGEXPR '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1372' test_id: '1014' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE("{COLUMN_NAME}", '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8009' + test_id: '1014' + test_type: Email_Format + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT "{COLUMN_NAME}" LIKE_REGEXPR '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml 
b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml index fbf0307b..3297b585 100644 --- a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml @@ -415,23 +415,78 @@ test_types: fingerprint AS result_measure, CASE -- Training mode: tolerances not yet calculated - WHEN {LOWER_TOLERANCE} IS NULL OR {UPPER_TOLERANCE} IS NULL THEN -1 - -- No change to table, and we're beyond time range: LATE - WHEN fingerprint = '{BASELINE_VALUE}' AND interval_minutes > {UPPER_TOLERANCE} THEN 0 - -- Table changed outside time range: UNEXPECTED + WHEN {LOWER_TOLERANCE} IS NULL AND {UPPER_TOLERANCE} IS NULL THEN -1 + -- No change and excluded day: suppress + WHEN fingerprint = '{BASELINE_VALUE}' AND {IS_EXCLUDED_DAY} = 1 THEN 1 + -- No change, beyond time range (business time): LATE + WHEN fingerprint = '{BASELINE_VALUE}' + AND (interval_minutes - {EXCLUDED_MINUTES}) > {THRESHOLD_VALUE} THEN 0 + -- Table changed outside time range (business time): UNEXPECTED WHEN fingerprint <> '{BASELINE_VALUE}' - AND NOT interval_minutes BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} THEN 0 + AND NOT (interval_minutes - {EXCLUDED_MINUTES}) + BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} THEN 0 ELSE 1 END AS result_code, + 'Table update detected: ' || CASE WHEN fingerprint <> '{BASELINE_VALUE}' THEN 'Yes' ELSE 'No' END + || CASE + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} THEN '. On time.' + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) < {LOWER_TOLERANCE} THEN '. Earlier than expected.' + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) > {UPPER_TOLERANCE} THEN '. Later than expected.' + WHEN fingerprint = '{BASELINE_VALUE}' AND {IS_EXCLUDED_DAY} = 0 AND (interval_minutes - {EXCLUDED_MINUTES}) > {THRESHOLD_VALUE} THEN '. Late.' 
+ ELSE '' + END AS result_message, + CASE + WHEN fingerprint <> '{BASELINE_VALUE}' THEN '0' + ELSE COALESCE(TO_CHAR(interval_minutes), 'Unknown') + END AS result_signal + FROM test_data; + - id: '2817' + test_type: Freshness_Trend + sql_flavor: sap_hana + template: |- + WITH test_data AS ( + SELECT + {CUSTOM_QUERY} AS fingerprint, + ROUND(SECONDS_BETWEEN(TO_TIMESTAMP('{BASELINE_SUM}', 'YYYY-MM-DD HH24:MI:SS'), TO_TIMESTAMP('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')) / 60.0) AS interval_minutes + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + ) + SELECT '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{SCHEMA_NAME}' AS schema_name, + '{TABLE_NAME}' AS table_name, + '{COLUMN_NAME_NO_QUOTES}' AS column_names, + '{SKIP_ERRORS}' AS threshold_value, + {SKIP_ERRORS} AS skip_errors, + '{INPUT_PARAMETERS}' AS input_parameters, + fingerprint AS result_measure, CASE - -- No change to table, and we're beyond time range: LATE - WHEN fingerprint = '{BASELINE_VALUE}' AND interval_minutes > {UPPER_TOLERANCE} - THEN 'Table unchanged beyond expected schedule' - -- Table changed outside time range: UNEXPECTED + -- Training mode: tolerances not yet calculated + WHEN {LOWER_TOLERANCE} IS NULL AND {UPPER_TOLERANCE} IS NULL THEN -1 + -- No change and excluded day: suppress + WHEN fingerprint = '{BASELINE_VALUE}' AND {IS_EXCLUDED_DAY} = 1 THEN 1 + -- No change, beyond time range (business time): LATE + WHEN fingerprint = '{BASELINE_VALUE}' + AND (interval_minutes - {EXCLUDED_MINUTES}) > {THRESHOLD_VALUE} THEN 0 + -- Table changed outside time range (business time): UNEXPECTED WHEN fingerprint <> '{BASELINE_VALUE}' - AND NOT interval_minutes BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} - THEN 'Table changed outside of expected schedule' - ELSE 'Interval since last update: ' || COALESCE(TO_CHAR(interval_minutes), 'Unknown') - END AS 
result_message, - COALESCE(TO_CHAR(interval_minutes), 'Unknown') AS result_signal - FROM test_data + AND NOT (interval_minutes - {EXCLUDED_MINUTES}) + BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} THEN 0 + ELSE 1 + END AS result_code, + 'Table update detected: ' || CASE WHEN fingerprint <> '{BASELINE_VALUE}' THEN 'Yes' ELSE 'No' END + || CASE + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) BETWEEN {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} THEN '. On time.' + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) < {LOWER_TOLERANCE} THEN '. Earlier than expected.' + WHEN fingerprint <> '{BASELINE_VALUE}' AND (interval_minutes - {EXCLUDED_MINUTES}) > {UPPER_TOLERANCE} THEN '. Later than expected.' + WHEN fingerprint = '{BASELINE_VALUE}' AND {IS_EXCLUDED_DAY} = 0 AND (interval_minutes - {EXCLUDED_MINUTES}) > {THRESHOLD_VALUE} THEN '. Late.' + ELSE '' + END AS result_message, + CASE + WHEN fingerprint <> '{BASELINE_VALUE}' THEN '0' + ELSE COALESCE(TO_VARCHAR(interval_minutes), 'Unknown') + END AS result_signal + FROM test_data; diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml index 87669e7a..f164bcbe 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml @@ -107,6 +107,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8010' + test_type: Future_Date + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1373' test_id: '1015' @@ -176,4 +184,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRUNC("{COLUMN_NAME}") >= 
TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8010' + test_id: '1015' + test_type: Future_Date + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml index 8361d66e..f46bbe36 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml @@ -108,6 +108,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8011' + test_type: Future_Date_1Y + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= ADD_DAYS(TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS'), 365) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1374' test_id: '1016' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRUNC("{COLUMN_NAME}") >= TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') + 365 GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8016' + test_id: '1016' + test_type: Future_Date_1Y + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= ADD_DAYS(TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS'), 365) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git 
a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml index 4003c354..707d20a6 100644 --- a/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Incr_Avg_Shift.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>=' test_condition: |- {THRESHOLD_VALUE} + - id: '8012' + test_type: Incr_Avg_Shift + sql_flavor: sap_hana + measure: |- + COALESCE(ABS(({BASELINE_AVG} - (SUM({COLUMN_NAME}) - {BASELINE_SUM}) / NULLIF(COUNT({COLUMN_NAME}) - {BASELINE_VALUE_CT}, 0)) / {BASELINE_SD}), 0) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1375' test_id: '1017' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT AVG(CAST("{COLUMN_NAME}" AS NUMBER)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS NUMBER)) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}"), 0) as current_value_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" error_type: Test Results + - id: '8012' + test_id: '1017' + test_type: Incr_Avg_Shift + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT AVG(CAST("{COLUMN_NAME}" AS DECIMAL)) AS current_average, SUM(CAST("{COLUMN_NAME}" AS DECIMAL)) AS current_sum, NULLIF(COUNT("{COLUMN_NAME}"), 0) as current_value_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml index 03898e5d..c24a0bfb 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml @@ -106,6 +106,14 @@ test_types: test_operator: <> test_condition: |- {THRESHOLD_VALUE} + - id: '8013' + test_type: LOV_All + sql_flavor: sap_hana + measure: |- + STRING_AGG(DISTINCT {COLUMN_NAME}, '|' ORDER BY {COLUMN_NAME}) + test_operator: <> + test_condition: |- +
{THRESHOLD_VALUE} target_data_lookups: - id: '1376' test_id: '1018' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") HAVING LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8013' + test_id: '1018' + test_type: LOV_All + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT STRING_AGG("{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}") FROM (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") HAVING STRING_AGG("{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml index feb83f0e..ef37b028 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_Match.yaml @@ -212,6 +212,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8014' + test_type: LOV_Match + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN {COLUMN_NAME} IS NOT NULL AND {COLUMN_NAME} NOT IN {BASELINE_VALUE} THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1377' test_id: '1019' @@ -281,4 +289,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL AND "{COLUMN_NAME}" NOT IN {BASELINE_VALUE} GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml index 86e3437f..89e18871 100644 --- a/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml @@ -96,6 +96,14 @@ test_types: test_operator: NOT BETWEEN test_condition: |- {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} + - id: '8016' + test_type: Metric_Trend + sql_flavor: sap_hana + measure: |- + {CUSTOM_QUERY} + test_operator: NOT BETWEEN + test_condition: |- + {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} target_data_lookups: - id: '1484' test_id: '1514' @@ -185,4 +193,15 @@ test_types: {UPPER_TOLERANCE} AS upper_bound FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" error_type: Test Results + - id: '8514' + test_id: '1514' + test_type: Metric_Trend + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT {CUSTOM_QUERY} AS current_count, + {LOWER_TOLERANCE} AS lower_bound, + {UPPER_TOLERANCE} AS upper_bound + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml index 80950917..877bb855 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml @@ -108,6 +108,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8015' + test_type: Min_Date + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < TO_DATE('{BASELINE_VALUE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1378' test_id: '1020' 
@@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < TO_DATE('{BASELINE_VALUE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8015' + test_id: '1020' + test_type: Min_Date + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < TO_DATE('{BASELINE_VALUE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml index 96b6ff1d..56d505ff 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Val.yaml @@ -108,6 +108,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8016' + test_type: Min_Val + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} - 1e-6 THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1379' test_id: '1021' @@ -176,4 +184,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8016' + test_id: '1021' + test_type: Min_Val + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", (ABS("{COLUMN_NAME}") - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < {BASELINE_VALUE} LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git 
a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml index 82e70601..6ddf86a0 100644 --- a/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Missing_Pct.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>=' test_condition: |- {THRESHOLD_VALUE} + - id: '8017' + test_type: Missing_Pct + sql_flavor: sap_hana + measure: |- + ABS(2.0 * ASIN(SQRT(CAST({BASELINE_VALUE_CT} AS DECIMAL) / CAST({BASELINE_CT} AS DECIMAL))) - 2 * ASIN(SQRT(CAST(COUNT({COLUMN_NAME}) AS DECIMAL) / CAST(NULLIF(COUNT(*), 0) AS DECIMAL)))) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1380' test_id: '1022' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8017' + test_id: '1022' + test_type: Missing_Pct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml index d126e843..83dbf5b5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8018' + test_type: Monthly_Rec_Ct + sql_flavor: sap_hana + measure: |- + (MAX(<%DATEDIFF_MONTH;{COLUMN_NAME};TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) - MIN(<%DATEDIFF_MONTH;{COLUMN_NAME};TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) + 1) - COUNT(DISTINCT <%DATEDIFF_MONTH;{COLUMN_NAME};TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) + 
test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1381' test_id: '1023' @@ -244,4 +252,12 @@ test_types: lookup_query: |- WITH daterange AS (SELECT ADD_MONTHS((SELECT TRUNC(MIN("{COLUMN_NAME}"), 'MM') FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), LEVEL - 1) AS all_dates FROM DUAL CONNECT BY LEVEL <= MONTHS_BETWEEN((SELECT TRUNC(MAX("{COLUMN_NAME}"), 'MM') FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), (SELECT TRUNC(MIN("{COLUMN_NAME}"), 'MM') FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) + 1), existing_periods AS (SELECT DISTINCT TRUNC("{COLUMN_NAME}", 'MM') AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY TRUNC("{COLUMN_NAME}", 'MM')) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8023' + test_id: '1023' + test_type: Monthly_Rec_Ct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + WITH Pass0 AS (SELECT 1 C FROM DUMMY UNION ALL SELECT 1 FROM DUMMY), Pass1 AS (SELECT 1 C FROM Pass0 A, Pass0 B), Pass2 AS (SELECT 1 C FROM Pass1 A, Pass1 B), Pass3 AS (SELECT 1 C FROM Pass2 A, Pass2 B), nums AS (SELECT ROW_NUMBER() OVER (ORDER BY C) - 1 AS rn FROM Pass3), bounds AS (SELECT TO_DATE(YEAR(MIN("{COLUMN_NAME}")) || '-' || LPAD(MONTH(MIN("{COLUMN_NAME}")), 2, '0') || '-01', 
'YYYY-MM-DD') AS min_month, TO_DATE(YEAR(MAX("{COLUMN_NAME}")) || '-' || LPAD(MONTH(MAX("{COLUMN_NAME}")), 2, '0') || '-01', 'YYYY-MM-DD') AS max_month FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), daterange AS (SELECT ADD_MONTHS(b.min_month, n.rn) AS all_dates FROM bounds b, nums n WHERE ADD_MONTHS(b.min_month, n.rn) <= b.max_month), existing_periods AS (SELECT DISTINCT TO_DATE(YEAR("{COLUMN_NAME}") || '-' || LPAD(MONTH("{COLUMN_NAME}"), 2, '0') || '-01', 'YYYY-MM-DD') AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY YEAR("{COLUMN_NAME}"), MONTH("{COLUMN_NAME}")) SELECT p.missing_period, p.prior_available_month, e.period_count AS prior_available_month_count, p.next_available_month, f.period_count AS next_available_month_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_month, MIN(c.period) AS next_available_month FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_month = e.period) LEFT JOIN existing_periods f ON (p.next_available_month = f.period) ORDER BY p.missing_period LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml index ff0ab45e..cb8ebf91 100644 --- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Above.yaml @@ -113,6 +113,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8019' + test_type: Outlier_Pct_Above + sql_flavor: sap_hana + measure: |- + CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS DECIMAL) > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 
END) AS DECIMAL) / NULLIF(COUNT({COLUMN_NAME}), 0) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1382' test_id: '1024' @@ -182,4 +190,12 @@ test_types: lookup_query: |- SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS NUMBER) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC error_type: Test Results + - id: '8019' + test_id: '1024' + test_type: Outlier_Pct_Above + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} + (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DECIMAL) > ({BASELINE_AVG} + (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml index 469a36fc..b2b32d67 100644 --- a/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Outlier_Pct_Below.yaml @@ -113,6 +113,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8020' + test_type: Outlier_Pct_Below + sql_flavor: sap_hana + measure: |- + CAST(SUM(CASE WHEN CAST({COLUMN_NAME} AS DECIMAL) < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END) AS DECIMAL) / NULLIF(COUNT({COLUMN_NAME}), 0) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1383' test_id: '1025' @@ -182,4 +190,12 @@ test_types: lookup_query: |- SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS NUMBER) < ({BASELINE_AVG} - (2*{BASELINE_SD})) 
GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC error_type: Test Results + - id: '8020' + test_id: '1025' + test_type: Outlier_Pct_Below + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT ({BASELINE_AVG} - (2*{BASELINE_SD})) AS outlier_threshold, "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DECIMAL) < ({BASELINE_AVG} - (2*{BASELINE_SD})) GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml index cb32a132..3cd3359d 100644 --- a/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Pattern_Match.yaml @@ -108,6 +108,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8021' + test_type: Pattern_Match + sql_flavor: sap_hana + measure: |- + COUNT(NULLIF({COLUMN_NAME}, '')) - SUM(CASE WHEN NULLIF(TO_VARCHAR({COLUMN_NAME}), '') LIKE_REGEXPR '{BASELINE_VALUE}' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1384' test_id: '1026' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE(NULLIF(TO_CHAR("{COLUMN_NAME}"), ''), '{BASELINE_VALUE}') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8021' + test_id: '1026' + test_type: Pattern_Match + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT NULLIF(TO_VARCHAR("{COLUMN_NAME}"), '') LIKE_REGEXPR '{BASELINE_VALUE}' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git 
a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml index eb1b0ba4..0f9e6b4f 100644 --- a/testgen/template/dbsetup_test_types/test_types_Recency.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8022' + test_type: Recency + sql_flavor: sap_hana + measure: |- + <%DATEDIFF_DAY;MAX({COLUMN_NAME});TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1385' test_id: '1028' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT DISTINCT col AS latest_date_available, TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE <%DATEDIFF_DAY;col;TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > {THRESHOLD_VALUE} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8022' + test_id: '1028' + test_type: Recency + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT col AS latest_date_available, TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE <%DATEDIFF_DAY;col;TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > {THRESHOLD_VALUE} LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Required.yaml b/testgen/template/dbsetup_test_types/test_types_Required.yaml index bb2cedb6..f11ceb36 100644 --- a/testgen/template/dbsetup_test_types/test_types_Required.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Required.yaml @@ -107,6 +107,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8023' + test_type: Required + sql_flavor: sap_hana + measure: |- + COUNT(*) - 
COUNT({COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1386' test_id: '1030' @@ -175,4 +183,12 @@ test_types: lookup_query: |- SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8023' + test_id: '1030' + test_type: Required + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT * FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NULL LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml index 7be82ea1..4a373834 100644 --- a/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct.yaml @@ -106,6 +106,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8024' + test_type: Row_Ct + sql_flavor: sap_hana + measure: |- + COUNT(*) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1387' test_id: '1031' @@ -178,4 +186,12 @@ test_types: lookup_query: |- WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) / {THRESHOLD_VALUE}, 2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE} error_type: Test Results + - id: '8024' + test_id: '1031' + test_type: Row_Ct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, ABS(ROUND(100 * (current_count - {THRESHOLD_VALUE}) / {THRESHOLD_VALUE}, 2)) AS row_count_pct_decrease FROM cte WHERE current_count < {THRESHOLD_VALUE} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml 
b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml index 7eb8b421..6b176c7a 100644 --- a/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Row_Ct_Pct.yaml @@ -107,6 +107,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8025' + test_type: Row_Ct_Pct + sql_flavor: sap_hana + measure: |- + ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT}) / {BASELINE_CT}, 2)) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1388' test_id: '1032' @@ -178,4 +186,12 @@ test_types: lookup_query: |- WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) / {BASELINE_CT}, 2)) AS row_count_pct_difference FROM cte error_type: Test Results + - id: '8025' + test_id: '1032' + test_type: Row_Ct_Pct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + WITH CTE AS (SELECT COUNT(*) AS current_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT current_count, {BASELINE_CT} AS baseline_count, ABS(ROUND(100 * (current_count - {BASELINE_CT}) / {BASELINE_CT}, 2)) AS row_count_pct_difference FROM cte + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Schema_Drift.yaml b/testgen/template/dbsetup_test_types/test_types_Schema_Drift.yaml index 48cc710b..e1e23dcd 100644 --- a/testgen/template/dbsetup_test_types/test_types_Schema_Drift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Schema_Drift.yaml @@ -474,3 +474,58 @@ test_types: AS result_message, column_adds + column_drops + column_mods AS result_measure FROM table_changes; + - id: '8014' + test_type: Schema_Drift + sql_flavor: sap_hana + template: |- + WITH prev_test AS ( + SELECT MAX(test_starttime) AS last_run_time + FROM {APP_SCHEMA_NAME}.test_runs + WHERE test_suite_id = '{TEST_SUITE_ID}'::UUID + -- 
Ignore current run + AND id <> '{TEST_RUN_ID}'::UUID + ), + table_changes AS ( + SELECT + dsl.table_name, + MAX(prev_test.last_run_time) as window_start, + MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'A' THEN dsl.change_date ELSE NULL END) as last_add_date, + MAX(CASE WHEN dsl.column_id IS NULL AND dsl.change = 'D' THEN dsl.change_date ELSE NULL END) as last_drop_date, + COUNT(*) FILTER (WHERE dsl.column_id IS NOT NULL AND dsl.change = 'A') AS column_adds, + COUNT(*) FILTER (WHERE dsl.column_id IS NOT NULL AND dsl.change = 'D') AS column_drops, + COUNT(*) FILTER (WHERE dsl.column_id IS NOT NULL AND dsl.change = 'M') AS column_mods + FROM {APP_SCHEMA_NAME}.data_structure_log dsl + CROSS JOIN prev_test + WHERE dsl.table_groups_id = '{TABLE_GROUPS_ID}'::UUID + -- if no previous tests, this comparison yields null and nothing is counted + AND dsl.change_date > prev_test.last_run_time + GROUP BY dsl.table_name + ) + SELECT + '{TEST_TYPE}' AS test_type, + '{TEST_DEFINITION_ID}' AS test_definition_id, + '{TEST_SUITE_ID}' AS test_suite_id, + '{TEST_RUN_ID}' AS test_run_id, + '{RUN_DATE}' AS test_time, + '{SCHEMA_NAME}' AS schema_name, + table_name, + '{INPUT_PARAMETERS}' AS input_parameters, + (CASE + WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'A' + WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'D' + ELSE 'M' + END) + || '|' || column_adds + || '|' || column_drops + || '|' || column_mods + || '|' || window_start::TEXT + AS result_signal, + 0 AS result_code, + CASE WHEN last_add_date IS NOT NULL AND (last_drop_date IS NULL OR last_add_date > last_drop_date) THEN 'Table added. ' ELSE '' END + || CASE WHEN last_drop_date IS NOT NULL AND (last_add_date IS NULL OR last_drop_date > last_add_date) THEN 'Table dropped. ' ELSE '' END + || CASE WHEN column_adds > 0 THEN column_adds || ' columns added. 
' ELSE '' END + || CASE WHEN column_drops > 0 THEN column_drops || ' columns dropped. ' ELSE '' END + || CASE WHEN column_mods > 0 THEN column_mods || ' columns modified. ' ELSE '' END + AS result_message, + column_adds + column_drops + column_mods AS result_measure + FROM table_changes; diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml index 65bf8123..d6fbfbf3 100644 --- a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml @@ -109,6 +109,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8026' + test_type: Street_Addr_Pattern + sql_flavor: sap_hana + measure: |- + 100.0*SUM(CASE WHEN TO_VARCHAR({COLUMN_NAME}) LIKE_REGEXPR '^[0-9]{1,5}[a-zA-Z]?[[:space:]][[:alnum:]_]{1,5}\.?[[:space:]]?[[:alnum:]_]*[[:space:]]?[[:alnum:]_]*[[:space:]][a-zA-Z]{1,6}\.?[[:space:]]?[0-9]{0,5}[A-Z]{0,1}$' THEN 1 ELSE 0 END) / NULLIF(COUNT({COLUMN_NAME}), 0) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1389' test_id: '1033' @@ -179,4 +187,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE(TO_CHAR("{COLUMN_NAME}"), '^[0-9]{1,5}[a-zA-Z]?[[:space:]][[:alnum:]_]{1,5}\.?[[:space:]]?[[:alnum:]_]*[[:space:]]?[[:alnum:]_]*[[:space:]][a-zA-Z]{1,6}\.?[[:space:]]?[0-9]{0,5}[A-Z]{0,1}$') GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8033' + test_id: '1033' + test_type: Street_Addr_Pattern + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT TO_VARCHAR("{COLUMN_NAME}") LIKE_REGEXPR 
'^[0-9]{1,5}[a-zA-Z]?[[:space:]][[:alnum:]_]{1,5}\.?[[:space:]]?[[:alnum:]_]*[[:space:]]?[[:alnum:]_]*[[:space:]][a-zA-Z]{1,6}\.?[[:space:]]?[0-9]{0,5}[A-Z]{0,1}$' GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml index 74ccca3c..85396aaa 100644 --- a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml @@ -296,3 +296,35 @@ test_types: FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test + - id: '8012' + test_type: Table_Freshness + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + fingerprint as result_signal, + CASE + WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 + ELSE 1 + END AS result_code, + CASE + WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 'No table change detected.' + ELSE 'Table change detected.' 
+ END AS result_message, + CASE + WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 + ELSE 1 + END AS result_measure + FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} + WHERE {SUBSET_CONDITION} + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml index da32668e..c03bfd5f 100644 --- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Gain.yaml @@ -178,6 +178,26 @@ test_types: GROUP BY {COLUMN_NAME_NO_QUOTES} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8508' + test_id: '1508' + test_type: Timeframe_Combo_Gain + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -2 * {WINDOW_DAYS}) + AND {WINDOW_DATE_COLUMN} < ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + GROUP BY {COLUMN_NAME_NO_QUOTES} + LIMIT {LIMIT} + error_type: Test Results test_templates: - id: '2507' test_type: Timeframe_Combo_Gain @@ -540,3 +560,44 @@ test_types: AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} GROUP BY {COLUMN_NAME_NO_QUOTES} ) test + - id: '8007' + test_type: Timeframe_Combo_Gain + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' 
as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS VARCHAR(20)) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -2 * {WINDOW_DAYS}) + AND {WINDOW_DATE_COLUMN} < ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + GROUP BY {COLUMN_NAME_NO_QUOTES} + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml index 6737e1f3..1c9851dc 100644 --- a/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Timeframe_Combo_Match.yaml @@ -305,6 +305,40 @@ test_types: AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") - {WINDOW_DAYS} ) WHERE ROWNUM <= {LIMIT_2} error_type: Test Results + - id: '8509' + test_id: 
'1509' + test_type: Timeframe_Combo_Match + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + EXCEPT + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -2 * {WINDOW_DAYS}) + AND {WINDOW_DATE_COLUMN} < ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + LIMIT {LIMIT_2} + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -2 * {WINDOW_DAYS}) + AND {WINDOW_DATE_COLUMN} < ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + EXCEPT + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + LIMIT {LIMIT_2} + ) + error_type: Test Results test_templates: - id: '2508' test_type: Timeframe_Combo_Match @@ -792,3 +826,57 @@ test_types: AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}") - {WINDOW_DAYS} ) ) test + - id: '8008' + test_type: Timeframe_Combo_Match + sql_flavor: sap_hana + template: |- + SELECT '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE_ID}' as test_suite_id, + '{TEST_RUN_ID}' as test_run_id, 
+ '{RUN_DATE}' as test_time, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + NULL as result_signal, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CAST(COUNT(*) AS VARCHAR(20)) || ' error(s) identified, ' || + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END || '{SKIP_ERRORS}.' + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure + FROM ( + ( + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + EXCEPT + SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -2 * {WINDOW_DAYS}) + AND {WINDOW_DATE_COLUMN} < ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + ) + UNION ALL + ( + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -2 * {WINDOW_DAYS}) + AND {WINDOW_DATE_COLUMN} < ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + EXCEPT + SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME_NO_QUOTES} + FROM "{SCHEMA_NAME}"."{TABLE_NAME}" + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= ADD_DAYS((SELECT MAX({WINDOW_DATE_COLUMN}) FROM "{SCHEMA_NAME}"."{TABLE_NAME}"), -{WINDOW_DAYS}) + 
) + ) test diff --git a/testgen/template/dbsetup_test_types/test_types_US_State.yaml b/testgen/template/dbsetup_test_types/test_types_US_State.yaml index 22bd616e..21acdc38 100644 --- a/testgen/template/dbsetup_test_types/test_types_US_State.yaml +++ b/testgen/template/dbsetup_test_types/test_types_US_State.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8027' + test_type: US_State + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN NULLIF({COLUMN_NAME}, '') NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1390' test_id: '1036' @@ -178,4 +186,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL AND "{COLUMN_NAME}" NOT IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8036' + test_id: '1036' + test_type: US_State + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" IS NOT NULL AND "{COLUMN_NAME}" NOT IN 
('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique.yaml b/testgen/template/dbsetup_test_types/test_types_Unique.yaml index 4e300c00..d02a9e38 100644 --- a/testgen/template/dbsetup_test_types/test_types_Unique.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Unique.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8028' + test_type: Unique + sql_flavor: sap_hana + measure: |- + COUNT(*) - COUNT(DISTINCT {COLUMN_NAME}) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1391' test_id: '1034' @@ -179,4 +187,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8028' + test_id: '1034' + test_type: Unique + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml index 8229f185..77f8aae5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Unique_Pct.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>=' test_condition: |- 
{THRESHOLD_VALUE} + - id: '8029' + test_type: Unique_Pct + sql_flavor: sap_hana + measure: |- + ABS(2.0 * ASIN(SQRT(CAST({BASELINE_UNIQUE_CT} AS DECIMAL) / CAST({BASELINE_VALUE_CT} AS DECIMAL))) - 2 * ASIN(SQRT(CAST(COUNT(DISTINCT {COLUMN_NAME}) AS DECIMAL) / CAST(NULLIF(COUNT({COLUMN_NAME}), 0) AS DECIMAL)))) + test_operator: '>=' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1392' test_id: '1035' @@ -178,4 +186,12 @@ test_types: lookup_query: |- SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8029' + test_id: '1035' + test_type: Unique_Pct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml index e037726b..09d90d0a 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_Characters.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8036' + test_type: Valid_Characters + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE({COLUMN_NAME}, NCHAR(160), ''), NCHAR(8203), ''), NCHAR(65279), ''), NCHAR(8239), ''), NCHAR(8201), ''), NCHAR(12288), ''), NCHAR(8204), '') <> {COLUMN_NAME} OR {COLUMN_NAME} LIKE ' %' OR {COLUMN_NAME} LIKE '''%''' OR {COLUMN_NAME} LIKE '"%"' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1397' test_id: '1043' @@ -182,4 +190,12 @@ test_types: lookup_query: |- SELECT 
"{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", 'X' || UNISTR('\00A0') || UNISTR('\200B') || UNISTR('\FEFF') || UNISTR('\202F') || UNISTR('\2009') || UNISTR('\3000') || UNISTR('\200C'), 'XXXXXXXX') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8043' + test_id: '1043' + test_type: Valid_Characters + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", NCHAR(160), ''), NCHAR(8203), ''), NCHAR(65279), ''), NCHAR(8239), ''), NCHAR(8201), ''), NCHAR(12288), ''), NCHAR(8204), '') <> "{COLUMN_NAME}" OR "{COLUMN_NAME}" LIKE ' %' OR "{COLUMN_NAME}" LIKE '''%''' OR "{COLUMN_NAME}" LIKE '"%"' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml index 79328926..343587b7 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_Month.yaml @@ -108,5 +108,13 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8033' + test_type: Valid_Month + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN {COLUMN_NAME} IS NOT NULL AND {COLUMN_NAME} NOT IN ({BASELINE_VALUE}) THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: [] test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml index b00f3a68..a42d0aa2 100644 --- 
a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip.yaml @@ -107,6 +107,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8034' + test_type: Valid_US_Zip + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN REPLACE_REGEXPR('[0-9]' IN {COLUMN_NAME} WITH '9') NOT IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1398' test_id: '1044' @@ -177,4 +185,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8044' + test_id: '1044' + test_type: Valid_US_Zip + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REPLACE_REGEXPR('[0-9]' IN "{COLUMN_NAME}" WITH '9') NOT IN ('99999', '999999999', '99999-9999') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml index d0a91272..31a6d9ab 100644 --- a/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Valid_US_Zip3.yaml @@ -108,6 +108,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8035' + test_type: Valid_US_Zip3 + sql_flavor: sap_hana + measure: |- + SUM(CASE WHEN REPLACE_REGEXPR('[0-9]' IN {COLUMN_NAME} WITH '9') <> '999' THEN 1 ELSE 0 END) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1399' test_id: 
'1045' @@ -178,4 +186,12 @@ test_types: lookup_query: |- SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}",'012345678','999999999') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8045' + test_id: '1045' + test_type: Valid_US_Zip3 + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REPLACE_REGEXPR('[0-9]' IN "{COLUMN_NAME}" WITH '9') <> '999' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml index 9970df89..74b91f96 100644 --- a/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Variability_Decrease.yaml @@ -113,6 +113,14 @@ test_types: test_operator: < test_condition: |- {THRESHOLD_VALUE} + - id: '8032' + test_type: Variability_Decrease + sql_flavor: sap_hana + measure: |- + 100.0*STDDEV(CAST({COLUMN_NAME} AS DECIMAL))/CAST({BASELINE_SD} AS DECIMAL) + test_operator: < + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1395' test_id: '1041' @@ -179,4 +187,12 @@ test_types: lookup_query: |- SELECT STDDEV(CAST("{COLUMN_NAME}" AS NUMBER)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" error_type: Test Results + - id: '8032' + test_id: '1041' + test_type: Variability_Decrease + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS DECIMAL)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml 
b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml index 5adc92b0..1992ec41 100644 --- a/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Variability_Increase.yaml @@ -117,6 +117,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8031' + test_type: Variability_Increase + sql_flavor: sap_hana + measure: |- + 100.0*STDDEV(CAST({COLUMN_NAME} AS DECIMAL))/CAST({BASELINE_SD} AS DECIMAL) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1394' test_id: '1040' @@ -183,4 +191,12 @@ test_types: lookup_query: |- SELECT STDDEV(CAST("{COLUMN_NAME}" AS NUMBER)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" error_type: Test Results + - id: '8031' + test_id: '1040' + test_type: Variability_Increase + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + SELECT STDDEV(CAST("{COLUMN_NAME}" AS DECIMAL)) as current_standard_deviation FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml index a2b2d4aa..e14bf0c7 100644 --- a/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml @@ -97,6 +97,14 @@ test_types: test_operator: NOT BETWEEN test_condition: |- {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} + - id: '2815' + test_type: Volume_Trend + sql_flavor: sap_hana + measure: |- + {CUSTOM_QUERY} + test_operator: NOT BETWEEN + test_condition: |- + {LOWER_TOLERANCE} AND {UPPER_TOLERANCE} target_data_lookups: - id: '1477' test_id: '1513' @@ -186,4 +194,15 @@ test_types: {UPPER_TOLERANCE} AS upper_bound FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" error_type: Test Results + - id: '8015' + test_id: '1513' + test_type: Volume_Trend + sql_flavor: sap_hana + 
lookup_type: null + lookup_query: |- + SELECT {CUSTOM_QUERY} AS current_count, + {LOWER_TOLERANCE} AS lower_bound, + {UPPER_TOLERANCE} AS upper_bound + FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" + error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml index 063189b8..7fafc6b4 100644 --- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml @@ -109,6 +109,14 @@ test_types: test_operator: '>' test_condition: |- {THRESHOLD_VALUE} + - id: '8030' + test_type: Weekly_Rec_Ct + sql_flavor: sap_hana + measure: |- + MAX(<%DATEDIFF_WEEK;DATE '1800-01-01';{COLUMN_NAME}%>) - MIN(<%DATEDIFF_WEEK;DATE '1800-01-01';{COLUMN_NAME}%>)+1 - COUNT(DISTINCT <%DATEDIFF_WEEK;DATE '1800-01-01';{COLUMN_NAME}%>) + test_operator: '>' + test_condition: |- + {THRESHOLD_VALUE} target_data_lookups: - id: '1393' test_id: '1037' @@ -244,4 +252,12 @@ test_types: lookup_query: |- WITH daterange AS (SELECT (SELECT TRUNC(MIN("{COLUMN_NAME}"), 'IW') FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") + (LEVEL - 1) * 7 AS all_dates FROM DUAL CONNECT BY LEVEL <= CEIL((TRUNC((SELECT MAX("{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}")) - TRUNC((SELECT MIN("{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"))) / 7) + 1), existing_periods AS (SELECT DISTINCT TRUNC("{COLUMN_NAME}", 'IW') AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY TRUNC("{COLUMN_NAME}", 'IW')) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates 
LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results + - id: '8037' + test_id: '1037' + test_type: Weekly_Rec_Ct + sql_flavor: sap_hana + lookup_type: null + lookup_query: |- + WITH Pass0 AS (SELECT 1 C FROM DUMMY UNION ALL SELECT 1 FROM DUMMY), Pass1 AS (SELECT 1 C FROM Pass0 A, Pass0 B), Pass2 AS (SELECT 1 C FROM Pass1 A, Pass1 B), Pass3 AS (SELECT 1 C FROM Pass2 A, Pass2 B), nums AS (SELECT ROW_NUMBER() OVER (ORDER BY C) - 1 AS rn FROM Pass3), bounds AS (SELECT ADD_DAYS(CAST(MIN("{COLUMN_NAME}") AS DATE), -WEEKDAY(CAST(MIN("{COLUMN_NAME}") AS DATE))) AS min_week, ADD_DAYS(CAST(MAX("{COLUMN_NAME}") AS DATE), -WEEKDAY(CAST(MAX("{COLUMN_NAME}") AS DATE))) AS max_week FROM "{TARGET_SCHEMA}"."{TABLE_NAME}"), daterange AS (SELECT ADD_DAYS(b.min_week, n.rn * 7) AS all_dates FROM bounds b, nums n WHERE ADD_DAYS(b.min_week, n.rn * 7) <= b.max_week), existing_periods AS (SELECT DISTINCT ADD_DAYS(CAST("{COLUMN_NAME}" AS DATE), -WEEKDAY(CAST("{COLUMN_NAME}" AS DATE))) AS period, COUNT(1) AS period_count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY ADD_DAYS(CAST("{COLUMN_NAME}" AS DATE), -WEEKDAY(CAST("{COLUMN_NAME}" AS DATE)))) SELECT p.missing_period, p.prior_available_week, e.period_count AS prior_available_week_count, p.next_available_week, f.period_count AS next_available_week_count FROM (SELECT d.all_dates AS missing_period, MAX(b.period) AS prior_available_week, MIN(c.period) AS next_available_week FROM daterange d LEFT JOIN existing_periods a ON d.all_dates = a.period LEFT JOIN existing_periods b ON b.period < d.all_dates LEFT JOIN existing_periods c ON c.period > d.all_dates WHERE a.period IS NULL AND d.all_dates BETWEEN b.period AND c.period GROUP BY 
d.all_dates) p LEFT JOIN existing_periods e ON (p.prior_available_week = e.period) LEFT JOIN existing_periods f ON (p.next_available_week = f.period) ORDER BY p.missing_period LIMIT {LIMIT} + error_type: Test Results test_templates: [] diff --git a/testgen/template/flavors/sap_hana/data_chars/get_schema_ddf.sql b/testgen/template/flavors/sap_hana/data_chars/get_schema_ddf.sql new file mode 100644 index 00000000..8a0838fd --- /dev/null +++ b/testgen/template/flavors/sap_hana/data_chars/get_schema_ddf.sql @@ -0,0 +1,41 @@ +SELECT + c.SCHEMA_NAME AS schema_name, + c.TABLE_NAME AS table_name, + c.COLUMN_NAME AS column_name, + CASE + WHEN c.DATA_TYPE_NAME IN ('NVARCHAR', 'VARCHAR', 'NCHAR', 'CHAR', 'ALPHANUM', 'SHORTTEXT') THEN 'char(' || c.LENGTH || ')' + WHEN c.DATA_TYPE_NAME = 'DECIMAL' AND c.SCALE = 0 THEN 'bigint' + WHEN c.DATA_TYPE_NAME = 'DECIMAL' AND c.SCALE > 0 THEN 'numeric(' || c.LENGTH || ',' || c.SCALE || ')' + WHEN c.DATA_TYPE_NAME IN ('DOUBLE', 'REAL', 'SMALLDECIMAL') THEN 'numeric' + WHEN c.DATA_TYPE_NAME IN ('TIMESTAMP', 'SECONDDATE') THEN 'timestamp' + ELSE LOWER(c.DATA_TYPE_NAME) + END AS column_type, + CASE + WHEN c.DATA_TYPE_NAME IN ('NVARCHAR', 'VARCHAR', 'NCHAR', 'CHAR', 'ALPHANUM', 'SHORTTEXT') THEN c.DATA_TYPE_NAME || '(' || c.LENGTH || ')' + WHEN c.DATA_TYPE_NAME = 'DECIMAL' THEN 'DECIMAL(' || c.LENGTH || ',' || c.SCALE || ')' + ELSE c.DATA_TYPE_NAME + END AS db_data_type, + c.POSITION AS ordinal_position, + CASE + WHEN c.DATA_TYPE_NAME IN ('NVARCHAR', 'VARCHAR', 'NCHAR', 'CHAR', 'ALPHANUM', 'SHORTTEXT') + THEN 'A' + WHEN c.DATA_TYPE_NAME = 'BOOLEAN' + THEN 'B' + WHEN c.DATA_TYPE_NAME IN ('DATE', 'TIMESTAMP', 'SECONDDATE') + THEN 'D' + WHEN c.DATA_TYPE_NAME = 'TIME' + THEN 'T' + WHEN c.DATA_TYPE_NAME IN ('INTEGER', 'BIGINT', 'SMALLINT', 'TINYINT', 'DECIMAL', 'DOUBLE', 'REAL', 'SMALLDECIMAL') + THEN 'N' + ELSE 'X' + END AS general_type, + CASE + WHEN c.DATA_TYPE_NAME = 'DECIMAL' AND c.SCALE > 0 THEN 1 + WHEN c.DATA_TYPE_NAME IN ('DOUBLE', 
'REAL', 'SMALLDECIMAL') THEN 1 + ELSE 0 + END AS is_decimal, + t.RECORD_COUNT AS approx_record_ct +FROM SYS.TABLE_COLUMNS c +LEFT JOIN SYS.M_TABLES t ON c.SCHEMA_NAME = t.SCHEMA_NAME AND c.TABLE_NAME = t.TABLE_NAME +WHERE c.SCHEMA_NAME = '{DATA_SCHEMA}' {TABLE_CRITERIA} +ORDER BY c.SCHEMA_NAME, c.TABLE_NAME, c.POSITION diff --git a/testgen/template/flavors/sap_hana/gen_query_tests/gen_Dupe_Rows.sql b/testgen/template/flavors/sap_hana/gen_query_tests/gen_Dupe_Rows.sql new file mode 100644 index 00000000..117ffe06 --- /dev/null +++ b/testgen/template/flavors/sap_hana/gen_query_tests/gen_Dupe_Rows.sql @@ -0,0 +1,55 @@ +WITH latest_run AS ( + -- Latest complete profiling run before as-of-date + SELECT MAX(run_date) AS last_run_date + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID + AND run_date::DATE <= :AS_OF_DATE ::DATE +), +selected_tables AS ( + SELECT profile_run_id, schema_name, table_name, + STRING_AGG(:QUOTE || column_name || :QUOTE, ', ' ORDER BY position) AS groupby_names + FROM profile_results p + INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID + -- Skip X types - SAP HANA does not allow grouping by LOB types like BLOB, CLOB, NCLOB, TEXT, BINTEXT + AND general_type <> 'X' + GROUP BY profile_run_id, schema_name, table_name +) +INSERT INTO test_definitions ( + table_groups_id, test_suite_id, test_type, + schema_name, table_name, + test_active, last_auto_gen_date, profiling_as_of_date, profile_run_id, + groupby_names, skip_errors +) +SELECT + :TABLE_GROUPS_ID ::UUID AS table_groups_id, + :TEST_SUITE_ID ::UUID AS test_suite_id, + 'Dupe_Rows' AS test_type, + s.schema_name, + s.table_name, + 'Y' AS test_active, + :RUN_DATE ::TIMESTAMP AS last_auto_gen_date, + :AS_OF_DATE ::TIMESTAMP AS profiling_as_of_date, + s.profile_run_id, + s.groupby_names, + 0 AS skip_errors +FROM selected_tables s + -- Only insert if test type is active +WHERE EXISTS (SELECT 1 FROM test_types WHERE test_type 
= 'Dupe_Rows' AND active = 'Y') + -- Only insert if test type is included in generation set + AND EXISTS (SELECT 1 FROM generation_sets WHERE test_type = 'Dupe_Rows' AND generation_set = :GENERATION_SET) + +-- Match "uix_td_autogen_table" unique index exactly +ON CONFLICT (test_suite_id, test_type, schema_name, table_name) +WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NOT NULL + AND column_name IS NULL + +-- Update tests if they already exist +DO UPDATE SET + test_active = EXCLUDED.test_active, + last_auto_gen_date = EXCLUDED.last_auto_gen_date, + groupby_names = EXCLUDED.groupby_names, + skip_errors = EXCLUDED.skip_errors +-- Ignore locked tests +WHERE test_definitions.lock_refresh = 'N'; diff --git a/testgen/template/flavors/sap_hana/gen_query_tests/gen_Freshness_Trend.sql b/testgen/template/flavors/sap_hana/gen_query_tests/gen_Freshness_Trend.sql new file mode 100644 index 00000000..ae947a22 --- /dev/null +++ b/testgen/template/flavors/sap_hana/gen_query_tests/gen_Freshness_Trend.sql @@ -0,0 +1,193 @@ +WITH latest_run AS ( + -- Latest complete profiling run before as-of-date + SELECT MAX(run_date) AS last_run_date + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID + AND run_date::DATE <= :AS_OF_DATE ::DATE +), +latest_results AS ( + -- Column results for latest run + SELECT p.profile_run_id, p.schema_name, p.table_name, p.column_name, + p.functional_data_type, p.general_type, + p.distinct_value_ct, p.record_ct, p.null_value_ct, + p.max_value, p.min_value, p.avg_value, p.stdev_value + FROM profile_results p + INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + INNER JOIN data_table_chars dtc ON ( + dtc.table_groups_id = p.table_groups_id + AND dtc.schema_name = p.schema_name + AND dtc.table_name = p.table_name + -- Ignore dropped tables + AND dtc.drop_date IS NULL + ) + WHERE p.table_groups_id = :TABLE_GROUPS_ID ::UUID +), +-- IDs - TOP 2 +id_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + 
functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1 + WHEN functional_data_type = 'ID-Secondary' THEN 2 + ELSE 3 + END, distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'ID%' +), +-- Process Date - TOP 1 +process_date_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN column_name ILIKE '%mod%' THEN 1 + WHEN column_name ILIKE '%up%' THEN 1 + WHEN column_name ILIKE '%cr%' THEN 2 + WHEN column_name ILIKE '%in%' THEN 2 + END, distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'process%' +), +-- Transaction Date - TOP 1 +tran_date_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'transactional date%' + OR functional_data_type ILIKE 'period%' + OR functional_data_type = 'timestamp' +), +-- Numeric Measures +numeric_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, + -- Weighted score + ( + 0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) + + 0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) + + 0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1))) 
+ ) AS change_detection_score + FROM latest_results + WHERE general_type = 'N' + AND ( + functional_data_type ILIKE 'Measure%' + OR functional_data_type IN ('Sequence', 'Constant') + ) +), +numeric_cols_ranked AS ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY change_detection_score DESC, column_name + ) AS rank + FROM numeric_cols + WHERE change_detection_score IS NOT NULL +), +combined AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order + FROM id_cols + WHERE rank <= 2 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order + FROM process_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order + FROM tran_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order + FROM numeric_cols_ranked + WHERE rank = 1 +), +selected_tables AS ( + SELECT profile_run_id, schema_name, table_name, + STRING_AGG(column_name, ',' ORDER BY element_type, fingerprint_order, column_name) AS column_names, + 'TO_VARCHAR(COUNT(*)) || ''|'' || ' || + STRING_AGG( + REPLACE( + CASE + WHEN general_type = 'D' THEN 'TO_VARCHAR(MIN(@@@)) || ''|'' || TO_VARCHAR(MAX(@@@)) || ''|'' || TO_VARCHAR(COUNT(DISTINCT @@@))' + WHEN general_type = 'A' THEN 'TO_VARCHAR(MIN(@@@)) || ''|'' || TO_VARCHAR(MAX(@@@)) || ''|'' || TO_VARCHAR(COUNT(DISTINCT @@@)) || ''|'' || TO_VARCHAR(SUM(LENGTH(@@@)))' + WHEN general_type = 'N' THEN 'TO_VARCHAR(COUNT(@@@)) || ''|'' || + TO_VARCHAR(COUNT(DISTINCT MOD(CAST(CAST(COALESCE(@@@,0) AS DECIMAL(38,6)) * 1000000 AS DECIMAL(38,0)), 1000003))) || ''|'' || + COALESCE(TO_VARCHAR(CAST(MIN(@@@) AS DECIMAL(38,6))), '''') || ''|'' || + 
COALESCE(TO_VARCHAR(CAST(MAX(@@@) AS DECIMAL(38,6))), '''') || ''|'' || + COALESCE(TO_VARCHAR(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) * 1000000 AS DECIMAL), 1000000007)), 0), 1000000007)), '''') || ''|'' || + COALESCE(TO_VARCHAR(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) * 1000000 AS DECIMAL), 1000000009)), 0), 1000000009)), '''')' + END, + '@@@', '"' || column_name || '"' + ), + ' || ''|'' || ' + ORDER BY element_type, fingerprint_order, column_name + ) AS fingerprint + FROM combined + GROUP BY profile_run_id, schema_name, table_name +) +-- Insert tests for selected tables +INSERT INTO test_definitions ( + table_groups_id, test_suite_id, test_type, + schema_name, table_name, groupby_names, + test_active, last_auto_gen_date, profiling_as_of_date, profile_run_id, + history_calculation, history_lookback, custom_query +) +SELECT + :TABLE_GROUPS_ID ::UUID AS table_groups_id, + :TEST_SUITE_ID ::UUID AS test_suite_id, + 'Freshness_Trend' AS test_type, + s.schema_name, + s.table_name, + s.column_names AS groupby_names, + 'Y' AS test_active, + :RUN_DATE ::TIMESTAMP AS last_auto_gen_date, + :AS_OF_DATE ::TIMESTAMP AS profiling_as_of_date, + s.profile_run_id, + 'PREDICT' AS history_calculation, + NULL AS history_lookback, + s.fingerprint AS custom_query +FROM selected_tables s + -- Only insert if test type is active +WHERE EXISTS (SELECT 1 FROM test_types WHERE test_type = 'Freshness_Trend' AND active = 'Y') + -- Only insert if test type is included in generation set + AND EXISTS (SELECT 1 FROM generation_sets WHERE test_type = 'Freshness_Trend' AND generation_set = :GENERATION_SET) + +-- Match "uix_td_autogen_table" unique index exactly +ON CONFLICT (test_suite_id, test_type, schema_name, table_name) +WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NOT NULL + AND column_name IS NULL + +-- Update tests if they already exist +DO UPDATE SET + groupby_names = EXCLUDED.groupby_names, + test_active = EXCLUDED.test_active, + last_auto_gen_date = 
EXCLUDED.last_auto_gen_date, + profiling_as_of_date = EXCLUDED.profiling_as_of_date, + profile_run_id = EXCLUDED.profile_run_id, + history_calculation = EXCLUDED.history_calculation, + history_lookback = EXCLUDED.history_lookback, + custom_query = EXCLUDED.custom_query +-- Ignore locked tests +WHERE test_definitions.lock_refresh = 'N' + -- Don't update existing tests in "insert" mode + AND NOT COALESCE(:INSERT_ONLY, FALSE); diff --git a/testgen/template/flavors/sap_hana/gen_query_tests/gen_Table_Freshness.sql b/testgen/template/flavors/sap_hana/gen_query_tests/gen_Table_Freshness.sql new file mode 100644 index 00000000..d3cc765d --- /dev/null +++ b/testgen/template/flavors/sap_hana/gen_query_tests/gen_Table_Freshness.sql @@ -0,0 +1,181 @@ +WITH latest_run AS ( + -- Latest complete profiling run before as-of-date + SELECT MAX(run_date) AS last_run_date + FROM profile_results + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID + AND run_date::DATE <= :AS_OF_DATE ::DATE +), +latest_results AS ( + -- Column results for latest run + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, + distinct_value_ct, record_ct, null_value_ct, + max_value, min_value, avg_value, stdev_value + FROM profile_results p + INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + WHERE table_groups_id = :TABLE_GROUPS_ID ::UUID +), +-- IDs - TOP 2 +id_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1 + WHEN functional_data_type = 'ID-Secondary' THEN 2 + ELSE 3 + END, distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'ID%' +), +-- Process Date - TOP 1 +process_date_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + 
functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY + CASE + WHEN column_name ILIKE '%mod%' THEN 1 + WHEN column_name ILIKE '%up%' THEN 1 + WHEN column_name ILIKE '%cr%' THEN 2 + WHEN column_name ILIKE '%in%' THEN 2 + END, distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'process%' +), +-- Transaction Date - TOP 1 +tran_date_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, distinct_value_ct, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY distinct_value_ct DESC, column_name + ) AS rank + FROM latest_results + WHERE general_type IN ('A', 'D', 'N') + AND functional_data_type ILIKE 'transactional date%' + OR functional_data_type ILIKE 'period%' + OR functional_data_type = 'timestamp' +), +-- Numeric Measures +numeric_cols AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + functional_data_type, general_type, + -- Weighted score + ( + 0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) + + 0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) + + 0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) + + 0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1))) + ) AS change_detection_score + FROM latest_results + WHERE general_type = 'N' + AND ( + functional_data_type ILIKE 'Measure%' + OR functional_data_type IN ('Sequence', 'Constant') + ) +), +numeric_cols_ranked AS ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY schema_name, table_name + ORDER BY change_detection_score DESC, column_name + ) AS rank + FROM numeric_cols + WHERE change_detection_score IS NOT NULL +), +combined AS ( + SELECT profile_run_id, schema_name, table_name, column_name, + 'ID' AS element_type, 
general_type, 10 + rank AS fingerprint_order + FROM id_cols + WHERE rank <= 2 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order + FROM process_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order + FROM tran_date_cols + WHERE rank = 1 + UNION ALL + SELECT profile_run_id, schema_name, table_name, column_name, + 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order + FROM numeric_cols_ranked + WHERE rank = 1 +), +selected_tables AS ( + SELECT profile_run_id, schema_name, table_name, + 'TO_VARCHAR(COUNT(*)) || ''|'' || ' || + STRING_AGG( + REPLACE( + CASE + WHEN general_type = 'D' THEN 'TO_VARCHAR(MIN(@@@)) || ''|'' || TO_VARCHAR(MAX(@@@)) || ''|'' || TO_VARCHAR(COUNT(DISTINCT @@@))' + WHEN general_type = 'A' THEN 'TO_VARCHAR(MIN(@@@)) || ''|'' || TO_VARCHAR(MAX(@@@)) || ''|'' || TO_VARCHAR(COUNT(DISTINCT @@@)) || ''|'' || TO_VARCHAR(SUM(LENGTH(@@@)))' + WHEN general_type = 'N' THEN 'TO_VARCHAR(COUNT(@@@)) || ''|'' || + TO_VARCHAR(COUNT(DISTINCT MOD(CAST(CAST(COALESCE(@@@,0) AS DECIMAL(38,6)) * 1000000 AS DECIMAL(38,0)), 1000003))) || ''|'' || + COALESCE(TO_VARCHAR(CAST(MIN(@@@) AS DECIMAL(38,6))), '''') || ''|'' || + COALESCE(TO_VARCHAR(CAST(MAX(@@@) AS DECIMAL(38,6))), '''') || ''|'' || + COALESCE(TO_VARCHAR(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) * 1000000 AS DECIMAL), 1000000007)), 0), 1000000007)), '''') || ''|'' || + COALESCE(TO_VARCHAR(MOD(COALESCE(SUM(MOD(CAST(ABS(COALESCE(@@@,0)) * 1000000 AS DECIMAL), 1000000009)), 0), 1000000009)), '''')' + END, + '@@@', '"' || column_name || '"' + ), + ' || ''|'' || ' + ORDER BY element_type, fingerprint_order, column_name + ) AS fingerprint + FROM combined + GROUP BY profile_run_id, schema_name, table_name +) +-- Insert tests for selected tables +INSERT INTO test_definitions ( + 
table_groups_id, test_suite_id, test_type, + schema_name, table_name, + test_active, last_auto_gen_date, profiling_as_of_date, profile_run_id, + history_calculation, history_lookback, custom_query +) +SELECT + :TABLE_GROUPS_ID ::UUID AS table_groups_id, + :TEST_SUITE_ID ::UUID AS test_suite_id, + 'Table_Freshness' AS test_type, + s.schema_name, + s.table_name, + 'Y' AS test_active, + :RUN_DATE ::TIMESTAMP AS last_auto_gen_date, + :AS_OF_DATE ::TIMESTAMP AS profiling_as_of_date, + s.profile_run_id, + 'Value' AS history_calculation, + 1 AS history_lookback, + s.fingerprint AS custom_query +FROM selected_tables s + -- Only insert if test type is active +WHERE EXISTS (SELECT 1 FROM test_types WHERE test_type = 'Table_Freshness' AND active = 'Y') + -- Only insert if test type is included in generation set + AND EXISTS (SELECT 1 FROM generation_sets WHERE test_type = 'Table_Freshness' AND generation_set = :GENERATION_SET) + +-- Match "uix_td_autogen_table" unique index exactly +ON CONFLICT (test_suite_id, test_type, schema_name, table_name) +WHERE last_auto_gen_date IS NOT NULL + AND table_name IS NOT NULL + AND column_name IS NULL + +-- Update tests if they already exist +DO UPDATE SET + test_active = EXCLUDED.test_active, + last_auto_gen_date = EXCLUDED.last_auto_gen_date, + profiling_as_of_date = EXCLUDED.profiling_as_of_date, + profile_run_id = EXCLUDED.profile_run_id, + history_calculation = EXCLUDED.history_calculation, + history_lookback = EXCLUDED.history_lookback, + custom_query = EXCLUDED.custom_query +-- Ignore locked tests +WHERE test_definitions.lock_refresh = 'N'; diff --git a/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql b/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql new file mode 100644 index 00000000..8bfac838 --- /dev/null +++ b/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql @@ -0,0 +1,251 @@ +-- TG-IF do_sample +WITH target_table AS ( + SELECT * FROM 
"{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI({SAMPLE_PERCENT_CALC}) +) +-- TG-ELSE +WITH target_table AS ( + SELECT * FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +) +-- TG-ENDIF +SELECT + {CONNECTION_ID} AS connection_id, + '{PROJECT_CODE}' AS project_code, + '{TABLE_GROUPS_ID}' AS table_groups_id, + '{DATA_SCHEMA}' AS schema_name, + '{RUN_DATE}' AS run_date, + '{DATA_TABLE}' AS table_name, + {COL_POS} AS position, + '{COL_NAME_SANITIZED}' AS column_name, + '{COL_TYPE}' AS column_type, + '{DB_DATA_TYPE}' AS db_data_type, + '{COL_GEN_TYPE}' AS general_type, + COUNT(*) AS record_ct, +-- TG-IF is_type_X + COUNT(CASE WHEN "{COL_NAME}" IS NOT NULL THEN 1 END) AS value_ct, + NULL AS distinct_value_ct, +-- TG-ELSE + COUNT("{COL_NAME}") AS value_ct, + COUNT(DISTINCT "{COL_NAME}") AS distinct_value_ct, +-- TG-ENDIF + SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, +-- TG-IF is_type_ADN + MIN(LENGTH(TO_VARCHAR("{COL_NAME}"))) AS min_length, + MAX(LENGTH(TO_VARCHAR("{COL_NAME}"))) AS max_length, + AVG(NULLIF(LENGTH(TO_VARCHAR("{COL_NAME}")), 0)) AS avg_length, +-- TG-ELSE + NULL AS min_length, + NULL AS max_length, + NULL AS avg_length, +-- TG-ENDIF +-- TG-IF is_type_A + SUM(CASE + WHEN TRIM("{COL_NAME}") LIKE_REGEXPR '^0(\.0*)?$' THEN 1 ELSE 0 + END) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_N + SUM(1 - ABS(SIGN("{COL_NAME}"))) AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_not_A_not_N + NULL AS zero_value_ct, +-- TG-ENDIF +-- TG-IF is_type_A + COUNT(DISTINCT UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COL_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', ''))) AS distinct_std_value_ct, + 0 AS zero_length_ct, + SUM(CASE + WHEN "{COL_NAME}" BETWEEN ' !' AND '!' 
THEN 1 + ELSE 0 + END) AS lead_space_ct, + SUM(CASE WHEN "{COL_NAME}" LIKE '"%"' OR "{COL_NAME}" LIKE '''%''' THEN 1 ELSE 0 END) AS quoted_value_ct, + SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '[0-9]' THEN 1 ELSE 0 END) AS includes_digit_ct, + SUM(CASE + WHEN LOWER("{COL_NAME}") LIKE_REGEXPR '^(\.{1,}|-{1,}|\?{1,}|[[:space:]]{1,}|0{2,}|9{2,}|x{2,}|z{2,})$' THEN 1 + WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd', + 'n/a','#na','none','null','unknown') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('(blank)','(error)','(missing)','(tbd)', + '(n/a)','(#na)','(none)','(null)','(unknown)') THEN 1 + WHEN LOWER("{COL_NAME}") IN ('[blank]','[error]','[missing]','[tbd]', + '[n/a]','[#na]','[none]','[null]','[unknown]') THEN 1 + ELSE 0 + END) AS filled_value_ct, + SUBSTR(MIN(CASE WHEN "{COL_NAME}" IS NOT NULL THEN "{COL_NAME}" END), 1, 100) AS min_text, + SUBSTR(MAX(CASE WHEN "{COL_NAME}" IS NOT NULL THEN "{COL_NAME}" END), 1, 100) AS max_text, + SUM(CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" <> LOWER("{COL_NAME}") THEN 1 ELSE 0 END) AS upper_case_ct, + SUM(CASE WHEN "{COL_NAME}" = LOWER("{COL_NAME}") AND "{COL_NAME}" <> UPPER("{COL_NAME}") THEN 1 ELSE 0 END) AS lower_case_ct, + SUM(CASE WHEN "{COL_NAME}" = UPPER("{COL_NAME}") AND "{COL_NAME}" = LOWER("{COL_NAME}") THEN 1 ELSE 0 END) AS non_alpha_ct, + COUNT(CASE WHEN + REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COL_NAME}", + NCHAR(160), ''), NCHAR(8201), ''), NCHAR(8203), ''), NCHAR(8204), ''), NCHAR(8205), ''), + NCHAR(8206), ''), NCHAR(8207), ''), NCHAR(8239), ''), NCHAR(12288), ''), NCHAR(65279), '') + <> "{COL_NAME}" THEN 1 END) AS non_printing_ct, + SUM(<%IS_NUM;SUBSTR("{COL_NAME}", 1, 31)%>) AS numeric_ct, + SUM(<%IS_DATE;SUBSTR("{COL_NAME}", 1, 26)%>) AS date_ct, + CASE + WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR 
'^[0-9]{1,5}[a-zA-Z]?[[:space:]][[:alnum:]_]{1,5}\.?[[:space:]]?[[:alnum:]_]*[[:space:]]?[[:alnum:]_]*[[:space:]][a-zA-Z]{1,6}\.?[[:space:]]?[0-9]{0,5}[A-Z]?$' + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'STREET_ADDR' + WHEN SUM(CASE WHEN "{COL_NAME}" IN ('AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','AE','AP','AA') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'STATE_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^(\+1|1)?[ .-]?(\([2-9][0-9]{2}\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'PHONE_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'EMAIL' + WHEN SUM(CASE WHEN REPLACE_REGEXPR('[0-9]' IN "{COL_NAME}" WITH '9') IN ('99999', '999999999', '99999-9999') + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'ZIP_USA' + WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^[[:alnum:]_[:space:]-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$' + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'FILE_NAME' + WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^([0-9]{4}[- ]){3}[0-9]{4}$' + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'CREDIT_CARD' + WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^([^,|' || CHAR(9) || ']{1,20}[,|' || CHAR(9) || ']){2,}[^,|' || CHAR(9) || ']{0,20}([,|' || CHAR(9) || ']?[^,|' || CHAR(9) || ']{0,20})*$' + AND NOT "{COL_NAME}" LIKE_REGEXPR '[[:space:]](and|but|or|yet)[[:space:]]' + THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'DELIMITED_DATA' + WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR 
'^[0-8][0-9]{2}-[0-9]{2}-[0-9]{4}$' + AND SUBSTR("{COL_NAME}", 1, 3) NOT BETWEEN '734' AND '749' + AND SUBSTR("{COL_NAME}", 1, 3) <> '666' THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'SSN' + END AS std_pattern_match, +-- TG-ELSE + NULL AS distinct_std_value_ct, + NULL AS zero_length_ct, + NULL AS lead_space_ct, + NULL AS quoted_value_ct, + NULL AS includes_digit_ct, + NULL AS filled_value_ct, + NULL AS min_text, + NULL AS max_text, + NULL AS upper_case_ct, + NULL AS lower_case_ct, + NULL AS non_alpha_ct, + NULL AS non_printing_ct, + NULL AS numeric_ct, + NULL AS date_ct, + NULL AS std_pattern_match, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT SUBSTR(STRING_AGG(formatted_pattern, ' | ' ORDER BY ct DESC), 1, 1000) + FROM ( + SELECT TO_VARCHAR(COUNT(*)) || ' | ' || pattern AS formatted_pattern, + COUNT(*) AS ct + FROM (SELECT REPLACE_REGEXPR('[0-9]' IN REPLACE_REGEXPR('[A-Z]' IN REPLACE_REGEXPR('[a-z]' IN + "{COL_NAME}" WITH 'a') WITH 'A') WITH 'N') AS pattern + FROM target_table + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ' AND (SELECT MAX(LENGTH("{COL_NAME}")) + FROM target_table) BETWEEN 3 and {MAX_PATTERN_LENGTH}) p + GROUP BY pattern + HAVING pattern > ' ' + ORDER BY COUNT(*) DESC + LIMIT 5 + ) ps) AS top_patterns, +-- TG-ELSE + NULL AS top_patterns, +-- TG-ENDIF +-- TG-IF is_type_N + MIN("{COL_NAME}") AS min_value, + MIN(CASE WHEN "{COL_NAME}" > 0 THEN "{COL_NAME}" ELSE NULL END) AS min_value_over_0, + MAX("{COL_NAME}") AS max_value, + AVG(CAST("{COL_NAME}" AS DECIMAL)) AS avg_value, + STDDEV(CAST("{COL_NAME}" AS DECIMAL)) AS stdev_value, + MIN(pct_25) AS percentile_25, + MIN(pct_50) AS percentile_50, + MIN(pct_75) AS percentile_75, +-- TG-ELSE + NULL AS min_value, + NULL AS min_value_over_0, + NULL AS max_value, + NULL AS avg_value, + NULL AS stdev_value, + NULL AS percentile_25, + NULL AS percentile_50, + NULL AS percentile_75, +-- TG-ENDIF +-- TG-IF is_N_decimal + SUM(ROUND(ABS(MOD("{COL_NAME}", 1)), 5)) AS fractional_sum, +-- 
TG-ELSE + NULL AS fractional_sum, +-- TG-ENDIF +-- TG-IF is_type_D + CASE + WHEN MIN("{COL_NAME}") IS NULL THEN NULL + ELSE GREATEST(MIN("{COL_NAME}"), TO_DATE('0001-01-01', 'YYYY-MM-DD')) + END AS min_date, + MAX("{COL_NAME}") AS max_date, + SUM(CASE + WHEN <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > 12 THEN 1 + ELSE 0 + END) AS before_1yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > 60 THEN 1 + ELSE 0 + END) AS before_5yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > 240 THEN 1 + ELSE 0 + END) AS before_20yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > 1200 THEN 1 + ELSE 0 + END) AS before_100yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_DAY;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> BETWEEN 0 AND 365 THEN 1 + ELSE 0 + END) AS within_1yr_date_ct, + SUM(CASE + WHEN <%DATEDIFF_DAY;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> BETWEEN 0 AND 30 THEN 1 + ELSE 0 + END) AS within_1mo_date_ct, + SUM(CASE + WHEN "{COL_NAME}" > TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 + END) AS future_date_ct, + SUM(CASE + WHEN <%DATEDIFF_MONTH;TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS');"{COL_NAME}"%> > 240 THEN 1 + ELSE 0 + END) AS distant_future_date_ct, + COUNT(DISTINCT <%DATEDIFF_DAY;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) AS date_days_present, + COUNT(DISTINCT <%DATEDIFF_WEEK;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) AS date_weeks_present, + COUNT(DISTINCT <%DATEDIFF_MONTH;"{COL_NAME}";TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')%>) AS date_months_present, +-- TG-ELSE + NULL AS min_date, + NULL AS max_date, + NULL AS before_1yr_date_ct, + NULL AS before_5yr_date_ct, + NULL AS before_20yr_date_ct, + NULL AS before_100yr_date_ct, + NULL AS within_1yr_date_ct, + NULL 
AS within_1mo_date_ct, + NULL AS future_date_ct, + NULL AS distant_future_date_ct, + NULL AS date_days_present, + NULL AS date_weeks_present, + NULL AS date_months_present, +-- TG-ENDIF +-- TG-IF is_type_B + SUM(CAST("{COL_NAME}" AS INTEGER)) AS boolean_true_ct, +-- TG-ELSE + NULL AS boolean_true_ct, +-- TG-ENDIF +-- TG-IF is_type_A + (SELECT COUNT(DISTINCT REPLACE_REGEXPR('[0-9]' IN REPLACE_REGEXPR('[A-Z]' IN REPLACE_REGEXPR('[a-z]' IN + "{COL_NAME}" WITH 'a') WITH 'A') WITH 'N') + ) + FROM target_table + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ') AS distinct_pattern_ct, + SUM(SIGN(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REPLACE(TRIM("{COL_NAME}"), ' ', '')))) AS embedded_space_ct, + AVG(LENGTH(TRIM("{COL_NAME}")) - LENGTH(REPLACE(TRIM("{COL_NAME}"), ' ', ''))) AS avg_embedded_spaces, +-- TG-ELSE + NULL AS distinct_pattern_ct, + NULL AS embedded_space_ct, + NULL AS avg_embedded_spaces, +-- TG-ENDIF + '{PROFILE_RUN_ID}' AS profile_run_id + FROM target_table +-- TG-IF is_N_sampling + , (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" TABLESAMPLE BERNOULLI({SAMPLE_PERCENT_CALC}) LIMIT 1000000) pctile +-- TG-ENDIF +-- TG-IF is_N_no_sampling + , (SELECT + PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_25, + PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_50, + PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY "{COL_NAME}") AS pct_75 + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" LIMIT 1000000) pctile +-- TG-ENDIF diff --git a/testgen/template/flavors/sap_hana/profiling/project_secondary_profiling_query.sql b/testgen/template/flavors/sap_hana/profiling/project_secondary_profiling_query.sql new file mode 100644 index 00000000..c2593cb4 --- /dev/null +++ 
b/testgen/template/flavors/sap_hana/profiling/project_secondary_profiling_query.sql @@ -0,0 +1,36 @@ +-- Get Freqs for selected columns +WITH ranked_vals AS ( + SELECT "{COL_NAME}", + COUNT(*) AS ct, + ROW_NUMBER() OVER (ORDER BY COUNT(*) DESC, "{COL_NAME}") AS rn + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + TABLESAMPLE BERNOULLI({SAMPLE_PERCENT_CALC}) +-- TG-ENDIF + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ' + GROUP BY "{COL_NAME}" +), +consol_vals AS ( + SELECT COALESCE(CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || TO_VARCHAR(ct) + ELSE NULL + END, '| Other Values (' || TO_VARCHAR(COUNT(DISTINCT "{COL_NAME}")) || ') | ' || TO_VARCHAR(SUM(ct))) AS val, + MIN(rn) as min_rn + FROM ranked_vals + GROUP BY CASE WHEN rn <= 10 THEN '| ' || "{COL_NAME}" || ' | ' || TO_VARCHAR(ct) + ELSE NULL + END +) +SELECT '{PROJECT_CODE}' as project_code, + '{DATA_SCHEMA}' as schema_name, + '{RUN_DATE}' as run_date, + '{DATA_TABLE}' as table_name, + '{COL_NAME}' as column_name, + REPLACE(STRING_AGG(val, '^#^' ORDER BY min_rn), '^#^', CHAR(10)) AS top_freq_values, + (SELECT LOWER(BINTOHEX(HASH_MD5(TO_BINARY(STRING_AGG("{COL_NAME}", '|' ORDER BY "{COL_NAME}"))))) + FROM (SELECT DISTINCT "{COL_NAME}" + FROM "{DATA_SCHEMA}"."{DATA_TABLE}" +-- TG-IF do_sample_bool + TABLESAMPLE BERNOULLI({SAMPLE_PERCENT_CALC}) +-- TG-ENDIF + WHERE "{COL_NAME}" IS NOT NULL AND "{COL_NAME}" > ' ')) as distinct_value_hash + FROM consol_vals diff --git a/testgen/template/flavors/sap_hana/profiling/templated_functions.yaml b/testgen/template/flavors/sap_hana/profiling/templated_functions.yaml new file mode 100644 index 00000000..dbc2c73f --- /dev/null +++ b/testgen/template/flavors/sap_hana/profiling/templated_functions.yaml @@ -0,0 +1,108 @@ +DATEDIFF_DAY: DAYS_BETWEEN(CAST({$1} AS DATE), CAST({$2} AS DATE)) + +DATEDIFF_WEEK: (DAYS_BETWEEN(ADD_DAYS(CAST({$1} AS DATE), -WEEKDAY(CAST({$1} AS DATE))), ADD_DAYS(CAST({$2} AS DATE), -WEEKDAY(CAST({$2} AS DATE))))) / 7 + 
+DATEDIFF_MONTH: ((YEAR({$2}) * 12 + MONTH({$2})) - (YEAR({$1}) * 12 + MONTH({$1}))) + +DATEDIFF_QUARTER: FLOOR(((YEAR({$2}) * 12 + MONTH({$2})) - (YEAR({$1}) * 12 + MONTH({$1}))) / 3) + +DATEDIFF_YEAR: YEAR({$2}) - YEAR({$1}) + +IS_NUM: CASE + WHEN {$1} LIKE_REGEXPR '^[[:space:]]*[+-]?\$?[[:space:]]*[0-9]+(,[0-9]{3})*(\.[0-9]*)?[%]?[[:space:]]*$' THEN 1 + ELSE 0 + END + +IS_DATE: CASE + /* YYYY-MM-DD HH:MM:SS SSSSSS or YYYY-MM-DD HH:MM:SS */ + WHEN {$1} LIKE_REGEXPR '^([0-9]{4})-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])[[:space:]](2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])([[:space:]][0-9]{6})?$' + THEN CASE + WHEN TO_INTEGER(SUBSTR({$1}, 1, 4)) BETWEEN 1800 AND 2200 + AND ( + ( SUBSTR({$1}, 6, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND TO_INTEGER(SUBSTR({$1}, 9, 2)) BETWEEN 1 AND 31 ) + OR ( SUBSTR({$1}, 6, 2) IN ('04', '06', '09') + AND TO_INTEGER(SUBSTR({$1}, 9, 2)) BETWEEN 1 AND 30 ) + OR ( SUBSTR({$1}, 6, 2) = '02' + AND TO_INTEGER(SUBSTR({$1}, 9, 2)) BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* YYYYMMDDHHMMSSSSSS or YYYYMMDDHHMM */ + WHEN {$1} LIKE_REGEXPR '^([0-9]{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])([0-5][0-9])([0-5][0-9])([0-9]{6})$' + OR {$1} LIKE_REGEXPR '^([0-9]{4})(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])(2[0-3]|[01][0-9])$' + THEN CASE + WHEN TO_INTEGER(SUBSTR({$1}, 1, 4)) BETWEEN 1800 AND 2200 + AND ( + ( SUBSTR({$1}, 5, 2) IN ('01', '03', '05', '07', '08', + '10', '12') + AND TO_INTEGER(SUBSTR({$1}, 7, 2)) BETWEEN 1 AND 31 ) + OR ( SUBSTR({$1}, 5, 2) IN ('04', '06', '09') + AND TO_INTEGER(SUBSTR({$1}, 7, 2)) BETWEEN 1 AND 30 ) + OR ( SUBSTR({$1}, 5, 2) = '02' + AND TO_INTEGER(SUBSTR({$1}, 7, 2)) BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* Exclude anything else long */ + WHEN LENGTH({$1}) > 11 THEN 0 + /* YYYY-MMM/MM-DD */ + WHEN REPLACE_REGEXPR('(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)' IN UPPER({$1}) WITH '12') LIKE_REGEXPR + '[12][09][0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9]' + THEN CASE + WHEN 
TO_INTEGER(SUBSTR_REGEXPR('^[^-]+' IN {$1})) BETWEEN 1800 AND 2200 + AND ( + ( UPPER(SUBSTR_REGEXPR('[^-]+' IN {$1} OCCURRENCE 2)) IN ('01', '03', '05', '07', '08', + '1', '3', '5', '7', '8', '10', '12', + 'JAN', 'MAR', 'MAY', 'JUL', 'AUG', + 'OCT', 'DEC') + AND TO_INTEGER(SUBSTR_REGEXPR('[^-]+$' IN {$1})) BETWEEN 1 AND 31 ) + OR ( UPPER(SUBSTR_REGEXPR('[^-]+' IN {$1} OCCURRENCE 2)) IN ('04', '06', '09', '4', '6', '9', '11', + 'APR', 'JUN', 'SEP', 'NOV') + AND TO_INTEGER(SUBSTR_REGEXPR('[^-]+$' IN {$1})) BETWEEN 1 AND 30 ) + OR ( UPPER(SUBSTR_REGEXPR('[^-]+' IN {$1} OCCURRENCE 2)) IN ('02', '2', 'FEB') + AND TO_INTEGER(SUBSTR_REGEXPR('[^-]+$' IN {$1})) BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + /* MM/-DD/-YY/YYYY */ + WHEN REPLACE({$1}, '-', '/') LIKE_REGEXPR '^[0-1]?[0-9]/[0-3]?[0-9]/[12][09][0-9][0-9]$' + OR REPLACE({$1}, '-', '/') LIKE_REGEXPR '^[0-1]?[0-9]/[0-3]?[0-9]/[0-9][0-9]$' + THEN + CASE + WHEN TO_INTEGER(SUBSTR_REGEXPR('[^/]+' IN REPLACE({$1}, '-', '/') OCCURRENCE 1)) BETWEEN 1 AND 12 + AND ( + ( TO_INTEGER(SUBSTR_REGEXPR('[^/]+' IN REPLACE({$1}, '-', '/') OCCURRENCE 1)) IN (1, 3, 5, 7, 8, 10, 12) + AND TO_INTEGER(SUBSTR_REGEXPR('[^/]+' IN REPLACE({$1}, '-', '/') OCCURRENCE 2)) BETWEEN 1 AND 31 ) + OR ( TO_INTEGER(SUBSTR_REGEXPR('[^/]+' IN REPLACE({$1}, '-', '/') OCCURRENCE 1)) IN (4, 6, 9, 11) + AND TO_INTEGER(SUBSTR_REGEXPR('[^/]+' IN REPLACE({$1}, '-', '/') OCCURRENCE 2)) BETWEEN 1 AND 30 ) + OR ( TO_INTEGER(SUBSTR_REGEXPR('[^/]+' IN REPLACE({$1}, '-', '/') OCCURRENCE 1)) = 2 + AND TO_INTEGER(SUBSTR_REGEXPR('[^/]+' IN REPLACE({$1}, '-', '/') OCCURRENCE 2)) BETWEEN 1 AND 29) + ) + AND + TO_INTEGER('20' || SUBSTR(SUBSTR_REGEXPR('[^/]+$' IN REPLACE({$1}, '-', '/')), -2)) BETWEEN 1800 AND 2200 + THEN 1 + ELSE 0 + END + /* DD-MMM-YYYY */ + WHEN UPPER({$1}) LIKE_REGEXPR '[0-3]?[0-9]-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-[12][09][0-9][0-9]' + THEN + CASE + WHEN TO_INTEGER(SUBSTR_REGEXPR('[^-]+$' IN {$1})) BETWEEN 1800 AND 2200 + AND ( + ( 
UPPER(SUBSTR_REGEXPR('[^-]+' IN {$1} OCCURRENCE 2)) IN ('JAN', 'MAR', 'MAY', 'JUL', 'AUG', 'OCT', 'DEC') + AND TO_INTEGER(SUBSTR_REGEXPR('^[^-]+' IN {$1})) BETWEEN 1 AND 31 ) + OR ( UPPER(SUBSTR_REGEXPR('[^-]+' IN {$1} OCCURRENCE 2)) IN ('APR', 'JUN', 'SEP', 'NOV') + AND TO_INTEGER(SUBSTR_REGEXPR('^[^-]+' IN {$1})) BETWEEN 1 AND 30 ) + OR ( UPPER(SUBSTR_REGEXPR('[^-]+' IN {$1} OCCURRENCE 2)) = 'FEB' + AND TO_INTEGER(SUBSTR_REGEXPR('^[^-]+' IN {$1})) BETWEEN 1 AND 29) + ) + THEN 1 + ELSE 0 + END + ELSE 0 + END diff --git a/testgen/template/flavors/sap_hana/validate_tests/get_target_identifiers.sql b/testgen/template/flavors/sap_hana/validate_tests/get_target_identifiers.sql new file mode 100644 index 00000000..e75b8fc9 --- /dev/null +++ b/testgen/template/flavors/sap_hana/validate_tests/get_target_identifiers.sql @@ -0,0 +1,5 @@ +SELECT SCHEMA_NAME AS schema_name, + TABLE_NAME AS table_name, + COLUMN_NAME AS column_name +FROM SYS.TABLE_COLUMNS +WHERE SCHEMA_NAME IN ({TEST_SCHEMAS}) diff --git a/testgen/ui/assets/flavors/sap_hana.svg b/testgen/ui/assets/flavors/sap_hana.svg new file mode 100644 index 00000000..446764d2 --- /dev/null +++ b/testgen/ui/assets/flavors/sap_hana.svg @@ -0,0 +1,66 @@ + + + + + + + + + + + + + + diff --git a/testgen/ui/components/frontend/js/components/connection_form.js b/testgen/ui/components/frontend/js/components/connection_form.js index 1ccf7867..53100d97 100644 --- a/testgen/ui/components/frontend/js/components/connection_form.js +++ b/testgen/ui/components/frontend/js/components/connection_form.js @@ -84,6 +84,7 @@ const defaultPorts = { snowflake: '443', databricks: '443', oracle: '1521', + sap_hana: '39015', }; /** @@ -246,6 +247,16 @@ const ConnectionForm = (props, saveButton) => { dynamicConnectionUrl, { dbNameLabel: 'Service Name' }, ), + sap_hana: () => OracleForm( + updatedConnection, + getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), + (formValue, isValid) => { + updatedConnection.val = 
{...updatedConnection.val, ...formValue}; + setFieldValidity('sap_hana_form', isValid); + }, + connection, + dynamicConnectionUrl, + ), bigquery: () => BigqueryForm( updatedConnection, getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), diff --git a/testgen/ui/static/js/components/connection_form.js b/testgen/ui/static/js/components/connection_form.js index 1ccf7867..53100d97 100644 --- a/testgen/ui/static/js/components/connection_form.js +++ b/testgen/ui/static/js/components/connection_form.js @@ -84,6 +84,7 @@ const defaultPorts = { snowflake: '443', databricks: '443', oracle: '1521', + sap_hana: '39015', }; /** @@ -246,6 +247,16 @@ const ConnectionForm = (props, saveButton) => { dynamicConnectionUrl, { dbNameLabel: 'Service Name' }, ), + sap_hana: () => OracleForm( + updatedConnection, + getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), + (formValue, isValid) => { + updatedConnection.val = {...updatedConnection.val, ...formValue}; + setFieldValidity('sap_hana_form', isValid); + }, + connection, + dynamicConnectionUrl, + ), bigquery: () => BigqueryForm( updatedConnection, getValue(props.flavors).find(f => f.value === connectionFlavor.rawVal), diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index 670cac38..47a3d068 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -613,6 +613,12 @@ class ConnectionFlavor: flavor="postgresql", icon=get_asset_data_url("flavors/postgresql.svg"), ), + ConnectionFlavor( + label="SAP HANA", + value="sap_hana", + flavor="sap_hana", + icon=get_asset_data_url("flavors/sap_hana.svg"), + ), ConnectionFlavor( label="Snowflake", value="snowflake", From 84e8eaf602fb5179c265f1f1338db6b17bee65a6 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 5 Mar 2026 00:50:15 -0500 Subject: [PATCH 40/95] misc: upgrade libraries --- pyproject.toml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pyproject.toml 
b/pyproject.toml index 6033df77..6d52c874 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,10 +33,10 @@ dependencies = [ "sqlalchemy==1.4.46", "databricks-sql-connector==2.9.3", "databricks-sdk>=0.20.0", - "snowflake-sqlalchemy==1.6.1", + "snowflake-sqlalchemy==1.9.0", "sqlalchemy-bigquery==1.14.1", "oracledb==3.4.0", - "hdbcli==2.23.26", + "hdbcli==2.25.31", "sqlalchemy-hana==2.1.0", "pyodbc==5.0.0", "psycopg2-binary==2.9.9", @@ -57,7 +57,7 @@ dependencies = [ "xlsxwriter==3.2.0", "psutil==5.9.8", "concurrent_log_handler==0.9.25", - "cryptography==44.0.1", + "cryptography==46.0.5", "validators==0.33.0", "reportlab==4.2.2", "cron-converter==1.2.1", @@ -72,12 +72,18 @@ dependencies = [ "matplotlib==3.9.2", "scipy==1.14.1", "jinja2==3.1.6", + "pillow==12.1.1", + "protobuf==6.33.5", # MCP server "mcp[cli]==1.26.0", "uvicorn==0.41.0", "PyJWT==2.11.0", "bcrypt==5.0.0", + + # API & OAuth server + "authlib~=1.6.6", + "fastapi==0.115.12", ] [project.optional-dependencies] From 24e6299e7834b540b87c9cbb2ee1608a3187db8c Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Mon, 9 Mar 2026 15:19:01 -0400 Subject: [PATCH 41/95] fix(edit-monitors): bugs in form - validate required fields --- testgen/common/models/test_definition.py | 2 + .../030_initialize_new_schema_structure.sql | 1 + .../test_types_Freshness_Trend.yaml | 1 + .../test_types_Metric_Trend.yaml | 1 + .../test_types_Volume_Trend.yaml | 1 + .../dbupgrade/0177_incremental_upgrade.sql | 3 ++ .../js/components/test_definition_form.js | 38 ++++++++++++--- .../frontend/js/components/textarea.js | 48 ++++++++++++++++--- .../frontend/js/pages/edit_table_monitors.js | 28 +++++++---- .../js/components/test_definition_form.js | 34 +++++++++++-- testgen/ui/static/js/components/textarea.js | 48 ++++++++++++++++--- 11 files changed, 173 insertions(+), 32 deletions(-) create mode 100644 testgen/template/dbupgrade/0177_incremental_upgrade.sql diff --git a/testgen/common/models/test_definition.py 
b/testgen/common/models/test_definition.py index ad6f0bcd..e3c71309 100644 --- a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -43,6 +43,7 @@ class TestTypeSummary(EntityMinimal): default_parm_columns: str default_parm_prompts: str default_parm_help: str + default_parm_required: str default_severity: str test_scope: TestScope usage_notes: str @@ -145,6 +146,7 @@ class TestType(Entity): default_parm_values: str = Column(Text) default_parm_prompts: str = Column(Text) default_parm_help: str = Column(Text) + default_parm_required: str = Column(Text) default_severity: str = Column(String) run_type: TestRunType = Column(String) test_scope: TestScope = Column(String) diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 13e557f9..b5b1eefe 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -497,6 +497,7 @@ CREATE TABLE test_types ( default_parm_values TEXT, default_parm_prompts TEXT, default_parm_help TEXT, + default_parm_required TEXT, default_severity VARCHAR(10), run_type VARCHAR(10), test_scope VARCHAR, diff --git a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml index 3297b585..285037b0 100644 --- a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml @@ -24,6 +24,7 @@ test_types: Record Subset Condition,Lower Bound,Upper Bound,History Lookback default_parm_help: |- Condition defining a subset of records in main table + default_parm_required: N,N,N,N default_severity: Fail run_type: QUERY test_scope: table diff --git a/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml index 
89e18871..524d5135 100644 --- a/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Metric_Trend.yaml @@ -19,6 +19,7 @@ test_types: default_parm_values: null default_parm_prompts: Metric Name,Metric Expression,Lower Bound,Upper Bound,History Lookback default_parm_help: null + default_parm_required: Y,Y,N,N,N default_severity: Fail run_type: CAT test_scope: table diff --git a/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml index e14bf0c7..e748f130 100644 --- a/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Volume_Trend.yaml @@ -20,6 +20,7 @@ test_types: default_parm_values: null default_parm_prompts: Record Subset Condition,Lower Bound,Upper Bound,History Lookback default_parm_help: Condition defining a subset of records in main table + default_parm_required: N,N,N,N default_severity: Fail run_type: CAT test_scope: table diff --git a/testgen/template/dbupgrade/0177_incremental_upgrade.sql b/testgen/template/dbupgrade/0177_incremental_upgrade.sql new file mode 100644 index 00000000..16d6eeab --- /dev/null +++ b/testgen/template/dbupgrade/0177_incremental_upgrade.sql @@ -0,0 +1,3 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE test_types ADD COLUMN IF NOT EXISTS default_parm_required TEXT; diff --git a/testgen/ui/components/frontend/js/components/test_definition_form.js b/testgen/ui/components/frontend/js/components/test_definition_form.js index 31812f87..80962eee 100644 --- a/testgen/ui/components/frontend/js/components/test_definition_form.js +++ b/testgen/ui/components/frontend/js/components/test_definition_form.js @@ -51,6 +51,7 @@ * @property {string} default_parm_columns * @property {string} default_parm_prompts * @property {string} default_parm_help + * @property {string?} default_parm_required * @property {string} default_severity * 
@property {'column'|'referential'|'table'|'tablegroup'|'custom'} test_scope * @property {string?} prediction @@ -69,7 +70,7 @@ import { Select } from './select.js'; import { Textarea } from './textarea.js'; import { RadioGroup } from './radio_group.js'; import { Caption } from './caption.js'; -import { numberBetween } from '../form_validators.js'; +import { numberBetween, required } from '../form_validators.js'; const { div, span } = van.tags; @@ -97,6 +98,7 @@ const TestDefinitionForm = (/** @type Properties */ props) => { const paramColumns = (definition.default_parm_columns || '').split(',').map(v => v.trim()); const paramLabels = (definition.default_parm_prompts || '').split(',').map(v => v.trim()); const paramHelp = (definition.default_parm_help || '').split('|').map(v => v.trim()); + const paramRequired = (definition.default_parm_required || '').split(',').map(v => v.trim().toUpperCase() === 'Y'); const hasThresholds = paramColumns.includes('history_calculation'); const dynamicParamColumns = paramColumns @@ -105,6 +107,7 @@ const TestDefinitionForm = (/** @type Properties */ props) => { column, label: paramLabels[index] || column.replaceAll('_', ' '), help: paramHelp[index] || null, + validators: paramRequired[index] ? 
[required] : undefined, })) .filter(config => !hasThresholds || !thresholdColumns.includes(config.column)) @@ -171,6 +174,7 @@ const TestDefinitionForm = (/** @type Properties */ props) => { type: 'number', value: currentValue(), step: config.step, + validators: config.validators, onChange: (value, state) => { setFieldValues({ [column]: value || null }) setFieldValidity(column, state.valid); @@ -188,8 +192,10 @@ const TestDefinitionForm = (/** @type Properties */ props) => { help: config.help, value: currentValue(), height: 100, - onChange: (value) => { - setFieldValues({ [column]: value || null }) + validators: config.validators, + onChange: (value, state) => { + setFieldValues({ [column]: value || null }); + setFieldValidity(column, state.valid); }, }), ); @@ -202,6 +208,7 @@ const TestDefinitionForm = (/** @type Properties */ props) => { label: config.label, help: config.help, value: currentValue(), + validators: config.validators, onChange: (value, state) => { setFieldValues({ [column]: value || null }) setFieldValidity(column, state.valid); @@ -252,8 +259,8 @@ const historyCalcOptions = [ * @property {(updatedValues: object) => void} setFieldValues * @property {(field: string, valid: boolean) => void} setFieldValidity * - * @param {ThresholdFormOptions} options - * @param {TestDefinition} definition + * @param {ThresholdFormOptions} options + * @param {TestDefinition} definition */ const ThresholdForm = (options, definition) => { const { setFieldValues, setFieldValidity } = options; @@ -291,6 +298,21 @@ const ThresholdForm = (options, definition) => { 'lower_tolerance': newMode === 'static' ? lowerTolerance.val : newMode === 'prediction' ? definition.lower_tolerance : null, 'upper_tolerance': newMode === 'static' ? upperTolerance.val : newMode === 'prediction' ? 
definition.upper_tolerance : null, }); + if (newMode === 'static') { + if (!isFreshnessTrend) { + setFieldValidity('lower_tolerance', !!lowerTolerance.val); + } + setFieldValidity('upper_tolerance', !!upperTolerance.val); + setFieldValidity('history_lookback', true); + } else if (newMode === 'historical') { + setFieldValidity('lower_tolerance', true); + setFieldValidity('upper_tolerance', true); + setFieldValidity('history_lookback', !!historyLookback.val); + } else { + setFieldValidity('lower_tolerance', true); + setFieldValidity('upper_tolerance', true); + setFieldValidity('history_lookback', true); + } }, }), () => { @@ -376,8 +398,8 @@ const ThresholdForm = (options, definition) => { if (mode.val === 'static') { return div( - { class: 'flex-row fx-gap-3 fx-flex-wrap mt-2' }, - !isFreshnessTrend + { class: 'flex-row fx-gap-3 fx-flex-wrap fx-align-flex-start mt-2' }, + !isFreshnessTrend ? div( { class: 'td-form--field' }, Input({ @@ -385,6 +407,7 @@ const ThresholdForm = (options, definition) => { label: 'Lower Bound', type: 'number', value: lowerTolerance, + validators: [required], onChange: (value, state) => { lowerTolerance.val = value; setFieldValues({ lower_tolerance: value }); @@ -400,6 +423,7 @@ const ThresholdForm = (options, definition) => { label: isFreshnessTrend ? 
'Maximum interval since last update (minutes)' : 'Upper Bound', type: 'number', value: upperTolerance, + validators: [required], onChange: (value, state) => { upperTolerance.val = value; setFieldValues({ upper_tolerance: value }); diff --git a/testgen/ui/components/frontend/js/components/textarea.js b/testgen/ui/components/frontend/js/components/textarea.js index 828d8c86..bdfc411a 100644 --- a/testgen/ui/components/frontend/js/components/textarea.js +++ b/testgen/ui/components/frontend/js/components/textarea.js @@ -1,4 +1,11 @@ /** + * @import { Validator } from '../form_validators.js'; + * + * @typedef InputState + * @type {object} + * @property {boolean} valid + * @property {string[]} errors + * * @typedef Properties * @type {object} * @property {string?} id @@ -16,13 +23,14 @@ * @property {number?} width * @property {number?} height * @property {string?} testId + * @property {Array?} validators */ import van from '../van.min.js'; -import { debounce, getValue, loadStylesheet, getRandomId } from '../utils.js'; +import { debounce, getValue, loadStylesheet, getRandomId, checkIsRequired } from '../utils.js'; import { Icon } from './icon.js'; import { withTooltip } from './tooltip.js'; -const { div, label, textarea } = van.tags; +const { div, label, textarea, small, span } = van.tags; const defaultHeight = 64; const Textarea = (/** @type Properties */ props) => { @@ -30,18 +38,31 @@ const Textarea = (/** @type Properties */ props) => { const domId = van.derive(() => getValue(props.id) ?? getRandomId()); const value = van.derive(() => getValue(props.value) ?? ''); + const errors = van.derive(() => { + const validators = getValue(props.validators) ?? []; + return validators.map(v => v(value.val)).filter(error => error); + }); + const firstError = van.derive(() => { + return errors.val[0] ?? ''; + }); + const isRequired = van.state(false); + const isDirty = van.state(false); const onChange = props.onChange?.val ?? 
props.onChange; if (onChange) { - onChange(value.val); + onChange(value.val, { errors: errors.val, valid: errors.val.length <= 0 }); } van.derive(() => { const onChange = props.onChange?.val ?? props.onChange; - if (onChange && value.val !== value.oldVal) { - onChange(value.val); + if (onChange && (value.val !== value.oldVal || errors.val.length !== errors.oldVal.length)) { + onChange(value.val, { errors: errors.val, valid: errors.val.length <= 0 }); } }); + van.derive(() => { + isRequired.val = checkIsRequired(getValue(props.validators) ?? []); + }); + return label( { id: domId, @@ -52,6 +73,9 @@ const Textarea = (/** @type Properties */ props) => { div( { class: 'flex-row fx-gap-1 text-caption' }, props.label, + () => isRequired.val + ? span({ class: 'text-error' }, '*') + : '', () => getValue(props.help) ? withTooltip( Icon({ size: 16, classes: 'text-disabled' }, 'help'), @@ -66,8 +90,15 @@ const Textarea = (/** @type Properties */ props) => { name: props.name ?? '', disabled: props.disabled, placeholder: () => getValue(props.placeholder) ?? '', - oninput: debounce((/** @type Event */ event) => value.val = event.target.value, 300), + oninput: debounce((/** @type Event */ event) => { + isDirty.val = true; + value.val = event.target.value; + }, 300), }), + () => + isDirty.val && firstError.val + ? 
small({ class: 'tg-textarea--error' }, firstError) + : '', ); }; @@ -96,6 +127,11 @@ stylesheet.replace(` outline: none; border-color: var(--primary-color); } + +.tg-textarea--error { + height: 12px; + color: var(--error-color); +} `); export { Textarea }; diff --git a/testgen/ui/components/frontend/js/pages/edit_table_monitors.js b/testgen/ui/components/frontend/js/pages/edit_table_monitors.js index 5fd564ae..c1bdb4c6 100644 --- a/testgen/ui/components/frontend/js/pages/edit_table_monitors.js +++ b/testgen/ui/components/frontend/js/pages/edit_table_monitors.js @@ -78,6 +78,11 @@ const EditTableMonitors = (/** @type Properties */ props) => { onclick: () => selectedItem.val = { type: key, id: null }, }, span(label), + () => { + const id = getValue(props.definitions).find(td => td.test_type === key)?.id; + const state = formStates.val[id]; + return state && !state.valid ? span({ class: 'text-error' }, ' *') : ''; + }, )), div({ class: 'edit-monitors--list-divider mt-3 mb-1' }), div( @@ -118,6 +123,10 @@ const EditTableMonitors = (/** @type Properties */ props) => { span( { style: `text-overflow: ellipsis; ${!metric.column_name ? 'font-style: italic;' : ''}` }, metric.column_name || '(Unnamed Metric)', + () => { + const state = formStates.val[id]; + return state && !state.valid ? 
span({ class: 'text-error' }, ' *') : ''; + }, ), Button({ type: 'icon', @@ -167,13 +176,14 @@ const EditTableMonitors = (/** @type Properties */ props) => { ...newMetrics.val, [id]: { ...newMetrics.val[id], ...changes }, }; - } else { + formStates.val = { ...formStates.val, [id]: state }; + } else if (state.dirty) { updatedDefinitions.val = { ...updatedDefinitions.val, - [id]: { ...changes, id }, + [id]: { ...updatedDefinitions.rawVal[id], ...changes, id }, }; + formStates.val = { ...formStates.val, [id]: state }; } - formStates.val = { ...formStates.val, [id]: state }; }, }); } @@ -190,11 +200,13 @@ const EditTableMonitors = (/** @type Properties */ props) => { definition: { ...selectedDef, ...updatedDefinitions.rawVal[selectedDef.id] }, class: 'edit-monitors--form', onChange: (changes, state) => { - updatedDefinitions.val = { - ...updatedDefinitions.val, - [selectedDef.id]: { ...changes, id: selectedDef.id }, - }; - formStates.val = { ...formStates.val, [selectedDef.id]: state }; + if (state.dirty) { + updatedDefinitions.val = { + ...updatedDefinitions.val, + [selectedDef.id]: { ...updatedDefinitions.rawVal[selectedDef.id], ...changes, id: selectedDef.id }, + }; + formStates.val = { ...formStates.val, [selectedDef.id]: state }; + } }, }); }, diff --git a/testgen/ui/static/js/components/test_definition_form.js b/testgen/ui/static/js/components/test_definition_form.js index 31812f87..18b173dc 100644 --- a/testgen/ui/static/js/components/test_definition_form.js +++ b/testgen/ui/static/js/components/test_definition_form.js @@ -51,6 +51,7 @@ * @property {string} default_parm_columns * @property {string} default_parm_prompts * @property {string} default_parm_help + * @property {string?} default_parm_required * @property {string} default_severity * @property {'column'|'referential'|'table'|'tablegroup'|'custom'} test_scope * @property {string?} prediction @@ -69,7 +70,7 @@ import { Select } from './select.js'; import { Textarea } from './textarea.js'; import { 
RadioGroup } from './radio_group.js'; import { Caption } from './caption.js'; -import { numberBetween } from '../form_validators.js'; +import { numberBetween, required } from '../form_validators.js'; const { div, span } = van.tags; @@ -97,6 +98,7 @@ const TestDefinitionForm = (/** @type Properties */ props) => { const paramColumns = (definition.default_parm_columns || '').split(',').map(v => v.trim()); const paramLabels = (definition.default_parm_prompts || '').split(',').map(v => v.trim()); const paramHelp = (definition.default_parm_help || '').split('|').map(v => v.trim()); + const paramRequired = (definition.default_parm_required || '').split(',').map(v => v.trim().toUpperCase() === 'Y'); const hasThresholds = paramColumns.includes('history_calculation'); const dynamicParamColumns = paramColumns @@ -105,6 +107,7 @@ const TestDefinitionForm = (/** @type Properties */ props) => { column, label: paramLabels[index] || column.replaceAll('_', ' '), help: paramHelp[index] || null, + validators: paramRequired[index] ? 
[required] : undefined, })) .filter(config => !hasThresholds || !thresholdColumns.includes(config.column)) @@ -171,6 +174,7 @@ const TestDefinitionForm = (/** @type Properties */ props) => { type: 'number', value: currentValue(), step: config.step, + validators: config.validators, onChange: (value, state) => { setFieldValues({ [column]: value || null }) setFieldValidity(column, state.valid); @@ -188,8 +192,10 @@ const TestDefinitionForm = (/** @type Properties */ props) => { help: config.help, value: currentValue(), height: 100, - onChange: (value) => { - setFieldValues({ [column]: value || null }) + validators: config.validators, + onChange: (value, state) => { + setFieldValues({ [column]: value || null }); + setFieldValidity(column, state.valid); }, }), ); @@ -202,6 +208,7 @@ const TestDefinitionForm = (/** @type Properties */ props) => { label: config.label, help: config.help, value: currentValue(), + validators: config.validators, onChange: (value, state) => { setFieldValues({ [column]: value || null }) setFieldValidity(column, state.valid); @@ -291,6 +298,21 @@ const ThresholdForm = (options, definition) => { 'lower_tolerance': newMode === 'static' ? lowerTolerance.val : newMode === 'prediction' ? definition.lower_tolerance : null, 'upper_tolerance': newMode === 'static' ? upperTolerance.val : newMode === 'prediction' ? 
definition.upper_tolerance : null, }); + if (newMode === 'static') { + if (!isFreshnessTrend) { + setFieldValidity('lower_tolerance', !!lowerTolerance.val); + } + setFieldValidity('upper_tolerance', !!upperTolerance.val); + setFieldValidity('history_lookback', true); + } else if (newMode === 'historical') { + setFieldValidity('lower_tolerance', true); + setFieldValidity('upper_tolerance', true); + setFieldValidity('history_lookback', !!historyLookback.val); + } else { + setFieldValidity('lower_tolerance', true); + setFieldValidity('upper_tolerance', true); + setFieldValidity('history_lookback', true); + } }, }), () => { @@ -376,8 +398,8 @@ const ThresholdForm = (options, definition) => { if (mode.val === 'static') { return div( - { class: 'flex-row fx-gap-3 fx-flex-wrap mt-2' }, - !isFreshnessTrend + { class: 'flex-row fx-gap-3 fx-flex-wrap fx-align-flex-start mt-2' }, + !isFreshnessTrend ? div( { class: 'td-form--field' }, Input({ @@ -385,6 +407,7 @@ const ThresholdForm = (options, definition) => { label: 'Lower Bound', type: 'number', value: lowerTolerance, + validators: [required], onChange: (value, state) => { lowerTolerance.val = value; setFieldValues({ lower_tolerance: value }); @@ -400,6 +423,7 @@ const ThresholdForm = (options, definition) => { label: isFreshnessTrend ? 
'Maximum interval since last update (minutes)' : 'Upper Bound', type: 'number', value: upperTolerance, + validators: [required], onChange: (value, state) => { upperTolerance.val = value; setFieldValues({ upper_tolerance: value }); diff --git a/testgen/ui/static/js/components/textarea.js b/testgen/ui/static/js/components/textarea.js index 828d8c86..bdfc411a 100644 --- a/testgen/ui/static/js/components/textarea.js +++ b/testgen/ui/static/js/components/textarea.js @@ -1,4 +1,11 @@ /** + * @import { Validator } from '../form_validators.js'; + * + * @typedef InputState + * @type {object} + * @property {boolean} valid + * @property {string[]} errors + * * @typedef Properties * @type {object} * @property {string?} id @@ -16,13 +23,14 @@ * @property {number?} width * @property {number?} height * @property {string?} testId + * @property {Array?} validators */ import van from '../van.min.js'; -import { debounce, getValue, loadStylesheet, getRandomId } from '../utils.js'; +import { debounce, getValue, loadStylesheet, getRandomId, checkIsRequired } from '../utils.js'; import { Icon } from './icon.js'; import { withTooltip } from './tooltip.js'; -const { div, label, textarea } = van.tags; +const { div, label, textarea, small, span } = van.tags; const defaultHeight = 64; const Textarea = (/** @type Properties */ props) => { @@ -30,18 +38,31 @@ const Textarea = (/** @type Properties */ props) => { const domId = van.derive(() => getValue(props.id) ?? getRandomId()); const value = van.derive(() => getValue(props.value) ?? ''); + const errors = van.derive(() => { + const validators = getValue(props.validators) ?? []; + return validators.map(v => v(value.val)).filter(error => error); + }); + const firstError = van.derive(() => { + return errors.val[0] ?? ''; + }); + const isRequired = van.state(false); + const isDirty = van.state(false); const onChange = props.onChange?.val ?? 
props.onChange; if (onChange) { - onChange(value.val); + onChange(value.val, { errors: errors.val, valid: errors.val.length <= 0 }); } van.derive(() => { const onChange = props.onChange?.val ?? props.onChange; - if (onChange && value.val !== value.oldVal) { - onChange(value.val); + if (onChange && (value.val !== value.oldVal || errors.val.length !== errors.oldVal.length)) { + onChange(value.val, { errors: errors.val, valid: errors.val.length <= 0 }); } }); + van.derive(() => { + isRequired.val = checkIsRequired(getValue(props.validators) ?? []); + }); + return label( { id: domId, @@ -52,6 +73,9 @@ const Textarea = (/** @type Properties */ props) => { div( { class: 'flex-row fx-gap-1 text-caption' }, props.label, + () => isRequired.val + ? span({ class: 'text-error' }, '*') + : '', () => getValue(props.help) ? withTooltip( Icon({ size: 16, classes: 'text-disabled' }, 'help'), @@ -66,8 +90,15 @@ const Textarea = (/** @type Properties */ props) => { name: props.name ?? '', disabled: props.disabled, placeholder: () => getValue(props.placeholder) ?? '', - oninput: debounce((/** @type Event */ event) => value.val = event.target.value, 300), + oninput: debounce((/** @type Event */ event) => { + isDirty.val = true; + value.val = event.target.value; + }, 300), }), + () => + isDirty.val && firstError.val + ? 
small({ class: 'tg-textarea--error' }, firstError) + : '', ); }; @@ -96,6 +127,11 @@ stylesheet.replace(` outline: none; border-color: var(--primary-color); } + +.tg-textarea--error { + height: 12px; + color: var(--error-color); +} `); export { Textarea }; From 351d54d415a4a50ef41ffc483382a6cb71fd2ec6 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 4 Mar 2026 23:10:21 -0500 Subject: [PATCH 42/95] fix(data catalog): remove test suite links for catalog role --- .../frontend/js/pages/data_catalog.js | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index 6d25e80f..e8825413 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -459,7 +459,7 @@ const SelectedDetails = (/** @type Properties */ props, /** @type Table | Column PotentialPIICard({ noLinks: !userCanNavigate }, item), HygieneIssuesCard({ noLinks: !userCanNavigate }, item), TestIssuesCard({ noLinks: !userCanNavigate }, item), - TestSuitesCard(item), + TestSuitesCard({ noLinks: !userCanNavigate }, item), item.type === 'table' ? TableCreateScriptCard({}, item) : null, @@ -586,44 +586,48 @@ const TagsCard = (/** @type TagProperties */ props, /** @type Table | Column */ }); }; -const TestSuitesCard = (/** @type Table | Column */ item) => { +const TestSuitesCard = (/** @type Properties */ props, /** @type Table | Column */ item) => { return Card({ title: 'Related Test Suites', content: div( { class: 'flex-column fx-gap-2' }, item.test_suites.map(({ id, name, test_count }) => div( { class: 'flex-row fx-gap-1' }, - Link({ - href: 'test-suites:definitions', - params: { - test_suite_id: id, - table_name: item.table_name, - column_name: item.column_name, - project_code: item.project_code, - }, - open_new: true, - label: name, - }), + props.noLinks + ? 
span(name) + : Link({ + href: 'test-suites:definitions', + params: { + test_suite_id: id, + table_name: item.table_name, + column_name: item.column_name, + project_code: item.project_code, + }, + open_new: true, + label: name, + }), span({ class: 'text-caption' }, `(${test_count} test definitions)`), )) ), actionContent: item.test_suites.length - ? null + ? null : item.drop_date ? span({ class: 'text-secondary' }, `No test definitions for ${item.type}`) : span( { class: 'text-secondary flex-row fx-gap-1 fx-justify-content-flex-end' }, `No test definitions yet for ${item.type}.`, - Link({ - href: 'test-suites', - params: { - project_code: item.project_code, - table_group_id: item.table_group_id, - }, - open_new: true, - label: 'Go to Test Suites', - right_icon: 'chevron_right', - }), + props.noLinks + ? null + : Link({ + href: 'test-suites', + params: { + project_code: item.project_code, + table_group_id: item.table_group_id, + }, + open_new: true, + label: 'Go to Test Suites', + right_icon: 'chevron_right', + }), ), }); }; From c1691b8dcf2079577a810af416487d5619323d9b Mon Sep 17 00:00:00 2001 From: testgen-ci-bot Date: Thu, 12 Mar 2026 05:34:01 +0000 Subject: [PATCH 43/95] ci: bump base image to v12 --- deploy/testgen.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index 856804fb..e8456b42 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -1,4 +1,4 @@ -ARG TESTGEN_BASE_LABEL=v11 +ARG TESTGEN_BASE_LABEL=v12 FROM datakitchen/dataops-testgen-base:${TESTGEN_BASE_LABEL} AS release-image From e103b0825da4a087d9526804b7e0e31f5e7b987e Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 11 Mar 2026 16:00:49 -0400 Subject: [PATCH 44/95] ci: handle hdbcli manylinux-only wheels on Alpine hdbcli only ships manylinux2014 wheels (no musl). pip 26+ correctly rejects these on Alpine. 
Work around by downloading the wheel with explicit --platform flag and extracting it directly into site-packages. Add gcompat for runtime glibc compatibility. Ref: TG-997 Co-Authored-By: Claude Opus 4.6 --- deploy/testgen-base.dockerfile | 15 +++++++++++++++ deploy/testgen.dockerfile | 5 +++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/deploy/testgen-base.dockerfile b/deploy/testgen-base.dockerfile index d758a03f..a49b1d06 100644 --- a/deploy/testgen-base.dockerfile +++ b/deploy/testgen-base.dockerfile @@ -20,6 +20,8 @@ RUN apk update && apk upgrade && apk add --no-cache \ gpg \ gpgv \ openssl \ + # glibc compatibility layer for packages that only ship manylinux wheels (e.g. hdbcli) + gcompat \ # Additional libraries needed and their dev counterparts. We add both so that we can remove # the *-dev later, keeping the libraries openblas=0.3.30-r2 \ @@ -41,6 +43,19 @@ RUN mkdir /dk # Upgrading pip for security RUN python3 -m pip install --upgrade pip==26.0 +# hdbcli only ships manylinux wheels (no musl). pip 26+ correctly rejects these on Alpine. +# We download the wheel for the correct arch, then extract it directly into site-packages +# (wheels are zip files). gcompat provides the glibc shim needed at runtime. 
+RUN ARCH=$(uname -m) && \ + pip download --platform manylinux2014_${ARCH} --python-version 3.12 --only-binary :all: \ + --no-deps -d /tmp/wheels hdbcli==2.25.31 && \ + python3 -m zipfile -e /tmp/wheels/hdbcli-*.whl /dk/lib/python3.12/site-packages/ && \ + rm -rf /tmp/wheels + +# Strip hdbcli from pyproject.toml before installing — it's already extracted above and +# pip 26+ would fail trying to resolve it from PyPI on musl +RUN sed -i '/hdbcli/d' /tmp/dk/pyproject.toml + RUN python3 -m pip install --prefix=/dk /tmp/dk RUN apk del \ diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index e8456b42..7c4d280f 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -12,9 +12,10 @@ ENV PATH=$PATH:/dk/bin RUN apk upgrade -# Now install everything +# Now install everything (hdbcli is pre-installed in the base image via manual wheel extraction) COPY . /tmp/dk/ -RUN python3 -m pip install --prefix=/dk /tmp/dk +RUN sed -i '/hdbcli/d' /tmp/dk/pyproject.toml /tmp/dk/testgen/pyproject.toml 2>/dev/null; \ + python3 -m pip install --prefix=/dk /tmp/dk # Generate third-party license notices from installed packages RUN pip install pip-licenses \ From cab56a0fdc034f67f8fc4921f1c1afe47597dc79 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 11 Mar 2026 23:24:50 -0400 Subject: [PATCH 45/95] ci: register hdbcli dist-info so pip resolves transitive dep sqlalchemy-hana depends on hdbcli~=2.10 transitively. The sed workaround only strips the direct dependency from pyproject.toml. Copy hdbcli's dist-info to system site-packages so pip sees it as already installed during resolution. 
Co-Authored-By: Claude Opus 4.6 --- deploy/testgen-base.dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/deploy/testgen-base.dockerfile b/deploy/testgen-base.dockerfile index a49b1d06..a1d81370 100644 --- a/deploy/testgen-base.dockerfile +++ b/deploy/testgen-base.dockerfile @@ -50,6 +50,10 @@ RUN ARCH=$(uname -m) && \ pip download --platform manylinux2014_${ARCH} --python-version 3.12 --only-binary :all: \ --no-deps -d /tmp/wheels hdbcli==2.25.31 && \ python3 -m zipfile -e /tmp/wheels/hdbcli-*.whl /dk/lib/python3.12/site-packages/ && \ + # Copy dist-info to system site-packages so pip sees hdbcli as installed during + # dependency resolution (sqlalchemy-hana transitively depends on hdbcli~=2.10) + cp -r /dk/lib/python3.12/site-packages/hdbcli-*.dist-info \ + "$(python3 -c 'import sysconfig; print(sysconfig.get_path("purelib"))')"/ && \ rm -rf /tmp/wheels # Strip hdbcli from pyproject.toml before installing — it's already extracted above and From 9b2ef2c54c061c1c32c85c8a583bca7c879980d1 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 12 Mar 2026 02:10:14 -0400 Subject: [PATCH 46/95] misc: upgrade fastapi --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6d52c874..2740345d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ dependencies = [ # API & OAuth server "authlib~=1.6.6", - "fastapi==0.115.12", + "fastapi==0.135.1", ] [project.optional-dependencies] From 7925cc202bd6e98c2481a8081fde616d8fe190a9 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 12 Mar 2026 13:17:07 -0400 Subject: [PATCH 47/95] ci: clear pip cache in Docker images to fix Trivy false positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trivy flagged a CRITICAL secret (GCP service-account) in the pip HTTP cache — it was example code from a cached package's documentation. 
Remove /root/.cache/pip in both base and QA Dockerfiles to prevent this class of false positive and reduce image size. Co-Authored-By: Claude Opus 4.6 --- deploy/testgen-base.dockerfile | 2 +- deploy/testgen.dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy/testgen-base.dockerfile b/deploy/testgen-base.dockerfile index a1d81370..f5b9a2bf 100644 --- a/deploy/testgen-base.dockerfile +++ b/deploy/testgen-base.dockerfile @@ -78,4 +78,4 @@ RUN apk del \ unixodbc-dev \ apache-arrow-dev -RUN rm /tmp/dk/install_linuxodbc.sh +RUN rm -rf /root/.cache/pip /tmp/dk/install_linuxodbc.sh diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index 7c4d280f..9f6a9c0e 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -23,7 +23,7 @@ RUN pip install pip-licenses \ && PYTHONPATH=/dk/lib/python3.12/site-packages python3 "$SCRIPT" --output /dk/THIRD-PARTY-NOTICES \ && pip uninstall -y pip-licenses -RUN rm -Rf /tmp/dk +RUN rm -Rf /tmp/dk /root/.cache/pip RUN tg-patch-streamlit From db9b626ceaaee60f673ff4fbc77334c073e8b04f Mon Sep 17 00:00:00 2001 From: testgen-ci-bot Date: Thu, 12 Mar 2026 19:06:23 +0000 Subject: [PATCH 48/95] ci: bump base image to v13 --- deploy/testgen.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index 9f6a9c0e..f40da127 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -1,4 +1,4 @@ -ARG TESTGEN_BASE_LABEL=v12 +ARG TESTGEN_BASE_LABEL=v13 FROM datakitchen/dataops-testgen-base:${TESTGEN_BASE_LABEL} AS release-image From 094a15b5c615dbc10d2758d9c97634da4977200f Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Mon, 16 Mar 2026 11:24:18 -0300 Subject: [PATCH 49/95] refactor(flavor): make FlavorService stateless with explicit params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminate init() from FlavorService. 
All connection methods now take ResolvedConnectionParams as an explicit argument. Add resolve_connection_params() as a public standalone function, url_scheme class constant, default get_connection_string_head(), and create_engine() convenience method. Move URL parsing out of FlavorService into Connection.save() — the method was generic urlparse with no flavor-specific behavior. Inline the parsing directly since callers strip credentials before saving. Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/common/database/database_service.py | 22 +-- .../flavor/bigquery_flavor_service.py | 17 +- .../flavor/databricks_flavor_service.py | 47 ++--- .../common/database/flavor/flavor_service.py | 165 +++++++++--------- .../database/flavor/mssql_flavor_service.py | 30 ++-- .../database/flavor/oracle_flavor_service.py | 14 +- .../flavor/redshift_flavor_service.py | 15 +- .../flavor/sap_hana_flavor_service.py | 16 +- .../flavor/snowflake_flavor_service.py | 41 ++--- .../database/flavor/trino_flavor_service.py | 13 +- testgen/common/models/connection.py | 33 ++-- testgen/ui/services/database_service.py | 15 +- testgen/ui/views/connections.py | 5 +- 13 files changed, 211 insertions(+), 222 deletions(-) diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index 4b340b18..0e338318 100644 --- a/testgen/common/database/database_service.py +++ b/testgen/common/database/database_service.py @@ -26,7 +26,12 @@ get_tg_username, ) from testgen.common.database import FilteredStringIO -from testgen.common.database.flavor.flavor_service import ConnectionParams, FlavorService, SQLFlavor +from testgen.common.database.flavor.flavor_service import ( + ConnectionParams, + FlavorService, + SQLFlavor, + resolve_connection_params, +) from testgen.common.read_file import get_template_files from testgen.utils import get_exception_message @@ -407,27 +412,22 @@ def _init_target_db_connection() -> Connection: raise ValueError("Target database 
connection parameters were not set") flavor_service = get_flavor_service(target_db_params["sql_flavor"]) - flavor_service.init(target_db_params) + params = resolve_connection_params(target_db_params) engine = engine_cache.target_db if not engine: try: - engine: Engine = create_engine( - flavor_service.get_connection_string(), - connect_args=flavor_service.get_connect_args(), - **flavor_service.get_engine_args(), - ) + engine: Engine = flavor_service.create_engine(target_db_params) except SQLAlchemyError as e: - raise ValueError(f"Failed to create engine for Target database '{flavor_service.dbname}' (User type = normal)") from e + raise ValueError(f"Failed to create engine for Target database '{params.dbname}' (User type = normal)") from e else: engine_cache.target_db = engine - connection: Connection = engine.connect() - for query, params in flavor_service.get_pre_connection_queries(): + for query, query_params in flavor_service.get_pre_connection_queries(params): try: - connection.execute(text(query), params) + connection.execute(text(query), query_params) except Exception: LOG.warning( f"Failed to execute preconnection query on Target database: {query}", diff --git a/testgen/common/database/flavor/bigquery_flavor_service.py b/testgen/common/database/flavor/bigquery_flavor_service.py index 8e80f146..47150a73 100644 --- a/testgen/common/database/flavor/bigquery_flavor_service.py +++ b/testgen/common/database/flavor/bigquery_flavor_service.py @@ -1,6 +1,6 @@ from typing import Any -from testgen.common.database.flavor.flavor_service import FlavorService +from testgen.common.database.flavor.flavor_service import FlavorService, ResolvedConnectionParams class BigqueryFlavorService(FlavorService): @@ -8,15 +8,16 @@ class BigqueryFlavorService(FlavorService): quote_character = "`" escaped_single_quote = "\\'" varchar_type = "STRING" + url_scheme = "bigquery" - def get_connection_string_head(self): - return "bigquery://" + def get_connection_string_head(self, params: 
ResolvedConnectionParams) -> str: # noqa: ARG002 + return f"{self.url_scheme}://" - def get_connection_string_from_fields(self): - return f"bigquery://{self.service_account_key["project_id"] if self.service_account_key else ""}" + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: + return f"{self.url_scheme}://{params.service_account_key["project_id"] if params.service_account_key else ""}" - def get_connect_args(self) -> dict: + def get_connect_args(self, params: ResolvedConnectionParams) -> dict: # noqa: ARG002 return {} - def get_engine_args(self) -> dict[str,Any]: - return {"credentials_info": self.service_account_key} if self.service_account_key else {} + def get_engine_args(self, params: ResolvedConnectionParams) -> dict[str, Any]: + return {"credentials_info": params.service_account_key} if params.service_account_key else {} diff --git a/testgen/common/database/flavor/databricks_flavor_service.py b/testgen/common/database/flavor/databricks_flavor_service.py index 1595213c..8b143b92 100644 --- a/testgen/common/database/flavor/databricks_flavor_service.py +++ b/testgen/common/database/flavor/databricks_flavor_service.py @@ -1,6 +1,6 @@ from urllib.parse import quote_plus -from testgen.common.database.flavor.flavor_service import FlavorService +from testgen.common.database.flavor.flavor_service import FlavorService, ResolvedConnectionParams class DatabricksFlavorService(FlavorService): @@ -8,43 +8,44 @@ class DatabricksFlavorService(FlavorService): quote_character = "`" escaped_single_quote = "\\'" varchar_type = "STRING" + url_scheme = "databricks" - def get_pre_connection_queries(self) -> list[tuple[str, dict | None]]: - if self.dbname: - return [(f"USE CATALOG `{self.dbname}`", None)] + def get_pre_connection_queries(self, params: ResolvedConnectionParams) -> list[tuple[str, dict | None]]: + if params.dbname: + return [(f"USE CATALOG `{params.dbname}`", None)] return [] - def get_connect_args(self) -> dict: + def 
get_connect_args(self, params: ResolvedConnectionParams) -> dict: args = {} - if self.dbname: - args["catalog"] = self.dbname - if self.connect_by_key: - args["credentials_provider"] = self._get_oauth_credentials_provider() + if params.dbname: + args["catalog"] = params.dbname + if params.connect_by_key: + args["credentials_provider"] = self._get_oauth_credentials_provider(params) return args - def get_connection_string_head(self): - if self.connect_by_key: - return f"{self.flavor}://oauth:@" - return f"{self.flavor}://token:{quote_plus(self.password)}@" + def get_connection_string_head(self, params: ResolvedConnectionParams) -> str: + if params.connect_by_key: + return f"{self.url_scheme}://oauth:@" + return f"{self.url_scheme}://token:{quote_plus(params.password)}@" - def get_connection_string_from_fields(self): - if self.connect_by_key: + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: + if params.connect_by_key: return ( - f"{self.flavor}://oauth:@{self.host}:{self.port}/{self.dbname}" - f"?http_path={self.http_path}&catalog={self.dbname}" + f"{self.url_scheme}://oauth:@{params.host}:{params.port}/{params.dbname}" + f"?http_path={params.http_path}&catalog={params.dbname}" ) return ( - f"{self.flavor}://token:{quote_plus(self.password)}@{self.host}:{self.port}/{self.dbname}" - f"?http_path={self.http_path}&catalog={self.dbname}" + f"{self.url_scheme}://token:{quote_plus(params.password)}@{params.host}:{params.port}/{params.dbname}" + f"?http_path={params.http_path}&catalog={params.dbname}" ) - def _get_oauth_credentials_provider(self): + def _get_oauth_credentials_provider(self, params: ResolvedConnectionParams): from databricks.sdk.core import Config, oauth_service_principal config = Config( - host=f"https://{self.host}", - client_id=self.username, - client_secret=self.password, + host=f"https://{params.host}", + client_id=params.username, + client_secret=params.password, ) # oauth_service_principal(config) returns an 
OAuthCredentialsProvider, # which is callable: provider() -> Dict[str, str] (auth headers). diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index 4b1625f1..a56ac8ba 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ b/testgen/common/database/flavor/flavor_service.py @@ -1,6 +1,10 @@ from abc import abstractmethod +from dataclasses import dataclass from typing import Any, Literal, TypedDict -from urllib.parse import parse_qs, urlparse +from urllib.parse import quote_plus + +from sqlalchemy import create_engine as sqlalchemy_create_engine +from sqlalchemy.engine.base import Engine from testgen.common.encrypt import DecryptText @@ -26,6 +30,59 @@ class ConnectionParams(TypedDict): connect_with_identity: bool sql_flavor_code: str + +@dataclass(frozen=True, slots=True) +class ResolvedConnectionParams: + url: str = "" + connect_by_url: bool = False + username: str = "" + password: str | None = None + host: str = "" + port: str = "" + dbname: str = "" + dbschema: str | None = None + sql_flavor: str = "" + sql_flavor_code: str = "" + connect_by_key: bool = False + private_key: str | None = None + private_key_passphrase: str | None = None + http_path: str = "" + catalog: str = "" + warehouse: str = "" + service_account_key: dict[str, Any] | None = None + connect_with_identity: bool = False + + +def _decrypt_if_needed(value: Any) -> str | None: + if isinstance(value, memoryview | bytes): + return DecryptText(value) + return value + + +def resolve_connection_params(connection_params: ConnectionParams) -> ResolvedConnectionParams: + sql_flavor = connection_params.get("sql_flavor") or "" + return ResolvedConnectionParams( + url=connection_params.get("url") or "", + connect_by_url=connection_params.get("connect_by_url", False), + username=connection_params.get("project_user") or "", + password=_decrypt_if_needed(connection_params.get("project_pw_encrypted")), + 
host=connection_params.get("project_host") or "", + port=connection_params.get("project_port") or "", + dbname=connection_params.get("project_db") or "", + dbschema=connection_params.get("table_group_schema"), + sql_flavor=sql_flavor, + sql_flavor_code=connection_params.get("sql_flavor_code") or sql_flavor, + connect_by_key=connection_params.get("connect_by_key", False), + private_key=_decrypt_if_needed(connection_params.get("private_key")), + private_key_passphrase=_decrypt_if_needed(connection_params.get("private_key_passphrase")), + http_path=connection_params.get("http_path") or "", + catalog=connection_params.get("catalog") or "", + warehouse=connection_params.get("warehouse") or "", + service_account_key=connection_params.get("service_account_key"), + connect_with_identity=connection_params.get("connect_with_identity") or False, + ) + + class FlavorService: concat_operator = "||" @@ -38,102 +95,36 @@ class FlavorService: row_limiting_clause: RowLimitingClause = "limit" default_uppercase = False test_query = "SELECT 1" + url_scheme = "postgresql" - def init(self, connection_params: ConnectionParams): - self.url = connection_params.get("url") or "" - self.connect_by_url = connection_params.get("connect_by_url", False) - self.username = connection_params.get("project_user") or "" - self.host = connection_params.get("project_host") or "" - self.port = connection_params.get("project_port") or "" - self.dbname = connection_params.get("project_db") or "" - self.flavor = connection_params.get("sql_flavor") - self.dbschema = connection_params.get("table_group_schema", None) - self.connect_by_key = connection_params.get("connect_by_key", False) - self.http_path = connection_params.get("http_path") or "" - self.catalog = connection_params.get("catalog") or "" - self.warehouse = connection_params.get("warehouse") or "" - self.service_account_key = connection_params.get("service_account_key", None) - self.connect_with_identity = 
connection_params.get("connect_with_identity") or False - self.sql_flavor_code = connection_params.get("sql_flavor_code") or self.flavor - - password = connection_params.get("project_pw_encrypted", None) - if isinstance(password, memoryview) or isinstance(password, bytes): - password = DecryptText(password) - self.password = password - - private_key = connection_params.get("private_key", None) - if isinstance(private_key, memoryview) or isinstance(private_key, bytes): - private_key = DecryptText(private_key) - self.private_key = private_key - - private_key_passphrase = connection_params.get("private_key_passphrase", None) - if isinstance(private_key_passphrase, memoryview) or isinstance(private_key_passphrase, bytes): - private_key_passphrase = DecryptText(private_key_passphrase) - self.private_key_passphrase = private_key_passphrase - - def get_pre_connection_queries(self) -> list[tuple[str, dict | None]]: + def get_pre_connection_queries(self, params: ResolvedConnectionParams) -> list[tuple[str, dict | None]]: # noqa: ARG002 return [] - def get_connect_args(self) -> dict: + def get_connect_args(self, params: ResolvedConnectionParams) -> dict: # noqa: ARG002 return {"connect_timeout": 3600} - def get_engine_args(self) -> dict[str,Any]: + def get_engine_args(self, params: ResolvedConnectionParams) -> dict[str, Any]: # noqa: ARG002 return {} - def get_connection_string(self) -> str: - if self.connect_by_url: - header = self.get_connection_string_head() - url = header + self.url - return url + def create_engine(self, connection_params: ConnectionParams) -> Engine: + params = resolve_connection_params(connection_params) + return sqlalchemy_create_engine( + self.get_connection_string(params), + connect_args=self.get_connect_args(params), + **self.get_engine_args(params), + ) + + def get_connection_string(self, params: ResolvedConnectionParams) -> str: + if params.connect_by_url: + header = self.get_connection_string_head(params) + return header + params.url else: - 
return self.get_connection_string_from_fields() + return self.get_connection_string_from_fields(params) @abstractmethod - def get_connection_string_from_fields(self) -> str: + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: raise NotImplementedError("Subclasses must implement this method") - @abstractmethod - def get_connection_string_head(self) -> str: - raise NotImplementedError("Subclasses must implement this method") + def get_connection_string_head(self, params: ResolvedConnectionParams) -> str: + return f"{self.url_scheme}://{params.username}:{quote_plus(params.password)}@" - def get_parts_from_connection_string(self) -> dict[str, Any]: - if self.connect_by_url: - if not self.url: - return {} - - parsed_url = urlparse(self.get_connection_string()) - credentials, location = ( - parsed_url.netloc if "@" in parsed_url.netloc else f"@{parsed_url.netloc}" - ).split("@") - username, password = ( - credentials if ":" in credentials else f"{credentials}:" - ).split(":") - host, port = ( - location if ":" in location else f"{location}:" - ).split(":") - - database = (path_patrs[0] if (path_patrs := parsed_url.path.strip("/").split("/")) else "") - - extras = { - param_name: param_values[0] - for param_name, param_values in parse_qs(parsed_url.query or "").items() - } - - return { - "username": username, - "password": password, - "host": host, - "port": port, - "dbname": database, - **extras, - } - - return { - "username": self.username, - "password": self.password, - "host": self.host, - "port": self.port, - "dbname": self.dbname, - "http_path": self.http_path, - "catalog": self.catalog, - } diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index b066eac7..70ee3d11 100644 --- a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -3,7 +3,7 @@ from sqlalchemy.engine import URL from testgen 
import settings -from testgen.common.database.flavor.flavor_service import FlavorService +from testgen.common.database.flavor.flavor_service import FlavorService, ResolvedConnectionParams class MssqlFlavorService(FlavorService): @@ -11,42 +11,40 @@ class MssqlFlavorService(FlavorService): concat_operator = "+" escaped_underscore = "[_]" row_limiting_clause = "top" + url_scheme = "mssql+pyodbc" - def get_connection_string_head(self): - return f"mssql+pyodbc://{self.username}:{quote_plus(self.password)}@" - - def get_connection_string_from_fields(self): + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: connection_url = URL.create( - "mssql+pyodbc", - username=self.username, - password=quote_plus(self.password or ""), - host=self.host, - port=int(self.port or 1443), - database=self.dbname, + self.url_scheme, + username=params.username, + password=quote_plus(params.password or ""), + host=params.host, + port=int(params.port or 1443), + database=params.dbname, query={ "driver": "ODBC Driver 18 for SQL Server", }, ) - if self.connect_with_identity: + if params.connect_with_identity: connection_url = connection_url._replace(username=None, password=None).update_query_dict({ "encrypt": "yes", "authentication": "ActiveDirectoryMsi", }) - if self.sql_flavor_code == "synapse_mssql": + if params.sql_flavor_code == "synapse_mssql": connection_url = connection_url.update_query_dict({"autocommit": "True"}) return connection_url.render_as_string(hide_password=False) - def get_pre_connection_queries(self): + def get_pre_connection_queries(self, params: ResolvedConnectionParams) -> list[tuple[str, dict | None]]: # noqa: ARG002 return [ ("SET ANSI_DEFAULTS ON;", None), ("SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;", None), ] - def get_connect_args(self): - connect_args = super().get_connect_args() + def get_connect_args(self, params: ResolvedConnectionParams) -> dict: + connect_args = super().get_connect_args(params) if 
settings.SKIP_DATABASE_CERTIFICATE_VERIFICATION: connect_args["TrustServerCertificate"] = "yes" return connect_args diff --git a/testgen/common/database/flavor/oracle_flavor_service.py b/testgen/common/database/flavor/oracle_flavor_service.py index 9c3c7932..3efd3854 100644 --- a/testgen/common/database/flavor/oracle_flavor_service.py +++ b/testgen/common/database/flavor/oracle_flavor_service.py @@ -3,7 +3,7 @@ import oracledb -from testgen.common.database.flavor.flavor_service import FlavorService +from testgen.common.database.flavor.flavor_service import FlavorService, ResolvedConnectionParams # https://stackoverflow.com/a/74105559 oracledb.version = "8.3.0" @@ -18,17 +18,15 @@ class OracleFlavorService(FlavorService): default_uppercase = True row_limiting_clause = "fetch" test_query = "SELECT 1 FROM DUAL" + url_scheme = "oracle" - def get_connection_string_head(self): - return f"oracle://{self.username}:{quote_plus(self.password)}@" + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: + return f"{self.url_scheme}://{params.username}:{quote_plus(params.password)}@{params.host}:{params.port}?service_name={params.dbname}" - def get_connection_string_from_fields(self): - return f"oracle://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}?service_name={self.dbname}" - - def get_pre_connection_queries(self): + def get_pre_connection_queries(self, params: ResolvedConnectionParams) -> list[tuple[str, dict | None]]: # noqa: ARG002 return [ ("ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD HH24:MI:SS'", None), ] - def get_connect_args(self) -> dict: + def get_connect_args(self, params: ResolvedConnectionParams) -> dict: # noqa: ARG002 return {} diff --git a/testgen/common/database/flavor/redshift_flavor_service.py b/testgen/common/database/flavor/redshift_flavor_service.py index 36f89418..3b6c6e6a 100644 --- a/testgen/common/database/flavor/redshift_flavor_service.py +++ 
b/testgen/common/database/flavor/redshift_flavor_service.py @@ -1,20 +1,13 @@ from urllib.parse import quote_plus -from testgen.common.database.flavor.flavor_service import FlavorService +from testgen.common.database.flavor.flavor_service import FlavorService, ResolvedConnectionParams class RedshiftFlavorService(FlavorService): escaped_underscore = "\\\\_" + url_scheme = "postgresql" - def init(self, connection_params: dict): - super().init(connection_params) - # This is for connection purposes. sqlalchemy 1.4.46 uses postgresql to connect to redshift database - self.flavor = "postgresql" - - def get_connection_string_head(self): - return f"{self.flavor}://{self.username}:{quote_plus(self.password)}@" - - def get_connection_string_from_fields(self): + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: # STANDARD FORMAT: strConnect = 'flavor://username:password@host:port/database' - return f"{self.flavor}://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}/{self.dbname}" + return f"{self.url_scheme}://{params.username}:{quote_plus(params.password)}@{params.host}:{params.port}/{params.dbname}" diff --git a/testgen/common/database/flavor/sap_hana_flavor_service.py b/testgen/common/database/flavor/sap_hana_flavor_service.py index 2a855e51..f6b6da17 100644 --- a/testgen/common/database/flavor/sap_hana_flavor_service.py +++ b/testgen/common/database/flavor/sap_hana_flavor_service.py @@ -1,6 +1,6 @@ from urllib.parse import quote_plus -from testgen.common.database.flavor.flavor_service import FlavorService +from testgen.common.database.flavor.flavor_service import FlavorService, ResolvedConnectionParams class SapHanaFlavorService(FlavorService): @@ -8,15 +8,13 @@ class SapHanaFlavorService(FlavorService): varchar_type = "NVARCHAR(1000)" default_uppercase = True test_query = "SELECT 1 FROM DUMMY" + url_scheme = "hana+hdbcli" - def get_connection_string_head(self): - return 
f"hana+hdbcli://{self.username}:{quote_plus(self.password)}@" - - def get_connection_string_from_fields(self): - url = f"hana+hdbcli://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}/" - if self.dbname: - url += f"?databaseName={self.dbname}" + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: + url = f"{self.url_scheme}://{params.username}:{quote_plus(params.password)}@{params.host}:{params.port}/" + if params.dbname: + url += f"?databaseName={params.dbname}" return url - def get_connect_args(self) -> dict: + def get_connect_args(self, params: ResolvedConnectionParams) -> dict: # noqa: ARG002 return {} diff --git a/testgen/common/database/flavor/snowflake_flavor_service.py b/testgen/common/database/flavor/snowflake_flavor_service.py index a7bad3d8..5627b6ec 100644 --- a/testgen/common/database/flavor/snowflake_flavor_service.py +++ b/testgen/common/database/flavor/snowflake_flavor_service.py @@ -4,7 +4,7 @@ from cryptography.hazmat.primitives import serialization from snowflake.sqlalchemy import URL -from testgen.common.database.flavor.flavor_service import FlavorService +from testgen.common.database.flavor.flavor_service import FlavorService, ResolvedConnectionParams class SnowflakeFlavorService(FlavorService): @@ -12,13 +12,14 @@ class SnowflakeFlavorService(FlavorService): escaped_underscore = "\\\\_" escape_clause = "ESCAPE '\\\\'" default_uppercase = True + url_scheme = "snowflake" - def get_connect_args(self): - if self.connect_by_key: + def get_connect_args(self, params: ResolvedConnectionParams) -> dict: + if params.connect_by_key: # https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#key-pair-authentication-support - private_key_passphrase = self.private_key_passphrase.encode() if self.private_key_passphrase else None + private_key_passphrase = params.private_key_passphrase.encode() if params.private_key_passphrase else None private_key = serialization.load_pem_private_key( - 
self.private_key.encode(), + params.private_key.encode(), password=private_key_passphrase, backend=default_backend(), ) @@ -32,40 +33,40 @@ def get_connect_args(self): return {"private_key": private_key_bytes} return {} - def get_connection_string_head(self): - if self.connect_by_key: - return f"snowflake://{self.username}@" + def get_connection_string_head(self, params: ResolvedConnectionParams) -> str: + if params.connect_by_key: + return f"{self.url_scheme}://{params.username}@" else: - return f"snowflake://{self.username}:{quote_plus(self.password)}@" + return f"{self.url_scheme}://{params.username}:{quote_plus(params.password)}@" - def get_connection_string_from_fields(self): + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: # SNOWFLAKE FORMAT: strConnect = 'flavor://username:password@host/database' # optionally + '/[schema]' + '?warehouse=xxx' # NOTE: Snowflake host should NOT include ".snowflakecomputing.com" - account, _ = self.host.split(".", maxsplit=1) if "." in self.host else ("", "") - host = self.host + account, _ = params.host.split(".", maxsplit=1) if "." 
in params.host else ("", "") + host = params.host if ".snowflakecomputing.com" not in host: host = f"{host}.snowflakecomputing.com" extra_params = {} - if self.warehouse: - extra_params["warehouse"] = self.warehouse + if params.warehouse: + extra_params["warehouse"] = params.warehouse connection_url = URL( host=host, - port=int(self.port if str(self.port).isdigit() else 443), + port=int(params.port if str(params.port).isdigit() else 443), account=account, - user=self.username, - password="" if self.connect_by_key else self.password, - database=self.dbname, - schema=self.dbschema or "", + user=params.username, + password="" if params.connect_by_key else params.password, + database=params.dbname, + schema=params.dbschema or "", **extra_params, ) return connection_url - def get_pre_connection_queries(self): + def get_pre_connection_queries(self, params: ResolvedConnectionParams) -> list[tuple[str, dict | None]]: # noqa: ARG002 return [ ("ALTER SESSION SET MULTI_STATEMENT_COUNT = 0;", None), ("ALTER SESSION SET WEEK_START = 7;", None), diff --git a/testgen/common/database/flavor/trino_flavor_service.py b/testgen/common/database/flavor/trino_flavor_service.py index ce1133cc..d7a78339 100644 --- a/testgen/common/database/flavor/trino_flavor_service.py +++ b/testgen/common/database/flavor/trino_flavor_service.py @@ -1,17 +1,16 @@ from urllib.parse import quote_plus -from testgen.common.database.flavor.flavor_service import FlavorService +from testgen.common.database.flavor.flavor_service import FlavorService, ResolvedConnectionParams class TrinoFlavorService(FlavorService): - def get_connection_string_head(self): - return f"{self.flavor}://{self.username}:{quote_plus(self.password)}@" + url_scheme = "trino" - def get_connection_string_from_fields(self): + def get_connection_string_from_fields(self, params: ResolvedConnectionParams) -> str: # STANDARD FORMAT: strConnect = 'flavor://username:password@host:port/catalog' - return 
f"{self.flavor}://{self.username}:{quote_plus(self.password)}@{self.host}:{self.port}/{self.catalog}" + return f"{self.url_scheme}://{params.username}:{quote_plus(params.password)}@{params.host}:{params.port}/{params.catalog}" - def get_pre_connection_queries(self): + def get_pre_connection_queries(self, params: ResolvedConnectionParams) -> list[tuple[str, dict | None]]: return [ - (f"USE {self.catalog}.{self.dbschema}", None), + (f"USE {params.catalog}.{params.dbschema}", None), ] diff --git a/testgen/common/models/connection.py b/testgen/common/models/connection.py index 3d138396..97a5b83b 100644 --- a/testgen/common/models/connection.py +++ b/testgen/common/models/connection.py @@ -1,6 +1,7 @@ from collections.abc import Iterable from dataclasses import dataclass from typing import Literal, Self +from urllib.parse import parse_qs, urlparse from uuid import UUID, uuid4 import streamlit as st @@ -19,7 +20,6 @@ from sqlalchemy.dialects import postgresql from sqlalchemy.orm import InstrumentedAttribute -from testgen.common.database.database_service import get_flavor_service from testgen.common.database.flavor.flavor_service import SQLFlavor from testgen.common.models import get_current_session from testgen.common.models.custom_types import JSON_TYPE, EncryptedBytea, EncryptedJson @@ -119,15 +119,26 @@ def clear_cache(cls) -> bool: def save(self) -> None: if self.connect_by_url and self.url: - flavor_service = get_flavor_service(self.sql_flavor) - flavor_service.init(self.to_dict()) - - connection_parts = flavor_service.get_parts_from_connection_string() - if connection_parts: - self.project_host = connection_parts["host"] - self.project_port = connection_parts["port"] - self.project_db = connection_parts["dbname"] - self.http_path = connection_parts.get("http_path") or None - self.warehouse = connection_parts.get("warehouse") or None + # When connect_by_url=True, the URL is the source of truth. 
+ # Normalize it (strip scheme/credentials) and sync host/port/db fields from it. + url = self.url + if "://" in url: + url = url.split("://", 1)[1] + if "@" in url: + url = url.rsplit("@", 1)[1] + self.url = url + + parsed = urlparse(f"scheme://_@{url}") + location = parsed.netloc.split("@")[-1] + if ":" in location: + host, port = location.rsplit(":", 1) + else: + host, port = location, "" + self.project_host = host + self.project_port = port + self.project_db = parsed.path.strip("/").split("/")[0] if parsed.path.strip("/") else "" + extras = {k: v[0] for k, v in parse_qs(parsed.query).items()} + self.http_path = extras.get("http_path") or None + self.warehouse = extras.get("warehouse") or None super().save() diff --git a/testgen/ui/services/database_service.py b/testgen/ui/services/database_service.py index d9e8b6ec..8877a423 100644 --- a/testgen/ui/services/database_service.py +++ b/testgen/ui/services/database_service.py @@ -11,11 +11,12 @@ from typing import Any -from sqlalchemy import create_engine, text +from sqlalchemy import text from sqlalchemy.engine import Row, RowMapping from sqlalchemy.engine.cursor import CursorResult from testgen.common.database.database_service import get_flavor_service +from testgen.common.database.flavor.flavor_service import resolve_connection_params from testgen.common.models import get_current_session @@ -54,17 +55,13 @@ def fetch_one_from_db(query: str, params: dict | None = None) -> RowMapping | No def fetch_from_target_db(connection: Connection, query: str, params: dict | None = None) -> list[Row]: + connection_params = connection.to_dict() flavor_service = get_flavor_service(connection.sql_flavor) - flavor_service.init(connection.to_dict()) - - engine = create_engine( - flavor_service.get_connection_string(), - connect_args=flavor_service.get_connect_args(), - **flavor_service.get_engine_args(), - ) + resolved = resolve_connection_params(connection_params) + engine = flavor_service.create_engine(connection_params) with 
engine.connect() as conn: - for pre_query, pre_params in flavor_service.get_pre_connection_queries(): + for pre_query, pre_params in flavor_service.get_pre_connection_queries(resolved): conn.execute(text(pre_query), pre_params) cursor: CursorResult = conn.execute(text(query), params) return cursor.fetchall() diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index 47a3d068..15a0013a 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -18,6 +18,7 @@ import testgen.ui.services.database_service as db from testgen.commands.run_profiling import run_profiling_in_background from testgen.common.database.database_service import empty_cache, get_flavor_service +from testgen.common.database.flavor.flavor_service import resolve_connection_params from testgen.common.models import with_database_session from testgen.common.models.connection import Connection, ConnectionMinimal from testgen.common.models.scheduler import RUN_MONITORS_JOB_KEY, RUN_TESTS_JOB_KEY, JobSchedule @@ -170,8 +171,8 @@ def on_setup_table_group_clicked(*_args) -> None: connection_string: str | None = None flavor_service = get_flavor_service(connection.sql_flavor) - flavor_service.init({**connection.to_dict(), "project_pw_encrypted": ""}) - connection_string = flavor_service.get_connection_string().replace("%3E", ">").replace("%3C", "<") + params = resolve_connection_params({**connection.to_dict(), "project_pw_encrypted": ""}) + connection_string = flavor_service.get_connection_string(params).replace("%3E", ">").replace("%3C", "<") if should_save(): success = True From 8e68494c1126c491755d6daf5d92159ee23712f8 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 17 Mar 2026 23:32:36 -0400 Subject: [PATCH 50/95] fix: update doc links --- README.md | 18 +++++++++--------- pyproject.toml | 4 ++-- testgen/common/notifications/notifications.py | 2 +- .../frontend/js/components/help_menu.js | 6 +++--- .../frontend/js/pages/notification_settings.js | 2 
+- testgen/ui/components/widgets/page.py | 2 +- testgen/ui/static/js/components/help_menu.js | 6 +++--- testgen/ui/views/connections.py | 2 +- testgen/ui/views/data_catalog.py | 2 +- testgen/ui/views/hygiene_issues.py | 2 +- testgen/ui/views/monitors_dashboard.py | 2 +- testgen/ui/views/profiling_results.py | 2 +- testgen/ui/views/profiling_runs.py | 2 +- testgen/ui/views/project_dashboard.py | 2 +- testgen/ui/views/project_settings.py | 2 +- testgen/ui/views/quality_dashboard.py | 2 +- testgen/ui/views/score_details.py | 2 +- testgen/ui/views/score_explorer.py | 2 +- testgen/ui/views/table_groups.py | 2 +- testgen/ui/views/test_definitions.py | 2 +- testgen/ui/views/test_results.py | 2 +- testgen/ui/views/test_runs.py | 2 +- testgen/ui/views/test_suites.py | 2 +- 23 files changed, 36 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 670c5ad9..ef49135b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # DataOps Data Quality TestGen -![apache 2.0 license Badge](https://img.shields.io/badge/License%20-%20Apache%202.0%20-%20blue) ![PRs Badge](https://img.shields.io/badge/PRs%20-%20Welcome%20-%20green) [![Latest Version](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fhub.docker.com%2Fv2%2Frepositories%2Fdatakitchen%2Fdataops-testgen%2Ftags%2F&query=results%5B0%5D.name&label=latest%20version&color=06A04A)](https://hub.docker.com/r/datakitchen/dataops-testgen) [![Docker Pulls](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fhub.docker.com%2Fv2%2Frepositories%2Fdatakitchen%2Fdataops-testgen%2F&query=pull_count&style=flat&label=docker%20pulls&color=06A04A)](https://hub.docker.com/r/datakitchen/dataops-testgen) [![Documentation](https://img.shields.io/badge/docs-On%20datakitchen.io-06A04A?style=flat)](https://docs.datakitchen.io/articles/dataops-testgen-help/dataops-testgen-help) [![Static 
Badge](https://img.shields.io/badge/Slack-Join%20Discussion-blue?style=flat&logo=slack)](https://data-observability-slack.datakitchen.io/join) +![apache 2.0 license Badge](https://img.shields.io/badge/License%20-%20Apache%202.0%20-%20blue) ![PRs Badge](https://img.shields.io/badge/PRs%20-%20Welcome%20-%20green) [![Latest Version](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fhub.docker.com%2Fv2%2Frepositories%2Fdatakitchen%2Fdataops-testgen%2Ftags%2F&query=results%5B0%5D.name&label=latest%20version&color=06A04A)](https://hub.docker.com/r/datakitchen/dataops-testgen) [![Docker Pulls](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fhub.docker.com%2Fv2%2Frepositories%2Fdatakitchen%2Fdataops-testgen%2F&query=pull_count&style=flat&label=docker%20pulls&color=06A04A)](https://hub.docker.com/r/datakitchen/dataops-testgen) [![Documentation](https://img.shields.io/badge/docs-On%20datakitchen.io-06A04A?style=flat)](https://docs.datakitchen.io/testgen/what-is-testgen/) [![Static Badge](https://img.shields.io/badge/Slack-Join%20Discussion-blue?style=flat&logo=slack)](https://data-observability-slack.datakitchen.io/join) *

DataOps Data Quality TestGen, or "TestGen" for short, can help you find data issues so you can alert your users and notify your suppliers. It does this by delivering simple, fast data quality test generation and execution by data profiling, new dataset screening and hygiene review, algorithmic generation of data quality validation tests, ongoing production testing of new data refreshes, and continuous anomaly monitoring of datasets. TestGen is part of DataKitchen's Open Source Data Observability.

* @@ -7,7 +7,7 @@ [DataOps TestGen Overview](https://datakitchen.io/dataops-testgen-product/) -[DataOps TestGen Documentation](https://docs.datakitchen.io/articles/dataops-testgen-help/dataops-testgen-help) +[DataOps TestGen Documentation](https://docs.datakitchen.io/testgen/what-is-testgen/) ## Features @@ -68,7 +68,7 @@ Once the installation completes, verify that you can login to the UI with the UR ### Optional: Run the TestGen demo setup -The [Data Observability quickstart](https://docs.datakitchen.io/articles/open-source-data-observability/data-observability-overview) walks you through DataOps Data Quality TestGen capabilities to demonstrate how it covers critical use cases for data and analytic teams. +The [Data Observability quickstart](https://docs.datakitchen.io/tutorials/quickstart-demo/) walks you through DataOps Data Quality TestGen capabilities to demonstrate how it covers critical use cases for data and analytic teams. ```shell python3 dk-installer.py tg run-demo @@ -110,7 +110,7 @@ Within the virtual environment, install the TestGen package with pip. pip install dataops-testgen ``` -Verify that the [_testgen_ command line](https://docs.datakitchen.io/articles/dataops-testgen-help/testgen-commands-and-details) works. +Verify that the [_testgen_ command line](https://docs.datakitchen.io/testgen/cli-reference/) works. ```shell testgen --help ``` @@ -165,7 +165,7 @@ Verify that you can login to the UI with the `TESTGEN_USERNAME` and `TESTGEN_PAS ### Optional: Run the TestGen demo setup -The [Data Observability quickstart](https://docs.datakitchen.io/articles/open-source-data-observability/data-observability-overview) walks you through DataOps Data Quality TestGen capabilities to demonstrate how it covers critical use cases for data and analytic teams. 
+The [Data Observability quickstart](https://docs.datakitchen.io/tutorials/quickstart-demo/) walks you through DataOps Data Quality TestGen capabilities to demonstrate how it covers critical use cases for data and analytic teams. ```shell testgen quick-start @@ -187,7 +187,7 @@ python3 dk-installer.py tg delete-demo ### Upgrade to latest version -New releases of TestGen are announced on the `#releases` channel on [Data Observability Slack](https://data-observability-slack.datakitchen.io/join), and release notes can be found on the [DataKitchen documentation portal](https://docs.datakitchen.io/articles/dataops-testgen-help/testgen-release-notes/a/h1_1691719522). Use the following command to upgrade to the latest released version. +New releases of TestGen are announced on the `#releases` channel on [Data Observability Slack](https://data-observability-slack.datakitchen.io/join), and release notes can be found on the [DataKitchen documentation portal](https://docs.datakitchen.io/testgen/release-notes/). Use the following command to upgrade to the latest released version. ```shell python3 dk-installer.py tg upgrade @@ -203,7 +203,7 @@ python3 dk-installer.py tg delete ### Access the _testgen_ CLI -The [_testgen_ command line](https://docs.datakitchen.io/articles/dataops-testgen-help/testgen-commands-and-details) can be accessed within the running container. +The [_testgen_ command line](https://docs.datakitchen.io/testgen/cli-reference/) can be accessed within the running container. ```shell docker compose exec engine bash @@ -226,13 +226,13 @@ docker compose up -d ## What Next? ### Getting started guide -We recommend you start by going through the [Data Observability Overview Demo](https://docs.datakitchen.io/articles/open-source-data-observability/data-observability-overview). +We recommend you start by going through the [Data Observability Overview Demo](https://docs.datakitchen.io/tutorials/quickstart-demo/). 
### Support For support requests, [join the Data Observability Slack](https://data-observability-slack.datakitchen.io/join) 👋 and post on the `#support` channel. ### Connect to your database -Follow [these instructions](https://docs.datakitchen.io/articles/dataops-testgen-help/connect-your-database) to improve the quality of data in your database. +Follow [these instructions](https://docs.datakitchen.io/testgen/connect-your-database/) to improve the quality of data in your database. ### Community Talk and learn with other data practitioners who are building with DataKitchen. Share knowledge, get help, and contribute to our open-source project. diff --git a/pyproject.toml b/pyproject.toml index 2740345d..bd7a2982 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,8 +113,8 @@ tg-patch-streamlit = "testgen.ui.scripts.patch_streamlit:patch" [project.urls] "Source Code" = "https://github.com/DataKitchen/dataops-testgen" "Bug Tracker" = "https://github.com/DataKitchen/dataops-testgen/issues" -"Documentation" = "https://docs.datakitchen.io/articles/dataops-testgen-help/dataops-testgen-help" -"Release Notes" = "https://docs.datakitchen.io/articles/dataops-testgen-help/testgen-release-notes" +"Documentation" = "https://docs.datakitchen.io/testgen/what-is-testgen/" +"Release Notes" = "https://docs.datakitchen.io/testgen/release-notes/" "Slack" = "https://data-observability-slack.datakitchen.io/join" "Homepage" = "https://example.com" diff --git a/testgen/common/notifications/notifications.py b/testgen/common/notifications/notifications.py index b4343e2e..68e20732 100644 --- a/testgen/common/notifications/notifications.py +++ b/testgen/common/notifications/notifications.py @@ -393,7 +393,7 @@ def get_body_template(self) -> str: - TestGen Help diff --git a/testgen/ui/components/frontend/js/components/help_menu.js b/testgen/ui/components/frontend/js/components/help_menu.js index 3ea341db..45b2da24 100644 --- a/testgen/ui/components/frontend/js/components/help_menu.js +++ 
b/testgen/ui/components/frontend/js/components/help_menu.js @@ -23,9 +23,9 @@ import { Icon } from './icon.js'; const { a, div, span } = van.tags; -const baseHelpUrl = 'https://docs.datakitchen.io/articles/dataops-testgen-help/'; -const releaseNotesTopic = 'testgen-release-notes'; -const upgradeTopic = 'upgrade-testgen'; +const baseHelpUrl = 'https://docs.datakitchen.io/testgen/'; +const releaseNotesTopic = 'release-notes/'; +const upgradeTopic = 'administer/upgrade-testgen/'; const slackUrl = 'https://data-observability-slack.datakitchen.io/join'; const trainingUrl = 'https://info.datakitchen.io/data-quality-training-and-certifications'; diff --git a/testgen/ui/components/frontend/js/pages/notification_settings.js b/testgen/ui/components/frontend/js/pages/notification_settings.js index 570115de..55f45616 100644 --- a/testgen/ui/components/frontend/js/pages/notification_settings.js +++ b/testgen/ui/components/frontend/js/pages/notification_settings.js @@ -66,7 +66,7 @@ const NotificationSettings = (/** @type Properties */ props) => { class: 'notifications--empty', link: { label: 'View documentation', - href: 'https://docs.datakitchen.io/articles/dataops-testgen-help/configure-email-server', + href: 'https://docs.datakitchen.io/testgen/administer/configure-email-server/', open_new: true, }, }); diff --git a/testgen/ui/components/widgets/page.py b/testgen/ui/components/widgets/page.py index c6c68148..7f512b17 100644 --- a/testgen/ui/components/widgets/page.py +++ b/testgen/ui/components/widgets/page.py @@ -9,7 +9,7 @@ from testgen.ui.session import session from testgen.ui.views.dialogs.application_logs_dialog import application_logs_dialog -UPGRADE_URL = "https://docs.datakitchen.io/articles/dataops-testgen-help/upgrade-testgen" +UPGRADE_URL = "https://docs.datakitchen.io/testgen/administer/upgrade-testgen/" def page_header( diff --git a/testgen/ui/static/js/components/help_menu.js b/testgen/ui/static/js/components/help_menu.js index 3ea341db..45b2da24 100644 --- 
a/testgen/ui/static/js/components/help_menu.js +++ b/testgen/ui/static/js/components/help_menu.js @@ -23,9 +23,9 @@ import { Icon } from './icon.js'; const { a, div, span } = van.tags; -const baseHelpUrl = 'https://docs.datakitchen.io/articles/dataops-testgen-help/'; -const releaseNotesTopic = 'testgen-release-notes'; -const upgradeTopic = 'upgrade-testgen'; +const baseHelpUrl = 'https://docs.datakitchen.io/testgen/'; +const releaseNotesTopic = 'release-notes/'; +const upgradeTopic = 'administer/upgrade-testgen/'; const slackUrl = 'https://data-observability-slack.datakitchen.io/join'; const trainingUrl = 'https://info.datakitchen.io/data-quality-training-and-certifications'; diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index 15a0013a..b9f996d3 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -66,7 +66,7 @@ class ConnectionsPage(Page): def render(self, project_code: str, **_kwargs) -> None: testgen.page_header( PAGE_TITLE, - "manage-connections", + "connect-your-database/manage-connections/", ) connections = Connection.select_where(Connection.project_code == project_code) diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index 00881dd1..b48bd6fd 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -60,7 +60,7 @@ def render( ) -> None: testgen.page_header( PAGE_TITLE, - "data-catalog", + "data-catalog/", ) _, loading_column = st.columns([0.4, 0.6]) diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 4f4b52fe..b58cc615 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -71,7 +71,7 @@ def render( testgen.page_header( "Hygiene Issues", - "data-hygiene-issues", + "data-profiling/data-hygiene-issues/", breadcrumbs=[ { "label": "Profiling Runs", "path": "profiling-runs", "params": { "project_code": run.project_code } }, { "label": f"{run.table_groups_name} | 
{run_date}" }, diff --git a/testgen/ui/views/monitors_dashboard.py b/testgen/ui/views/monitors_dashboard.py index 491789be..1da90672 100644 --- a/testgen/ui/views/monitors_dashboard.py +++ b/testgen/ui/views/monitors_dashboard.py @@ -75,7 +75,7 @@ def render( ) -> None: testgen.page_header( PAGE_TITLE, - "monitor-tables", + "monitor-tables/", ) project_summary = Project.get_summary(project_code) diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index a71dbeb7..cabedd33 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -56,7 +56,7 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | testgen.page_header( "Data Profiling Results", - "investigate-profiling-results", + "data-profiling/investigate-profiling-results/", breadcrumbs=[ { "label": "Profiling Runs", "path": "profiling-runs", "params": { "project_code": run.project_code } }, { "label": f"{run.table_groups_name} | {run_date}" }, diff --git a/testgen/ui/views/profiling_runs.py b/testgen/ui/views/profiling_runs.py index 40ee4487..c13e1a35 100644 --- a/testgen/ui/views/profiling_runs.py +++ b/testgen/ui/views/profiling_runs.py @@ -49,7 +49,7 @@ class DataProfilingPage(Page): def render(self, project_code: str, table_group_id: str | None = None, **_kwargs) -> None: testgen.page_header( PAGE_TITLE, - "data-profiling", + "data-profiling/", ) with st.spinner("Loading data ..."): diff --git a/testgen/ui/views/project_dashboard.py b/testgen/ui/views/project_dashboard.py index 0fef708e..6425378f 100644 --- a/testgen/ui/views/project_dashboard.py +++ b/testgen/ui/views/project_dashboard.py @@ -30,7 +30,7 @@ class ProjectDashboardPage(Page): def render(self, project_code: str, **_kwargs): testgen.page_header( PAGE_TITLE, - "project-dashboard", + "project-dashboard/", ) with st.spinner("Loading data ..."): diff --git a/testgen/ui/views/project_settings.py b/testgen/ui/views/project_settings.py index 
aefc66a0..c28fc72c 100644 --- a/testgen/ui/views/project_settings.py +++ b/testgen/ui/views/project_settings.py @@ -37,7 +37,7 @@ def render(self, project_code: str | None = None, **_kwargs) -> None: testgen.page_header( PAGE_TITLE, - "manage-projects", + "manage-projects/", ) get_test_results, set_test_results = temp_value(f"project_settings:{project_code}", default=None) diff --git a/testgen/ui/views/quality_dashboard.py b/testgen/ui/views/quality_dashboard.py index 4391b6d7..d8460fbc 100644 --- a/testgen/ui/views/quality_dashboard.py +++ b/testgen/ui/views/quality_dashboard.py @@ -28,7 +28,7 @@ class QualityDashboardPage(Page): def render(self, *, project_code: str, **_kwargs) -> None: project_summary = Project.get_summary(project_code) - testgen.page_header(PAGE_TITLE, "quality-scores") + testgen.page_header(PAGE_TITLE, "quality-scores/") testgen.testgen_component( "quality_dashboard", props={ diff --git a/testgen/ui/views/score_details.py b/testgen/ui/views/score_details.py index c2458036..629bffe9 100644 --- a/testgen/ui/views/score_details.py +++ b/testgen/ui/views/score_details.py @@ -74,7 +74,7 @@ def render( testgen.page_header( "Score Details", - "view-score-details", + "quality-scores/view-score-details/", breadcrumbs=[ {"path": "quality-dashboard", "label": "Quality Dashboard", "params": {"project_code": score_definition.project_code}}, {"label": score_definition.name}, diff --git a/testgen/ui/views/score_explorer.py b/testgen/ui/views/score_explorer.py index 841b156a..391d0a2f 100644 --- a/testgen/ui/views/score_explorer.py +++ b/testgen/ui/views/score_explorer.py @@ -85,7 +85,7 @@ def render( page_title = "Edit Scorecard" last_breadcrumb = original_score_definition.name - testgen.page_header(page_title, "explore-and-create-scorecards", breadcrumbs=[ + testgen.page_header(page_title, "quality-scores/explore-and-create-scorecards/", breadcrumbs=[ {"path": "quality-dashboard", "label": "Quality Dashboard", "params": {"project_code": project_code}}, 
{"label": last_breadcrumb}, ]) diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 9f81a2f5..54e6868f 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -49,7 +49,7 @@ def render( table_group_name: str | None = None, **_kwargs, ) -> None: - testgen.page_header(PAGE_TITLE, "manage-table-groups") + testgen.page_header(PAGE_TITLE, "connect-your-database/manage-table-groups/") user_can_edit = session.auth.user_has_permission("edit") project_summary = Project.get_summary(project_code) diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 8473a446..06b01c9c 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -79,7 +79,7 @@ def render( testgen.page_header( "Test Definitions", - "test-definitions", + "generate-tests/test-definitions/", breadcrumbs=[ { "label": "Test Suites", "path": "test-suites", "params": { "project_code": project_code } }, { "label": test_suite.test_suite }, diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 99b035c9..e9bfd6e0 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -87,7 +87,7 @@ def render( testgen.page_header( "Test Results", - "investigate-test-results", + "data-quality-testing/investigate-test-results/", breadcrumbs=[ { "label": "Test Runs", "path": "test-runs", "params": { "project_code": run.project_code } }, { "label": f"{run.test_suite} | {run_date}" }, diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index ea30c3fd..3a14c058 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -51,7 +51,7 @@ class TestRunsPage(Page): def render(self, project_code: str, table_group_id: str | None = None, test_suite_id: str | None = None, **_kwargs) -> None: testgen.page_header( PAGE_TITLE, - "data-quality-testing", + "data-quality-testing/", ) with st.spinner("Loading 
data ..."): diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 033972b5..0be71483 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -40,7 +40,7 @@ class TestSuitesPage(Page): def render(self, project_code: str, table_group_id: str | None = None, **_kwargs) -> None: testgen.page_header( PAGE_TITLE, - "manage-test-suites", + "connect-your-database/manage-test-suites/", ) table_groups = TableGroup.select_minimal_where(TableGroup.project_code == project_code) From 8b39bbbe48f2a6042a2db92a5308a24b97276184 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 20 Mar 2026 00:41:22 -0400 Subject: [PATCH 51/95] fix(monitors): generate freshness monitors when profiling data already exists When a monitor suite is created after profiling has already run, Freshness_Trend monitors were never generated because all UI creation paths only generate Volume_Trend + Schema_Drift. Freshness generation only triggers during profiling, which may not run again. 
- Include Freshness_Trend at creation time if profiling data exists (settings dialog) - Auto-generate missing freshness monitors during monitor execution as resilience - Show specific guidance in Table Monitors dialog when freshness is not configured - Disable trends button for tables with no monitor results yet Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/commands/queries/execute_tests_query.py | 4 ++++ testgen/commands/run_test_execution.py | 7 +++++++ .../execution/get_missing_freshness_monitors.sql | 10 ++++++++++ .../frontend/js/pages/edit_table_monitors.js | 5 ++++- .../components/frontend/js/pages/monitors_dashboard.js | 1 + testgen/ui/views/monitors_dashboard.py | 5 ++++- 6 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 testgen/template/execution/get_missing_freshness_monitors.sql diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index 9501e04b..539a6dd0 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -374,6 +374,10 @@ def has_schema_changes(self) -> tuple[dict]: # Runs on App database return self._get_query("has_schema_changes.sql") + def get_missing_freshness_monitors(self) -> tuple[str, dict]: + # Runs on App database + return self._get_query("get_missing_freshness_monitors.sql") + def get_errored_autogen_monitors(self) -> tuple[str, dict]: # Runs on App database return self._get_query("get_errored_autogen_monitors.sql") diff --git a/testgen/commands/run_test_execution.py b/testgen/commands/run_test_execution.py index a809ad20..7adb15c4 100644 --- a/testgen/commands/run_test_execution.py +++ b/testgen/commands/run_test_execution.py @@ -196,6 +196,13 @@ def _sync_monitor_definitions(sql_generator: TestExecutionSQL) -> None: # Freshness monitors will be inserted after profiling run_monitor_generation(test_suite_id, ["Volume_Trend"], mode="insert") + # Autogenerate missing freshness monitors if 
profiling data exists + if sql_generator.table_group.last_complete_profile_run_id: + missing_monitors = fetch_dict_from_db(*sql_generator.get_missing_freshness_monitors()) + if missing_monitors: + table_names = [row["table_name"] for row in missing_monitors] + run_monitor_generation(test_suite_id, ["Freshness_Trend"], mode="insert", table_names=table_names) + # Regenerate monitors that errored in previous run errored_monitors = fetch_dict_from_db(*sql_generator.get_errored_autogen_monitors()) if errored_monitors: diff --git a/testgen/template/execution/get_missing_freshness_monitors.sql b/testgen/template/execution/get_missing_freshness_monitors.sql new file mode 100644 index 00000000..a81fab51 --- /dev/null +++ b/testgen/template/execution/get_missing_freshness_monitors.sql @@ -0,0 +1,10 @@ +SELECT DISTINCT dtc.table_name +FROM data_table_chars dtc +WHERE dtc.table_groups_id = :TABLE_GROUPS_ID ::UUID + AND dtc.drop_date IS NULL + AND dtc.table_name NOT IN ( + SELECT table_name + FROM test_definitions + WHERE test_suite_id = :TEST_SUITE_ID ::UUID + AND test_type = 'Freshness_Trend' + ); diff --git a/testgen/ui/components/frontend/js/pages/edit_table_monitors.js b/testgen/ui/components/frontend/js/pages/edit_table_monitors.js index c1bdb4c6..c1edc25a 100644 --- a/testgen/ui/components/frontend/js/pages/edit_table_monitors.js +++ b/testgen/ui/components/frontend/js/pages/edit_table_monitors.js @@ -190,9 +190,12 @@ const EditTableMonitors = (/** @type Properties */ props) => { const selectedDef = getValue(props.definitions).find(td => td.test_type === type); if (!selectedDef) { + const message = type === 'Freshness_Trend' + ? 'Freshness monitor not yet configured. Run profiling to auto-generate.' 
+ : 'Monitor not configured for this table.'; return Card({ class: 'edit-monitors--empty flex-row fx-justify-center', - content: 'Monitor not configured for this table.', + content: message, }); } diff --git a/testgen/ui/components/frontend/js/pages/monitors_dashboard.js b/testgen/ui/components/frontend/js/pages/monitors_dashboard.js index b5294681..e8beabb9 100644 --- a/testgen/ui/components/frontend/js/pages/monitors_dashboard.js +++ b/testgen/ui/components/frontend/js/pages/monitors_dashboard.js @@ -254,6 +254,7 @@ const MonitorsDashboard = (/** @type Properties */ props) => { type: 'icon', tooltip: 'View table trends', tooltipPosition: 'top-left', + disabled: monitor.freshness_is_pending && monitor.volume_is_pending && monitor.schema_is_pending && monitor.metric_is_pending, style: 'color: var(--secondary-text-color);', onclick: () => openChartsDialog(monitor), }), diff --git a/testgen/ui/views/monitors_dashboard.py b/testgen/ui/views/monitors_dashboard.py index 1da90672..0eb0515a 100644 --- a/testgen/ui/views/monitors_dashboard.py +++ b/testgen/ui/views/monitors_dashboard.py @@ -557,7 +557,10 @@ def on_save_settings_clicked(payload: dict) -> None: updated_table_group = TableGroup.get(table_group.id) updated_table_group.monitor_test_suite_id = monitor_suite.id updated_table_group.save() - run_monitor_generation(monitor_suite.id, ["Volume_Trend", "Schema_Drift"]) + monitors: list[str] = ["Volume_Trend", "Schema_Drift"] + if updated_table_group.last_complete_profile_run_id: + monitors.append("Freshness_Trend") + run_monitor_generation(monitor_suite.id, monitors) st.rerun() From f82e34cc7ea764bd884217a1183a95f2ccbbed4c Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 08:35:16 -0300 Subject: [PATCH 52/95] refactor: introduce database_session context manager MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the with_database_session decorator internals with a proper context manager that commits on 
clean exit and rolls back on Exception. The decorator now delegates to database_session(). Uses except Exception (not BaseException) so Streamlit's RerunException bypasses both rollback and auto-commit — safe_rerun() handles explicit commits before reruns. Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/common/models/__init__.py | 46 +++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/testgen/common/models/__init__.py b/testgen/common/models/__init__.py index 3734b11b..6e2b581c 100644 --- a/testgen/common/models/__init__.py +++ b/testgen/common/models/__init__.py @@ -1,3 +1,4 @@ +import contextlib import functools import platform import threading @@ -32,26 +33,41 @@ _current_session_wrapper.value = None -def with_database_session(func): - """ - Set up a thread-global SQLAlchemy session to be accessed - calling `get_current_session()` from any place. +@contextlib.contextmanager +def database_session(): + """Provide a thread-local SQLAlchemy session. - NOTE: Call once on the main entry point. - """ + Nested: yields existing session, no lifecycle management. + Owning: commits on clean exit, rolls back on Exception. - @functools.wraps(func) - def wrapper(*args, **kwargs): + Uses ``except Exception`` (not ``BaseException``) so that Streamlit's + ``RerunException`` (a ``BaseException`` subclass) bypasses both rollback + and auto-commit. If ``safe_rerun()`` was called, it already committed. 
+ """ + existing = get_current_session() + if existing: + yield existing + return + with Session() as session: + _current_session_wrapper.value = session try: - session = get_current_session() - if session: - return func(*args, **kwargs) - - with Session() as session: - _current_session_wrapper.value = session - return func(*args, **kwargs) + yield session + except Exception: + session.rollback() + raise + else: + session.commit() finally: _current_session_wrapper.value = None + + +def with_database_session(func): + """Decorator form of :func:`database_session`.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + with database_session(): + return func(*args, **kwargs) return wrapper From a0be7152e6b04c1bd949e99bb8016bf0dae72355 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 08:35:23 -0300 Subject: [PATCH 53/95] feat: add safe_rerun to prevent data loss on Streamlit rerun Create safe_rerun() in ui/services/rerun_service.py that commits the current database session before calling st.rerun(). This prevents flushed-but-uncommitted changes from being silently discarded when RerunException propagates through the session context manager. Also replace st.rerun() in form_service.reset_post_updates(). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/ui/services/form_service.py | 3 ++- testgen/ui/services/rerun_service.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 testgen/ui/services/rerun_service.py diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index 8426f3c2..b55c9f04 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -11,6 +11,7 @@ from testgen.ui.components import widgets as testgen from testgen.ui.navigation.router import Router +from testgen.ui.services.rerun_service import safe_rerun """ Shared rendering of UI elements @@ -77,7 +78,7 @@ def reset_post_updates(str_message=None, as_toast=False, clear_cache=True, lst_c fcn.clear() else: st.cache_data.clear() - st.rerun() + safe_rerun() def render_html_list(dct_row, lst_columns, str_section_header=None, int_data_width=300, lst_labels=None): diff --git a/testgen/ui/services/rerun_service.py b/testgen/ui/services/rerun_service.py new file mode 100644 index 00000000..6c0b7ea1 --- /dev/null +++ b/testgen/ui/services/rerun_service.py @@ -0,0 +1,17 @@ +from typing import Literal, NoReturn + +import streamlit as st + +from testgen.common.models import get_current_session + + +def safe_rerun(*, scope: Literal["app", "fragment"] = "app") -> NoReturn: + """Commit any pending database changes, then trigger a Streamlit rerun. + + Prevents data loss when RerunException propagates through the + session context manager in app.py:render(). 
+ """ + session = get_current_session() + if session: + session.commit() + st.rerun(scope=scope) From e43a3ab0ea424a6075316740dc09abc7d4043bf6 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 08:35:36 -0300 Subject: [PATCH 54/95] refactor: replace st.rerun with safe_rerun in UI code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace all ~35 st.rerun() callsites across dialogs, views, and components with safe_rerun() to ensure the database session is committed before Streamlit's RerunException propagates. Also clean up inline with_database_session patterns in manage_schedules.py and manage_notifications.py: - with_database_session(fn)() → with database_session(): fn() - Direct Session() bypass → get_current_session() Co-Authored-By: Claude Opus 4.6 (1M context) --- .../ui/components/widgets/download_dialog.py | 3 +- testgen/ui/components/widgets/page.py | 3 +- testgen/ui/components/widgets/wizard.py | 3 +- testgen/ui/session.py | 3 +- testgen/ui/views/connections.py | 5 +- testgen/ui/views/data_catalog.py | 5 +- .../ui/views/dialogs/generate_tests_dialog.py | 3 +- .../views/dialogs/import_metadata_dialog.py | 3 +- .../ui/views/dialogs/manage_notifications.py | 8 +-- testgen/ui/views/dialogs/manage_schedules.py | 52 ++++++++++--------- .../ui/views/dialogs/run_profiling_dialog.py | 3 +- testgen/ui/views/dialogs/run_tests_dialog.py | 3 +- testgen/ui/views/monitors_dashboard.py | 11 ++-- testgen/ui/views/profiling_runs.py | 5 +- testgen/ui/views/score_explorer.py | 3 +- testgen/ui/views/table_groups.py | 5 +- testgen/ui/views/test_definitions.py | 11 ++-- testgen/ui/views/test_runs.py | 5 +- testgen/ui/views/test_suites.py | 5 +- 19 files changed, 80 insertions(+), 59 deletions(-) diff --git a/testgen/ui/components/widgets/download_dialog.py b/testgen/ui/components/widgets/download_dialog.py index 712eeaa0..7e403fd1 100644 --- a/testgen/ui/components/widgets/download_dialog.py +++ 
b/testgen/ui/components/widgets/download_dialog.py @@ -9,6 +9,7 @@ import streamlit as st from testgen.common import date_service +from testgen.ui.services.rerun_service import safe_rerun PROGRESS_UPDATE_TYPE = Callable[[float], None] @@ -152,7 +153,7 @@ def render_button(): mime=file_type, use_container_width=True, ): - st.rerun() + safe_rerun() with button_col: render_button() diff --git a/testgen/ui/components/widgets/page.py b/testgen/ui/components/widgets/page.py index 7f512b17..b85c8fdf 100644 --- a/testgen/ui/components/widgets/page.py +++ b/testgen/ui/components/widgets/page.py @@ -6,6 +6,7 @@ from testgen.ui.components.widgets.breadcrumbs import Breadcrumb from testgen.ui.components.widgets.breadcrumbs import breadcrumbs as tg_breadcrumbs from testgen.ui.components.widgets.testgen_component import testgen_component +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session from testgen.ui.views.dialogs.application_logs_dialog import application_logs_dialog @@ -47,7 +48,7 @@ def close_help(rerun: bool = False) -> None: flex_row_end() st.markdown("Help :material/keyboard_arrow_down:") if rerun: - st.rerun() + safe_rerun() def open_app_logs(): close_help() diff --git a/testgen/ui/components/widgets/wizard.py b/testgen/ui/components/widgets/wizard.py index 1b87da1e..31baeaa3 100644 --- a/testgen/ui/components/widgets/wizard.py +++ b/testgen/ui/components/widgets/wizard.py @@ -8,6 +8,7 @@ from testgen.ui.components import widgets as testgen from testgen.ui.navigation.router import Router +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import temp_value ResultsType = typing.TypeVar("ResultsType", bound=typing.Any | None) @@ -175,7 +176,7 @@ def complete(self, container: DeltaGenerator) -> None: do_rerun = self._on_complete(**kwargs) self._reset() if do_rerun: - st.rerun() + safe_rerun() def _reset(self) -> None: del st.session_state[self._key] diff --git a/testgen/ui/session.py 
b/testgen/ui/session.py index e1525d37..9f50ed33 100644 --- a/testgen/ui/session.py +++ b/testgen/ui/session.py @@ -11,6 +11,7 @@ import streamlit as st from streamlit.runtime.state import SessionStateProxy +from testgen.ui.services.rerun_service import safe_rerun from testgen.utils.singleton import Singleton T = TypeVar("T") @@ -59,7 +60,7 @@ def __delattr__(self, key: str) -> None: def set_sidebar_project(self, project_code: str) -> None: if project_code != self.sidebar_project: self.sidebar_project = project_code - st.rerun() + safe_rerun() def temp_value(session_key: str, *, default: T | None = None) -> tuple[TempValueGetter[T | None], TempValueSetter[T]]: diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index b9f996d3..b7f10094 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -28,6 +28,7 @@ from testgen.ui.components import widgets as testgen from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value from testgen.ui.utils import get_cron_sample_handler @@ -305,7 +306,7 @@ def on_close_clicked(_params: dict) -> None: get_close_dialog, set_close_dialog = temp_value(f"connections:{connection_id}:close", default=False) if (get_close_dialog()): - st.rerun() + safe_rerun() get_new_table_group, set_new_table_group = temp_value( f"connections:{connection_id}:table_group", @@ -467,7 +468,7 @@ def on_close_clicked(_params: dict) -> None: LOG.exception(message) else: LOG.info("Table group %s created", table_group.id) - st.rerun() + safe_rerun() except Exception as error: message = "Error creating table group" success = False diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index b48bd6fd..74da93ea 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -34,6 +34,7 @@ get_tables_by_table_group, ) from 
testgen.ui.services.database_service import execute_db_query, fetch_all_from_db +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value from testgen.ui.views.dialogs.column_history_dialog import column_history_dialog from testgen.ui.views.dialogs.data_preview_dialog import data_preview_dialog @@ -356,7 +357,7 @@ def remove_table_dialog(item: dict) -> None: for func in [get_table_group_columns, get_tag_values]: func.clear() st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() - st.rerun() + safe_rerun() def on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DATA_TYPE: @@ -408,7 +409,7 @@ def on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DA for func in [get_table_group_columns, get_table_by_id, get_column_by_id, get_tag_values]: func.clear() st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() - st.rerun() + safe_rerun() def export_metadata_csv(table_group: TableGroupMinimal) -> None: diff --git a/testgen/ui/views/dialogs/generate_tests_dialog.py b/testgen/ui/views/dialogs/generate_tests_dialog.py index 0da5e623..094921cf 100644 --- a/testgen/ui/views/dialogs/generate_tests_dialog.py +++ b/testgen/ui/views/dialogs/generate_tests_dialog.py @@ -7,6 +7,7 @@ from testgen.common.models.test_suite import TestSuiteMinimal from testgen.ui.components import widgets as testgen from testgen.ui.services.database_service import execute_db_query, fetch_all_from_db, fetch_one_from_db +from testgen.ui.services.rerun_service import safe_rerun @st.dialog(title="Generate Tests") @@ -77,7 +78,7 @@ def generate_tests_dialog(test_suite: TestSuiteMinimal) -> None: status_container.success(f"Test generation completed for test suite **{test_suite_name}**.") time.sleep(1) st.cache_data.clear() - st.rerun() + safe_rerun() def get_test_suite_refresh_warning(test_suite_id: str) -> tuple[int, int, int]: diff --git 
a/testgen/ui/views/dialogs/import_metadata_dialog.py b/testgen/ui/views/dialogs/import_metadata_dialog.py index 92d9a2dc..aa639745 100644 --- a/testgen/ui/views/dialogs/import_metadata_dialog.py +++ b/testgen/ui/views/dialogs/import_metadata_dialog.py @@ -11,6 +11,7 @@ from testgen.ui.components.widgets.testgen_component import testgen_component from testgen.ui.queries.profiling_queries import TAG_FIELDS from testgen.ui.services.database_service import execute_db_query, fetch_all_from_db +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import temp_value LOG = logging.getLogger("testgen") @@ -387,7 +388,7 @@ def on_file_cleared(_payload: dict) -> None: if result and result["success"]: time.sleep(2) - st.rerun() + safe_rerun() def _build_preview_props(preview: dict) -> dict: diff --git a/testgen/ui/views/dialogs/manage_notifications.py b/testgen/ui/views/dialogs/manage_notifications.py index c1037d4a..4a14637c 100644 --- a/testgen/ui/views/dialogs/manage_notifications.py +++ b/testgen/ui/views/dialogs/manage_notifications.py @@ -6,10 +6,11 @@ import streamlit as st -from testgen.common.models import with_database_session +from testgen.common.models import database_session, with_database_session from testgen.common.models.notification_settings import NotificationSettings, NotificationSettingsValidationError from testgen.common.models.settings import PersistedSetting from testgen.ui.components import widgets +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value LOG = logging.getLogger("testgen") @@ -41,7 +42,8 @@ def decorator(method): @wraps(method) def wrapper(self, *args, **kwargs): try: - with_database_session(method)(self, *args, **kwargs) + with database_session(): + method(self, *args, **kwargs) except NotificationSettingsValidationError as e: success = False message = str(e) @@ -55,7 +57,7 @@ def wrapper(self, *args, **kwargs): # The ever-changing "idx" is useful to force 
refreshing the component self.set_result({"success": success, "message": message, "idx": next(self._result_idx)}) - st.rerun(scope="fragment") + safe_rerun(scope="fragment") return wrapper return decorator diff --git a/testgen/ui/views/dialogs/manage_schedules.py b/testgen/ui/views/dialogs/manage_schedules.py index 82ff0551..346ed819 100644 --- a/testgen/ui/views/dialogs/manage_schedules.py +++ b/testgen/ui/views/dialogs/manage_schedules.py @@ -6,9 +6,10 @@ import streamlit as st from sqlalchemy.exc import IntegrityError -from testgen.common.models import Session, with_database_session +from testgen.common.models import database_session, get_current_session, with_database_session from testgen.common.models.scheduler import JobSchedule from testgen.ui.components import widgets as testgen +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value from testgen.ui.utils import get_cron_sample_handler @@ -44,17 +45,17 @@ def render(self) -> None: @with_database_session def on_delete_sched(item): JobSchedule.delete(item["id"]) - st.rerun(scope="fragment") + safe_rerun(scope="fragment") @with_database_session def on_pause_sched(item): JobSchedule.update_active(item["id"], False) - st.rerun(scope="fragment") + safe_rerun(scope="fragment") @with_database_session def on_resume_sched(item): JobSchedule.update_active(item["id"], True) - st.rerun(scope="fragment") + safe_rerun(scope="fragment") def on_add_schedule(payload: dict[str, str]): set_arg_value(payload["arg_value"]) @@ -98,7 +99,8 @@ def on_add_schedule(payload: dict[str, str]): args=args, kwargs=kwargs, ) - with_database_session(sched_model.save)() + with database_session(): + sched_model.save() else: success = False message = "Complete all the fields before adding the schedule" @@ -113,26 +115,26 @@ def on_add_schedule(payload: dict[str, str]): message = "Error validating the Cron expression" results = {"success": success, "message": message} - with Session() as 
db_session: - scheduled_jobs = ( - db_session.query(JobSchedule) - .where(JobSchedule.project_code == self.project_code, JobSchedule.key == self.job_key) - ) - scheduled_jobs_json = [] - for job in scheduled_jobs: - job_json = { - "id": str(job.id), - "argValue": self.get_arg_value(job), - "cronExpr": job.cron_expr, - "readableExpr": cron_descriptor.get_description(job.cron_expr), - "cronTz": job.cron_tz_str, - "sample": [ - sample.strftime("%a %b %-d, %-I:%M %p") - for sample in job.get_sample_triggering_timestamps(CRON_SAMPLE_COUNT + 1) - ], - "active": job.active, - } - scheduled_jobs_json.append(job_json) + db_session = get_current_session() + scheduled_jobs = ( + db_session.query(JobSchedule) + .where(JobSchedule.project_code == self.project_code, JobSchedule.key == self.job_key) + ) + scheduled_jobs_json = [] + for job in scheduled_jobs: + job_json = { + "id": str(job.id), + "argValue": self.get_arg_value(job), + "cronExpr": job.cron_expr, + "readableExpr": cron_descriptor.get_description(job.cron_expr), + "cronTz": job.cron_tz_str, + "sample": [ + sample.strftime("%a %b %-d, %-I:%M %p") + for sample in job.get_sample_triggering_timestamps(CRON_SAMPLE_COUNT + 1) + ], + "active": job.active, + } + scheduled_jobs_json.append(job_json) testgen.css_class("l-dialog") testgen.testgen_component( diff --git a/testgen/ui/views/dialogs/run_profiling_dialog.py b/testgen/ui/views/dialogs/run_profiling_dialog.py index 74d6dc02..88c77cc7 100644 --- a/testgen/ui/views/dialogs/run_profiling_dialog.py +++ b/testgen/ui/views/dialogs/run_profiling_dialog.py @@ -8,6 +8,7 @@ from testgen.common.models.table_group import TableGroup from testgen.ui.components import widgets as testgen from testgen.ui.navigation.router import Router +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value LINK_HREF = "profiling-runs" @@ -69,4 +70,4 @@ def on_run_profiling_confirmed(table_group: dict) -> None: if result and result["success"] and not 
result["show_link"]: time.sleep(2) ProfilingRun.select_summary.clear() - st.rerun() + safe_rerun() diff --git a/testgen/ui/views/dialogs/run_tests_dialog.py b/testgen/ui/views/dialogs/run_tests_dialog.py index 1350a230..3a6cd589 100644 --- a/testgen/ui/views/dialogs/run_tests_dialog.py +++ b/testgen/ui/views/dialogs/run_tests_dialog.py @@ -6,6 +6,7 @@ from testgen.common.models import with_database_session from testgen.common.models.test_suite import TestSuite, TestSuiteMinimal from testgen.ui.components import widgets as testgen +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session from testgen.utils import to_dataframe @@ -88,4 +89,4 @@ def run_tests_dialog(project_code: str, test_suite: TestSuiteMinimal | None = No else: time.sleep(2) st.cache_data.clear() - st.rerun() + safe_rerun() diff --git a/testgen/ui/views/monitors_dashboard.py b/testgen/ui/views/monitors_dashboard.py index 0eb0515a..1507c8e4 100644 --- a/testgen/ui/views/monitors_dashboard.py +++ b/testgen/ui/views/monitors_dashboard.py @@ -25,6 +25,7 @@ from testgen.ui.navigation.router import Router from testgen.ui.queries.profiling_queries import get_tables_by_table_group from testgen.ui.services.database_service import execute_db_query, fetch_all_from_db, fetch_one_from_db +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value from testgen.ui.utils import dict_from_kv, get_cron_sample, get_cron_sample_handler from testgen.ui.views.dialogs.manage_notifications import NotificationSettingsDialogBase @@ -562,7 +563,7 @@ def on_save_settings_clicked(payload: dict) -> None: monitors.append("Freshness_Trend") run_monitor_generation(monitor_suite.id, monitors) - st.rerun() + safe_rerun() testgen.edit_monitor_settings( key="edit_monitor_settings", @@ -618,14 +619,14 @@ def on_delete_confirmed(*_args) -> None: monitor_suite = TestSuite.get(table_group.monitor_test_suite_id) 
TestSuite.cascade_delete([monitor_suite.id]) st.cache_data.clear() - st.rerun() + safe_rerun() except Exception: LOG.exception("Failed to delete monitor suite") set_result({ "success": False, "message": "Unable to delete monitors for the table group, try again.", }) - st.rerun(scope="fragment") + safe_rerun(scope="fragment") def open_schema_changes(table_group: TableGroupMinimal, payload: dict): @@ -1033,10 +1034,10 @@ def on_save_test_definition(payload: dict) -> None: ) if should_close(): - st.rerun() + safe_rerun() set_result({"success": True, "timestamp": datetime.now(UTC).isoformat()}) - st.rerun(scope="fragment") + safe_rerun(scope="fragment") metric_test_types = TestType.select_summary_where(TestType.test_type == "Metric_Trend") metric_test_type = metric_test_types[0] if metric_test_types else None diff --git a/testgen/ui/views/profiling_runs.py b/testgen/ui/views/profiling_runs.py index c13e1a35..475cae67 100644 --- a/testgen/ui/views/profiling_runs.py +++ b/testgen/ui/views/profiling_runs.py @@ -22,6 +22,7 @@ from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page from testgen.ui.navigation.router import Router +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value from testgen.ui.views.dialogs.manage_notifications import NotificationSettingsDialogBase from testgen.ui.views.dialogs.manage_schedules import ScheduleDialog @@ -223,11 +224,11 @@ def on_delete_confirmed(*_args) -> None: ProfilingRun.cancel_run(profiling_run.id) send_profiling_run_notifications(ProfilingRun.get(profiling_run.id)) ProfilingRun.cascade_delete(profiling_run_ids) - st.rerun() + safe_rerun() except Exception: LOG.exception("Failed to delete profiling runs") set_result({ "success": False, "message": "Unable to delete the selected profiling runs, try again.", }) - st.rerun(scope="fragment") + safe_rerun(scope="fragment") diff --git a/testgen/ui/views/score_explorer.py 
b/testgen/ui/views/score_explorer.py index 391d0a2f..64e004b0 100644 --- a/testgen/ui/views/score_explorer.py +++ b/testgen/ui/views/score_explorer.py @@ -34,6 +34,7 @@ get_score_card_issue_reports, get_score_category_values, ) +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value from testgen.utils import format_score_card, format_score_card_breakdown, format_score_card_issues, try_json @@ -261,7 +262,7 @@ def column_selector_dialog(project_code: str, score_definition_dict: dict, _) -> def dialog_content() -> None: if not is_column_selector_opened(): - st.rerun() + safe_rerun() selected_filters = set() if score_definition_dict.get("filter_by_columns"): diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 54e6868f..3acf740e 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -19,6 +19,7 @@ from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page from testgen.ui.queries import table_group_queries +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value from testgen.ui.utils import get_cron_sample_handler from testgen.ui.views.connections import FLAVOR_OPTIONS, format_connection @@ -165,7 +166,7 @@ def on_close_clicked(_params: dict) -> None: get_close_dialog, set_close_dialog = temp_value("table_groups:close:new", default=False) if (get_close_dialog()): - st.rerun() + safe_rerun() should_preview, mark_for_preview = temp_value("table_groups:preview:new", default=False) should_verify_access, mark_for_access_preview = temp_value("table_groups:preview_access:new", default=False) @@ -418,7 +419,7 @@ def on_delete_confirmed(*_args): if not TableGroup.has_running_process([table_group_id]): TableGroup.cascade_delete([table_group_id]) message = f"Table Group {table_group.table_groups_name} has been deleted. 
" - st.rerun() + safe_rerun() else: message = "This Table Group is in use by a running process and cannot be deleted." result = {"success": success, "message": message} diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 06b01c9c..33648518 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -30,6 +30,7 @@ from testgen.ui.components.widgets.page import css_class, flex_row_end from testgen.ui.navigation.page import Page from testgen.ui.services.database_service import fetch_all_from_db, fetch_df_from_db, fetch_from_target_db +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.services.string_service import empty_if_null, snake_case_to_title_case from testgen.ui.session import session, temp_value from testgen.ui.views.dialogs.profiling_results_dialog import view_profiling_button @@ -395,7 +396,7 @@ def delete_test_dialog(test_definitions: list[dict]): TestDefinition.delete_where(TestDefinition.id.in_([ item["id"] for item in test_definitions ])) st.success("Test definitions have been deleted.") time.sleep(1) - st.rerun() + safe_rerun() def show_test_form_by_id(test_definition_id): @@ -918,7 +919,7 @@ def render_dynamic_attribute(attribute: str, container: DeltaGenerator): test_definition["id"] = selected_test_def["id"] TestDefinition(**test_definition).save() get_test_suite_columns.clear() - st.rerun() + safe_rerun() @st.dialog(title="Add Test") @@ -1031,14 +1032,14 @@ def copy_move_test_dialog( st.success(success_message) get_test_suite_columns.clear() time.sleep(1) - st.rerun() + safe_rerun() elif copy: TestDefinition.copy(test_definition_ids, target_table_group_id, target_test_suite_id, target_table_name, target_column_name) success_message = "Test Definitions have been copied." 
st.success(success_message) get_test_suite_columns.clear() time.sleep(1) - st.rerun() + safe_rerun() def validate_form(test_scope, test_definition, column_name_label): if test_scope in ["column", "referential", "custom"] and not test_definition["column_name"]: @@ -1097,7 +1098,7 @@ def confirm_unlocking_test_definition(test_definitions: list[dict]): if unlock_confirmed(): update_test_definition(test_definitions, "lock_refresh", False, "Test definitions have been unlocked.") time.sleep(1) - st.rerun() + safe_rerun() _, button_column = st.columns([.85, .15]) with button_column: diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index 3a14c058..9918b96f 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -24,6 +24,7 @@ from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page from testgen.ui.navigation.router import Router +from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value from testgen.ui.views.dialogs.manage_notifications import NotificationSettingsDialogBase from testgen.ui.views.dialogs.manage_schedules import ScheduleDialog @@ -242,8 +243,8 @@ def on_delete_confirmed(*_args) -> None: TestRun.cancel_run(test_run.test_run_id) send_test_run_notifications(TestRun.get(test_run.test_run_id)) TestRun.cascade_delete(test_run_ids) - st.rerun() + safe_rerun() except Exception: LOG.exception("Failed to delete test run") result = {"success": False, "message": "Unable to delete the test run, try again."} - st.rerun(scope="fragment") + safe_rerun(scope="fragment") diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 0be71483..f0dc5abd 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -14,6 +14,7 @@ from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page from testgen.ui.navigation.router import Router +from 
testgen.ui.services.rerun_service import safe_rerun from testgen.ui.services.string_service import empty_if_null from testgen.ui.session import session from testgen.ui.views.dialogs.generate_tests_dialog import generate_tests_dialog @@ -206,7 +207,7 @@ def show_test_suite(mode, project_code, table_groups: Iterable[TableGroupMinimal ) st.success(success_message) time.sleep(1) - st.rerun() + safe_rerun() @st.dialog(title="Delete Test Suite") @@ -245,7 +246,7 @@ def delete_test_suite_dialog(test_suite_id: str) -> None: success_message = f"Test Suite {test_suite_name} has been deleted. " st.success(success_message) time.sleep(1) - st.rerun() + safe_rerun() @st.dialog(title="Export to Observability") From de20ee0071953a3c6c7006f94a49a89db14e4729 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 08:46:12 -0300 Subject: [PATCH 55/95] fix: use database_session() context manager for schedule query The render() method has no active session (st.dialog breaks the call chain from open()). The original code used Session() directly for this reason. Use database_session() instead of bare get_current_session() so a session is created when none exists. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/ui/views/dialogs/manage_schedules.py | 42 ++++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/testgen/ui/views/dialogs/manage_schedules.py b/testgen/ui/views/dialogs/manage_schedules.py index 346ed819..cb743dd7 100644 --- a/testgen/ui/views/dialogs/manage_schedules.py +++ b/testgen/ui/views/dialogs/manage_schedules.py @@ -6,7 +6,7 @@ import streamlit as st from sqlalchemy.exc import IntegrityError -from testgen.common.models import database_session, get_current_session, with_database_session +from testgen.common.models import database_session, with_database_session from testgen.common.models.scheduler import JobSchedule from testgen.ui.components import widgets as testgen from testgen.ui.services.rerun_service import safe_rerun @@ -115,26 +115,26 @@ def on_add_schedule(payload: dict[str, str]): message = "Error validating the Cron expression" results = {"success": success, "message": message} - db_session = get_current_session() - scheduled_jobs = ( - db_session.query(JobSchedule) - .where(JobSchedule.project_code == self.project_code, JobSchedule.key == self.job_key) - ) - scheduled_jobs_json = [] - for job in scheduled_jobs: - job_json = { - "id": str(job.id), - "argValue": self.get_arg_value(job), - "cronExpr": job.cron_expr, - "readableExpr": cron_descriptor.get_description(job.cron_expr), - "cronTz": job.cron_tz_str, - "sample": [ - sample.strftime("%a %b %-d, %-I:%M %p") - for sample in job.get_sample_triggering_timestamps(CRON_SAMPLE_COUNT + 1) - ], - "active": job.active, - } - scheduled_jobs_json.append(job_json) + with database_session() as db_session: + scheduled_jobs = ( + db_session.query(JobSchedule) + .where(JobSchedule.project_code == self.project_code, JobSchedule.key == self.job_key) + ) + scheduled_jobs_json = [] + for job in scheduled_jobs: + job_json = { + "id": str(job.id), + "argValue": self.get_arg_value(job), + "cronExpr": job.cron_expr, + 
"readableExpr": cron_descriptor.get_description(job.cron_expr), + "cronTz": job.cron_tz_str, + "sample": [ + sample.strftime("%a %b %-d, %-I:%M %p") + for sample in job.get_sample_triggering_timestamps(CRON_SAMPLE_COUNT + 1) + ], + "active": job.active, + } + scheduled_jobs_json.append(job_json) testgen.css_class("l-dialog") testgen.testgen_component( From 292f1ab32309e99815504b54cea3cdc2ef31dc64 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 09:15:56 -0300 Subject: [PATCH 56/95] refactor: track writes via after_flush, clear cache in safe_rerun Register a SQLAlchemy after_flush event listener that sets a flag when the session writes to the database. safe_rerun() checks this flag and clears st.cache_data only when writes actually happened, avoiding unnecessary cache invalidation on no-DB rerun paths (dialog close, help menu, etc.). Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/common/models/__init__.py | 19 ++++++++++++++++++- testgen/ui/services/rerun_service.py | 7 +++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/testgen/common/models/__init__.py b/testgen/common/models/__init__.py index 6e2b581c..10038b29 100644 --- a/testgen/common/models/__init__.py +++ b/testgen/common/models/__init__.py @@ -4,7 +4,7 @@ import threading import urllib.parse -from sqlalchemy import create_engine +from sqlalchemy import create_engine, event from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Session as SQLAlchemySession from sqlalchemy.orm import sessionmaker @@ -50,6 +50,7 @@ def database_session(): return with Session() as session: _current_session_wrapper.value = session + _current_session_wrapper.session_flushed = False try: yield session except Exception: @@ -73,3 +74,19 @@ def wrapper(*args, **kwargs): def get_current_session() -> SQLAlchemySession: return getattr(_current_session_wrapper, "value", None) + + +def session_had_writes() -> bool: + """Check and reset the write-tracking flag. 
+ + Returns True if the session flushed any writes since the flag was + last reset (i.e. since the owning ``database_session()`` opened). + """ + had_writes = getattr(_current_session_wrapper, "session_flushed", False) + _current_session_wrapper.session_flushed = False + return had_writes + + +@event.listens_for(Session, "after_flush") +def _track_writes(_session, _flush_context): + _current_session_wrapper.session_flushed = True diff --git a/testgen/ui/services/rerun_service.py b/testgen/ui/services/rerun_service.py index 6c0b7ea1..b23dbad9 100644 --- a/testgen/ui/services/rerun_service.py +++ b/testgen/ui/services/rerun_service.py @@ -2,16 +2,19 @@ import streamlit as st -from testgen.common.models import get_current_session +from testgen.common.models import get_current_session, session_had_writes def safe_rerun(*, scope: Literal["app", "fragment"] = "app") -> NoReturn: """Commit any pending database changes, then trigger a Streamlit rerun. Prevents data loss when RerunException propagates through the - session context manager in app.py:render(). + session context manager in app.py:render(). Clears the Streamlit + data cache when the session flushed writes during this render cycle. """ session = get_current_session() if session: session.commit() + if session_had_writes(): + st.cache_data.clear() st.rerun(scope=scope) From ff7917fafb2bd148fcb36d18ebce5bb4a6fe615d Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 09:16:04 -0300 Subject: [PATCH 57/95] refactor: remove cache-clearing side effects from model mutations Entity.save(), .delete(), .delete_where() and all subclass mutation methods no longer call st.cache_data.clear() or cls.clear_cache(). Cache invalidation is now handled by safe_rerun() via the after_flush write-tracking flag. This removes the behavioral coupling between model mutations and Streamlit's caching layer. 
The @st.cache_data decorators on query methods and clear_cache() definitions are retained for targeted invalidation in non-rerun UI paths. Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/common/models/entity.py | 9 --------- testgen/common/models/profiling_run.py | 2 -- testgen/common/models/scheduler.py | 3 --- testgen/common/models/table_group.py | 1 - testgen/common/models/test_definition.py | 5 ----- testgen/common/models/test_run.py | 2 -- testgen/common/models/user.py | 1 - 7 files changed, 23 deletions(-) diff --git a/testgen/common/models/entity.py b/testgen/common/models/entity.py index 6d0b0950..3647e684 100644 --- a/testgen/common/models/entity.py +++ b/testgen/common/models/entity.py @@ -119,8 +119,6 @@ def delete_where(cls, *clauses) -> None: db_session = get_current_session() db_session.execute(query) db_session.commit() - # We clear all because cached data like Project.select_summary will be affected - st.cache_data.clear() @classmethod def is_in_use(cls, ids: list[str]) -> bool: @@ -144,24 +142,17 @@ def refresh(self) -> None: db_session.refresh(self) def save(self) -> None: - is_new = self.id is None db_session = get_current_session() db_session.add(self) db_session.flush([self]) db_session.commit() db_session.refresh(self, ["id"]) - if is_new: - # We clear all because cached data like Project.select_summary will be affected - st.cache_data.clear() - else: - self.__class__.clear_cache() def delete(self) -> None: db_session = get_current_session() db_session.add(self) db_session.delete(self) db_session.commit() - self.__class__.clear_cache() def to_dict(self, json_safe: bool = False): result = {col.name: getattr(self, col.name) for col in self.__table__.columns} diff --git a/testgen/common/models/profiling_run.py b/testgen/common/models/profiling_run.py index b7059a6d..7365f8a4 100644 --- a/testgen/common/models/profiling_run.py +++ b/testgen/common/models/profiling_run.py @@ -247,7 +247,6 @@ def cancel_all_running(cls) -> list[UUID]: 
db_session = get_current_session() rows = db_session.execute(query) db_session.commit() - cls.clear_cache() return [r.id for r in rows] @classmethod @@ -256,7 +255,6 @@ def cancel_run(cls, run_id: str | UUID) -> None: db_session = get_current_session() db_session.execute(query) db_session.commit() - cls.clear_cache() @classmethod def cascade_delete(cls, ids: list[str]) -> None: diff --git a/testgen/common/models/scheduler.py b/testgen/common/models/scheduler.py index fa070e03..86b120d6 100644 --- a/testgen/common/models/scheduler.py +++ b/testgen/common/models/scheduler.py @@ -70,7 +70,6 @@ def delete(cls, job_id: str | UUID) -> None: db_session.rollback() else: db_session.commit() - cls.clear_cache() @classmethod def update_active(cls, job_id: str | UUID, active: bool) -> None: @@ -82,7 +81,6 @@ def update_active(cls, job_id: str | UUID, active: bool) -> None: db_session.rollback() else: db_session.commit() - cls.clear_cache() @classmethod def count(cls): @@ -104,4 +102,3 @@ def save(self) -> None: db_session = get_current_session() db_session.add(self) db_session.commit() - self.__class__.clear_cache() diff --git a/testgen/common/models/table_group.py b/testgen/common/models/table_group.py index 938a851b..80b9a6d2 100644 --- a/testgen/common/models/table_group.py +++ b/testgen/common/models/table_group.py @@ -445,4 +445,3 @@ def save(self, add_scorecard_definition: bool = False) -> None: super().save() if add_scorecard_definition: ScoreDefinition.from_table_group(self).save() - TableGroup.clear_cache() diff --git a/testgen/common/models/test_definition.py b/testgen/common/models/test_definition.py index e3c71309..7eb03a66 100644 --- a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -314,7 +314,6 @@ def set_status_attribute( db_session = get_current_session() db_session.execute(text(query), params) db_session.commit() - cls.clear_cache() @classmethod def move( @@ -350,7 +349,6 @@ def move( db_session = 
get_current_session() db_session.execute(text(query), params) db_session.commit() - cls.clear_cache() @classmethod def copy( @@ -392,7 +390,6 @@ def copy( db_session = get_current_session() db_session.execute(query) db_session.commit() - cls.clear_cache() @classmethod def clear_cache(cls) -> bool: @@ -413,8 +410,6 @@ def save(self) -> None: else: super().save() - TestDefinition.clear_cache() - class TestDefinitionNote(Base): __tablename__ = "test_definition_notes" diff --git a/testgen/common/models/test_run.py b/testgen/common/models/test_run.py index 4b3b2a15..55c7dd56 100644 --- a/testgen/common/models/test_run.py +++ b/testgen/common/models/test_run.py @@ -349,7 +349,6 @@ def cancel_all_running(cls) -> list[UUID]: db_session = get_current_session() rows = db_session.execute(query) db_session.commit() - cls.clear_cache() return [r.id for r in rows] @classmethod @@ -358,7 +357,6 @@ def cancel_run(cls, run_id: str | UUID) -> None: db_session = get_current_session() db_session.execute(query) db_session.commit() - cls.clear_cache() @classmethod def cascade_delete(cls, ids: list[str]) -> None: diff --git a/testgen/common/models/user.py b/testgen/common/models/user.py index e65bc02b..f0fbd6e3 100644 --- a/testgen/common/models/user.py +++ b/testgen/common/models/user.py @@ -37,7 +37,6 @@ def save(self, update_latest_login: bool = False) -> None: db_session = get_current_session() db_session.execute(query) db_session.commit() - User.clear_cache() else: if update_latest_login: self.latest_login = datetime.now(UTC) From df621030cdc44213676889f060b05689a3baf182 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 09:16:11 -0300 Subject: [PATCH 58/95] refactor: remove redundant cache clears from view callsites MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove st.cache_data.clear(), targeted .clear(), and the clear_cache / lst_cached_functions parameters from reset_post_updates() — all now handled by safe_rerun() 
via the write-tracking flag. Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/ui/services/form_service.py | 10 ++-------- testgen/ui/views/data_catalog.py | 4 ---- testgen/ui/views/dialogs/generate_tests_dialog.py | 1 - testgen/ui/views/dialogs/run_profiling_dialog.py | 2 -- testgen/ui/views/dialogs/run_tests_dialog.py | 1 - testgen/ui/views/hygiene_issues.py | 4 ---- testgen/ui/views/monitors_dashboard.py | 1 - testgen/ui/views/test_definitions.py | 5 ----- testgen/ui/views/test_results.py | 6 ------ 9 files changed, 2 insertions(+), 32 deletions(-) diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index b55c9f04..948d65a1 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -25,7 +25,7 @@ def render_refresh_button(button_container): with button_container: do_refresh = st.button(":material/refresh:", help="Refresh page data", use_container_width=False) if do_refresh: - reset_post_updates("Refreshing page", True, True) + reset_post_updates("Refreshing page", as_toast=True) def show_prompt(str_prompt=None): @@ -62,7 +62,7 @@ def ut_prettify_header(str_header, expand=False): return str_new -def reset_post_updates(str_message=None, as_toast=False, clear_cache=True, lst_cached_functions=None, style="success"): +def reset_post_updates(str_message=None, as_toast=False, style="success"): if str_message: if as_toast: st.toast(str_message) @@ -72,12 +72,6 @@ def reset_post_updates(str_message=None, as_toast=False, clear_cache=True, lst_c st.success(str_message) sleep(1.5) - if clear_cache: - if lst_cached_functions: - for fcn in lst_cached_functions: - fcn.clear() - else: - st.cache_data.clear() safe_rerun() diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index 74da93ea..407a4447 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -354,8 +354,6 @@ def remove_table_dialog(item: dict) -> None: st.success("Table has been 
removed.") time.sleep(1) - for func in [get_table_group_columns, get_tag_values]: - func.clear() st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() safe_rerun() @@ -406,8 +404,6 @@ def on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DA params, ) - for func in [get_table_group_columns, get_table_by_id, get_column_by_id, get_tag_values]: - func.clear() st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() safe_rerun() diff --git a/testgen/ui/views/dialogs/generate_tests_dialog.py b/testgen/ui/views/dialogs/generate_tests_dialog.py index 094921cf..ad67ed3b 100644 --- a/testgen/ui/views/dialogs/generate_tests_dialog.py +++ b/testgen/ui/views/dialogs/generate_tests_dialog.py @@ -77,7 +77,6 @@ def generate_tests_dialog(test_suite: TestSuiteMinimal) -> None: status_container.success(f"Test generation completed for test suite **{test_suite_name}**.") time.sleep(1) - st.cache_data.clear() safe_rerun() diff --git a/testgen/ui/views/dialogs/run_profiling_dialog.py b/testgen/ui/views/dialogs/run_profiling_dialog.py index 88c77cc7..de77622f 100644 --- a/testgen/ui/views/dialogs/run_profiling_dialog.py +++ b/testgen/ui/views/dialogs/run_profiling_dialog.py @@ -4,7 +4,6 @@ import streamlit as st from testgen.commands.run_profiling import run_profiling_in_background -from testgen.common.models.profiling_run import ProfilingRun from testgen.common.models.table_group import TableGroup from testgen.ui.components import widgets as testgen from testgen.ui.navigation.router import Router @@ -69,5 +68,4 @@ def on_run_profiling_confirmed(table_group: dict) -> None: if result and result["success"] and not result["show_link"]: time.sleep(2) - ProfilingRun.select_summary.clear() safe_rerun() diff --git a/testgen/ui/views/dialogs/run_tests_dialog.py b/testgen/ui/views/dialogs/run_tests_dialog.py index 3a6cd589..7908f90c 100644 --- a/testgen/ui/views/dialogs/run_tests_dialog.py +++ 
b/testgen/ui/views/dialogs/run_tests_dialog.py @@ -88,5 +88,4 @@ def run_tests_dialog(project_code: str, test_suite: TestSuiteMinimal | None = No ) else: time.sleep(2) - st.cache_data.clear() safe_rerun() diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index b58cc615..8ac10dda 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -309,8 +309,6 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: int_data_width=700, ) - cached_functions = [get_anomaly_disposition, get_profiling_anomaly_summary, get_profiling_anomalies] - disposition_actions = [ { "icon": "✓", "help": "Confirm this issue as relevant for this run", "status": "Confirmed" }, { "icon": "✘", "help": "Dismiss this issue as not relevant for this run", "status": "Dismissed" }, @@ -334,8 +332,6 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: fm.reset_post_updates( do_disposition_update(selected, d_action["status"]), as_toast=True, - clear_cache=True, - lst_cached_functions=cached_functions, ) # Needs to be after all data loading/updating diff --git a/testgen/ui/views/monitors_dashboard.py b/testgen/ui/views/monitors_dashboard.py index 1507c8e4..4086f294 100644 --- a/testgen/ui/views/monitors_dashboard.py +++ b/testgen/ui/views/monitors_dashboard.py @@ -618,7 +618,6 @@ def on_delete_confirmed(*_args) -> None: with st.spinner("Deleting monitors ..."): monitor_suite = TestSuite.get(table_group.monitor_test_suite_id) TestSuite.cascade_delete([monitor_suite.id]) - st.cache_data.clear() safe_rerun() except Exception: LOG.exception("Failed to delete monitor suite") diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 33648518..1d843fae 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -234,8 +234,6 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: fm.reset_post_updates( 
update_test_definition(selected, action["attribute"], action["value"], action["message"]), as_toast=True, - clear_cache=True, - lst_cached_functions=[], ) if actions_column.button( @@ -918,7 +916,6 @@ def render_dynamic_attribute(attribute: str, container: DeltaGenerator): if mode == "edit": test_definition["id"] = selected_test_def["id"] TestDefinition(**test_definition).save() - get_test_suite_columns.clear() safe_rerun() @@ -1030,14 +1027,12 @@ def copy_move_test_dialog( TestDefinition.move(test_definition_ids, target_table_group_id, target_test_suite_id, target_table_name, target_column_name) success_message = "Test Definitions have been moved." st.success(success_message) - get_test_suite_columns.clear() time.sleep(1) safe_rerun() elif copy: TestDefinition.copy(test_definition_ids, target_table_group_id, target_test_suite_id, target_table_name, target_column_name) success_message = "Test Definitions have been copied." st.success(success_message) - get_test_suite_columns.clear() time.sleep(1) safe_rerun() diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index e9bfd6e0..bafe46a6 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -306,8 +306,6 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: ) # Need to render toolbar buttons after grid, so selection status is maintained - affected_cached_functions = [get_test_disposition, test_result_queries.get_test_results] - # === Action buttons (left side, near the grid) === if actions_column.button( @@ -417,8 +415,6 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: fm.reset_post_updates( do_disposition_update(selected, action["status"]), as_toast=True, - clear_cache=True, - lst_cached_functions=affected_cached_functions, ) if session.auth.user_has_permission("disposition"): @@ -437,8 +433,6 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: fm.reset_post_updates( None, as_toast=True, - 
clear_cache=True, - lst_cached_functions=affected_cached_functions, ) # Needs to be after all data loading/updating From 671e70253869fba4a2066ea6f62046f8401b44fd Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 09:31:54 -0300 Subject: [PATCH 59/95] fix: always clear cache in safe_rerun after_flush only fires for ORM-tracked changes (add/delete), not for raw SQL or bulk update/delete constructs that most model mutations use. This caused stale grids after disposition updates, etc. Replace the write-tracking approach with unconditional cache clearing. The perf cost on no-DB rerun paths is negligible. Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/common/models/__init__.py | 19 +------------------ testgen/ui/services/rerun_service.py | 10 +++++----- 2 files changed, 6 insertions(+), 23 deletions(-) diff --git a/testgen/common/models/__init__.py b/testgen/common/models/__init__.py index 10038b29..6e2b581c 100644 --- a/testgen/common/models/__init__.py +++ b/testgen/common/models/__init__.py @@ -4,7 +4,7 @@ import threading import urllib.parse -from sqlalchemy import create_engine, event +from sqlalchemy import create_engine from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Session as SQLAlchemySession from sqlalchemy.orm import sessionmaker @@ -50,7 +50,6 @@ def database_session(): return with Session() as session: _current_session_wrapper.value = session - _current_session_wrapper.session_flushed = False try: yield session except Exception: @@ -74,19 +73,3 @@ def wrapper(*args, **kwargs): def get_current_session() -> SQLAlchemySession: return getattr(_current_session_wrapper, "value", None) - - -def session_had_writes() -> bool: - """Check and reset the write-tracking flag. - - Returns True if the session flushed any writes since the flag was - last reset (i.e. since the owning ``database_session()`` opened). 
- """ - had_writes = getattr(_current_session_wrapper, "session_flushed", False) - _current_session_wrapper.session_flushed = False - return had_writes - - -@event.listens_for(Session, "after_flush") -def _track_writes(_session, _flush_context): - _current_session_wrapper.session_flushed = True diff --git a/testgen/ui/services/rerun_service.py b/testgen/ui/services/rerun_service.py index b23dbad9..34afce09 100644 --- a/testgen/ui/services/rerun_service.py +++ b/testgen/ui/services/rerun_service.py @@ -2,19 +2,19 @@ import streamlit as st -from testgen.common.models import get_current_session, session_had_writes +from testgen.common.models import get_current_session def safe_rerun(*, scope: Literal["app", "fragment"] = "app") -> NoReturn: """Commit any pending database changes, then trigger a Streamlit rerun. Prevents data loss when RerunException propagates through the - session context manager in app.py:render(). Clears the Streamlit - data cache when the session flushed writes during this render cycle. + session context manager in app.py:render(). Always clears the + Streamlit data cache so stale query results are not served after + database mutations. """ session = get_current_session() if session: session.commit() - if session_had_writes(): - st.cache_data.clear() + st.cache_data.clear() st.rerun(scope=scope) From adc78839395b1ca03f4dea61a16d817a9b9a71c1 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 23:53:20 -0300 Subject: [PATCH 60/95] fix(TG-1005): correct Daily_Record_Ct operator and Email_Format lookup regexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Daily_Record_Ct: test_operator was `<` instead of `>` for snowflake, bigquery, databricks — the measure counts missing days, so `>` is correct (fail when missing > threshold). 
Email_Format: lookup query regexes were inconsistent with measure regexes for postgresql, snowflake, databricks, bigquery, oracle, sap-hana — missing apostrophe in character class and double-escaped dots. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../test_types_Daily_Record_Ct.yaml | 6 +++--- .../dbsetup_test_types/test_types_Email_Format.yaml | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml index c5fe688b..b84368e5 100644 --- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -45,7 +45,7 @@ test_types: sql_flavor: bigquery measure: |- DATETIME_DIFF(DATETIME_TRUNC(SAFE_CAST(MAX({COLUMN_NAME}) AS DATE), DAY), DATETIME_TRUNC(SAFE_CAST(MIN({COLUMN_NAME}) AS DATE), DAY), DAY) + 1 - COUNT(DISTINCT DATETIME_TRUNC({COLUMN_NAME}, DAY)) - test_operator: < + test_operator: '>' test_condition: |- {THRESHOLD_VALUE} - id: '6005' @@ -53,7 +53,7 @@ test_types: sql_flavor: databricks measure: |- <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME}) - test_operator: < + test_operator: '>' test_condition: |- {THRESHOLD_VALUE} - id: '3005' @@ -93,7 +93,7 @@ test_types: sql_flavor: snowflake measure: |- DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) - test_operator: < + test_operator: '>' test_condition: |- {THRESHOLD_VALUE} - id: '5005' diff --git a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml index 7b51af47..1d49d881 100644 --- a/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Email_Format.yaml @@ -125,7 +125,7 @@ test_types: lookup_query: |- SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM 
`{TARGET_SCHEMA}`.`{TABLE_NAME}` - WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$') + WHERE NOT REGEXP_CONTAINS(CAST(`{COLUMN_NAME}` AS STRING), r'^[A-Za-z0-9._%+-]+@(?:[A-Za-z0-9-]+[.])+[A-Za-z]{2,}$') GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results @@ -135,7 +135,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; + SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') GROUP BY `{COLUMN_NAME}` LIMIT {LIMIT}; error_type: Test Results - id: '1148' test_id: '1014' @@ -151,7 +151,7 @@ test_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" !~ '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '1009' test_id: '1014' @@ -175,7 +175,7 @@ test_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') != 1 GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE("{COLUMN_NAME}"::VARCHAR, 
'^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT}; error_type: Test Results - id: '8009' test_id: '1014' @@ -183,7 +183,7 @@ test_types: sql_flavor: oracle lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE("{COLUMN_NAME}", '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT REGEXP_LIKE("{COLUMN_NAME}", '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results - id: '8009' test_id: '1014' @@ -191,6 +191,6 @@ test_types: sql_flavor: sap_hana lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT "{COLUMN_NAME}" LIKE_REGEXPR '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE NOT "{COLUMN_NAME}" LIKE_REGEXPR '^[A-Za-z0-9._''%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Test Results test_templates: [] From 5b2d69d643f64b250ff892336d4bc10395508a79 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Mon, 23 Mar 2026 11:23:09 -0300 Subject: [PATCH 61/95] fix(TG-1005): make MSSQL calendar gap lookups consistent with other flavors MSSQL Daily/Weekly/Monthly_Record_Ct lookup queries had two issues: - Column named `check_period` instead of `missing_period` (all other flavors use `missing_period`) - WHERE clause included adjacent present periods (record_ct = 0 OR last_record_ct = 0 OR next_record_ct = 0), inflating row counts vs other flavors that return only missing periods Fix: alias to `missing_period`, filter to `WHERE record_ct = 0` only. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../dbsetup_test_types/test_types_Daily_Record_Ct.yaml | 4 +--- .../dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml | 4 +--- .../template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml index b84368e5..aabd9933 100644 --- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -205,15 +205,13 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT TOP {LIMIT} check_period, record_ct, + SELECT TOP {LIMIT} check_period AS missing_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' END as status FROM data_by_prd_with_prior_next WHERE record_ct = 0 - OR last_record_ct = 0 - OR next_record_ct = 0 ORDER BY check_period DESC; error_type: Test Results - id: '1087' diff --git a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml index 83dbf5b5..ec0fffa4 100644 --- a/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Monthly_Rec_Ct.yaml @@ -201,15 +201,13 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT TOP {LIMIT} check_period, record_ct, + SELECT TOP {LIMIT} check_period AS missing_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' END as status FROM data_by_prd_with_prior_next WHERE record_ct = 0 - OR last_record_ct = 0 - OR next_record_ct = 0 ORDER BY check_period DESC; error_type: Test Results - id: '1100' diff --git a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml 
b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml index 7fafc6b4..3c288eaf 100644 --- a/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Weekly_Rec_Ct.yaml @@ -201,15 +201,13 @@ test_types: FROM check_periods c LEFT JOIN data_by_period d ON (c.check_period = d.data_period) ) - SELECT TOP {LIMIT} check_period, record_ct, + SELECT TOP {LIMIT} check_period AS missing_period, record_ct, CASE WHEN record_ct = 0 THEN 'MISSING' ELSE 'Present' END as status FROM data_by_prd_with_prior_next WHERE record_ct = 0 - OR last_record_ct = 0 - OR next_record_ct = 0 ORDER BY check_period DESC; error_type: Test Results - id: '1112' From b1100e0aacb5b8afa85508939e4c0c5ef7528364 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Mon, 23 Mar 2026 18:37:09 -0300 Subject: [PATCH 62/95] fix(TG-1005): fix cross-flavor test type bugs found by validation suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Daily_Record_Ct: MSSQL test_operator was < instead of > (same bug previously fixed for snowflake/bigquery/databricks). Future_Date, Future_Date_1Y: MSSQL, trino, oracle, SAP HANA used >= instead of > — counting the run date itself as "future". Consistent flavors (postgresql, redshift, snowflake, databricks, bigquery) all use strict > via SIGN() or explicit comparison. Min_Date: SAP HANA TO_DATE format used 'YYYY-MM-DD HH24:MI:SS' but BASELINE_VALUE is date-only — SAP HANA is strict about format length. Street_Addr_Pattern: MSSQL lookup used AND for negated conditions instead of OR (De Morgan's law). Returned fewer rows than expected. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../test_types_Daily_Record_Ct.yaml | 2 +- .../dbsetup_test_types/test_types_Dec_Trunc.yaml | 4 ++-- .../dbsetup_test_types/test_types_Future_Date.yaml | 14 +++++++------- .../test_types_Future_Date_1Y.yaml | 14 +++++++------- .../dbsetup_test_types/test_types_Min_Date.yaml | 4 ++-- .../test_types_Street_Addr_Pattern.yaml | 2 +- testgen/ui/queries/source_data_queries.py | 4 ++-- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml index aabd9933..fea86ca0 100644 --- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -61,7 +61,7 @@ test_types: sql_flavor: mssql measure: |- DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) - test_operator: < + test_operator: '>' test_condition: |- {THRESHOLD_VALUE} - id: '4005' diff --git a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml index 7b40daa7..ac988b64 100644 --- a/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Dec_Trunc.yaml @@ -188,7 +188,7 @@ test_types: sql_flavor: oracle lookup_type: null lookup_query: |- - SELECT DISTINCT LENGTH(SUBSTR(TO_CHAR("{COLUMN_NAME}"), INSTR(TO_CHAR("{COLUMN_NAME}"), '.') + 1)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE INSTR(TO_CHAR("{COLUMN_NAME}"), '.') > 0 GROUP BY LENGTH(SUBSTR(TO_CHAR("{COLUMN_NAME}"), INSTR(TO_CHAR("{COLUMN_NAME}"), '.') + 1)) FETCH FIRST {LIMIT} ROWS ONLY + SELECT DISTINCT CASE WHEN INSTR(TO_CHAR("{COLUMN_NAME}"), '.') > 0 THEN LENGTH(SUBSTR(TO_CHAR("{COLUMN_NAME}"), INSTR(TO_CHAR("{COLUMN_NAME}"), '.') + 1)) ELSE 0 END AS decimal_scale, COUNT(*) AS count FROM 
"{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY CASE WHEN INSTR(TO_CHAR("{COLUMN_NAME}"), '.') > 0 THEN LENGTH(SUBSTR(TO_CHAR("{COLUMN_NAME}"), INSTR(TO_CHAR("{COLUMN_NAME}"), '.') + 1)) ELSE 0 END FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results - id: '8006' test_id: '1011' @@ -196,6 +196,6 @@ test_types: sql_flavor: sap_hana lookup_type: null lookup_query: |- - SELECT DISTINCT LENGTH(SUBSTR(TO_VARCHAR("{COLUMN_NAME}"), LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') + 1)) AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') > 0 GROUP BY LENGTH(SUBSTR(TO_VARCHAR("{COLUMN_NAME}"), LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') + 1)) LIMIT {LIMIT} + SELECT DISTINCT CASE WHEN LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') > 0 THEN LENGTH(SUBSTR(TO_VARCHAR("{COLUMN_NAME}"), LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') + 1)) ELSE 0 END AS decimal_scale, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY CASE WHEN LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') > 0 THEN LENGTH(SUBSTR(TO_VARCHAR("{COLUMN_NAME}"), LOCATE(TO_VARCHAR("{COLUMN_NAME}"), '.') + 1)) ELSE 0 END LIMIT {LIMIT} error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml index f164bcbe..af804c97 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date.yaml @@ -55,7 +55,7 @@ test_types: test_type: Future_Date sql_flavor: mssql measure: |- - SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CONVERT(DATE, '{RUN_DATE}') THEN 1 ELSE 0 END) + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) > CONVERT(DATE, '{RUN_DATE}') THEN 1 ELSE 0 END) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -95,7 +95,7 @@ test_types: test_type: Future_Date sql_flavor: trino measure: |- - SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= CAST('{RUN_DATE}' AS DATE) THEN 1 
ELSE 0 END) + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) > CAST('{RUN_DATE}' AS DATE) THEN 1 ELSE 0 END) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -103,7 +103,7 @@ test_types: test_type: Future_Date sql_flavor: oracle measure: |- - SUM(CASE WHEN TRUNC({COLUMN_NAME}) >= TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) + SUM(CASE WHEN TRUNC({COLUMN_NAME}) > TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -111,7 +111,7 @@ test_types: test_type: Future_Date sql_flavor: sap_hana measure: |- - SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) > TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -142,7 +142,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) > CONVERT(DATE, '{TEST_DATE}') GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1092' test_id: '1015' @@ -182,7 +182,7 @@ test_types: sql_flavor: oracle lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRUNC("{COLUMN_NAME}") >= TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRUNC("{COLUMN_NAME}") > TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results - id: '8010' test_id: 
'1015' @@ -190,6 +190,6 @@ test_types: sql_flavor: sap_hana lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) > TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml index f46bbe36..ae400acb 100644 --- a/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Future_Date_1Y.yaml @@ -56,7 +56,7 @@ test_types: test_type: Future_Date_1Y sql_flavor: mssql measure: |- - SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{RUN_DATE}')) THEN 1 ELSE 0 END) + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) > DATEADD(DAY, 365, CONVERT(DATE, '{RUN_DATE}')) THEN 1 ELSE 0 END) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -96,7 +96,7 @@ test_types: test_type: Future_Date_1Y sql_flavor: trino measure: |- - SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= (FROM_ISO8601_DATE('{RUN_DATE}') + interval '365' day ) THEN 1 ELSE 0 END) + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) > (FROM_ISO8601_DATE('{RUN_DATE}') + interval '365' day ) THEN 1 ELSE 0 END) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -104,7 +104,7 @@ test_types: test_type: Future_Date_1Y sql_flavor: oracle measure: |- - SUM(CASE WHEN TRUNC({COLUMN_NAME}) >= TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') + 365 THEN 1 ELSE 0 END) + SUM(CASE WHEN TRUNC({COLUMN_NAME}) > TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') + 365 THEN 1 ELSE 0 END) test_operator: '>' test_condition: 
|- {THRESHOLD_VALUE} @@ -112,7 +112,7 @@ test_types: test_type: Future_Date_1Y sql_flavor: sap_hana measure: |- - SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) >= ADD_DAYS(TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS'), 365) THEN 1 ELSE 0 END) + SUM(CASE WHEN CAST({COLUMN_NAME} AS DATE) > ADD_DAYS(TO_DATE('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS'), 365) THEN 1 ELSE 0 END) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -143,7 +143,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}"; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) > DATEADD(DAY, 365, CONVERT(DATE, '{TEST_DATE}')) GROUP BY "{COLUMN_NAME}"; error_type: Test Results - id: '1093' test_id: '1016' @@ -183,7 +183,7 @@ test_types: sql_flavor: oracle lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRUNC("{COLUMN_NAME}") >= TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') + 365 GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRUNC("{COLUMN_NAME}") > TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') + 365 GROUP BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results - id: '8016' test_id: '1016' @@ -191,6 +191,6 @@ test_types: sql_flavor: sap_hana lookup_type: null lookup_query: |- - SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" AS DATE) >= ADD_DAYS(TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS'), 365) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE CAST("{COLUMN_NAME}" 
AS DATE) > ADD_DAYS(TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS'), 365) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml index 877bb855..2a64f34a 100644 --- a/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Min_Date.yaml @@ -112,7 +112,7 @@ test_types: test_type: Min_Date sql_flavor: sap_hana measure: |- - SUM(CASE WHEN {COLUMN_NAME} < TO_DATE('{BASELINE_VALUE}', 'YYYY-MM-DD HH24:MI:SS') THEN 1 ELSE 0 END) + SUM(CASE WHEN {COLUMN_NAME} < CAST('{BASELINE_VALUE}' AS {COLUMN_TYPE}) THEN 1 ELSE 0 END) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -191,6 +191,6 @@ test_types: sql_flavor: sap_hana lookup_type: null lookup_query: |- - SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < TO_DATE('{BASELINE_VALUE}', 'YYYY-MM-DD HH24:MI:SS') GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} + SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE "{COLUMN_NAME}" < CAST('{BASELINE_VALUE}' AS {COLUMN_TYPE}) GROUP BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml index d6fbfbf3..31004340 100644 --- a/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Street_Addr_Pattern.yaml @@ -145,7 +145,7 @@ test_types: sql_flavor: mssql lookup_type: null lookup_query: |- - SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' AND CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY 
"{COLUMN_NAME}" ORDER BY COUNT(*) DESC; + SELECT DISTINCT TOP {LIMIT} "{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE UPPER("{COLUMN_NAME}") NOT LIKE '[1-9]% [A-Z]% %' OR CHARINDEX(' ', "{COLUMN_NAME}") NOT BETWEEN 2 AND 6 GROUP BY "{COLUMN_NAME}" ORDER BY COUNT(*) DESC; error_type: Test Results - id: '1108' test_id: '1033' diff --git a/testgen/ui/queries/source_data_queries.py b/testgen/ui/queries/source_data_queries.py index d1537023..9abfc101 100644 --- a/testgen/ui/queries/source_data_queries.py +++ b/testgen/ui/queries/source_data_queries.py @@ -12,7 +12,7 @@ from testgen.common.read_file import replace_templated_functions from testgen.ui.services.database_service import fetch_from_target_db, fetch_one_from_db from testgen.ui.utils import parse_fuzzy_date -from testgen.utils import to_dataframe +from testgen.utils import to_dataframe, to_sql_timestamp LOG = logging.getLogger("testgen") DEFAULT_LIMIT = 500 @@ -119,7 +119,7 @@ def get_test_issue_source_query(issue_data: dict, limit: int = DEFAULT_LIMIT) -> "TABLE_NAME": issue_data["table_name"], "COLUMN_NAME": issue_data["column_names"], # Don't quote this - queries already have quotes "COLUMN_TYPE": issue_data["column_type"], - "TEST_DATE": str(parsed_test_date) if (parsed_test_date := parse_fuzzy_date(issue_data["test_date"])) + "TEST_DATE": to_sql_timestamp(parsed_test_date) if (parsed_test_date := parse_fuzzy_date(issue_data["test_date"])) else None, "CUSTOM_QUERY": test_definition.custom_query, "BASELINE_VALUE": test_definition.baseline_value, From 5732304002e653398fe3e680fd177c5121bbd99b Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Fri, 20 Mar 2026 09:45:14 -0300 Subject: [PATCH 63/95] refactor: remove explicit commits from model mutations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model methods no longer call session.commit() — commits are handled by database_session() on clean exit (CLI) or safe_rerun() before Streamlit 
reruns (UI). This makes multi-step operations atomic (e.g. cascade deletes) instead of committing per-step. flush() is retained where immediate visibility within the same session is needed (Entity.save for generated IDs, cancel_all_running for RETURNING clauses). Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/commands/run_profiling.py | 12 ++- testgen/commands/run_test_execution.py | 17 ++- testgen/common/models/entity.py | 3 - testgen/common/models/profiling_run.py | 4 +- testgen/common/models/scheduler.py | 17 +-- testgen/common/models/scores.py | 2 - testgen/common/models/settings.py | 2 +- testgen/common/models/table_group.py | 2 - testgen/common/models/test_definition.py | 7 -- testgen/common/models/test_run.py | 4 +- testgen/common/models/test_suite.py | 1 - testgen/common/models/user.py | 1 - testgen/ui/app.py | 131 ++++++++++++----------- testgen/ui/services/rerun_service.py | 7 +- 14 files changed, 104 insertions(+), 106 deletions(-) diff --git a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py index 9acc4a1e..1c8e58fc 100644 --- a/testgen/commands/run_profiling.py +++ b/testgen/commands/run_profiling.py @@ -26,7 +26,7 @@ ) from testgen.common.database.database_service import ThreadedProgress, empty_cache from testgen.common.mixpanel_service import MixpanelService -from testgen.common.models import with_database_session +from testgen.common.models import get_current_session, with_database_session from testgen.common.models.connection import Connection from testgen.common.models.profiling_run import ProfilingRun from testgen.common.models.table_group import TableGroup @@ -78,6 +78,9 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d profiling_run.init_progress() profiling_run.set_progress("data_chars", "Running") profiling_run.save() + # This runs in a subprocess — commit after every save so progress is visible + # to the UI (separate session) and to execute_db_queries (independent connection). 
+ get_current_session().commit() LOG.info(f"Profiling run: {profiling_run.id}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}") try: @@ -109,6 +112,7 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d profiling_run.profiling_endtime = datetime.now(UTC) + time_delta profiling_run.status = "Error" profiling_run.save() + get_current_session().commit() send_profiling_run_notifications(profiling_run) else: @@ -116,6 +120,7 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d profiling_run.profiling_endtime = datetime.now(UTC) + time_delta profiling_run.status = "Complete" profiling_run.save() + get_current_session().commit() send_profiling_run_notifications(profiling_run) _rollup_profiling_scores(profiling_run, table_group) @@ -143,6 +148,7 @@ def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnCh profiling_run = sql_generator.profiling_run profiling_run.set_progress("col_profiling", "Running") profiling_run.save() + get_current_session().commit() LOG.info(f"Running column profiling queries: {len(data_chars)}") table_group = sql_generator.table_group @@ -169,6 +175,7 @@ def update_column_progress(progress: ThreadedProgress) -> None: else None, ) profiling_run.save() + get_current_session().commit() profiling_results, result_columns, error_data = fetch_from_db_threaded( [sql_generator.run_column_profiling(column, sampling_params.get(column.table_name)) for column in data_chars], @@ -216,6 +223,7 @@ def _run_frequency_analysis(sql_generator: ProfilingSQL) -> None: profiling_run = sql_generator.profiling_run profiling_run.set_progress("freq_analysis", "Running") profiling_run.save() + get_current_session().commit() error_data = None try: @@ -230,6 +238,7 @@ def update_frequency_progress(progress: ThreadedProgress) -> None: "freq_analysis", "Running", detail=f"{progress['processed']} of {progress['total']}" ) profiling_run.save() + 
get_current_session().commit() frequency_results, result_columns, error_data = fetch_from_db_threaded( [sql_generator.run_frequency_analysis(ColumnChars(**column)) for column in frequency_columns], @@ -262,6 +271,7 @@ def _run_hygiene_issue_detection(sql_generator: ProfilingSQL) -> None: profiling_run = sql_generator.profiling_run profiling_run.set_progress("hygiene_issues", "Running") profiling_run.save() + get_current_session().commit() try: LOG.info("Detecting functional data types and critical data elements") diff --git a/testgen/commands/run_test_execution.py b/testgen/commands/run_test_execution.py index 7adb15c4..06aae744 100644 --- a/testgen/commands/run_test_execution.py +++ b/testgen/commands/run_test_execution.py @@ -23,7 +23,7 @@ ) from testgen.common.database.database_service import ThreadedProgress, empty_cache from testgen.common.mixpanel_service import MixpanelService -from testgen.common.models import with_database_session +from testgen.common.models import get_current_session, with_database_session from testgen.common.models.connection import Connection from testgen.common.models.table_group import TableGroup from testgen.common.models.test_run import TestRun @@ -78,6 +78,10 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r test_run.init_progress() test_run.set_progress("data_chars", "Running") test_run.save() + # This runs in a subprocess — commit after every save so progress is visible + # to the UI (separate session) and to execute_db_queries (independent connection). 
+ session = get_current_session() + session.commit() try: LOG.info(f"Test run: {test_run.id}, Test suite: {test_suite.test_suite}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}") @@ -101,6 +105,7 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r LOG.info(f"Active test definitions: {len(test_defs)}") test_run.set_progress("validation", "Running") test_run.save() + session.commit() valid_test_defs = run_test_validation(sql_generator, test_defs) invalid_count = len(test_defs) - len(valid_test_defs) @@ -134,6 +139,7 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r LOG.info("Updating test results and test run") test_run.save() + session.commit() execute_db_queries(sql_generator.update_test_results()) # Refresh needed because previous query updates the test run too test_run.refresh() @@ -145,6 +151,7 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r test_run.test_endtime = datetime.now(UTC) + time_delta test_run.status = "Error" test_run.save() + session.commit() send_test_run_notifications(test_run) else: @@ -152,10 +159,12 @@ def run_test_execution(test_suite_id: str | UUID, username: str | None = None, r test_run.test_endtime = datetime.now(UTC) + time_delta test_run.status = "Complete" test_run.save() + session.commit() LOG.info("Updating latest run for test suite") test_suite.last_complete_test_run_id = test_run.id test_suite.save() + session.commit() if not test_suite.is_monitor: send_test_run_notifications(test_run) @@ -222,6 +231,7 @@ def _run_tests( test_run = sql_generator.test_run test_run.set_progress(run_type, "Running") test_run.save() + get_current_session().commit() def update_test_progress(progress: ThreadedProgress) -> None: test_run.set_progress( @@ -233,6 +243,7 @@ def update_test_progress(progress: ThreadedProgress) -> None: else None, ) test_run.save() + get_current_session().commit() LOG.info(f"Running 
{run_type} tests: {len(test_defs)}") test_results, result_columns, error_data = fetch_from_db_threaded( @@ -272,6 +283,7 @@ def _run_cat_tests( test_run = sql_generator.test_run test_run.set_progress("CAT", "Running") test_run.save() + get_current_session().commit() total_count = len(test_defs) LOG.info(f"Aggregating CAT tests: {total_count}") @@ -288,6 +300,7 @@ def update_aggegate_progress(progress: ThreadedProgress) -> None: else None, ) test_run.save() + get_current_session().commit() LOG.info(f"Running aggregated CAT test queries: {len(aggregate_queries)}") aggregate_results, _, aggregate_errors = fetch_from_db_threaded( @@ -317,6 +330,7 @@ def update_aggegate_progress(progress: ThreadedProgress) -> None: error="Rerunning errored tests singly", ) test_run.save() + get_current_session().commit() def update_single_progress(progress: ThreadedProgress) -> None: test_run.set_progress( @@ -328,6 +342,7 @@ def update_single_progress(progress: ThreadedProgress) -> None: ), ) test_run.save() + get_current_session().commit() LOG.info(f"Rerunning errored CAT tests singly: {len(single_test_defs)}") single_results, _, single_errors = fetch_from_db_threaded( diff --git a/testgen/common/models/entity.py b/testgen/common/models/entity.py index 3647e684..3d7560de 100644 --- a/testgen/common/models/entity.py +++ b/testgen/common/models/entity.py @@ -118,7 +118,6 @@ def delete_where(cls, *clauses) -> None: query = delete(cls).where(*clauses) db_session = get_current_session() db_session.execute(query) - db_session.commit() @classmethod def is_in_use(cls, ids: list[str]) -> bool: @@ -145,14 +144,12 @@ def save(self) -> None: db_session = get_current_session() db_session.add(self) db_session.flush([self]) - db_session.commit() db_session.refresh(self, ["id"]) def delete(self) -> None: db_session = get_current_session() db_session.add(self) db_session.delete(self) - db_session.commit() def to_dict(self, json_safe: bool = False): result = {col.name: getattr(self, col.name) for col 
in self.__table__.columns} diff --git a/testgen/common/models/profiling_run.py b/testgen/common/models/profiling_run.py index 7365f8a4..41ae7e16 100644 --- a/testgen/common/models/profiling_run.py +++ b/testgen/common/models/profiling_run.py @@ -246,7 +246,7 @@ def cancel_all_running(cls) -> list[UUID]: ) db_session = get_current_session() rows = db_session.execute(query) - db_session.commit() + db_session.flush() return [r.id for r in rows] @classmethod @@ -254,7 +254,6 @@ def cancel_run(cls, run_id: str | UUID) -> None: query = update(cls).where(cls.id == run_id).values(status="Cancelled", profiling_endtime=datetime.now(UTC)) db_session = get_current_session() db_session.execute(query) - db_session.commit() @classmethod def cascade_delete(cls, ids: list[str]) -> None: @@ -270,7 +269,6 @@ def cascade_delete(cls, ids: list[str]) -> None: """ db_session = get_current_session() db_session.execute(text(query), {"profiling_run_ids": tuple(ids)}) - db_session.commit() cls.delete_where(cls.id.in_(ids)) @classmethod diff --git a/testgen/common/models/scheduler.py b/testgen/common/models/scheduler.py index 86b120d6..7408501d 100644 --- a/testgen/common/models/scheduler.py +++ b/testgen/common/models/scheduler.py @@ -63,24 +63,12 @@ def select_where(cls, *clauses, order_by: str | InstrumentedAttribute | None = N @classmethod def delete(cls, job_id: str | UUID) -> None: query = delete(cls).where(JobSchedule.id == job_id) - db_session = get_current_session() - try: - db_session.execute(query) - except ValueError: - db_session.rollback() - else: - db_session.commit() + get_current_session().execute(query) @classmethod def update_active(cls, job_id: str | UUID, active: bool) -> None: query = update(cls).where(JobSchedule.id == job_id).values(active=active) - db_session = get_current_session() - try: - db_session.execute(query) - except ValueError: - db_session.rollback() - else: - db_session.commit() + get_current_session().execute(query) @classmethod def count(cls): @@ -101,4 
+89,3 @@ def cron_tz_str(self) -> str: def save(self) -> None: db_session = get_current_session() db_session.add(self) - db_session.commit() diff --git a/testgen/common/models/scores.py b/testgen/common/models/scores.py index 61c3ceb4..788ee00b 100644 --- a/testgen/common/models/scores.py +++ b/testgen/common/models/scores.py @@ -186,14 +186,12 @@ def save(self) -> None: db_session = get_current_session() db_session.add(self) db_session.flush([self]) - db_session.commit() db_session.refresh(self, ["id"]) def delete(self) -> None: db_session = get_current_session() db_session.add(self) db_session.delete(self) - db_session.commit() def clear_results(self) -> None: db_session = get_current_session() diff --git a/testgen/common/models/settings.py b/testgen/common/models/settings.py index 4d9d67c9..f98b1565 100644 --- a/testgen/common/models/settings.py +++ b/testgen/common/models/settings.py @@ -37,7 +37,7 @@ def set(cls, key: str, value: Any): ps.value = value else: session.add(cls(key=key, value=value)) - session.commit() + session.flush() def __repr__(self): return f"{self.__class__.__name__}(key={self.key!r} value={self.value!r})" diff --git a/testgen/common/models/table_group.py b/testgen/common/models/table_group.py index 80b9a6d2..7ca2dd90 100644 --- a/testgen/common/models/table_group.py +++ b/testgen/common/models/table_group.py @@ -420,7 +420,6 @@ def cascade_delete(cls, ids: list[str]) -> None: params = {"table_group_ids": tuple(ids)} db_session = get_current_session() db_session.execute(text(query), params) - db_session.commit() cls.delete_where(cls.id.in_(ids)) @classmethod @@ -440,7 +439,6 @@ def save(self, add_scorecard_definition: bool = False) -> None: query = update(TableGroup).where(TableGroup.id == self.id).values(**values) db_session = get_current_session() db_session.execute(query) - db_session.commit() else: super().save() if add_scorecard_definition: diff --git a/testgen/common/models/test_definition.py b/testgen/common/models/test_definition.py 
index 7eb03a66..e9e2651c 100644 --- a/testgen/common/models/test_definition.py +++ b/testgen/common/models/test_definition.py @@ -313,7 +313,6 @@ def set_status_attribute( db_session = get_current_session() db_session.execute(text(query), params) - db_session.commit() @classmethod def move( @@ -348,7 +347,6 @@ def move( db_session = get_current_session() db_session.execute(text(query), params) - db_session.commit() @classmethod def copy( @@ -389,7 +387,6 @@ def copy( ) db_session = get_current_session() db_session.execute(query) - db_session.commit() @classmethod def clear_cache(cls) -> bool: @@ -406,7 +403,6 @@ def save(self) -> None: query = update(TestDefinition).where(TestDefinition.id == self.id).values(**values) db_session = get_current_session() db_session.execute(query) - db_session.commit() else: super().save() @@ -429,7 +425,6 @@ def add_note(cls, test_definition_id: str | UUID, detail: str, username: str) -> db_session.execute( insert(cls).values(test_definition_id=test_definition_id, detail=detail, created_by=username) ) - db_session.commit() @classmethod def update_note(cls, note_id: str | UUID, detail: str) -> None: @@ -437,13 +432,11 @@ def update_note(cls, note_id: str | UUID, detail: str) -> None: db_session.execute( update(cls).where(cls.id == note_id).values(detail=detail, updated_at=func.now()) ) - db_session.commit() @classmethod def delete_note(cls, note_id: str | UUID) -> None: db_session = get_current_session() db_session.execute(delete(cls).where(cls.id == note_id)) - db_session.commit() @classmethod def get_notes_count_by_ids(cls, test_definition_ids: list[str]) -> dict[str, int]: diff --git a/testgen/common/models/test_run.py b/testgen/common/models/test_run.py index 55c7dd56..1517bb4e 100644 --- a/testgen/common/models/test_run.py +++ b/testgen/common/models/test_run.py @@ -348,7 +348,7 @@ def cancel_all_running(cls) -> list[UUID]: ) db_session = get_current_session() rows = db_session.execute(query) - db_session.commit() + 
db_session.flush() return [r.id for r in rows] @classmethod @@ -356,7 +356,6 @@ def cancel_run(cls, run_id: str | UUID) -> None: query = update(cls).where(cls.id == run_id).values(status="Cancelled", test_endtime=datetime.now(UTC)) db_session = get_current_session() db_session.execute(query) - db_session.commit() @classmethod def cascade_delete(cls, ids: list[str]) -> None: @@ -366,7 +365,6 @@ def cascade_delete(cls, ids: list[str]) -> None: """ db_session = get_current_session() db_session.execute(text(query), {"test_run_ids": tuple(ids)}) - db_session.commit() cls.delete_where(cls.id.in_(ids)) @classmethod diff --git a/testgen/common/models/test_suite.py b/testgen/common/models/test_suite.py index a8c35b8d..18e29cd3 100644 --- a/testgen/common/models/test_suite.py +++ b/testgen/common/models/test_suite.py @@ -246,7 +246,6 @@ def cascade_delete(cls, ids: list[str]) -> None: """ db_session = get_current_session() db_session.execute(text(query), {"test_suite_ids": tuple(ids)}) - db_session.commit() cls.delete_where(cls.id.in_(ids)) @classmethod diff --git a/testgen/common/models/user.py b/testgen/common/models/user.py index f0fbd6e3..b4e1d575 100644 --- a/testgen/common/models/user.py +++ b/testgen/common/models/user.py @@ -36,7 +36,6 @@ def save(self, update_latest_login: bool = False) -> None: query = update(User).where(User.id == self.id).values(**values) db_session = get_current_session() db_session.execute(query) - db_session.commit() else: if update_latest_login: self.latest_login = datetime.now(UTC) diff --git a/testgen/ui/app.py b/testgen/ui/app.py index b5b4fbcd..5ed2bc72 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -6,7 +6,7 @@ from testgen import settings from testgen.common import version_service from testgen.common.docker_service import check_basic_configuration -from testgen.common.models import with_database_session +from testgen.common.models import get_current_session, with_database_session from testgen.common.models.project import 
Project from testgen.ui import bootstrap from testgen.ui.assets import get_asset_path @@ -17,67 +17,74 @@ @with_database_session def render(log_level: int = logging.INFO): - st.set_page_config( - page_title="TestGen", - page_icon=get_asset_path("favicon.ico"), - layout="wide", - # Collapse when logging out or on the no-project page (no sidebar content on either) - initial_sidebar_state="collapsed" - if (session.auth and session.auth.logging_out) or session.current_page == "no-project" - else "auto", - ) - - application = get_application(log_level=log_level) - application.logger.debug("Starting Streamlit re-run") - if not session.auth: - session.auth = application.auth_class() - - status_ok, message = check_basic_configuration() - if not status_ok: - st.markdown(f":red[{message}]") - return - - set_locale() - - if session.auth.logging_out: - session.sidebar_project = None - else: - session.sidebar_project = ( - session.page_args_pending_router and session.page_args_pending_router.get("project_code") - ) or st.query_params.get("project_code", session.sidebar_project) - - if not session.auth.is_logged_in and not session.auth.logging_out: - session.auth.load_user_session() - - if session.auth.is_logged_in and not session.auth.logging_out: - session.auth.load_user_role() - - application.logo.render() - - if session.auth.is_logged_in and not session.auth.logging_in and not session.auth.logging_out: - current_page = session.current_page - if not current_page: - try: - current_page = urlparse(st.context.url).path.lstrip("/") - except Exception: - current_page = "" - is_global_context = current_page in application.global_admin_paths - if current_page != "no-project": - with st.sidebar: - testgen.sidebar( - projects=[] if is_global_context else [ - p for p in Project.select_where() if session.auth.user_has_project_access(p.project_code) - ], - current_project=None if is_global_context else session.sidebar_project, - menu=application.menu, - current_page=session.current_page, 
- version=version_service.get_version(), - support_email=settings.SUPPORT_EMAIL, - global_context=is_global_context, - is_global_admin=session.auth.user_has_permission("global_admin") and bool(application.global_admin_paths), - ) - - application.router.run() + try: + st.set_page_config( + page_title="TestGen", + page_icon=get_asset_path("favicon.ico"), + layout="wide", + # Collapse when logging out or on the no-project page (no sidebar content on either) + initial_sidebar_state="collapsed" + if (session.auth and session.auth.logging_out) or session.current_page == "no-project" + else "auto", + ) + + application = get_application(log_level=log_level) + application.logger.debug("Starting Streamlit re-run") + if not session.auth: + session.auth = application.auth_class() + + status_ok, message = check_basic_configuration() + if not status_ok: + st.markdown(f":red[{message}]") + return + + set_locale() + + if session.auth.logging_out: + session.sidebar_project = None + else: + session.sidebar_project = ( + session.page_args_pending_router and session.page_args_pending_router.get("project_code") + ) or st.query_params.get("project_code", session.sidebar_project) + + if not session.auth.is_logged_in and not session.auth.logging_out: + session.auth.load_user_session() + + if session.auth.is_logged_in and not session.auth.logging_out: + session.auth.load_user_role() + + application.logo.render() + + if session.auth.is_logged_in and not session.auth.logging_in and not session.auth.logging_out: + current_page = session.current_page + if not current_page: + try: + current_page = urlparse(st.context.url).path.lstrip("/") + except Exception: + current_page = "" + is_global_context = current_page in application.global_admin_paths + if current_page != "no-project": + with st.sidebar: + testgen.sidebar( + projects=[] if is_global_context else [ + p for p in Project.select_where() if session.auth.user_has_project_access(p.project_code) + ], + current_project=None if 
is_global_context else session.sidebar_project, + menu=application.menu, + current_page=session.current_page, + version=version_service.get_version(), + support_email=settings.SUPPORT_EMAIL, + global_context=is_global_context, + is_global_admin=session.auth.user_has_permission("global_admin") and bool(application.global_admin_paths), + ) + + application.router.run() + finally: + # Safety net: commit any flushed-but-uncommitted work (e.g., PersistedSetting writes) + # before RerunException propagates and bypasses database_session()'s normal commit. + db_session = get_current_session() + if db_session: + db_session.commit() @st.cache_resource(validate=lambda _: not settings.IS_DEBUG, show_spinner=False) diff --git a/testgen/ui/services/rerun_service.py b/testgen/ui/services/rerun_service.py index 34afce09..1f812a46 100644 --- a/testgen/ui/services/rerun_service.py +++ b/testgen/ui/services/rerun_service.py @@ -9,12 +9,11 @@ def safe_rerun(*, scope: Literal["app", "fragment"] = "app") -> NoReturn: """Commit any pending database changes, then trigger a Streamlit rerun. Prevents data loss when RerunException propagates through the - session context manager in app.py:render(). Always clears the - Streamlit data cache so stale query results are not served after - database mutations. + session context manager in app.py:render(). Clears the Streamlit + data cache when a database session is active (writes may have occurred). 
""" session = get_current_session() if session: session.commit() - st.cache_data.clear() + st.cache_data.clear() st.rerun(scope=scope) From 3b18a69278e72cd90b0831e66694aa312b117f30 Mon Sep 17 00:00:00 2001 From: Luis Date: Tue, 24 Mar 2026 12:43:33 -0400 Subject: [PATCH 64/95] fix(ui): render portals/tooltips on top of Streamlit dialogs --- testgen/ui/static/js/components/portal.js | 16 +++++++++++++++- testgen/ui/static/js/components/tooltip.js | 20 ++++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/testgen/ui/static/js/components/portal.js b/testgen/ui/static/js/components/portal.js index b1683dd3..de0278af 100644 --- a/testgen/ui/static/js/components/portal.js +++ b/testgen/ui/static/js/components/portal.js @@ -18,6 +18,9 @@ import van from '../van.min.js'; import { getValue } from '../utils.js'; +const STREAMLIT_DIALOG_ZINDEX = 1000060; +const STREAMLIT_DIALOG_CLASS = 'stDialog'; + const Portal = (/** @type Options */ options, ...args) => { const { target, align = 'left', position = 'bottom' } = getValue(options); const id = `${target}-portal`; @@ -52,6 +55,8 @@ const Portal = (/** @type Options */ options, ...args) => { if (!anchor) return; const fixed = hasFixedAncestor(anchor); + const fromDialog = hasStreamlitDialogAncestor(anchor); + const zIndex = fromDialog ? (STREAMLIT_DIALOG_ZINDEX + 1) : 1001; const coords = position === 'bottom' ? calculateBottomPosition(anchor, align, fixed) : calculateTopPosition(anchor, align, fixed); @@ -72,7 +77,7 @@ const Portal = (/** @type Options */ options, ...args) => { portalEl.id = id; portalEl.className = getValue(options.class) ?? ''; - portalEl.style.cssText = `position: ${fixed ? 'fixed' : 'absolute'}; z-index: 1001; ${coords} ${getValue(options.style) ?? ''}`; + portalEl.style.cssText = `position: ${fixed ? 'fixed' : 'absolute'}; z-index: ${zIndex}; ${coords} ${getValue(options.style) ?? 
''}`; }); return ''; @@ -87,6 +92,15 @@ function hasFixedAncestor(el) { return false; } +function hasStreamlitDialogAncestor(el) { + let node = el.parentElement; + while (node && node !== document.body) { + if (node.classList.contains(STREAMLIT_DIALOG_CLASS)) return true; + node = node.parentElement; + } + return false; +} + function calculateBottomPosition(anchor, align, fixed = false) { const r = anchor.getBoundingClientRect(); const top = fixed ? r.bottom : r.bottom + window.scrollY; diff --git a/testgen/ui/static/js/components/tooltip.js b/testgen/ui/static/js/components/tooltip.js index 77af4b9b..6da8c523 100644 --- a/testgen/ui/static/js/components/tooltip.js +++ b/testgen/ui/static/js/components/tooltip.js @@ -18,6 +18,8 @@ import { getValue, loadStylesheet } from '../utils.js'; const { div, span } = van.tags; const defaultPosition = 'top'; +const STREAMLIT_DIALOG_ZINDEX = 1000060; +const STREAMLIT_DIALOG_CLASS = 'stDialog'; const Tooltip = (/** @type Properties */ props) => { loadStylesheet('tooltip', stylesheet); @@ -57,11 +59,12 @@ const withTooltip = (/** @type HTMLElement */ component, /** @type Properties */ const showTooltip = van.state(false); const positionStyle = van.state(''); + const zIndex = van.state(9999); const tooltipEl = span( { class: () => `tg-tooltip portal ${getValue(tooltipProps.position) || defaultPosition} ${showTooltip.val ? '' : 'hidden'}`, - style: () => `opacity: ${showTooltip.val ? 1 : 0}; pointer-events: none; max-width: ${getValue(tooltipProps.width) || '400'}px; ${positionStyle.val}${getValue(tooltipProps.style) ?? ''}`, + style: () => `opacity: ${showTooltip.val ? 1 : 0}; pointer-events: none; z-index: ${zIndex.val ?? 9999}; max-width: ${getValue(tooltipProps.width) || '400'}px; ${positionStyle.val}${getValue(tooltipProps.style) ?? 
''}`, }, tooltipProps.text, div({ class: 'tg-tooltip--triangle' }), @@ -71,6 +74,11 @@ const withTooltip = (/** @type HTMLElement */ component, /** @type Properties */ requestAnimationFrame(() => { if (!component.isConnected) return; + + if (hasStreamlitDialogAncestor(component)) { + zIndex.val = STREAMLIT_DIALOG_ZINDEX + 1; + } + const observer = new MutationObserver(() => { if (!component.isConnected) { tooltipEl.remove(); @@ -91,6 +99,15 @@ const withTooltip = (/** @type HTMLElement */ component, /** @type Properties */ return component; }; +function hasStreamlitDialogAncestor(el) { + let node = el.parentElement; + while (node && node !== document.body) { + if (node.classList.contains(STREAMLIT_DIALOG_CLASS)) return true; + node = node.parentElement; + } + return false; +} + const stylesheet = new CSSStyleSheet(); stylesheet.replace(` .tg-tooltip { @@ -110,7 +127,6 @@ stylesheet.replace(` .tg-tooltip.portal { position: fixed; - z-index: 9999; top: unset; bottom: unset; left: unset; From 9e7ce46478928ca9c4facf44498814c64c672440 Mon Sep 17 00:00:00 2001 From: Luis Date: Tue, 24 Mar 2026 13:40:11 -0400 Subject: [PATCH 65/95] fix(ui): add support for caption in select options --- testgen/ui/static/js/components/select.js | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/testgen/ui/static/js/components/select.js b/testgen/ui/static/js/components/select.js index efc055c4..cea87c88 100644 --- a/testgen/ui/static/js/components/select.js +++ b/testgen/ui/static/js/components/select.js @@ -4,6 +4,7 @@ * @property {string} label * @property {string} value * @property {string?} icon + * @property {string?} caption * * @typedef Properties * @type {object} @@ -194,17 +195,19 @@ const Select = (/** @type {Properties} */ props) => { getValue(filteredOptions).map(option => div( { - class: () => `tg-select--option ${getValue(value) === option.value ? 
'selected' : ''}`, + class: () => `tg-select--option flex-column fx-justify-center ${getValue(value) === option.value ? 'selected' : ''} ${option.caption ? 'has-caption' : ''}`, onclick: (/** @type Event */ event) => { changeSelection(option); event.stopPropagation(); }, 'data-testid': 'select-options-item', }, - option.icon - ? Icon({ classes: 'mr-2' }, option.icon) - : undefined, - span(option.label), + div( + {class: 'flex-row fx-gap-2'}, + option.icon ? Icon({}, option.icon) : '', + span(option.label), + ), + option.caption ? span({class: 'text-small text-secondary'}, option.caption) : '', ) ), ), @@ -408,8 +411,6 @@ stylesheet.replace(` } .tg-select--option { - display: flex; - align-items: center; height: 40px; padding: 0px 16px; cursor: pointer; @@ -420,6 +421,11 @@ stylesheet.replace(` background: var(--select-hover-background); } +.tg-select--option.has-caption { + height: auto; + padding: 2px 16px; +} + .tg-select--option.selected { background: var(--select-hover-background); color: var(--primary-color); From 8e5e7b239ab66a7fa5481c67f41bdcc1a88d8361 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 25 Mar 2026 09:15:26 -0300 Subject: [PATCH 66/95] feat(mcp): sanitize errors at tool/resource/prompt boundary Add MCPUserError exception hierarchy and mcp_error_handler decorator. User-facing errors return their message; unexpected exceptions are logged and a neutral message is returned to the client. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/mcp/exceptions.py | 44 ++++++++++++++++ testgen/mcp/permissions.py | 17 +++---- testgen/mcp/server.py | 40 +++++++++------ testgen/mcp/tools/discovery.py | 3 +- testgen/mcp/tools/test_results.py | 7 +-- tests/unit/mcp/test_error_boundary.py | 61 +++++++++++++++++++++++ tests/unit/mcp/test_permissions.py | 21 ++++---- tests/unit/mcp/test_tools_discovery.py | 20 ++++---- tests/unit/mcp/test_tools_test_results.py | 9 ++-- tests/unit/mcp/test_tools_test_runs.py | 18 +++---- 10 files changed, 176 insertions(+), 64 deletions(-) create mode 100644 testgen/mcp/exceptions.py create mode 100644 tests/unit/mcp/test_error_boundary.py diff --git a/testgen/mcp/exceptions.py b/testgen/mcp/exceptions.py new file mode 100644 index 00000000..dc8d1444 --- /dev/null +++ b/testgen/mcp/exceptions.py @@ -0,0 +1,44 @@ +"""MCP exception hierarchy and error boundary. + +``MCPUserError`` (and its subclasses) carry safe, user-facing messages. +``mcp_error_boundary`` is a decorator that catches them and converts to +text, while neutralising unexpected exceptions. +""" + +import functools +import logging + +LOG = logging.getLogger("testgen") + + +class MCPUserError(Exception): + """Safe, user-facing error for MCP tools, prompts, and resources. + + The error boundary converts ``str(e)`` into the response text. + All other exceptions are treated as unexpected: their traceback is + logged and a neutral message is returned to the client. + """ + + +class MCPPermissionDenied(MCPUserError): + """Raised when access is denied due to insufficient project permissions.""" + + +def mcp_error_handler(fn): + """Wrap an MCP handler (tool, resource, or prompt) with safe error handling. + + - ``MCPUserError`` (including ``MCPPermissionDenied``) → ``str(e)`` as the response. + - Any other exception → traceback logged, neutral message returned. 
+ """ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + try: + return fn(*args, **kwargs) + except MCPUserError as e: + return str(e) + except Exception: + LOG.exception("Unhandled error in MCP handler '%s'", fn.__name__) + return "An unexpected error occurred." + + return wrapper diff --git a/testgen/mcp/permissions.py b/testgen/mcp/permissions.py index ed45a645..47ac21da 100644 --- a/testgen/mcp/permissions.py +++ b/testgen/mcp/permissions.py @@ -7,6 +7,7 @@ from testgen.common.models.project_membership import ProjectMembership from testgen.common.models.user import User +from testgen.mcp.exceptions import MCPPermissionDenied from testgen.utils.plugins import PluginHook _NOT_SET = object() @@ -17,10 +18,6 @@ ) -class MCPPermissionDenied(Exception): - """Raised by ProjectPermissions when access is denied. Caught by the decorator.""" - - @dataclass(frozen=True, slots=True) class ProjectPermissions: memberships: dict[str, str] # {project_code: role} @@ -105,9 +102,9 @@ def mcp_permission(permission: str) -> Callable: permission, and stores it in a ContextVar. The tool retrieves the value via ``get_project_permissions()``. - If the user has no projects with the required permission, returns an - early denial message. Catches MCPPermissionDenied raised by tool code - and returns str(e) as the tool response. + Raises ``MCPPermissionDenied`` if the user has no projects with the required + permission. Other ``MCPPermissionDenied`` exceptions from tool code propagate + through — the ``safe_tool`` error boundary handles conversion to text. """ def decorator(fn: Callable) -> Callable: @@ -116,12 +113,12 @@ def wrapper(*args, **kwargs): user = get_current_mcp_user() perms = _compute_project_permissions(user, permission) if not perms.allowed_codes: - return "Your role does not include the necessary permission for this operation on any project." + raise MCPPermissionDenied( + "Your role does not include the necessary permission for this operation on any project." 
+ ) tok = _mcp_project_permissions.set(perms) try: return fn(*args, **kwargs) - except MCPPermissionDenied as e: - return str(e) finally: _mcp_project_permissions.reset(tok) diff --git a/testgen/mcp/server.py b/testgen/mcp/server.py index 56f7d81c..34a2174b 100644 --- a/testgen/mcp/server.py +++ b/testgen/mcp/server.py @@ -80,6 +80,7 @@ def _configure_mcp_logging() -> None: def run_mcp() -> None: """Start the MCP server with streamable HTTP transport.""" from testgen.mcp import get_server_url + from testgen.mcp.exceptions import mcp_error_handler from testgen.mcp.prompts.workflows import compare_runs, health_check, investigate_failures, table_health from testgen.mcp.tools.discovery import get_data_inventory, list_projects, list_tables, list_test_suites from testgen.mcp.tools.reference import get_test_type, glossary_resource, test_types_resource @@ -105,26 +106,35 @@ def run_mcp() -> None: ) _configure_mcp_logging() + def safe_tool(fn): + mcp.tool()(mcp_error_handler(fn)) + + def safe_resource(uri, fn): + mcp.resource(uri)(mcp_error_handler(fn)) + + def safe_prompt(fn): + mcp.prompt()(mcp_error_handler(fn)) + # Tools (9) - mcp.tool()(get_data_inventory) - mcp.tool()(list_projects) - mcp.tool()(list_tables) - mcp.tool()(list_test_suites) - mcp.tool()(get_recent_test_runs) - mcp.tool()(get_test_results) - mcp.tool()(get_test_result_history) - mcp.tool()(get_failure_summary) - mcp.tool()(get_test_type) + safe_tool(get_data_inventory) + safe_tool(list_projects) + safe_tool(list_tables) + safe_tool(list_test_suites) + safe_tool(get_recent_test_runs) + safe_tool(get_test_results) + safe_tool(get_test_result_history) + safe_tool(get_failure_summary) + safe_tool(get_test_type) # Resources (2) - mcp.resource("testgen://test-types")(test_types_resource) - mcp.resource("testgen://glossary")(glossary_resource) + safe_resource("testgen://test-types", test_types_resource) + safe_resource("testgen://glossary", glossary_resource) # Prompts (4) - mcp.prompt()(health_check) - 
mcp.prompt()(investigate_failures) - mcp.prompt()(table_health) - mcp.prompt()(compare_runs) + safe_prompt(health_check) + safe_prompt(investigate_failures) + safe_prompt(table_health) + safe_prompt(compare_runs) LOG.info("Starting MCP server on %s:%s (auth issuer: %s)", settings.MCP_HOST, settings.MCP_PORT, server_url) diff --git a/testgen/mcp/tools/discovery.py b/testgen/mcp/tools/discovery.py index 360c2ac4..358f03fb 100644 --- a/testgen/mcp/tools/discovery.py +++ b/testgen/mcp/tools/discovery.py @@ -4,6 +4,7 @@ from testgen.common.models.data_table import DataTable from testgen.common.models.project import Project from testgen.common.models.test_suite import TestSuite +from testgen.mcp.exceptions import MCPUserError from testgen.mcp.permissions import get_project_permissions, mcp_permission @@ -104,7 +105,7 @@ def list_tables(table_group_id: str, limit: int = 200, page: int = 1) -> str: try: group_uuid = UUID(table_group_id) except (ValueError, AttributeError) as err: - raise ValueError(f"Invalid table_group_id: `{table_group_id}` is not a valid UUID.") from err + raise MCPUserError(f"Invalid table_group_id: `{table_group_id}` is not a valid UUID.") from err perms = get_project_permissions() project_codes = perms.allowed_codes diff --git a/testgen/mcp/tools/test_results.py b/testgen/mcp/tools/test_results.py index 9eff3e22..c76f2e5d 100644 --- a/testgen/mcp/tools/test_results.py +++ b/testgen/mcp/tools/test_results.py @@ -3,6 +3,7 @@ from testgen.common.models import with_database_session from testgen.common.models.test_definition import TestType from testgen.common.models.test_result import TestResult, TestResultStatus +from testgen.mcp.exceptions import MCPUserError from testgen.mcp.permissions import get_project_permissions, mcp_permission @@ -10,7 +11,7 @@ def _parse_uuid(value: str, label: str = "ID") -> UUID: try: return UUID(value) except (ValueError, AttributeError) as err: - raise ValueError(f"Invalid {label}: `{value}` is not a valid UUID.") from err 
+ raise MCPUserError(f"Invalid {label}: `{value}` is not a valid UUID.") from err def _parse_status(value: str) -> TestResultStatus: @@ -18,14 +19,14 @@ def _parse_status(value: str) -> TestResultStatus: return TestResultStatus(value) except ValueError as err: valid = ", ".join(s.value for s in TestResultStatus) - raise ValueError(f"Invalid status `{value}`. Valid values: {valid}") from err + raise MCPUserError(f"Invalid status `{value}`. Valid values: {valid}") from err def _resolve_test_type(short_name: str) -> str: """Resolve a test type short name to its internal code.""" matches = TestType.select_where(TestType.test_name_short == short_name) if not matches: - raise ValueError(f"Unknown test type: `{short_name}`. Use the testgen://test-types resource to see available types.") + raise MCPUserError(f"Unknown test type: `{short_name}`. Use the testgen://test-types resource to see available types.") return matches[0].test_type diff --git a/tests/unit/mcp/test_error_boundary.py b/tests/unit/mcp/test_error_boundary.py new file mode 100644 index 00000000..a49ca36a --- /dev/null +++ b/tests/unit/mcp/test_error_boundary.py @@ -0,0 +1,61 @@ +"""Tests for the mcp_error_boundary decorator.""" + +import logging + +from testgen.mcp.exceptions import MCPPermissionDenied, MCPUserError, mcp_error_handler + + +def test_returns_normal_result(): + @mcp_error_handler + def my_tool(x: int) -> str: + return f"result: {x}" + + assert my_tool(42) == "result: 42" + + +def test_converts_mcp_user_error_to_string(): + @mcp_error_handler + def failing_tool(): + raise MCPUserError("Invalid table_group_id: `abc` is not a valid UUID.") + + assert failing_tool() == "Invalid table_group_id: `abc` is not a valid UUID." + + +def test_converts_permission_denied_to_string(): + @mcp_error_handler + def restricted_tool(): + raise MCPPermissionDenied("Your role does not include the necessary permission.") + + assert restricted_tool() == "Your role does not include the necessary permission." 
+ + +def test_catches_unexpected_error_and_returns_neutral_message(): + @mcp_error_handler + def broken_tool(): + raise RuntimeError("DB connection pool exhausted") + + result = broken_tool() + assert result == "An unexpected error occurred." + assert "DB connection pool" not in result + + +def test_logs_unexpected_error_traceback(caplog): + @mcp_error_handler + def broken_tool(): + raise RuntimeError("secret internal detail") + + with caplog.at_level(logging.ERROR, logger="testgen"): + broken_tool() + + assert "secret internal detail" in caplog.text + assert "broken_tool" in caplog.text + + +def test_preserves_function_metadata(): + @mcp_error_handler + def my_tool(x: int, y: str = "default") -> str: + """Tool docstring.""" + return f"{x}-{y}" + + assert my_tool.__name__ == "my_tool" + assert my_tool.__doc__ == "Tool docstring." diff --git a/tests/unit/mcp/test_permissions.py b/tests/unit/mcp/test_permissions.py index da980825..6f7b1512 100644 --- a/tests/unit/mcp/test_permissions.py +++ b/tests/unit/mcp/test_permissions.py @@ -3,9 +3,9 @@ import pytest +from testgen.mcp.exceptions import MCPPermissionDenied from testgen.mcp.permissions import ( _NOT_SET, - MCPPermissionDenied, ProjectPermissions, _compute_project_permissions, _mcp_project_permissions, @@ -210,8 +210,8 @@ def tool_fn(): @patch("testgen.mcp.permissions.ProjectMembership") -def test_mcp_permission_early_return_when_no_allowed_codes(mock_membership): - """Decorator returns early if user has no projects with the required permission.""" +def test_mcp_permission_raises_when_no_allowed_codes(mock_membership): + """Decorator raises MCPPermissionDenied if user has no projects with the required permission.""" set_mcp_username("test") m1 = MagicMock() @@ -223,23 +223,20 @@ def test_mcp_permission_early_return_when_no_allowed_codes(mock_membership): def tool_fn(): raise AssertionError("Should not be called") - result = tool_fn() - - assert "permission" in result - assert "role" in result.lower() + with 
pytest.raises(MCPPermissionDenied, match="permission"): + tool_fn() -def test_mcp_permission_catches_mcp_permission_denied(): - """Decorator catches MCPPermissionDenied and returns str(e).""" +def test_mcp_permission_propagates_mcp_permission_denied(): + """Decorator lets MCPPermissionDenied propagate — safe_tool handles conversion.""" set_mcp_username("test") @mcp_permission("view") def tool_fn(): raise MCPPermissionDenied("Access denied for testing") - result = tool_fn() - - assert result == "Access denied for testing" + with pytest.raises(MCPPermissionDenied, match="Access denied for testing"): + tool_fn() def test_mcp_permission_resets_contextvar_after_call(): diff --git a/tests/unit/mcp/test_tools_discovery.py b/tests/unit/mcp/test_tools_discovery.py index 48438cb6..a04379d9 100644 --- a/tests/unit/mcp/test_tools_discovery.py +++ b/tests/unit/mcp/test_tools_discovery.py @@ -1,6 +1,9 @@ from unittest.mock import MagicMock, patch from uuid import uuid4 +import pytest + +from testgen.mcp.exceptions import MCPPermissionDenied from testgen.mcp.permissions import ProjectPermissions @@ -144,7 +147,7 @@ def test_list_test_suites_empty(mock_suite, db_session_mock): from testgen.mcp.tools.discovery import list_test_suites - result = list_test_suites("nonexistent") + result = list_test_suites("demo") assert "No test suites found" in result @@ -159,7 +162,7 @@ def test_list_test_suites_empty_project_code(db_session_mock): @patch("testgen.mcp.permissions._compute_project_permissions") -def test_list_test_suites_returns_not_found_for_inaccessible_project( +def test_list_test_suites_raises_not_found_for_inaccessible_project( mock_compute, db_session_mock, ): mock_compute.return_value = ProjectPermissions( @@ -169,13 +172,12 @@ def test_list_test_suites_returns_not_found_for_inaccessible_project( from testgen.mcp.tools.discovery import list_test_suites - result = list_test_suites("secret_project") - - assert "No test suites found for project `secret_project`" in result + with 
pytest.raises(MCPPermissionDenied, match="No test suites found for project `secret_project`"): + list_test_suites("secret_project") @patch("testgen.mcp.permissions._compute_project_permissions") -def test_list_test_suites_returns_denial_for_insufficient_permission( +def test_list_test_suites_raises_denial_for_insufficient_permission( mock_compute, db_session_mock, ): mock_compute.return_value = ProjectPermissions( @@ -185,10 +187,8 @@ def test_list_test_suites_returns_denial_for_insufficient_permission( from testgen.mcp.tools.discovery import list_test_suites - result = list_test_suites("secret_project") - - assert "necessary permission" in result - assert "role" in result.lower() + with pytest.raises(MCPPermissionDenied, match="necessary permission"): + list_test_suites("secret_project") @patch("testgen.mcp.tools.discovery.DataTable") diff --git a/tests/unit/mcp/test_tools_test_results.py b/tests/unit/mcp/test_tools_test_results.py index b1b6cea2..1fd2812e 100644 --- a/tests/unit/mcp/test_tools_test_results.py +++ b/tests/unit/mcp/test_tools_test_results.py @@ -4,6 +4,7 @@ import pytest from testgen.common.models.test_result import TestResultStatus +from testgen.mcp.exceptions import MCPUserError from testgen.mcp.permissions import ProjectPermissions @@ -98,14 +99,14 @@ def test_get_test_results_with_filters(mock_result, mock_tt_cls, db_session_mock def test_get_test_results_invalid_uuid(db_session_mock): from testgen.mcp.tools.test_results import get_test_results - with pytest.raises(ValueError, match="not a valid UUID"): + with pytest.raises(MCPUserError, match="not a valid UUID"): get_test_results("not-a-uuid") def test_get_test_results_invalid_status(db_session_mock): from testgen.mcp.tools.test_results import get_test_results - with pytest.raises(ValueError, match="Invalid status"): + with pytest.raises(MCPUserError, match="Invalid status"): get_test_results(str(uuid4()), status="BadStatus") @@ -196,7 +197,7 @@ def 
test_get_failure_summary_by_column(mock_result, db_session_mock): def test_get_failure_summary_invalid_uuid(db_session_mock): from testgen.mcp.tools.test_results import get_failure_summary - with pytest.raises(ValueError, match="not a valid UUID"): + with pytest.raises(MCPUserError, match="not a valid UUID"): get_failure_summary("bad-uuid") @@ -273,7 +274,7 @@ def test_get_test_result_history_empty(mock_result, db_session_mock): def test_get_test_result_history_invalid_uuid(db_session_mock): from testgen.mcp.tools.test_results import get_test_result_history - with pytest.raises(ValueError, match="not a valid UUID"): + with pytest.raises(MCPUserError, match="not a valid UUID"): get_test_result_history("bad-uuid") diff --git a/tests/unit/mcp/test_tools_test_runs.py b/tests/unit/mcp/test_tools_test_runs.py index 1cbb7b99..5b9a7d28 100644 --- a/tests/unit/mcp/test_tools_test_runs.py +++ b/tests/unit/mcp/test_tools_test_runs.py @@ -1,6 +1,9 @@ from unittest.mock import MagicMock, patch from uuid import uuid4 +import pytest + +from testgen.mcp.exceptions import MCPPermissionDenied from testgen.mcp.permissions import ProjectPermissions @@ -135,7 +138,7 @@ def test_get_recent_test_runs_empty_project_code(db_session_mock): @patch("testgen.mcp.permissions._compute_project_permissions") -def test_get_recent_test_runs_returns_not_found_for_inaccessible_project( +def test_get_recent_test_runs_raises_not_found_for_inaccessible_project( mock_compute, db_session_mock, ): mock_compute.return_value = ProjectPermissions( @@ -145,13 +148,12 @@ def test_get_recent_test_runs_returns_not_found_for_inaccessible_project( from testgen.mcp.tools.test_runs import get_recent_test_runs - result = get_recent_test_runs("secret_project") - - assert "No completed test runs found in project `secret_project`" in result + with pytest.raises(MCPPermissionDenied, match="No completed test runs found in project `secret_project`"): + get_recent_test_runs("secret_project") 
@patch("testgen.mcp.permissions._compute_project_permissions") -def test_get_recent_test_runs_returns_denial_for_insufficient_permission( +def test_get_recent_test_runs_raises_denial_for_insufficient_permission( mock_compute, db_session_mock, ): mock_compute.return_value = ProjectPermissions( @@ -161,7 +163,5 @@ def test_get_recent_test_runs_returns_denial_for_insufficient_permission( from testgen.mcp.tools.test_runs import get_recent_test_runs - result = get_recent_test_runs("secret_project") - - assert "necessary permission" in result - assert "role" in result.lower() + with pytest.raises(MCPPermissionDenied, match="necessary permission"): + get_recent_test_runs("secret_project") From 8bdaa83de315942ff74534ace1760c3d481dba78 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 17 Mar 2026 23:09:23 -0400 Subject: [PATCH 67/95] feat: pii masking, xde, hash fingerprints --- testgen/commands/queries/profiling_query.py | 4 +- testgen/commands/run_profiling.py | 18 + testgen/common/models/data_column.py | 29 ++ testgen/common/models/table_group.py | 3 + testgen/common/pii_masking.py | 68 +++ .../030_initialize_new_schema_structure.sql | 3 + ...file_anomaly_types_Non_Printing_Chars.yaml | 16 +- ...maly_types_Standardized_Value_Matches.yaml | 22 +- .../test_types_Freshness_Trend.yaml | 18 +- .../test_types_Table_Freshness.yaml | 18 +- .../dbupgrade/0178_incremental_upgrade.sql | 7 + .../dbupgrade/0179_incremental_upgrade.sql | 31 ++ .../profiling/project_profiling_query.sql | 2 +- .../profiling/project_profiling_query.sql | 2 +- .../profiling/project_profiling_query.sql | 1 + .../profiling/project_profiling_query.sql | 2 +- .../profiling/project_profiling_query.sql | 2 +- .../profiling/project_profiling_query.sql | 2 +- .../profiling/project_profiling_query.sql | 2 +- .../profiling/project_profiling_query.sql | 2 +- .../gen_funny_cat_tests/gen_Constant.sql | 14 + .../gen_Distinct_Value_Ct.sql | 14 + .../generation/gen_selection_tests.sql | 7 + 
.../template/profiling/pii_flag_update.sql | 18 + testgen/ui/auth.py | 2 +- testgen/ui/components/frontend/css/shared.css | 20 +- .../frontend/js/components/attribute.js | 19 +- .../frontend/js/components/radio_group.js | 10 +- .../js/components/table_group_form.js | 11 + .../components/frontend/js/components/tree.js | 12 +- .../js/data_profiling/column_distribution.js | 48 +- .../column_profiling_results.js | 7 +- .../frontend/js/data_profiling/data_issues.js | 41 +- .../js/data_profiling/data_profiling_utils.js | 2 + .../js/data_profiling/metadata_tags.js | 451 ++++++++++++++++++ .../components/frontend/js/display_utils.js | 8 + .../frontend/js/pages/data_catalog.js | 358 ++++---------- .../js/pages/import_metadata_dialog.js | 21 +- .../js/pages/notification_settings.js | 4 +- testgen/ui/pdf/hygiene_issue_report.py | 16 +- testgen/ui/pdf/test_result_report.py | 16 +- testgen/ui/queries/profiling_queries.py | 3 + testgen/ui/queries/scoring_queries.py | 4 + testgen/ui/queries/test_result_queries.py | 1 + testgen/ui/static/css/shared.css | 20 +- testgen/ui/static/js/components/attribute.js | 19 +- .../ui/static/js/components/radio_group.js | 19 +- .../static/js/components/table_group_form.js | 11 + testgen/ui/static/js/components/tree.js | 12 +- testgen/ui/static/js/display_utils.js | 8 + testgen/ui/views/data_catalog.py | 69 ++- .../ui/views/dialogs/column_history_dialog.py | 6 + .../ui/views/dialogs/data_preview_dialog.py | 6 + .../views/dialogs/import_metadata_dialog.py | 124 ++++- .../views/dialogs/profiling_results_dialog.py | 6 + testgen/ui/views/hygiene_issues.py | 9 +- testgen/ui/views/profiling_results.py | 19 +- testgen/ui/views/score_details.py | 6 +- testgen/ui/views/score_explorer.py | 6 +- testgen/ui/views/test_results.py | 9 +- tests/unit/common/test_pii_masking.py | 191 ++++++++ tests/unit/ui/test_import_metadata.py | 28 +- 62 files changed, 1451 insertions(+), 476 deletions(-) create mode 100644 testgen/common/models/data_column.py create mode 
100644 testgen/common/pii_masking.py create mode 100644 testgen/template/dbupgrade/0178_incremental_upgrade.sql create mode 100644 testgen/template/dbupgrade/0179_incremental_upgrade.sql create mode 100644 testgen/template/profiling/pii_flag_update.sql create mode 100644 testgen/ui/components/frontend/js/data_profiling/metadata_tags.js create mode 100644 tests/unit/common/test_pii_masking.py diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index b69c7063..95c60433 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -167,8 +167,10 @@ def update_profiling_results(self) -> list[tuple[str, dict]]: self._get_query("functional_datatype.sql"), self._get_query("functional_tabletype_stage.sql"), self._get_query("functional_tabletype_update.sql"), - self._get_query("pii_flag.sql"), ] + if self.table_group.profile_flag_pii: + queries.append(self._get_query("pii_flag.sql")) + queries.append(self._get_query("pii_flag_update.sql")) if self.table_group.profile_flag_cdes: queries.append(self._get_query("cde_flagger_query.sql")) return queries diff --git a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py index 1c8e58fc..c5b21059 100644 --- a/testgen/commands/run_profiling.py +++ b/testgen/commands/run_profiling.py @@ -28,6 +28,7 @@ from testgen.common.mixpanel_service import MixpanelService from testgen.common.models import get_current_session, with_database_session from testgen.common.models.connection import Connection +from testgen.common.models.data_column import DataColumnChars from testgen.common.models.profiling_run import ProfilingRun from testgen.common.models.table_group import TableGroup from testgen.common.models.test_suite import TestSuite @@ -85,6 +86,7 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d LOG.info(f"Profiling run: {profiling_run.id}, Table group: {table_group.table_groups_name}, Connection: 
{connection.connection_name}") try: data_chars = run_data_chars_refresh(connection, table_group, profiling_run.profiling_starttime) + data_chars = _exclude_xde_columns(data_chars, table_group.id) distinct_tables = {(column.table_name, column.record_ct) for column in data_chars} profiling_run.set_progress("data_chars", "Completed") @@ -144,6 +146,22 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d """ +def _exclude_xde_columns(data_chars: list[ColumnChars], table_group_id: UUID) -> list[ColumnChars]: + """Filter out columns marked as excluded_data_element in data_column_chars.""" + xde_columns = DataColumnChars.select_where( + DataColumnChars.table_groups_id == table_group_id, + DataColumnChars.excluded_data_element.is_(True), + ) + if not xde_columns: + return data_chars + + excluded = {(col.table_name, col.column_name) for col in xde_columns} + filtered = [col for col in data_chars if (col.table_name, col.column_name) not in excluded] + if len(filtered) < len(data_chars): + LOG.info(f"Excluding {len(data_chars) - len(filtered)} XDE columns from profiling") + return filtered + + def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnChars]) -> None: profiling_run = sql_generator.profiling_run profiling_run.set_progress("col_profiling", "Running") diff --git a/testgen/common/models/data_column.py b/testgen/common/models/data_column.py new file mode 100644 index 00000000..50122266 --- /dev/null +++ b/testgen/common/models/data_column.py @@ -0,0 +1,29 @@ +from uuid import UUID, uuid4 + +from sqlalchemy import Boolean, Column, ForeignKey, String +from sqlalchemy.dialects import postgresql + +from testgen.common.models.entity import Entity + + +class DataColumnChars(Entity): + __tablename__ = "data_column_chars" + + id: UUID = Column("column_id", postgresql.UUID(as_uuid=True), primary_key=True, default=uuid4) + table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("table_groups.id")) + 
schema_name: str = Column(String) + table_name: str = Column(String) + column_name: str = Column(String) + excluded_data_element: bool | None = Column(Boolean, nullable=True) + pii_flag: str | None = Column(String(50), nullable=True) + + # Unmapped columns: table_id, ordinal_position, general_type, column_type, + # db_data_type, functional_data_type, description, critical_data_element, + # data_source, source_system, source_process, business_domain, + # stakeholder_group, transform_level, aggregation_level, data_product, + # add_date, last_mod_date, drop_date, test_ct, last_test_date, + # tests_last_run, tests_7_days_prior, tests_30_days_prior, + # fails_last_run, fails_7_days_prior, fails_30_days_prior, + # warnings_last_run, warnings_7_days_prior, warnings_30_days_prior, + # last_complete_profile_run_id, valid_profile_issue_ct, + # valid_test_issue_ct, dq_score_profiling, dq_score_testing diff --git a/testgen/common/models/table_group.py b/testgen/common/models/table_group.py index 7ca2dd90..39d81552 100644 --- a/testgen/common/models/table_group.py +++ b/testgen/common/models/table_group.py @@ -28,6 +28,8 @@ class TableGroupMinimal(EntityMinimal): profile_use_sampling: bool profiling_delay_days: str monitor_test_suite_id: UUID | None + profile_flag_cdes: bool + profile_flag_pii: bool last_complete_profile_run_id: UUID | None @@ -112,6 +114,7 @@ class TableGroup(Entity): profile_sample_min_count: int = Column(BigInteger, default=100000) profiling_delay_days: str = Column(String, default="0") profile_flag_cdes: bool = Column(Boolean, default=True) + profile_flag_pii: bool = Column(Boolean, default=True) profile_do_pair_rules: bool = Column(YNString, default="N") profile_pair_rule_pct: int = Column(Integer, default=95) include_in_dashboard: bool = Column(Boolean, default=True) diff --git a/testgen/common/pii_masking.py b/testgen/common/pii_masking.py new file mode 100644 index 00000000..cff28f9d --- /dev/null +++ b/testgen/common/pii_masking.py @@ -0,0 +1,68 @@ 
+"""PII masking utilities for redacting sensitive data in the UI.""" +import pandas as pd + +from testgen.ui.services.database_service import fetch_all_from_db + +PII_REDACTED = "[PII Redacted]" + +PROFILING_PII_FIELDS = ( + "top_freq_values", "min_text", "max_text", + "min_value", "min_value_over_0", "max_value", + "min_date", "max_date", +) + + +def get_pii_columns(table_group_id: str, schema: str | None = None, table_name: str | None = None) -> set[str]: + """Look up PII-flagged column names from data_column_chars.""" + + query = f""" + SELECT column_name + FROM data_column_chars + WHERE table_groups_id = :table_group_id + AND pii_flag IS NOT NULL + {"AND schema_name = :schema" if schema else ""} + {"AND table_name = :table_name" if table_name else ""} + """ + params: dict = { + "table_group_id": table_group_id, + "schema": schema, + "table_name": table_name, + } + + results = fetch_all_from_db(query, params) + return {row.column_name for row in results} + + +def mask_dataframe_pii(df: pd.DataFrame, pii_columns: set[str]) -> None: + """In-place mask values in PII columns with PII_REDACTED.""" + if df.empty or not pii_columns: + return + for col in pii_columns: + # Match case-insensitively since column names may differ in case + for df_col in df.columns: + if df_col.lower() == col.lower(): + df[df_col] = PII_REDACTED + + +def mask_profiling_pii(data: pd.DataFrame | dict, pii_columns: set[str]) -> None: + """Mask profiling fields for PII columns. 
Accepts a DataFrame or a single-row dict.""" + if isinstance(data, dict): + if not pii_columns: + return + column_name = data.get("column_name") + if column_name and column_name.lower() not in {c.lower() for c in pii_columns}: + return + for field in PROFILING_PII_FIELDS: + if field in data: + data[field] = PII_REDACTED + return + + if data.empty or not pii_columns: + return + pii_lower = {c.lower() for c in pii_columns} + mask = data["column_name"].str.lower().isin(pii_lower) + for field in PROFILING_PII_FIELDS: + if field in data.columns: + if data[field].dtype != object: + data[field] = data[field].astype(object) + data.loc[mask, field] = PII_REDACTED diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index b5b1eefe..ba0edd3a 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -113,6 +113,7 @@ CREATE TABLE table_groups profile_sample_min_count BIGINT DEFAULT 100000, profiling_delay_days VARCHAR(3) DEFAULT '0', profile_flag_cdes BOOLEAN DEFAULT TRUE, + profile_flag_pii BOOLEAN DEFAULT TRUE, profile_do_pair_rules VARCHAR(3) DEFAULT 'N', profile_pair_rule_pct INTEGER DEFAULT 95, include_in_dashboard BOOLEAN DEFAULT TRUE, @@ -447,6 +448,8 @@ CREATE TABLE data_column_chars ( functional_data_type VARCHAR(50), description VARCHAR(1000), critical_data_element BOOLEAN, + excluded_data_element BOOLEAN, + pii_flag VARCHAR(50), data_source VARCHAR(40), source_system VARCHAR(40), source_process VARCHAR(40), diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml index c922d9d8..3c2783fb 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml +++ 
b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Printing_Chars.yaml @@ -33,7 +33,7 @@ profile_anomaly_types: '\u200f', '\x8207'), '\u202f', '\x8239'), '\u3000', '\x12288'), - '\ufeff', '\x65279') as `{COLUMN_NAME}_content`, + '\ufeff', '\x65279') as `{COLUMN_NAME}`, COUNT(*) as record_ct FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` WHERE TRANSLATE(`{COLUMN_NAME}`, '\u00a0\u2009\u200b\u200c\u200d\u200e\u200f\u202f\u3000\ufeff', 'XXXXXXXXXX') <> `{COLUMN_NAME}` GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT {LIMIT} @@ -54,7 +54,7 @@ profile_anomaly_types: NCHAR(8207), '\x8207'), NCHAR(8239), '\x8239'), NCHAR(12288), '\x12288'), - NCHAR(65279), '\x65279') AS "{COLUMN_NAME}_content", + NCHAR(65279), '\x65279') AS "{COLUMN_NAME}", COUNT(*) AS record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", NCHAR(160) + NCHAR(8201) + NCHAR(8203) + NCHAR(8204) + NCHAR(8205) + NCHAR(8206) + NCHAR(8207) + NCHAR(8239) + NCHAR(12288) + NCHAR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" @@ -76,7 +76,7 @@ profile_anomaly_types: CHR(8207), '\x8207'), CHR(8239), '\x8239'), CHR(12288), '\x12288'), - CHR(65279), '\x65279') as "{COLUMN_NAME}_content", + CHR(65279), '\x65279') as "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} @@ -97,7 +97,7 @@ profile_anomaly_types: CHR(8207), '\x8207'), CHR(8239), '\x8239'), CHR(12288), '\x12288'), - CHR(65279), '\x65279') as "{COLUMN_NAME}_content", + CHR(65279), '\x65279') as "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> 
"{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} @@ -118,7 +118,7 @@ profile_anomaly_types: CHR(8207), '\x8207'), CHR(8239), '\x8239'), CHR(12288), '\x12288'), - CHR(65279), '\x65279') as "{COLUMN_NAME}_content", + CHR(65279), '\x65279') as "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} @@ -139,7 +139,7 @@ profile_anomaly_types: CHR(8207), '\x8207'), CHR(8239), '\x8239'), CHR(12288), '\x12288'), - CHR(65279), '\x65279') as "{COLUMN_NAME}_content", + CHR(65279), '\x65279') as "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", CHR(160) || CHR(8201) || CHR(8203) || CHR(8204) || CHR(8205) || CHR(8206) || CHR(8207) || CHR(8239) || CHR(12288) || CHR(65279), 'XXXXXXXXXX') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} @@ -150,7 +150,7 @@ profile_anomaly_types: sql_flavor: oracle lookup_type: null lookup_query: |- - SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", UNISTR('\00A0'), '\x160'), UNISTR('\2009'), '\x8201'), UNISTR('\200B'), '\x8203'), UNISTR('\200C'), '\x8204'), UNISTR('\200D'), '\x8205'), UNISTR('\200E'), '\x8206'), UNISTR('\200F'), '\x8207'), UNISTR('\202F'), '\x8239'), UNISTR('\3000'), '\x12288'), UNISTR('\FEFF'), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", 'X' || UNISTR('\00A0') || UNISTR('\2009') || UNISTR('\200B') || UNISTR('\200C') || UNISTR('\200D') || UNISTR('\200E') || UNISTR('\200F') || UNISTR('\202F') || UNISTR('\3000') || UNISTR('\FEFF'), 'XXXXXXXXXXX') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER 
BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", UNISTR('\00A0'), '\x160'), UNISTR('\2009'), '\x8201'), UNISTR('\200B'), '\x8203'), UNISTR('\200C'), '\x8204'), UNISTR('\200D'), '\x8205'), UNISTR('\200E'), '\x8206'), UNISTR('\200F'), '\x8207'), UNISTR('\202F'), '\x8239'), UNISTR('\3000'), '\x12288'), UNISTR('\FEFF'), '\x65279') as "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE TRANSLATE("{COLUMN_NAME}", 'X' || UNISTR('\00A0') || UNISTR('\2009') || UNISTR('\200B') || UNISTR('\200C') || UNISTR('\200D') || UNISTR('\200E') || UNISTR('\200F') || UNISTR('\202F') || UNISTR('\3000') || UNISTR('\FEFF'), 'XXXXXXXXXXX') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly - id: '1529' test_id: '1031' @@ -158,5 +158,5 @@ profile_anomaly_types: sql_flavor: sap_hana lookup_type: null lookup_query: |- - SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", NCHAR(160), '\x160'), NCHAR(8201), '\x8201'), NCHAR(8203), '\x8203'), NCHAR(8204), '\x8204'), NCHAR(8205), '\x8205'), NCHAR(8206), '\x8206'), NCHAR(8207), '\x8207'), NCHAR(8239), '\x8239'), NCHAR(12288), '\x12288'), NCHAR(65279), '\x65279') as "{COLUMN_NAME}_content", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", NCHAR(160), ''), NCHAR(8201), ''), NCHAR(8203), ''), NCHAR(8204), ''), NCHAR(8205), ''), NCHAR(8206), ''), NCHAR(8207), ''), NCHAR(8239), ''), NCHAR(12288), ''), NCHAR(65279), '') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} + SELECT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", NCHAR(160), '\x160'), NCHAR(8201), '\x8201'), NCHAR(8203), '\x8203'), 
NCHAR(8204), '\x8204'), NCHAR(8205), '\x8205'), NCHAR(8206), '\x8206'), NCHAR(8207), '\x8207'), NCHAR(8239), '\x8239'), NCHAR(12288), '\x12288'), NCHAR(65279), '\x65279') as "{COLUMN_NAME}", COUNT(*) as record_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" WHERE REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", NCHAR(160), ''), NCHAR(8201), ''), NCHAR(8203), ''), NCHAR(8204), ''), NCHAR(8205), ''), NCHAR(8206), ''), NCHAR(8207), ''), NCHAR(8239), ''), NCHAR(12288), ''), NCHAR(65279), '') <> "{COLUMN_NAME}" GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT {LIMIT} error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml index 7210d2b8..a76ec345 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Standardized_Value_Matches.yaml @@ -31,12 +31,12 @@ profile_anomaly_types: GROUP BY possible_standard_value HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) - SELECT DISTINCT a.`{COLUMN_NAME}`, b.possible_standard_value, COUNT(*) AS count + SELECT DISTINCT a.`{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a JOIN cte b ON UPPER(REGEXP_REPLACE(CAST(a.`{COLUMN_NAME}` AS STRING), r"[ '\.\-\,]", '')) = b.possible_standard_value - GROUP BY a.`{COLUMN_NAME}`, b.possible_standard_value - ORDER BY b.possible_standard_value ASC, count DESC + GROUP BY a.`{COLUMN_NAME}` + ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1289' @@ -45,7 +45,7 @@ profile_anomaly_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM 
`{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, ' '',.-', '')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, COUNT(*) AS count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, ' '',.-', '')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}` ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1131' test_id: '1017' @@ -53,7 +53,7 @@ profile_anomaly_types: sql_flavor: mssql lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT TOP {LIMIT} UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC; + WITH CTE AS ( SELECT DISTINCT TOP {LIMIT} 
UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") as distinct_ct FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(TRANSLATE("{COLUMN_NAME}",' '''',.-',REPLICATE(' ', LEN(' '''',.-'))),' ','')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}" ORDER BY possible_standard_value ASC, count DESC; error_type: Profile Anomaly - id: '1074' test_id: '1017' @@ -61,7 +61,7 @@ profile_anomaly_types: sql_flavor: postgresql lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}" ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile 
Anomaly - id: '1049' test_id: '1017' @@ -69,7 +69,7 @@ profile_anomaly_types: sql_flavor: redshift lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}" ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1449' test_id: '1017' @@ -77,7 +77,7 @@ profile_anomaly_types: sql_flavor: redshift_spectrum lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value 
ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}" ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1188' test_id: '1017' @@ -85,7 +85,7 @@ profile_anomaly_types: sql_flavor: snowflake lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", ' '',.-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", ' '',.-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}" ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT}; error_type: Profile Anomaly - id: '1516' test_id: '1017' @@ -93,7 
+93,7 @@ profile_anomaly_types: sql_flavor: oracle lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", 'X '',.-', 'X')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") AS cnt FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", 'X '',.-', 'X')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", 'X '',.-', 'X')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC FETCH FIRST {LIMIT} ROWS ONLY + WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE("{COLUMN_NAME}", 'X '',.-', 'X')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") AS cnt FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(TRANSLATE("{COLUMN_NAME}", 'X '',.-', 'X')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(TRANSLATE(a."{COLUMN_NAME}", 'X '',.-', 'X')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}" ORDER BY possible_standard_value ASC, count DESC FETCH FIRST {LIMIT} ROWS ONLY error_type: Profile Anomaly - id: '1516' test_id: '1017' @@ -101,5 +101,5 @@ profile_anomaly_types: sql_flavor: sap_hana lookup_type: null lookup_query: |- - WITH CTE AS ( SELECT DISTINCT UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") AS cnt FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", possible_standard_value, COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte 
b WHERE UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(a."{COLUMN_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}", possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT} + WITH CTE AS ( SELECT DISTINCT UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', '')) as possible_standard_value, COUNT(DISTINCT "{COLUMN_NAME}") AS cnt FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" GROUP BY UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("{COLUMN_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', '')) HAVING COUNT(DISTINCT "{COLUMN_NAME}") > 1 ) SELECT DISTINCT a."{COLUMN_NAME}", COUNT(*) AS count FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" a, cte b WHERE UPPER(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(a."{COLUMN_NAME}", ' ', ''), '''', ''), ',', ''), '.', ''), '-', '')) = b.possible_standard_value GROUP BY a."{COLUMN_NAME}" ORDER BY possible_standard_value ASC, count DESC LIMIT {LIMIT} error_type: Profile Anomaly diff --git a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml index 285037b0..e151fa6c 100644 --- a/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Freshness_Trend.yaml @@ -46,7 +46,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + TO_HEX(MD5({CUSTOM_QUERY})) AS fingerprint, DATETIME_DIFF(DATETIME('{RUN_DATE}'), DATETIME(NULLIF('{BASELINE_SUM}', '')), MINUTE) AS interval_minutes FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} @@ -96,7 +96,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + MD5({CUSTOM_QUERY}) AS fingerprint, DATEDIFF(MINUTE, TO_TIMESTAMP(NULLIF('{BASELINE_SUM}', '')), TIMESTAMP '{RUN_DATE}') AS interval_minutes FROM `{SCHEMA_NAME}`.`{TABLE_NAME}` WHERE 
{SUBSET_CONDITION} @@ -146,7 +146,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + LOWER(CONVERT(VARCHAR(40), HASHBYTES('MD5', CAST({CUSTOM_QUERY} AS VARCHAR(MAX))), 2)) AS fingerprint, DATEDIFF(MINUTE, CAST(NULLIF('{BASELINE_SUM}', '') AS DATETIME2), CAST('{RUN_DATE}' AS DATETIME2)) AS interval_minutes FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WITH (NOLOCK) WHERE {SUBSET_CONDITION} @@ -196,7 +196,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + MD5({CUSTOM_QUERY}) AS fingerprint, (EXTRACT(EPOCH FROM ('{RUN_DATE}'::TIMESTAMP - NULLIF('{BASELINE_SUM}', '')::TIMESTAMP)) / 60)::INTEGER AS interval_minutes FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} @@ -246,7 +246,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + MD5({CUSTOM_QUERY}) AS fingerprint, DATEDIFF(MINUTE, NULLIF('{BASELINE_SUM}', '')::TIMESTAMP, '{RUN_DATE}'::TIMESTAMP) AS interval_minutes FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} @@ -296,7 +296,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + MD5({CUSTOM_QUERY}) AS fingerprint, DATEDIFF(MINUTE, NULLIF('{BASELINE_SUM}', '')::TIMESTAMP, '{RUN_DATE}'::TIMESTAMP) AS interval_minutes FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} @@ -346,7 +346,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + MD5({CUSTOM_QUERY}) AS fingerprint, DATEDIFF(MINUTE, NULLIF('{BASELINE_SUM}', '')::TIMESTAMP, '{RUN_DATE}'::TIMESTAMP) AS interval_minutes FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} @@ -396,7 +396,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + LOWER(RAWTOHEX(STANDARD_HASH({CUSTOM_QUERY}, 'MD5'))) AS fingerprint, ROUND((CAST(TO_TIMESTAMP('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS') AS DATE) - CAST(TO_TIMESTAMP('{BASELINE_SUM}', 'YYYY-MM-DD HH24:MI:SS') AS DATE)) * 24 
* 60) AS interval_minutes FROM "{SCHEMA_NAME}"."{TABLE_NAME}" @@ -447,7 +447,7 @@ test_types: template: |- WITH test_data AS ( SELECT - {CUSTOM_QUERY} AS fingerprint, + LOWER(BINTOHEX(HASH_MD5(TO_BINARY({CUSTOM_QUERY})))) AS fingerprint, ROUND(SECONDS_BETWEEN(TO_TIMESTAMP('{BASELINE_SUM}', 'YYYY-MM-DD HH24:MI:SS'), TO_TIMESTAMP('{RUN_DATE}', 'YYYY-MM-DD HH24:MI:SS')) / 60.0) AS interval_minutes FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WHERE {SUBSET_CONDITION} diff --git a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml index 85396aaa..27e89cf0 100644 --- a/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Table_Freshness.yaml @@ -68,7 +68,7 @@ test_types: ELSE 1 END AS result_measure FROM ( - SELECT {CUSTOM_QUERY} AS fingerprint + SELECT TO_HEX(MD5({CUSTOM_QUERY})) AS fingerprint FROM `{SCHEMA_NAME}.{TABLE_NAME}` WHERE {SUBSET_CONDITION} ) test; @@ -100,7 +100,7 @@ test_types: WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 ELSE 1 END AS result_measure - FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM ( SELECT MD5({CUSTOM_QUERY}) as fingerprint FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test; @@ -132,7 +132,7 @@ test_types: WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 ELSE 1 END AS result_measure - FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM ( SELECT LOWER(CONVERT(VARCHAR(40), HASHBYTES('MD5', CAST({CUSTOM_QUERY} AS VARCHAR(MAX))), 2)) as fingerprint FROM "{SCHEMA_NAME}"."{TABLE_NAME}" WITH (NOLOCK) WHERE {SUBSET_CONDITION} ) test; @@ -164,7 +164,7 @@ test_types: WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 ELSE 1 END AS result_measure - FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM ( SELECT MD5({CUSTOM_QUERY}) as fingerprint FROM 
{QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test; @@ -196,7 +196,7 @@ test_types: WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 ELSE 1 END AS result_measure - FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM ( SELECT MD5({CUSTOM_QUERY}) as fingerprint FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test; @@ -228,7 +228,7 @@ test_types: WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 ELSE 1 END AS result_measure - FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM ( SELECT MD5({CUSTOM_QUERY}) as fingerprint FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test; @@ -260,7 +260,7 @@ test_types: WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 ELSE 1 END AS result_measure - FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM ( SELECT MD5({CUSTOM_QUERY}) as fingerprint FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test; @@ -292,7 +292,7 @@ test_types: WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 ELSE 1 END AS result_measure - FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM ( SELECT LOWER(RAWTOHEX(STANDARD_HASH({CUSTOM_QUERY}, 'MD5'))) as fingerprint FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test @@ -324,7 +324,7 @@ test_types: WHEN '{LOWER_TOLERANCE}' = 'NULL' OR fingerprint = '{LOWER_TOLERANCE}' THEN 0 ELSE 1 END AS result_measure - FROM ( SELECT {CUSTOM_QUERY} as fingerprint + FROM ( SELECT LOWER(BINTOHEX(HASH_MD5(TO_BINARY({CUSTOM_QUERY})))) as fingerprint FROM {QUOTE}{SCHEMA_NAME}{QUOTE}.{QUOTE}{TABLE_NAME}{QUOTE} WHERE {SUBSET_CONDITION} ) test diff --git a/testgen/template/dbupgrade/0178_incremental_upgrade.sql b/testgen/template/dbupgrade/0178_incremental_upgrade.sql new file mode 100644 index 00000000..ba31a28f --- /dev/null +++ 
b/testgen/template/dbupgrade/0178_incremental_upgrade.sql @@ -0,0 +1,7 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE table_groups ADD COLUMN IF NOT EXISTS profile_flag_pii BOOLEAN DEFAULT TRUE; + +ALTER TABLE data_column_chars + ADD COLUMN IF NOT EXISTS excluded_data_element BOOLEAN, + ADD COLUMN IF NOT EXISTS pii_flag VARCHAR(50); diff --git a/testgen/template/dbupgrade/0179_incremental_upgrade.sql b/testgen/template/dbupgrade/0179_incremental_upgrade.sql new file mode 100644 index 00000000..2bcf9f14 --- /dev/null +++ b/testgen/template/dbupgrade/0179_incremental_upgrade.sql @@ -0,0 +1,31 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +-- Hash existing fingerprint values for Table_Freshness tests +-- lower_tolerance stores the last computed fingerprint used for comparison +UPDATE test_definitions + SET lower_tolerance = MD5(lower_tolerance) + WHERE test_type = 'Table_Freshness' + AND lower_tolerance IS NOT NULL + AND LENGTH(lower_tolerance) <> 32; + +-- Hash existing fingerprint values for Freshness_Trend monitors +-- baseline_value stores the fingerprint at the last detected table change +UPDATE test_definitions + SET baseline_value = MD5(baseline_value) + WHERE test_type = 'Freshness_Trend' + AND baseline_value IS NOT NULL + AND LENGTH(baseline_value) <> 32; + +-- Hash existing result_signal values for Table_Freshness test results +UPDATE test_results + SET result_signal = MD5(result_signal) + WHERE test_type = 'Table_Freshness' + AND result_signal IS NOT NULL + AND LENGTH(result_signal) <> 32; + +-- Hash existing result_measure values for Freshness_Trend test results +UPDATE test_results + SET result_measure = MD5(result_measure) + WHERE test_type = 'Freshness_Trend' + AND result_measure IS NOT NULL + AND LENGTH(result_measure) <> 32; diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql b/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql index b77044a6..34eb1c24 100644 --- 
a/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql @@ -104,7 +104,7 @@ SELECT THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'ZIP_USA' WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[\w\s\-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'FILE_NAME' - WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([0-9]{4}[- ]){3}[0-9]{4}$') + WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([0-9]{4}[- ]?){3}[0-9]{4}$') THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'CREDIT_CARD' WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$') AND NOT REGEXP_CONTAINS(`{COL_NAME}`, r'\s(and|but|or|yet)\s') diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query.sql b/testgen/template/flavors/databricks/profiling/project_profiling_query.sql index 407901ba..b72090a7 100644 --- a/testgen/template/flavors/databricks/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/databricks/profiling/project_profiling_query.sql @@ -114,7 +114,7 @@ SELECT OR `{COL_NAME}` LIKE '%.pdf' OR `{COL_NAME}` LIKE '%.xlsx') THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'FILE_NAME' - WHEN CAST(SUM( CASE WHEN `{COL_NAME}` RLIKE '[0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9]' + WHEN CAST(SUM( CASE WHEN `{COL_NAME}` RLIKE '^([0-9]{4}[- ]?){3}[0-9]{4}$' THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.8 THEN 'CREDIT_CARD' WHEN CAST(SUM( CASE WHEN ( `{COL_NAME}` LIKE '%,%,%,%' OR `{COL_NAME}` LIKE '%|%|%|%' diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query.sql b/testgen/template/flavors/mssql/profiling/project_profiling_query.sql index 89c5c42b..fbcfb57e 100644 --- 
a/testgen/template/flavors/mssql/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query.sql @@ -124,6 +124,7 @@ SELECT OR "{COL_NAME}" COLLATE SQL_Latin1_General_CP1_CI_AS LIKE '%.xlsx') THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'FILE_NAME' WHEN CAST(SUM(CASE WHEN "{COL_NAME}" LIKE '[0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9][- ][0-9][0-9][0-9][0-9]' + OR "{COL_NAME}" LIKE '[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]' THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'CREDIT_CARD' WHEN CAST(SUM(CASE WHEN ( "{COL_NAME}" LIKE '%,%,%,%' OR "{COL_NAME}" LIKE '%|%|%|%' diff --git a/testgen/template/flavors/oracle/profiling/project_profiling_query.sql b/testgen/template/flavors/oracle/profiling/project_profiling_query.sql index a7240103..cc93718c 100644 --- a/testgen/template/flavors/oracle/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/oracle/profiling/project_profiling_query.sql @@ -164,7 +164,7 @@ FROM ( THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'ZIP_USA' WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^[[:alnum:]_[:space:]-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$') THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'FILE_NAME' - WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^([0-9]{4}[- ]){3}[0-9]{4}$') + WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^([0-9]{4}[- ]?){3}[0-9]{4}$') THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'CREDIT_CARD' WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^([^,|' || CHR(9) || ']{1,20}[,|' || CHR(9) || ']){2,}[^,|' || CHR(9) || ']{0,20}([,|' || CHR(9) || ']?[^,|' || CHR(9) || ']{0,20})*$') AND NOT REGEXP_LIKE("{COL_NAME}", '[[:space:]](and|but|or|yet)[[:space:]]') diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query.sql 
b/testgen/template/flavors/postgresql/profiling/project_profiling_query.sql index 8ad3a999..b2cb78bf 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query.sql @@ -95,7 +95,7 @@ SELECT THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^[\w\s\-]+(? 0.9 THEN 'FILE_NAME' - WHEN SUM( CASE WHEN "{COL_NAME}" SIMILAR TO '^([0-9]{4}[- ]){3}[0-9]{4}$' + WHEN SUM( CASE WHEN "{COL_NAME}" SIMILAR TO '^([0-9]{4}[- ]?){3}[0-9]{4}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD' WHEN SUM( CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COL_NAME}" !~ '\s(and|but|or|yet)\s' diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query.sql b/testgen/template/flavors/redshift/profiling/project_profiling_query.sql index 4c48dd5d..d054e40e 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query.sql @@ -75,7 +75,7 @@ SELECT THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|(c|t|p)sv|dat|doc|docx|json|pdf|xlsx|xml)$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME' - WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]?){3}[0-9]{4}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD' WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s' diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql index 4c48dd5d..d054e40e 100644 --- 
a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql @@ -75,7 +75,7 @@ SELECT THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[\\w\\s\-]+\\.(txt|(c|t|p)sv|dat|doc|docx|json|pdf|xlsx|xml)$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'FILE_NAME' - WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]){3}[0-9]{4}$' + WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([0-9]{4}[- ]?){3}[0-9]{4}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'CREDIT_CARD' WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$' AND "{COL_NAME}" !~ '\\s(and|but|or|yet)\\s' diff --git a/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql b/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql index 8bfac838..82e5ce85 100644 --- a/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql @@ -92,7 +92,7 @@ SELECT THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'ZIP_USA' WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^[[:alnum:]_[:space:]-]+\.(txt|csv|tsv|dat|doc|pdf|xlsx)$' THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'FILE_NAME' - WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^([0-9]{4}[- ]){3}[0-9]{4}$' + WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^([0-9]{4}[- ]?){3}[0-9]{4}$' THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'CREDIT_CARD' WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^([^,|' || CHAR(9) || ']{1,20}[,|' || CHAR(9) || ']){2,}[^,|' || CHAR(9) || ']{0,20}([,|' || CHAR(9) || ']?[^,|' || CHAR(9) || ']{0,20})*$' AND NOT "{COL_NAME}" LIKE_REGEXPR '[[:space:]](and|but|or|yet)[[:space:]]' diff --git a/testgen/template/gen_funny_cat_tests/gen_Constant.sql 
b/testgen/template/gen_funny_cat_tests/gen_Constant.sql index 4a0af8d6..4c66729e 100644 --- a/testgen/template/gen_funny_cat_tests/gen_Constant.sql +++ b/testgen/template/gen_funny_cat_tests/gen_Constant.sql @@ -10,7 +10,14 @@ latest_results AS ( SELECT p.* FROM profile_results p INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + LEFT JOIN data_column_chars dcc ON ( + p.table_groups_id = dcc.table_groups_id + AND p.schema_name = dcc.schema_name + AND p.table_name = dcc.table_name + AND p.column_name = dcc.column_name + ) WHERE p.table_groups_id = :TABLE_GROUPS_ID ::UUID + AND dcc.excluded_data_element IS NOT TRUE ), all_runs AS ( SELECT DISTINCT table_groups_id, run_date, @@ -42,7 +49,14 @@ selected_columns AS ( rr.table_groups_id = p.table_groups_id AND rr.run_date = p.run_date ) + LEFT JOIN data_column_chars dcc ON ( + p.table_groups_id = dcc.table_groups_id + AND p.schema_name = dcc.schema_name + AND p.table_name = dcc.table_name + AND p.column_name = dcc.column_name + ) WHERE p.table_groups_id = :TABLE_GROUPS_ID ::UUID + AND dcc.excluded_data_element IS NOT TRUE -- No dates as constants AND NOT (p.general_type = 'D' AND rr.run_rank = 1) GROUP BY p.schema_name, p.table_name, p.column_name diff --git a/testgen/template/gen_funny_cat_tests/gen_Distinct_Value_Ct.sql b/testgen/template/gen_funny_cat_tests/gen_Distinct_Value_Ct.sql index c06b458a..a7c186f2 100644 --- a/testgen/template/gen_funny_cat_tests/gen_Distinct_Value_Ct.sql +++ b/testgen/template/gen_funny_cat_tests/gen_Distinct_Value_Ct.sql @@ -13,7 +13,14 @@ latest_results AS ( SELECT p.* FROM profile_results p INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + LEFT JOIN data_column_chars dcc ON ( + p.table_groups_id = dcc.table_groups_id + AND p.schema_name = dcc.schema_name + AND p.table_name = dcc.table_name + AND p.column_name = dcc.column_name + ) WHERE p.table_groups_id = :TABLE_GROUPS_ID ::UUID + AND dcc.excluded_data_element IS NOT TRUE ), all_runs AS ( SELECT DISTINCT 
table_groups_id, run_date, @@ -45,7 +52,14 @@ selected_columns AS ( rr.table_groups_id = p.table_groups_id AND rr.run_date = p.run_date ) + LEFT JOIN data_column_chars dcc ON ( + p.table_groups_id = dcc.table_groups_id + AND p.schema_name = dcc.schema_name + AND p.table_name = dcc.table_name + AND p.column_name = dcc.column_name + ) WHERE p.table_groups_id = :TABLE_GROUPS_ID ::UUID + AND dcc.excluded_data_element IS NOT TRUE GROUP BY p.schema_name, p.table_name, p.column_name HAVING SUM(CASE WHEN p.distinct_value_ct = 1 THEN 0 ELSE 1 END) = 0 AND ( diff --git a/testgen/template/generation/gen_selection_tests.sql b/testgen/template/generation/gen_selection_tests.sql index c6b846dd..36a8444f 100644 --- a/testgen/template/generation/gen_selection_tests.sql +++ b/testgen/template/generation/gen_selection_tests.sql @@ -10,7 +10,14 @@ selected_columns AS ( SELECT p.* FROM profile_results p INNER JOIN latest_run lr ON p.run_date = lr.last_run_date + LEFT JOIN data_column_chars dcc ON ( + p.table_groups_id = dcc.table_groups_id + AND p.schema_name = dcc.schema_name + AND p.table_name = dcc.table_name + AND p.column_name = dcc.column_name + ) WHERE p.table_groups_id = :TABLE_GROUPS_ID ::UUID + AND dcc.excluded_data_element IS NOT TRUE AND {SELECTION_CRITERIA} ) INSERT INTO test_definitions ( diff --git a/testgen/template/profiling/pii_flag_update.sql b/testgen/template/profiling/pii_flag_update.sql new file mode 100644 index 00000000..644122f2 --- /dev/null +++ b/testgen/template/profiling/pii_flag_update.sql @@ -0,0 +1,18 @@ +-- Propagate pii_flag from profile_results to data_column_chars +-- Clears existing flags first, then sets flags from the latest profiling run +UPDATE data_column_chars + SET pii_flag = NULL + WHERE table_groups_id = :TABLE_GROUPS_ID; + +WITH pii_selects + AS ( SELECT table_groups_id, schema_name, table_name, column_name, pii_flag + FROM profile_results + WHERE profile_run_id = :PROFILE_RUN_ID + AND pii_flag IS NOT NULL ) +UPDATE data_column_chars + 
SET pii_flag = pii_selects.pii_flag + FROM pii_selects + WHERE data_column_chars.table_groups_id = pii_selects.table_groups_id + AND data_column_chars.schema_name = pii_selects.schema_name + AND data_column_chars.table_name = pii_selects.table_name + AND data_column_chars.column_name = pii_selects.column_name; diff --git a/testgen/ui/auth.py b/testgen/ui/auth.py index 1ae89c2d..9518f8b7 100644 --- a/testgen/ui/auth.py +++ b/testgen/ui/auth.py @@ -13,7 +13,7 @@ LOG = logging.getLogger("testgen") -Permission = Literal["catalog", "view", "disposition", "edit", "administer", "global_admin"] +Permission = Literal["catalog", "view", "disposition", "view_pii", "edit", "administer", "global_admin"] class Authentication: diff --git a/testgen/ui/components/frontend/css/shared.css b/testgen/ui/components/frontend/css/shared.css index 8390aafe..f3550307 100644 --- a/testgen/ui/components/frontend/css/shared.css +++ b/testgen/ui/components/frontend/css/shared.css @@ -226,10 +226,26 @@ body { color: var(--error-color); } +.text-warning { + color: var(--orange); +} + .text-green { color: var(--primary-color); } +.text-purple { + color: var(--purple); +} + +.text-orange { + color: var(--orange); +} + +.text-brown { + color: var(--brown); +} + .text-capitalize { text-transform: capitalize; } @@ -744,7 +760,3 @@ input::-ms-clear { .notifications--empty.tg-empty-state { margin-top: 0; } - -.warning-text { - color: var(--orange); -} diff --git a/testgen/ui/components/frontend/js/components/attribute.js b/testgen/ui/components/frontend/js/components/attribute.js index 61240f7f..a7bb60eb 100644 --- a/testgen/ui/components/frontend/js/components/attribute.js +++ b/testgen/ui/components/frontend/js/components/attribute.js @@ -8,11 +8,12 @@ * @property {string?} class */ import { getValue, loadStylesheet } from '../utils.js'; +import { PII_REDACTED } from '../display_utils.js'; import { Icon } from './icon.js'; import { withTooltip } from './tooltip.js'; import van from '../van.min.js'; 
-const { div } = van.tags; +const { div, code } = van.tags; const Attribute = (/** @type Properties */ props) => { loadStylesheet('attribute', stylesheet); @@ -33,6 +34,12 @@ const Attribute = (/** @type Properties */ props) => { { class: 'attribute-value' }, () => { const value = getValue(props.value); + if (value === PII_REDACTED) { + return withTooltip( + code({ class: 'attribute-pii-redacted' }, 'PII Redacted'), + { text: 'You do not have permission to view PII data', position: 'top-right' }, + ); + } return (value || value === 0) ? value : '--'; }, ), @@ -44,6 +51,16 @@ stylesheet.replace(` .attribute-value { word-wrap: break-word; } + +.attribute-pii-redacted { + display: inline-block; + font-size: 12px; + padding: 2px 6px; + border-radius: 4px; + background: color-mix(in srgb, var(--disabled-text-color) 15%, transparent); + color: var(--disabled-text-color); + overflow: visible; +} `); export { Attribute }; diff --git a/testgen/ui/components/frontend/js/components/radio_group.js b/testgen/ui/components/frontend/js/components/radio_group.js index 9ddaba78..97aef2df 100644 --- a/testgen/ui/components/frontend/js/components/radio_group.js +++ b/testgen/ui/components/frontend/js/components/radio_group.js @@ -14,6 +14,7 @@ * @property {function(string | number | boolean | null)?} onChange * @property {number?} width * @property {('default' | 'inline' | 'vertical')?} layout + * @property {boolean?} disabled */ import van from '../van.min.js'; import { getRandomId, getValue, loadStylesheet } from '../utils.js'; @@ -27,9 +28,10 @@ const RadioGroup = (/** @type Properties */ props) => { const groupName = getRandomId(); const layout = getValue(props.layout) ?? 'default'; + const disabled = getValue(props.disabled) ?? false; return div( - { class: () => `tg-radio-group--wrapper ${layout}`, style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` }, + { class: () => `tg-radio-group--wrapper ${layout}${disabled ? 
' disabled' : ''}`, style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` }, div( { class: 'text-caption tg-radio-group--label flex-row fx-gap-1' }, props.label, @@ -49,6 +51,7 @@ const RadioGroup = (/** @type Properties */ props) => { name: groupName, value: option.value, checked: () => option.value === getValue(props.value), + disabled, onchange: van.derive(() => { const onChange = props.onChange?.val ?? props.onChange; return onChange ? () => onChange(option.value) : null; @@ -156,6 +159,11 @@ stylesheet.replace(` border-radius: 5px; } +.tg-radio-group--wrapper.disabled { + opacity: 0.5; + pointer-events: none; +} + .tg-radio-group--help { white-space: pre-wrap; line-height: 16px; diff --git a/testgen/ui/components/frontend/js/components/table_group_form.js b/testgen/ui/components/frontend/js/components/table_group_form.js index 6b072255..609f6aa0 100644 --- a/testgen/ui/components/frontend/js/components/table_group_form.js +++ b/testgen/ui/components/frontend/js/components/table_group_form.js @@ -14,6 +14,7 @@ * @property {string?} profile_sk_column_mask * @property {number?} profiling_delay_days * @property {boolean?} profile_flag_cdes + * @property {boolean?} profile_flag_pii * @property {boolean?} include_in_dashboard * @property {boolean?} add_scorecard_definition * @property {boolean?} profile_use_sampling @@ -81,6 +82,7 @@ const TableGroupForm = (props) => { const profileSkColumnMask = van.state(tableGroup.profile_sk_column_mask ?? '%_sk'); const profilingDelayDays = van.state(tableGroup.profiling_delay_days ?? 0); const profileFlagCdes = van.state(tableGroup.profile_flag_cdes ?? true); + const profileFlagPii = van.state(tableGroup.profile_flag_pii ?? true); const includeInDashboard = van.state(tableGroup.include_in_dashboard ?? true); const addScorecardDefinition = van.state(tableGroup.add_scorecard_definition ?? true); const profileUseSampling = van.state(tableGroup.profile_use_sampling ?? 
false); @@ -120,6 +122,7 @@ const TableGroupForm = (props) => { profile_sk_column_mask: profileSkColumnMask.val, profiling_delay_days: profilingDelayDays.val, profile_flag_cdes: profileFlagCdes.val, + profile_flag_pii: profileFlagPii.val, include_in_dashboard: includeInDashboard.val, add_scorecard_definition: addScorecardDefinition.val, profile_use_sampling: profileUseSampling.val, @@ -186,6 +189,7 @@ const TableGroupForm = (props) => { { editMode: !!tableGroup.id, setValidity: setFieldValidity }, profilingDelayDays, profileFlagCdes, + profileFlagPii, includeInDashboard, addScorecardDefinition, ), @@ -325,6 +329,7 @@ const SettingsForm = ( options, profilingDelayDays, profileFlagCdes, + profileFlagPii, includeInDashboard, addScorecardDefinition, ) => { @@ -339,6 +344,12 @@ const SettingsForm = ( checked: profileFlagCdes, onChange: (value) => profileFlagCdes.val = value, }), + Checkbox({ + name: 'profile_flag_pii', + label: 'Detect PII during profiling', + checked: profileFlagPii, + onChange: (value) => profileFlagPii.val = value, + }), Checkbox({ name: 'include_in_dashboard', label: 'Include table group in Project Dashboard', diff --git a/testgen/ui/components/frontend/js/components/tree.js b/testgen/ui/components/frontend/js/components/tree.js index 82acc371..fbf77c9c 100644 --- a/testgen/ui/components/frontend/js/components/tree.js +++ b/testgen/ui/components/frontend/js/components/tree.js @@ -6,8 +6,9 @@ * @property {string?} classes * @property {string?} icon * @property {number?} iconSize - * @property {'red'?} iconColor + * @property {string?} iconClass * @property {string?} iconTooltip + * @property {Element?} prefix * @property {TreeNode[]?} children * @property {number?} level * @property {boolean?} expanded @@ -91,7 +92,7 @@ const Tree = (/** @type Properties */ props, /** @type any? 
*/ searchOptionsCont }, Toolbar(treeNodes, multiSelect, props, searchOptionsContent, filtersContent), div( - { class: 'tg-tree' }, + { class: () => `tg-tree ${multiSelect.val ? 'multi-select' : ''}` }, () => div( { class: 'tg-tree--nodes', @@ -312,9 +313,10 @@ const TreeNode = ( span({ class: 'mr-1' }), ] : null, + !multiSelect && node.prefix ? node.prefix : null, () => { if (node.icon) { - const icon = Icon({ size: node.iconSize, classes: `tg-tree--row-icon ${node.iconColor}` }, node.icon); + const icon = Icon({ size: node.iconSize, classes: `tg-tree--row-icon ${node.iconClass}` }, node.icon); return node.iconTooltip ? withTooltip(icon, { text: node.iconTooltip, position: 'right' }) : icon; } return null; @@ -519,10 +521,6 @@ stylesheet.replace(` color: #B0BEC5; text-align: center; } - -.tg-tree--row-icon.red { - color: var(--red); -} `); export { Tree }; diff --git a/testgen/ui/components/frontend/js/data_profiling/column_distribution.js b/testgen/ui/components/frontend/js/data_profiling/column_distribution.js index 85689099..49c63832 100644 --- a/testgen/ui/components/frontend/js/data_profiling/column_distribution.js +++ b/testgen/ui/components/frontend/js/data_profiling/column_distribution.js @@ -17,7 +17,7 @@ import { PercentBar } from '../components/percent_bar.js'; import { FrequencyBars } from '../components/frequency_bars.js'; import { BoxPlot } from '../components/box_plot.js'; import { loadStylesheet, emitEvent, friendlyPercent, getValue } from '../utils.js'; -import { formatNumber, formatTimestamp } from '../display_utils.js'; +import { formatNumber, formatTimestamp, PII_REDACTED } from '../display_utils.js'; const { div, span } = van.tags; const columnTypeFunctionMap = { @@ -150,15 +150,17 @@ function AlphaColumn(/** @type Column */ item) { ), item.top_freq_values || item.top_patterns ? div( { class: 'flex-row fx-flex-wrap fx-align-flex-start fx-gap-5 tg-profile--plot-block' }, - item.top_freq_values ? 
FrequencyBars({ - title: 'Frequent Values', - total: item.record_ct, - nullCount: item.null_value_ct, - items: item.top_freq_values.substring(2).split('\n| ').map(parts => { - const [value, count] = parts.split(' | '); - return { value, count: Number(count) }; - }), - }) : null, + item.top_freq_values === PII_REDACTED + ? Attribute({ label: 'Frequent Values', value: PII_REDACTED, width: attributeWidth }) + : item.top_freq_values ? FrequencyBars({ + title: 'Frequent Values', + total: item.record_ct, + nullCount: item.null_value_ct, + items: item.top_freq_values.substring(2).split('\n| ').map(parts => { + const [value, count] = parts.split(' | '); + return { value, count: Number(count) }; + }), + }) : null, item.top_patterns ? FrequencyBars({ title: 'Frequent Patterns', total: item.record_ct, @@ -292,19 +294,19 @@ function NumericColumn(/** @type Column */ item) { Attribute({ label: 'Median Value', value: formatNumber(item.percentile_50), width: attributeWidth }), Attribute({ label: '75th Percentile', value: formatNumber(item.percentile_75), width: attributeWidth }), ), - div( - { class: 'flex-row fx-justify-center tg-profile--plot-block' }, - BoxPlot({ - minimum: item.min_value, - maximum: item.max_value, - median: item.percentile_50, - lowerQuartile: item.percentile_25, - upperQuartile: item.percentile_75, - average: item.avg_value, - standardDeviation: item.stdev_value, - width: boxPlotWidth, - }), - ), + item.min_value === PII_REDACTED || item.max_value === PII_REDACTED ? 
null : div( + { class: 'flex-row fx-justify-center tg-profile--plot-block' }, + BoxPlot({ + minimum: item.min_value, + maximum: item.max_value, + median: item.percentile_50, + lowerQuartile: item.percentile_25, + upperQuartile: item.percentile_75, + average: item.avg_value, + standardDeviation: item.stdev_value, + width: boxPlotWidth, + }), + ), ); } diff --git a/testgen/ui/components/frontend/js/data_profiling/column_profiling_results.js b/testgen/ui/components/frontend/js/data_profiling/column_profiling_results.js index 98f4a6e1..f08dbf7f 100644 --- a/testgen/ui/components/frontend/js/data_profiling/column_profiling_results.js +++ b/testgen/ui/components/frontend/js/data_profiling/column_profiling_results.js @@ -12,7 +12,7 @@ import { getValue, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange, loa import { ColumnDistributionCard } from './column_distribution.js'; import { DataCharacteristicsCard } from './data_characteristics.js'; import { LatestProfilingTime } from './data_profiling_utils.js'; -import { HygieneIssuesCard, PotentialPIICard } from './data_issues.js'; +import { HygieneIssuesCard } from './data_issues.js'; const { div, h2, span } = van.tags; @@ -51,10 +51,7 @@ const ColumnProfilingResults = (/** @type Properties */ props) => { ), DataCharacteristicsCard({ border: true }, column.val), ColumnDistributionCard({ border: true, dataPreview: !!props.data_preview?.val }, column.val), - column.val.hygiene_issues ? [ - PotentialPIICard({ border: true }, column.val), - HygieneIssuesCard({ border: true }, column.val), - ] : null, + column.val.hygiene_issues ? 
HygieneIssuesCard({ border: true }, column.val) : null, ), ); } diff --git a/testgen/ui/components/frontend/js/data_profiling/data_issues.js b/testgen/ui/components/frontend/js/data_profiling/data_issues.js index ea8a86ef..1bd38e7a 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_issues.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_issues.js @@ -22,11 +22,6 @@ import { formatTimestamp } from '../display_utils.js'; const { div, span, i } = van.tags; -const RISK_COLORS = { - High: 'red', - Moderate: 'orange', -}; - const LIKELIHOOD_COLORS = { Definite: 'red', Likely: 'orange', @@ -40,40 +35,6 @@ const STATUS_COLORS = { Log: 'blue', }; -const PotentialPIICard = (/** @type Properties */ props, /** @type Table | Column */ item) => { - const title = `Potential PII ${item.is_latest_profile ? '*' : ''}`; - const attributes = [ - { - key: 'detail', width: 150, label: 'Type', - value_function: (issue) => (issue.detail || '').split('Type: ')[1], - }, - { - key: 'pii_risk', width: 100, label: 'Risk', classes: 'text-secondary', - value_function: (issue) => div( - { class: 'flex-row' }, - span({ class: 'dot mr-2', style: `color: var(--${RISK_COLORS[issue.pii_risk]});` }), - issue.pii_risk, - ), - }, - ]; - if (item.type === 'table') { - attributes.unshift( - { key: 'column_name', width: 150, label: 'Column' }, - ); - } - - const potentialPII = item.hygiene_issues.filter(({ issue_likelihood }) => issue_likelihood === 'Potential PII'); - const linkProps = props.noLinks ? null : { - href: 'profiling-runs:hygiene', - params: { run_id: item.profile_run_id, issue_class: 'Potential PII', project_code: item.project_code }, - }; - const noneContent = item.profile_run_id && !item.profiling_error - ? 
'No potential PII detected' - : span({ class: 'text-secondary' }, `No profiling results for ${item.type}`); - - return IssuesCard(props, title, potentialPII, attributes, linkProps, noneContent); -}; - const HygieneIssuesCard = (/** @type Properties */ props, /** @type Table | Column */ item) => { const title = `Hygiene Issues ${item.is_latest_profile ? '*' : ''}`; const attributes = [ @@ -250,4 +211,4 @@ const IssuesCard = ( }); } -export { PotentialPIICard, HygieneIssuesCard, TestIssuesCard }; +export { HygieneIssuesCard, TestIssuesCard }; diff --git a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js index 829a8a24..71f2ac5e 100644 --- a/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js +++ b/testgen/ui/components/frontend/js/data_profiling/data_profiling_utils.js @@ -45,6 +45,8 @@ * * Column Tags * @property {string?} description * @property {boolean?} critical_data_element + * @property {boolean?} excluded_data_element + * @property {boolean?} pii_flag * @property {string?} data_source * @property {string?} source_system * @property {string?} source_process diff --git a/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js b/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js new file mode 100644 index 00000000..5687479e --- /dev/null +++ b/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js @@ -0,0 +1,451 @@ +/** + * @import { Column, Table } from './data_profiling_utils.js'; + * + * @typedef TagProperties + * @type {object} + * @property {Object.} tagOptions + * @property {boolean} editable + * @property {boolean} piiEditable + * @property {AutoflagSettings} autoflagSettings + * @property {(() => void)?} onCancel + */ +import van from '../van.min.js'; +import { EditableCard } from '../components/editable_card.js'; +import { Attribute } from '../components/attribute.js'; +import { Input } from 
'../components/input.js'; +import { Icon } from '../components/icon.js'; +import { withTooltip } from '../components/tooltip.js'; +import { emitEvent } from '../utils.js'; +import { RadioGroup } from '../components/radio_group.js'; +import { Checkbox } from '../components/checkbox.js'; +import { capitalize } from '../display_utils.js'; +import { Card } from '../components/card.js'; +import { Dialog } from '../components/dialog.js'; +import { Button } from '../components/button.js'; +import { Alert } from '../components/alert.js'; + +const { div, span } = van.tags; + +const attributeWidth = 300; +const descriptionWidth = 932; +const multiEditWidth = 400; + +const booleanOptions = [ + { label: 'Yes', value: true }, + { label: 'No', value: false }, +]; + +const piiOptions = [ + { label: 'Yes', value: 'MANUAL' }, + { label: 'No', value: null }, +]; + +const pii_risk_map = { + 'A': 'High', + 'B': 'Moderate', + 'C': 'Low', +}; +const pii_type_map = { + 'ID': 'ID', + 'NAME': 'Name', + 'DEMO': 'Demographic', + 'CONTACT': 'Contact', +}; + +const TAG_KEYS = [ + 'data_source', + 'source_system', + 'source_process', + 'business_domain', + 'stakeholder_group', + 'transform_level', + 'aggregation_level', + 'data_product', +]; +const TAG_HELP = { + data_source: 'Original source of the dataset', + source_system: 'Enterprise system source for the dataset', + source_process: 'Process, program, or data flow that produced the dataset', + business_domain: 'Business division responsible for the dataset, e.g., Finance, Sales, Manufacturing', + stakeholder_group: 'Data owners or stakeholders responsible for the dataset', + transform_level: 'Data warehouse processing stage, e.g., Raw, Conformed, Processed, Reporting, or Medallion level (bronze, silver, gold)', + aggregation_level: 'Data granularity of the dataset, e.g. 
atomic, historical, snapshot, aggregated, time-rollup, rolling, summary', + data_product: 'Data domain that comprises the dataset', +}; + +/** + * @param {TagProperties} props + * @param {Table | Column} item + * @returns + */ +const MetadataTagsCard = (props, item) => { + const title = `${item.type} Tags `; + const attributes = [ + 'critical_data_element', + ...(item.type === 'column' ? ['excluded_data_element', 'pii_flag'] : []), + 'description', + ...TAG_KEYS, + ].map(key => { + let value = item[key]; + if (['excluded_data_element', 'pii_flag'].includes(key) || (item.type === 'table' && key === 'critical_data_element')) { + value = value ?? false; + } + return { + key, + help: TAG_HELP[key], + label: key === 'pii_flag' ? 'PII Data' : capitalize(key.replaceAll('_', ' ')), + state: van.state(value), + inheritTableGroup: item[`table_group_${key}`] ?? null, // Table group values inherited by table or column + inheritTable: item[`table_${key}`] ?? null, // Table values inherited by column + }; + }); + + const content = div( + { class: 'flex-row fx-flex-wrap fx-gap-4' }, + attributes.map(({ key, label, help, state, inheritTable, inheritTableGroup }) => { + let value = state.rawVal ?? inheritTable ?? inheritTableGroup; + + if (key === 'critical_data_element') { + return CdeDisplay(value, item.type === 'column', state.rawVal === null); + } + if (key === 'excluded_data_element') { + return XdeDisplay(value); + } + if (key === 'pii_flag') { + return PiiDisplay(value); + } + + const inheritedFrom = state.rawVal !== null ? null + : inheritTable !== null ? 'table' + : inheritTableGroup !== null ? 'table group' + : null; + + if (inheritedFrom && value) { + value = span( + { class: 'flex-row fx-gap-1' }, + InheritedIcon(inheritedFrom), + value, + ); + } + return Attribute({ label, help, value, width: key === 'description' ? 
descriptionWidth : attributeWidth }); + }), + ); + + if (!props.editable) { + return Card({ title, content }); + } + + // Define as function so the block is re-rendered with reset values when re-editing after a cancel + const editingContent = () => div( + { class: 'flex-row fx-flex-wrap fx-gap-4' }, + attributes.map(({ key, label, help, state, inheritTable, inheritTableGroup }) => { + if (key === 'critical_data_element') { + return RadioGroup({ + label, + options: item.type === 'column' ? [...booleanOptions, { label: 'Inherit', value: null }] : booleanOptions, + width: attributeWidth, + value: state.rawVal, + onChange: (value) => state.val = value, + }); + } + if (key === 'excluded_data_element') { + return RadioGroup({ + label, + options: booleanOptions, + width: attributeWidth, + value: state.rawVal, + onChange: (value) => state.val = value, + }); + } + if (key === 'pii_flag') { + return RadioGroup({ + label, + options: piiOptions, + width: attributeWidth, + value: state.rawVal ? 'MANUAL' : null, + onChange: (value) => state.val = value, + disabled: !props.piiEditable, + }); + } + return Input({ + label, help, + width: key === 'description' ? descriptionWidth : attributeWidth, + height: 32, + value: state.rawVal, + placeholder: (inheritTable || inheritTableGroup) ? `Inherited: ${inheritTable ?? 
inheritTableGroup}` : null, + autocompleteOptions: props.tagOptions?.[key], + onChange: (value) => state.val = value || null, + }); + }), + ); + + const warningDialogOpen = van.state(false); + const pendingSaveAction = van.state(null); + const warnCde = van.state(false); + const warnPii = van.state(false); + + return div( + EditableCard({ + title: `${item.type} Tags `, + content, editingContent, + onSave: () => { + const items = [{ type: item.type, id: item.id }]; + const tags = attributes.reduce((object, { key, state }) => { + object[key] = state.rawVal; + return object; + }, {}); + + warnCde.val = props.autoflagSettings.profile_flag_cdes && tags.critical_data_element !== item.critical_data_element; + warnPii.val = props.autoflagSettings.profile_flag_pii && tags.pii_flag !== item.pii_flag; + + if (warnCde.val || warnPii.val) { + const disableFlags = []; + if (warnCde.val) { + disableFlags.push('profile_flag_cdes'); + } + if (warnPii.val) { + disableFlags.push('profile_flag_pii'); + } + pendingSaveAction.val = () => emitEvent('TagsChanged', { payload: { items, tags, disable_flags: disableFlags } }); + warningDialogOpen.val = true; + } else { + emitEvent('TagsChanged', { payload: { items, tags } }) + } + }, + // Reset states to original values on cancel + onCancel: () => attributes.forEach(({ key, state }) => state.val = item[key]), + hasChanges: () => attributes.some(({ key, state }) => state.val !== item[key]), + }), + WarningDialog(warningDialogOpen, pendingSaveAction, warnCde, warnPii), + ); +}; + +const InheritedIcon = (/** @type string */ inheritedFrom) => withTooltip( + Icon({ size: 18, classes: 'text-disabled' }, 'layers'), + { text: `Inherited from ${inheritedFrom} tags`, position: 'top-right'}, +); + +/** + * @param {boolean|null} value + * @param {boolean} isColumn + * @param {boolean} isInherited + * @returns + */ +const CdeDisplay = (value, isColumn, isInherited) => { + return span( + { class: 'flex-row fx-gap-1', style: `width: ${attributeWidth}px` }, 
+ Icon( + { size: value ? 24 : 20, classes: value ? 'text-purple' : 'text-disabled' }, + value ? 'star' : 'cancel', + ), + span( + { class: value ? '' : 'text-secondary' }, + isColumn + ? (value ? 'Critical data element' : 'Not a critical data element') + : (value ? 'All critical data elements' : 'Not all critical data elements'), + ), + (isColumn && isInherited) ? InheritedIcon('table') : null, + ); +} + +const XdeDisplay = (/** @type boolean */ value) => { + return span( + { class: 'flex-row fx-gap-1', style: `width: ${attributeWidth}px` }, + Icon( + { size: 20, classes: value ? 'text-brown' : 'text-disabled' }, + value ? 'visibility_off' : 'visibility', + ), + span( + { class: value ? '' : 'text-secondary' }, + value ? 'Excluded data element' : 'Not an excluded data element', + ), + ); +} + +const PiiDisplay = (/** @type string|null */ value) => { + if (value) { + let caption = null; + if (value !== 'MANUAL') { + const [ risk, type, detail ] = value.split('/'); // e.g., A/ID/Passport, B/DEMO/Financial + const typeLabel = pii_type_map[type]; + caption = `${pii_risk_map[risk] ?? 'Moderate'} Risk${typeLabel ? ' - ' + typeLabel : ''}${detail && detail !== typeLabel ? ' / ' + detail : ''}`; + } + return span( + { class: 'flex-row fx-gap-1', style: `width: ${attributeWidth}px` }, + Icon({ size: 24, classes: 'text-orange' }, 'shield_person'), + div( + { class: 'flex-column fx-gap-1' }, + span('PII data'), + caption ? 
span({ class: 'text-caption' }, caption) : null, + ), + ); + } + return span( + { class: 'flex-row fx-gap-1', style: `width: ${attributeWidth}px` }, + Icon({ classes: 'text-disabled' }, 'remove_moderator'), + span({ class: 'text-secondary' }, 'Not PII data'), + ); +}; + +/** + * @param {TagProperties} props + * @param {Object} selectedItems + * @returns + */ +const MetadataTagsMultiEdit = (props, selectedItems) => { + const columnCount = van.derive(() => selectedItems.val?.reduce((count, { children }) => count + children.length, 0)); + + const attributes = [ + 'critical_data_element', + 'excluded_data_element', + 'pii_flag', + ...TAG_KEYS, + ].map(key => ({ + key, + help: TAG_HELP[key], + label: key === 'pii_flag' ? 'PII' : capitalize(key.replaceAll('_', ' ')), + checkedState: van.state(null), + valueState: van.state(null), + })); + + const warningDialogOpen = van.state(false); + const pendingSaveAction = van.state(null); + const warnCde = van.state(false); + const warnPii = van.state(false); + + return div( + Card({ + title: 'Edit Tags for Selection', + actionContent: span( + { class: 'text-secondary mr-4' }, + span({ style: 'font-weight: 500' }, columnCount), + () => ` column${columnCount.val > 1 ? 's' : ''} selected` + ), + content: div( + { class: 'flex-column' }, + attributes.map(({ key, label, help, checkedState, valueState }) => div( + { class: 'flex-row fx-gap-3' }, + Checkbox({ + checked: checkedState, + onChange: (checked) => checkedState.val = checked, + }), + div( + { + class: 'pb-4 flex-row', + style: `min-width: ${multiEditWidth}px`, + onclick: () => checkedState.val = true, + }, + ['critical_data_element', 'excluded_data_element', 'pii_flag'].includes(key) + ? RadioGroup({ + label, + width: multiEditWidth, + options: key === 'pii_flag' ? 
piiOptions : booleanOptions, + onChange: (value) => valueState.val = value, + disabled: key === 'pii_flag' && !props.piiEditable, + }) + : Input({ + label, help, + width: multiEditWidth, + height: 32, + placeholder: () => checkedState.val ? null : '(keep current values)', + autocompleteOptions: props.tagOptions?.[key], + onChange: (value) => valueState.val = value || null, + }), + ), + )), + div( + { class: 'flex-row fx-justify-content-flex-end fx-gap-3 mt-4' }, + Button({ + type: 'stroked', + label: 'Cancel', + width: 'auto', + onclick: props.onCancel, + }), + Button({ + type: 'stroked', + color: 'primary', + label: 'Save', + width: 'auto', + disabled: () => attributes.every(({ checkedState }) => !checkedState.val), + onclick: () => { + const items = selectedItems.val.reduce((array, table) => { + const [ type, id ] = table.id.split('_'); + array.push({ type, id }); + + table.children.forEach(column => { + const [ type, id ] = column.id.split('_'); + array.push({ type, id }); + }); + + return array; + }, []); + + const tags = attributes.reduce((object, { key, checkedState, valueState }) => { + if (checkedState.val) { + object[key] = valueState.rawVal; + } + return object; + }, {}); + + warnCde.val = props.autoflagSettings.profile_flag_cdes && tags.critical_data_element !== undefined; + warnPii.val = props.autoflagSettings.profile_flag_pii && tags.pii_flag !== undefined; + + if (warnCde.val || warnPii.val) { + const disableFlags = []; + if (warnCde.val) { + disableFlags.push('profile_flag_cdes'); + } + if (warnPii.val) { + disableFlags.push('profile_flag_pii'); + } + pendingSaveAction.val = () => emitEvent('TagsChanged', { payload: { items, tags, disable_flags: disableFlags } });; + warningDialogOpen.val = true; + } else { + emitEvent('TagsChanged', { payload: { items, tags } }); + // Don't set multiEditMode to false here + // Otherwise this event gets superseded by the ItemSelected event + // Let the Streamlit rerun handle the state reset with 
'last_saved_timestamp' + } + }, + }), + ), + ), + }), + WarningDialog(warningDialogOpen, pendingSaveAction, warnCde, warnPii), + ); +}; + +const WarningDialog = (open, pendingAction, warnCde, warnPii) => { + return Dialog( + { open, width: '40rem', onClose: () => open.val = false }, + div( + { class: 'flex-column fx-gap-4' }, + span(() => `This table group is currently configured to detect ${warnCde.val ? 'CDEs' : ''}${warnCde.val && warnPii.val ? ' and ' : ''}${warnPii.val ? 'PIIs' : ''} during profiling.`), + Alert( + { type: 'warn', icon: 'warning' }, + 'To preserve your manual edits, autodetection will be turned off.', + ), + div( + { class: 'flex-row fx-justify-content-flex-end fx-gap-3 mt-4' }, + Button({ + type: 'stroked', + label: 'Cancel', + width: 'auto', + onclick: () => open.val = false, + }), + Button({ + type: 'stroked', + color: 'primary', + label: 'OK', + width: 'auto', + onclick: () => { + open.val = false; + pendingAction.val?.(); + }, + }), + ), + ), + ); +}; + +export { MetadataTagsCard, MetadataTagsMultiEdit, TAG_KEYS }; diff --git a/testgen/ui/components/frontend/js/display_utils.js b/testgen/ui/components/frontend/js/display_utils.js index c590c9a0..8dc0c9f5 100644 --- a/testgen/ui/components/frontend/js/display_utils.js +++ b/testgen/ui/components/frontend/js/display_utils.js @@ -2,6 +2,9 @@ function formatTimestamp( /** @type number | string */ timestamp, /** @type boolean */ showYear, ) { + if (timestamp === PII_REDACTED) { + return timestamp; + } if (timestamp) { let date = timestamp; if (typeof timestamp === 'number') { @@ -81,6 +84,9 @@ function humanReadableDuration(/** @type string */ duration, /** @type boolean * } function formatNumber(/** @type number | string */ number, /** @type number */ decimals = 3) { + if (number === PII_REDACTED) { + return number; + } if (!['number', 'string'].includes(typeof number) || isNaN(number)) { return '--'; } @@ -173,6 +179,7 @@ const colorMap = { } const DISABLED_ACTION_TEXT = 'You do not have 
permissions to perform this action. Contact your administrator.'; +const PII_REDACTED = '[PII Redacted]'; export { formatTimestamp, @@ -187,4 +194,5 @@ export { viewPortUnitsToPixels, colorMap, DISABLED_ACTION_TEXT, + PII_REDACTED, }; diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index e8825413..32e01a7e 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -19,6 +19,8 @@ * @property {string} table_drop_date * @property {boolean} critical_data_element * @property {boolean} table_critical_data_element + * @property {boolean} excluded_data_element + * @property {boolean} pii_flag * @property {string} data_source * @property {string} source_system * @property {string} source_process @@ -40,6 +42,12 @@ * @type {object} * @property {boolean} can_edit * @property {boolean} can_navigate + * @property {boolean} can_view_pii + * + * @typedef AutoflagSettings + * @type {object} + * @property {boolean} profile_flag_cdes + * @property {boolean} profile_flag_pii * * @typedef Properties * @type {object} @@ -50,21 +58,18 @@ * @property {Object.} tag_values * @property {string} last_saved_timestamp * @property {Permissions} permissions + * @property {AutoflagSettings} autoflag_settings */ import van from '../van.min.js'; import { Tree } from '../components/tree.js'; -import { EditableCard } from '../components/editable_card.js'; -import { Attribute } from '../components/attribute.js'; -import { Input } from '../components/input.js'; import { Icon } from '../components/icon.js'; import { withTooltip } from '../components/tooltip.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getRandomId, getValue, loadStylesheet } from '../utils.js'; import { ColumnDistributionCard } from '../data_profiling/column_distribution.js'; import { DataCharacteristicsCard } from '../data_profiling/data_characteristics.js'; 
-import { PotentialPIICard, HygieneIssuesCard, TestIssuesCard } from '../data_profiling/data_issues.js'; +import { HygieneIssuesCard, TestIssuesCard } from '../data_profiling/data_issues.js'; import { getColumnIcon, TABLE_ICON, LatestProfilingTime } from '../data_profiling/data_profiling_utils.js'; -import { RadioGroup } from '../components/radio_group.js'; import { Checkbox } from '../components/checkbox.js'; import { Select } from '../components/select.js'; import { capitalize, caseInsensitiveIncludes, DISABLED_ACTION_TEXT } from '../display_utils.js'; @@ -75,6 +80,7 @@ import { Link } from '../components/link.js'; import { EMPTY_STATE_MESSAGE, EmptyState } from '../components/empty_state.js'; import { Portal } from '../components/portal.js'; import { TableCreateScriptCard } from '../data_profiling/table_create_script.js'; +import { MetadataTagsCard, MetadataTagsMultiEdit, TAG_KEYS } from '../data_profiling/metadata_tags.js'; const { div, h2, span } = van.tags; @@ -82,27 +88,6 @@ const { div, h2, span } = van.tags; const EMPTY_IMAGE = new Image(1, 1); EMPTY_IMAGE.src = 'data:image/gif;base64,R0lGODlhAQABAIAAAP///wAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw=='; -const TAG_KEYS = [ - 'data_source', - 'source_system', - 'source_process', - 'business_domain', - 'stakeholder_group', - 'transform_level', - 'aggregation_level', - 'data_product', -]; -const TAG_HELP = { - data_source: 'Original source of the dataset', - source_system: 'Enterprise system source for the dataset', - source_process: 'Process, program, or data flow that produced the dataset', - business_domain: 'Business division responsible for the dataset, e.g., Finance, Sales, Manufacturing', - stakeholder_group: 'Data owners or stakeholders responsible for the dataset', - transform_level: 'Data warehouse processing stage, e.g., Raw, Conformed, Processed, Reporting, or Medallion level (bronze, silver, gold)', - aggregation_level: 'Data granularity of the dataset, e.g. 
atomic, historical, snapshot, aggregated, time-rollup, rolling, summary', - data_product: 'Data domain that comprises the dataset', -}; - const DataCatalog = (/** @type Properties */ props) => { loadStylesheet('data-catalog', stylesheet); @@ -126,21 +111,34 @@ const DataCatalog = (/** @type Properties */ props) => { label: table_name, classes: table_drop_date ? 'text-disabled' : (table_add_date && (Date.now() - new Date(table_add_date * 1000).getTime()) < 7 * 86400000) ? 'text-bold' : '', ...TABLE_ICON, - iconColor: record_ct === 0 ? 'red' : null, + iconClass: record_ct === 0 ? 'text-error' : null, iconTooltip: record_ct === 0 ? 'No records detected' : null, criticalDataElement: !!item.table_critical_data_element, children: [], }; TAG_KEYS.forEach(key => tables[table_id][key] = item[`table_${key}`]); } + const prefixIcons = []; + if (item.critical_data_element ?? item.table_critical_data_element) { + prefixIcons.push(withTooltip(Icon({ size: 15, classes: 'text-purple' }, 'star'), { text: 'Critical data element', position: 'right' })); + } + if (item.excluded_data_element) { + prefixIcons.push(withTooltip(Icon({ size: 15, classes: 'text-brown' }, 'visibility_off'), { text: 'Excluded data element', position: 'right' })); + } + if (item.pii_flag) { + prefixIcons.push(withTooltip(Icon({ size: 15, classes: 'text-orange' }, 'shield_person'), { text: 'PII data', position: 'right' })); + } const columnNode = { id: column_id, label: column_name, - classes: drop_date ? 'text-disabled' : (add_date && (Date.now() - new Date(add_date * 1000).getTime()) < 7 * 86400000) ? 'text-bold' : '', + classes: `column ${drop_date ? 'text-disabled' : (add_date && (Date.now() - new Date(add_date * 1000).getTime()) < 7 * 86400000) ? 'text-bold' : ''}`, ...getColumnIcon(item), - iconColor: value_ct === 0 ? 'red' : null, + iconClass: value_ct === 0 ? 'text-error' : null, iconTooltip: value_ct === 0 ? 
'No non-null values detected' : null, + prefix: span({ class: 'tg-dh--column-prefix' }, ...prefixIcons), criticalDataElement: !!(item.critical_data_element ?? item.table_critical_data_element), + excludedDataElement: !!item.excluded_data_element, + piiFlag: !!item.pii_flag, }; TAG_KEYS.forEach(key => columnNode[key] = item[key] ?? item[`table_${key}`]); tables[table_id].children.push(columnNode); @@ -177,7 +175,7 @@ const DataCatalog = (/** @type Properties */ props) => { tableName: van.state(true), columnName: van.state(true), }; - const filters = { criticalDataElement: van.state(false) }; + const filters = { criticalDataElement: van.state(false), piiFlag: van.state(false), showExcluded: van.state(false) }; TAG_KEYS.forEach(key => filters[key] = van.state(null)); // To hold temporary state within the portals, which might be discarded by clicking outside @@ -193,6 +191,7 @@ const DataCatalog = (/** @type Properties */ props) => { const userCanEdit = getValue(props.permissions)?.can_edit ?? false; const userCanNavigate = getValue(props.permissions)?.can_navigate ?? false; + const userCanViewPii = getValue(props.permissions)?.can_view_pii ?? 
false; const projectSummary = getValue(props.project_summary); return projectSummary.table_group_count > 0 @@ -248,6 +247,8 @@ const DataCatalog = (/** @type Properties */ props) => { || (!!node.children && !searchOptions.tableName.val) || (!node.children && !searchOptions.columnName.val)) || ![ node.criticalDataElement, false ].includes(filters.criticalDataElement.val) + || ![ node.piiFlag, false ].includes(filters.piiFlag.val) + || (node.excludedDataElement && !filters.showExcluded.val) || TAG_KEYS.some(key => ![ node[key], null ].includes(filters[key].val)), onApplySearchOptions: () => { copyState(tempSearchOptions, searchOptions); @@ -258,10 +259,12 @@ const DataCatalog = (/** @type Properties */ props) => { searchOptions.columnName.val = true; } }, - hasActiveFilters: () => filters.criticalDataElement.val || TAG_KEYS.some(key => !!filters[key].val), + hasActiveFilters: () => filters.criticalDataElement.val || filters.piiFlag.val || filters.showExcluded.val || TAG_KEYS.some(key => !!filters[key].val), onApplyFilters: () => copyState(tempFilters, filters), onResetFilters: () => { tempFilters.criticalDataElement.val = false; + tempFilters.piiFlag.val = false; + tempFilters.showExcluded.val = false; TAG_KEYS.forEach(key => tempFilters[key].val = null); }, }, @@ -288,11 +291,24 @@ const DataCatalog = (/** @type Properties */ props) => { () => { copyState(filters, tempFilters); return div( - Checkbox({ - label: 'Only critical data elements (CDEs)', - checked: tempFilters.criticalDataElement, - onChange: (checked) => tempFilters.criticalDataElement.val = checked, - }), + div( + { class: 'flex-column fx-gap-3' }, + Checkbox({ + label: span({ class: 'flex-row fx-gap-1' }, 'Only critical data elements (CDEs)', Icon({ size: 18, classes: 'text-purple' }, 'star')), + checked: tempFilters.criticalDataElement, + onChange: (checked) => tempFilters.criticalDataElement.val = checked, + }), + Checkbox({ + label: span({ class: 'flex-row fx-gap-1' }, 'Only PII data', Icon({ size: 
18, classes: 'text-orange' }, 'shield_person')), + checked: tempFilters.piiFlag, + onChange: (checked) => tempFilters.piiFlag.val = checked, + }), + Checkbox({ + label: span({ class: 'flex-row fx-gap-1' }, 'Show excluded data elements (XDEs)', Icon({ size: 18, classes: 'text-brown' }, 'visibility_off')), + checked: tempFilters.showExcluded, + onChange: (checked) => tempFilters.showExcluded.val = checked, + }), + ), div( { class: 'flex-row fx-flex-wrap fx-gap-4 fx-justify-space-between mt-4', @@ -329,7 +345,23 @@ const DataCatalog = (/** @type Properties */ props) => { }, ), () => multiEditMode.val - ? MultiEdit(props, multiSelectedItems, multiEditMode) + ? div( + { class: 'tg-dh--details flex-column' }, + () => multiSelectedItems.val?.length + ? MetadataTagsMultiEdit( + { + tagOptions: getValue(props.tag_values), + piiEditable: userCanViewPii, + autoflagSettings: getValue(props.autoflag_settings) ?? {}, + onCancel: () => multiEditMode.val = false, + }, + multiSelectedItems, + ) + : ItemEmptyState( + 'Select tables or columns on the left to edit their tags.', + 'edit_document', + ) + ) : SelectedDetails(props, selectedItem.val), ) : ConditionalEmptyState(projectSummary, userCanEdit, userCanNavigate), @@ -433,6 +465,7 @@ const ExportOptions = (/** @type TreeNode[] */ treeNodes, /** @type SelectedNode const SelectedDetails = (/** @type Properties */ props, /** @type Table | Column */ item) => { const userCanEdit = getValue(props.permissions)?.can_edit ?? false; const userCanNavigate = getValue(props.permissions)?.can_navigate ?? false; + const userCanViewPii = getValue(props.permissions)?.can_view_pii ?? false; return item ? div( @@ -455,8 +488,15 @@ const SelectedDetails = (/** @type Properties */ props, /** @type Table | Column item.type === 'column' ? 
ColumnDistributionCard({ dataPreview: true, history: true }, item) : TableSizeCard({}, item), - TagsCard({ tagOptions: getValue(props.tag_values), editable: userCanEdit }, item), - PotentialPIICard({ noLinks: !userCanNavigate }, item), + MetadataTagsCard( + { + tagOptions: getValue(props.tag_values), + editable: userCanEdit, + piiEditable: userCanViewPii, + autoflagSettings: getValue(props.autoflag_settings) ?? {}, + }, + item, + ), HygieneIssuesCard({ noLinks: !userCanNavigate }, item), TestIssuesCard({ noLinks: !userCanNavigate }, item), TestSuitesCard({ noLinks: !userCanNavigate }, item), @@ -470,122 +510,6 @@ const SelectedDetails = (/** @type Properties */ props, /** @type Table | Column ); }; -/** -* @typedef TagProperties -* @type {object} -* @property {Object.} tagOptions -* @property {boolean} editable -*/ -const TagsCard = (/** @type TagProperties */ props, /** @type Table | Column */ item) => { - const title = `${item.type} Tags `; - const attributes = [ - 'description', - 'critical_data_element', - ...TAG_KEYS, - ].map(key => ({ - key, - help: TAG_HELP[key], - label: capitalize(key.replaceAll('_', ' ')), - state: van.state(item[key]), - inheritTableGroup: item[`table_group_${key}`] ?? null, // Table group values inherited by table or column - inheritTable: item[`table_${key}`] ?? null, // Table values inherited by column - })); - - const InheritedIcon = (/** @type string */ inheritedFrom) => withTooltip( - Icon({ size: 18, classes: 'text-disabled' }, 'layers'), - { text: `Inherited from ${inheritedFrom} tags`, position: 'top-right'}, - ); - const width = 300; - const descriptionWidth = 932; - - const content = div( - { class: 'flex-row fx-flex-wrap fx-gap-4' }, - attributes.map(({ key, label, help, state, inheritTable, inheritTableGroup }) => { - let value = state.rawVal ?? inheritTable ?? 
inheritTableGroup; - - if (key === 'critical_data_element') { - return span( - { class: 'flex-row fx-gap-1', style: `width: ${width}px` }, - Icon( - { classes: value ? 'text-green' : 'text-disabled' }, - value ? 'check_circle' : 'cancel', - ), - span( - { class: value ? '' : 'text-secondary' }, - item.type === 'column' - ? (value ? 'Critical data element' : 'Not a critical data element') - : (value ? 'All critical data elements' : 'Not all critical data elements'), - ), - (item.type === 'column' && state.rawVal === null) ? InheritedIcon('table') : null, - ); - } - - const inheritedFrom = state.rawVal !== null ? null - : inheritTable !== null ? 'table' - : inheritTableGroup !== null ? 'table group' - : null; - - if (inheritedFrom && value) { - value = span( - { class: 'flex-row fx-gap-1' }, - InheritedIcon(inheritedFrom), - value, - ); - } - return Attribute({ label, help, value, width: key === 'description' ? descriptionWidth : width }); - }), - ); - - if (!props.editable) { - return Card({ title, content }); - } - - // Define as function so the block is re-rendered with reset values when re-editing after a cancel - const editingContent = () => div( - { class: 'flex-row fx-flex-wrap fx-gap-4' }, - attributes.map(({ key, label, help, state, inheritTable, inheritTableGroup }) => { - if (key === 'critical_data_element') { - const options = [ - { label: 'Yes', value: true }, - { label: 'No', value: false }, - { label: 'Inherit', value: null }, - ]; - return RadioGroup({ - label, width, options, - value: state.rawVal, - onChange: (value) => state.val = value, - }); - }; - - return Input({ - label, help, - width: key === 'description' ? descriptionWidth : width, - height: 32, - value: state.rawVal, - placeholder: (inheritTable || inheritTableGroup) ? `Inherited: ${inheritTable ?? 
inheritTableGroup}` : null, - autocompleteOptions: props.tagOptions?.[key], - onChange: (value) => state.val = value || null, - }); - }), - ); - - return EditableCard({ - title: `${item.type} Tags `, - content, editingContent, - onSave: () => { - const items = [{ type: item.type, id: item.id }]; - const tags = attributes.reduce((object, { key, state }) => { - object[key] = state.rawVal; - return object; - }, {}); - emitEvent('TagsChanged', { payload: { items, tags } }); - }, - // Reset states to original values on cancel - onCancel: () => attributes.forEach(({ key, state }) => state.val = item[key]), - hasChanges: () => attributes.some(({ key, state }) => state.val !== item[key]), - }); -}; - const TestSuitesCard = (/** @type Properties */ props, /** @type Table | Column */ item) => { return Card({ title: 'Related Test Suites', @@ -632,118 +556,6 @@ const TestSuitesCard = (/** @type Properties */ props, /** @type Table | Column }); }; -const MultiEdit = (/** @type Properties */ props, /** @type Object */ selectedItems, /** @type Object */ multiEditMode) => { - const hasSelection = van.derive(() => selectedItems.val?.length); - const columnCount = van.derive(() => selectedItems.val?.reduce((count, { children }) => count + children.length, 0)); - - const attributes = [ - 'critical_data_element', - ...TAG_KEYS, - ].map(key => ({ - key, - help: TAG_HELP[key], - label: capitalize(key.replaceAll('_', ' ')), - checkedState: van.state(null), - valueState: van.state(null), - })); - - const cdeOptions = [ - { label: 'Yes', value: true }, - { label: 'No', value: false }, - { label: 'Inherit', value: null }, - ]; - const tagOptions = getValue(props.tag_values) ?? {}; - const width = 400; - - return div( - { class: 'tg-dh--details flex-column' }, - () => hasSelection.val - ? Card({ - title: 'Edit Tags for Selection', - actionContent: span( - { class: 'text-secondary mr-4' }, - span({ style: 'font-weight: 500' }, columnCount), - () => ` column${columnCount.val > 1 ? 
's' : ''} selected` - ), - content: div( - { class: 'flex-column' }, - attributes.map(({ key, label, help, checkedState, valueState }) => div( - { class: 'flex-row fx-gap-3' }, - Checkbox({ - checked: checkedState, - onChange: (checked) => checkedState.val = checked, - }), - div( - { - class: 'pb-4 flex-row', - style: `min-width: ${width}px`, - onclick: () => checkedState.val = true, - }, - key === 'critical_data_element' - ? RadioGroup({ - label, width, - options: cdeOptions, - onChange: (value) => valueState.val = value, - }) - : Input({ - label, help, width, - height: 32, - placeholder: () => checkedState.val ? null : '(keep current values)', - autocompleteOptions: tagOptions[key], - onChange: (value) => valueState.val = value || null, - }), - ), - )), - div( - { class: 'flex-row fx-justify-content-flex-end fx-gap-3 mt-4' }, - Button({ - type: 'stroked', - label: 'Cancel', - width: 'auto', - onclick: () => multiEditMode.val = false, - }), - Button({ - type: 'stroked', - color: 'primary', - label: 'Save', - width: 'auto', - disabled: () => attributes.every(({ checkedState }) => !checkedState.val), - onclick: () => { - const items = selectedItems.val.reduce((array, table) => { - const [ type, id ] = table.id.split('_'); - array.push({ type, id }); - - table.children.forEach(column => { - const [ type, id ] = column.id.split('_'); - array.push({ type, id }); - }); - - return array; - }, []); - - const tags = attributes.reduce((object, { key, checkedState, valueState }) => { - if (checkedState.val) { - object[key] = valueState.rawVal; - } - return object; - }, {}); - - emitEvent('TagsChanged', { payload: { items, tags } }); - // Don't set multiEditMode to false here - // Otherwise this event gets superseded by the ItemSelected event - // Let the Streamlit rerun handle the state reset with 'last_saved_timestamp' - }, - }), - ), - ), - }) - : ItemEmptyState( - 'Select tables or columns on the left to edit their tags.', - 'edit_document', - ), - ); -}; - const 
ItemEmptyState = (/** @type string */ message, /** @type string */ icon) => { return div( { class: 'flex-column fx-align-flex-center fx-justify-center tg-dh--no-selection' }, @@ -830,6 +642,18 @@ stylesheet.replace(` background-color: var(--sidebar-background-color); } +.tg-dh--tree .tg-tree:not(.multi-select) .tg-tree--row.column { + margin-left: -30px; +} + +.tg-dh--column-prefix { + display: inline-flex; + align-items: center; + justify-content: flex-end; + width: 34px; + flex-shrink: 0; +} + .tg-dh--details { padding-top: 8px; overflow: auto; diff --git a/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js b/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js index 5ca5259c..13084655 100644 --- a/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js +++ b/testgen/ui/components/frontend/js/pages/import_metadata_dialog.js @@ -107,6 +107,19 @@ const ImportMetadataDialog = (/** @type Properties */ props) => { hasError ? Alert({ type: 'error', icon: 'error' }, span(preview.error)) : PreviewTable(preview), + preview.pii_skipped + ? Alert( + { type: 'info', icon: 'info' }, + 'PII data in this CSV will be ignored because you do not have permission to edit PII flags.', + ) + : null, + preview.warn_cde || preview.warn_pii + ? Alert( + { type: 'warn', icon: 'warning' }, + `This table group is currently configured to detect ${preview.warn_cde ? 'CDEs' : ''}${preview.warn_cde && preview.warn_pii ? ' and ' : ''}${preview.warn_pii ? 'PIIs' : ''} during profiling. 
+ To preserve your imported edits, autodetection will be turned off.`, + ) + : null, div( { class: 'flex-row fx-justify-content-flex-end' }, Button({ @@ -131,6 +144,12 @@ const STATUS_ICONS = { unmatched: 'block', }; +const COLUMN_LABELS = { + critical_data_element: 'CDE', + excluded_data_element: 'XDE', + pii_flag: 'PII', +}; + const PreviewTable = (preview) => { const metadataColumns = preview.metadata_columns || []; const previewRows = preview.preview_rows || []; @@ -141,7 +160,7 @@ const PreviewTable = (preview) => { { name: 'column_name', label: 'Column', width: 150 }, ...metadataColumns.map(col => ({ name: col, - label: col === 'critical_data_element' ? 'CDE' : capitalize(col.replaceAll('_', ' ')), + label: COLUMN_LABELS[col] ?? capitalize(col.replaceAll('_', ' ')), width: col === 'description' ? 200 : 120, })), ]; diff --git a/testgen/ui/components/frontend/js/pages/notification_settings.js b/testgen/ui/components/frontend/js/pages/notification_settings.js index 55f45616..a72be6d4 100644 --- a/testgen/ui/components/frontend/js/pages/notification_settings.js +++ b/testgen/ui/components/frontend/js/pages/notification_settings.js @@ -209,8 +209,8 @@ const NotificationSettings = (/** @type Properties */ props) => { ), duplicatedMessage ? 
div( - { class: 'flex-row fx-gap-1 text-caption warning-text' }, - Icon({ size: 12, classes: 'warning-text' }, 'warning'), + { class: 'flex-row fx-gap-1 text-caption text-warning' }, + Icon({ size: 12, classes: 'text-warning' }, 'warning'), span({}, duplicatedMessage), ) : '', diff --git a/testgen/ui/pdf/hygiene_issue_report.py b/testgen/ui/pdf/hygiene_issue_report.py index 1e67ae06..c2c92e4d 100644 --- a/testgen/ui/pdf/hygiene_issue_report.py +++ b/testgen/ui/pdf/hygiene_issue_report.py @@ -5,6 +5,7 @@ from reportlab.platypus import CondPageBreak, KeepTogether, Paragraph, Table, TableStyle from testgen.common.models.settings import PersistedSetting +from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.settings import ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT from testgen.ui.pdf.dataframe_table import DataFrameTableBuilder from testgen.ui.pdf.style import ( @@ -118,6 +119,10 @@ def build_summary_table(document, hi_data): "Critical data element: Yes" if hi_data["critical_data_element"] else "Critical data element: No", style=PARA_STYLE_CELL, ), + Paragraph( + "PII: Yes" if hi_data["pii_flag"] else "PII: No", + style=PARA_STYLE_CELL, + ), Paragraph(f"Description: {hi_data['column_description']}", style=PARA_STYLE_CELL) if hi_data["column_description"] else [], @@ -178,7 +183,7 @@ def build_sql_query_content(sample_data_tuple): return Paragraph("No sample data lookup query registered for this issue.") -def get_report_content(document, hi_data): +def get_report_content(document, hi_data, mask_pii: bool = False): yield Paragraph("TestGen Hygiene Issue Report", PARA_STYLE_TITLE) yield build_summary_table(document, hi_data) @@ -188,6 +193,11 @@ def get_report_content(document, hi_data): sample_data_tuple = get_hygiene_issue_source_data(hi_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT) + # Mask PII in sample data + if sample_data_tuple[3] is not None and mask_pii: + pii_columns = get_pii_columns(str(hi_data["table_groups_id"]), 
table_name=hi_data["table_name"]) + mask_dataframe_pii(sample_data_tuple[3], pii_columns) + yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) yield Paragraph("Sample Data", PARA_STYLE_H1) yield from build_sample_data_content(document, sample_data_tuple) @@ -198,6 +208,6 @@ def get_report_content(document, hi_data): ]) -def create_report(filename, hi_data): +def create_report(filename, hi_data, mask_pii: bool = False): doc = DatakitchenTemplate(filename) - doc.build(flowables=list(get_report_content(doc, hi_data))) + doc.build(flowables=list(get_report_content(doc, hi_data, mask_pii=mask_pii))) diff --git a/testgen/ui/pdf/test_result_report.py b/testgen/ui/pdf/test_result_report.py index eab9cee8..28b3c900 100644 --- a/testgen/ui/pdf/test_result_report.py +++ b/testgen/ui/pdf/test_result_report.py @@ -11,6 +11,7 @@ ) from testgen.common.models.settings import PersistedSetting +from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.settings import ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT from testgen.ui.pdf.dataframe_table import TABLE_STYLE_DATA, DataFrameTableBuilder from testgen.ui.pdf.style import ( @@ -131,6 +132,10 @@ def build_summary_table(document, tr_data): "Critical data element: Yes" if tr_data["critical_data_element"] else "Critical data element: No", style=PARA_STYLE_CELL, ), + Paragraph( + "PII: Yes" if tr_data["pii_flag"] else "PII: No", + style=PARA_STYLE_CELL, + ), Paragraph(f"Description: {tr_data['column_description']}", style=PARA_STYLE_CELL) if tr_data["column_description"] else [], @@ -227,7 +232,7 @@ def build_sql_query_content(sample_data_tuple): return Paragraph("No sample data lookup query registered for this test.") -def get_report_content(document, tr_data): +def get_report_content(document, tr_data, mask_pii: bool = False): yield Paragraph("TestGen Test Issue Report", PARA_STYLE_TITLE) yield build_summary_table(document, tr_data) @@ -246,6 +251,11 @@ def get_report_content(document, tr_data): else: 
sample_data_tuple = get_test_issue_source_data(tr_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT) + # Mask PII in sample data + if sample_data_tuple[3] is not None and mask_pii: + pii_columns = get_pii_columns(str(tr_data["table_groups_id"]), table_name=tr_data["table_name"]) + mask_dataframe_pii(sample_data_tuple[3], pii_columns) + yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) yield Paragraph("Sample Data", PARA_STYLE_H1) yield from build_sample_data_content(document, sample_data_tuple) @@ -256,6 +266,6 @@ def get_report_content(document, tr_data): ]) -def create_report(filename, tr_data): +def create_report(filename, tr_data, mask_pii: bool = False): doc = DatakitchenTemplate(filename) - doc.build(flowables=list(get_report_content(doc, tr_data))) + doc.build(flowables=list(get_report_content(doc, tr_data, mask_pii=mask_pii))) diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index 5f8362e6..4db67d55 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -368,6 +368,8 @@ def get_columns_by_condition( -- Column Tags column_chars.description, column_chars.critical_data_element, + column_chars.excluded_data_element, + column_chars.pii_flag, {", ".join([ f"column_chars.{tag}" for tag in TAG_FIELDS ])}, -- Table Tags table_chars.critical_data_element AS table_critical_data_element, @@ -525,6 +527,7 @@ def get_profiling_anomalies( dcc.functional_data_type, dcc.description as column_description, COALESCE(dcc.critical_data_element, dtc.critical_data_element) as critical_data_element, + dcc.pii_flag, COALESCE(dcc.data_source, dtc.data_source, tg.data_source) as data_source, COALESCE(dcc.source_system, dtc.source_system, tg.source_system) as source_system, COALESCE(dcc.source_process, dtc.source_process, tg.source_process) as source_process, diff --git a/testgen/ui/queries/scoring_queries.py b/testgen/ui/queries/scoring_queries.py index f8d78bdd..2bcb5d97 100644 --- 
a/testgen/ui/queries/scoring_queries.py +++ b/testgen/ui/queries/scoring_queries.py @@ -42,10 +42,12 @@ def get_score_card_issue_reports(selected_issues: list["SelectedIssue"]) -> list results.profile_run_id::VARCHAR, types.suggested_action, results.table_groups_id::VARCHAR, + results.project_code, results.anomaly_id::VARCHAR, column_chars.functional_data_type, column_chars.description as column_description, COALESCE(column_chars.critical_data_element, table_chars.critical_data_element) as critical_data_element, + column_chars.pii_flag, COALESCE(column_chars.data_source, table_chars.data_source, groups.data_source) as data_source, COALESCE(column_chars.source_system, table_chars.source_system, groups.source_system) as source_system, COALESCE(column_chars.source_process, table_chars.source_process, groups.source_process) as source_process, @@ -104,9 +106,11 @@ def get_score_card_issue_reports(selected_issues: list["SelectedIssue"]) -> list results.test_suite_id, results.test_definition_id::VARCHAR, results.table_groups_id::VARCHAR, + groups.project_code, types.id::VARCHAR AS test_type_id, column_chars.description as column_description, COALESCE(column_chars.critical_data_element, table_chars.critical_data_element) as critical_data_element, + column_chars.pii_flag, COALESCE(column_chars.data_source, table_chars.data_source, groups.data_source) as data_source, COALESCE(column_chars.source_system, table_chars.source_system, groups.source_system) as source_system, COALESCE(column_chars.source_process, table_chars.source_process, groups.source_process) as source_process, diff --git a/testgen/ui/queries/test_result_queries.py b/testgen/ui/queries/test_result_queries.py index 928cfb27..7c73df03 100644 --- a/testgen/ui/queries/test_result_queries.py +++ b/testgen/ui/queries/test_result_queries.py @@ -68,6 +68,7 @@ def get_test_results( dcc.description as column_description, dcc.column_type as column_type, COALESCE(dcc.critical_data_element, dtc.critical_data_element) as 
critical_data_element, + dcc.pii_flag, COALESCE(dcc.data_source, dtc.data_source, tg.data_source) as data_source, COALESCE(dcc.source_system, dtc.source_system, tg.source_system) as source_system, COALESCE(dcc.source_process, dtc.source_process, tg.source_process) as source_process, diff --git a/testgen/ui/static/css/shared.css b/testgen/ui/static/css/shared.css index 28ed9baa..9f6af80f 100644 --- a/testgen/ui/static/css/shared.css +++ b/testgen/ui/static/css/shared.css @@ -226,10 +226,26 @@ body { color: var(--error-color); } +.text-warning { + color: var(--orange); +} + .text-green { color: var(--primary-color); } +.text-purple { + color: var(--purple); +} + +.text-orange { + color: var(--orange); +} + +.text-brown { + color: var(--brown); +} + .text-capitalize { text-transform: capitalize; } @@ -745,10 +761,6 @@ input::-ms-clear { margin-top: 0; } -.warning-text { - color: var(--orange); -} - /* Base Styles - Using standard system fonts for that Material feel */ .display, .headline, .title, .body, .label { margin: 0; diff --git a/testgen/ui/static/js/components/attribute.js b/testgen/ui/static/js/components/attribute.js index 61240f7f..a7bb60eb 100644 --- a/testgen/ui/static/js/components/attribute.js +++ b/testgen/ui/static/js/components/attribute.js @@ -8,11 +8,12 @@ * @property {string?} class */ import { getValue, loadStylesheet } from '../utils.js'; +import { PII_REDACTED } from '../display_utils.js'; import { Icon } from './icon.js'; import { withTooltip } from './tooltip.js'; import van from '../van.min.js'; -const { div } = van.tags; +const { div, code } = van.tags; const Attribute = (/** @type Properties */ props) => { loadStylesheet('attribute', stylesheet); @@ -33,6 +34,12 @@ const Attribute = (/** @type Properties */ props) => { { class: 'attribute-value' }, () => { const value = getValue(props.value); + if (value === PII_REDACTED) { + return withTooltip( + code({ class: 'attribute-pii-redacted' }, 'PII Redacted'), + { text: 'You do not have 
permission to view PII data', position: 'top-right' }, + ); + } return (value || value === 0) ? value : '--'; }, ), @@ -44,6 +51,16 @@ stylesheet.replace(` .attribute-value { word-wrap: break-word; } + +.attribute-pii-redacted { + display: inline-block; + font-size: 12px; + padding: 2px 6px; + border-radius: 4px; + background: color-mix(in srgb, var(--disabled-text-color) 15%, transparent); + color: var(--disabled-text-color); + overflow: visible; +} `); export { Attribute }; diff --git a/testgen/ui/static/js/components/radio_group.js b/testgen/ui/static/js/components/radio_group.js index 4f8b0008..97aef2df 100644 --- a/testgen/ui/static/js/components/radio_group.js +++ b/testgen/ui/static/js/components/radio_group.js @@ -8,11 +8,13 @@ * @typedef Properties * @type {object} * @property {string} label + * @property {string?} help * @property {Option[]} options * @property {string | number | boolean | null} value * @property {function(string | number | boolean | null)?} onChange * @property {number?} width * @property {('default' | 'inline' | 'vertical')?} layout + * @property {boolean?} disabled */ import van from '../van.min.js'; import { getRandomId, getValue, loadStylesheet } from '../utils.js'; @@ -26,12 +28,19 @@ const RadioGroup = (/** @type Properties */ props) => { const groupName = getRandomId(); const layout = getValue(props.layout) ?? 'default'; + const disabled = getValue(props.disabled) ?? false; return div( - { class: () => `tg-radio-group--wrapper ${layout}`, style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` }, + { class: () => `tg-radio-group--wrapper ${layout}${disabled ? ' disabled' : ''}`, style: () => `width: ${props.width ? getValue(props.width) + 'px' : 'auto'}` }, div( - { class: 'text-caption tg-radio-group--label' }, + { class: 'text-caption tg-radio-group--label flex-row fx-gap-1' }, props.label, + () => getValue(props.help) + ? 
withTooltip( + Icon({ size: 16, classes: 'text-disabled' }, 'help'), + { text: props.help, position: 'top', width: 200 } + ) + : null, ), () => div( { class: 'tg-radio-group' }, @@ -42,6 +51,7 @@ const RadioGroup = (/** @type Properties */ props) => { name: groupName, value: option.value, checked: () => option.value === getValue(props.value), + disabled, onchange: van.derive(() => { const onChange = props.onChange?.val ?? props.onChange; return onChange ? () => onChange(option.value) : null; @@ -149,6 +159,11 @@ stylesheet.replace(` border-radius: 5px; } +.tg-radio-group--wrapper.disabled { + opacity: 0.5; + pointer-events: none; +} + .tg-radio-group--help { white-space: pre-wrap; line-height: 16px; diff --git a/testgen/ui/static/js/components/table_group_form.js b/testgen/ui/static/js/components/table_group_form.js index 6b072255..609f6aa0 100644 --- a/testgen/ui/static/js/components/table_group_form.js +++ b/testgen/ui/static/js/components/table_group_form.js @@ -14,6 +14,7 @@ * @property {string?} profile_sk_column_mask * @property {number?} profiling_delay_days * @property {boolean?} profile_flag_cdes + * @property {boolean?} profile_flag_pii * @property {boolean?} include_in_dashboard * @property {boolean?} add_scorecard_definition * @property {boolean?} profile_use_sampling @@ -81,6 +82,7 @@ const TableGroupForm = (props) => { const profileSkColumnMask = van.state(tableGroup.profile_sk_column_mask ?? '%_sk'); const profilingDelayDays = van.state(tableGroup.profiling_delay_days ?? 0); const profileFlagCdes = van.state(tableGroup.profile_flag_cdes ?? true); + const profileFlagPii = van.state(tableGroup.profile_flag_pii ?? true); const includeInDashboard = van.state(tableGroup.include_in_dashboard ?? true); const addScorecardDefinition = van.state(tableGroup.add_scorecard_definition ?? true); const profileUseSampling = van.state(tableGroup.profile_use_sampling ?? 
false); @@ -120,6 +122,7 @@ const TableGroupForm = (props) => { profile_sk_column_mask: profileSkColumnMask.val, profiling_delay_days: profilingDelayDays.val, profile_flag_cdes: profileFlagCdes.val, + profile_flag_pii: profileFlagPii.val, include_in_dashboard: includeInDashboard.val, add_scorecard_definition: addScorecardDefinition.val, profile_use_sampling: profileUseSampling.val, @@ -186,6 +189,7 @@ const TableGroupForm = (props) => { { editMode: !!tableGroup.id, setValidity: setFieldValidity }, profilingDelayDays, profileFlagCdes, + profileFlagPii, includeInDashboard, addScorecardDefinition, ), @@ -325,6 +329,7 @@ const SettingsForm = ( options, profilingDelayDays, profileFlagCdes, + profileFlagPii, includeInDashboard, addScorecardDefinition, ) => { @@ -339,6 +344,12 @@ const SettingsForm = ( checked: profileFlagCdes, onChange: (value) => profileFlagCdes.val = value, }), + Checkbox({ + name: 'profile_flag_pii', + label: 'Detect PII during profiling', + checked: profileFlagPii, + onChange: (value) => profileFlagPii.val = value, + }), Checkbox({ name: 'include_in_dashboard', label: 'Include table group in Project Dashboard', diff --git a/testgen/ui/static/js/components/tree.js b/testgen/ui/static/js/components/tree.js index 82acc371..fbf77c9c 100644 --- a/testgen/ui/static/js/components/tree.js +++ b/testgen/ui/static/js/components/tree.js @@ -6,8 +6,9 @@ * @property {string?} classes * @property {string?} icon * @property {number?} iconSize - * @property {'red'?} iconColor + * @property {string?} iconClass * @property {string?} iconTooltip + * @property {Element?} prefix * @property {TreeNode[]?} children * @property {number?} level * @property {boolean?} expanded @@ -91,7 +92,7 @@ const Tree = (/** @type Properties */ props, /** @type any? */ searchOptionsCont }, Toolbar(treeNodes, multiSelect, props, searchOptionsContent, filtersContent), div( - { class: 'tg-tree' }, + { class: () => `tg-tree ${multiSelect.val ? 
'multi-select' : ''}` }, () => div( { class: 'tg-tree--nodes', @@ -312,9 +313,10 @@ const TreeNode = ( span({ class: 'mr-1' }), ] : null, + !multiSelect && node.prefix ? node.prefix : null, () => { if (node.icon) { - const icon = Icon({ size: node.iconSize, classes: `tg-tree--row-icon ${node.iconColor}` }, node.icon); + const icon = Icon({ size: node.iconSize, classes: `tg-tree--row-icon ${node.iconClass}` }, node.icon); return node.iconTooltip ? withTooltip(icon, { text: node.iconTooltip, position: 'right' }) : icon; } return null; @@ -519,10 +521,6 @@ stylesheet.replace(` color: #B0BEC5; text-align: center; } - -.tg-tree--row-icon.red { - color: var(--red); -} `); export { Tree }; diff --git a/testgen/ui/static/js/display_utils.js b/testgen/ui/static/js/display_utils.js index c590c9a0..8dc0c9f5 100644 --- a/testgen/ui/static/js/display_utils.js +++ b/testgen/ui/static/js/display_utils.js @@ -2,6 +2,9 @@ function formatTimestamp( /** @type number | string */ timestamp, /** @type boolean */ showYear, ) { + if (timestamp === PII_REDACTED) { + return timestamp; + } if (timestamp) { let date = timestamp; if (typeof timestamp === 'number') { @@ -81,6 +84,9 @@ function humanReadableDuration(/** @type string */ duration, /** @type boolean * } function formatNumber(/** @type number | string */ number, /** @type number */ decimals = 3) { + if (number === PII_REDACTED) { + return number; + } if (!['number', 'string'].includes(typeof number) || isNaN(number)) { return '--'; } @@ -173,6 +179,7 @@ const colorMap = { } const DISABLED_ACTION_TEXT = 'You do not have permissions to perform this action. 
Contact your administrator.'; +const PII_REDACTED = '[PII Redacted]'; export { formatTimestamp, @@ -187,4 +194,5 @@ export { viewPortUnitsToPixels, colorMap, DISABLED_ACTION_TEXT, + PII_REDACTED, }; diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index 407a4447..92906c8f 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -12,6 +12,7 @@ from testgen.common.models import with_database_session from testgen.common.models.project import Project from testgen.common.models.table_group import TableGroup, TableGroupMinimal +from testgen.common.pii_masking import PII_REDACTED, get_pii_columns, mask_profiling_pii from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets import testgen_component from testgen.ui.components.widgets.download_dialog import ( @@ -113,7 +114,12 @@ def render( "permissions": { "can_edit": session.auth.user_has_permission("disposition"), "can_navigate": user_can_navigate, + "can_view_pii": session.auth.user_has_permission("view_pii"), }, + "autoflag_settings": { + "profile_flag_cdes": selected_table_group.profile_flag_cdes, + "profile_flag_pii": selected_table_group.profile_flag_pii, + } if selected_table_group else None, }, on_change_handlers={ "RunProfilingClicked": lambda _: run_profiling_dialog( @@ -154,7 +160,7 @@ def render( if selected_table_group else None, }, - event_handlers={"TagsChanged": partial(on_tags_changed, spinner_container)}, + event_handlers={"TagsChanged": partial(on_tags_changed, spinner_container, table_group_id)}, ) @@ -198,6 +204,11 @@ def get_excel_report_data( ) data = pd.DataFrame(table_data + column_data) + + if not session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns(str(table_group.id)) + mask_profiling_pii(data, pii_columns) + data = data.sort_values( by=["table_name", "ordinal_position"], na_position="first", @@ -211,7 +222,7 @@ def get_excel_report_data( data[key] = data[key].apply(lambda val: 
round(val, 2) if not pd.isna(val) else None) for key in ["min_date", "max_date", "add_date", "last_mod_date", "drop_date"]: - data[key] = data[key].apply(lambda val: val.strftime("%b %-d %Y, %-I:%M %p") if not pd.isna(val) else None) + data[key] = data[key].apply(lambda val: val.strftime("%b %-d %Y, %-I:%M %p") if not pd.isna(val) and not isinstance(val, str) else val) for key in [ "data_source", @@ -237,22 +248,26 @@ def get_excel_report_data( else None, axis=1, ) + data["excluded_data_element"] = data["excluded_data_element"].apply(lambda val: "Yes" if val else None) + data["pii_flag"] = data["pii_flag"].apply(lambda val: "Yes" if val else None) data["top_freq_values"] = data["top_freq_values"].apply( lambda val: "\n".join([f"{part.split(" | ")[1]} | {part.split(" | ")[0]}" for part in val[2:].split("\n| ")]) - if not pd.isna(val) - else None + if not pd.isna(val) and val != PII_REDACTED + else val ) data["top_patterns"] = data["top_patterns"].apply( lambda val: "".join([f"{part}{'\n' if index % 2 else ' | '}" for index, part in enumerate(val.split(" | "))]) - if not pd.isna(val) - else None + if not pd.isna(val) and val != PII_REDACTED + else val ) file_columns = { "schema_name": {"header": "Schema"}, "table_name": {"header": "Table"}, "column_name": {"header": "Column"}, - "critical_data_element": {}, + "critical_data_element": {"header": "Critical data element (CDE)"}, + "excluded_data_element": {"header": "Excluded data element (XDE)"}, + "pii_flag": {"header": "PII"}, "active_test_count": {"header": "Active tests"}, "ordinal_position": {"header": "Position"}, "general_type": {}, @@ -358,7 +373,7 @@ def remove_table_dialog(item: dict) -> None: safe_rerun() -def on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DATA_TYPE: +def on_tags_changed(spinner_container: DeltaGenerator, table_group_id: str, payload: dict) -> FILE_DATA_TYPE: attributes = ["description"] attributes.extend(TAG_FIELDS) @@ -374,7 +389,7 @@ def 
on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DA with spinner_container: with st.spinner("Saving tags"): - if params["table_ids"]: + if params["table_ids"] and set_attributes: execute_db_query( f""" WITH selected as ( @@ -390,6 +405,15 @@ def on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DA ) if params["column_ids"]: + if "excluded_data_element" in tags: + set_attributes.append("excluded_data_element = :excluded_data_element") + params.update({"excluded_data_element": tags.get("excluded_data_element")}) + + # Prevent user from editing PII flag if they cannot view PII + if "pii_flag" in tags and session.auth.user_has_permission("view_pii"): + set_attributes.append("pii_flag = :pii_flag") + params.update({"pii_flag": tags.get("pii_flag")}) + execute_db_query( f""" WITH selected as ( @@ -404,10 +428,28 @@ def on_tags_changed(spinner_container: DeltaGenerator, payload: dict) -> FILE_DA params, ) + _disable_autoflags(table_group_id, payload.get("disable_flags")) + st.session_state["data_catalog:last_saved_timestamp"] = datetime.now().timestamp() safe_rerun() +def _disable_autoflags(table_group_id: str, disable_flags: list[str] | None) -> None: + if not disable_flags or not (table_group := TableGroup.get(table_group_id)): + return + + changed = False + if "profile_flag_cdes" in disable_flags: + table_group.profile_flag_cdes = False + changed = True + if "profile_flag_pii" in disable_flags: + table_group.profile_flag_pii = False + changed = True + + if changed: + table_group.save() + + def export_metadata_csv(table_group: TableGroupMinimal) -> None: def _get_csv_data(update_progress: PROGRESS_UPDATE_TYPE) -> FILE_DATA_TYPE: table_data = fetch_all_from_db( @@ -428,6 +470,8 @@ def _get_csv_data(update_progress: PROGRESS_UPDATE_TYPE) -> FILE_DATA_TYPE: SELECT c.table_name, c.column_name, c.description, c.critical_data_element, + c.excluded_data_element, + c.pii_flag, {", ".join([ f"c.{tag}" for tag in TAG_FIELDS ])} 
FROM data_column_chars c LEFT JOIN data_table_chars t ON (c.table_id = t.table_id) @@ -444,6 +488,8 @@ def _get_csv_data(update_progress: PROGRESS_UPDATE_TYPE) -> FILE_DATA_TYPE: "Column": row["column_name"], "Description": row["description"] or "", "Critical Data Element": "Yes" if row["critical_data_element"] is True else "No" if row["critical_data_element"] is False else "", + "Excluded Data Element": "Yes" if row.get("excluded_data_element") else "No", + "PII": "Yes" if row.get("pii_flag") else "No", } for tag in TAG_FIELDS: header = tag.replace("_", " ").title() @@ -484,6 +530,8 @@ def get_table_group_columns(table_group_id: str) -> list[dict]: table_chars.drop_date AS table_drop_date, column_chars.critical_data_element, table_chars.critical_data_element AS table_critical_data_element, + column_chars.excluded_data_element, + column_chars.pii_flag, {", ".join([ f"column_chars.{tag}" for tag in TAG_FIELDS ])}, {", ".join([ f"table_chars.{tag} AS table_{tag}" for tag in TAG_FIELDS ])} FROM data_column_chars column_chars @@ -528,6 +576,9 @@ def get_selected_item(selected: str, table_group_id: str) -> dict | None: item["test_suites"] = get_related_test_suites( item["table_group_id"], item["table_name"], item.get("column_name") ) + if not session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns(item["table_group_id"], table_name=item["table_name"]) + mask_profiling_pii(item, pii_columns) return item diff --git a/testgen/ui/views/dialogs/column_history_dialog.py b/testgen/ui/views/dialogs/column_history_dialog.py index 24915163..6d2dc2ec 100644 --- a/testgen/ui/views/dialogs/column_history_dialog.py +++ b/testgen/ui/views/dialogs/column_history_dialog.py @@ -3,10 +3,12 @@ from testgen.common.models import with_database_session from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.pii_masking import get_pii_columns, mask_profiling_pii from testgen.ui.components import widgets as testgen from 
testgen.ui.components.widgets import testgen_component from testgen.ui.queries.profiling_queries import COLUMN_PROFILING_FIELDS from testgen.ui.services.database_service import fetch_one_from_db +from testgen.ui.session import session from testgen.utils import make_json_safe @@ -40,6 +42,10 @@ def _column_history_dialog( run_id = st.session_state.get("column_history_dialog:run_id") or profiling_runs[0]["id"] selected_item = get_run_column(run_id, schema_name, table_name, column_name) + if selected_item and not session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns(table_group_id, table_name=table_name) + mask_profiling_pii(selected_item, pii_columns) + testgen_component( "column_profiling_history", props={ diff --git a/testgen/ui/views/dialogs/data_preview_dialog.py b/testgen/ui/views/dialogs/data_preview_dialog.py index c72b2223..d57d78f3 100644 --- a/testgen/ui/views/dialogs/data_preview_dialog.py +++ b/testgen/ui/views/dialogs/data_preview_dialog.py @@ -3,8 +3,10 @@ from testgen.common.database.database_service import get_flavor_service from testgen.common.models.connection import Connection +from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.ui.components import widgets as testgen from testgen.ui.services.database_service import fetch_from_target_db +from testgen.ui.session import session from testgen.utils import to_dataframe @@ -26,6 +28,10 @@ def data_preview_dialog( with st.spinner("Loading data ..."): data = get_preview_data(table_group_id, schema_name, table_name, column_name) + if not data.empty and not session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns(table_group_id, schema_name, table_name) + mask_dataframe_pii(data, pii_columns) + if data.empty: st.warning("The preview data could not be loaded.") else: diff --git a/testgen/ui/views/dialogs/import_metadata_dialog.py b/testgen/ui/views/dialogs/import_metadata_dialog.py index aa639745..36eeb345 100644 --- 
a/testgen/ui/views/dialogs/import_metadata_dialog.py +++ b/testgen/ui/views/dialogs/import_metadata_dialog.py @@ -8,11 +8,12 @@ import streamlit as st from testgen.common.models import with_database_session +from testgen.common.models.table_group import TableGroup from testgen.ui.components.widgets.testgen_component import testgen_component from testgen.ui.queries.profiling_queries import TAG_FIELDS from testgen.ui.services.database_service import execute_db_query, fetch_all_from_db from testgen.ui.services.rerun_service import safe_rerun -from testgen.ui.session import temp_value +from testgen.ui.session import session, temp_value LOG = logging.getLogger("testgen") @@ -22,6 +23,11 @@ "description": "description", "critical data element": "critical_data_element", "cde": "critical_data_element", + "excluded": "excluded_data_element", + "excluded data element": "excluded_data_element", + "xde": "excluded_data_element", + "pii": "pii_flag", + "pii flag": "pii_flag", "data source": "data_source", "source system": "source_system", "source process": "source_process", @@ -29,13 +35,13 @@ "stakeholder group": "stakeholder_group", "transform level": "transform_level", "aggregation level": "aggregation_level", - "data product": "data_product", + "data product": "data_product", } -METADATA_COLUMNS = ["description", "critical_data_element", *TAG_FIELDS] +METADATA_COLUMNS = ["description", "critical_data_element", "excluded_data_element", "pii_flag", *TAG_FIELDS] -CDE_TRUE_VALUES = {"yes", "y", "true", "1"} -CDE_FALSE_VALUES = {"no", "n", "false", "0"} +TRUE_VALUES = {"yes", "y", "true", "1"} +FALSE_VALUES = {"no", "n", "false", "0"} TAG_MAX_LENGTH = 40 DESCRIPTION_MAX_LENGTH = 1000 @@ -147,13 +153,13 @@ def _match_and_validate( preview_rows.append(preview_row) continue - fields, bad_cde = _extract_metadata_fields(row, blank_behavior) + fields, bad_cde, bad_xde, bad_pii = _extract_metadata_fields(row, blank_behavior) fields, truncated = _truncate_fields(fields) - if fields and 
not bad_cde: + if fields and not bad_cde and not bad_xde and not bad_pii: table_rows.append({"table_id": table_id, "table_name": table_name, **fields}) preview_row.update(fields) - _set_row_status(preview_row, bad_cde, truncated) + _set_row_status(preview_row, bad_cde, bad_xde, bad_pii, truncated) preview_rows.append(preview_row) else: column_id = column_lookup.get((table_name, column_name)) @@ -165,20 +171,26 @@ def _match_and_validate( preview_rows.append(preview_row) continue - fields, bad_cde = _extract_metadata_fields(row, blank_behavior) + fields, bad_cde, bad_xde, bad_pii = _extract_metadata_fields(row, blank_behavior) fields, truncated = _truncate_fields(fields) - if fields and not bad_cde: + if fields and not bad_cde and not bad_xde and not bad_pii: column_rows.append( {"column_id": column_id, "table_name": table_name, "column_name": column_name, **fields} ) preview_row.update(fields) - _set_row_status(preview_row, bad_cde, truncated) + _set_row_status(preview_row, bad_cde, bad_xde, bad_pii, truncated) preview_rows.append(preview_row) # Determine which metadata columns are present in the CSV metadata_columns = [c for c in METADATA_COLUMNS if c in df.columns] + # Strip PII column if user lacks permission + pii_skipped = False + if "pii_flag" in metadata_columns and not session.auth.user_has_permission("view_pii"): + metadata_columns.remove("pii_flag") + pii_skipped = True + # Count matched vs skipped rows from preview # "ok" and "warning" rows will be imported; "error" and "unmatched" rows are skipped _importable = {"ok", "warning"} @@ -186,6 +198,8 @@ def _match_and_validate( matched_columns = sum(1 for r in preview_rows if r.get("column_name") and r.get("_status") in _importable) skipped = sum(1 for r in preview_rows if r.get("_status") not in _importable) + table_group = TableGroup.get(table_group_id) + return { "table_rows": table_rows, "column_rows": column_rows, @@ -195,12 +209,17 @@ def _match_and_validate( "matched_tables": matched_tables, 
"matched_columns": matched_columns, "skipped_count": skipped, + "warn_cde": bool("critical_data_element" in metadata_columns and table_group.profile_flag_cdes), + "warn_pii": bool("pii_flag" in metadata_columns and table_group.profile_flag_pii), + "pii_skipped": pii_skipped, } -def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, bool]: +def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, bool, bool, bool]: fields = {} bad_cde = False + bad_xde = False + bad_pii = False for col in METADATA_COLUMNS: if col not in row.index: continue @@ -208,9 +227,9 @@ def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, value = row[col] if col == "critical_data_element": - if value.lower() in CDE_TRUE_VALUES: + if value.lower() in TRUE_VALUES: fields[col] = True - elif value.lower() in CDE_FALSE_VALUES: + elif value.lower() in FALSE_VALUES: fields[col] = False elif not value: if blank_behavior == "clear": @@ -219,6 +238,26 @@ def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, else: # Unrecognized value — skip (don't set field at all) bad_cde = True + elif col == "excluded_data_element": + if value.lower() in TRUE_VALUES: + fields[col] = True + elif value.lower() in FALSE_VALUES: + fields[col] = False + elif not value: + if blank_behavior == "clear": + fields[col] = False + else: + bad_xde = True + elif col == "pii_flag": + if value.lower() in TRUE_VALUES: + fields[col] = "MANUAL" + elif value.lower() in FALSE_VALUES: + fields[col] = None + elif not value: + if blank_behavior == "clear": + fields[col] = None + else: + bad_pii = True else: if value: fields[col] = value @@ -226,7 +265,7 @@ def _extract_metadata_fields(row: pd.Series, blank_behavior: str) -> tuple[dict, fields[col] = "" # "keep" with blank value → skip this field - return fields, bad_cde + return fields, bad_cde, bad_xde, bad_pii def _truncate_fields(fields: dict) -> tuple[dict, list[str]]: @@ -241,14 
+280,18 @@ def _truncate_fields(fields: dict) -> tuple[dict, list[str]]: return fields, truncated -def _set_row_status(preview_row: dict, bad_cde: bool, truncated: list[str]) -> None: +def _set_row_status(preview_row: dict, bad_cde: bool, bad_xde: bool, bad_pii: bool, truncated: list[str]) -> None: issues = [] if bad_cde: issues.append("Unrecognized CDE value (expected Yes/No) — skipped") + if bad_xde: + issues.append("Unrecognized XDE value (expected Yes/No) — skipped") + if bad_pii: + issues.append("Unrecognized PII value (expected Yes/No) — skipped") if truncated: issues.append(f"Values truncated: {', '.join(truncated)}") - if bad_cde: + if bad_cde or bad_xde or bad_pii: preview_row["_status"] = "error" elif truncated: preview_row["_status"] = "warning" @@ -258,12 +301,12 @@ def _set_row_status(preview_row: dict, bad_cde: bool, truncated: list[str]) -> N preview_row["_truncated_fields"] = truncated -def apply_metadata_import(preview: dict) -> dict: +def apply_metadata_import(preview: dict, table_group_id: str | None = None) -> dict: table_count = 0 column_count = 0 for row in preview.get("table_rows", []): - set_clauses, params = _build_update_params(row, preview["metadata_columns"]) + set_clauses, params = _build_update_params(row, preview["metadata_columns"], is_column=False) if not set_clauses: continue params["table_id"] = row["table_id"] @@ -274,7 +317,7 @@ def apply_metadata_import(preview: dict) -> dict: table_count += 1 for row in preview.get("column_rows", []): - set_clauses, params = _build_update_params(row, preview["metadata_columns"]) + set_clauses, params = _build_update_params(row, preview["metadata_columns"], is_column=True) if not set_clauses: continue params["column_id"] = row["column_id"] @@ -284,10 +327,26 @@ def apply_metadata_import(preview: dict) -> dict: ) column_count += 1 + if table_group_id: + _disable_autoflags(table_group_id, preview.get("metadata_columns", [])) + return {"table_count": table_count, "column_count": column_count} -def 
_build_update_params(row: dict, metadata_columns: list[str]) -> tuple[list[str], dict]: +def _disable_autoflags(table_group_id: str, metadata_columns: list[str]) -> None: + table_group = TableGroup.get(table_group_id) + changed = False + if "critical_data_element" in metadata_columns and table_group.profile_flag_cdes: + table_group.profile_flag_cdes = False + changed = True + if "pii_flag" in metadata_columns and table_group.profile_flag_pii: + table_group.profile_flag_pii = False + changed = True + if changed: + table_group.save() + + +def _build_update_params(row: dict, metadata_columns: list[str], is_column: bool = False) -> tuple[list[str], dict]: set_clauses = [] params = {} @@ -299,6 +358,15 @@ def _build_update_params(row: dict, metadata_columns: list[str]) -> tuple[list[s if col == "critical_data_element": set_clauses.append("critical_data_element = :critical_data_element") params["critical_data_element"] = value + elif col == "excluded_data_element": + if is_column: + set_clauses.append("excluded_data_element = :excluded_data_element") + params["excluded_data_element"] = value + elif col == "pii_flag": + # Prevent user from editing PII flag if they cannot view PII + if is_column and session.auth.user_has_permission("view_pii"): + set_clauses.append("pii_flag = :pii_flag") + params["pii_flag"] = value else: set_clauses.append(f"{col} = NULLIF(:{col}, '')") params[col] = value if value is not None else "" @@ -335,7 +403,7 @@ def on_file_cleared(_payload: dict) -> None: result = None if should_import() and preview and not preview.get("error"): try: - apply_metadata_import(preview) + apply_metadata_import(preview, table_group_id) # Clear caches from testgen.ui.queries.profiling_queries import get_column_by_id, get_table_by_id @@ -406,9 +474,12 @@ def _build_preview_props(preview: dict) -> dict: for col in metadata_columns: if col in row: val = row[col] - formatted_row[col] = ( - "Yes" if val is True else "No" if val is False else ("" if val is None else 
str(val)) - ) + if col in ["excluded_data_element", "pii_flag"]: + formatted_row[col] = "Yes" if val else "No" + else: + formatted_row[col] = ( + "Yes" if val is True else "No" if val is False else ("" if val is None else str(val)) + ) formatted_rows.append(formatted_row) return { @@ -417,4 +488,7 @@ def _build_preview_props(preview: dict) -> dict: "skipped_count": preview.get("skipped_count", 0), "metadata_columns": metadata_columns, "preview_rows": formatted_rows, + "warn_cde": preview.get("warn_cde", False), + "warn_pii": preview.get("warn_pii", False), + "pii_skipped": preview.get("pii_skipped", False), } diff --git a/testgen/ui/views/dialogs/profiling_results_dialog.py b/testgen/ui/views/dialogs/profiling_results_dialog.py index 3b824a5b..f8907db3 100644 --- a/testgen/ui/views/dialogs/profiling_results_dialog.py +++ b/testgen/ui/views/dialogs/profiling_results_dialog.py @@ -4,7 +4,9 @@ import testgen.ui.queries.profiling_queries as profiling_queries from testgen.common.models import with_database_session +from testgen.common.pii_masking import get_pii_columns, mask_profiling_pii from testgen.ui.components.widgets.testgen_component import testgen_component +from testgen.ui.session import session from testgen.utils import make_json_safe @@ -24,6 +26,10 @@ def profiling_results_dialog(column_name: str, table_name: str, table_groups_id: column = profiling_queries.get_column_by_name(column_name, table_name, table_groups_id) if column: + if not session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns(table_groups_id, table_name=table_name) + mask_profiling_pii(column, pii_columns) + testgen_component( "column_profiling_results", props={ "column": json.dumps(make_json_safe(column)) }, diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 8ac10dda..47a85f90 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -11,6 +11,7 @@ from testgen.common.mixpanel_service import 
MixpanelService from testgen.common.models import with_database_session from testgen.common.models.hygiene_issue import HygieneIssue +from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.common.models.profiling_run import ProfilingRun from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( @@ -478,6 +479,12 @@ def source_data_dialog(selected_row): else: if bad_data_msg: st.info(bad_data_msg) + if not session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns( + selected_row["table_groups_id"], + table_name=selected_row["table_name"], + ) + mask_dataframe_pii(df_bad, pii_columns) # Pretify the dataframe df_bad.columns = [col.replace("_", " ").title() for col in df_bad.columns] df_bad.fillna("", inplace=True) @@ -511,7 +518,7 @@ def get_report_file_data(update_progress, tr_data) -> FILE_DATA_TYPE: file_name = f"testgen_hygiene_issue_report_{hi_id}_{profiling_time}.pdf" with BytesIO() as buffer: - create_report(buffer, tr_data) + create_report(buffer, tr_data, mask_pii=not session.auth.user_has_permission("view_pii")) update_progress(1.0) buffer.seek(0) return file_name, "application/pdf", buffer.read() diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index cabedd33..f0af0b9e 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -10,6 +10,7 @@ from testgen.common import date_service from testgen.common.models import with_database_session from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.pii_masking import PII_REDACTED, get_pii_columns, mask_profiling_pii from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( FILE_DATA_TYPE, @@ -130,6 +131,10 @@ def render(self, run_id: str, table_name: str | None = None, column_name: str | sorting_columns=sorting_columns, ) + if not 
session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns(str(run.table_groups_id)) + mask_profiling_pii(df, pii_columns) + selected, selected_row = fm.render_grid_select( df, ["table_name", "column_name", "db_data_type", "semantic_data_type", "hygiene_issues", "result_details"], @@ -197,6 +202,10 @@ def get_excel_report_data( data = profiling_queries.get_profiling_results(run_id) date_service.accommodate_dataframe_to_timezone(data, st.session_state) + if not session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns(data["table_group_id"].iloc[0] if "table_group_id" in data.columns else "") + mask_profiling_pii(data, pii_columns) + for key in ["datatype_suggestion"]: data[key] = data[key].apply(lambda val: val.lower() if not pd.isna(val) else None) @@ -205,7 +214,7 @@ def get_excel_report_data( for key in ["min_date", "max_date"]: data[key] = data[key].apply( - lambda val: parse_fuzzy_date(val) if not pd.isna(val) and val != "NaT" else None + lambda val: parse_fuzzy_date(val) if not pd.isna(val) and val != "NaT" and val != PII_REDACTED else val ) data["hygiene_issues"] = data["hygiene_issues"].apply(lambda val: "Yes" if val else None) @@ -215,13 +224,13 @@ def get_excel_report_data( data["top_freq_values"] = data["top_freq_values"].apply( lambda val: "\n".join([ f"{part.split(" | ")[1]} | {part.split(" | ")[0]}" for part in val[2:].split("\n| ") ]) - if val - else None + if val and val != PII_REDACTED + else val ) data["top_patterns"] = data["top_patterns"].apply( lambda val: "".join([ f"{part}{'\n' if index % 2 else ' | '}" for index, part in enumerate(val.split(" | ")) ]) - if val - else None + if val and val != PII_REDACTED + else val ) columns = { diff --git a/testgen/ui/views/score_details.py b/testgen/ui/views/score_details.py index 629bffe9..ed9a3860 100644 --- a/testgen/ui/views/score_details.py +++ b/testgen/ui/views/score_details.py @@ -181,11 +181,13 @@ def get_report_file_data(update_progress, issue) -> 
FILE_DATA_TYPE: if issue["issue_type"] == "hygiene": issue_id = issue["id"][:8] timestamp = pd.Timestamp(issue["profiling_starttime"]).strftime("%Y%m%d_%H%M%S") - hygiene_issue_report.create_report(buffer, issue) + mask_pii = not session.auth.user_has_permission("view_pii") + hygiene_issue_report.create_report(buffer, issue, mask_pii=mask_pii) else: issue_id = issue["test_result_id"][:8] timestamp = pd.Timestamp(issue["test_date"]).strftime("%Y%m%d_%H%M%S") - test_result_report.create_report(buffer, issue) + mask_pii = not session.auth.user_has_permission("view_pii") + test_result_report.create_report(buffer, issue, mask_pii=mask_pii) update_progress(1.0) buffer.seek(0) diff --git a/testgen/ui/views/score_explorer.py b/testgen/ui/views/score_explorer.py index 64e004b0..3d8206a7 100644 --- a/testgen/ui/views/score_explorer.py +++ b/testgen/ui/views/score_explorer.py @@ -244,11 +244,13 @@ def get_report_file_data(update_progress, issue) -> FILE_DATA_TYPE: if issue["issue_type"] == "hygiene": issue_id = issue["id"][:8] timestamp = pd.Timestamp(issue["profiling_starttime"]).strftime("%Y%m%d_%H%M%S") - hygiene_issue_report.create_report(buffer, issue) + mask_pii = not session.auth.user_has_permission("view_pii") + hygiene_issue_report.create_report(buffer, issue, mask_pii=mask_pii) else: issue_id = issue["test_result_id"][:8] timestamp = pd.Timestamp(issue["test_date"]).strftime("%Y%m%d_%H%M%S") - test_result_report.create_report(buffer, issue) + mask_pii = not session.auth.user_has_permission("view_pii") + test_result_report.create_report(buffer, issue, mask_pii=mask_pii) update_progress(1.0) buffer.seek(0) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index bafe46a6..aa86f081 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -20,6 +20,7 @@ from testgen.common.models.test_definition import TestDefinition from testgen.common.models.test_run import TestRun from testgen.common.models.test_suite 
import TestSuite, TestSuiteMinimal +from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( FILE_DATA_TYPE, @@ -893,6 +894,12 @@ def source_data_dialog(selected_row): else: if bad_data_msg: st.info(bad_data_msg) + if not session.auth.user_has_permission("view_pii"): + pii_columns = get_pii_columns( + selected_row["table_groups_id"], + table_name=selected_row["table_name"], + ) + mask_dataframe_pii(df_bad, pii_columns) # Pretify the dataframe df_bad.columns = [col.replace("_", " ").title() for col in df_bad.columns] df_bad.fillna("", inplace=True) @@ -916,7 +923,7 @@ def get_report_file_data(update_progress, tr_data) -> FILE_DATA_TYPE: file_name = f"testgen_test_issue_report_{tr_id}_{tr_time}.pdf" with BytesIO() as buffer: - create_report(buffer, tr_data) + create_report(buffer, tr_data, mask_pii=not session.auth.user_has_permission("view_pii")) update_progress(1.0) buffer.seek(0) return file_name, "application/pdf", buffer.read() diff --git a/tests/unit/common/test_pii_masking.py b/tests/unit/common/test_pii_masking.py new file mode 100644 index 00000000..eb0e4b77 --- /dev/null +++ b/tests/unit/common/test_pii_masking.py @@ -0,0 +1,191 @@ +import pandas as pd + +from testgen.common.pii_masking import PII_REDACTED, mask_dataframe_pii, mask_profiling_pii + + +class Test_mask_dataframe_pii: + def test_masks_pii_columns(self): + df = pd.DataFrame({ + "name": ["Alice", "Bob"], + "ssn": ["123-45-6789", "987-65-4321"], + "age": [30, 25], + }) + mask_dataframe_pii(df, {"ssn"}) + assert df["ssn"].tolist() == [PII_REDACTED, PII_REDACTED] + assert df["age"].tolist() == [30, 25] + assert df["name"].tolist() == ["Alice", "Bob"] + + def test_preserves_non_pii_columns(self): + df = pd.DataFrame({"col_a": [1, 2], "col_b": ["x", "y"]}) + mask_dataframe_pii(df, {"col_a"}) + assert df["col_b"].tolist() == ["x", "y"] + + def 
test_handles_empty_dataframe(self): + df = pd.DataFrame(columns=["name", "ssn"]) + mask_dataframe_pii(df, {"ssn"}) + assert df.empty + + def test_handles_missing_pii_column(self): + df = pd.DataFrame({"col_a": [1, 2]}) + mask_dataframe_pii(df, {"nonexistent_col"}) + assert df["col_a"].tolist() == [1, 2] + + def test_handles_empty_pii_set(self): + df = pd.DataFrame({"col_a": [1, 2]}) + mask_dataframe_pii(df, set()) + assert df["col_a"].tolist() == [1, 2] + + def test_case_insensitive_matching(self): + df = pd.DataFrame({"SSN": ["123-45-6789"], "Name": ["Alice"]}) + mask_dataframe_pii(df, {"ssn"}) + assert df["SSN"].tolist() == [PII_REDACTED] + assert df["Name"].tolist() == ["Alice"] + + def test_multiple_pii_columns(self): + df = pd.DataFrame({ + "name": ["Alice"], + "ssn": ["123"], + "email": ["a@b.com"], + "age": [30], + }) + mask_dataframe_pii(df, {"ssn", "email"}) + assert df["ssn"].tolist() == [PII_REDACTED] + assert df["email"].tolist() == [PII_REDACTED] + assert df["name"].tolist() == ["Alice"] + assert df["age"].tolist() == [30] + + +class Test_mask_profiling_pii: + def _make_profiling_df(self): + return pd.DataFrame({ + "column_name": ["ssn", "age", "email"], + "top_freq_values": ["123|456", "30|25", "a@b|c@d"], + "min_text": ["000", "20", "a@a"], + "max_text": ["999", "40", "z@z"], + "min_value": [0, 20, None], + "max_value": [999, 40, None], + }) + + def test_masks_pii_profiling_fields(self): + df = self._make_profiling_df() + mask_profiling_pii(df, {"ssn", "email"}) + + ssn_row = df[df["column_name"] == "ssn"].iloc[0] + assert ssn_row["top_freq_values"] == PII_REDACTED + assert ssn_row["min_text"] == PII_REDACTED + assert ssn_row["max_text"] == PII_REDACTED + assert ssn_row["min_value"] == PII_REDACTED + assert ssn_row["max_value"] == PII_REDACTED + + email_row = df[df["column_name"] == "email"].iloc[0] + assert email_row["top_freq_values"] == PII_REDACTED + + def test_preserves_non_pii_rows(self): + df = self._make_profiling_df() + 
mask_profiling_pii(df, {"ssn"}) + + age_row = df[df["column_name"] == "age"].iloc[0] + assert age_row["top_freq_values"] == "30|25" + assert age_row["min_text"] == "20" + assert age_row["max_text"] == "40" + + def test_handles_empty_dataframe(self): + df = pd.DataFrame(columns=["column_name", "top_freq_values"]) + mask_profiling_pii(df, {"ssn"}) + assert df.empty + + def test_handles_empty_pii_set(self): + df = self._make_profiling_df() + original_values = df["top_freq_values"].tolist() + mask_profiling_pii(df, set()) + assert df["top_freq_values"].tolist() == original_values + + def test_handles_missing_fields(self): + df = pd.DataFrame({ + "column_name": ["ssn", "age"], + "top_freq_values": ["123", "30"], + }) + mask_profiling_pii(df, {"ssn"}) + assert df.loc[0, "top_freq_values"] == PII_REDACTED + assert df.loc[1, "top_freq_values"] == "30" + + def test_case_insensitive_column_name_matching(self): + df = pd.DataFrame({ + "column_name": ["SSN", "age"], + "top_freq_values": ["123", "30"], + "min_text": ["000", "20"], + }) + mask_profiling_pii(df, {"ssn"}) + assert df.loc[0, "top_freq_values"] == PII_REDACTED + assert df.loc[0, "min_text"] == PII_REDACTED + assert df.loc[1, "top_freq_values"] == "30" + + +class Test_mask_profiling_pii_dict: + def test_masks_fields_when_column_is_pii(self): + data = { + "column_name": "ssn", + "top_freq_values": "123|456", + "min_text": "000", + "max_text": "999", + "min_value": 0, + "max_value": 999, + "min_value_over_0": 1, + "min_date": "2024-01-01", + "max_date": "2024-12-31", + } + mask_profiling_pii(data, {"ssn"}) + assert data["top_freq_values"] == PII_REDACTED + assert data["min_text"] == PII_REDACTED + assert data["max_text"] == PII_REDACTED + assert data["min_value"] == PII_REDACTED + assert data["max_value"] == PII_REDACTED + assert data["min_value_over_0"] == PII_REDACTED + assert data["min_date"] == PII_REDACTED + assert data["max_date"] == PII_REDACTED + + def test_preserves_non_pii_column(self): + data = { + 
"column_name": "age", + "top_freq_values": "30|25", + "min_text": "20", + "max_text": "40", + } + mask_profiling_pii(data, {"ssn"}) + assert data["top_freq_values"] == "30|25" + assert data["min_text"] == "20" + assert data["max_text"] == "40" + + def test_case_insensitive_matching(self): + data = {"column_name": "SSN", "min_text": "000"} + mask_profiling_pii(data, {"ssn"}) + assert data["min_text"] == PII_REDACTED + + def test_empty_pii_set_skips_masking(self): + data = {"column_name": "ssn", "min_text": "000"} + mask_profiling_pii(data, set()) + assert data["min_text"] == "000" + + def test_missing_fields_handled(self): + data = {"column_name": "ssn", "min_text": "000"} + mask_profiling_pii(data, {"ssn"}) + assert data["min_text"] == PII_REDACTED + assert "top_freq_values" not in data + + def test_no_column_name_masks_unconditionally(self): + data = {"top_freq_values": "123|456", "min_text": "000"} + mask_profiling_pii(data, {"ssn"}) + assert data["top_freq_values"] == PII_REDACTED + assert data["min_text"] == PII_REDACTED + + def test_preserves_non_profiling_fields(self): + data = { + "column_name": "ssn", + "top_freq_values": "123", + "record_ct": 100, + "distinct_value_ct": 50, + } + mask_profiling_pii(data, {"ssn"}) + assert data["top_freq_values"] == PII_REDACTED + assert data["record_ct"] == 100 + assert data["distinct_value_ct"] == 50 diff --git a/tests/unit/ui/test_import_metadata.py b/tests/unit/ui/test_import_metadata.py index 75c85f65..037f9278 100644 --- a/tests/unit/ui/test_import_metadata.py +++ b/tests/unit/ui/test_import_metadata.py @@ -126,58 +126,58 @@ def test_parse_csv_no_column_header_adds_empty(): @pytest.mark.parametrize("val", ["Yes", "yes", "Y", "y", "True", "true", "1"]) def test_extract_cde_true_values(val): - fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": val}), "keep") + fields, bad_cde, *_ = _extract_metadata_fields(_make_series({"critical_data_element": val}), "keep") assert 
fields["critical_data_element"] is True assert bad_cde == 0 @pytest.mark.parametrize("val", ["No", "no", "N", "n", "False", "false", "0"]) def test_extract_cde_false_values(val): - fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": val}), "keep") + fields, bad_cde, *_ = _extract_metadata_fields(_make_series({"critical_data_element": val}), "keep") assert fields["critical_data_element"] is False assert bad_cde == 0 def test_extract_cde_blank_keep(): - fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": ""}), "keep") + fields, bad_cde, *_ = _extract_metadata_fields(_make_series({"critical_data_element": ""}), "keep") assert "critical_data_element" not in fields assert bad_cde == 0 def test_extract_cde_blank_clear(): - fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": ""}), "clear") + fields, bad_cde, *_ = _extract_metadata_fields(_make_series({"critical_data_element": ""}), "clear") assert fields["critical_data_element"] is None assert bad_cde == 0 def test_extract_cde_unrecognized(): - fields, bad_cde = _extract_metadata_fields(_make_series({"critical_data_element": "Maybe"}), "keep") + fields, bad_cde, *_ = _extract_metadata_fields(_make_series({"critical_data_element": "Maybe"}), "keep") assert "critical_data_element" not in fields assert bad_cde == 1 def test_extract_text_field_with_value(): - fields, _ = _extract_metadata_fields(_make_series({"description": "test desc"}), "keep") + fields, *_ = _extract_metadata_fields(_make_series({"description": "test desc"}), "keep") assert fields["description"] == "test desc" def test_extract_text_field_blank_keep(): - fields, _ = _extract_metadata_fields(_make_series({"description": ""}), "keep") + fields, *_ = _extract_metadata_fields(_make_series({"description": ""}), "keep") assert "description" not in fields def test_extract_text_field_blank_clear(): - fields, _ = _extract_metadata_fields(_make_series({"description": ""}), 
"clear") + fields, *_ = _extract_metadata_fields(_make_series({"description": ""}), "clear") assert fields["description"] == "" def test_extract_missing_column_skipped(): - fields, _ = _extract_metadata_fields(_make_series({"description": "test"}), "keep") + fields, *_ = _extract_metadata_fields(_make_series({"description": "test"}), "keep") assert "data_source" not in fields def test_extract_tag_field_with_value(): - fields, _ = _extract_metadata_fields(_make_series({"data_source": "ERP"}), "keep") + fields, *_ = _extract_metadata_fields(_make_series({"data_source": "ERP"}), "keep") assert fields["data_source"] == "ERP" @@ -224,7 +224,7 @@ def test_truncate_multiple_fields(): def test_set_row_status_ok(): row = {} - _set_row_status(row, bad_cde=0, truncated=[]) + _set_row_status(row, bad_cde=False, bad_xde=False, bad_pii=False, truncated=[]) assert row["_status"] == "ok" assert row["_status_detail"] == "" assert row["_truncated_fields"] == [] @@ -232,14 +232,14 @@ def test_set_row_status_ok(): def test_set_row_status_error_bad_cde(): row = {} - _set_row_status(row, bad_cde=1, truncated=[]) + _set_row_status(row, bad_cde=True, bad_xde=False, bad_pii=False, truncated=[]) assert row["_status"] == "error" assert "Unrecognized CDE" in row["_status_detail"] def test_set_row_status_warning_truncated(): row = {} - _set_row_status(row, bad_cde=0, truncated=["data_source"]) + _set_row_status(row, bad_cde=False, bad_xde=False, bad_pii=False, truncated=["data_source"]) assert row["_status"] == "warning" assert "truncated" in row["_status_detail"] assert "data_source" in row["_status_detail"] @@ -247,7 +247,7 @@ def test_set_row_status_warning_truncated(): def test_set_row_status_error_precedence(): row = {} - _set_row_status(row, bad_cde=1, truncated=["data_source"]) + _set_row_status(row, bad_cde=True, bad_xde=False, bad_pii=False, truncated=["data_source"]) assert row["_status"] == "error" assert "CDE" in row["_status_detail"] assert "truncated" in row["_status_detail"] 
From 31c0361af34e6db24f754c26106946094d64dd2f Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 24 Mar 2026 02:50:41 -0400 Subject: [PATCH 68/95] fix(monitor): use excluded days from schedule if active --- .../commands/queries/execute_tests_query.py | 13 ++- .../commands/test_thresholds_prediction.py | 9 +- tests/unit/common/test_freshness_service.py | 107 ++++++++++++++++++ 3 files changed, 123 insertions(+), 6 deletions(-) diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index 539a6dd0..4902cf98 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -326,16 +326,21 @@ def _get_params(self, test_def: TestExecutionDef | None = None) -> dict: # Freshness exclusion params — computed per test at execution time if test_def.test_type == "Freshness_Trend" and test_def.baseline_sum: sched = get_schedule_params(test_def.prediction) - has_exclusions = self._exclude_weekends or sched.excluded_days or sched.window_start is not None + # Once the schedule is active (excluded_days derived from active_days), + # it supersedes exclude_weekends as the single source of truth for + # day exclusion — avoids conflicts where a detection day (e.g. Saturday) + # is active per schedule but excluded per exclude_weekends. 
+ effective_exclude_weekends = False if sched.excluded_days else self._exclude_weekends + has_exclusions = effective_exclude_weekends or sched.excluded_days or sched.window_start is not None if has_exclusions: last_update = pd.Timestamp(test_def.baseline_sum) - excluded = int(count_excluded_minutes( - last_update, self.run_date, self._exclude_weekends, self._holiday_dates, + excluded = round(count_excluded_minutes( + last_update, self.run_date, effective_exclude_weekends, self._holiday_dates, tz=self._schedule_tz, excluded_days=sched.excluded_days, window_start=sched.window_start, window_end=sched.window_end, )) is_excl = 1 if is_excluded_day( - pd.Timestamp(self.run_date), self._exclude_weekends, self._holiday_dates, + pd.Timestamp(self.run_date), effective_exclude_weekends, self._holiday_dates, tz=self._schedule_tz, excluded_days=sched.excluded_days, window_start=sched.window_start, window_end=sched.window_end, ) else 0 diff --git a/testgen/commands/test_thresholds_prediction.py b/testgen/commands/test_thresholds_prediction.py index ca7b679b..7f6617ee 100644 --- a/testgen/commands/test_thresholds_prediction.py +++ b/testgen/commands/test_thresholds_prediction.py @@ -213,6 +213,11 @@ def compute_freshness_threshold( if schedule.stage == "active": excluded_days = frozenset(range(7)) - schedule.active_days if schedule.active_days else None + # Once the schedule is active, excluded_days is the single source of truth + # for day exclusion — it supersedes exclude_weekends, which was the user's + # initial hint before enough data was available for schedule inference. 
+ schedule_exclude_weekends = False if excluded_days else exclude_weekends + # For sub-daily schedules, apply window exclusion for overnight gaps has_window = ( schedule.frequency == "sub_daily" @@ -228,7 +233,7 @@ def compute_freshness_threshold( upper_percentile=upper_percentile, floor_multiplier=floor_multiplier, lower_percentile=lower_percentile, - exclude_weekends=exclude_weekends, + exclude_weekends=schedule_exclude_weekends, holiday_codes=holiday_codes, tz=schedule_tz, staleness_factor=staleness_factor, @@ -246,7 +251,7 @@ def compute_freshness_threshold( holiday_dates = resolve_holiday_dates(holiday_codes, history.index) if holiday_codes else None schedule_upper = minutes_to_next_deadline( result.last_update, schedule, - exclude_weekends, holiday_dates, schedule_tz, + schedule_exclude_weekends, holiday_dates, schedule_tz, deadline_buffer, excluded_days=excluded_days, ) if schedule_upper is not None: diff --git a/tests/unit/common/test_freshness_service.py b/tests/unit/common/test_freshness_service.py index 6021e8e2..f8317413 100644 --- a/tests/unit/common/test_freshness_service.py +++ b/tests/unit/common/test_freshness_service.py @@ -904,6 +904,113 @@ def test_with_excluded_days(self): assert 1700 <= result <= 1800 +class Test_WeekendScheduleInteraction: + """Test that schedule-inferred active_days supersedes exclude_weekends. + + Scenario: Tables update Mon-Fri evenings, monitor runs at midnight & noon. + Schedule inference detects active_days = Tue-Sat (detection days, shifted + by one from update days). excluded_days = {Mon, Sun}. + + Once the schedule is active, excluded_days should be the single source + of truth — Saturday (a detection day) should NOT be treated as excluded. 
+ """ + + def _make_midnight_detection_schedule(self): + """Schedule for tables that update on weekday evenings and are detected + the following morning at midnight: Tue-Sat active, midnight window.""" + return _make_schedule( + active_days=frozenset({1, 2, 3, 4, 5}), # Tue-Sat + window_start=0.0, + window_end=0.0, + ) + + def test_saturday_check_update_detected_passes(self): + """Friday update detected at Saturday midnight check should pass. + + With the fix: exclude_weekends=False when excluded_days is present, + so Saturday is NOT excluded. Deadline lands on Saturday (next active day + after Friday), giving upper ~1620 min — well above the ~1440 min gap. + """ + schedule = self._make_midnight_detection_schedule() + excluded_days = frozenset({0, 6}) # Mon, Sun + zi = zoneinfo.ZoneInfo(TZ) + # Friday midnight ET = 05:00 UTC (EST) + last_update = pd.Timestamp("2026-02-06T05:00", tz="UTC").tz_localize(None) + + upper = minutes_to_next_deadline( + last_update, schedule, + exclude_weekends=False, # The fix: schedule supersedes this + holiday_dates=None, tz=TZ, buffer_hours=3.0, + excluded_days=excluded_days, + ) + assert upper is not None + # ~1620 min (Fri midnight to Sat 3AM, no excluded time on Saturday) + assert upper > 1500 + + # The actual gap (Fri midnight to Sat midnight) is ~1440 min + # which should be well within the tolerance + assert 1440 < upper + + def test_saturday_check_is_not_excluded_day(self): + """Saturday should not be IS_EXCLUDED_DAY when schedule says it's active.""" + zi = zoneinfo.ZoneInfo(TZ) + excluded_days = frozenset({0, 6}) # Mon, Sun + # Saturday midnight ET = 05:00 UTC (EST, Feb before DST) + sat_run = pd.Timestamp("2026-02-07T05:00", tz="UTC").tz_localize(None) + + result = is_excluded_day( + sat_run, + exclude_weekends=False, # The fix + holiday_dates=None, + tz=TZ, + excluded_days=excluded_days, + ) + # Saturday (weekday 5) is NOT in excluded_days {0, 6} + assert result is False + + def test_sunday_still_excluded(self): + """Sunday 
should remain excluded (in excluded_days={0, 6}).""" + excluded_days = frozenset({0, 6}) + sun_run = pd.Timestamp("2026-02-08T05:00", tz="UTC").tz_localize(None) + + result = is_excluded_day( + sun_run, + exclude_weekends=False, + holiday_dates=None, + tz=TZ, + excluded_days=excluded_days, + ) + assert result is True + + def test_monday_still_excluded(self): + """Monday should remain excluded (in excluded_days={0, 6}, weekday 0).""" + excluded_days = frozenset({0, 6}) + mon_run = pd.Timestamp("2026-02-09T05:00", tz="UTC").tz_localize(None) + + result = is_excluded_day( + mon_run, + exclude_weekends=False, + holiday_dates=None, + tz=TZ, + excluded_days=excluded_days, + ) + assert result is True + + def test_saturday_excluded_minutes_zero_for_weekday_gap(self): + """No excluded minutes between Friday and Saturday when Saturday is active.""" + excluded_days = frozenset({0, 6}) + fri = pd.Timestamp("2026-02-06T05:00") # Fri midnight ET + sat = pd.Timestamp("2026-02-07T05:00") # Sat midnight ET + + excl = count_excluded_minutes( + fri, sat, + exclude_weekends=False, + holiday_dates=None, + tz=TZ, + excluded_days=excluded_days, + ) + assert excl == 0 + # --------------------------------------------------------------------------- # is_excluded_day with window_start/window_end Tests From 36aa8589e16da05520271c8b9c3a10bc536bd2cf Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 24 Mar 2026 02:56:25 -0400 Subject: [PATCH 69/95] fix: remove summary from edit table group dialog --- testgen/ui/views/table_groups.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 3acf740e..cccaf6e7 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -335,6 +335,9 @@ def on_close_clicked(_params: dict) -> None: message = "Profiling run encountered errors" LOG.exception(message) + if table_group_id and success: + st.rerun() + except IntegrityError: success = False message = "A 
Table Group with the same name already exists." From a326e67763eecf2b8964b8a92cfe71460b0c1193 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 24 Mar 2026 03:18:45 -0400 Subject: [PATCH 70/95] fix(data catalog): add help text --- .../ui/components/frontend/js/data_profiling/metadata_tags.js | 3 +++ testgen/ui/views/hygiene_issues.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js b/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js index 5687479e..cfabba45 100644 --- a/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js +++ b/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js @@ -142,6 +142,7 @@ const MetadataTagsCard = (props, item) => { if (key === 'critical_data_element') { return RadioGroup({ label, + help: 'Mark columns that are important for business decisions or regulatory compliance. CDEs are highlighted in the catalog and can be tracked separately in quality scores.', options: item.type === 'column' ? [...booleanOptions, { label: 'Inherit', value: null }] : booleanOptions, width: attributeWidth, value: state.rawVal, @@ -151,6 +152,7 @@ const MetadataTagsCard = (props, item) => { if (key === 'excluded_data_element') { return RadioGroup({ label, + help: 'Exclude this column from profiling and test generation. The column remains in the catalog but will not be analyzed.', options: booleanOptions, width: attributeWidth, value: state.rawVal, @@ -160,6 +162,7 @@ const MetadataTagsCard = (props, item) => { if (key === 'pii_flag') { return RadioGroup({ label, + help: 'Mark columns containing personally identifiable information. PII values are redacted for users without viewing permissions.', options: piiOptions, width: attributeWidth, value: state.rawVal ? 
'MANUAL' : null, diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 47a85f90..1bc9913b 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -11,8 +11,8 @@ from testgen.common.mixpanel_service import MixpanelService from testgen.common.models import with_database_session from testgen.common.models.hygiene_issue import HygieneIssue -from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.common.models.profiling_run import ProfilingRun +from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( FILE_DATA_TYPE, From 0146128a1d01c70d90e3cad0efa4a556fde57aba Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 24 Mar 2026 16:38:27 -0400 Subject: [PATCH 71/95] fix: updates to pii masking and xdes Address review and discussion feedback --- testgen/commands/run_profiling.py | 3 +- testgen/common/models/data_column.py | 2 + testgen/common/models/table_group.py | 2 + testgen/common/pii_masking.py | 30 ++++- .../030_initialize_new_schema_structure.sql | 3 + ..._anomaly_types_Boolean_Value_Mismatch.yaml | 1 + ...anomaly_types_Delimited_Data_Embedded.yaml | 1 + ...anomaly_types_Non_Alpha_Prefixed_Name.yaml | 1 + ...rofile_anomaly_types_Recency_One_Year.yaml | 1 + ...file_anomaly_types_Recency_Six_Months.yaml | 1 + ...ofile_anomaly_types_Unexpected_Emails.yaml | 1 + ...le_anomaly_types_Unexpected_US_States.yaml | 1 + ...le_anomaly_types_Unlikely_Date_Values.yaml | 1 + .../dbsetup_test_types/test_types_CUSTOM.yaml | 6 +- .../test_types_Distribution_Shift.yaml | 9 ++ .../test_types_LOV_All.yaml | 25 ++-- .../test_types_Recency.yaml | 9 ++ .../dbupgrade/0178_incremental_upgrade.sql | 8 +- .../generation/gen_selection_tests.sql | 15 ++- .../get_score_card_issues_by_column.sql | 10 ++ .../get_score_card_issues_by_dimension.sql | 10 ++ 
.../frontend/js/components/score_issues.js | 2 +- .../js/components/table_group_form.js | 14 +- .../frontend/js/pages/table_group_wizard.js | 6 + testgen/ui/pdf/hygiene_issue_report.py | 8 +- testgen/ui/pdf/test_result_report.py | 15 +-- testgen/ui/queries/profiling_queries.py | 3 +- testgen/ui/queries/scoring_queries.py | 3 + testgen/ui/queries/source_data_queries.py | 83 +++++++++++- .../ui/static/js/components/score_issues.js | 2 +- .../static/js/components/table_group_form.js | 14 +- testgen/ui/views/connections.py | 3 + testgen/ui/views/data_catalog.py | 3 +- .../ui/views/dialogs/data_preview_dialog.py | 4 +- .../views/dialogs/import_metadata_dialog.py | 2 +- testgen/ui/views/hygiene_issues.py | 19 +-- testgen/ui/views/profiling_results.py | 5 +- testgen/ui/views/score_details.py | 18 ++- testgen/ui/views/score_explorer.py | 22 ++- testgen/ui/views/table_groups.py | 5 +- testgen/ui/views/test_definitions.py | 2 +- testgen/ui/views/test_results.py | 12 +- tests/unit/common/test_pii_masking.py | 127 ++++++++++++++++-- 43 files changed, 422 insertions(+), 90 deletions(-) diff --git a/testgen/commands/run_profiling.py b/testgen/commands/run_profiling.py index c5b21059..73f45ce4 100644 --- a/testgen/commands/run_profiling.py +++ b/testgen/commands/run_profiling.py @@ -86,7 +86,8 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d LOG.info(f"Profiling run: {profiling_run.id}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}") try: data_chars = run_data_chars_refresh(connection, table_group, profiling_run.profiling_starttime) - data_chars = _exclude_xde_columns(data_chars, table_group.id) + if table_group.profile_exclude_xde: + data_chars = _exclude_xde_columns(data_chars, table_group.id) distinct_tables = {(column.table_name, column.record_ct) for column in data_chars} profiling_run.set_progress("data_chars", "Completed") diff --git a/testgen/common/models/data_column.py 
b/testgen/common/models/data_column.py index 50122266..7b344a14 100644 --- a/testgen/common/models/data_column.py +++ b/testgen/common/models/data_column.py @@ -17,6 +17,8 @@ class DataColumnChars(Entity): excluded_data_element: bool | None = Column(Boolean, nullable=True) pii_flag: str | None = Column(String(50), nullable=True) + _default_order_by = (id,) + # Unmapped columns: table_id, ordinal_position, general_type, column_type, # db_data_type, functional_data_type, description, critical_data_element, # data_source, source_system, source_process, business_domain, diff --git a/testgen/common/models/table_group.py b/testgen/common/models/table_group.py index 39d81552..724f1ba7 100644 --- a/testgen/common/models/table_group.py +++ b/testgen/common/models/table_group.py @@ -30,6 +30,7 @@ class TableGroupMinimal(EntityMinimal): monitor_test_suite_id: UUID | None profile_flag_cdes: bool profile_flag_pii: bool + profile_exclude_xde: bool last_complete_profile_run_id: UUID | None @@ -115,6 +116,7 @@ class TableGroup(Entity): profiling_delay_days: str = Column(String, default="0") profile_flag_cdes: bool = Column(Boolean, default=True) profile_flag_pii: bool = Column(Boolean, default=True) + profile_exclude_xde: bool = Column(Boolean, default=True) profile_do_pair_rules: bool = Column(YNString, default="N") profile_pair_rule_pct: int = Column(Integer, default=95) include_in_dashboard: bool = Column(Boolean, default=True) diff --git a/testgen/common/pii_masking.py b/testgen/common/pii_masking.py index cff28f9d..70b6a659 100644 --- a/testgen/common/pii_masking.py +++ b/testgen/common/pii_masking.py @@ -33,7 +33,7 @@ def get_pii_columns(table_group_id: str, schema: str | None = None, table_name: return {row.column_name for row in results} -def mask_dataframe_pii(df: pd.DataFrame, pii_columns: set[str]) -> None: +def mask_source_data_pii(df: pd.DataFrame, pii_columns: set[str]) -> None: """In-place mask values in PII columns with PII_REDACTED.""" if df.empty or not 
pii_columns: return @@ -44,6 +44,34 @@ def mask_dataframe_pii(df: pd.DataFrame, pii_columns: set[str]) -> None: df[df_col] = PII_REDACTED +def mask_hygiene_detail(data: pd.DataFrame | list[dict], pii_columns: set[str] | None = None) -> None: + """Redact hygiene issue detail for PII columns where detail_redactable is true. + + Accepts: + - DataFrame with detail_redactable, pii_flag, and detail columns (hygiene issues grid/export) + - List of issue dicts, each with detail_redactable and either pii_flag or column_name + (when pii_columns set is provided, matches column_name against it) + """ + if isinstance(data, pd.DataFrame): + if data.empty or "detail_redactable" not in data.columns: + return + pii_mask = data["detail_redactable"].fillna(False) & data["pii_flag"].notna() + data.loc[pii_mask, "detail"] = PII_REDACTED + return + + if not data: + return + pii_lower = {c.lower() for c in pii_columns} if pii_columns else None + for issue in data: + if not issue.get("detail_redactable"): + continue + if pii_lower is not None: + if issue.get("column_name", "").lower() in pii_lower: + issue["detail"] = PII_REDACTED + elif issue.get("pii_flag"): + issue["detail"] = PII_REDACTED + + def mask_profiling_pii(data: pd.DataFrame | dict, pii_columns: set[str]) -> None: """Mask profiling fields for PII columns. 
Accepts a DataFrame or a single-row dict.""" if isinstance(data, dict): diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index ba0edd3a..cd05e290 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -114,6 +114,7 @@ CREATE TABLE table_groups profiling_delay_days VARCHAR(3) DEFAULT '0', profile_flag_cdes BOOLEAN DEFAULT TRUE, profile_flag_pii BOOLEAN DEFAULT TRUE, + profile_exclude_xde BOOLEAN DEFAULT TRUE, profile_do_pair_rules VARCHAR(3) DEFAULT 'N', profile_pair_rule_pct INTEGER DEFAULT 95, include_in_dashboard BOOLEAN DEFAULT TRUE, @@ -342,6 +343,7 @@ CREATE TABLE profile_anomaly_types ( anomaly_description VARCHAR(500), anomaly_criteria VARCHAR(2000), detail_expression VARCHAR(2000), + detail_redactable BOOLEAN DEFAULT FALSE, issue_likelihood VARCHAR(50), -- Potential, Likely, Certain suggested_action VARCHAR(1000), dq_score_prevalence_formula TEXT, @@ -612,6 +614,7 @@ CREATE TABLE target_data_lookups ( sql_flavor VARCHAR(20) NOT NULL, lookup_type VARCHAR(10), lookup_query VARCHAR, + lookup_redactable_columns VARCHAR(100), error_type VARCHAR(30) NOT NULL, CONSTRAINT target_data_lookups_test_id_sql_flavor_error_type_pk PRIMARY KEY (test_id, sql_flavor, error_type) diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml index 23ccefa0..1f184a75 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml @@ -16,6 +16,7 @@ profile_anomaly_types: detail_expression: |- CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text ELSE 'Top Freq: ' || p.top_freq_values END + 
detail_redactable: true issue_likelihood: Likely suggested_action: "Review your source data and follow-up with data owners to determine\ \ whether this data needs to be corrected. " diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml index 7728798b..caf0ea32 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml @@ -9,6 +9,7 @@ profile_anomaly_types: p.std_pattern_match = 'DELIMITED_DATA' detail_expression: |- CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text ELSE 'Top Freq: ' || p.top_freq_values END + detail_redactable: true issue_likelihood: Likely suggested_action: |- Review your source data and follow-up with data consumers to determine the most useful representation of this data. diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml index f556a4bd..1ad2aeb0 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Non_Alpha_Prefixed_Name.yaml @@ -9,6 +9,7 @@ profile_anomaly_types: min_text < 'A' AND LEFT(min_text, 1) NOT IN ('"', ' ') AND RIGHT(min_text, 1) <> '''' AND functional_data_type IN ('City', 'Person Given Name', 'Person Last Name', 'Person Full Name') detail_expression: |- 'Minimum Value: ' || min_text + detail_redactable: true issue_likelihood: Definite suggested_action: |- Values starting with a non-alphabetic character are highly likely to be invalid for this kind of column. This may indicate a file format change, error in an ingestion process, or incorrect source data. 
It could also indicate flagging or coding of some kind that can be broken out in a separate column in processed data. Review your pipeline process and source data to determine the root-cause. If this data accurately reflects source data, and upstream corrections are not possible, consider applying corrections directly to processed data where possible. diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml index 7415a7bd..53a16368 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_One_Year.yaml @@ -9,6 +9,7 @@ profile_anomaly_types: MAX(p.max_date) < CURRENT_DATE - INTERVAL '1 year' detail_expression: |- 'Most Recent Date: ' || MAX(p.max_date)::VARCHAR + detail_redactable: true issue_likelihood: Possible suggested_action: |- Review your source data and follow-up with data owners to determine whether dates in table should be more recent. diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml index 4a31eb12..00467a7d 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Recency_Six_Months.yaml @@ -9,6 +9,7 @@ profile_anomaly_types: MAX(p.max_date) >= CURRENT_DATE - INTERVAL '1 year' AND MAX(p.max_date) < CURRENT_DATE - INTERVAL '6 months' detail_expression: |- 'Most Recent Date: ' || MAX(p.max_date)::VARCHAR + detail_redactable: true issue_likelihood: Possible suggested_action: |- Review your source data and follow-up with data owners to determine whether dates in table should be more recent. 
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml index 7dde6180..ced5139f 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_Emails.yaml @@ -10,6 +10,7 @@ profile_anomaly_types: AND NOT (p.column_name ILIKE '%email%' OR p.column_name ILIKE '%addr%') detail_expression: |- 'Value Range: ' || p.min_text || ' thru ' || max_text + detail_redactable: true issue_likelihood: Possible suggested_action: |- Review your source data and follow-up with data owners to determine whether column should be populated with email addresses. diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml index 1cdcf0bf..b98e4d61 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unexpected_US_States.yaml @@ -12,6 +12,7 @@ profile_anomaly_types: detail_expression: "'Value Range: ' || p.min_text || ' thru ' || max_text || CASE\ \ WHEN p.top_freq_values > '' THEN ', Top Freq Values: ' || REPLACE(p.top_freq_values,\ \ CHR(10), ' ; ') ELSE '' END " + detail_redactable: true issue_likelihood: Possible suggested_action: |- Review your source data and follow-up with data owners to determine whether column should be populated with US states. 
diff --git a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml index 2b30f4e2..84d3bc5b 100644 --- a/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml +++ b/testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Unlikely_Date_Values.yaml @@ -11,6 +11,7 @@ profile_anomaly_types: OR p.max_date > CURRENT_DATE + INTERVAL '30 year') detail_expression: |- 'Date Range: ' || p.min_date::VARCHAR || ' thru ' || p.max_date::VARCHAR + detail_redactable: true issue_likelihood: Likely suggested_action: |- Review your source data and follow-up with data owners to determine whether this data needs to be corrected or removed. diff --git a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml index 4122df48..8e752a67 100644 --- a/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml +++ b/testgen/template/dbsetup_test_types/test_types_CUSTOM.yaml @@ -19,12 +19,12 @@ test_types: Test Focus column_name_help: |- Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another. - default_parm_columns: custom_query + default_parm_columns: custom_query,match_column_names default_parm_values: null default_parm_prompts: |- - Custom SQL Query Returning Error Records + Custom SQL Query Returning Error Records,PII Redactable Columns default_parm_help: |- - Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. 
A query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group. + Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. A query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.|Comma-separated list of result column names that contain PII data and should be redacted for users without PII viewing permissions. Leave blank if no columns need redacting. default_severity: Fail run_type: QUERY test_scope: custom diff --git a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml index d02569c6..6823fc52 100644 --- a/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Distribution_Shift.yaml @@ -44,6 +44,7 @@ test_types: test_type: Distribution_Shift sql_flavor: bigquery lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} AS category, @@ -61,6 +62,7 @@ test_types: test_type: Distribution_Shift sql_flavor: databricks lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, @@ -88,6 +90,7 @@ test_types: test_type: Distribution_Shift sql_flavor: mssql lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, @@ -114,6 +117,7 @@ test_types: test_type: Distribution_Shift sql_flavor: postgresql lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, @@ 
-141,6 +145,7 @@ test_types: test_type: Distribution_Shift sql_flavor: redshift lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, @@ -168,6 +173,7 @@ test_types: test_type: Distribution_Shift sql_flavor: redshift_spectrum lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, @@ -195,6 +201,7 @@ test_types: test_type: Distribution_Shift sql_flavor: snowflake lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, @@ -222,6 +229,7 @@ test_types: test_type: Distribution_Shift sql_flavor: oracle lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, @@ -249,6 +257,7 @@ test_types: test_type: Distribution_Shift sql_flavor: sap_hana lookup_type: null + lookup_redactable_columns: category lookup_query: |- WITH latest_ver AS ( SELECT {CONCAT_COLUMNS} as category, diff --git a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml index c24a0bfb..6c69fa22 100644 --- a/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml +++ b/testgen/template/dbsetup_test_types/test_types_LOV_All.yaml @@ -120,6 +120,7 @@ test_types: test_type: LOV_All sql_flavor: bigquery lookup_type: null + lookup_redactable_columns: lov lookup_query: |- SELECT lov FROM ( @@ -134,63 +135,71 @@ test_types: test_type: LOV_All sql_flavor: databricks lookup_type: null + lookup_redactable_columns: lov lookup_query: |- - SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS aggregated_values FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; + SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') AS lov FROM 
`{TARGET_SCHEMA}`.`{TABLE_NAME}` HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), '|') <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1152' test_id: '1018' test_type: LOV_All sql_flavor: mssql lookup_type: null + lookup_redactable_columns: lov lookup_query: |- - WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT TOP {LIMIT} STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}'; + WITH CTE AS (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") SELECT TOP {LIMIT} STRING_AGG( "{COLUMN_NAME}", '|' ) WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) AS lov FROM CTE HAVING STRING_AGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}'; error_type: Test Results - id: '1095' test_id: '1018' test_type: LOV_All sql_flavor: postgresql lookup_type: null + lookup_redactable_columns: lov lookup_query: |- - SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; + SELECT STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) AS lov FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING STRING_AGG(DISTINCT "{COLUMN_NAME}", '|' ORDER BY "{COLUMN_NAME}" ASC) <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1013' test_id: '1018' test_type: LOV_All sql_flavor: redshift lookup_type: null + lookup_redactable_columns: lov lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", 
'|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") AS lov FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1413' test_id: '1018' test_type: LOV_All sql_flavor: redshift_spectrum lookup_type: null + lookup_redactable_columns: lov lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") AS lov FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '1209' test_id: '1018' test_type: LOV_All sql_flavor: snowflake lookup_type: null + lookup_redactable_columns: lov lookup_query: |- - SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; + SELECT LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") AS lov FROM "{TARGET_SCHEMA}"."{TABLE_NAME}" HAVING LISTAGG(DISTINCT "{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT}; error_type: Test Results - id: '8013' test_id: '1018' test_type: LOV_All sql_flavor: oracle lookup_type: null + lookup_redactable_columns: lov lookup_query: |- - SELECT LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") HAVING LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' FETCH FIRST {LIMIT} 
ROWS ONLY + SELECT LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") AS lov FROM (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") HAVING LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results - id: '8013' test_id: '1018' test_type: LOV_All sql_flavor: sap_hana lookup_type: null + lookup_redactable_columns: lov lookup_query: |- - SELECT LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") FROM (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") HAVING LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT} + SELECT LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") AS lov FROM (SELECT DISTINCT "{COLUMN_NAME}" FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") HAVING LISTAGG("{COLUMN_NAME}", '|') WITHIN GROUP (ORDER BY "{COLUMN_NAME}") <> '{THRESHOLD_VALUE}' LIMIT {LIMIT} error_type: Test Results test_templates: [] diff --git a/testgen/template/dbsetup_test_types/test_types_Recency.yaml b/testgen/template/dbsetup_test_types/test_types_Recency.yaml index 0f9e6b4f..9607a3ac 100644 --- a/testgen/template/dbsetup_test_types/test_types_Recency.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Recency.yaml @@ -123,6 +123,7 @@ test_types: test_type: Recency sql_flavor: bigquery lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT col AS latest_date_available, CAST(CAST('{TEST_DATE}' AS DATETIME) AS {COLUMN_TYPE}) AS test_run_date FROM (SELECT DATE_TRUNC(MAX(`{COLUMN_NAME}`), DAY) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) @@ -134,6 +135,7 @@ test_types: test_type: Recency sql_flavor: databricks lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT 
MAX(`{COLUMN_NAME}`) AS col FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`) WHERE ABS(<%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%>) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results @@ -142,6 +144,7 @@ test_types: test_type: Recency sql_flavor: mssql lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT TOP {LIMIT} col AS latest_date_available, CAST('{TEST_DATE}' AS DATE) AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE DATEDIFF(day, col, CAST('{TEST_DATE}' AS DATE)) > {THRESHOLD_VALUE}; error_type: Test Results @@ -150,6 +153,7 @@ test_types: test_type: Recency sql_flavor: postgresql lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") a WHERE <%DATEDIFF_DAY;col;'{TEST_DATE}'::DATE%> > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results @@ -158,6 +162,7 @@ test_types: test_type: Recency sql_flavor: redshift lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results @@ -166,6 +171,7 @@ test_types: test_type: Recency sql_flavor: redshift_spectrum lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results @@ -174,6 +180,7 @@ test_types: test_type: Recency sql_flavor: snowflake 
lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT col AS latest_date_available, '{TEST_DATE}' :: DATE as test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE DATEDIFF('D', col, '{TEST_DATE}'::DATE) > {THRESHOLD_VALUE} LIMIT {LIMIT}; error_type: Test Results @@ -182,6 +189,7 @@ test_types: test_type: Recency sql_flavor: oracle lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT col AS latest_date_available, TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE <%DATEDIFF_DAY;col;TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > {THRESHOLD_VALUE} FETCH FIRST {LIMIT} ROWS ONLY error_type: Test Results @@ -190,6 +198,7 @@ test_types: test_type: Recency sql_flavor: sap_hana lookup_type: null + lookup_redactable_columns: latest_date_available lookup_query: |- SELECT DISTINCT col AS latest_date_available, TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS') AS test_run_date FROM (SELECT MAX("{COLUMN_NAME}") AS col FROM "{TARGET_SCHEMA}"."{TABLE_NAME}") WHERE <%DATEDIFF_DAY;col;TO_DATE('{TEST_DATE}', 'YYYY-MM-DD HH24:MI:SS')%> > {THRESHOLD_VALUE} LIMIT {LIMIT} error_type: Test Results diff --git a/testgen/template/dbupgrade/0178_incremental_upgrade.sql b/testgen/template/dbupgrade/0178_incremental_upgrade.sql index ba31a28f..9a583577 100644 --- a/testgen/template/dbupgrade/0178_incremental_upgrade.sql +++ b/testgen/template/dbupgrade/0178_incremental_upgrade.sql @@ -1,7 +1,13 @@ SET SEARCH_PATH TO {SCHEMA_NAME}; -ALTER TABLE table_groups ADD COLUMN IF NOT EXISTS profile_flag_pii BOOLEAN DEFAULT TRUE; +ALTER TABLE table_groups + ADD COLUMN IF NOT EXISTS profile_flag_pii BOOLEAN DEFAULT TRUE, + ADD COLUMN IF NOT EXISTS profile_exclude_xde BOOLEAN DEFAULT TRUE; ALTER TABLE data_column_chars ADD COLUMN IF NOT EXISTS excluded_data_element 
BOOLEAN, ADD COLUMN IF NOT EXISTS pii_flag VARCHAR(50); + +ALTER TABLE target_data_lookups ADD COLUMN IF NOT EXISTS lookup_redactable_columns VARCHAR(100); + +ALTER TABLE profile_anomaly_types ADD COLUMN IF NOT EXISTS detail_redactable BOOLEAN DEFAULT FALSE; diff --git a/testgen/template/generation/gen_selection_tests.sql b/testgen/template/generation/gen_selection_tests.sql index 36a8444f..ca85796f 100644 --- a/testgen/template/generation/gen_selection_tests.sql +++ b/testgen/template/generation/gen_selection_tests.sql @@ -10,14 +10,15 @@ selected_columns AS ( SELECT p.* FROM profile_results p INNER JOIN latest_run lr ON p.run_date = lr.last_run_date - LEFT JOIN data_column_chars dcc ON ( - p.table_groups_id = dcc.table_groups_id - AND p.schema_name = dcc.schema_name - AND p.table_name = dcc.table_name - AND p.column_name = dcc.column_name - ) WHERE p.table_groups_id = :TABLE_GROUPS_ID ::UUID - AND dcc.excluded_data_element IS NOT TRUE + AND NOT EXISTS ( + SELECT 1 FROM data_column_chars dcc + WHERE dcc.table_groups_id = p.table_groups_id + AND dcc.schema_name = p.schema_name + AND dcc.table_name = p.table_name + AND dcc.column_name = p.column_name + AND dcc.excluded_data_element IS TRUE + ) AND {SELECTION_CRITERIA} ) INSERT INTO test_definitions ( diff --git a/testgen/template/score_cards/get_score_card_issues_by_column.sql b/testgen/template/score_cards/get_score_card_issues_by_column.sql index c0e9724e..c2955a5f 100644 --- a/testgen/template/score_cards/get_score_card_issues_by_column.sql +++ b/testgen/template/score_cards/get_score_card_issues_by_column.sql @@ -14,6 +14,8 @@ anomalies AS ( types.anomaly_name AS type, types.issue_likelihood AS status, results.detail, + types.detail_redactable, + dcc.pii_flag, EXTRACT( EPOCH FROM runs.profiling_starttime @@ -24,6 +26,12 @@ anomalies AS ( FROM profile_anomaly_results AS results INNER JOIN profile_anomaly_types AS types ON (types.id = results.anomaly_id) INNER JOIN profiling_runs AS runs ON (runs.id = 
results.profile_run_id) + LEFT JOIN data_column_chars AS dcc ON ( + results.table_groups_id = dcc.table_groups_id + AND results.schema_name = dcc.schema_name + AND results.table_name = dcc.table_name + AND results.column_name = dcc.column_name + ) INNER JOIN score_profiling_runs ON ( score_profiling_runs.profile_run_id = runs.id AND score_profiling_runs.table_name = results.table_name @@ -47,6 +55,8 @@ tests AS ( test_types.test_name_short AS type, result_status AS status, result_message AS detail, + NULL::BOOLEAN AS detail_redactable, + NULL AS pii_flag, EXTRACT( EPOCH FROM test_time diff --git a/testgen/template/score_cards/get_score_card_issues_by_dimension.sql b/testgen/template/score_cards/get_score_card_issues_by_dimension.sql index 8afb5d85..74830695 100644 --- a/testgen/template/score_cards/get_score_card_issues_by_dimension.sql +++ b/testgen/template/score_cards/get_score_card_issues_by_dimension.sql @@ -14,6 +14,8 @@ anomalies AS ( types.anomaly_name AS type, types.issue_likelihood AS status, results.detail, + types.detail_redactable, + dcc.pii_flag, EXTRACT( EPOCH FROM runs.profiling_starttime @@ -24,6 +26,12 @@ anomalies AS ( FROM profile_anomaly_results AS results INNER JOIN profile_anomaly_types AS types ON (types.id = results.anomaly_id) INNER JOIN profiling_runs AS runs ON (runs.id = results.profile_run_id) + LEFT JOIN data_column_chars AS dcc ON ( + results.table_groups_id = dcc.table_groups_id + AND results.schema_name = dcc.schema_name + AND results.table_name = dcc.table_name + AND results.column_name = dcc.column_name + ) INNER JOIN score_profiling_runs ON ( score_profiling_runs.profile_run_id = runs.id AND score_profiling_runs.table_name = results.table_name @@ -48,6 +56,8 @@ tests AS ( test_types.test_name_short AS type, result_status AS status, result_message AS detail, + NULL::BOOLEAN AS detail_redactable, + NULL AS pii_flag, EXTRACT( EPOCH FROM test_time diff --git a/testgen/ui/components/frontend/js/components/score_issues.js 
b/testgen/ui/components/frontend/js/components/score_issues.js index 0d93b745..bcab1146 100644 --- a/testgen/ui/components/frontend/js/components/score_issues.js +++ b/testgen/ui/components/frontend/js/components/score_issues.js @@ -192,7 +192,7 @@ const ColumnProfilingButton = ( style: 'color: var(--secondary-text-color);', tooltip: 'View profiling for column', tooltipPosition: 'top-right', - onclick: () => emitEvent('ColumnProflingClicked', { payload: { column_name, table_name, table_group_id } }), + onclick: () => emitEvent('ColumnProfilingClicked', { payload: { column_name, table_name, table_group_id } }), }); }; diff --git a/testgen/ui/components/frontend/js/components/table_group_form.js b/testgen/ui/components/frontend/js/components/table_group_form.js index 609f6aa0..8ba8b414 100644 --- a/testgen/ui/components/frontend/js/components/table_group_form.js +++ b/testgen/ui/components/frontend/js/components/table_group_form.js @@ -42,6 +42,7 @@ * @property {boolean?} showConnectionSelector * @property {boolean?} disableConnectionSelector * @property {boolean?} disableSchemaField + * @property {boolean?} disablePiiFlag * @property {(tg: TableGroup, state: FormState) => void} onChange */ import van from '../van.min.js'; @@ -83,6 +84,7 @@ const TableGroupForm = (props) => { const profilingDelayDays = van.state(tableGroup.profiling_delay_days ?? 0); const profileFlagCdes = van.state(tableGroup.profile_flag_cdes ?? true); const profileFlagPii = van.state(tableGroup.profile_flag_pii ?? true); + const profileExcludeXde = van.state(tableGroup.profile_exclude_xde ?? true); const includeInDashboard = van.state(tableGroup.include_in_dashboard ?? true); const addScorecardDefinition = van.state(tableGroup.add_scorecard_definition ?? true); const profileUseSampling = van.state(tableGroup.profile_use_sampling ?? 
false); @@ -123,6 +125,7 @@ const TableGroupForm = (props) => { profiling_delay_days: profilingDelayDays.val, profile_flag_cdes: profileFlagCdes.val, profile_flag_pii: profileFlagPii.val, + profile_exclude_xde: profileExcludeXde.val, include_in_dashboard: includeInDashboard.val, add_scorecard_definition: addScorecardDefinition.val, profile_use_sampling: profileUseSampling.val, @@ -186,10 +189,11 @@ const TableGroupForm = (props) => { profileSkColumnMask, ), SettingsForm( - { editMode: !!tableGroup.id, setValidity: setFieldValidity }, + { editMode: !!tableGroup.id, disablePiiFlag: getValue(props.disablePiiFlag) ?? false, setValidity: setFieldValidity }, profilingDelayDays, profileFlagCdes, profileFlagPii, + profileExcludeXde, includeInDashboard, addScorecardDefinition, ), @@ -330,6 +334,7 @@ const SettingsForm = ( profilingDelayDays, profileFlagCdes, profileFlagPii, + profileExcludeXde, includeInDashboard, addScorecardDefinition, ) => { @@ -349,6 +354,13 @@ const SettingsForm = ( label: 'Detect PII during profiling', checked: profileFlagPii, onChange: (value) => profileFlagPii.val = value, + disabled: options.disablePiiFlag, + }), + Checkbox({ + name: 'profile_exclude_xde', + label: 'Exclude XDE columns from profiling', + checked: profileExcludeXde, + onChange: (value) => profileExcludeXde.val = value, }), Checkbox({ name: 'include_in_dashboard', diff --git a/testgen/ui/components/frontend/js/pages/table_group_wizard.js b/testgen/ui/components/frontend/js/pages/table_group_wizard.js index 48ef56b3..61bac1b8 100644 --- a/testgen/ui/components/frontend/js/pages/table_group_wizard.js +++ b/testgen/ui/components/frontend/js/pages/table_group_wizard.js @@ -4,6 +4,10 @@ * @import { TableGroup } from '../components/table_group_form.js' * @import { CronSample } from '../types.js' * + * @typedef Permissions + * @type {object} + * @property {boolean} can_view_pii + * * @typedef WizardResult * @type {object} * @property {boolean} success @@ -20,6 +24,7 @@ * @property 
{Connection[]} connections * @property {string[]?} steps * @property {boolean?} is_in_use + * @property {Permissions} permissions * @property {TableGroupPreview?} table_group_preview * @property {CronSample?} standard_cron_sample * @property {CronSample?} monitor_cron_sample @@ -189,6 +194,7 @@ const TableGroupWizard = (props) => { showConnectionSelector: connections.length > 1, disableConnectionSelector: false, disableSchemaField: props.is_in_use ?? false, + disablePiiFlag: !getValue(props.permissions)?.can_view_pii, onChange: (updatedTableGroup, state) => { stepsState.tableGroup.val = updatedTableGroup; stepsValidity.tableGroup.val = state.valid; diff --git a/testgen/ui/pdf/hygiene_issue_report.py b/testgen/ui/pdf/hygiene_issue_report.py index c2c92e4d..58579577 100644 --- a/testgen/ui/pdf/hygiene_issue_report.py +++ b/testgen/ui/pdf/hygiene_issue_report.py @@ -5,7 +5,6 @@ from reportlab.platypus import CondPageBreak, KeepTogether, Paragraph, Table, TableStyle from testgen.common.models.settings import PersistedSetting -from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.settings import ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT from testgen.ui.pdf.dataframe_table import DataFrameTableBuilder from testgen.ui.pdf.style import ( @@ -191,12 +190,7 @@ def get_report_content(document, hi_data, mask_pii: bool = False): yield Paragraph("Suggested Action", style=PARA_STYLE_H1) yield Paragraph(hi_data["suggested_action"], style=PARA_STYLE_TEXT) - sample_data_tuple = get_hygiene_issue_source_data(hi_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT) - - # Mask PII in sample data - if sample_data_tuple[3] is not None and mask_pii: - pii_columns = get_pii_columns(str(hi_data["table_groups_id"]), table_name=hi_data["table_name"]) - mask_dataframe_pii(sample_data_tuple[3], pii_columns) + sample_data_tuple = get_hygiene_issue_source_data(hi_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT, mask_pii=mask_pii) yield 
CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) yield Paragraph("Sample Data", PARA_STYLE_H1) diff --git a/testgen/ui/pdf/test_result_report.py b/testgen/ui/pdf/test_result_report.py index 28b3c900..e6ce17f0 100644 --- a/testgen/ui/pdf/test_result_report.py +++ b/testgen/ui/pdf/test_result_report.py @@ -11,7 +11,6 @@ ) from testgen.common.models.settings import PersistedSetting -from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.settings import ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT from testgen.ui.pdf.dataframe_table import TABLE_STYLE_DATA, DataFrameTableBuilder from testgen.ui.pdf.style import ( @@ -29,7 +28,10 @@ get_formatted_datetime, ) from testgen.ui.pdf.templates import DatakitchenTemplate -from testgen.ui.queries.source_data_queries import get_test_issue_source_data, get_test_issue_source_data_custom +from testgen.ui.queries.source_data_queries import ( + get_test_issue_source_data, + get_test_issue_source_data_custom, +) from testgen.ui.queries.test_result_queries import ( get_test_result_history, ) @@ -247,14 +249,9 @@ def get_report_content(document, tr_data, mask_pii: bool = False): yield build_history_table(document, tr_data) if tr_data["test_type"] == "CUSTOM": - sample_data_tuple = get_test_issue_source_data_custom(tr_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT) + sample_data_tuple = get_test_issue_source_data_custom(tr_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT, mask_pii=mask_pii) else: - sample_data_tuple = get_test_issue_source_data(tr_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT) - - # Mask PII in sample data - if sample_data_tuple[3] is not None and mask_pii: - pii_columns = get_pii_columns(str(tr_data["table_groups_id"]), table_name=tr_data["table_name"]) - mask_dataframe_pii(sample_data_tuple[3], pii_columns) + sample_data_tuple = get_test_issue_source_data(tr_data, limit=ISSUE_REPORT_SOURCE_DATA_LOOKUP_LIMIT, mask_pii=mask_pii) yield CondPageBreak(SECTION_MIN_AVAILABLE_HEIGHT) yield 
Paragraph("Sample Data", PARA_STYLE_H1) diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index 4db67d55..a0cb7873 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -451,6 +451,7 @@ def get_hygiene_issues(profile_run_id: str, table_name: str, column_name: str | anomaly_name, issue_likelihood, detail, + detail_redactable, pii_risk FROM profile_anomaly_results anomaly_results LEFT JOIN profile_anomaly_types anomaly_types ON ( @@ -519,7 +520,7 @@ def get_profiling_anomalies( WHEN t.issue_likelihood = 'Likely' THEN 2 WHEN t.issue_likelihood = 'Definite' THEN 1 END AS likelihood_order, - t.anomaly_description, r.detail, t.suggested_action, + t.anomaly_description, r.detail, t.detail_redactable, t.suggested_action, r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime, r.profile_run_id::VARCHAR, tg.table_groups_name, tg.project_code, diff --git a/testgen/ui/queries/scoring_queries.py b/testgen/ui/queries/scoring_queries.py index 2bcb5d97..9a892369 100644 --- a/testgen/ui/queries/scoring_queries.py +++ b/testgen/ui/queries/scoring_queries.py @@ -33,6 +33,7 @@ def get_score_card_issue_reports(selected_issues: list["SelectedIssue"]) -> list types.anomaly_name, types.anomaly_description, results.detail, + types.detail_redactable, results.schema_name, results.table_name, results.column_name, @@ -92,6 +93,8 @@ def get_score_card_issue_reports(selected_issues: list["SelectedIssue"]) -> list results.schema_name, results.table_name, results.column_names, + column_chars.column_type, + results.result_message, groups.table_groups_name, suites.test_suite, types.dq_dimension, diff --git a/testgen/ui/queries/source_data_queries.py b/testgen/ui/queries/source_data_queries.py index 9abfc101..48b307ff 100644 --- a/testgen/ui/queries/source_data_queries.py +++ b/testgen/ui/queries/source_data_queries.py @@ -9,6 +9,7 @@ from testgen.common.database.database_service 
import get_flavor_service, replace_params from testgen.common.models.connection import Connection, SQLFlavor from testgen.common.models.test_definition import TestDefinition +from testgen.common.pii_masking import PII_REDACTED, get_pii_columns, mask_source_data_pii from testgen.common.read_file import replace_templated_functions from testgen.ui.services.database_service import fetch_from_target_db, fetch_one_from_db from testgen.ui.utils import parse_fuzzy_date @@ -78,6 +79,7 @@ def generate_lookup_query(test_id: str, detail_exp: str, column_names: list[str] def get_hygiene_issue_source_data( issue_data: dict, limit: int = DEFAULT_LIMIT, + mask_pii: bool = False, ) -> tuple[Literal["OK"], None, str, pd.DataFrame] | tuple[Literal["NA", "ND", "ERR"], str, str | None, None]: lookup_query = None try: @@ -92,6 +94,15 @@ def get_hygiene_issue_source_data( df = to_dataframe(results) if limit: df = df.sample(n=min(len(df), limit)).sort_index() + if mask_pii: + _mask_lookup_pii( + df, + issue_data["table_groups_id"], + issue_data["table_name"], + column_name=issue_data.get("column_name"), + test_type_id=issue_data.get("anomaly_id"), + error_type="Profile Anomaly", + ) return "OK", None, lookup_query, df else: return ( @@ -159,6 +170,7 @@ def get_test_issue_source_query(issue_data: dict, limit: int = DEFAULT_LIMIT) -> def get_test_issue_source_data( issue_data: dict, limit: int = DEFAULT_LIMIT, + mask_pii: bool = False, ) -> tuple[Literal["OK"], None, str, pd.DataFrame] | tuple[Literal["NA", "ND", "ERR"], str, str | None, None]: lookup_query = None try: @@ -177,6 +189,15 @@ def get_test_issue_source_data( df = to_dataframe(results) if limit: df = df.sample(n=min(len(df), limit)).sort_index() + if mask_pii: + _mask_lookup_pii( + df, + issue_data["table_groups_id"], + issue_data["table_name"], + column_name=issue_data.get("column_names"), + test_type_id=issue_data.get("test_type_id"), + error_type="Test Results", + ) return "OK", None, lookup_query, df else: return "ND", "Data 
that violates test criteria is not present in the current dataset.", lookup_query, None @@ -203,6 +224,7 @@ def get_test_issue_source_query_custom( def get_test_issue_source_data_custom( issue_data: dict, limit: int | None = None, + mask_pii: bool = False, ) -> tuple[Literal["OK"], None, str, pd.DataFrame] | tuple[Literal["NA", "ND", "ERR"], str, str | None, None]: try: test_definition = TestDefinition.get(issue_data["test_definition_id"]) @@ -220,6 +242,17 @@ def get_test_issue_source_data_custom( df = to_dataframe(results) if limit: df = df.sample(n=min(len(df), limit)).sort_index() + if mask_pii: + _mask_lookup_pii( + df, + issue_data["table_groups_id"], + issue_data["table_name"], + ) + # Mask user-defined redactable columns from the test definition + lookup_data = _get_lookup_data_custom(issue_data["test_definition_id"]) + if lookup_data and lookup_data.lookup_redactable_columns: + redactable = {col.strip() for col in lookup_data.lookup_redactable_columns.split(",")} + mask_source_data_pii(df, redactable) return "OK", None, lookup_query, df else: return "ND", "Data that violates test criteria is not present in the current dataset.", lookup_query, None @@ -232,6 +265,50 @@ def get_test_issue_source_data_custom( class LookupData: lookup_query: str sql_flavor: SQLFlavor | None = None + lookup_redactable_columns: str | None = None + + +def _mask_lookup_pii( + df: pd.DataFrame, + table_group_id: str, + table_name: str, + column_name: str | None = None, + test_type_id: str | None = None, + error_type: Literal["Profile Anomaly", "Test Results"] | None = None, +) -> None: + """Apply PII masking to a source data lookup DataFrame.""" + pii_columns = get_pii_columns(table_group_id, table_name=table_name) + mask_source_data_pii(df, pii_columns) + + # Row-level masking: if result has a column_name column listing which source column + # each row is about (e.g., table-level recency queries), mask value columns in rows + # where that source column is PII + if pii_columns and 
"column_name" in df.columns: + pii_lower = {c.lower() for c in pii_columns} + value_cols = [c for c in df.columns if c != "column_name"] + pii_rows = df["column_name"].str.lower().isin(pii_lower) + for col in value_cols: + if df[col].dtype != object: + df[col] = df[col].astype(object) + df.loc[pii_rows, col] = PII_REDACTED + + # Also mask redactable columns if the test's target column is PII + if column_name and test_type_id and error_type and column_name.lower() in {c.lower() for c in pii_columns}: + result = fetch_one_from_db( + """ + SELECT t.lookup_redactable_columns + FROM target_data_lookups t + INNER JOIN table_groups tg ON (:table_group_id = tg.id) + INNER JOIN connections c ON (tg.connection_id = c.connection_id AND t.sql_flavor = c.sql_flavor) + WHERE t.error_type = :error_type + AND t.test_id = :test_type_id + AND t.lookup_redactable_columns IS NOT NULL; + """, + {"table_group_id": table_group_id, "error_type": error_type, "test_type_id": test_type_id}, + ) + if result and result["lookup_redactable_columns"]: + redactable = {col.strip() for col in result["lookup_redactable_columns"].split(",")} + mask_source_data_pii(df, redactable) def _get_lookup_data( @@ -243,7 +320,8 @@ def _get_lookup_data( """ SELECT t.lookup_query, - c.sql_flavor + c.sql_flavor, + t.lookup_redactable_columns FROM target_data_lookups t INNER JOIN table_groups tg ON (:table_group_id = tg.id) @@ -269,7 +347,8 @@ def _get_lookup_data_custom( result = fetch_one_from_db( """ SELECT - d.custom_query as lookup_query + d.custom_query as lookup_query, + d.match_column_names as lookup_redactable_columns FROM test_definitions d WHERE d.id = :test_definition_id; """, diff --git a/testgen/ui/static/js/components/score_issues.js b/testgen/ui/static/js/components/score_issues.js index 0d93b745..bcab1146 100644 --- a/testgen/ui/static/js/components/score_issues.js +++ b/testgen/ui/static/js/components/score_issues.js @@ -192,7 +192,7 @@ const ColumnProfilingButton = ( style: 'color: 
var(--secondary-text-color);', tooltip: 'View profiling for column', tooltipPosition: 'top-right', - onclick: () => emitEvent('ColumnProflingClicked', { payload: { column_name, table_name, table_group_id } }), + onclick: () => emitEvent('ColumnProfilingClicked', { payload: { column_name, table_name, table_group_id } }), }); }; diff --git a/testgen/ui/static/js/components/table_group_form.js b/testgen/ui/static/js/components/table_group_form.js index 609f6aa0..8ba8b414 100644 --- a/testgen/ui/static/js/components/table_group_form.js +++ b/testgen/ui/static/js/components/table_group_form.js @@ -42,6 +42,7 @@ * @property {boolean?} showConnectionSelector * @property {boolean?} disableConnectionSelector * @property {boolean?} disableSchemaField + * @property {boolean?} disablePiiFlag * @property {(tg: TableGroup, state: FormState) => void} onChange */ import van from '../van.min.js'; @@ -83,6 +84,7 @@ const TableGroupForm = (props) => { const profilingDelayDays = van.state(tableGroup.profiling_delay_days ?? 0); const profileFlagCdes = van.state(tableGroup.profile_flag_cdes ?? true); const profileFlagPii = van.state(tableGroup.profile_flag_pii ?? true); + const profileExcludeXde = van.state(tableGroup.profile_exclude_xde ?? true); const includeInDashboard = van.state(tableGroup.include_in_dashboard ?? true); const addScorecardDefinition = van.state(tableGroup.add_scorecard_definition ?? true); const profileUseSampling = van.state(tableGroup.profile_use_sampling ?? 
false); @@ -123,6 +125,7 @@ const TableGroupForm = (props) => { profiling_delay_days: profilingDelayDays.val, profile_flag_cdes: profileFlagCdes.val, profile_flag_pii: profileFlagPii.val, + profile_exclude_xde: profileExcludeXde.val, include_in_dashboard: includeInDashboard.val, add_scorecard_definition: addScorecardDefinition.val, profile_use_sampling: profileUseSampling.val, @@ -186,10 +189,11 @@ const TableGroupForm = (props) => { profileSkColumnMask, ), SettingsForm( - { editMode: !!tableGroup.id, setValidity: setFieldValidity }, + { editMode: !!tableGroup.id, disablePiiFlag: getValue(props.disablePiiFlag) ?? false, setValidity: setFieldValidity }, profilingDelayDays, profileFlagCdes, profileFlagPii, + profileExcludeXde, includeInDashboard, addScorecardDefinition, ), @@ -330,6 +334,7 @@ const SettingsForm = ( profilingDelayDays, profileFlagCdes, profileFlagPii, + profileExcludeXde, includeInDashboard, addScorecardDefinition, ) => { @@ -349,6 +354,13 @@ const SettingsForm = ( label: 'Detect PII during profiling', checked: profileFlagPii, onChange: (value) => profileFlagPii.val = value, + disabled: options.disablePiiFlag, + }), + Checkbox({ + name: 'profile_exclude_xde', + label: 'Exclude XDE columns from profiling', + checked: profileExcludeXde, + onChange: (value) => profileExcludeXde.val = value, }), Checkbox({ name: 'include_in_dashboard', diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index b7f10094..d3c522f5 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -497,6 +497,9 @@ def on_close_clicked(_params: dict) -> None: data={ "project_code": project_code, "table_group": table_group.to_dict(json_safe=True), + "permissions": { + "can_view_pii": session.auth.user_has_permission("view_pii"), + }, "table_group_preview": table_group_preview, "steps": [ "tableGroup", diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index 92906c8f..a3e5385c 100644 --- 
a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -12,7 +12,7 @@ from testgen.common.models import with_database_session from testgen.common.models.project import Project from testgen.common.models.table_group import TableGroup, TableGroupMinimal -from testgen.common.pii_masking import PII_REDACTED, get_pii_columns, mask_profiling_pii +from testgen.common.pii_masking import PII_REDACTED, get_pii_columns, mask_hygiene_detail, mask_profiling_pii from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets import testgen_component from testgen.ui.components.widgets.download_dialog import ( @@ -579,6 +579,7 @@ def get_selected_item(selected: str, table_group_id: str) -> dict | None: if not session.auth.user_has_permission("view_pii"): pii_columns = get_pii_columns(item["table_group_id"], table_name=item["table_name"]) mask_profiling_pii(item, pii_columns) + mask_hygiene_detail(item.get("hygiene_issues", []), pii_columns) return item diff --git a/testgen/ui/views/dialogs/data_preview_dialog.py b/testgen/ui/views/dialogs/data_preview_dialog.py index d57d78f3..ee029644 100644 --- a/testgen/ui/views/dialogs/data_preview_dialog.py +++ b/testgen/ui/views/dialogs/data_preview_dialog.py @@ -3,7 +3,7 @@ from testgen.common.database.database_service import get_flavor_service from testgen.common.models.connection import Connection -from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii +from testgen.common.pii_masking import get_pii_columns, mask_source_data_pii from testgen.ui.components import widgets as testgen from testgen.ui.services.database_service import fetch_from_target_db from testgen.ui.session import session @@ -30,7 +30,7 @@ def data_preview_dialog( if not data.empty and not session.auth.user_has_permission("view_pii"): pii_columns = get_pii_columns(table_group_id, schema_name, table_name) - mask_dataframe_pii(data, pii_columns) + mask_source_data_pii(data, pii_columns) if data.empty: 
st.warning("The preview data could not be loaded.") diff --git a/testgen/ui/views/dialogs/import_metadata_dialog.py b/testgen/ui/views/dialogs/import_metadata_dialog.py index 36eeb345..511be2d2 100644 --- a/testgen/ui/views/dialogs/import_metadata_dialog.py +++ b/testgen/ui/views/dialogs/import_metadata_dialog.py @@ -285,7 +285,7 @@ def _set_row_status(preview_row: dict, bad_cde: bool, bad_xde: bool, bad_pii: bo if bad_cde: issues.append("Unrecognized CDE value (expected Yes/No) — skipped") if bad_xde: - issues.append("Unrecognized XDE value (expetced Yes/No) - skipped") + issues.append("Unrecognized XDE value (expected Yes/No) - skipped") if bad_pii: issues.append("Unrecognized PII value (expected Yes/No) - skipped") if truncated: diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 1bc9913b..75f17292 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -12,7 +12,7 @@ from testgen.common.models import with_database_session from testgen.common.models.hygiene_issue import HygieneIssue from testgen.common.models.profiling_run import ProfilingRun -from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii +from testgen.common.pii_masking import mask_hygiene_detail from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( FILE_DATA_TYPE, @@ -185,6 +185,10 @@ def render( # Get hygiene issue list df_pa = get_profiling_anomalies(run_id, likelihood, issue_type_id, table_name, column_name, action, sorting_columns) + # Mask detail for PII columns with redactable details + if not session.auth.user_has_permission("view_pii"): + mask_hygiene_detail(df_pa) + # Retrieve disposition action (cache refreshed) df_action = get_anomaly_disposition(run_id) @@ -438,6 +442,10 @@ def get_excel_report_data( if data is None: data = get_profiling_anomalies(run_id) + if not session.auth.user_has_permission("view_pii"): + data = data.copy() + 
mask_hygiene_detail(data) + columns = { "table_name": {"header": "Table"}, "column_name": {"header": "Column"}, @@ -468,8 +476,9 @@ def source_data_dialog(selected_row): st.markdown("#### Hygiene Issue Detail") st.caption(selected_row["detail"]) + mask_pii = not session.auth.user_has_permission("view_pii") with st.spinner("Retrieving source data..."): - bad_data_status, bad_data_msg, _, df_bad = get_hygiene_issue_source_data(selected_row, limit=500) + bad_data_status, bad_data_msg, _, df_bad = get_hygiene_issue_source_data(selected_row, limit=500, mask_pii=mask_pii) if bad_data_status in {"ND", "NA"}: st.info(bad_data_msg) elif bad_data_status == "ERR": @@ -479,12 +488,6 @@ def source_data_dialog(selected_row): else: if bad_data_msg: st.info(bad_data_msg) - if not session.auth.user_has_permission("view_pii"): - pii_columns = get_pii_columns( - selected_row["table_groups_id"], - table_name=selected_row["table_name"], - ) - mask_dataframe_pii(df_bad, pii_columns) # Pretify the dataframe df_bad.columns = [col.replace("_", " ").title() for col in df_bad.columns] df_bad.fillna("", inplace=True) diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index f0af0b9e..a1529f95 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -10,7 +10,7 @@ from testgen.common import date_service from testgen.common.models import with_database_session from testgen.common.models.profiling_run import ProfilingRun -from testgen.common.pii_masking import PII_REDACTED, get_pii_columns, mask_profiling_pii +from testgen.common.pii_masking import PII_REDACTED, get_pii_columns, mask_hygiene_detail, mask_profiling_pii from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( FILE_DATA_TYPE, @@ -173,6 +173,9 @@ def open_download_dialog(data: pd.DataFrame | None = None) -> None: st.markdown(":orange[Select a row to see profiling details.]") else: 
selected_row["hygiene_issues"] = profiling_queries.get_hygiene_issues(run_id, selected_row["table_name"], selected_row.get("column_name")) + if not session.auth.user_has_permission("view_pii"): + pii_cols = get_pii_columns(selected_row["table_group_id"], table_name=selected_row["table_name"]) + mask_hygiene_detail(selected_row["hygiene_issues"], pii_cols) testgen_component( "column_profiling_results", props={ "column": json.dumps(selected_row), "data_preview": True }, diff --git a/testgen/ui/views/score_details.py b/testgen/ui/views/score_details.py index ed9a3860..47178231 100644 --- a/testgen/ui/views/score_details.py +++ b/testgen/ui/views/score_details.py @@ -23,6 +23,7 @@ ScoreTypes, SelectedIssue, ) +from testgen.common.pii_masking import mask_hygiene_detail from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import FILE_DATA_TYPE, download_dialog, zip_multi_file_data from testgen.ui.navigation.page import Page @@ -108,10 +109,10 @@ def render( ) score_breakdown = format_score_card_breakdown([item.to_dict() for item in score_breakdown], category) else: - issues = format_score_card_issues( - score_definition.get_score_card_issues(score_type, category, drilldown), - category, - ) + raw_issues = score_definition.get_score_card_issues(score_type, category, drilldown) + if not session.auth.user_has_permission("view_pii"): + mask_hygiene_detail(raw_issues) + issues = format_score_card_issues(raw_issues, category) testgen.testgen_component( "score_details", @@ -134,7 +135,7 @@ def render( "CategoryChanged": select_category, "ScoreTypeChanged": select_score_type, "IssueReportsExported": export_issue_reports, - "ColumnProflingClicked": lambda payload: profiling_results_dialog( + "ColumnProfilingClicked": lambda payload: profiling_results_dialog( payload["column_name"], payload["table_name"], payload["table_group_id"], @@ -177,16 +178,19 @@ def export_issue_reports(selected_issues: list[SelectedIssue]) -> None: def 
get_report_file_data(update_progress, issue) -> FILE_DATA_TYPE: + mask_pii = not session.auth.user_has_permission("view_pii") + if mask_pii: + issue = {**issue} + mask_hygiene_detail([issue]) + with BytesIO() as buffer: if issue["issue_type"] == "hygiene": issue_id = issue["id"][:8] timestamp = pd.Timestamp(issue["profiling_starttime"]).strftime("%Y%m%d_%H%M%S") - mask_pii = not session.auth.user_has_permission("view_pii") hygiene_issue_report.create_report(buffer, issue, mask_pii=mask_pii) else: issue_id = issue["test_result_id"][:8] timestamp = pd.Timestamp(issue["test_date"]).strftime("%Y%m%d_%H%M%S") - mask_pii = not session.auth.user_has_permission("view_pii") test_result_report.create_report(buffer, issue, mask_pii=mask_pii) update_progress(1.0) diff --git a/testgen/ui/views/score_explorer.py b/testgen/ui/views/score_explorer.py index 3d8206a7..1e9352ce 100644 --- a/testgen/ui/views/score_explorer.py +++ b/testgen/ui/views/score_explorer.py @@ -23,6 +23,7 @@ SelectedIssue, ) from testgen.common.models.test_run import TestRun +from testgen.common.pii_masking import mask_hygiene_detail from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import FILE_DATA_TYPE, download_dialog, zip_multi_file_data from testgen.ui.navigation.page import Page @@ -36,6 +37,7 @@ ) from testgen.ui.services.rerun_service import safe_rerun from testgen.ui.session import session, temp_value +from testgen.ui.views.dialogs.profiling_results_dialog import profiling_results_dialog from testgen.utils import format_score_card, format_score_card_breakdown, format_score_card_issues, try_json PAGE_PATH = "quality-dashboard:explorer" @@ -154,10 +156,10 @@ def render( breakdown_category, ) if drilldown: - issues = format_score_card_issues( - score_definition.get_score_card_issues(breakdown_score_type, breakdown_category, drilldown), - breakdown_category, - ) + raw_issues = score_definition.get_score_card_issues(breakdown_score_type, 
breakdown_category, drilldown) + if not session.auth.user_has_permission("view_pii"): + mask_hygiene_detail(raw_issues) + issues = format_score_card_issues(raw_issues, breakdown_category) score_definition_dict = score_definition.to_dict() testgen.testgen_component( @@ -182,6 +184,11 @@ def render( "ScoreTypeChanged": set_breakdown_score_type, "DrilldownChanged": set_breakdown_drilldown, "IssueReportsExported": export_issue_reports, + "ColumnProfilingClicked": lambda payload: profiling_results_dialog( + payload["column_name"], + payload["table_name"], + payload["table_group_id"], + ), "ScoreDefinitionSaved": save_score_definition, "ColumnSelectorOpened": partial(column_selector_dialog, project_code, score_definition_dict), "FilterModeChanged": change_score_definition_filter_mode, @@ -240,16 +247,19 @@ def export_issue_reports(selected_issues: list[SelectedIssue]) -> None: def get_report_file_data(update_progress, issue) -> FILE_DATA_TYPE: + mask_pii = not session.auth.user_has_permission("view_pii") + if mask_pii: + issue = {**issue} + mask_hygiene_detail([issue]) + with BytesIO() as buffer: if issue["issue_type"] == "hygiene": issue_id = issue["id"][:8] timestamp = pd.Timestamp(issue["profiling_starttime"]).strftime("%Y%m%d_%H%M%S") - mask_pii = not session.auth.user_has_permission("view_pii") hygiene_issue_report.create_report(buffer, issue, mask_pii=mask_pii) else: issue_id = issue["test_result_id"][:8] timestamp = pd.Timestamp(issue["test_date"]).strftime("%Y%m%d_%H%M%S") - mask_pii = not session.auth.user_has_permission("view_pii") test_result_report.create_report(buffer, issue, mask_pii=mask_pii) update_progress(1.0) diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index cccaf6e7..ab15c178 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -336,7 +336,7 @@ def on_close_clicked(_params: dict) -> None: LOG.exception(message) if table_group_id and success: - st.rerun() + safe_rerun() except 
IntegrityError: success = False @@ -352,6 +352,9 @@ def on_close_clicked(_params: dict) -> None: "connections": connections, "table_group": table_group.to_dict(json_safe=True), "is_in_use": is_table_group_used, + "permissions": { + "can_view_pii": session.auth.user_has_permission("view_pii"), + }, "table_group_preview": table_group_preview, "steps": steps, "results": { diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 1d843fae..df2858eb 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -662,7 +662,7 @@ def show_test_form( st.divider() - has_match_attributes = any(attribute.startswith("match_") for attribute in dynamic_attributes) + has_match_attributes = "match_schema_name" in dynamic_attributes or "match_table_name" in dynamic_attributes left_column, right_column = st.columns([0.5, 0.5]) if has_match_attributes else (st.container(), None) test_definition["schema_name"] = left_column.text_input( diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index aa86f081..9d4c802d 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -20,7 +20,6 @@ from testgen.common.models.test_definition import TestDefinition from testgen.common.models.test_run import TestRun from testgen.common.models.test_suite import TestSuite, TestSuiteMinimal -from testgen.common.pii_masking import get_pii_columns, mask_dataframe_pii from testgen.ui.components import widgets as testgen from testgen.ui.components.widgets.download_dialog import ( FILE_DATA_TYPE, @@ -880,11 +879,12 @@ def source_data_dialog(selected_row): st.markdown("#### Result Detail") st.caption(selected_row["result_message"].replace("*", "\\*")) + mask_pii = not session.auth.user_has_permission("view_pii") with st.spinner("Retrieving source data..."): if selected_row["test_type"] == "CUSTOM": - bad_data_status, bad_data_msg, _, df_bad = 
get_test_issue_source_data_custom(selected_row, limit=500) + bad_data_status, bad_data_msg, _, df_bad = get_test_issue_source_data_custom(selected_row, limit=500, mask_pii=mask_pii) else: - bad_data_status, bad_data_msg, _, df_bad = get_test_issue_source_data(selected_row, limit=500) + bad_data_status, bad_data_msg, _, df_bad = get_test_issue_source_data(selected_row, limit=500, mask_pii=mask_pii) if bad_data_status in {"ND", "NA"}: st.info(bad_data_msg) elif bad_data_status == "ERR": @@ -894,12 +894,6 @@ def source_data_dialog(selected_row): else: if bad_data_msg: st.info(bad_data_msg) - if not session.auth.user_has_permission("view_pii"): - pii_columns = get_pii_columns( - selected_row["table_groups_id"], - table_name=selected_row["table_name"], - ) - mask_dataframe_pii(df_bad, pii_columns) # Pretify the dataframe df_bad.columns = [col.replace("_", " ").title() for col in df_bad.columns] df_bad.fillna("", inplace=True) diff --git a/tests/unit/common/test_pii_masking.py b/tests/unit/common/test_pii_masking.py index eb0e4b77..b336ad43 100644 --- a/tests/unit/common/test_pii_masking.py +++ b/tests/unit/common/test_pii_masking.py @@ -1,43 +1,43 @@ import pandas as pd -from testgen.common.pii_masking import PII_REDACTED, mask_dataframe_pii, mask_profiling_pii +from testgen.common.pii_masking import PII_REDACTED, mask_hygiene_detail, mask_profiling_pii, mask_source_data_pii -class Test_mask_dataframe_pii: +class Test_mask_source_data_pii: def test_masks_pii_columns(self): df = pd.DataFrame({ "name": ["Alice", "Bob"], "ssn": ["123-45-6789", "987-65-4321"], "age": [30, 25], }) - mask_dataframe_pii(df, {"ssn"}) + mask_source_data_pii(df, {"ssn"}) assert df["ssn"].tolist() == [PII_REDACTED, PII_REDACTED] assert df["age"].tolist() == [30, 25] assert df["name"].tolist() == ["Alice", "Bob"] def test_preserves_non_pii_columns(self): df = pd.DataFrame({"col_a": [1, 2], "col_b": ["x", "y"]}) - mask_dataframe_pii(df, {"col_a"}) + mask_source_data_pii(df, {"col_a"}) assert 
df["col_b"].tolist() == ["x", "y"] def test_handles_empty_dataframe(self): df = pd.DataFrame(columns=["name", "ssn"]) - mask_dataframe_pii(df, {"ssn"}) + mask_source_data_pii(df, {"ssn"}) assert df.empty def test_handles_missing_pii_column(self): df = pd.DataFrame({"col_a": [1, 2]}) - mask_dataframe_pii(df, {"nonexistent_col"}) + mask_source_data_pii(df, {"nonexistent_col"}) assert df["col_a"].tolist() == [1, 2] def test_handles_empty_pii_set(self): df = pd.DataFrame({"col_a": [1, 2]}) - mask_dataframe_pii(df, set()) + mask_source_data_pii(df, set()) assert df["col_a"].tolist() == [1, 2] def test_case_insensitive_matching(self): df = pd.DataFrame({"SSN": ["123-45-6789"], "Name": ["Alice"]}) - mask_dataframe_pii(df, {"ssn"}) + mask_source_data_pii(df, {"ssn"}) assert df["SSN"].tolist() == [PII_REDACTED] assert df["Name"].tolist() == ["Alice"] @@ -48,7 +48,7 @@ def test_multiple_pii_columns(self): "email": ["a@b.com"], "age": [30], }) - mask_dataframe_pii(df, {"ssn", "email"}) + mask_source_data_pii(df, {"ssn", "email"}) assert df["ssn"].tolist() == [PII_REDACTED] assert df["email"].tolist() == [PII_REDACTED] assert df["name"].tolist() == ["Alice"] @@ -189,3 +189,112 @@ def test_preserves_non_profiling_fields(self): assert data["top_freq_values"] == PII_REDACTED assert data["record_ct"] == 100 assert data["distinct_value_ct"] == 50 + + +class Test_mask_hygiene_detail_dataframe: + def test_masks_detail_for_pii_redactable_rows(self): + df = pd.DataFrame({ + "column_name": ["ssn", "age", "email"], + "detail": ["SSN range: 100-999", "Count: 50", "Email range: a@b - z@y"], + "detail_redactable": [True, False, True], + "pii_flag": ["A/ID/SSN", None, "B/CONTACT/Email"], + }) + mask_hygiene_detail(df) + assert df.loc[0, "detail"] == PII_REDACTED + assert df.loc[1, "detail"] == "Count: 50" + assert df.loc[2, "detail"] == PII_REDACTED + + def test_preserves_non_redactable_pii_rows(self): + df = pd.DataFrame({ + "column_name": ["ssn"], + "detail": ["Non-printing chars: 5"], + 
"detail_redactable": [False], + "pii_flag": ["A/ID/SSN"], + }) + mask_hygiene_detail(df) + assert df.loc[0, "detail"] == "Non-printing chars: 5" + + def test_preserves_redactable_non_pii_rows(self): + df = pd.DataFrame({ + "column_name": ["age"], + "detail": ["Date range: 2020-2024"], + "detail_redactable": [True], + "pii_flag": [None], + }) + mask_hygiene_detail(df) + assert df.loc[0, "detail"] == "Date range: 2020-2024" + + def test_handles_empty_dataframe(self): + df = pd.DataFrame(columns=["column_name", "detail", "detail_redactable", "pii_flag"]) + mask_hygiene_detail(df) + assert df.empty + + def test_handles_missing_detail_redactable_column(self): + df = pd.DataFrame({ + "column_name": ["ssn"], + "detail": ["some detail"], + "pii_flag": ["A/ID/SSN"], + }) + mask_hygiene_detail(df) + assert df.loc[0, "detail"] == "some detail" + + def test_handles_null_detail_redactable(self): + df = pd.DataFrame({ + "column_name": ["ssn"], + "detail": ["SSN range: 100-999"], + "detail_redactable": [None], + "pii_flag": ["A/ID/SSN"], + }) + mask_hygiene_detail(df) + assert df.loc[0, "detail"] == "SSN range: 100-999" + + +class Test_mask_hygiene_detail_list_with_pii_flag: + def test_masks_detail_when_redactable_and_pii(self): + issues = [ + {"detail": "Date range: 2020-2024", "detail_redactable": True, "pii_flag": "A/ID/SSN"}, + {"detail": "Count: 50", "detail_redactable": False, "pii_flag": "A/ID/SSN"}, + {"detail": "Min text: Alice", "detail_redactable": True, "pii_flag": None}, + ] + mask_hygiene_detail(issues) + assert issues[0]["detail"] == PII_REDACTED + assert issues[1]["detail"] == "Count: 50" + assert issues[2]["detail"] == "Min text: Alice" + + def test_handles_empty_list(self): + issues = [] + mask_hygiene_detail(issues) + assert issues == [] + + def test_handles_missing_fields(self): + issues = [{"detail": "some detail"}] + mask_hygiene_detail(issues) + assert issues[0]["detail"] == "some detail" + + +class Test_mask_hygiene_detail_list_with_pii_columns: + def 
test_masks_detail_when_column_is_pii(self): + issues = [ + {"column_name": "ssn", "detail": "Date range: 2020-2024", "detail_redactable": True}, + {"column_name": "age", "detail": "Count: 50", "detail_redactable": True}, + {"column_name": "email", "detail": "Min text: a@b", "detail_redactable": True}, + ] + mask_hygiene_detail(issues, pii_columns={"ssn", "email"}) + assert issues[0]["detail"] == PII_REDACTED + assert issues[1]["detail"] == "Count: 50" + assert issues[2]["detail"] == PII_REDACTED + + def test_case_insensitive_column_matching(self): + issues = [{"column_name": "SSN", "detail": "range: 100-999", "detail_redactable": True}] + mask_hygiene_detail(issues, pii_columns={"ssn"}) + assert issues[0]["detail"] == PII_REDACTED + + def test_empty_pii_columns_skips_masking(self): + issues = [{"column_name": "ssn", "detail": "range: 100-999", "detail_redactable": True}] + mask_hygiene_detail(issues, pii_columns=set()) + assert issues[0]["detail"] == "range: 100-999" + + def test_non_redactable_issues_preserved(self): + issues = [{"column_name": "ssn", "detail": "Non-printing: 5", "detail_redactable": False}] + mask_hygiene_detail(issues, pii_columns={"ssn"}) + assert issues[0]["detail"] == "Non-printing: 5" From 4cdb3f20c6dac7716842f7c1a4e39d48eb699fe1 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 25 Mar 2026 12:34:25 -0300 Subject: [PATCH 72/95] =?UTF-8?q?fix:=20scheduler=20shutdown=20race=20?= =?UTF-8?q?=E2=80=94=20check=20=5Fstopping=20before=20blocking=20on=20=5Fr?= =?UTF-8?q?eload=5Fevent?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _get_next_jobs() clears _reload_event in its finally block. If shutdown() sets _reload_event between get_jobs() returning and the finally block, the event is cleared after being set. The _run loop then blocks forever on _reload_event.wait() in the StopIteration handler. Guard the wait with a _stopping check so the thread exits immediately when shutting down. 
Also harden scheduler tests: replace unbounded spin loop with timeout-guarded wait, and use SKIP policy to avoid processing ~345 past daily triggers. Co-Authored-By: Claude Opus 4.6 (1M context) --- testgen/scheduler/base.py | 3 ++- tests/unit/scheduler/test_scheduler_base.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/testgen/scheduler/base.py b/testgen/scheduler/base.py index 0c0e5bc6..ef88be83 100644 --- a/testgen/scheduler/base.py +++ b/testgen/scheduler/base.py @@ -122,7 +122,8 @@ def _run(self): try: triggering_time, jobs = next(next_jobs) except StopIteration: - self._reload_event.wait() + if not self._stopping.is_set(): + self._reload_event.wait() break if self._wait_until(triggering_time): diff --git a/tests/unit/scheduler/test_scheduler_base.py b/tests/unit/scheduler/test_scheduler_base.py index ccac8374..ab0445e2 100644 --- a/tests/unit/scheduler/test_scheduler_base.py +++ b/tests/unit/scheduler/test_scheduler_base.py @@ -138,7 +138,7 @@ def wait_for_call_count(mock, expected_count, timeout=0.5): @pytest.mark.parametrize("with_job", (True, False)) def test_reloads_and_shutdowns_immediately(with_job, scheduler_instance, base_time): - jobs = [Job(cron_expr="0 0 * * *", cron_tz="UTC", delayed_policy=DelayedPolicy.ALL)] if with_job else [] + jobs = [Job(cron_expr="0 0 * * *", cron_tz="UTC", delayed_policy=DelayedPolicy.SKIP)] if with_job else [] scheduler_instance.get_jobs.return_value = jobs scheduler_instance.start(base_time) @@ -169,8 +169,8 @@ def test_job_start_is_called(start_side_effect, scheduler_instance, base_time, n scheduler_instance.start(base_time) for multiplier in (1, 2): - while scheduler_instance.start_job.call_count != 6 * multiplier: - time.sleep(0.01) + assert wait_for_call_count(scheduler_instance.start_job, 6 * multiplier, timeout=5.0), \ + f"start_job call_count={scheduler_instance.start_job.call_count}, expected {6 * multiplier}" assert scheduler_instance.get_jobs.call_count == multiplier assert 
get_next_mock.call_count == multiplier From 592ce63c2ae082f2063c6f69db05e86aebe7977e Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 25 Mar 2026 16:20:30 -0400 Subject: [PATCH 73/95] fix: database urls detected as emails --- .../flavors/bigquery/profiling/project_profiling_query.sql | 1 + .../flavors/databricks/profiling/project_profiling_query.sql | 3 ++- .../flavors/mssql/profiling/project_profiling_query.sql | 1 + .../flavors/oracle/profiling/project_profiling_query.sql | 1 + .../flavors/redshift/profiling/project_profiling_query.sql | 1 + .../redshift_spectrum/profiling/project_profiling_query.sql | 1 + .../flavors/sap_hana/profiling/project_profiling_query.sql | 1 + .../flavors/snowflake/profiling/project_profiling_query.sql | 1 + 8 files changed, 9 insertions(+), 1 deletion(-) diff --git a/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql b/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql index 34eb1c24..444437c3 100644 --- a/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/bigquery/profiling/project_profiling_query.sql @@ -99,6 +99,7 @@ SELECT WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') THEN 1 END), COUNT(`{COL_NAME}`)) > 0.8 THEN 'PHONE_USA' WHEN SAFE_DIVIDE(SUM(CASE WHEN REGEXP_CONTAINS(`{COL_NAME}`, r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') + AND `{COL_NAME}` NOT LIKE '%://%' THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'EMAIL' WHEN SAFE_DIVIDE(SUM(CASE WHEN TRANSLATE(`{COL_NAME}`, '012345678', '999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END), COUNT(`{COL_NAME}`)) > 0.9 THEN 'ZIP_USA' diff --git a/testgen/template/flavors/databricks/profiling/project_profiling_query.sql b/testgen/template/flavors/databricks/profiling/project_profiling_query.sql index b72090a7..0ffe73cc 100644 --- 
a/testgen/template/flavors/databricks/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/databricks/profiling/project_profiling_query.sql @@ -100,7 +100,8 @@ SELECT THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'STATE_USA' WHEN CAST(SUM( CASE WHEN `{COL_NAME}` RLIKE '\\+1\\s*\\(?\\d{3}\\)?[-. ]*\\d{3}[-. ]*\\d{4}' THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'PHONE_USA' - WHEN CAST(SUM( CASE WHEN `{COL_NAME}` RLIKE '[_a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+.[a-zA-Z][a-zA-Z]+' + WHEN CAST(SUM( CASE WHEN `{COL_NAME}` RLIKE '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' + AND `{COL_NAME}` NOT LIKE '%://%' THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'EMAIL' WHEN CAST(SUM( CASE WHEN TRANSLATE(`{COL_NAME}`,'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END) AS FLOAT)/CAST(COUNT(`{COL_NAME}`) AS FLOAT) > 0.9 THEN 'ZIP_USA' diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query.sql b/testgen/template/flavors/mssql/profiling/project_profiling_query.sql index fbcfb57e..b1313712 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query.sql @@ -110,6 +110,7 @@ SELECT OR ("{COL_NAME}" LIKE '[+]1%[0-9][0-9][0-9][-. ][0-9][0-9][0-9][-. 
][0-9][0-9][0-9][0-9]' AND "{COL_NAME}" NOT LIKE '%[^0-9+-]%') THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'PHONE_USA' WHEN CAST(SUM(CASE WHEN "{COL_NAME}" LIKE '%[_a-zA-Z0-9.-]%@%[a-zA-Z0-9.-]%.[a-zA-Z][a-zA-Z]%' + AND "{COL_NAME}" NOT LIKE '%://%' THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'EMAIL' WHEN CAST(SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END) AS FLOAT)/CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'ZIP_USA' diff --git a/testgen/template/flavors/oracle/profiling/project_profiling_query.sql b/testgen/template/flavors/oracle/profiling/project_profiling_query.sql index cc93718c..b27953b9 100644 --- a/testgen/template/flavors/oracle/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/oracle/profiling/project_profiling_query.sql @@ -159,6 +159,7 @@ FROM ( WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^(\+1|1)?[ .-]?(\([2-9][0-9]{2}\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'PHONE_USA' WHEN SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}", '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') + AND "{COL_NAME}" NOT LIKE '%://%' THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'EMAIL' WHEN SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'ZIP_USA' diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query.sql b/testgen/template/flavors/redshift/profiling/project_profiling_query.sql index d054e40e..53774bb0 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query.sql @@ -70,6 +70,7 @@ SELECT WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' THEN 1 
END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'PHONE_USA' WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' + AND "{COL_NAME}" NOT LIKE '%://%' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' WHEN SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' diff --git a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql index d054e40e..53774bb0 100644 --- a/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/redshift_spectrum/profiling/project_profiling_query.sql @@ -70,6 +70,7 @@ SELECT WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.8 THEN 'PHONE_USA' WHEN SUM(CASE WHEN "{COL_NAME}" ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$' + AND "{COL_NAME}" NOT LIKE '%://%' THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'EMAIL' WHEN SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END)::FLOAT/COUNT("{COL_NAME}")::FLOAT > 0.9 THEN 'ZIP_USA' diff --git a/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql b/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql index 82e5ce85..e80b0374 100644 --- a/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/sap_hana/profiling/project_profiling_query.sql @@ -87,6 +87,7 @@ SELECT WHEN SUM(CASE WHEN "{COL_NAME}" LIKE_REGEXPR '^(\+1|1)?[ .-]?(\([2-9][0-9]{2}\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$' THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.8 THEN 'PHONE_USA' WHEN SUM(CASE WHEN "{COL_NAME}" 
LIKE_REGEXPR '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' + AND "{COL_NAME}" NOT LIKE '%://%' THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'EMAIL' WHEN SUM(CASE WHEN REPLACE_REGEXPR('[0-9]' IN "{COL_NAME}" WITH '9') IN ('99999', '999999999', '99999-9999') THEN 1 ELSE 0 END) / NULLIF(COUNT("{COL_NAME}"), 0) > 0.9 THEN 'ZIP_USA' diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql b/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql index 6874d062..35adb40f 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query.sql @@ -79,6 +79,7 @@ SELECT WHEN CAST(SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^(\\+1|1)?[ .-]?(\\([2-9][0-9]{2}\\)|[2-9][0-9]{2})[ .-]?[2-9][0-9]{2}[ .-]?[0-9]{4}$') THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.8 THEN 'PHONE_USA' WHEN CAST(SUM(CASE WHEN REGEXP_LIKE("{COL_NAME}"::VARCHAR, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$') + AND "{COL_NAME}"::VARCHAR NOT LIKE '%://%' THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'EMAIL' WHEN CAST(SUM(CASE WHEN TRANSLATE("{COL_NAME}",'012345678','999999999') IN ('99999', '999999999', '99999-9999') THEN 1 END) AS FLOAT) / CAST(COUNT("{COL_NAME}") AS FLOAT) > 0.9 THEN 'ZIP_USA' From 39c82cfa0eccaf5b6483d00a8e7122154b5b633b Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 25 Mar 2026 16:20:56 -0400 Subject: [PATCH 74/95] fix(data catalog): improve flag styling --- .../js/data_profiling/metadata_tags.js | 92 +++++++++++++------ .../frontend/js/pages/data_catalog.js | 2 +- testgen/ui/views/data_catalog.py | 4 +- 3 files changed, 67 insertions(+), 31 deletions(-) diff --git a/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js b/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js index cfabba45..88554474 100644 --- 
a/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js +++ b/testgen/ui/components/frontend/js/data_profiling/metadata_tags.js @@ -15,7 +15,7 @@ import { Attribute } from '../components/attribute.js'; import { Input } from '../components/input.js'; import { Icon } from '../components/icon.js'; import { withTooltip } from '../components/tooltip.js'; -import { emitEvent } from '../utils.js'; +import { emitEvent, loadStylesheet } from '../utils.js'; import { RadioGroup } from '../components/radio_group.js'; import { Checkbox } from '../components/checkbox.js'; import { capitalize } from '../display_utils.js'; @@ -26,7 +26,7 @@ import { Alert } from '../components/alert.js'; const { div, span } = van.tags; -const attributeWidth = 300; +const attributeWidth = 250; const descriptionWidth = 932; const multiEditWidth = 400; @@ -79,10 +79,12 @@ const TAG_HELP = { * @returns */ const MetadataTagsCard = (props, item) => { + loadStylesheet('metadata-tags', stylesheet); + const title = `${item.type} Tags `; const attributes = [ 'critical_data_element', - ...(item.type === 'column' ? ['excluded_data_element', 'pii_flag'] : []), + ...(item.type === 'column' ? ['pii_flag', 'excluded_data_element'] : []), 'description', ...TAG_KEYS, ].map(key => { @@ -235,33 +237,40 @@ const InheritedIcon = (/** @type string */ inheritedFrom) => withTooltip( * @returns */ const CdeDisplay = (value, isColumn, isInherited) => { + if (value) { + return div( + { style: `width: ${attributeWidth}px` }, + span( + { class: 'flex-row fx-gap-1 metadata-badge cde' }, + Icon({ size: 24, classes: 'text-purple' }, 'star'), + span(isColumn ? 'Critical data element' : 'All critical data elements'), + (isColumn && isInherited) ? InheritedIcon('table') : null, + ), + ); + } return span( { class: 'flex-row fx-gap-1', style: `width: ${attributeWidth}px` }, - Icon( - { size: value ? 24 : 20, classes: value ? 'text-purple' : 'text-disabled' }, - value ? 'star' : 'cancel', - ), - span( - { class: value ? 
'' : 'text-secondary' }, - isColumn - ? (value ? 'Critical data element' : 'Not a critical data element') - : (value ? 'All critical data elements' : 'Not all critical data elements'), - ), + Icon({ size: 20, classes: 'text-disabled' }, 'cancel'), + span({ class: 'text-secondary' }, isColumn ? 'Not a critical data element' : 'Not all critical data elements'), (isColumn && isInherited) ? InheritedIcon('table') : null, ); } const XdeDisplay = (/** @type boolean */ value) => { + if (value) { + return div( + { style: `width: ${attributeWidth}px` }, + span( + { class: 'flex-row fx-gap-1 metadata-badge xde' }, + Icon({ size: 20, classes: 'text-brown' }, 'visibility_off'), + span('Excluded data element'), + ), + ); + } return span( { class: 'flex-row fx-gap-1', style: `width: ${attributeWidth}px` }, - Icon( - { size: 20, classes: value ? 'text-brown' : 'text-disabled' }, - value ? 'visibility_off' : 'visibility', - ), - span( - { class: value ? '' : 'text-secondary' }, - value ? 'Excluded data element' : 'Not an excluded data element', - ), + Icon({ size: 20, classes: 'text-disabled' }, 'visibility'), + span({ class: 'text-secondary' }, 'Not an excluded data element'), ); } @@ -273,13 +282,13 @@ const PiiDisplay = (/** @type string|null */ value) => { const typeLabel = pii_type_map[type]; caption = `${pii_risk_map[risk] ?? 'Moderate'} Risk${typeLabel ? ' - ' + typeLabel : ''}${detail && detail !== typeLabel ? ' / ' + detail : ''}`; } - return span( - { class: 'flex-row fx-gap-1', style: `width: ${attributeWidth}px` }, - Icon({ size: 24, classes: 'text-orange' }, 'shield_person'), - div( - { class: 'flex-column fx-gap-1' }, + return div( + { style: `width: ${attributeWidth}px` }, + span( + { class: 'flex-row fx-gap-1 metadata-badge pii' }, + Icon({ size: 21, classes: 'text-orange' }, 'shield_person'), span('PII data'), - caption ? span({ class: 'text-caption' }, caption) : null, + caption ? 
withTooltip(Icon({ size: 16 }, 'help'), { text: caption }) : null, ), ); } @@ -300,8 +309,8 @@ const MetadataTagsMultiEdit = (props, selectedItems) => { const attributes = [ 'critical_data_element', - 'excluded_data_element', 'pii_flag', + 'excluded_data_element', ...TAG_KEYS, ].map(key => ({ key, @@ -451,4 +460,31 @@ const WarningDialog = (open, pendingAction, warnCde, warnPii) => { ); }; +const stylesheet = new CSSStyleSheet(); +stylesheet.replace(` +.metadata-badge { + display: inline-flex; + padding: 4px 12px 4px 6px; + border-radius: 15px; + height: 30px; + box-sizing: border-box; +} + +.metadata-badge.cde { + background-color: rgba(171, 71, 188, 0.15); +} + +.metadata-badge.cde i { + margin-top: -3px; +} + +.metadata-badge.pii { + background-color: rgba(255, 152, 0, 0.15); +} + +.metadata-badge.xde { + background-color: rgba(141, 110, 99, 0.15); +} +`); + export { MetadataTagsCard, MetadataTagsMultiEdit, TAG_KEYS }; diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index 32e01a7e..14979a9b 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -379,7 +379,7 @@ const ExportOptions = (/** @type TreeNode[] */ treeNodes, /** @type SelectedNode icon: 'download', type: 'stroked', label: 'Export', - tooltip: 'Download columns to Excel', + tooltip: 'Download columns to Excel or CSV', tooltipPosition: 'left', width: 'fit-content', style: 'background: var(--button-generic-background-color);', diff --git a/testgen/ui/views/data_catalog.py b/testgen/ui/views/data_catalog.py index a3e5385c..d89a4680 100644 --- a/testgen/ui/views/data_catalog.py +++ b/testgen/ui/views/data_catalog.py @@ -266,8 +266,8 @@ def get_excel_report_data( "table_name": {"header": "Table"}, "column_name": {"header": "Column"}, "critical_data_element": {"header": "Critical data element (CDE)"}, - "excluded_data_element": {"header": "Excluded data 
element (XDE)"}, "pii_flag": {"header": "PII"}, + "excluded_data_element": {"header": "Excluded data element (XDE)"}, "active_test_count": {"header": "Active tests"}, "ordinal_position": {"header": "Position"}, "general_type": {}, @@ -488,8 +488,8 @@ def _get_csv_data(update_progress: PROGRESS_UPDATE_TYPE) -> FILE_DATA_TYPE: "Column": row["column_name"], "Description": row["description"] or "", "Critical Data Element": "Yes" if row["critical_data_element"] is True else "No" if row["critical_data_element"] is False else "", - "Excluded Data Element": "Yes" if row.get("excluded_data_element") else "No", "PII": "Yes" if row.get("pii_flag") else "No", + "Excluded Data Element": "Yes" if row.get("excluded_data_element") else "No", } for tag in TAG_FIELDS: header = tag.replace("_", " ").title() From 8d3667b561a5913b718c693ee6fd81f19ee2536c Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 25 Mar 2026 17:30:36 -0400 Subject: [PATCH 75/95] fix: monitor generation fails to find test suite --- testgen/ui/views/connections.py | 4 +++- testgen/ui/views/monitors_dashboard.py | 4 +++- testgen/ui/views/table_groups.py | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index d3c522f5..4fb658f7 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -19,7 +19,7 @@ from testgen.commands.run_profiling import run_profiling_in_background from testgen.common.database.database_service import empty_cache, get_flavor_service from testgen.common.database.flavor.flavor_service import resolve_connection_params -from testgen.common.models import with_database_session +from testgen.common.models import get_current_session, with_database_session from testgen.common.models.connection import Connection, ConnectionMinimal from testgen.common.models.scheduler import RUN_MONITORS_JOB_KEY, RUN_TESTS_JOB_KEY, JobSchedule from testgen.common.models.table_group import TableGroup @@ 
-441,6 +441,8 @@ def on_close_clicked(_params: dict) -> None: predict_holiday_codes=monitor_test_suite_data.get("predict_holiday_codes") or None, ) monitor_test_suite.save() + # Commit needed to make test suite visible to run_monitor_generation's separate DB connection + get_current_session().commit() run_monitor_generation(monitor_test_suite.id, ["Volume_Trend", "Schema_Drift"]) JobSchedule( diff --git a/testgen/ui/views/monitors_dashboard.py b/testgen/ui/views/monitors_dashboard.py index 4086f294..40661e4c 100644 --- a/testgen/ui/views/monitors_dashboard.py +++ b/testgen/ui/views/monitors_dashboard.py @@ -8,7 +8,7 @@ from testgen.commands.test_generation import run_monitor_generation from testgen.common.freshness_service import add_business_minutes, get_schedule_params, resolve_holiday_dates -from testgen.common.models import with_database_session +from testgen.common.models import get_current_session, with_database_session from testgen.common.models.notification_settings import ( MonitorNotificationSettings, MonitorNotificationTrigger, @@ -561,6 +561,8 @@ def on_save_settings_clicked(payload: dict) -> None: monitors: list[str] = ["Volume_Trend", "Schema_Drift"] if updated_table_group.last_complete_profile_run_id: monitors.append("Freshness_Trend") + # Commit needed to make test suite visible to run_monitor_generation's separate DB connection + get_current_session().commit() run_monitor_generation(monitor_suite.id, monitors) safe_rerun() diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index ab15c178..79468104 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -9,7 +9,7 @@ from testgen.commands.run_profiling import run_profiling_in_background from testgen.commands.test_generation import run_monitor_generation -from testgen.common.models import with_database_session +from testgen.common.models import get_current_session, with_database_session from testgen.common.models.connection import 
Connection from testgen.common.models.project import Project from testgen.common.models.scheduler import RUN_MONITORS_JOB_KEY, RUN_TESTS_JOB_KEY, JobSchedule @@ -309,6 +309,8 @@ def on_close_clicked(_params: dict) -> None: predict_holiday_codes=monitor_test_suite_data.get("predict_holiday_codes") or None, ) monitor_test_suite.save() + # Commit needed to make test suite visible to run_monitor_generation's separate DB connection + get_current_session().commit() run_monitor_generation(monitor_test_suite.id, ["Volume_Trend", "Schema_Drift"]) JobSchedule( From 6c82a062cfbe68d5861ad2d7829dcd179ac0ca69 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Wed, 25 Mar 2026 18:30:29 -0400 Subject: [PATCH 76/95] fix: edge case in column history dialog --- testgen/ui/views/dialogs/column_history_dialog.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/testgen/ui/views/dialogs/column_history_dialog.py b/testgen/ui/views/dialogs/column_history_dialog.py index 6d2dc2ec..a82282a1 100644 --- a/testgen/ui/views/dialogs/column_history_dialog.py +++ b/testgen/ui/views/dialogs/column_history_dialog.py @@ -39,6 +39,13 @@ def _column_history_dialog( ProfilingRun.profiling_starttime >= func.to_timestamp(add_date), ) profiling_runs = [run.to_dict(json_safe=True) for run in profiling_runs] + + if not profiling_runs: + st.info("No profiling runs are available for this column. 
Run profiling first to see column history.") + return + + with loading_column: + with st.spinner("Loading data ..."): run_id = st.session_state.get("column_history_dialog:run_id") or profiling_runs[0]["id"] selected_item = get_run_column(run_id, schema_name, table_name, column_name) From 7f105d5537484b6dd130b48cd816787f09cf12bb Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 26 Mar 2026 01:21:20 -0400 Subject: [PATCH 77/95] fix(schedules dialog): bug in pausing/deleting --- testgen/ui/views/dialogs/manage_schedules.py | 1 + 1 file changed, 1 insertion(+) diff --git a/testgen/ui/views/dialogs/manage_schedules.py b/testgen/ui/views/dialogs/manage_schedules.py index cb743dd7..c2e459c6 100644 --- a/testgen/ui/views/dialogs/manage_schedules.py +++ b/testgen/ui/views/dialogs/manage_schedules.py @@ -41,6 +41,7 @@ def open(self, project_code: str) -> None: self.init() return st.dialog(title=self.title)(self.render)() + @with_database_session def render(self) -> None: @with_database_session def on_delete_sched(item): From 2ead7ac91f5e09ba1f6ecbccdfa49e659fb9c6c0 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 26 Mar 2026 01:52:42 -0400 Subject: [PATCH 78/95] fix(run tests): hide button in dialog after clicking link --- testgen/ui/views/dialogs/run_tests_dialog.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/testgen/ui/views/dialogs/run_tests_dialog.py b/testgen/ui/views/dialogs/run_tests_dialog.py index 7908f90c..35798819 100644 --- a/testgen/ui/views/dialogs/run_tests_dialog.py +++ b/testgen/ui/views/dialogs/run_tests_dialog.py @@ -53,11 +53,13 @@ def run_tests_dialog(project_code: str, test_suite: TestSuiteMinimal | None = No button_container = st.empty() status_container = st.empty() + link_clicked = st.session_state.get(LINK_KEY) run_test_button = None - with button_container: - _, button_column = st.columns([.8, .2]) - with button_column: - run_test_button = st.button("Run Tests", use_container_width=True, 
disabled=not test_suite_id) + if not link_clicked: + with button_container: + _, button_column = st.columns([.8, .2]) + with button_column: + run_test_button = st.button("Run Tests", use_container_width=True, disabled=not test_suite_id) if run_test_button: button_container.empty() @@ -69,7 +71,7 @@ def run_tests_dialog(project_code: str, test_suite: TestSuiteMinimal | None = No status_container.error(f"Test run encountered errors: {e!s}.") # The second condition is needed for the link to work - if run_test_button or st.session_state.get(LINK_KEY): + if run_test_button or link_clicked: with status_container.container(): st.success( f"Test run started for test suite **{test_suite_name}**." From 62e1b4c0dba4dcbc743e8edeae1c29733fcb7c39 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 26 Mar 2026 02:03:02 -0400 Subject: [PATCH 79/95] fix(table group): remove stepper from edit dialog --- .../frontend/js/pages/table_group_wizard.js | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/testgen/ui/components/frontend/js/pages/table_group_wizard.js b/testgen/ui/components/frontend/js/pages/table_group_wizard.js index 61bac1b8..e3807b9f 100644 --- a/testgen/ui/components/frontend/js/pages/table_group_wizard.js +++ b/testgen/ui/components/frontend/js/pages/table_group_wizard.js @@ -146,33 +146,36 @@ const TableGroupWizard = (props) => { return ''; } - return WizardProgressIndicator( - [ + const allIndicators = [ { - index: 1, title: 'Table Group', skipped: false, includedSteps: ['tableGroup', 'testTableGroup'], }, { - index: 2, title: 'Profiling', skipped: !stepsState.runProfiling.rawVal, includedSteps: ['runProfiling'], }, { - index: 3, title: 'Testing', skipped: !stepsState.testSuite.rawVal.generate, includedSteps: ['testSuite'], }, { - index: 4, title: 'Monitors', skipped: !stepsState.monitorSuite.rawVal.generate, includedSteps: ['monitorSuite'], }, - ], + ].filter(indicator => indicator.includedSteps.some(s => steps.includes(s))) + 
.map((indicator, i) => ({ ...indicator, index: i + 1 })); + + if (allIndicators.length <= 1) { + return ''; + } + + return WizardProgressIndicator( + allIndicators, { index: stepIndex, name: steps[stepIndex], From ffb41a95313c364dbc70b2e1406f0e663b362517 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 26 Mar 2026 02:09:59 -0400 Subject: [PATCH 80/95] feat(test suites): add search filter --- testgen/common/models/test_suite.py | 5 +- .../frontend/js/pages/test_suites.js | 112 +++++++++++------- testgen/ui/views/test_suites.py | 9 +- 3 files changed, 78 insertions(+), 48 deletions(-) diff --git a/testgen/common/models/test_suite.py b/testgen/common/models/test_suite.py index 18e29cd3..229094a6 100644 --- a/testgen/common/models/test_suite.py +++ b/testgen/common/models/test_suite.py @@ -101,7 +101,7 @@ def select_minimal_where( @classmethod @st.cache_data(show_spinner=False) - def select_summary(cls, project_code: str, table_group_id: str | UUID | None = None) -> Iterable[TestSuiteSummary]: + def select_summary(cls, project_code: str, table_group_id: str | UUID | None = None, test_suite_name: str | None = None) -> Iterable[TestSuiteSummary]: if table_group_id and not is_uuid4(table_group_id): return [] @@ -199,9 +199,10 @@ def select_summary(cls, project_code: str, table_group_id: str | UUID | None = N WHERE suites.is_monitor IS NOT TRUE AND suites.project_code = :project_code {"AND suites.table_groups_id = :table_group_id" if table_group_id else ""} + {"AND suites.test_suite ILIKE :test_suite_name" if test_suite_name else ""} ORDER BY LOWER(suites.test_suite); """ - params = {"project_code": project_code, "table_group_id": table_group_id} + params = {"project_code": project_code, "table_group_id": table_group_id, "test_suite_name": f"%{test_suite_name}%" if test_suite_name else None} db_session = get_current_session() results = db_session.execute(text(query), params).mappings().all() return [TestSuiteSummary(**row) for row in results] diff --git 
a/testgen/ui/components/frontend/js/pages/test_suites.js b/testgen/ui/components/frontend/js/pages/test_suites.js index 9fdd40f2..abd95965 100644 --- a/testgen/ui/components/frontend/js/pages/test_suites.js +++ b/testgen/ui/components/frontend/js/pages/test_suites.js @@ -11,12 +11,14 @@ * @property {ProjectSummary} project_summary * @property {TestSuiteSummary} test_suites * @property {FilterOption[]} table_group_filter_options + * @property {string?} test_suite_name * @property {Permissions} permissions */ import van from '../van.min.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getValue, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; import { formatTimestamp, DISABLED_ACTION_TEXT } from '../display_utils.js'; +import { Input } from '../components/input.js'; import { Select } from '../components/select.js'; import { Button } from '../components/button.js'; import { Card } from '../components/card.js'; @@ -46,51 +48,77 @@ const TestSuites = (/** @type Properties */ props) => { return projectSummary.test_suite_count > 0 ? div( { class: 'tg-test-suites'}, - () => div( - { class: 'flex-row fx-align-flex-end fx-justify-space-between fx-gap-4 mb-4' }, - Select({ - label: 'Table Group', - value: getValue(props.table_group_filter_options)?.find((op) => op.selected)?.value ?? null, - options: getValue(props.table_group_filter_options) ?? 
[], - allowNull: true, - style: 'font-size: 14px;', - testId: 'table-group-filter', - onChange: (value) => emitEvent('FilterApplied', {payload: value}), - }), - div( - { class: 'flex-row fx-gap-3' }, - Button({ - icon: 'notifications', - type: 'stroked', - label: 'Notifications', - tooltip: 'Configure email notifications for test runs', - tooltipPosition: 'bottom', - width: 'fit-content', - style: 'background: var(--button-generic-background-color);', - onclick: () => emitEvent('RunNotificationsClicked', {}), - }), - Button({ - icon: 'today', - type: 'stroked', - label: 'Schedules', - tooltip: 'Manage when test suites should run', - tooltipPosition: 'bottom', - width: 'fit-content', - style: 'background: var(--button-generic-background-color);', - onclick: () => emitEvent('RunSchedulesClicked', {}), - }), - userCanEdit - ? Button({ - icon: 'add', + () => { + const initialTableGroup = getValue(props.table_group_filter_options)?.find((op) => op.selected)?.value ?? null; + const initialTestSuiteName = getValue(props.test_suite_name) || null; + const selectedTableGroup = van.state(initialTableGroup); + const testSuiteNameFilter = van.state(initialTestSuiteName); + + van.derive(() => { + if (selectedTableGroup.val !== initialTableGroup || testSuiteNameFilter.val !== initialTestSuiteName) { + emitEvent('FilterApplied', { payload: { table_group_id: selectedTableGroup.val, test_suite_name: testSuiteNameFilter.val } }); + } + }); + + return div( + { class: 'flex-row fx-align-flex-end fx-justify-space-between fx-gap-4 fx-flex-wrap mb-4' }, + div( + { class: 'flex-row fx-align-flex-end fx-gap-3' }, + Select({ + label: 'Table Group', + value: selectedTableGroup, + options: getValue(props.table_group_filter_options) ?? 
[], + allowNull: true, + style: 'font-size: 14px;', + testId: 'table-group-filter', + onChange: (value) => selectedTableGroup.val = value, + }), + Input({ + testId: 'test-suite-name-filter', + icon: 'search', + label: '', + placeholder: 'Search test suite names', + width: 300, + clearable: true, + value: testSuiteNameFilter, + onChange: (value) => testSuiteNameFilter.val = value || null, + }), + ), + div( + { class: 'flex-row fx-gap-3' }, + Button({ + icon: 'notifications', type: 'stroked', - label: 'Add Test Suite', + label: 'Notifications', + tooltip: 'Configure email notifications for test runs', + tooltipPosition: 'bottom', width: 'fit-content', style: 'background: var(--button-generic-background-color);', - onclick: () => emitEvent('AddTestSuiteClicked', {}), - }) - : '', - ), - ), + onclick: () => emitEvent('RunNotificationsClicked', {}), + }), + Button({ + icon: 'today', + type: 'stroked', + label: 'Schedules', + tooltip: 'Manage when test suites should run', + tooltipPosition: 'bottom', + width: 'fit-content', + style: 'background: var(--button-generic-background-color);', + onclick: () => emitEvent('RunSchedulesClicked', {}), + }), + userCanEdit + ? Button({ + icon: 'add', + type: 'stroked', + label: 'Add Test Suite', + width: 'fit-content', + style: 'background: var(--button-generic-background-color);', + onclick: () => emitEvent('AddTestSuiteClicked', {}), + }) + : '', + ), + ); + }, () => getValue(testSuites)?.length ? 
div( { class: 'flex-column' }, diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index f0dc5abd..a6d31834 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -38,7 +38,7 @@ class TestSuitesPage(Page): order=2, ) - def render(self, project_code: str, table_group_id: str | None = None, **_kwargs) -> None: + def render(self, project_code: str, table_group_id: str | None = None, test_suite_name: str | None = None, **_kwargs) -> None: testgen.page_header( PAGE_TITLE, "connect-your-database/manage-test-suites/", @@ -46,7 +46,7 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs table_groups = TableGroup.select_minimal_where(TableGroup.project_code == project_code) user_can_edit = session.auth.user_has_permission("edit") - test_suites = TestSuite.select_summary(project_code, table_group_id) + test_suites = TestSuite.select_summary(project_code, table_group_id, test_suite_name) project_summary = Project.get_summary(project_code) testgen.testgen_component( @@ -61,6 +61,7 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs "selected": str(table_group_id) == str(table_group.id), } for table_group in table_groups ], + "test_suite_name": test_suite_name, "permissions": { "can_edit": user_can_edit, } @@ -79,8 +80,8 @@ def render(self, project_code: str, table_group_id: str | None = None, **_kwargs ) -def on_test_suites_filtered(table_group_id: str | None = None) -> None: - Router().set_query_params({ "table_group_id": table_group_id }) +def on_test_suites_filtered(params: dict) -> None: + Router().set_query_params(params) @st.dialog(title="Add Test Suite") From 10195263f04e40d91e384a667284b2ea24efe93d Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 25 Mar 2026 09:56:21 -0300 Subject: [PATCH 81/95] fix: truncate timestamps to date in Daily_Record_Ct measure formula COUNT(DISTINCT col) on timestamp columns counts unique timestamps, not unique dates, 
producing negative measures (e.g., -1575 instead of 240). Add CAST(col AS DATE) for 7 flavors that were missing date truncation: postgresql, snowflake, databricks, redshift, redshift_spectrum, mssql, trino. BigQuery, Oracle, and SAP HANA already truncated correctly. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../test_types_Daily_Record_Ct.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml index fea86ca0..c57a32d1 100644 --- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -52,7 +52,7 @@ test_types: test_type: Daily_Record_Ct sql_flavor: databricks measure: |- - <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME}) + <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT CAST({COLUMN_NAME} AS DATE)) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -60,7 +60,7 @@ test_types: test_type: Daily_Record_Ct sql_flavor: mssql measure: |- - DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT CAST({COLUMN_NAME} AS DATE)) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -68,7 +68,7 @@ test_types: test_type: Daily_Record_Ct sql_flavor: postgresql measure: |- - <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT {COLUMN_NAME}) + <%DATEDIFF_DAY;MIN({COLUMN_NAME});MAX({COLUMN_NAME})%>+1-COUNT(DISTINCT CAST({COLUMN_NAME} AS DATE)) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -76,7 +76,7 @@ test_types: test_type: Daily_Record_Ct sql_flavor: redshift measure: |- - DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + DATEDIFF('DAY', MIN({COLUMN_NAME}), 
MAX({COLUMN_NAME}))+1-COUNT(DISTINCT CAST({COLUMN_NAME} AS DATE)) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -84,7 +84,7 @@ test_types: test_type: Daily_Record_Ct sql_flavor: redshift_spectrum measure: |- - DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + DATEDIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT CAST({COLUMN_NAME} AS DATE)) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -92,7 +92,7 @@ test_types: test_type: Daily_Record_Ct sql_flavor: snowflake measure: |- - DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + DATEDIFF(day, MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT CAST({COLUMN_NAME} AS DATE)) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} @@ -100,7 +100,7 @@ test_types: test_type: Daily_Record_Ct sql_flavor: trino measure: |- - DATE_DIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT {COLUMN_NAME}) + DATE_DIFF('DAY', MIN({COLUMN_NAME}), MAX({COLUMN_NAME}))+1-COUNT(DISTINCT CAST({COLUMN_NAME} AS DATE)) test_operator: '>' test_condition: |- {THRESHOLD_VALUE} From 25636cafa1ff98ce82ed7eb2caa03e02f9e9b5d2 Mon Sep 17 00:00:00 2001 From: Ricardo Boni Date: Wed, 25 Mar 2026 13:06:59 -0300 Subject: [PATCH 82/95] fix: cast timestamps to date in Daily_Record_Ct source data lookup (Databricks) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit date_bounds CTE used raw MIN/MAX without CAST, so SEQUENCE generated timestamps that never matched the date-typed existing_periods — all dates appeared missing for timestamp columns. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml index c57a32d1..eeb64f32 100644 --- a/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml +++ b/testgen/template/dbsetup_test_types/test_types_Daily_Record_Ct.yaml @@ -166,7 +166,7 @@ test_types: sql_flavor: databricks lookup_type: null lookup_query: |- - WITH date_bounds AS( SELECT MIN(`{COLUMN_NAME}`) AS min_date, MAX(`{COLUMN_NAME}`) AS max_date FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT {LIMIT}; + WITH date_bounds AS( SELECT CAST(MIN(`{COLUMN_NAME}`) AS DATE) AS min_date, CAST(MAX(`{COLUMN_NAME}`) AS DATE) AS max_date FROM `{TARGET_SCHEMA}`.`{TABLE_NAME}`), all_dates AS ( SELECT EXPLODE(SEQUENCE(min_date, max_date, INTERVAL 1 DAY)) AS all_dates FROM date_bounds ), existing_periods AS ( SELECT DISTINCT CAST(`{COLUMN_NAME}` AS DATE) AS period, COUNT(1) AS period_count FROM 
`{TARGET_SCHEMA}`.`{TABLE_NAME}` GROUP BY CAST(`{COLUMN_NAME}` AS DATE) ), missing_dates AS ( SELECT d.all_dates AS missing_period FROM all_dates d LEFT JOIN existing_periods e ON d.all_dates = e.period WHERE e.period IS NULL ) SELECT m.missing_period, MAX(e1.period) AS prior_available_date, MAX(e1.period_count) AS prior_available_date_count, MIN(e2.period) AS next_available_date, MAX(e2.period_count) AS next_available_date_count FROM missing_dates m LEFT JOIN existing_periods e1 ON e1.period < m.missing_period LEFT JOIN existing_periods e2 ON e2.period > m.missing_period GROUP BY m.missing_period ORDER BY m.missing_period LIMIT {LIMIT}; error_type: Test Results - id: '1144' test_id: '1009' From f5747aa9b0958f366eb05eaf16c113a508701f1b Mon Sep 17 00:00:00 2001 From: Luis Date: Thu, 26 Mar 2026 18:17:58 -0400 Subject: [PATCH 83/95] fix(ui): portals were closing when a nested portal opened --- .../frontend/js/components/portal.js | 2 +- testgen/ui/static/js/components/portal.js | 95 ++++++++++++++----- 2 files changed, 71 insertions(+), 26 deletions(-) diff --git a/testgen/ui/components/frontend/js/components/portal.js b/testgen/ui/components/frontend/js/components/portal.js index 12fa2e70..fce86227 100644 --- a/testgen/ui/components/frontend/js/components/portal.js +++ b/testgen/ui/components/frontend/js/components/portal.js @@ -23,7 +23,7 @@ const Portal = (/** @type Options */ options, ...args) => { const { target, targetRelative, align = 'left', position = 'bottom' } = getValue(options); const id = `${target}-portal`; - window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened }; + window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened, close: () => { options.opened.val = false; } }; return () => { if (!getValue(options.opened)) { diff --git a/testgen/ui/static/js/components/portal.js b/testgen/ui/static/js/components/portal.js index de0278af..272a619a 100644 --- a/testgen/ui/static/js/components/portal.js +++ 
b/testgen/ui/static/js/components/portal.js @@ -18,25 +18,26 @@ import van from '../van.min.js'; import { getValue } from '../utils.js'; +const { div } = van.tags; + const STREAMLIT_DIALOG_ZINDEX = 1000060; const STREAMLIT_DIALOG_CLASS = 'stDialog'; const Portal = (/** @type Options */ options, ...args) => { const { target, align = 'left', position = 'bottom' } = getValue(options); const id = `${target}-portal`; - let portalEl = null; let outsideClickHandler = null; const close = () => { options.opened.val = false; }; window.testgen.portals[id] = { domId: id, targetId: target, opened: options.opened, close }; + // Side-effect derive: manages close loop and outside-click handler. + // Kept free of van.add / DOM creation to avoid corrupting VanJS dependency tracking. van.derive(() => { const isOpen = getValue(options.opened); if (!isOpen) { - portalEl?.remove(); - portalEl = null; if (outsideClickHandler) { document.removeEventListener('click', outsideClickHandler, true); outsideClickHandler = null; @@ -44,45 +45,89 @@ const Portal = (/** @type Options */ options, ...args) => { return; } - // Close other open portals before opening this one + const anchor = document.getElementById(target); + if (!anchor) return; + + // Close other open portals — skip parent portals that contain our anchor. 
+ const toClose = []; for (const p of Object.values(window.testgen.portals)) { - if (p.domId !== id && getValue(p.opened)) { - p.close(); + if (p.domId !== id && p.opened?.rawVal) { + const otherEl = document.getElementById(p.domId); + if (otherEl?.contains(anchor)) continue; + toClose.push(p); } } + if (toClose.length) { + queueMicrotask(() => toClose.forEach(p => { p.opened.val = false; })); + } + + if (!outsideClickHandler) { + outsideClickHandler = (event) => { + const anchor = document.getElementById(target); + const portalEl = document.getElementById(id); + if (portalEl?.contains(event.target)) return; + if (anchor?.contains(event.target)) return; + if (isClickInsideChildPortal(event.target, id, portalEl)) return; + close(); + }; + document.addEventListener('click', outsideClickHandler, true); + } + }); + + // DOM rendering: a VanJS binding on document.body. + // VanJS manages the element lifecycle natively — no manual createElement/remove. + van.add(document.body, () => { + if (!getValue(options.opened)) { + return ''; + } const anchor = document.getElementById(target); - if (!anchor) return; + if (!anchor) return ''; const fixed = hasFixedAncestor(anchor); const fromDialog = hasStreamlitDialogAncestor(anchor); - const zIndex = fromDialog ? (STREAMLIT_DIALOG_ZINDEX + 1) : 1001; + const parentPortalEl = getParentPortalElement(anchor, id); + const zIndex = parentPortalEl + ? (parseInt(parentPortalEl.style.zIndex) || 1001) + 1 + : fromDialog ? (STREAMLIT_DIALOG_ZINDEX + 1) : 1001; const coords = position === 'bottom' ? 
calculateBottomPosition(anchor, align, fixed) : calculateTopPosition(anchor, align, fixed); - if (!portalEl) { - portalEl = document.createElement('div'); - document.body.appendChild(portalEl); - van.add(portalEl, ...args); - - outsideClickHandler = (event) => { - const anchor = document.getElementById(target); - if (!portalEl?.contains(event.target) && !anchor?.contains(event.target)) { - close(); - } - }; - document.addEventListener('click', outsideClickHandler, true); - } - - portalEl.id = id; - portalEl.className = getValue(options.class) ?? ''; - portalEl.style.cssText = `position: ${fixed ? 'fixed' : 'absolute'}; z-index: ${zIndex}; ${coords} ${getValue(options.style) ?? ''}`; + return div( + { + id, + class: getValue(options.class) ?? '', + style: `position: ${fixed ? 'fixed' : 'absolute'}; z-index: ${zIndex}; ${coords} ${getValue(options.style) ?? ''}`, + }, + ...args, + ); }); return ''; }; +function getParentPortalElement(anchor, selfId) { + for (const p of Object.values(window.testgen.portals)) { + if (p.domId === selfId) continue; + const el = document.getElementById(p.domId); + if (el?.contains(anchor)) return el; + } + return null; +} + +function isClickInsideChildPortal(target, selfId, selfPortalEl) { + for (const p of Object.values(window.testgen.portals)) { + if (p.domId === selfId) continue; + const childEl = document.getElementById(p.domId); + if (childEl?.contains(target)) { + const childAnchor = document.getElementById(p.targetId); + if (selfPortalEl?.contains(childAnchor)) return true; + } + } + return false; +} + function hasFixedAncestor(el) { let node = el.parentElement; while (node && node !== document.body) { From 9151f517b459e3aca889f74baaa0b0267d83a823 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 26 Mar 2026 17:24:51 -0400 Subject: [PATCH 84/95] ci: disable pip cache in dockerfiles --- deploy/testgen-base.dockerfile | 4 ++-- deploy/testgen.dockerfile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/deploy/testgen-base.dockerfile b/deploy/testgen-base.dockerfile index f5b9a2bf..0a297555 100644 --- a/deploy/testgen-base.dockerfile +++ b/deploy/testgen-base.dockerfile @@ -41,7 +41,7 @@ COPY ./pyproject.toml /tmp/dk/pyproject.toml RUN mkdir /dk # Upgrading pip for security -RUN python3 -m pip install --upgrade pip==26.0 +RUN python3 -m pip install --no-cache-dir --upgrade pip==26.0 # hdbcli only ships manylinux wheels (no musl). pip 26+ correctly rejects these on Alpine. # We download the wheel for the correct arch, then extract it directly into site-packages @@ -60,7 +60,7 @@ RUN ARCH=$(uname -m) && \ # pip 26+ would fail trying to resolve it from PyPI on musl RUN sed -i '/hdbcli/d' /tmp/dk/pyproject.toml -RUN python3 -m pip install --prefix=/dk /tmp/dk +RUN python3 -m pip install --no-cache-dir --prefix=/dk /tmp/dk RUN apk del \ gcc \ diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index f40da127..d105c2ea 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -15,10 +15,10 @@ RUN apk upgrade # Now install everything (hdbcli is pre-installed in the base image via manual wheel extraction) COPY . 
/tmp/dk/ RUN sed -i '/hdbcli/d' /tmp/dk/pyproject.toml /tmp/dk/testgen/pyproject.toml 2>/dev/null; \ - python3 -m pip install --prefix=/dk /tmp/dk + python3 -m pip install --no-cache-dir --prefix=/dk /tmp/dk # Generate third-party license notices from installed packages -RUN pip install pip-licenses \ +RUN pip install --no-cache-dir pip-licenses \ && SCRIPT=$(find /tmp/dk -name generate_third_party_notices.py | head -1) \ && PYTHONPATH=/dk/lib/python3.12/site-packages python3 "$SCRIPT" --output /dk/THIRD-PARTY-NOTICES \ && pip uninstall -y pip-licenses From 2512c9edda6696e6f680749f916eb4628fde6594 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 26 Mar 2026 17:46:49 -0400 Subject: [PATCH 85/95] fix(emails): move app links to left --- testgen/common/notifications/monitor_run.py | 8 ++-- testgen/common/notifications/profiling_run.py | 37 ++++++++++--------- testgen/common/notifications/score_drop.py | 8 ++-- testgen/common/notifications/test_run.py | 28 +++++++------- 4 files changed, 45 insertions(+), 36 deletions(-) diff --git a/testgen/common/notifications/monitor_run.py b/testgen/common/notifications/monitor_run.py index c3893aad..4a7153d5 100644 --- a/testgen/common/notifications/monitor_run.py +++ b/testgen/common/notifications/monitor_run.py @@ -64,9 +64,6 @@ def get_main_content_template(self): border="0"> Anomalies Summary - - View on TestGen > -
@@ -112,6 +109,11 @@ def get_main_content_template(self): {{/if}} + + + View on TestGen > + + """ diff --git a/testgen/common/notifications/profiling_run.py b/testgen/common/notifications/profiling_run.py index c1731bac..4f4fbd39 100644 --- a/testgen/common/notifications/profiling_run.py +++ b/testgen/common/notifications/profiling_run.py @@ -50,10 +50,6 @@ def get_main_content_template(self): {{project_name}} Schema {{table_group_schema}} - - View results on TestGen > - - Table Group @@ -71,6 +67,11 @@ def get_main_content_template(self): Duration {{format_duration profiling_run.start_time profiling_run.end_time}} + + + View results on TestGen > + +
@@ -81,11 +82,6 @@ def get_main_content_template(self): border="0"> Issues Summary - {{#if (eq profiling_run.status 'Complete')}} - - View {{format_number issue_count}} issues > - - {{/if}} @@ -140,6 +136,13 @@ def get_main_content_template(self):
{{profiling_run.log_message}}
{{/if}} + {{#if (eq profiling_run.status 'Complete')}} + + + View {{format_number issue_count}} issues > + + + {{/if}}
{{#each hygiene_issues_summary}} @@ -165,11 +168,6 @@ def get_result_table_template(self): {{#if (eq priority 'High')}} text-red {{/if}} {{#if (eq priority 'Moderate')}} text-orange {{/if}} ">{{label}} - - - View {{format_number count.total}} {{label}} > - - {{#if (len issues)}} @@ -189,13 +187,18 @@ def get_result_table_template(self): {{/each}} - - + + + + View {{format_number count.total}} {{label}} > + + + {{#if truncated}} + {{truncated}} more {{/if}} - + indicates new issues diff --git a/testgen/common/notifications/score_drop.py b/testgen/common/notifications/score_drop.py index e16f4bf4..dbcaa498 100644 --- a/testgen/common/notifications/score_drop.py +++ b/testgen/common/notifications/score_drop.py @@ -45,9 +45,6 @@ def get_main_content_template(self): Project {{project_name}} - - View on TestGen > - Scorecard @@ -62,6 +59,11 @@ def get_main_content_template(self): {{/each}} + + + View on TestGen > + + - {{#if (eq test_run.status 'Complete')}} - - {{/if}} {{/if}} + {{#if (eq test_run.status 'Complete')}} + + + + {{/if}}
Results Summary - View on TestGen > -
@@ -144,6 +139,13 @@ def get_main_content_template(self):
{{test_run.log_message}}
+ View on TestGen > +
{{#each test_result_summary}} @@ -167,11 +169,6 @@ def get_result_table_template(self): {{#if (eq status 'Warning')}} text-orange {{/if}} {{#if (eq status 'Error')}} text-brown {{/if}} ">{{label}} - - - View {{format_number total}} {{label}} > - - @@ -190,13 +187,18 @@ def get_result_table_template(self): {{/each}} - - + + + + View {{format_number total}} {{label}} > + + + {{#if truncated}} + {{truncated}} more {{/if}} - + indicates new {{label}} From 14f5e8c5790dbd70510715f79cc4047e78950296 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Thu, 26 Mar 2026 19:00:19 -0400 Subject: [PATCH 86/95] feat(wizards): make steppers clickable --- .../components/wizard_progress_indicator.js | 30 +++++++++++++------ .../frontend/js/pages/table_group_wizard.js | 1 + .../components/wizard_progress_indicator.js | 30 +++++++++++++------ 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/testgen/ui/components/frontend/js/components/wizard_progress_indicator.js b/testgen/ui/components/frontend/js/components/wizard_progress_indicator.js index 88bbb789..80e35703 100644 --- a/testgen/ui/components/frontend/js/components/wizard_progress_indicator.js +++ b/testgen/ui/components/frontend/js/components/wizard_progress_indicator.js @@ -14,14 +14,15 @@ * * @param {WizardStepMeta[]} steps * @param {CurrentStep} currentStep - * @returns + * @param {function(string)?} onStepClick + * @returns */ import van from '../van.min.js'; import { colorMap } from '../display_utils.js'; const { div, i, span } = van.tags; -const WizardProgressIndicator = (steps, currentStep) => { +const WizardProgressIndicator = (steps, currentStep, onStepClick) => { const currentPhysicalIndex = steps.findIndex(s => s.includedSteps.includes(currentStep.name)); const progressWidth = van.state('0px'); @@ -50,8 +51,12 @@ const WizardProgressIndicator = (steps, currentStep) => { z-index: -4; `; - const currentStepIndicator = (title, stepIndex) => div( - { class: `flex-column fx-align-flex-center fx-gap-1 
step-icon-current`, style: 'position: relative;' }, + const currentStepIndicator = (title, stepIndex, step) => div( + { + class: `flex-column fx-align-flex-center fx-gap-1 step-icon-current`, + style: `position: relative; ${onStepClick ? 'cursor: pointer;' : ''}`, + onclick: () => onStepClick?.(step.includedSteps[0]), + }, stepIndex === 0 ? div({ style: 'position: absolute; width: 50%; height: 50%; left: 0px; background: var(--dk-dialog-background); z-index: -1;' }, '') : '', @@ -66,7 +71,10 @@ const WizardProgressIndicator = (steps, currentStep) => { ); const pendingStepIndicator = (title, stepIndex) => div( - { class: `flex-column fx-align-flex-center fx-gap-1 ${currentPhysicalIndex === stepIndex ? 'step-icon-current' : 'text-secondary'}`, style: 'position: relative;' }, + { + class: `flex-column fx-align-flex-center fx-gap-1 ${currentPhysicalIndex === stepIndex ? 'step-icon-current' : 'text-secondary'}`, + style: 'position: relative; cursor: default;', + }, stepIndex === 0 ? div({ style: 'position: absolute; width: 50%; height: 50%; left: 0px; background: var(--dk-dialog-background); z-index: -1;' }, '') : '', @@ -80,8 +88,12 @@ const WizardProgressIndicator = (steps, currentStep) => { span({}, title), ); - const completedStepIndicator = (title, stepIndex) => div( - { class: `flex-column fx-align-flex-center fx-gap-1 ${currentPhysicalIndex === stepIndex ? 'step-icon-current' : 'text-secondary'}`, style: 'position: relative;' }, + const completedStepIndicator = (title, stepIndex, step) => div( + { + class: `flex-column fx-align-flex-center fx-gap-1 ${currentPhysicalIndex === stepIndex ? 'step-icon-current' : 'text-secondary'}`, + style: `position: relative; ${onStepClick ? 'cursor: pointer;' : ''}`, + onclick: () => onStepClick?.(step.includedSteps[0]), + }, stepIndex === 0 ? 
div({ style: 'position: absolute; width: 50%; height: 50%; left: 0px; background: var(--dk-dialog-background); z-index: -1;' }, '') : '', @@ -134,9 +146,9 @@ const WizardProgressIndicator = (steps, currentStep) => { ...steps.map((step, physicalIdx) => { if (step.index < currentStep.index) { if (step.skipped) return skippedStepIndicator(step.title, physicalIdx); - return completedStepIndicator(step.title, physicalIdx); + return completedStepIndicator(step.title, physicalIdx, step); } else if (step.includedSteps.includes(currentStep.name)) { - return currentStepIndicator(step.title, physicalIdx); + return currentStepIndicator(step.title, physicalIdx, step); } else { return pendingStepIndicator(step.title, physicalIdx); } diff --git a/testgen/ui/components/frontend/js/pages/table_group_wizard.js b/testgen/ui/components/frontend/js/pages/table_group_wizard.js index e3807b9f..1c7b0ad2 100644 --- a/testgen/ui/components/frontend/js/pages/table_group_wizard.js +++ b/testgen/ui/components/frontend/js/pages/table_group_wizard.js @@ -180,6 +180,7 @@ const TableGroupWizard = (props) => { index: stepIndex, name: steps[stepIndex], }, + (stepName) => setStep(steps.indexOf(stepName)), ); }, WizardStep(0, currentStepIndex, () => { diff --git a/testgen/ui/static/js/components/wizard_progress_indicator.js b/testgen/ui/static/js/components/wizard_progress_indicator.js index 88bbb789..80e35703 100644 --- a/testgen/ui/static/js/components/wizard_progress_indicator.js +++ b/testgen/ui/static/js/components/wizard_progress_indicator.js @@ -14,14 +14,15 @@ * * @param {WizardStepMeta[]} steps * @param {CurrentStep} currentStep - * @returns + * @param {function(string)?} onStepClick + * @returns */ import van from '../van.min.js'; import { colorMap } from '../display_utils.js'; const { div, i, span } = van.tags; -const WizardProgressIndicator = (steps, currentStep) => { +const WizardProgressIndicator = (steps, currentStep, onStepClick) => { const currentPhysicalIndex = steps.findIndex(s => 
s.includedSteps.includes(currentStep.name)); const progressWidth = van.state('0px'); @@ -50,8 +51,12 @@ const WizardProgressIndicator = (steps, currentStep) => { z-index: -4; `; - const currentStepIndicator = (title, stepIndex) => div( - { class: `flex-column fx-align-flex-center fx-gap-1 step-icon-current`, style: 'position: relative;' }, + const currentStepIndicator = (title, stepIndex, step) => div( + { + class: `flex-column fx-align-flex-center fx-gap-1 step-icon-current`, + style: `position: relative; ${onStepClick ? 'cursor: pointer;' : ''}`, + onclick: () => onStepClick?.(step.includedSteps[0]), + }, stepIndex === 0 ? div({ style: 'position: absolute; width: 50%; height: 50%; left: 0px; background: var(--dk-dialog-background); z-index: -1;' }, '') : '', @@ -66,7 +71,10 @@ const WizardProgressIndicator = (steps, currentStep) => { ); const pendingStepIndicator = (title, stepIndex) => div( - { class: `flex-column fx-align-flex-center fx-gap-1 ${currentPhysicalIndex === stepIndex ? 'step-icon-current' : 'text-secondary'}`, style: 'position: relative;' }, + { + class: `flex-column fx-align-flex-center fx-gap-1 ${currentPhysicalIndex === stepIndex ? 'step-icon-current' : 'text-secondary'}`, + style: 'position: relative; cursor: default;', + }, stepIndex === 0 ? div({ style: 'position: absolute; width: 50%; height: 50%; left: 0px; background: var(--dk-dialog-background); z-index: -1;' }, '') : '', @@ -80,8 +88,12 @@ const WizardProgressIndicator = (steps, currentStep) => { span({}, title), ); - const completedStepIndicator = (title, stepIndex) => div( - { class: `flex-column fx-align-flex-center fx-gap-1 ${currentPhysicalIndex === stepIndex ? 'step-icon-current' : 'text-secondary'}`, style: 'position: relative;' }, + const completedStepIndicator = (title, stepIndex, step) => div( + { + class: `flex-column fx-align-flex-center fx-gap-1 ${currentPhysicalIndex === stepIndex ? 'step-icon-current' : 'text-secondary'}`, + style: `position: relative; ${onStepClick ? 
'cursor: pointer;' : ''}`, + onclick: () => onStepClick?.(step.includedSteps[0]), + }, stepIndex === 0 ? div({ style: 'position: absolute; width: 50%; height: 50%; left: 0px; background: var(--dk-dialog-background); z-index: -1;' }, '') : '', @@ -134,9 +146,9 @@ const WizardProgressIndicator = (steps, currentStep) => { ...steps.map((step, physicalIdx) => { if (step.index < currentStep.index) { if (step.skipped) return skippedStepIndicator(step.title, physicalIdx); - return completedStepIndicator(step.title, physicalIdx); + return completedStepIndicator(step.title, physicalIdx, step); } else if (step.includedSteps.includes(currentStep.name)) { - return currentStepIndicator(step.title, physicalIdx); + return currentStepIndicator(step.title, physicalIdx, step); } else { return pendingStepIndicator(step.title, physicalIdx); } From 8d461c35115e0899c8b7e64bbb9d5705841c9762 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 27 Mar 2026 00:11:34 -0400 Subject: [PATCH 87/95] fix: update error text to be consistent --- testgen/ui/views/connections.py | 6 +++--- testgen/ui/views/dialogs/import_metadata_dialog.py | 2 +- testgen/ui/views/hygiene_issues.py | 2 +- testgen/ui/views/monitors_dashboard.py | 2 +- testgen/ui/views/profiling_runs.py | 2 +- testgen/ui/views/score_details.py | 2 +- testgen/ui/views/table_groups.py | 1 + testgen/ui/views/test_results.py | 2 +- testgen/ui/views/test_runs.py | 2 +- 9 files changed, 11 insertions(+), 10 deletions(-) diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index 4fb658f7..c0089fca 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -181,7 +181,7 @@ def on_setup_table_group_clicked(*_args) -> None: connection.save() message = "Changes have been saved successfully." except Exception as error: - message = "Error creating connection" + message = "Something went wrong while creating the connection." 
success = False LOG.exception(message) @@ -269,7 +269,7 @@ def test_connection(self, connection: Connection) -> "ConnectionStatus": details = error.args[0] return ConnectionStatus(message="Error attempting the connection.", details=details, successful=False) except Exception as error: - details = "Try again" + details = "Something went wrong while testing the connection." if connection.connect_by_key and not connection.private_key: details = "The private key is missing." LOG.exception("Error testing database connection") @@ -472,7 +472,7 @@ def on_close_clicked(_params: dict) -> None: LOG.info("Table group %s created", table_group.id) safe_rerun() except Exception as error: - message = "Error creating table group" + message = "Something went wrong while creating the table group." success = False LOG.exception(message) diff --git a/testgen/ui/views/dialogs/import_metadata_dialog.py b/testgen/ui/views/dialogs/import_metadata_dialog.py index 511be2d2..209cd308 100644 --- a/testgen/ui/views/dialogs/import_metadata_dialog.py +++ b/testgen/ui/views/dialogs/import_metadata_dialog.py @@ -428,7 +428,7 @@ def on_file_cleared(_payload: dict) -> None: LOG.exception("Metadata import failed") result = { "success": False, - "message": "Import failed due to an unexpected error. 
Please try again.", + "message": "Something went wrong while importing the metadata.", } st.session_state.pop(PREVIEW_SESSION_KEY, None) diff --git a/testgen/ui/views/hygiene_issues.py b/testgen/ui/views/hygiene_issues.py index 75f17292..e4ef88c3 100644 --- a/testgen/ui/views/hygiene_issues.py +++ b/testgen/ui/views/hygiene_issues.py @@ -484,7 +484,7 @@ def source_data_dialog(selected_row): elif bad_data_status == "ERR": st.error(bad_data_msg) elif df_bad is None: - st.error("An unknown error was encountered.") + st.error("Something went wrong while loading the data.") else: if bad_data_msg: st.info(bad_data_msg) diff --git a/testgen/ui/views/monitors_dashboard.py b/testgen/ui/views/monitors_dashboard.py index 40661e4c..e19145a1 100644 --- a/testgen/ui/views/monitors_dashboard.py +++ b/testgen/ui/views/monitors_dashboard.py @@ -625,7 +625,7 @@ def on_delete_confirmed(*_args) -> None: LOG.exception("Failed to delete monitor suite") set_result({ "success": False, - "message": "Unable to delete monitors for the table group, try again.", + "message": "Something went wrong while deleting the monitors.", }) safe_rerun(scope="fragment") diff --git a/testgen/ui/views/profiling_runs.py b/testgen/ui/views/profiling_runs.py index 475cae67..b59363d1 100644 --- a/testgen/ui/views/profiling_runs.py +++ b/testgen/ui/views/profiling_runs.py @@ -229,6 +229,6 @@ def on_delete_confirmed(*_args) -> None: LOG.exception("Failed to delete profiling runs") set_result({ "success": False, - "message": "Unable to delete the selected profiling runs, try again.", + "message": "Something went wrong while deleting the profiling runs.", }) safe_rerun(scope="fragment") diff --git a/testgen/ui/views/score_details.py b/testgen/ui/views/score_details.py index 47178231..edc8c33c 100644 --- a/testgen/ui/views/score_details.py +++ b/testgen/ui/views/score_details.py @@ -243,7 +243,7 @@ def recalculate_score_history(definition_id: str) -> None: st.toast("Scorecard trend recalculated", 
icon=":material/task_alt:") except: LOG.exception(f"Failure recalculating history for scorecard id={definition_id}") - st.toast("Recalculating the trend failed. Try again", icon=":material/error:") + st.toast("Something went wrong while recalculating the trend.", icon=":material/error:") class ScoreDropNotificationSettingsDialog(NotificationSettingsDialogBase): diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 79468104..058cc975 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -341,6 +341,7 @@ def on_close_clicked(_params: dict) -> None: safe_rerun() except IntegrityError: + get_current_session().rollback() success = False message = "A Table Group with the same name already exists." else: diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 9d4c802d..ff8a1188 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -890,7 +890,7 @@ def source_data_dialog(selected_row): elif bad_data_status == "ERR": st.error(bad_data_msg) elif df_bad is None: - st.error("An unknown error was encountered.") + st.error("Something went wrong while loading the data.") else: if bad_data_msg: st.info(bad_data_msg) diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index 9918b96f..919651c7 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -246,5 +246,5 @@ def on_delete_confirmed(*_args) -> None: safe_rerun() except Exception: LOG.exception("Failed to delete test run") - result = {"success": False, "message": "Unable to delete the test run, try again."} + result = {"success": False, "message": "Something went wrong while deleting the test run."} safe_rerun(scope="fragment") From 04dbd1b9b8cf627b6f885a13b71de463c164f59f Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 27 Mar 2026 01:25:50 -0400 Subject: [PATCH 88/95] fix(test definitions): handle empty suite --- 
testgen/ui/views/test_definitions.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index df2858eb..00850bcc 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -176,6 +176,10 @@ def render( with st.spinner("Loading data ..."): df = get_test_definitions(test_suite, table_name, column_name, test_type, sorting_columns, flagged) + if df.empty: + st.info("No test definitions found.") + return + selected, selected_test_def = render_grid(df, multi_select, filters_changed) popover_container = table_actions_column.empty() @@ -1306,6 +1310,9 @@ def get_test_definitions( ) df = to_dataframe(test_definitions) + if df.empty: + return df + date_service.accommodate_dataframe_to_timezone(df, st.session_state) for key in ["id", "table_groups_id", "profile_run_id", "test_suite_id"]: df[key] = df[key].apply(lambda value: str(value)) From b94c5065ffbcb241471030a1a5cc09756ba23703 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 27 Mar 2026 01:26:29 -0400 Subject: [PATCH 89/95] fix(scorecard): error on adding notification --- testgen/ui/views/dialogs/manage_notifications.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testgen/ui/views/dialogs/manage_notifications.py b/testgen/ui/views/dialogs/manage_notifications.py index 4a14637c..af35f828 100644 --- a/testgen/ui/views/dialogs/manage_notifications.py +++ b/testgen/ui/views/dialogs/manage_notifications.py @@ -143,7 +143,7 @@ def render(self) -> None: scope_options_labels = dict(component_props.get("scope_options", [])) ns_json_list = sorted( self._mark_duplicates(ns_json_list), - key=lambda item: "0" if not item["scope"] else scope_options_labels.get(item["scope"], "ZZZ"), + key=lambda item: "0" if not item.get("scope") else scope_options_labels.get(item["scope"], "ZZZ"), ) widgets.css_class("m-dialog") widgets.testgen_component( From 1dfbfd9665eb20ba5b3b295cb939ac8ee7aebcb1 Mon 
Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 27 Mar 2026 01:58:03 -0400 Subject: [PATCH 90/95] fix(copy/move tests): unique key constraints --- testgen/ui/views/test_definitions.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 00850bcc..af1b4b43 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -997,12 +997,15 @@ def copy_move_test_dialog( movable_test_definitions = [] if target_table_group_id and target_test_suite_id: - collision_test_definitions = get_test_definitions_collision(selected_test_definitions, target_table_group_id, target_test_suite_id) + collision_test_definitions = get_test_definitions_collision(selected_test_definitions, target_table_group_id, target_test_suite_id, target_table_name, target_column_name) + overwrite_ids = [] if not collision_test_definitions.empty: unlocked = collision_test_definitions[collision_test_definitions["lock_refresh"] == False] locked = collision_test_definitions[collision_test_definitions["lock_refresh"] == True] locked_tuples = [ (test["table_name"], test["column_name"], test["test_type"]) for test in locked.iterrows() ] movable_test_definitions = [ test for test in selected_test_definitions if (test["table_name"], test["column_name"], test["test_type"]) not in locked_tuples ] + selected_ids = {str(item["id"]) for item in selected_test_definitions} + overwrite_ids = [id_ for id_ in unlocked["id"].tolist() if str(id_) not in selected_ids] warning_message = f"""Auto-generated tests are present in the target test suite for the same column-test type combinations as the selected tests. 
\nUnlocked tests that will be overwritten: {len(unlocked)} @@ -1028,12 +1031,16 @@ def copy_move_test_dialog( test_definition_ids = [item["id"] for item in movable_test_definitions] if move: + if overwrite_ids: + TestDefinition.delete_where(TestDefinition.id.in_(overwrite_ids)) TestDefinition.move(test_definition_ids, target_table_group_id, target_test_suite_id, target_table_name, target_column_name) success_message = "Test Definitions have been moved." st.success(success_message) time.sleep(1) safe_rerun() elif copy: + if overwrite_ids: + TestDefinition.delete_where(TestDefinition.id.in_(overwrite_ids)) TestDefinition.copy(test_definition_ids, target_table_group_id, target_test_suite_id, target_table_name, target_column_name) success_message = "Test Definitions have been copied." st.success(success_message) @@ -1347,9 +1354,11 @@ def get_test_definitions_collision( test_definitions: list[dict], target_table_group_id: str, target_test_suite_id: str, + target_table_name: str | None = None, + target_column_name: str | None = None, ) -> pd.DataFrame: - table_tests = [(item["table_name"], item["test_type"]) for item in test_definitions if item["column_name"] is None and item["table_name"] is not None] - column_tests = [(item["table_name"], item["column_name"], item["test_type"]) for item in test_definitions if item["column_name"] is not None] + table_tests = [(target_table_name or item["table_name"], item["test_type"]) for item in test_definitions if item["column_name"] is None and item["table_name"] is not None] + column_tests = [(target_table_name or item["table_name"], target_column_name or item["column_name"], item["test_type"]) for item in test_definitions if item["column_name"] is not None] results = TestDefinition.select_minimal_where( TestDefinition.table_groups_id == target_table_group_id, TestDefinition.test_suite_id == target_test_suite_id, From 7a1333eea946d76dd00eb983637f3eb9c71a4c41 Mon Sep 17 00:00:00 2001 From: testgen-ci-bot Date: Fri, 27 Mar 2026 
15:49:22 +0000 Subject: [PATCH 91/95] ci: bump base image to v14 --- deploy/testgen.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/testgen.dockerfile b/deploy/testgen.dockerfile index d105c2ea..6708fd67 100644 --- a/deploy/testgen.dockerfile +++ b/deploy/testgen.dockerfile @@ -1,4 +1,4 @@ -ARG TESTGEN_BASE_LABEL=v13 +ARG TESTGEN_BASE_LABEL=v14 FROM datakitchen/dataops-testgen-base:${TESTGEN_BASE_LABEL} AS release-image From 104e0b8c29a9f1bd47663c518933f2e7f912645f Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 27 Mar 2026 15:26:49 -0400 Subject: [PATCH 92/95] fix(data catalog): prefix icons disappear after saving --- .../components/frontend/js/components/tree.js | 2 +- .../frontend/js/pages/data_catalog.js | 24 ++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/testgen/ui/components/frontend/js/components/tree.js b/testgen/ui/components/frontend/js/components/tree.js index fbf77c9c..59001db5 100644 --- a/testgen/ui/components/frontend/js/components/tree.js +++ b/testgen/ui/components/frontend/js/components/tree.js @@ -8,7 +8,7 @@ * @property {number?} iconSize * @property {string?} iconClass * @property {string?} iconTooltip - * @property {Element?} prefix + * @property {Element|function?} prefix * @property {TreeNode[]?} children * @property {number?} level * @property {boolean?} expanded diff --git a/testgen/ui/components/frontend/js/pages/data_catalog.js b/testgen/ui/components/frontend/js/pages/data_catalog.js index 14979a9b..33418b93 100644 --- a/testgen/ui/components/frontend/js/pages/data_catalog.js +++ b/testgen/ui/components/frontend/js/pages/data_catalog.js @@ -118,16 +118,6 @@ const DataCatalog = (/** @type Properties */ props) => { }; TAG_KEYS.forEach(key => tables[table_id][key] = item[`table_${key}`]); } - const prefixIcons = []; - if (item.critical_data_element ?? 
item.table_critical_data_element) { - prefixIcons.push(withTooltip(Icon({ size: 15, classes: 'text-purple' }, 'star'), { text: 'Critical data element', position: 'right' })); - } - if (item.excluded_data_element) { - prefixIcons.push(withTooltip(Icon({ size: 15, classes: 'text-brown' }, 'visibility_off'), { text: 'Excluded data element', position: 'right' })); - } - if (item.pii_flag) { - prefixIcons.push(withTooltip(Icon({ size: 15, classes: 'text-orange' }, 'shield_person'), { text: 'PII data', position: 'right' })); - } const columnNode = { id: column_id, label: column_name, @@ -135,7 +125,19 @@ const DataCatalog = (/** @type Properties */ props) => { ...getColumnIcon(item), iconClass: value_ct === 0 ? 'text-error' : null, iconTooltip: value_ct === 0 ? 'No non-null values detected' : null, - prefix: span({ class: 'tg-dh--column-prefix' }, ...prefixIcons), + prefix: () => { + const icons = []; + if (item.critical_data_element ?? item.table_critical_data_element) { + icons.push(withTooltip(Icon({ size: 15, classes: 'text-purple' }, 'star'), { text: 'Critical data element', position: 'right' })); + } + if (item.excluded_data_element) { + icons.push(withTooltip(Icon({ size: 15, classes: 'text-brown' }, 'visibility_off'), { text: 'Excluded data element', position: 'right' })); + } + if (item.pii_flag) { + icons.push(withTooltip(Icon({ size: 15, classes: 'text-orange' }, 'shield_person'), { text: 'PII data', position: 'right' })); + } + return span({ class: 'tg-dh--column-prefix' }, ...icons); + }, criticalDataElement: !!(item.critical_data_element ?? 
item.table_critical_data_element), excludedDataElement: !!item.excluded_data_element, piiFlag: !!item.pii_flag, From 66f244e6f948dbd8a7ab9e1d75042762f0e96fea Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 27 Mar 2026 15:27:00 -0400 Subject: [PATCH 93/95] fix: missing imports --- testgen/ui/components/frontend/js/pages/profiling_runs.js | 2 +- testgen/ui/components/frontend/js/pages/test_runs.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/testgen/ui/components/frontend/js/pages/profiling_runs.js b/testgen/ui/components/frontend/js/pages/profiling_runs.js index 1515fbf4..e041b5d1 100644 --- a/testgen/ui/components/frontend/js/pages/profiling_runs.js +++ b/testgen/ui/components/frontend/js/pages/profiling_runs.js @@ -48,7 +48,7 @@ import { Link } from '../components/link.js'; import { Button } from '../components/button.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getValue, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; -import { formatTimestamp, formatDuration, formatNumber } from '../display_utils.js'; +import { formatTimestamp, formatDuration, formatNumber, DISABLED_ACTION_TEXT } from '../display_utils.js'; import { Checkbox } from '../components/checkbox.js'; import { Select } from '../components/select.js'; import { Paginator } from '../components/paginator.js'; diff --git a/testgen/ui/components/frontend/js/pages/test_runs.js b/testgen/ui/components/frontend/js/pages/test_runs.js index 73922155..20e60bea 100644 --- a/testgen/ui/components/frontend/js/pages/test_runs.js +++ b/testgen/ui/components/frontend/js/pages/test_runs.js @@ -47,7 +47,7 @@ import { Link } from '../components/link.js'; import { Button } from '../components/button.js'; import { Streamlit } from '../streamlit.js'; import { emitEvent, getValue, loadStylesheet, resizeFrameHeightToElement, resizeFrameHeightOnDOMChange } from '../utils.js'; -import { formatTimestamp, formatDuration } from 
'../display_utils.js'; +import { formatTimestamp, formatDuration, DISABLED_ACTION_TEXT } from '../display_utils.js'; import { Checkbox } from '../components/checkbox.js'; import { Select } from '../components/select.js'; import { Paginator } from '../components/paginator.js'; From 71c974d42e56a88b65872be828d862a4f74e6824 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 27 Mar 2026 15:27:20 -0400 Subject: [PATCH 94/95] security: upgrade PyJWT library --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bd7a2982..c3f775d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ dependencies = [ # MCP server "mcp[cli]==1.26.0", "uvicorn==0.41.0", - "PyJWT==2.11.0", + "PyJWT==2.12.0", "bcrypt==5.0.0", # API & OAuth server From 1a08184e0b4959fe420acab9ced12701673e7d00 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Fri, 27 Mar 2026 23:25:30 -0400 Subject: [PATCH 95/95] release: 5.0.2 -> 5.9.4 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c3f775d7..9cc59ed5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "dataops-testgen" -version = "5.0.2" +version = "5.9.4" description = "DataKitchen's Data Quality DataOps TestGen" authors = [ { "name" = "DataKitchen, Inc.", "email" = "info@datakitchen.io" },