From da608818651de7e5e82041f9b55271ca59a3c517 Mon Sep 17 00:00:00 2001 From: JaskaranIntugle Date: Thu, 11 Sep 2025 15:25:30 +0530 Subject: [PATCH 1/5] added export to csv functionality --- docsite/.gitignore | 20 +++++ src/intugle/streamlit.py | 187 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 docsite/.gitignore create mode 100644 src/intugle/streamlit.py diff --git a/docsite/.gitignore b/docsite/.gitignore new file mode 100644 index 0000000..b2d6de3 --- /dev/null +++ b/docsite/.gitignore @@ -0,0 +1,20 @@ +# Dependencies +/node_modules + +# Production +/build + +# Generated files +.docusaurus +.cache-loader + +# Misc +.DS_Store +.env.local +.env.development.local +.env.test.local +.env.production.local + +npm-debug.log* +yarn-debug.log* +yarn-error.log* diff --git a/src/intugle/streamlit.py b/src/intugle/streamlit.py new file mode 100644 index 0000000..4fc3fd4 --- /dev/null +++ b/src/intugle/streamlit.py @@ -0,0 +1,187 @@ +import os +from typing import TYPE_CHECKING + +import pandas as pd + +from intugle.analysis.models import DataSet +from intugle.core import settings +from intugle.libs.smart_query_generator import SmartQueryGenerator +from intugle.libs.smart_query_generator.models.models import ETLModel, FieldDetailsModel, LinkModel +from intugle.libs.smart_query_generator.utils.join import Join +from intugle.parser.manifest import ManifestLoader + +if TYPE_CHECKING: + from intugle.models.resources.model import Column + + +class StreamlitApp: + + def __init__(self, project_base: str = settings.PROJECT_BASE): + self.manifest_loader = ManifestLoader(project_base) + self.manifest_loader.load() + self.manifest = self.manifest_loader.manifest + + self.project_base = project_base + + self.field_details = self.get_all_field_details() + + # get the links from the manifest + self.links = self.get_links() + + selected_fields = set(self.field_details.keys()) + self.join = Join(self.links, selected_fields) + + self.datasets: 
dict[str, DataSet] = {} + + self.load_all() + + def load_all(self): + sources = self.manifest.sources + for source in sources.values(): + table_name = source.table.name + details = source.table.details + dataset = DataSet(data=details, name=table_name) + self.datasets[table_name] = dataset + + def get_all_field_details(self) -> dict[str, FieldDetailsModel]: + """Fetches all field details from the manifest.""" + + # get sources from the manifest + sources = self.manifest.sources + + field_details: dict[str, FieldDetailsModel] = {} + + # iterate through each source and get the field details (all fields / columns) + for source in sources.values(): + for column in source.table.columns: + field_detail: FieldDetailsModel = FieldDetailsModel( + id=f"{source.table.name}.{column.name}", + name=column.name, + datatype_l1=column.type, + datatype_l2=column.category, + sql_code=f"\"{source.table.name}\".\"{column.name}\"", + is_pii=False, + asset_id=source.table.name, + asset_name=source.table.name, + asset_details={}, + connection_id=source.schema, + connection_source_name="postgresql", + connection_credentials={}, + ) + field_details[field_detail.id] = field_detail + + return field_details + + def get_links(self) -> list[LinkModel]: + """Fetches the links from the manifest.""" + + # get relationships from the manifest + relationships = self.manifest.relationships + links: list[LinkModel] = [] + + # iterate through each relationship and create a LinkModel + for relationship in relationships.values(): + links.append(relationship.link) + return links + + def export_analysis_to_csv(self): + """Exports the analysis results to CSV files.""" + # 1. 
Column Profiles CSV + all_profiles = [] + for source in self.manifest.sources.values(): + for column in source.table.columns: + profile_data = { + "table_name": source.table.name, + "column_name": column.name, + "data_type_l1": column.type, + "data_type_l2": column.category, + "count": column.profiling_metrics.count, + "uniqueness": column.profiling_metrics.distinct_count / column.profiling_metrics.count + if column.profiling_metrics.count + else 0, + "completeness": (column.profiling_metrics.count - column.profiling_metrics.null_count) + / column.profiling_metrics.count + if column.profiling_metrics.count + else 0, + "sample_values": column.profiling_metrics.sample_data, + } + all_profiles.append(profile_data) + column_profiles_df = pd.DataFrame(all_profiles) + column_profiles_df.to_csv(os.path.join(self.project_base, "column_profiles.csv"), index=False) + + # 2. Link Predictions CSV + link_data = [] + for relationship in self.manifest.relationships.values(): + left_table_name = relationship.source.table + left_column_name = relationship.source.column + right_table_name = relationship.target.table + right_column_name = relationship.target.column + + left_source = self.manifest.sources.get(left_table_name) + right_source = self.manifest.sources.get(right_table_name) + + if left_source and right_source: + left_column_data = next( + (col for col in left_source.table.columns if col.name == left_column_name), None + ) + right_column_data = next( + (col for col in right_source.table.columns if col.name == right_column_name), None + ) + + if left_column_data and right_column_data: + link_data.append( + { + "left_table": left_table_name, + "left_column": left_column_name, + "left_data_type_l1": left_column_data.type, + "left_data_type_l2": left_column_data.category, + "left_count": left_column_data.profiling_metrics.count, + "left_uniqueness": left_column_data.profiling_metrics.distinct_count + / left_column_data.profiling_metrics.count + if 
left_column_data.profiling_metrics.count + else 0, + "left_completeness": ( + left_column_data.profiling_metrics.count - left_column_data.profiling_metrics.null_count + ) + / left_column_data.profiling_metrics.count + if left_column_data.profiling_metrics.count + else 0, + "left_sample_values": left_column_data.profiling_metrics.sample_data, + "right_table": right_table_name, + "right_column": right_column_name, + "right_data_type_l1": right_column_data.type, + "right_data_type_l2": right_column_data.category, + "right_count": right_column_data.profiling_metrics.count, + "right_uniqueness": right_column_data.profiling_metrics.distinct_count + / right_column_data.profiling_metrics.count + if right_column_data.profiling_metrics.count + else 0, + "right_completeness": ( + right_column_data.profiling_metrics.count + - right_column_data.profiling_metrics.null_count + ) + / right_column_data.profiling_metrics.count + if right_column_data.profiling_metrics.count + else 0, + "right_sample_values": right_column_data.profiling_metrics.sample_data, + } + ) + link_predictions_df = pd.DataFrame(link_data) + link_predictions_df.to_csv(os.path.join(self.project_base, "link_predictions.csv"), index=False) + + # 3. 
Business Glossary CSV + glossary_data = [] + for source in self.manifest.sources.values(): + for column in source.table.columns: + glossary_data.append( + { + "table_name": source.table.name, + "column_name": column.name, + "business_glossary": column.description, + "business_tags": column.tags, + } + ) + glossary_df = pd.DataFrame(glossary_data) + glossary_df.to_csv(os.path.join(self.project_base, "business_glossary.csv"), index=False) + + From b71b383120781f6d3458cb2a95e5e18b98658d43 Mon Sep 17 00:00:00 2001 From: JaskaranIntugle Date: Thu, 11 Sep 2025 16:05:00 +0530 Subject: [PATCH 2/5] modularized export to csv --- .gitignore | 2 +- src/intugle/exporters/__init__.py | 36 ++++++++++ src/intugle/models/manifest.py | 93 +++++++++++++++++++++++++ src/intugle/streamlit.py | 110 ++---------------------------- 4 files changed, 134 insertions(+), 107 deletions(-) create mode 100644 src/intugle/exporters/__init__.py diff --git a/.gitignore b/.gitignore index a8e0c01..9ac7fef 100644 --- a/.gitignore +++ b/.gitignore @@ -208,7 +208,7 @@ __marimo__/ notes.txt testing_base -models +/models models_bak settings.json diff --git a/src/intugle/exporters/__init__.py b/src/intugle/exporters/__init__.py new file mode 100644 index 0000000..744aa56 --- /dev/null +++ b/src/intugle/exporters/__init__.py @@ -0,0 +1,36 @@ +import os + +import pandas as pd + +from intugle.parser.manifest import Manifest + + +class CSVExporter: + def __init__(self, manifest: Manifest, project_base: str): + self.manifest = manifest + self.project_base = project_base + + def _export_column_profiles(self, file_path: str): + df = self.manifest.profiles_df + profile_columns_to_keep = [ + col for col in df.columns if col not in ["business_glossary", "business_tags"] + ] + df[profile_columns_to_keep].to_csv(file_path, index=False) + + def _export_link_predictions(self, file_path: str): + df = self.manifest.links_df + df.to_csv(file_path, index=False) + + def _export_business_glossary(self, file_path: str): + df = 
self.manifest.business_glossary_df + df.to_csv(file_path, index=False) + + def export_all( + self, + column_profiles_file="column_profiles.csv", + link_predictions_file="link_predictions.csv", + business_glossary_file="business_glossary.csv", + ): + self._export_column_profiles(os.path.join(self.project_base, column_profiles_file)) + self._export_link_predictions(os.path.join(self.project_base, link_predictions_file)) + self._export_business_glossary(os.path.join(self.project_base, business_glossary_file)) diff --git a/src/intugle/models/manifest.py b/src/intugle/models/manifest.py index 17aa331..7b84b81 100644 --- a/src/intugle/models/manifest.py +++ b/src/intugle/models/manifest.py @@ -1,3 +1,5 @@ +import pandas as pd + from pydantic import Field from intugle.common.schema import SchemaBase @@ -10,3 +12,94 @@ class Manifest(SchemaBase): sources: dict[str, Source] = Field(default_factory=dict) models: dict[str, Model] = Field(default_factory=dict) relationships: dict[str, Relationship] = Field(default_factory=dict) + + @property + def profiles_df(self) -> pd.DataFrame: + """Generates a DataFrame with column profiling information.""" + all_profiles = [] + for source in self.sources.values(): + for column in source.table.columns: + metrics = column.profiling_metrics + profile_data = { + "table_name": source.table.name, + "column_name": column.name, + "data_type_l1": column.type, + "data_type_l2": column.category, + "count": metrics.count, + "null_count": metrics.null_count, + "distinct_count": metrics.distinct_count, + "uniqueness": metrics.distinct_count / metrics.count if metrics.count else 0, + "completeness": (metrics.count - metrics.null_count) / metrics.count if metrics.count else 0, + "sample_values": metrics.sample_data, + "business_glossary": column.description, + "business_tags": column.tags, + } + all_profiles.append(profile_data) + return pd.DataFrame(all_profiles) + + @property + def links_df(self) -> pd.DataFrame: + """Generates a DataFrame with link 
prediction information.""" + link_data = [] + for relationship in self.relationships.values(): + left_table_name = relationship.source.table + left_column_name = relationship.source.column + right_table_name = relationship.target.table + right_column_name = relationship.target.column + + left_source = self.sources.get(left_table_name) + right_source = self.sources.get(right_table_name) + + if left_source and right_source: + left_column = next((c for c in left_source.table.columns if c.name == left_column_name), None) + right_column = next((c for c in right_source.table.columns if c.name == right_column_name), None) + + if left_column and right_column: + left_metrics = left_column.profiling_metrics + right_metrics = right_column.profiling_metrics + link_data.append( + { + "left_table": left_table_name, + "left_column": left_column_name, + "left_data_type_l1": left_column.type, + "left_data_type_l2": left_column.category, + "left_count": left_metrics.count, + "left_uniqueness": left_metrics.distinct_count / left_metrics.count + if left_metrics.count + else 0, + "left_completeness": (left_metrics.count - left_metrics.null_count) / left_metrics.count + if left_metrics.count + else 0, + "left_sample_values": left_metrics.sample_data, + "right_table": right_table_name, + "right_column": right_column_name, + "right_data_type_l1": right_column.type, + "right_data_type_l2": right_column.category, + "right_count": right_metrics.count, + "right_uniqueness": right_metrics.distinct_count / right_metrics.count + if right_metrics.count + else 0, + "right_completeness": (right_metrics.count - right_metrics.null_count) + / right_metrics.count + if right_metrics.count + else 0, + "right_sample_values": right_metrics.sample_data, + } + ) + return pd.DataFrame(link_data) + + @property + def business_glossary_df(self) -> pd.DataFrame: + """Generates a DataFrame with business glossary information.""" + glossary_data = [] + for source in self.sources.values(): + for column in 
source.table.columns: + glossary_data.append( + { + "table_name": source.table.name, + "column_name": column.name, + "business_glossary": column.description, + "business_tags": column.tags, + } + ) + return pd.DataFrame(glossary_data) diff --git a/src/intugle/streamlit.py b/src/intugle/streamlit.py index 4fc3fd4..69eb0b7 100644 --- a/src/intugle/streamlit.py +++ b/src/intugle/streamlit.py @@ -1,18 +1,11 @@ -import os -from typing import TYPE_CHECKING - -import pandas as pd from intugle.analysis.models import DataSet from intugle.core import settings -from intugle.libs.smart_query_generator import SmartQueryGenerator -from intugle.libs.smart_query_generator.models.models import ETLModel, FieldDetailsModel, LinkModel +from intugle.exporters import CSVExporter +from intugle.libs.smart_query_generator.models.models import FieldDetailsModel, LinkModel from intugle.libs.smart_query_generator.utils.join import Join from intugle.parser.manifest import ManifestLoader -if TYPE_CHECKING: - from intugle.models.resources.model import Column - class StreamlitApp: @@ -86,102 +79,7 @@ def get_links(self) -> list[LinkModel]: def export_analysis_to_csv(self): """Exports the analysis results to CSV files.""" - # 1. 
Column Profiles CSV - all_profiles = [] - for source in self.manifest.sources.values(): - for column in source.table.columns: - profile_data = { - "table_name": source.table.name, - "column_name": column.name, - "data_type_l1": column.type, - "data_type_l2": column.category, - "count": column.profiling_metrics.count, - "uniqueness": column.profiling_metrics.distinct_count / column.profiling_metrics.count - if column.profiling_metrics.count - else 0, - "completeness": (column.profiling_metrics.count - column.profiling_metrics.null_count) - / column.profiling_metrics.count - if column.profiling_metrics.count - else 0, - "sample_values": column.profiling_metrics.sample_data, - } - all_profiles.append(profile_data) - column_profiles_df = pd.DataFrame(all_profiles) - column_profiles_df.to_csv(os.path.join(self.project_base, "column_profiles.csv"), index=False) - - # 2. Link Predictions CSV - link_data = [] - for relationship in self.manifest.relationships.values(): - left_table_name = relationship.source.table - left_column_name = relationship.source.column - right_table_name = relationship.target.table - right_column_name = relationship.target.column - - left_source = self.manifest.sources.get(left_table_name) - right_source = self.manifest.sources.get(right_table_name) - - if left_source and right_source: - left_column_data = next( - (col for col in left_source.table.columns if col.name == left_column_name), None - ) - right_column_data = next( - (col for col in right_source.table.columns if col.name == right_column_name), None - ) - - if left_column_data and right_column_data: - link_data.append( - { - "left_table": left_table_name, - "left_column": left_column_name, - "left_data_type_l1": left_column_data.type, - "left_data_type_l2": left_column_data.category, - "left_count": left_column_data.profiling_metrics.count, - "left_uniqueness": left_column_data.profiling_metrics.distinct_count - / left_column_data.profiling_metrics.count - if 
left_column_data.profiling_metrics.count - else 0, - "left_completeness": ( - left_column_data.profiling_metrics.count - left_column_data.profiling_metrics.null_count - ) - / left_column_data.profiling_metrics.count - if left_column_data.profiling_metrics.count - else 0, - "left_sample_values": left_column_data.profiling_metrics.sample_data, - "right_table": right_table_name, - "right_column": right_column_name, - "right_data_type_l1": right_column_data.type, - "right_data_type_l2": right_column_data.category, - "right_count": right_column_data.profiling_metrics.count, - "right_uniqueness": right_column_data.profiling_metrics.distinct_count - / right_column_data.profiling_metrics.count - if right_column_data.profiling_metrics.count - else 0, - "right_completeness": ( - right_column_data.profiling_metrics.count - - right_column_data.profiling_metrics.null_count - ) - / right_column_data.profiling_metrics.count - if right_column_data.profiling_metrics.count - else 0, - "right_sample_values": right_column_data.profiling_metrics.sample_data, - } - ) - link_predictions_df = pd.DataFrame(link_data) - link_predictions_df.to_csv(os.path.join(self.project_base, "link_predictions.csv"), index=False) - - # 3. 
Business Glossary CSV - glossary_data = [] - for source in self.manifest.sources.values(): - for column in source.table.columns: - glossary_data.append( - { - "table_name": source.table.name, - "column_name": column.name, - "business_glossary": column.description, - "business_tags": column.tags, - } - ) - glossary_df = pd.DataFrame(glossary_data) - glossary_df.to_csv(os.path.join(self.project_base, "business_glossary.csv"), index=False) + exporter = CSVExporter(self.manifest, self.project_base) + exporter.export_all() From e11925851e09fe49c147bd965194936e728b7839 Mon Sep 17 00:00:00 2001 From: JaskaranIntugle Date: Thu, 11 Sep 2025 17:06:19 +0530 Subject: [PATCH 3/5] added cli for streamlit export --- pyproject.toml | 1 + src/intugle/cli.py | 11 ++++++++ src/intugle/streamlit.py | 57 ++-------------------------------------- 3 files changed, 14 insertions(+), 55 deletions(-) create mode 100644 src/intugle/cli.py diff --git a/pyproject.toml b/pyproject.toml index 773fae7..b90c813 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ dependencies = [ [project.scripts] intugle-mcp = "intugle.mcp.server:main" +intugle-streamlit = "intugle.cli:export_data" [dependency-groups] test = [ diff --git a/src/intugle/cli.py b/src/intugle/cli.py new file mode 100644 index 0000000..bbd98e1 --- /dev/null +++ b/src/intugle/cli.py @@ -0,0 +1,11 @@ +from intugle.streamlit import StreamlitApp + + +def export_data(): + """Exports the analysis results to CSV files.""" + app = StreamlitApp() + app.export_analysis_to_csv() + + +if __name__ == "__main__": + export_data() diff --git a/src/intugle/streamlit.py b/src/intugle/streamlit.py index 69eb0b7..7fb6da2 100644 --- a/src/intugle/streamlit.py +++ b/src/intugle/streamlit.py @@ -2,8 +2,6 @@ from intugle.analysis.models import DataSet from intugle.core import settings from intugle.exporters import CSVExporter -from intugle.libs.smart_query_generator.models.models import FieldDetailsModel, LinkModel -from 
intugle.libs.smart_query_generator.utils.join import Join from intugle.parser.manifest import ManifestLoader @@ -16,16 +14,6 @@ def __init__(self, project_base: str = settings.PROJECT_BASE): self.project_base = project_base - self.field_details = self.get_all_field_details() - - # get the links from the manifest - self.links = self.get_links() - - selected_fields = set(self.field_details.keys()) - self.join = Join(self.links, selected_fields) - - self.datasets: dict[str, DataSet] = {} - self.load_all() def load_all(self): @@ -33,53 +21,12 @@ def load_all(self): for source in sources.values(): table_name = source.table.name details = source.table.details - dataset = DataSet(data=details, name=table_name) - self.datasets[table_name] = dataset - - def get_all_field_details(self) -> dict[str, FieldDetailsModel]: - """Fetches all field details from the manifest.""" - - # get sources from the manifest - sources = self.manifest.sources - - field_details: dict[str, FieldDetailsModel] = {} - - # iterate through each source and get the field details (all fields / columns) - for source in sources.values(): - for column in source.table.columns: - field_detail: FieldDetailsModel = FieldDetailsModel( - id=f"{source.table.name}.{column.name}", - name=column.name, - datatype_l1=column.type, - datatype_l2=column.category, - sql_code=f"\"{source.table.name}\".\"{column.name}\"", - is_pii=False, - asset_id=source.table.name, - asset_name=source.table.name, - asset_details={}, - connection_id=source.schema, - connection_source_name="postgresql", - connection_credentials={}, - ) - field_details[field_detail.id] = field_detail - - return field_details - - def get_links(self) -> list[LinkModel]: - """Fetches the links from the manifest.""" - - # get relationships from the manifest - relationships = self.manifest.relationships - links: list[LinkModel] = [] - - # iterate through each relationship and create a LinkModel - for relationship in relationships.values(): - 
links.append(relationship.link) - return links + DataSet(data=details, name=table_name) def export_analysis_to_csv(self): """Exports the analysis results to CSV files.""" exporter = CSVExporter(self.manifest, self.project_base) exporter.export_all() + print("Successfully exported analysis results to CSV files.") From 8844f9f454a7924d2125bf546644754bab693f6b Mon Sep 17 00:00:00 2001 From: JaskaranIntugle Date: Thu, 11 Sep 2025 17:58:08 +0530 Subject: [PATCH 4/5] Renamed relationships.yaml file --- src/intugle/link_predictor/predictor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intugle/link_predictor/predictor.py b/src/intugle/link_predictor/predictor.py index c10410e..057a2a7 100644 --- a/src/intugle/link_predictor/predictor.py +++ b/src/intugle/link_predictor/predictor.py @@ -143,7 +143,7 @@ def _predict_for_pair( ] return pair_links - def predict(self, filename='relationships.yml', save: bool = False) -> Self: + def predict(self, filename='__relationships__.yml', save: bool = False) -> Self: """ Iterates through all unique pairs of datasets, predicts the links for each pair, and returns the aggregated results. From 7466d098d04dbe7ce14d6f0d28301378e4192197 Mon Sep 17 00:00:00 2001 From: JaskaranIntugle Date: Thu, 11 Sep 2025 18:34:44 +0530 Subject: [PATCH 5/5] added colab links to README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3168785..7779210 100644 --- a/README.md +++ b/README.md @@ -81,8 +81,8 @@ export OPENAI_API_KEY="your-openai-api-key" For a detailed, hands-on introduction to the project, please see our quickstart notebooks: -* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset. 
-* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset +* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset.
+* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset.

These datasets will take you through the following steps: