Skip to content

Commit 0b020bf

Browse files
Merge pull request #26 from Intugle/features/csv-export
Features/csv export
2 parents cd2ff9f + 7466d09 commit 0b020bf

8 files changed

Lines changed: 177 additions & 4 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ __marimo__/
208208
notes.txt
209209

210210
testing_base
211-
models
211+
/models
212212
models_bak
213213

214214
settings.json

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ export OPENAI_API_KEY="your-openai-api-key"
8181

8282
For a detailed, hands-on introduction to the project, please see our quickstart notebooks:
8383

84-
* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset.
85-
* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset
84+
* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset.
85+
* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset
8686

8787
These datasets will take you through the following steps:
8888

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ dependencies = [
6262

6363
[project.scripts]
6464
intugle-mcp = "intugle.mcp.server:main"
65+
intugle-streamlit = "intugle.cli:export_data"
6566

6667
[dependency-groups]
6768
test = [

src/intugle/cli.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from intugle.streamlit import StreamlitApp


def export_data():
    """Entry point for the ``intugle-streamlit`` console script.

    Constructs a StreamlitApp (which loads the project manifest as a side
    effect) and writes the analysis results out as CSV files.
    """
    StreamlitApp().export_analysis_to_csv()


if __name__ == "__main__":
    export_data()

src/intugle/exporters/__init__.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import os
2+
3+
import pandas as pd
4+
5+
from intugle.parser.manifest import Manifest
6+
7+
class CSVExporter:
    """Writes manifest-derived analysis artifacts as CSV files.

    Each export lands directly under ``project_base`` using the file names
    passed to :meth:`export_all`.
    """

    # Columns that belong in the glossary export, not the profile export.
    _GLOSSARY_COLUMNS = ("business_glossary", "business_tags")

    def __init__(self, manifest: Manifest, project_base: str):
        self.manifest = manifest
        self.project_base = project_base

    def _path(self, filename: str) -> str:
        # Resolve an output file name relative to the project base directory.
        return os.path.join(self.project_base, filename)

    def _export_column_profiles(self, file_path: str):
        """Write column profiles, excluding the glossary-only columns."""
        profiles = self.manifest.profiles_df
        keep = [name for name in profiles.columns if name not in self._GLOSSARY_COLUMNS]
        profiles[keep].to_csv(file_path, index=False)

    def _export_link_predictions(self, file_path: str):
        """Write the predicted table-to-table links."""
        self.manifest.links_df.to_csv(file_path, index=False)

    def _export_business_glossary(self, file_path: str):
        """Write the per-column business glossary."""
        self.manifest.business_glossary_df.to_csv(file_path, index=False)

    def export_all(
        self,
        column_profiles_file="column_profiles.csv",
        link_predictions_file="link_predictions.csv",
        business_glossary_file="business_glossary.csv",
    ):
        """Export profiles, link predictions and glossary under ``project_base``."""
        self._export_column_profiles(self._path(column_profiles_file))
        self._export_link_predictions(self._path(link_predictions_file))
        self._export_business_glossary(self._path(business_glossary_file))

src/intugle/link_predictor/predictor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ def _predict_for_pair(
143143
]
144144
return pair_links
145145

146-
def predict(self, filename='relationships.yml', save: bool = False) -> Self:
146+
def predict(self, filename='__relationships__.yml', save: bool = False) -> Self:
147147
"""
148148
Iterates through all unique pairs of datasets, predicts the links for
149149
each pair, and returns the aggregated results.

src/intugle/models/manifest.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import pandas as pd
2+
13
from pydantic import Field
24

35
from intugle.common.schema import SchemaBase
@@ -10,3 +12,94 @@ class Manifest(SchemaBase):
1012
sources: dict[str, Source] = Field(default_factory=dict)
1113
models: dict[str, Model] = Field(default_factory=dict)
1214
relationships: dict[str, Relationship] = Field(default_factory=dict)
15+
@property
def profiles_df(self) -> pd.DataFrame:
    """Return one row of profiling metrics per column across all sources.

    Uniqueness and completeness fall back to 0 whenever the row count is
    zero, avoiding division by zero.
    """
    rows = []
    for source in self.sources.values():
        table_name = source.table.name
        for column in source.table.columns:
            metrics = column.profiling_metrics
            total = metrics.count
            rows.append(
                {
                    "table_name": table_name,
                    "column_name": column.name,
                    "data_type_l1": column.type,
                    "data_type_l2": column.category,
                    "count": total,
                    "null_count": metrics.null_count,
                    "distinct_count": metrics.distinct_count,
                    "uniqueness": (metrics.distinct_count / total) if total else 0,
                    "completeness": ((total - metrics.null_count) / total) if total else 0,
                    "sample_values": metrics.sample_data,
                    "business_glossary": column.description,
                    "business_tags": column.tags,
                }
            )
    return pd.DataFrame(rows)
@property
def links_df(self) -> pd.DataFrame:
    """Return one row per resolved relationship with per-side column stats.

    Relationships whose source/target table or column cannot be found in
    ``self.sources`` are silently skipped, matching the best-effort nature
    of the export. Improvement over the original: the twelve near-identical
    left/right dict entries are produced by a single nested helper, keeping
    the exact column order while removing the duplication.
    """

    def side_stats(prefix: str, column) -> dict:
        # Flatten one side's profiling metrics into prefixed columns.
        metrics = column.profiling_metrics
        total = metrics.count
        return {
            f"{prefix}_data_type_l1": column.type,
            f"{prefix}_data_type_l2": column.category,
            f"{prefix}_count": total,
            f"{prefix}_uniqueness": (metrics.distinct_count / total) if total else 0,
            f"{prefix}_completeness": ((total - metrics.null_count) / total) if total else 0,
            f"{prefix}_sample_values": metrics.sample_data,
        }

    link_data = []
    for relationship in self.relationships.values():
        left_table_name = relationship.source.table
        left_column_name = relationship.source.column
        right_table_name = relationship.target.table
        right_column_name = relationship.target.column

        left_source = self.sources.get(left_table_name)
        right_source = self.sources.get(right_table_name)
        if not (left_source and right_source):
            continue

        left_column = next((c for c in left_source.table.columns if c.name == left_column_name), None)
        right_column = next((c for c in right_source.table.columns if c.name == right_column_name), None)
        if not (left_column and right_column):
            continue

        link_data.append(
            {
                "left_table": left_table_name,
                "left_column": left_column_name,
                **side_stats("left", left_column),
                "right_table": right_table_name,
                "right_column": right_column_name,
                **side_stats("right", right_column),
            }
        )
    return pd.DataFrame(link_data)
@property
def business_glossary_df(self) -> pd.DataFrame:
    """Return one row per column with its business description and tags."""
    records = [
        {
            "table_name": source.table.name,
            "column_name": column.name,
            "business_glossary": column.description,
            "business_tags": column.tags,
        }
        for source in self.sources.values()
        for column in source.table.columns
    ]
    return pd.DataFrame(records)

src/intugle/streamlit.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
from intugle.analysis.models import DataSet
3+
from intugle.core import settings
4+
from intugle.exporters import CSVExporter
5+
from intugle.parser.manifest import ManifestLoader
6+
7+
class StreamlitApp:
    """Loads the project manifest and exposes CSV-export helpers for the CLI."""

    def __init__(self, project_base: str = settings.PROJECT_BASE):
        # Load the manifest describing all registered sources.
        self.manifest_loader = ManifestLoader(project_base)
        self.manifest_loader.load()
        self.manifest = self.manifest_loader.manifest

        self.project_base = project_base

        self.load_all()

    def load_all(self):
        """Construct a DataSet for every source in the manifest.

        DataSet registration happens as a side effect of construction; the
        instances themselves are intentionally not retained here.
        """
        for source in self.manifest.sources.values():
            DataSet(data=source.table.details, name=source.table.name)

    def export_analysis_to_csv(self):
        """Export the analysis results (profiles, links, glossary) to CSV files."""
        exporter = CSVExporter(self.manifest, self.project_base)
        exporter.export_all()
        # Fixed typo in the user-facing message ("Succesfulluy" -> "Successfully").
        print("Successfully exported analysis results to CSV files.")

0 commit comments

Comments
 (0)