Skip to content

Commit 0b020bf

Browse files
Merge pull request #26 from Intugle/features/csv-export
Features/csv export
2 parents cd2ff9f + 7466d09 commit 0b020bf

8 files changed

Lines changed: 177 additions & 4 deletions

File tree

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ __marimo__/
208208
notes.txt
209209

210210
testing_base
211-
models
211+
/models
212212
models_bak
213213

214214
settings.json

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ export OPENAI_API_KEY="your-openai-api-key"
8181

8282
For a detailed, hands-on introduction to the project, please see our quickstart notebooks:
8383

84-
* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset.
85-
* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset
84+
* [`quickstart_healthcare.ipynb`](notebooks/quickstart_healthcare.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_healthcare.ipynb): This notebook will walk you through the entire process of building a semantic layer using a healthcare dataset.
85+
* [`quickstart_tech_company.ipynb`](notebooks/quickstart_tech_company.ipynb) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Intugle/data-tools/blob/main/notebooks/quickstart_tech_company.ipynb): This notebook demonstrates how to use the library with a technology manufacturing company dataset
8686

8787
These datasets will take you through the following steps:
8888

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ dependencies = [
6262

6363
[project.scripts]
6464
intugle-mcp = "intugle.mcp.server:main"
65+
intugle-streamlit = "intugle.cli:export_data"
6566

6667
[dependency-groups]
6768
test = [

src/intugle/cli.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from intugle.streamlit import StreamlitApp


def export_data():
    """Entry point for the ``intugle-streamlit`` console script.

    Constructs a StreamlitApp (which loads the project manifest as a side
    effect) and writes the analysis results out as CSV files.
    """
    StreamlitApp().export_analysis_to_csv()


if __name__ == "__main__":
    export_data()

src/intugle/exporters/__init__.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import os
2+
3+
import pandas as pd
4+
5+
from intugle.parser.manifest import Manifest
6+
7+
class CSVExporter:
    """Writes manifest-derived analysis artifacts as CSV files.

    Each export lands directly under ``project_base`` using the file names
    passed to :meth:`export_all`.
    """

    # Columns that belong in the glossary export, not the profile export.
    _GLOSSARY_COLUMNS = ("business_glossary", "business_tags")

    def __init__(self, manifest: Manifest, project_base: str):
        self.manifest = manifest
        self.project_base = project_base

    def _path(self, filename: str) -> str:
        # Resolve an output file name relative to the project base directory.
        return os.path.join(self.project_base, filename)

    def _export_column_profiles(self, file_path: str):
        """Write column profiles, excluding the glossary-only columns."""
        profiles = self.manifest.profiles_df
        keep = [name for name in profiles.columns if name not in self._GLOSSARY_COLUMNS]
        profiles[keep].to_csv(file_path, index=False)

    def _export_link_predictions(self, file_path: str):
        """Write the predicted table-to-table links."""
        self.manifest.links_df.to_csv(file_path, index=False)

    def _export_business_glossary(self, file_path: str):
        """Write the per-column business glossary."""
        self.manifest.business_glossary_df.to_csv(file_path, index=False)

    def export_all(
        self,
        column_profiles_file="column_profiles.csv",
        link_predictions_file="link_predictions.csv",
        business_glossary_file="business_glossary.csv",
    ):
        """Export profiles, link predictions and glossary under ``project_base``."""
        self._export_column_profiles(self._path(column_profiles_file))
        self._export_link_predictions(self._path(link_predictions_file))
        self._export_business_glossary(self._path(business_glossary_file))

src/intugle/link_predictor/predictor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ def _predict_for_pair(
143143
]
144144
return pair_links
145145

146-
def predict(self, filename='relationships.yml', save: bool = False) -> Self:
146+
def predict(self, filename='__relationships__.yml', save: bool = False) -> Self:
147147
"""
148148
Iterates through all unique pairs of datasets, predicts the links for
149149
each pair, and returns the aggregated results.

src/intugle/models/manifest.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import pandas as pd
2+
13
from pydantic import Field
24

35
from intugle.common.schema import SchemaBase
@@ -10,3 +12,94 @@ class Manifest(SchemaBase):
1012
sources: dict[str, Source] = Field(default_factory=dict)
1113
models: dict[str, Model] = Field(default_factory=dict)
1214
relationships: dict[str, Relationship] = Field(default_factory=dict)
15+
@property
def profiles_df(self) -> pd.DataFrame:
    """Return one row of profiling metrics per column across all sources.

    Uniqueness and completeness fall back to 0 whenever the row count is
    zero, avoiding division by zero.
    """
    rows = []
    for source in self.sources.values():
        table_name = source.table.name
        for column in source.table.columns:
            metrics = column.profiling_metrics
            total = metrics.count
            rows.append(
                {
                    "table_name": table_name,
                    "column_name": column.name,
                    "data_type_l1": column.type,
                    "data_type_l2": column.category,
                    "count": total,
                    "null_count": metrics.null_count,
                    "distinct_count": metrics.distinct_count,
                    "uniqueness": (metrics.distinct_count / total) if total else 0,
                    "completeness": ((total - metrics.null_count) / total) if total else 0,
                    "sample_values": metrics.sample_data,
                    "business_glossary": column.description,
                    "business_tags": column.tags,
                }
            )
    return pd.DataFrame(rows)
@property
def links_df(self) -> pd.DataFrame:
    """Return one row per resolved relationship with per-side column stats.

    Relationships whose source/target table or column cannot be found in
    ``self.sources`` are silently skipped, matching the best-effort nature
    of the export. Improvement over the original: the twelve near-identical
    left/right dict entries are produced by a single nested helper, keeping
    the exact column order while removing the duplication.
    """

    def side_stats(prefix: str, column) -> dict:
        # Flatten one side's profiling metrics into prefixed columns.
        metrics = column.profiling_metrics
        total = metrics.count
        return {
            f"{prefix}_data_type_l1": column.type,
            f"{prefix}_data_type_l2": column.category,
            f"{prefix}_count": total,
            f"{prefix}_uniqueness": (metrics.distinct_count / total) if total else 0,
            f"{prefix}_completeness": ((total - metrics.null_count) / total) if total else 0,
            f"{prefix}_sample_values": metrics.sample_data,
        }

    link_data = []
    for relationship in self.relationships.values():
        left_table_name = relationship.source.table
        left_column_name = relationship.source.column
        right_table_name = relationship.target.table
        right_column_name = relationship.target.column

        left_source = self.sources.get(left_table_name)
        right_source = self.sources.get(right_table_name)
        if not (left_source and right_source):
            continue

        left_column = next((c for c in left_source.table.columns if c.name == left_column_name), None)
        right_column = next((c for c in right_source.table.columns if c.name == right_column_name), None)
        if not (left_column and right_column):
            continue

        link_data.append(
            {
                "left_table": left_table_name,
                "left_column": left_column_name,
                **side_stats("left", left_column),
                "right_table": right_table_name,
                "right_column": right_column_name,
                **side_stats("right", right_column),
            }
        )
    return pd.DataFrame(link_data)
@property
def business_glossary_df(self) -> pd.DataFrame:
    """Return one row per column with its business description and tags."""
    records = [
        {
            "table_name": source.table.name,
            "column_name": column.name,
            "business_glossary": column.description,
            "business_tags": column.tags,
        }
        for source in self.sources.values()
        for column in source.table.columns
    ]
    return pd.DataFrame(records)

src/intugle/streamlit.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
2+
from intugle.analysis.models import DataSet
3+
from intugle.core import settings
4+
from intugle.exporters import CSVExporter
5+
from intugle.parser.manifest import ManifestLoader
6+
7+
class StreamlitApp:
    """Loads the project manifest and exposes CSV-export helpers for the CLI."""

    def __init__(self, project_base: str = settings.PROJECT_BASE):
        # Load the manifest describing all registered sources.
        self.manifest_loader = ManifestLoader(project_base)
        self.manifest_loader.load()
        self.manifest = self.manifest_loader.manifest

        self.project_base = project_base

        self.load_all()

    def load_all(self):
        """Construct a DataSet for every source in the manifest.

        DataSet registration happens as a side effect of construction; the
        instances themselves are intentionally not retained here.
        """
        for source in self.manifest.sources.values():
            DataSet(data=source.table.details, name=source.table.name)

    def export_analysis_to_csv(self):
        """Export the analysis results (profiles, links, glossary) to CSV files."""
        exporter = CSVExporter(self.manifest, self.project_base)
        exporter.export_all()
        # Fixed typo in the user-facing message ("Succesfulluy" -> "Successfully").
        print("Successfully exported analysis results to CSV files.")

0 commit comments

Comments
 (0)