-
Notifications
You must be signed in to change notification settings - Fork 1
refactor: decouple GO enrichment logic and improve data flow safety #10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d8e402d
9bf39df
272924f
c1b5bc9
ebeb023
b754582
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
from pathlib import Path
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px

from src.common.common import page_setup
from src.common.results_helpers import get_abundance_data, get_workflow_dir


# ================================
# Page setup
# ================================
params = page_setup()
st.title("ProteomicsLFQ Results")

# ================================
# Workspace check
# ================================
if "workspace" not in st.session_state:
    st.warning("Please initialize your workspace first.")
    st.stop()

# ================================
# Load abundance data
# ================================
# get_abundance_data returns None when the workflow has not run or sample
# groups are not configured; stop early so the tabs below never see bad data.
res = get_abundance_data(st.session_state["workspace"])
if res is None:
    st.info(
        "Abundance data not available or incomplete. "
        "Please run the workflow and configure sample groups first."
    )
    st.stop()

pivot_df, expr_df, group_map = res

# ================================
# Tabs
# ================================
protein_tab, = st.tabs(["🧬 Protein Table"])

# ================================
# Protein-level results
# ================================
with protein_tab:
    st.markdown("### 🧬 Protein-Level Abundance Table")
    st.info(
        "This protein-level table is generated by grouping all PSMs that map to the "
        "same protein and aggregating their intensities across samples.\n\n"
        "Additionally, log2 fold change and p-values are calculated between sample groups."
    )

    if pivot_df.empty:
        st.info("No protein-level data available.")
    else:
        # Cache for other pages; display sorted by significance.
        st.session_state["pivot_df"] = pivot_df
        st.dataframe(pivot_df.sort_values("p-value"), use_container_width=True)

# ======================================================
# GO Enrichment Results
# ======================================================
st.markdown("---")
st.subheader("🧬 GO Enrichment Analysis")

# Use the centralized workflow-dir helper rather than hardcoding "topp-workflow",
# consistent with the other results pages.
results_dir = get_workflow_dir(st.session_state["workspace"]) / "results" / "go-terms"
go_json_file = results_dir / "go_results.json"

if not go_json_file.exists():
    st.info("GO Enrichment results are not available yet. Please run the analysis first.")
else:
    # Lazy imports: only needed when enrichment results actually exist.
    import json
    import plotly.io as pio

    with open(go_json_file, "r") as f:
        go_data = json.load(f)

    bp_tab, cc_tab, mf_tab = st.tabs([
        "🧬 Biological Process",
        "🏠 Cellular Component",
        "⚙️ Molecular Function",
    ])

    for tab, go_type in zip([bp_tab, cc_tab, mf_tab], ["BP", "CC", "MF"]):
        with tab:
            if go_type not in go_data:
                st.info(f"No enriched {go_type} terms found.")
                continue

            df_go = pd.DataFrame(go_data[go_type]["df_dict"])

            if df_go.empty:
                # Nothing enriched: say so and render neither the (stale)
                # figure nor an empty table.
                st.info(f"No enriched {go_type} terms found.")
            else:
                # Only deserialize the Plotly figure when there is data to show.
                fig = pio.from_json(go_data[go_type]["fig_json"])
                st.plotly_chart(fig, use_container_width=True)

                st.markdown(f"#### {go_type} Enrichment Results")
                st.dataframe(df_go, use_container_width=True)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,5 +5,4 @@ | |
| params = page_setup() | ||
|
|
||
| wf = WorkflowTest() | ||
|
|
||
| wf.show_execution_section() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,12 +6,17 @@ | |
| from pyopenms import IdXMLFile | ||
| from scipy.stats import ttest_ind | ||
| import numpy as np | ||
| import mygene | ||
|
|
||
| from collections import defaultdict | ||
| from scipy.stats import fisher_exact | ||
| from src.workflow.WorkflowManager import WorkflowManager | ||
| from src.common.common import page_setup | ||
| from src.common.results_helpers import get_abundance_data | ||
| from src.common.results_helpers import parse_idxml, build_spectra_cache | ||
| from openms_insight import Table, Heatmap, LinePlot, SequenceView | ||
|
|
||
|
|
||
| # params = page_setup() | ||
| class WorkflowTest(WorkflowManager): | ||
|
|
||
| def __init__(self) -> None: | ||
|
|
@@ -354,16 +359,14 @@ def execution(self) -> bool: | |
| filter_dir = results_dir / "filter_results" | ||
| quant_dir = results_dir / "quant_results" | ||
|
|
||
| results_dir = Path(self.workflow_dir, "input-files") | ||
|
|
||
| for d in [comet_dir, perc_dir, filter_dir, quant_dir]: | ||
| d.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| self.logger.log("📁 Output directories created") | ||
|
|
||
| # ================================ | ||
| # 2️⃣ File path definitions (per sample) | ||
| # ================================ | ||
| # # ================================ | ||
| # # 2️⃣ File path definitions (per sample) | ||
| # # ================================ | ||
| comet_results = [] | ||
| percolator_results = [] | ||
| filter_results = [] | ||
|
|
@@ -499,10 +502,6 @@ def execution(self) -> bool: | |
|
|
||
| self.logger.log("✅ Peptide search complete") | ||
|
|
||
| # if not Path(comet_results).exists(): | ||
| # st.error(f"CometAdapter failed for {stem}") | ||
| # st.stop() | ||
|
|
||
| # --- PercolatorAdapter --- | ||
| self.logger.log("📊 Running rescoring...") | ||
| with st.spinner(f"PercolatorAdapter ({stem})"): | ||
|
|
@@ -764,9 +763,9 @@ def execution(self) -> bool: | |
|
|
||
| st.success(f"✓ {stem} identification completed") | ||
|
|
||
| # # ================================ | ||
| # # 4️⃣ ProteomicsLFQ (cross-sample) | ||
| # # ================================ | ||
| # ================================ | ||
| # 4️⃣ ProteomicsLFQ (cross-sample) | ||
| # ================================ | ||
| self.logger.log("📈 Running cross-sample quantification...") | ||
| st.info("Running ProteomicsLFQ (cross-sample quantification)") | ||
|
|
||
|
|
@@ -813,10 +812,17 @@ def execution(self) -> bool: | |
| return False | ||
| self.logger.log("✅ Quantification complete") | ||
|
|
||
| # if not Path(quant_mztab).exists(): | ||
| # st.error("ProteomicsLFQ failed: mzTab not created") | ||
| # st.stop() | ||
|
|
||
| # ====================================================== | ||
| # ⚠️ 5️⃣ GO Enrichment Analysis (INLINE IN EXECUTION) | ||
| # ====================================================== | ||
| workspace_path = Path(self.workflow_dir).parent | ||
| res = get_abundance_data(workspace_path) | ||
| if res is not None: | ||
| pivot_df, _, _ = res | ||
| self.logger.log("✅ pivot_df loaded, starting GO enrichment...") | ||
| self._run_go_enrichment(pivot_df, results_dir) | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| else: | ||
| st.warning("GO enrichment skipped: abundance data not available.") | ||
|
|
||
| # ================================ | ||
| # 5️⃣ Final report | ||
|
|
@@ -825,13 +831,158 @@ def execution(self) -> bool: | |
| st.write("📁 Results directory:") | ||
| st.code(str(results_dir)) | ||
|
|
||
| return True | ||
|
|
||
| def _run_go_enrichment(self, pivot_df: pd.DataFrame, results_dir: Path): | ||
| p_cutoff = 0.05 | ||
| fc_cutoff = 1.0 | ||
|
|
||
| st.write("📄 Generated files:") | ||
| st.write(f"- mzTab: {quant_mztab}") | ||
| st.write(f"- consensusXML: {quant_cxml}") | ||
| st.write(f"- MSstats CSV: {quant_msstats}") | ||
| analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy() | ||
|
|
||
| return True | ||
| if analysis_df.empty: | ||
| st.error("No valid statistical data found for GO enrichment.") | ||
| self.logger.log("❗ analysis_df is empty") | ||
| else: | ||
| with st.spinner("Fetching GO terms from MyGene.info API..."): | ||
| mg = mygene.MyGeneInfo() | ||
|
|
||
| def get_clean_uniprot(name): | ||
| parts = str(name).split("|") | ||
| return parts[1] if len(parts) >= 2 else parts[0] | ||
|
|
||
| analysis_df["UniProt"] = analysis_df["ProteinName"].apply(get_clean_uniprot) | ||
|
|
||
| bg_ids = analysis_df["UniProt"].dropna().astype(str).unique().tolist() | ||
| fg_ids = analysis_df[ | ||
| (analysis_df["p-value"] < p_cutoff) & | ||
| (analysis_df["log2FC"].abs() >= fc_cutoff) | ||
| ]["UniProt"].dropna().astype(str).unique().tolist() | ||
| self.logger.log("✅ get_clean_uniprot applied") | ||
|
|
||
| if len(fg_ids) < 3: | ||
| st.warning( | ||
| f"Not enough significant proteins " | ||
| f"(p < {p_cutoff}, |log2FC| ≥ {fc_cutoff}). " | ||
| f"Found: {len(fg_ids)}" | ||
| ) | ||
| self.logger.log("❗ Not enough significant proteins") | ||
| else: | ||
| res_list = mg.querymany( | ||
| bg_ids, scopes="uniprot", fields="go", as_dataframe=False | ||
| ) | ||
|
Comment on lines
+870
to
+872
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
🛡️ Proposed fix — add error handling- res_list = mg.querymany(
- bg_ids, scopes="uniprot", fields="go", as_dataframe=False
- )
+ try:
+ res_list = mg.querymany(
+ bg_ids, scopes="uniprot", fields="go", as_dataframe=False
+ )
+ except Exception as e:
+ self.logger.log(f"❗ MyGene API call failed: {e}")
+ st.warning("GO enrichment skipped: failed to fetch GO terms from MyGene.info.")
+ return🤖 Prompt for AI Agents |
||
| res_go = pd.DataFrame(res_list) | ||
| if "notfound" in res_go.columns: | ||
| res_go = res_go[res_go["notfound"] != True] | ||
|
|
||
| def extract_go_terms(go_data, go_type): | ||
| if not isinstance(go_data, dict) or go_type not in go_data: | ||
| return [] | ||
| terms = go_data[go_type] | ||
| if isinstance(terms, dict): | ||
| terms = [terms] | ||
| return list({t.get("term") for t in terms if "term" in t}) | ||
|
|
||
| for go_type in ["BP", "CC", "MF"]: | ||
| res_go[f"{go_type}_terms"] = res_go["go"].apply( | ||
| lambda x: extract_go_terms(x, go_type) | ||
| ) | ||
|
Comment on lines
+875
to
+888
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fix lint issues flagged by static analysis. Three issues in this block:
🔧 Proposed fix if "notfound" in res_go.columns:
- res_go = res_go[res_go["notfound"] != True]
+ res_go = res_go[~res_go["notfound"].fillna(False)]
def extract_go_terms(go_data, go_type):
if not isinstance(go_data, dict) or go_type not in go_data:
return []
terms = go_data[go_type]
if isinstance(terms, dict):
terms = [terms]
return list({t.get("term") for t in terms if "term" in t})
for go_type in ["BP", "CC", "MF"]:
res_go[f"{go_type}_terms"] = res_go["go"].apply(
- lambda x: extract_go_terms(x, go_type)
+ lambda x, _gt=go_type: extract_go_terms(x, _gt)
)
annotated_ids = set(res_go["query"].astype(str))
fg_set = annotated_ids.intersection(fg_ids)
bg_set = annotated_ids
- self.logger.log(f"✅ fg_set bg_set are set")
+ self.logger.log("✅ fg_set bg_set are set")🧰 Tools🪛 Ruff (0.14.14)[error] 875-875: Avoid inequality comparisons to Replace with (E712) [warning] 887-887: Function definition does not bind loop variable (B023) 🤖 Prompt for AI Agents |
||
|
|
||
| annotated_ids = set(res_go["query"].astype(str)) | ||
| fg_set = annotated_ids.intersection(fg_ids) | ||
| bg_set = annotated_ids | ||
| self.logger.log(f"✅ fg_set bg_set are set") | ||
|
|
||
| def run_go(go_type): | ||
| go2fg = defaultdict(set) | ||
| go2bg = defaultdict(set) | ||
|
|
||
| for _, row in res_go.iterrows(): | ||
| uid = str(row["query"]) | ||
| for term in row[f"{go_type}_terms"]: | ||
| go2bg[term].add(uid) | ||
| if uid in fg_set: | ||
| go2fg[term].add(uid) | ||
|
|
||
| records = [] | ||
| N_fg = len(fg_set) | ||
| N_bg = len(bg_set) | ||
|
|
||
| for term, fg_genes in go2fg.items(): | ||
| a = len(fg_genes) | ||
| if a == 0: | ||
| continue | ||
| b = N_fg - a | ||
| c = len(go2bg[term]) - a | ||
| d = N_bg - (a + b + c) | ||
|
|
||
| _, p = fisher_exact([[a, b], [c, d]], alternative="greater") | ||
| records.append({ | ||
| "GO_Term": term, | ||
| "Count": a, | ||
| "GeneRatio": f"{a}/{N_fg}", | ||
| "p_value": p, | ||
| }) | ||
|
|
||
| df = pd.DataFrame(records) | ||
| if df.empty: | ||
| return None, None | ||
|
|
||
| df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10)) | ||
| df = df.sort_values("p_value").head(20) | ||
|
|
||
| # ✅ Plotly Figure | ||
| fig = px.bar( | ||
| df, | ||
| x="-log10(p)", | ||
| y="GO_Term", | ||
| orientation="h", | ||
| title=f"GO Enrichment ({go_type})", | ||
| ) | ||
|
|
||
| self.logger.log(f"✅ Plotly Figure generated") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove extraneous These f-strings have no interpolation placeholders. 🔧 Proposed fix- self.logger.log(f"✅ Plotly Figure generated")
+ self.logger.log("✅ Plotly Figure generated")- self.logger.log(f"✅ go_type generated")
+ self.logger.log("✅ go_type generated")Also applies to: 961-961 🧰 Tools🪛 Ruff (0.14.14)[error] 942-942: f-string without any placeholders Remove extraneous (F541) 🤖 Prompt for AI Agents |
||
|
|
||
| fig.update_layout( | ||
| yaxis=dict(autorange="reversed"), | ||
| height=500, | ||
| margin=dict(l=10, r=10, t=40, b=10), | ||
| ) | ||
|
|
||
| return fig, df | ||
|
|
||
| go_results = {} | ||
|
|
||
| for go_type in ["BP", "CC", "MF"]: | ||
| fig, df_go = run_go(go_type) | ||
| if fig is not None: | ||
| go_results[go_type] = { | ||
| "fig": fig, | ||
| "df": df_go | ||
| } | ||
| self.logger.log(f"✅ go_type generated") | ||
|
|
||
| go_dir = results_dir / "go-terms" | ||
| go_dir.mkdir(parents=True, exist_ok=True) | ||
|
|
||
| import json | ||
| go_data = {} | ||
|
|
||
| for go_type in ["BP", "CC", "MF"]: | ||
| if go_type in go_results: | ||
| fig = go_results[go_type]["fig"] | ||
| df = go_results[go_type]["df"] | ||
|
|
||
| go_data[go_type] = { | ||
| "fig_json": fig.to_json(), # Figure → JSON string | ||
| "df_dict": df.to_dict(orient="records") # DataFrame → list of dicts | ||
| } | ||
|
|
||
| go_json_file = go_dir / "go_results.json" | ||
| with open(go_json_file, "w") as f: | ||
| json.dump(go_data, f) | ||
| st.session_state["go_results"] = go_results | ||
| st.session_state["go_ready"] = True if go_data else False | ||
|
Comment on lines
+982
to
+983
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Storing Plotly figures and flags in
♻️ Proposed fix — remove dead session state writes with open(go_json_file, "w") as f:
json.dump(go_data, f)
- st.session_state["go_results"] = go_results
- st.session_state["go_ready"] = True if go_data else False
self.logger.log("✅ GO enrichment analysis complete")🤖 Prompt for AI Agents |
||
| self.logger.log("✅ GO enrichment analysis complete") | ||
|
|
||
|
|
||
| @st.fragment | ||
| def results(self) -> None: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: OpenMS/quantms-web
Length of output: 334
🏁 Script executed:
Repository: OpenMS/quantms-web
Length of output: 96
🏁 Script executed:
Repository: OpenMS/quantms-web
Length of output: 232
🏁 Script executed:
# Examine the results_helpers.py file to see the helper function cat -n src/common/results_helpers.pyRepository: OpenMS/quantms-web
Length of output: 12765
🏁 Script executed:
Repository: OpenMS/quantms-web
Length of output: 1713
🏁 Script executed:
Repository: OpenMS/quantms-web
Length of output: 1045
Use the centralized
get_workflow_dir()helper instead of hardcoding the path.Line 64 hardcodes
"topp-workflow", butsrc/common/results_helpers.pyprovidesget_workflow_dir(workspace)for this purpose. Other result files (e.g.,results_abundance.py,results_library.py,results_filtered.py) already import and use this helper. For consistency and maintainability, importget_workflow_dirand replace the hardcoded path:Then on line 64:
🤖 Prompt for AI Agents