From 827367e8999bd0282cbfc526232c3f684922851c Mon Sep 17 00:00:00 2001
From: Yoo HoJun
Date: Wed, 4 Feb 2026 13:47:20 +0900
Subject: [PATCH] feat: add GO enrichment analysis page for ProteomicsLFQ results

---
Review notes (ignored by git am):
- Line breaks and indentation of this patch were reconstructed. The
  indentation of the app.py context lines is assumed; if the hunk does not
  match, apply with `git apply --ignore-whitespace`.

 app.py                           |   1 +
 content/results_proteomicslfq.py | 198 +++++++++++++++++++++++++++++++
 requirements.txt                 |   1 +
 3 files changed, 200 insertions(+)
 create mode 100644 content/results_proteomicslfq.py

diff --git a/app.py b/app.py
index 6c276f0..d0182da 100644
--- a/app.py
+++ b/app.py
@@ -27,6 +27,7 @@
             st.Page(Path("content", "results_pca.py"), title="PCA", icon="📊"),
             st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="🔥"),
             st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="📚"),
+            st.Page(Path("content", "results_proteomicslfq.py"), title="GO Terms", icon="🧪"),
         ],
     }
 
diff --git a/content/results_proteomicslfq.py b/content/results_proteomicslfq.py
new file mode 100644
index 0000000..3acd031
--- /dev/null
+++ b/content/results_proteomicslfq.py
@@ -0,0 +1,198 @@
+# Streamlit page: protein-level abundance table plus GO enrichment analysis
+# of the proteins that pass the significance cutoffs.
+from pathlib import Path
+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import mygene
+from collections import defaultdict
+from scipy.stats import ttest_ind, fisher_exact
+
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data, get_workflow_dir
+
+# GO categories, named by the keys read from the MyGene.info "go" field.
+GO_TYPES = ["BP", "CC", "MF"]
+
+
+def get_clean_uniprot(name):
+    """Return the accession part of a protein identifier.
+
+    Pipe-delimited identifiers ("db|ACCESSION|ENTRY") yield the second field;
+    anything else is returned as its string form. Missing values (None/NaN)
+    yield None so they are dropped instead of being queried as "nan".
+    """
+    if name is None or (isinstance(name, float) and np.isnan(name)):
+        return None
+    parts = str(name).split("|")
+    return parts[1] if len(parts) >= 2 else parts[0]
+
+
+def extract_go_terms(go_data, go_type):
+    """Return the sorted, de-duplicated term names of one GO category.
+
+    go_data is the "go" value of a single query hit; anything that is not a
+    dict containing go_type yields an empty list.
+    """
+    if not isinstance(go_data, dict) or go_type not in go_data:
+        return []
+    terms = go_data[go_type]
+    if isinstance(terms, dict):
+        # A lone annotation may arrive as a dict rather than a list of dicts.
+        terms = [terms]
+    return sorted({t["term"] for t in terms if isinstance(t, dict) and t.get("term")})
+
+
+def fetch_go_annotations(ids):
+    """Query MyGene.info for the GO annotations of the given UniProt IDs.
+
+    Returns a DataFrame with one row per hit holding "query", "go" and one
+    "<type>_terms" list column per GO category. Hits flagged "notfound" are
+    removed.
+    """
+    mg = mygene.MyGeneInfo()
+    hits = mg.querymany(ids, scopes="uniprot", fields="go", as_dataframe=False)
+    # reindex guarantees the columns exist even when no hit carries them;
+    # without it a result set lacking any "go" entry raised KeyError.
+    annotations = pd.DataFrame(hits).reindex(columns=["query", "go", "notfound"])
+    annotations = annotations.loc[annotations["notfound"].ne(True), ["query", "go"]].copy()
+    for go_type in GO_TYPES:
+        annotations[f"{go_type}_terms"] = annotations["go"].apply(extract_go_terms, args=(go_type,))
+    return annotations
+
+
+def run_go_enrichment(annotations, go_type, fg_set, bg_set):
+    """Test every GO term seen in the foreground for over-representation.
+
+    Each term gets a 2x2 table (foreground vs. rest of background, annotated
+    vs. not annotated) evaluated with a one-sided Fisher's exact test.
+
+    Args:
+        annotations: DataFrame with a "query" column and a "<go_type>_terms"
+            column holding a list of term names per row.
+        go_type: GO category whose terms are tested ("BP", "CC" or "MF").
+        fg_set: IDs of the foreground proteins; must be a subset of bg_set.
+        bg_set: IDs of all background proteins.
+
+    Returns:
+        DataFrame sorted by ascending p-value with the columns "GO_Term",
+        "Count", "GeneRatio", "p_value" and "-log10(p)"; empty when no
+        foreground protein carries a term.
+    """
+    go2fg = defaultdict(set)
+    go2bg = defaultdict(set)
+    for uid, terms in zip(annotations["query"].astype(str), annotations[f"{go_type}_terms"]):
+        for term in terms:
+            go2bg[term].add(uid)
+            if uid in fg_set:
+                go2fg[term].add(uid)
+
+    n_fg = len(fg_set)
+    n_bg = len(bg_set)
+    records = []
+    for term, fg_genes in go2fg.items():
+        a = len(fg_genes)  # foreground, annotated
+        b = n_fg - a  # foreground, not annotated
+        c = len(go2bg[term]) - a  # rest of background, annotated
+        d = n_bg - (a + b + c)  # rest of background, not annotated
+        _, p = fisher_exact([[a, b], [c, d]], alternative="greater")
+        records.append({"GO_Term": term, "Count": a, "GeneRatio": f"{a}/{n_fg}", "p_value": p})
+
+    df_go = pd.DataFrame(records)
+    if not df_go.empty:
+        # Clip rather than map 0 to a fixed 1e-10: a p-value that underflows
+        # to 0 must not plot below non-zero p-values smaller than 1e-10.
+        df_go["-log10(p)"] = -np.log10(df_go["p_value"].clip(lower=np.finfo(float).tiny))
+        df_go = df_go.sort_values("p_value")
+    return df_go
+
+
+def render_go_enrichment(pivot_df):
+    """Render the GO enrichment controls and, once requested, the results."""
+    st.markdown("---")
+    st.subheader("🧬 GO Enrichment Analysis")
+
+    p_cutoff = st.slider("Select p-value threshold for Foreground Proteins", 0.01, 0.50, 0.05)
+    fc_cutoff = st.slider("Select |log2FC| threshold for Foreground Proteins", 0.0, 5.0, 1.0, step=0.1)
+
+    if not st.button("Run GO Enrichment"):
+        return
+
+    analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy()
+    if analysis_df.empty:
+        st.error("No valid statistical data found.")
+        return
+
+    with st.spinner("Fetching GO terms from MyGene.info API..."):
+        try:
+            analysis_df["UniProt"] = analysis_df["ProteinName"].apply(get_clean_uniprot)
+
+            # Background: every protein with valid statistics; foreground: the
+            # subset passing both the p-value and the fold-change cutoff.
+            bg_ids = analysis_df["UniProt"].dropna().unique().tolist()
+            significant = (analysis_df["p-value"] < p_cutoff) & (analysis_df["log2FC"].abs() >= fc_cutoff)
+            fg_ids = analysis_df.loc[significant, "UniProt"].dropna().unique().tolist()
+
+            if len(fg_ids) < 3:
+                st.warning(f"Not enough significant proteins (p < {p_cutoff}, |log2FC| ≥ {fc_cutoff}). Found: {len(fg_ids)}")
+                return
+
+            annotations = fetch_go_annotations(bg_ids)
+            fg_set = set(fg_ids)
+            bg_set = set(bg_ids)
+            enrich_results = {go: run_go_enrichment(annotations, go, fg_set, bg_set) for go in GO_TYPES}
+
+            tab_labels = ["🧬 Biological Process", "🏠 Cellular Component", "⚙️ Molecular Function"]
+            for tab, go_type in zip(st.tabs(tab_labels), GO_TYPES):
+                with tab:
+                    df_go = enrich_results[go_type]
+                    if df_go.empty:
+                        st.warning(f"No enriched {go_type} terms found.")
+                        continue
+                    fig = px.bar(
+                        df_go.head(15), x="-log10(p)", y="GO_Term", orientation="h",
+                        text="GeneRatio", color="-log10(p)", color_continuous_scale="Viridis",
+                    )
+                    fig.update_layout(yaxis={"categoryorder": "total ascending"}, margin=dict(l=300))
+                    st.plotly_chart(fig, use_container_width=True)
+                    st.dataframe(df_go, use_container_width=True)
+
+            st.success("GO Enrichment analysis completed successfully.")
+        except Exception as e:
+            # Page-level boundary: show API/data failures in the UI instead of
+            # aborting the page with a traceback.
+            st.error(f"GO enrichment failed: {e}")
+
+
+params = page_setup()
+st.title("ProteomicsLFQ Results")
+
+if "workspace" not in st.session_state:
+    st.warning("Please initialize your workspace first.")
+    st.stop()
+
+abundance = get_abundance_data(st.session_state["workspace"])
+if abundance is None:
+    st.info("Abundance data not available or incomplete. Please run the workflow and configure sample groups first.")
+    st.stop()
+
+pivot_df, expr_df, group_map = abundance
+
+protein_tab, = st.tabs(["🧬 Protein Table"])
+
+# Protein-level tab
+with protein_tab:
+    st.markdown("### 🧬 Protein-Level Abundance Table")
+    st.info(
+        "This protein-level table is generated by grouping all PSMs that map to the "
+        "same protein and aggregating their intensities across samples.\n\n"
+        "Additionally, log2 fold change and p-values are calculated between sample groups."
+    )
+
+    if pivot_df.empty:
+        st.info("No protein-level data available.")
+    else:
+        st.session_state["pivot_df"] = pivot_df
+        st.dataframe(pivot_df.sort_values("p-value"), use_container_width=True)
+        render_go_enrichment(pivot_df)
diff --git a/requirements.txt b/requirements.txt
index 510c539..d8a278d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -149,3 +149,4 @@ pyprophet>=2.2.0
 # Redis Queue dependencies (for online mode)
 redis>=5.0.0
 rq>=1.16.0
+mygene