From 827367e8999bd0282cbfc526232c3f684922851c Mon Sep 17 00:00:00 2001
From: Yoo HoJun <hjn0415a@gmail.com>
Date: Wed, 4 Feb 2026 13:47:20 +0900
Subject: [PATCH] feat: add GO enrichment analysis page for ProteomicsLFQ
 results

---
 app.py                           |   1 +
 content/results_proteomicslfq.py | 141 +++++++++++++++++++++++++++++++
 requirements.txt                 |   1 +
 3 files changed, 143 insertions(+)
 create mode 100644 content/results_proteomicslfq.py

diff --git a/app.py b/app.py
index 6c276f0..d0182da 100644
--- a/app.py
+++ b/app.py
@@ -27,6 +27,7 @@
             st.Page(Path("content", "results_pca.py"), title="PCA", icon="📊"),
             st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="🔥"),
             st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="📚"),
+            st.Page(Path("content", "results_proteomicslfq.py"), title="GO Terms", icon="🧪"),
         ],
     }
 
diff --git a/content/results_proteomicslfq.py b/content/results_proteomicslfq.py
new file mode 100644
index 0000000..3acd031
--- /dev/null
+++ b/content/results_proteomicslfq.py
@@ -0,0 +1,141 @@
+from pathlib import Path
+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import mygene
+from collections import defaultdict
+from scipy.stats import ttest_ind, fisher_exact
+
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data, get_workflow_dir
+
+params = page_setup()  # page_setup comes from src.common.common; its return value is not read again on this page
+st.title("ProteomicsLFQ Results")
+# Everything below reads from the workspace, so rendering stops here until one exists in the session.
+if "workspace" not in st.session_state:
+    st.warning("Please initialize your workspace first.")
+    st.stop()
+# A None result is treated as "abundance data not ready" and ends the page early with a hint.
+res = get_abundance_data(st.session_state["workspace"])
+if res is None:
+    st.info("Abundance data not available or incomplete. Please run the workflow and configure sample groups first.")
+    st.stop()
+# Only pivot_df is used further down; expr_df and group_map are unpacked but never read on this page.
+pivot_df, expr_df, group_map = res
+
+protein_tab, = st.tabs(["🧬 Protein Table"])
+# The trailing comma above unpacks the single tab returned for the one-label list.
+# Protein-level tab: abundance table first, then the GO enrichment controls and results.
+with protein_tab:
+    st.markdown("### 🧬 Protein-Level Abundance Table")
+    st.info(
+            "This protein-level table is generated by grouping all PSMs that map to the "
+            "same protein and aggregating their intensities across samples.\n\n"
+            "Additionally, log2 fold change and p-values are calculated between sample groups."
+        )
+    # Show the table sorted by its "p-value" column and keep a copy in session state under "pivot_df".
+    if pivot_df.empty:
+        st.info("No protein-level data available.")
+    else:
+        st.session_state["pivot_df"] = pivot_df
+        st.dataframe(pivot_df.sort_values("p-value"), use_container_width=True)
+    # The GO section below is rendered whether or not the table above was empty.
+    st.markdown("---")
+    st.subheader("🧬 GO Enrichment Analysis")
+    # Cutoffs defining the foreground set: p-value below p_cutoff AND |log2FC| at or above fc_cutoff.
+    p_cutoff = st.slider("Select p-value threshold for Foreground Proteins", 0.01, 0.50, 0.05)
+    fc_cutoff = st.slider("Select |log2FC| threshold for Foreground Proteins", 0.0, 5.0, 1.0, step=0.1)
+
+    if st.button("Run GO Enrichment"):  # the whole analysis lives in this branch and runs only when the button reports a click
+        analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy()  # rows lacking either statistic cannot be classified
+        if analysis_df.empty:
+            st.error("No valid statistical data found.")
+        else:
+            with st.spinner("Fetching GO terms from MyGene.info API..."):
+                try:  # any exception raised below is reported through st.error in the matching except clause
+                    mg = mygene.MyGeneInfo()  # client used further down for the GO annotation lookup
+
+                    def get_clean_uniprot(name):
+                        """Return the accession from 'db|ACC|ENTRY'-style names, the name itself if it has no '|', or None if missing."""
+                        if pd.isna(name):  # str(NaN)/str(None) would otherwise leak the literal IDs "nan"/"None" past the later dropna()
+                            return None
+                        parts = str(name).split("|")
+                        return parts[1] if len(parts) >= 2 else parts[0]
+
+                    # Background = every protein with statistics; foreground = the subset passing both cutoffs.
+                    analysis_df["UniProt"] = analysis_df["ProteinName"].apply(get_clean_uniprot)
+                    bg_ids = analysis_df["UniProt"].dropna().unique().tolist()
+                    passes_p = analysis_df["p-value"] < p_cutoff
+                    passes_fc = analysis_df["log2FC"].abs() >= fc_cutoff
+                    fg_ids = analysis_df.loc[passes_p & passes_fc, "UniProt"].dropna().unique().tolist()
+
+                    # With fewer than three foreground proteins the lookup and the tests are skipped entirely.
+                    if len(fg_ids) < 3:
+                        st.warning(f"Not enough significant proteins (p < {p_cutoff}, |log2FC| ≥ {fc_cutoff}). Found: {len(fg_ids)}")
+                    else:
+                        # Keep real hits only; fixing the columns guarantees "query" and "go" exist even when nothing is annotated.
+                        res_list = mg.querymany(bg_ids, scopes="uniprot", fields="go", as_dataframe=False)
+                        hits = [hit for hit in res_list if not hit.get("notfound")]
+                        res = pd.DataFrame(hits, columns=["query", "go"])
+
+                        def extract_go_terms(go_data, go_type):
+                            """Return the distinct "term" names listed under one GO category, or [] when the category is absent."""
+                            if not isinstance(go_data, dict) or go_type not in go_data:
+                                return []
+                            entries = go_data[go_type]
+                            entries = [entries] if isinstance(entries, dict) else entries  # a lone annotation arrives as a bare dict
+                            return list({entry.get("term") for entry in entries if "term" in entry})
+
+                        for go_type in ["BP", "CC", "MF"]:  # adds one "<category>_terms" list column per GO category
+                            res[f"{go_type}_terms"] = res["go"].apply(lambda x: extract_go_terms(x, go_type))
+                        # Sets provide the membership checks and the totals used by the enrichment test below.
+                        fg_set = set(fg_ids)
+                        bg_set = set(bg_ids)
+
+                        def run_go_enrichment(go_type):
+                            """Fisher-test each term of one GO category for foreground over-representation; returns a frame sorted by raw (unadjusted) p-value."""
+                            go2fg = defaultdict(set)
+                            go2bg = defaultdict(set)
+                            for _, row in res.iterrows():
+                                uid = str(row["query"])
+                                for term in row[f"{go_type}_terms"]:
+                                    go2bg[term].add(uid)
+                                    if uid in fg_set:
+                                        go2fg[term].add(uid)
+
+                            records = []
+                            N_fg = len(fg_set)
+                            N_bg = len(bg_set)
+                            for term, fg_genes in go2fg.items():
+                                a = len(fg_genes)  # foreground with term; always >= 1 because go2fg only gains a key on a hit
+                                b = N_fg - a  # foreground without term
+                                c = len(go2bg[term]) - a  # non-foreground background with term
+                                d = N_bg - (a + b + c)  # non-foreground background without term
+                                _, p = fisher_exact([[a, b], [c, d]], alternative="greater")
+                                records.append({"GO_Term": term, "Count": a, "GeneRatio": f"{a}/{N_fg}", "p_value": p})
+
+                            df_go = pd.DataFrame(records)
+                            if not df_go.empty:
+                                # Clamp at the smallest positive float rather than 1e-10, so an underflowed p of 0 is not ranked below p = 1e-50.
+                                df_go["-log10(p)"] = -np.log10(df_go["p_value"].clip(lower=np.finfo(float).tiny))
+                                df_go = df_go.sort_values("p_value")
+                            return df_go
+
+                        go_types = ["BP", "CC", "MF"]
+                        enrich_results = {go: run_go_enrichment(go) for go in go_types}
+                        bp_tab, cc_tab, mf_tab = st.tabs(["🧬 Biological Process", "🏠 Cellular Component", "⚙️ Molecular Function"])
+                        for tab, go_type in zip([bp_tab, cc_tab, mf_tab], go_types):
+                            with tab:
+                                df_go = enrich_results[go_type]
+                                if df_go.empty:
+                                    st.warning(f"No enriched {go_type} terms found.")
+                                else:
+                                    # Bar chart of the first 15 rows (the frame arrives sorted by p-value), then the full table.
+                                    fig = px.bar(df_go.head(15), x="-log10(p)", y="GO_Term", orientation="h", text="GeneRatio", color="-log10(p)", color_continuous_scale="Viridis")
+                                    fig.update_layout(yaxis={"categoryorder": "total ascending"}, margin=dict(l=300))
+                                    st.plotly_chart(fig, use_container_width=True)
+                                    st.dataframe(df_go, use_container_width=True)
+                        st.success("GO Enrichment analysis completed successfully.")
+                except Exception as e:
+                    st.error(f"GO enrichment failed: {e}")
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 510c539..d8a278d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -149,3 +149,4 @@ pyprophet>=2.2.0
 # Redis Queue dependencies (for online mode)
 redis>=5.0.0
 rq>=1.16.0
+mygene
\ No newline at end of file