From d8e402daf1882820d4299cb741f67a2789604b0d Mon Sep 17 00:00:00 2001 From: Yoo HoJun Date: Fri, 6 Feb 2026 17:13:16 +0900 Subject: [PATCH 1/6] test commit --- app.py | 1 + content/results_proteomicslfq.py | 95 +++++++++++++++++++++ src/WorkflowTest.py | 140 ++++++++++++++++++++++++++++++- src/workflow/WorkflowManager.py | 16 +++- 4 files changed, 245 insertions(+), 7 deletions(-) create mode 100644 content/results_proteomicslfq.py diff --git a/app.py b/app.py index 6c276f0..194d857 100644 --- a/app.py +++ b/app.py @@ -27,6 +27,7 @@ st.Page(Path("content", "results_pca.py"), title="PCA", icon="πŸ“Š"), st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="πŸ”₯"), st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="πŸ“š"), + st.Page(Path("content", "results_proteomicslfq.py"), title="Proteomics LFQ", icon="πŸ§ͺ"), ], } diff --git a/content/results_proteomicslfq.py b/content/results_proteomicslfq.py new file mode 100644 index 0000000..dbe14ce --- /dev/null +++ b/content/results_proteomicslfq.py @@ -0,0 +1,95 @@ +from pathlib import Path +import streamlit as st +import pandas as pd +import numpy as np +import plotly.express as px + +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data + +# ================================ +# Page setup +# ================================ +params = page_setup() +st.title("ProteomicsLFQ Results") + +# ================================ +# Workspace check +# ================================ +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +# ================================ +# Load abundance data +# ================================ +res = get_abundance_data(st.session_state["workspace"]) +if res is None: + st.info( + "Abundance data not available or incomplete. " + "Please run the workflow and configure sample groups first." + ) + st.stop() + +pivot_df, expr_df, group_map = res + +# ================================ +# Tabs +# ================================ +protein_tab, = st.tabs(["🧬 Protein Table"]) + +# ================================ +# Protein-level results +# ================================ +with protein_tab: + st.markdown("### 🧬 Protein-Level Abundance Table") + st.info( + "This protein-level table is generated by grouping all PSMs that map to the " + "same protein and aggregating their intensities across samples.\n\n" + "Additionally, log2 fold change and p-values are calculated between sample groups." + ) + + if pivot_df.empty: + st.info("No protein-level data available.") + else: + st.session_state["pivot_df"] = pivot_df + st.dataframe(pivot_df.sort_values("p-value"), use_container_width=True) + +# ====================================================== +# GO Enrichment Results (from session_state) +# ====================================================== +st.markdown("---") +st.subheader("🧬 GO Enrichment Analysis") + +# GO analysis must be executed in the execution step +if not st.session_state.get("go_ready", False): + st.info("GO Enrichment results are not available yet. 
Please run the analysis first.")
else:
    go_results = st.session_state.get("go_results", {})

    if not go_results:
        st.warning("GO Enrichment results are empty.")
    else:
        bp_tab, cc_tab, mf_tab = st.tabs([
            "🧬 Biological Process",
            "🏠 Cellular Component",
            "βš™οΈ Molecular Function",
        ])

        # Section with corrected parentheses and indentation
        for tab, go_type in zip([bp_tab, cc_tab, mf_tab], ["BP", "CC", "MF"]):
            with tab:
                if go_type not in go_results:
                    st.warning(f"No enriched {go_type} terms found.")
                    continue

                fig = go_results[go_type].get("fig")
                df_go = go_results[go_type].get("df")

                if fig is None or df_go is None or df_go.empty:
                    st.warning(f"No enriched {go_type} terms found.")
                else:
                    # Render the chart (using plotly_chart)
                    st.plotly_chart(fig, use_container_width=True)
                    # Render the dataframe
                    st.dataframe(df_go, use_container_width=True)
\ No newline at end of file
diff --git a/src/WorkflowTest.py b/src/WorkflowTest.py
index d94a37d..683a243 100644
--- a/src/WorkflowTest.py
+++ b/src/WorkflowTest.py
@@ -6,8 +6,12 @@
 from pyopenms import IdXMLFile
 from scipy.stats import ttest_ind
 import numpy as np
+import mygene
+import plotly.express as px
+from collections import defaultdict
+from scipy.stats import fisher_exact
 from src.workflow.WorkflowManager import WorkflowManager
+from src.common.results_helpers import get_abundance_data
 from src.common.results_helpers import parse_idxml, build_spectra_cache
 from openms_insight import Table, Heatmap, LinePlot, SequenceView
@@ -813,9 +817,139 @@ def execution(self) -> bool:
             return False
         self.logger.log("βœ… Quantification complete")
-        # if not Path(quant_mztab).exists():
-        #     st.error("ProteomicsLFQ failed: mzTab not created")
-        #     st.stop()
+        # ======================================================
+        # ⚠️ 5️⃣ GO Enrichment Analysis (INLINE IN EXECUTION)
+        # ======================================================
+        st.markdown("---")
+        st.subheader("🧬 GO Enrichment Analysis")
+
+        res = get_abundance_data(st.session_state["workspace"])
+        if res is None:
+            st.warning("GO enrichment skipped: abundance data not available.")
+        else:
+            pivot_df, expr_df, group_map = res
+
+            p_cutoff = 0.05
+            fc_cutoff = 1.0
+
+            analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy()
+
+            if analysis_df.empty:
+                st.error("No valid statistical data found for GO enrichment.")
+            else:
+                with st.spinner("Fetching GO terms from MyGene.info API..."):
+                    mg = mygene.MyGeneInfo()
+
+                    def get_clean_uniprot(name):
+                        parts = str(name).split("|")
+                        return parts[1] if len(parts) >= 2 else parts[0]
+
+                    analysis_df["UniProt"] = analysis_df["ProteinName"].apply(get_clean_uniprot)
+
+                    bg_ids = analysis_df["UniProt"].dropna().astype(str).unique().tolist()
+                    fg_ids = analysis_df[
+                        (analysis_df["p-value"] < p_cutoff) &
+                        (analysis_df["log2FC"].abs() >= fc_cutoff)
+                    ]["UniProt"].dropna().astype(str).unique().tolist()
+
+                    if len(fg_ids) < 3:
+                        st.warning(
+                            f"Not enough significant proteins "
+                            f"(p < {p_cutoff}, |log2FC| β‰₯ {fc_cutoff}). "
+                            f"Found: {len(fg_ids)}"
+                        )
+                    else:
+                        res_list = mg.querymany(
+                            bg_ids, scopes="uniprot", fields="go", as_dataframe=False
+                        )
+                        res_go = pd.DataFrame(res_list)
+                        if "notfound" in res_go.columns:
+                            res_go = res_go[res_go["notfound"] != True]
+
+                        def extract_go_terms(go_data, go_type):
+                            if not isinstance(go_data, dict) or go_type not in go_data:
+                                return []
+                            terms = go_data[go_type]
+                            if isinstance(terms, dict):
+                                terms = [terms]
+                            return list({t.get("term") for t in terms if "term" in t})
+
+                        for go_type in ["BP", "CC", "MF"]:
+                            res_go[f"{go_type}_terms"] = res_go["go"].apply(
+                                lambda x: extract_go_terms(x, go_type)
+                            )
+
+                        annotated_ids = set(res_go["query"].astype(str))
+                        fg_set = annotated_ids.intersection(fg_ids)
+                        bg_set = annotated_ids
+
+                        def run_go(go_type):
+                            go2fg = defaultdict(set)
+                            go2bg = defaultdict(set)
+
+                            for _, row in res_go.iterrows():
+                                uid = str(row["query"])
+                                for term in row[f"{go_type}_terms"]:
+                                    go2bg[term].add(uid)
+                                    if uid in fg_set:
+                                        go2fg[term].add(uid)
+
+                            records = []
+                            N_fg = len(fg_set)
+                            N_bg = len(bg_set)
+
+                            for term, fg_genes in go2fg.items():
+                                a = len(fg_genes)
+                                if a == 0:
+                                    continue
+                                b = N_fg - a
+                                c = len(go2bg[term]) - a
+                                d = N_bg - (a + b + c)
+
+                                _, p = fisher_exact([[a, b], [c, d]], alternative="greater")
+                                records.append({
+                                    "GO_Term": term,
+                                    "Count": a,
+                                    "GeneRatio": f"{a}/{N_fg}",
+                                    "p_value": p,
+                                })
+
+                            df = pd.DataFrame(records)
+                            if df.empty:
+                                return None, None
+
+                            df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10))
+                            df = df.sort_values("p_value").head(20)
+
+                            # βœ… Create the Plotly figure
+                            fig = px.bar(
+                                df,
+                                x="-log10(p)",
+                                y="GO_Term",
+                                orientation="h",
+                                title=f"GO Enrichment ({go_type})",
+                            )
+
+                            fig.update_layout(
+                                yaxis=dict(autorange="reversed"),
+                                height=500,
+                                margin=dict(l=10, r=10, t=40, b=10),
+                            )
+
+                            return fig, df
+
+                        go_results = {}
+
+                        for go_type in ["BP", "CC", "MF"]:
+                            fig, df_go = run_go(go_type)
+                            if fig is not None:
+                                go_results[go_type] = {
+                                    "fig": fig,
+                                    "df": df_go
+                                }
+
+                        st.session_state["go_results"] = go_results
+                        st.session_state["go_ready"] = True
 
 
 # ================================
diff --git a/src/workflow/WorkflowManager.py b/src/workflow/WorkflowManager.py
index a87fb33..c3343dc 100644
--- a/src/workflow/WorkflowManager.py
+++ b/src/workflow/WorkflowManager.py
@@ -191,9 +191,10 @@ def stop_workflow(self) -> bool:
         return self._stop_local_workflow()
 
     def _stop_local_workflow(self) -> bool:
-        """Stop locally running workflow process"""
+        """Stop locally running workflow process - Windows compatible"""
         import os
         import signal
+        import platform
 
         pid_dir = self.executor.pid_dir
         if not pid_dir.exists():
@@ -203,11 +204,18 @@ def _stop_local_workflow(self) -> bool:
         for pid_file in pid_dir.iterdir():
             try:
                 pid = int(pid_file.name)
-                os.kill(pid, signal.SIGTERM)
+                # Windows: taskkill /T terminates the whole process tree
+                if platform.system() == "Windows":
+                    os.system(f"taskkill /F /T /PID {pid}")
+                else:
+                    # Linux/macOS
+                    os.kill(pid, signal.SIGTERM)
+
+                pid_file.unlink()
                 stopped = True
-            except (ValueError, ProcessLookupError, PermissionError):
-                pid_file.unlink()  # Clean up stale PID file
+            except (ValueError, ProcessLookupError, PermissionError, OSError):
+                if pid_file.exists():
+                    pid_file.unlink()
 
         # Clean up the pid directory
         shutil.rmtree(pid_dir, ignore_errors=True)
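The page and workflow changes above hinge on one contract: execution() leaves a go_results payload in st.session_state, keyed by GO namespace ("BP"/"CC"/"MF"), with each entry holding a Plotly figure plus its underlying DataFrame, and the p-value behind each row comes from a one-sided Fisher's exact test as in run_go(). Below is a minimal, self-contained sketch of that contract; the counts and the term name are invented for illustration.

import pandas as pd
import plotly.express as px
from scipy.stats import fisher_exact

# Toy 2x2 contingency table, laid out as in run_go():
# rows = (significant proteins, rest of background); columns = (in term, not in term)
a, b = 8, 42    # significant proteins annotated / not annotated with the term
c, d = 20, 430  # remaining background proteins annotated / not annotated
_, p = fisher_exact([[a, b], [c, d]], alternative="greater")  # "greater" tests enrichment only

df = pd.DataFrame([{"GO_Term": "example term", "Count": a,
                    "GeneRatio": f"{a}/{a + b}", "p_value": p}])
fig = px.bar(df, x="p_value", y="GO_Term", orientation="h")

go_results = {"BP": {"fig": fig, "df": df}}  # the shape the results tabs consume

On the WorkflowManager side, the switch to taskkill /F /T on Windows is deliberate: /T takes down the whole child-process tree, which a single SIGTERM to the parent process does not guarantee.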
" + f"Found: {len(fg_ids)}" + ) + else: + res_list = mg.querymany( + bg_ids, scopes="uniprot", fields="go", as_dataframe=False + ) + res_go = pd.DataFrame(res_list) + if "notfound" in res_go.columns: + res_go = res_go[res_go["notfound"] != True] + + def extract_go_terms(go_data, go_type): + if not isinstance(go_data, dict) or go_type not in go_data: + return [] + terms = go_data[go_type] + if isinstance(terms, dict): + terms = [terms] + return list({t.get("term") for t in terms if "term" in t}) + + for go_type in ["BP", "CC", "MF"]: + res_go[f"{go_type}_terms"] = res_go["go"].apply( + lambda x: extract_go_terms(x, go_type) + ) + + annotated_ids = set(res_go["query"].astype(str)) + fg_set = annotated_ids.intersection(fg_ids) + bg_set = annotated_ids + + def run_go(go_type): + go2fg = defaultdict(set) + go2bg = defaultdict(set) + + for _, row in res_go.iterrows(): + uid = str(row["query"]) + for term in row[f"{go_type}_terms"]: + go2bg[term].add(uid) + if uid in fg_set: + go2fg[term].add(uid) + + records = [] + N_fg = len(fg_set) + N_bg = len(bg_set) + + for term, fg_genes in go2fg.items(): + a = len(fg_genes) + if a == 0: + continue + b = N_fg - a + c = len(go2bg[term]) - a + d = N_bg - (a + b + c) + + _, p = fisher_exact([[a, b], [c, d]], alternative="greater") + records.append({ + "GO_Term": term, + "Count": a, + "GeneRatio": f"{a}/{N_fg}", + "p_value": p, + }) + + df = pd.DataFrame(records) + if df.empty: + return None, None + + df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10)) + df = df.sort_values("p_value").head(20) + + # βœ… Plotly Figure 생성 + fig = px.bar( + df, + x="-log10(p)", + y="GO_Term", + orientation="h", + title=f"GO Enrichment ({go_type})", + ) + + fig.update_layout( + yaxis=dict(autorange="reversed"), + height=500, + margin=dict(l=10, r=10, t=40, b=10), + ) + + return fig, df + + go_results = {} + + for go_type in ["BP", "CC", "MF"]: + fig, df_go = run_go(go_type) + if fig is not None: + go_results[go_type] = { + "fig": fig, + "df": df_go + } + + st.session_state["go_results"] = go_results + st.session_state["go_ready"] = True # ================================ diff --git a/src/workflow/WorkflowManager.py b/src/workflow/WorkflowManager.py index a87fb33..c3343dc 100644 --- a/src/workflow/WorkflowManager.py +++ b/src/workflow/WorkflowManager.py @@ -191,9 +191,10 @@ def stop_workflow(self) -> bool: return self._stop_local_workflow() def _stop_local_workflow(self) -> bool: - """Stop locally running workflow process""" + """Stop locally running workflow process - Windows Compatible""" import os import signal + import platform pid_dir = self.executor.pid_dir if not pid_dir.exists(): @@ -203,11 +204,18 @@ def _stop_local_workflow(self) -> bool: for pid_file in pid_dir.iterdir(): try: pid = int(pid_file.name) - os.kill(pid, signal.SIGTERM) + # Windows + if platform.system() == "Windows": + os.system(f"taskkill /F /T /PID {pid}") + else: + # Linux/macOS + os.kill(pid, signal.SIGTERM) + pid_file.unlink() stopped = True - except (ValueError, ProcessLookupError, PermissionError): - pid_file.unlink() # Clean up stale PID file + except (ValueError, ProcessLookupError, PermissionError, OSError): + if pid_file.exists(): + pid_file.unlink() # Clean up the pid directory shutil.rmtree(pid_dir, ignore_errors=True) From 9bf39df34303a3fc80a413351c8b92466b3b2af5 Mon Sep 17 00:00:00 2001 From: hojun Date: Mon, 9 Feb 2026 17:31:02 +0900 Subject: [PATCH 2/6] feat: temporal commit --- content/workflow_run.py | 3 +- requirements.txt | 2 +- src/WorkflowTest.py | 801 
++++++++++++++++---------------- src/workflow/WorkflowManager.py | 6 +- 4 files changed, 409 insertions(+), 403 deletions(-) diff --git a/content/workflow_run.py b/content/workflow_run.py index eadb5a5..65eb293 100644 --- a/content/workflow_run.py +++ b/content/workflow_run.py @@ -5,5 +5,6 @@ params = page_setup() wf = WorkflowTest() - +st.write(st.session_state["workspace"]) +st.write(type(st.session_state["workspace"])) wf.show_execution_section() diff --git a/requirements.txt b/requirements.txt index 510c539..2f5fd30 100644 --- a/requirements.txt +++ b/requirements.txt @@ -145,7 +145,7 @@ polars>=1.0.0 cython easypqp>=0.1.34 pyprophet>=2.2.0 - +mygene # Redis Queue dependencies (for online mode) redis>=5.0.0 rq>=1.16.0 diff --git a/src/WorkflowTest.py b/src/WorkflowTest.py index 683a243..9f74e41 100644 --- a/src/WorkflowTest.py +++ b/src/WorkflowTest.py @@ -11,11 +11,12 @@ from collections import defaultdict from scipy.stats import fisher_exact from src.workflow.WorkflowManager import WorkflowManager +from src.common.common import page_setup from src.common.results_helpers import get_abundance_data from src.common.results_helpers import parse_idxml, build_spectra_cache from openms_insight import Table, Heatmap, LinePlot, SequenceView - +# params = page_setup() class WorkflowTest(WorkflowManager): def __init__(self) -> None: @@ -360,14 +361,14 @@ def execution(self) -> bool: results_dir = Path(self.workflow_dir, "input-files") - for d in [comet_dir, perc_dir, filter_dir, quant_dir]: - d.mkdir(parents=True, exist_ok=True) + # for d in [comet_dir, perc_dir, filter_dir, quant_dir]: + # d.mkdir(parents=True, exist_ok=True) - self.logger.log("πŸ“ Output directories created") + # self.logger.log("πŸ“ Output directories created") - # ================================ - # 2️⃣ File path definitions (per sample) - # ================================ + # # ================================ + # # 2️⃣ File path definitions (per sample) + # # ================================ comet_results = [] percolator_results = [] filter_results = [] @@ -378,395 +379,395 @@ def execution(self) -> bool: percolator_results.append(str(perc_dir / f"{stem}_per.idXML")) filter_results.append(str(filter_dir / f"{stem}_filter.idXML")) - # ================================ - # 3️⃣ Per-file processing - # ================================ - for i, mz in enumerate(in_mzML): - stem = Path(mz).stem - st.info(f"Processing sample: {stem}") - - self.logger.log("πŸ”¬ Starting per-sample processing...") - - # --- CometAdapter --- - self.logger.log("πŸ”Ž Running peptide search...") - with st.spinner(f"CometAdapter ({stem})"): - comet_extra_params = {"database": str(database_fasta)} - if self.params.get("generate-decoys", True): - # Propagate decoy_string from DecoyDatabase - comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string - - if not self.executor.run_topp( - "CometAdapter", - { - "in": in_mzML, - "out": comet_results, - }, - comet_extra_params, - ): - self.logger.log("Workflow stopped due to error") - return False - - # Get fragment tolerance from CometAdapter parameters for visualization - comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") - frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) - frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" - - # Build visualization cache for Comet results - results_dir_path = Path(self.workflow_dir, "results") - cache_dir = results_dir_path / "insight_cache" - cache_dir.mkdir(parents=True, exist_ok=True) - - # Get mzML 
directory - mzml_dir = Path(in_mzML[0]).parent - - # Build spectra cache (once, shared by all stages) - spectra_df = None - filename_to_index = {} - - for idxml_file in comet_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Build spectra cache (only once) - if spectra_df is None: - filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} - spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", "sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) - - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) - - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, - interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) - - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) - - self.logger.log("βœ… Peptide search complete") - - # if not Path(comet_results).exists(): - # st.error(f"CometAdapter failed for {stem}") - # st.stop() - - # --- PercolatorAdapter --- - self.logger.log("πŸ“Š Running rescoring...") - with st.spinner(f"PercolatorAdapter ({stem})"): - if not self.executor.run_topp( - "PercolatorAdapter", - { - "in": comet_results, - "out": percolator_results, - }, - {"decoy_pattern": decoy_string}, # Always propagated from upstream - ): - self.logger.log("Workflow stopped due to error") - return False - - # Build visualization cache for Percolator results - for idxml_file in percolator_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", 
"sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) - - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) - - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, - interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) - - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) - - self.logger.log("βœ… Rescoring complete") - - # if not Path(percolator_results[i]).exists(): - # st.error(f"PercolatorAdapter failed for {stem}") - # st.stop() - - # --- IDFilter --- - self.logger.log("πŸ”§ Filtering identifications...") - with st.spinner(f"IDFilter ({stem})"): - if not self.executor.run_topp( - "IDFilter", - { - "in": percolator_results, - "out": filter_results, - }, - ): - self.logger.log("Workflow stopped due to error") - return False - - # Build visualization cache for Filter results - for idxml_file in filter_results: - idxml_path = Path(idxml_file) - cache_id_prefix = idxml_path.stem - - # Parse idXML to DataFrame - id_df, spectra_data = parse_idxml(idxml_path) - - # Initialize Table component (caches itself) - Table( - cache_id=f"table_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - column_definitions=[ - {"field": "sequence", "title": "Sequence"}, - {"field": "charge", "title": "Z", "sorter": "number"}, - {"field": "mz", "title": "m/z", "sorter": "number"}, - {"field": "rt", "title": "RT", "sorter": "number"}, - {"field": "score", "title": "Score", "sorter": "number"}, - {"field": "protein_accession", "title": "Proteins"}, - ], - initial_sort=[{"column": "score", "dir": "asc"}], - index_field="id_idx", - ) - - # Initialize Heatmap component - Heatmap( - cache_id=f"heatmap_{cache_id_prefix}", - data=id_df.lazy(), - cache_path=str(cache_dir), - x_column="rt", - y_column="mz", - intensity_column="score", - interactivity={"identification": "id_idx"}, - ) - - # Initialize SequenceView component - seq_view = SequenceView( - cache_id=f"seqview_{cache_id_prefix}", - sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - "id_idx": "sequence_id", - "charge": "precursor_charge", - }), - peaks_data=spectra_df.lazy(), - filters={ - "identification": "sequence_id", - "file": "file_index", - "spectrum": "scan_id", - }, 
- interactivity={"peak": "peak_id"}, - cache_path=str(cache_dir), - deconvolved=False, - annotation_config={ - "ion_types": ["b", "y"], - "neutral_losses": True, - "tolerance": frag_tol, - "tolerance_ppm": frag_tol_is_ppm, - }, - ) - - # Initialize LinePlot from SequenceView - LinePlot.from_sequence_view( - seq_view, - cache_id=f"lineplot_{cache_id_prefix}", - cache_path=str(cache_dir), - title="Annotated Spectrum", - styling={ - "unhighlightedColor": "#CCCCCC", - "highlightColor": "#E74C3C", - "selectedColor": "#F3A712", - }, - ) - - self.logger.log("βœ… Filtering complete") - - # if not Path(filter_results[i]).exists(): - # st.error(f"IDFilter failed for {stem}") - # st.stop() - - # ================================ - # EasyPQP Spectral Library Generation (optional) - # ================================ - if self.params.get("generate-library", False): - self.logger.log("πŸ“š Building spectral library with EasyPQP...") - st.info("Building spectral library with EasyPQP...") - library_dir = Path(self.workflow_dir, "results", "library") - library_dir.mkdir(parents=True, exist_ok=True) - - psms_files, peaks_files = [], [] - - for filter_idxml in filter_results: - original_stem = Path(filter_idxml).stem.replace("_filter", "") - matching_mzml = next((m for m in in_mzML if Path(m).stem == original_stem), None) - if not matching_mzml: - self.logger.log(f"Warning: No matching mzML found for {filter_idxml}") - continue - - # easypqp library requires specific extensions for file recognition: - # - PSM files must contain 'psmpkl' β†’ use .psmpkl extension - # - Peak files must contain 'peakpkl' β†’ use .peakpkl extension - # After splitext(), stem will be just "{mzML_stem}" matching PSM base_name - psms_out = str(library_dir / f"{original_stem}.psmpkl") - peaks_out = str(library_dir / f"{original_stem}.peakpkl") - - convert_cmd = [ - "easypqp", "convert", - "--pepxml", filter_idxml, - "--spectra", matching_mzml, - "--psms", psms_out, - "--peaks", peaks_out - ] - if self.executor.run_command(convert_cmd): - psms_files.append(psms_out) - peaks_files.append(peaks_out) - - if psms_files: - # easypqp library outputs TSV format (despite common .pqp extension) - library_tsv = str(library_dir / "spectral_library.tsv") - library_cmd = ["easypqp", "library", "--out", library_tsv] - - if not self.params.get("library-use-fdr", False): - # --nofdr only skips FDR recalculation, NOT threshold filtering - # Set all thresholds to 1.0 to bypass filtering for pre-filtered input - library_cmd.extend([ - "--nofdr", - "--psm_fdr_threshold", "1.0", - "--peptide_fdr_threshold", "1.0", - "--protein_fdr_threshold", "1.0" - ]) - else: - # Apply user-specified FDR filtering - library_cmd.extend([ - "--psm_fdr_threshold", - str(self.params.get("library-psm-fdr", 0.01)), - "--peptide_fdr_threshold", - str(self.params.get("library-peptide-fdr", 0.01)), - "--protein_fdr_threshold", - str(self.params.get("library-protein-fdr", 0.01)) - ]) - - for psms, peaks in zip(psms_files, peaks_files): - library_cmd.extend([psms, peaks]) - - if self.executor.run_command(library_cmd): - self.logger.log("βœ… Spectral library created") - st.success("Spectral library created") - else: - self.logger.log("Warning: Failed to build spectral library") - else: - self.logger.log("Warning: No PSMs converted for library generation") + # # ================================ + # # 3️⃣ Per-file processing + # # ================================ + # for i, mz in enumerate(in_mzML): + # stem = Path(mz).stem + # st.info(f"Processing sample: {stem}") + + # 
self.logger.log("πŸ”¬ Starting per-sample processing...") + + # # --- CometAdapter --- + # self.logger.log("πŸ”Ž Running peptide search...") + # with st.spinner(f"CometAdapter ({stem})"): + # comet_extra_params = {"database": str(database_fasta)} + # if self.params.get("generate-decoys", True): + # # Propagate decoy_string from DecoyDatabase + # comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string + + # if not self.executor.run_topp( + # "CometAdapter", + # { + # "in": in_mzML, + # "out": comet_results, + # }, + # comet_extra_params, + # ): + # self.logger.log("Workflow stopped due to error") + # return False + + # # Get fragment tolerance from CometAdapter parameters for visualization + # comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") + # frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) + # frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" + + # # Build visualization cache for Comet results + # results_dir_path = Path(self.workflow_dir, "results") + # cache_dir = results_dir_path / "insight_cache" + # cache_dir.mkdir(parents=True, exist_ok=True) + + # # Get mzML directory + # mzml_dir = Path(in_mzML[0]).parent + + # # Build spectra cache (once, shared by all stages) + # spectra_df = None + # filename_to_index = {} + + # for idxml_file in comet_results: + # idxml_path = Path(idxml_file) + # cache_id_prefix = idxml_path.stem + + # # Parse idXML to DataFrame + # id_df, spectra_data = parse_idxml(idxml_path) + + # # Build spectra cache (only once) + # if spectra_df is None: + # filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} + # spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) + + # # Initialize Table component (caches itself) + # Table( + # cache_id=f"table_{cache_id_prefix}", + # data=id_df.lazy(), + # cache_path=str(cache_dir), + # interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + # column_definitions=[ + # {"field": "sequence", "title": "Sequence"}, + # {"field": "charge", "title": "Z", "sorter": "number"}, + # {"field": "mz", "title": "m/z", "sorter": "number"}, + # {"field": "rt", "title": "RT", "sorter": "number"}, + # {"field": "score", "title": "Score", "sorter": "number"}, + # {"field": "protein_accession", "title": "Proteins"}, + # ], + # initial_sort=[{"column": "score", "dir": "asc"}], + # index_field="id_idx", + # ) + + # # Initialize Heatmap component + # Heatmap( + # cache_id=f"heatmap_{cache_id_prefix}", + # data=id_df.lazy(), + # cache_path=str(cache_dir), + # x_column="rt", + # y_column="mz", + # intensity_column="score", + # interactivity={"identification": "id_idx"}, + # ) + + # # Initialize SequenceView component + # seq_view = SequenceView( + # cache_id=f"seqview_{cache_id_prefix}", + # sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + # "id_idx": "sequence_id", + # "charge": "precursor_charge", + # }), + # peaks_data=spectra_df.lazy(), + # filters={ + # "identification": "sequence_id", + # "file": "file_index", + # "spectrum": "scan_id", + # }, + # interactivity={"peak": "peak_id"}, + # cache_path=str(cache_dir), + # deconvolved=False, + # annotation_config={ + # "ion_types": ["b", "y"], + # "neutral_losses": True, + # "tolerance": frag_tol, + # "tolerance_ppm": frag_tol_is_ppm, + # }, + # ) + + # # Initialize LinePlot from SequenceView + # LinePlot.from_sequence_view( + # seq_view, + # cache_id=f"lineplot_{cache_id_prefix}", + # 
cache_path=str(cache_dir), + # title="Annotated Spectrum", + # styling={ + # "unhighlightedColor": "#CCCCCC", + # "highlightColor": "#E74C3C", + # "selectedColor": "#F3A712", + # }, + # ) + + # self.logger.log("βœ… Peptide search complete") + + # # if not Path(comet_results).exists(): + # # st.error(f"CometAdapter failed for {stem}") + # # st.stop() + + # # --- PercolatorAdapter --- + # self.logger.log("πŸ“Š Running rescoring...") + # with st.spinner(f"PercolatorAdapter ({stem})"): + # if not self.executor.run_topp( + # "PercolatorAdapter", + # { + # "in": comet_results, + # "out": percolator_results, + # }, + # {"decoy_pattern": decoy_string}, # Always propagated from upstream + # ): + # self.logger.log("Workflow stopped due to error") + # return False + + # # Build visualization cache for Percolator results + # for idxml_file in percolator_results: + # idxml_path = Path(idxml_file) + # cache_id_prefix = idxml_path.stem + + # # Parse idXML to DataFrame + # id_df, spectra_data = parse_idxml(idxml_path) + + # # Initialize Table component (caches itself) + # Table( + # cache_id=f"table_{cache_id_prefix}", + # data=id_df.lazy(), + # cache_path=str(cache_dir), + # interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + # column_definitions=[ + # {"field": "sequence", "title": "Sequence"}, + # {"field": "charge", "title": "Z", "sorter": "number"}, + # {"field": "mz", "title": "m/z", "sorter": "number"}, + # {"field": "rt", "title": "RT", "sorter": "number"}, + # {"field": "score", "title": "Score", "sorter": "number"}, + # {"field": "protein_accession", "title": "Proteins"}, + # ], + # initial_sort=[{"column": "score", "dir": "asc"}], + # index_field="id_idx", + # ) + + # # Initialize Heatmap component + # Heatmap( + # cache_id=f"heatmap_{cache_id_prefix}", + # data=id_df.lazy(), + # cache_path=str(cache_dir), + # x_column="rt", + # y_column="mz", + # intensity_column="score", + # interactivity={"identification": "id_idx"}, + # ) + + # # Initialize SequenceView component + # seq_view = SequenceView( + # cache_id=f"seqview_{cache_id_prefix}", + # sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + # "id_idx": "sequence_id", + # "charge": "precursor_charge", + # }), + # peaks_data=spectra_df.lazy(), + # filters={ + # "identification": "sequence_id", + # "file": "file_index", + # "spectrum": "scan_id", + # }, + # interactivity={"peak": "peak_id"}, + # cache_path=str(cache_dir), + # deconvolved=False, + # annotation_config={ + # "ion_types": ["b", "y"], + # "neutral_losses": True, + # "tolerance": frag_tol, + # "tolerance_ppm": frag_tol_is_ppm, + # }, + # ) + + # # Initialize LinePlot from SequenceView + # LinePlot.from_sequence_view( + # seq_view, + # cache_id=f"lineplot_{cache_id_prefix}", + # cache_path=str(cache_dir), + # title="Annotated Spectrum", + # styling={ + # "unhighlightedColor": "#CCCCCC", + # "highlightColor": "#E74C3C", + # "selectedColor": "#F3A712", + # }, + # ) + + # self.logger.log("βœ… Rescoring complete") + + # # if not Path(percolator_results[i]).exists(): + # # st.error(f"PercolatorAdapter failed for {stem}") + # # st.stop() + + # # --- IDFilter --- + # self.logger.log("πŸ”§ Filtering identifications...") + # with st.spinner(f"IDFilter ({stem})"): + # if not self.executor.run_topp( + # "IDFilter", + # { + # "in": percolator_results, + # "out": filter_results, + # }, + # ): + # self.logger.log("Workflow stopped due to error") + # return False + + # # Build visualization cache for Filter 
results + # for idxml_file in filter_results: + # idxml_path = Path(idxml_file) + # cache_id_prefix = idxml_path.stem + + # # Parse idXML to DataFrame + # id_df, spectra_data = parse_idxml(idxml_path) + + # # Initialize Table component (caches itself) + # Table( + # cache_id=f"table_{cache_id_prefix}", + # data=id_df.lazy(), + # cache_path=str(cache_dir), + # interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + # column_definitions=[ + # {"field": "sequence", "title": "Sequence"}, + # {"field": "charge", "title": "Z", "sorter": "number"}, + # {"field": "mz", "title": "m/z", "sorter": "number"}, + # {"field": "rt", "title": "RT", "sorter": "number"}, + # {"field": "score", "title": "Score", "sorter": "number"}, + # {"field": "protein_accession", "title": "Proteins"}, + # ], + # initial_sort=[{"column": "score", "dir": "asc"}], + # index_field="id_idx", + # ) + + # # Initialize Heatmap component + # Heatmap( + # cache_id=f"heatmap_{cache_id_prefix}", + # data=id_df.lazy(), + # cache_path=str(cache_dir), + # x_column="rt", + # y_column="mz", + # intensity_column="score", + # interactivity={"identification": "id_idx"}, + # ) + + # # Initialize SequenceView component + # seq_view = SequenceView( + # cache_id=f"seqview_{cache_id_prefix}", + # sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + # "id_idx": "sequence_id", + # "charge": "precursor_charge", + # }), + # peaks_data=spectra_df.lazy(), + # filters={ + # "identification": "sequence_id", + # "file": "file_index", + # "spectrum": "scan_id", + # }, + # interactivity={"peak": "peak_id"}, + # cache_path=str(cache_dir), + # deconvolved=False, + # annotation_config={ + # "ion_types": ["b", "y"], + # "neutral_losses": True, + # "tolerance": frag_tol, + # "tolerance_ppm": frag_tol_is_ppm, + # }, + # ) + + # # Initialize LinePlot from SequenceView + # LinePlot.from_sequence_view( + # seq_view, + # cache_id=f"lineplot_{cache_id_prefix}", + # cache_path=str(cache_dir), + # title="Annotated Spectrum", + # styling={ + # "unhighlightedColor": "#CCCCCC", + # "highlightColor": "#E74C3C", + # "selectedColor": "#F3A712", + # }, + # ) + + # self.logger.log("βœ… Filtering complete") + + # # if not Path(filter_results[i]).exists(): + # # st.error(f"IDFilter failed for {stem}") + # # st.stop() - st.success(f"βœ“ {stem} identification completed") + # # ================================ + # # EasyPQP Spectral Library Generation (optional) + # # ================================ + # if self.params.get("generate-library", False): + # self.logger.log("πŸ“š Building spectral library with EasyPQP...") + # st.info("Building spectral library with EasyPQP...") + # library_dir = Path(self.workflow_dir, "results", "library") + # library_dir.mkdir(parents=True, exist_ok=True) + + # psms_files, peaks_files = [], [] + + # for filter_idxml in filter_results: + # original_stem = Path(filter_idxml).stem.replace("_filter", "") + # matching_mzml = next((m for m in in_mzML if Path(m).stem == original_stem), None) + # if not matching_mzml: + # self.logger.log(f"Warning: No matching mzML found for {filter_idxml}") + # continue + + # # easypqp library requires specific extensions for file recognition: + # # - PSM files must contain 'psmpkl' β†’ use .psmpkl extension + # # - Peak files must contain 'peakpkl' β†’ use .peakpkl extension + # # After splitext(), stem will be just "{mzML_stem}" matching PSM base_name + # psms_out = str(library_dir / f"{original_stem}.psmpkl") + # peaks_out = 
str(library_dir / f"{original_stem}.peakpkl") + + # convert_cmd = [ + # "easypqp", "convert", + # "--pepxml", filter_idxml, + # "--spectra", matching_mzml, + # "--psms", psms_out, + # "--peaks", peaks_out + # ] + # if self.executor.run_command(convert_cmd): + # psms_files.append(psms_out) + # peaks_files.append(peaks_out) + + # if psms_files: + # # easypqp library outputs TSV format (despite common .pqp extension) + # library_tsv = str(library_dir / "spectral_library.tsv") + # library_cmd = ["easypqp", "library", "--out", library_tsv] + + # if not self.params.get("library-use-fdr", False): + # # --nofdr only skips FDR recalculation, NOT threshold filtering + # # Set all thresholds to 1.0 to bypass filtering for pre-filtered input + # library_cmd.extend([ + # "--nofdr", + # "--psm_fdr_threshold", "1.0", + # "--peptide_fdr_threshold", "1.0", + # "--protein_fdr_threshold", "1.0" + # ]) + # else: + # # Apply user-specified FDR filtering + # library_cmd.extend([ + # "--psm_fdr_threshold", + # str(self.params.get("library-psm-fdr", 0.01)), + # "--peptide_fdr_threshold", + # str(self.params.get("library-peptide-fdr", 0.01)), + # "--protein_fdr_threshold", + # str(self.params.get("library-protein-fdr", 0.01)) + # ]) + + # for psms, peaks in zip(psms_files, peaks_files): + # library_cmd.extend([psms, peaks]) + + # if self.executor.run_command(library_cmd): + # self.logger.log("βœ… Spectral library created") + # st.success("Spectral library created") + # else: + # self.logger.log("Warning: Failed to build spectral library") + # else: + # self.logger.log("Warning: No PSMs converted for library generation") + + # st.success(f"βœ“ {stem} identification completed") # # ================================ # # 4️⃣ ProteomicsLFQ (cross-sample) @@ -820,14 +821,14 @@ def execution(self) -> bool: # ====================================================== # ⚠️ 5️⃣ GO Enrichment Analysis (INLINE IN EXECUTION) # ====================================================== - st.markdown("---") - st.subheader("🧬 GO Enrichment Analysis") - + st.session_state["workspace"] = Path(self.workflow_dir).parent res = get_abundance_data(st.session_state["workspace"]) if res is None: st.warning("GO enrichment skipped: abundance data not available.") else: pivot_df, expr_df, group_map = res + self.logger.log("βœ… pivot_df loaded") + self.logger.log(f"pivot_df columns: {pivot_df.columns.tolist()}") p_cutoff = 0.05 fc_cutoff = 1.0 @@ -836,6 +837,7 @@ def execution(self) -> bool: if analysis_df.empty: st.error("No valid statistical data found for GO enrichment.") + self.logger.log("❗ analysis_df is empty") else: with st.spinner("Fetching GO terms from MyGene.info API..."): mg = mygene.MyGeneInfo() @@ -851,6 +853,7 @@ def get_clean_uniprot(name): (analysis_df["p-value"] < p_cutoff) & (analysis_df["log2FC"].abs() >= fc_cutoff) ]["UniProt"].dropna().astype(str).unique().tolist() + self.logger.log("βœ… get_clean_uniprot applied") if len(fg_ids) < 3: st.warning( @@ -858,6 +861,7 @@ def get_clean_uniprot(name): f"(p < {p_cutoff}, |log2FC| β‰₯ {fc_cutoff}). 
" f"Found: {len(fg_ids)}" ) + self.logger.log("❗ Not enough significant proteins") else: res_list = mg.querymany( bg_ids, scopes="uniprot", fields="go", as_dataframe=False @@ -865,6 +869,7 @@ def get_clean_uniprot(name): res_go = pd.DataFrame(res_list) if "notfound" in res_go.columns: res_go = res_go[res_go["notfound"] != True] + self.logger.log("❗ res_go filtered for notfound entries") def extract_go_terms(go_data, go_type): if not isinstance(go_data, dict) or go_type not in go_data: diff --git a/src/workflow/WorkflowManager.py b/src/workflow/WorkflowManager.py index c3343dc..f15d21d 100644 --- a/src/workflow/WorkflowManager.py +++ b/src/workflow/WorkflowManager.py @@ -98,9 +98,9 @@ def workflow_process(self) -> None: try: self.logger.log("STARTING WORKFLOW") results_dir = Path(self.workflow_dir, "results") - if results_dir.exists(): - shutil.rmtree(results_dir) - results_dir.mkdir(parents=True) + # if results_dir.exists(): + # shutil.rmtree(results_dir) + # results_dir.mkdir(parents=True) success = self.execution() if success: self.logger.log("WORKFLOW FINISHED") From 272924fceb5946af6d2dc9a14fdcb142d5e6627e Mon Sep 17 00:00:00 2001 From: hojun Date: Mon, 9 Feb 2026 22:33:50 +0900 Subject: [PATCH 3/6] feat: integrate GO term analysis into execution method --- content/results_proteomicslfq.py | 68 +-- content/workflow_run.py | 2 - src/WorkflowTest.py | 821 ++++++++++++++++--------------- src/workflow/WorkflowManager.py | 6 +- 4 files changed, 461 insertions(+), 436 deletions(-) diff --git a/content/results_proteomicslfq.py b/content/results_proteomicslfq.py index dbe14ce..77eb332 100644 --- a/content/results_proteomicslfq.py +++ b/content/results_proteomicslfq.py @@ -56,40 +56,46 @@ st.dataframe(pivot_df.sort_values("p-value"), use_container_width=True) # ====================================================== -# GO Enrichment Results (from session_state) +# GO Enrichment Results # ====================================================== st.markdown("---") st.subheader("🧬 GO Enrichment Analysis") -# GO analysis must be executed in the execution step -if not st.session_state.get("go_ready", False): +results_dir = Path(st.session_state["workspace"]) / "topp-workflow" / "results" / "go-terms" +go_json_file = results_dir / "go_results.json" + +if not go_json_file.exists(): st.info("GO Enrichment results are not available yet. 
Please run the analysis first.") else: - go_results = st.session_state.get("go_results", {}) - - if not go_results: - st.warning("GO Enrichment results are empty.") - else: - bp_tab, cc_tab, mf_tab = st.tabs([ - "🧬 Biological Process", - "🏠 Cellular Component", - "βš™οΈ Molecular Function", - ]) - - # κ΄„ν˜Έμ™€ λ“€μ—¬μ“°κΈ° μˆ˜μ • λΆ€λΆ„ - for tab, go_type in zip([bp_tab, cc_tab, mf_tab], ["BP", "CC", "MF"]): - with tab: - if go_type not in go_results: - st.warning(f"No enriched {go_type} terms found.") - continue - - fig = go_results[go_type].get("fig") - df_go = go_results[go_type].get("df") - - if fig is None or df_go is None or df_go.empty: - st.warning(f"No enriched {go_type} terms found.") - else: - # 차트 좜λ ₯ (plotly_chart μ‚¬μš©) - st.plotly_chart(fig, use_container_width=True) - # λ°μ΄ν„°ν”„λ ˆμž„ 좜λ ₯ - st.dataframe(df_go, use_container_width=True) \ No newline at end of file + import json + import plotly.io as pio + + with open(go_json_file, "r") as f: + go_data = json.load(f) + + bp_tab, cc_tab, mf_tab = st.tabs([ + "🧬 Biological Process", + "🏠 Cellular Component", + "βš™οΈ Molecular Function", + ]) + + for tab, go_type in zip([bp_tab, cc_tab, mf_tab], ["BP", "CC", "MF"]): + with tab: + if go_type not in go_data: + st.info(f"No enriched {go_type} terms found.") + continue + + fig_json = go_data[go_type]["fig_json"] + df_dict = go_data[go_type]["df_dict"] + + fig = pio.from_json(fig_json) + + df_go = pd.DataFrame(df_dict) + + if df_go.empty: + st.info(f"No enriched {go_type} terms found.") + else: + st.plotly_chart(fig, use_container_width=True) + + st.markdown(f"#### {go_type} Enrichment Results") + st.dataframe(df_go, use_container_width=True) \ No newline at end of file diff --git a/content/workflow_run.py b/content/workflow_run.py index 65eb293..6853470 100644 --- a/content/workflow_run.py +++ b/content/workflow_run.py @@ -5,6 +5,4 @@ params = page_setup() wf = WorkflowTest() -st.write(st.session_state["workspace"]) -st.write(type(st.session_state["workspace"])) wf.show_execution_section() diff --git a/src/WorkflowTest.py b/src/WorkflowTest.py index 9f74e41..3663cd9 100644 --- a/src/WorkflowTest.py +++ b/src/WorkflowTest.py @@ -361,10 +361,10 @@ def execution(self) -> bool: results_dir = Path(self.workflow_dir, "input-files") - # for d in [comet_dir, perc_dir, filter_dir, quant_dir]: - # d.mkdir(parents=True, exist_ok=True) + for d in [comet_dir, perc_dir, filter_dir, quant_dir]: + d.mkdir(parents=True, exist_ok=True) - # self.logger.log("πŸ“ Output directories created") + self.logger.log("πŸ“ Output directories created") # # ================================ # # 2️⃣ File path definitions (per sample) @@ -379,399 +379,395 @@ def execution(self) -> bool: percolator_results.append(str(perc_dir / f"{stem}_per.idXML")) filter_results.append(str(filter_dir / f"{stem}_filter.idXML")) - # # ================================ - # # 3️⃣ Per-file processing - # # ================================ - # for i, mz in enumerate(in_mzML): - # stem = Path(mz).stem - # st.info(f"Processing sample: {stem}") - - # self.logger.log("πŸ”¬ Starting per-sample processing...") - - # # --- CometAdapter --- - # self.logger.log("πŸ”Ž Running peptide search...") - # with st.spinner(f"CometAdapter ({stem})"): - # comet_extra_params = {"database": str(database_fasta)} - # if self.params.get("generate-decoys", True): - # # Propagate decoy_string from DecoyDatabase - # comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string - - # if not self.executor.run_topp( - # "CometAdapter", - # { - # "in": 
in_mzML, - # "out": comet_results, - # }, - # comet_extra_params, - # ): - # self.logger.log("Workflow stopped due to error") - # return False - - # # Get fragment tolerance from CometAdapter parameters for visualization - # comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") - # frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) - # frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" - - # # Build visualization cache for Comet results - # results_dir_path = Path(self.workflow_dir, "results") - # cache_dir = results_dir_path / "insight_cache" - # cache_dir.mkdir(parents=True, exist_ok=True) - - # # Get mzML directory - # mzml_dir = Path(in_mzML[0]).parent - - # # Build spectra cache (once, shared by all stages) - # spectra_df = None - # filename_to_index = {} - - # for idxml_file in comet_results: - # idxml_path = Path(idxml_file) - # cache_id_prefix = idxml_path.stem - - # # Parse idXML to DataFrame - # id_df, spectra_data = parse_idxml(idxml_path) - - # # Build spectra cache (only once) - # if spectra_df is None: - # filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} - # spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) - - # # Initialize Table component (caches itself) - # Table( - # cache_id=f"table_{cache_id_prefix}", - # data=id_df.lazy(), - # cache_path=str(cache_dir), - # interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - # column_definitions=[ - # {"field": "sequence", "title": "Sequence"}, - # {"field": "charge", "title": "Z", "sorter": "number"}, - # {"field": "mz", "title": "m/z", "sorter": "number"}, - # {"field": "rt", "title": "RT", "sorter": "number"}, - # {"field": "score", "title": "Score", "sorter": "number"}, - # {"field": "protein_accession", "title": "Proteins"}, - # ], - # initial_sort=[{"column": "score", "dir": "asc"}], - # index_field="id_idx", - # ) - - # # Initialize Heatmap component - # Heatmap( - # cache_id=f"heatmap_{cache_id_prefix}", - # data=id_df.lazy(), - # cache_path=str(cache_dir), - # x_column="rt", - # y_column="mz", - # intensity_column="score", - # interactivity={"identification": "id_idx"}, - # ) - - # # Initialize SequenceView component - # seq_view = SequenceView( - # cache_id=f"seqview_{cache_id_prefix}", - # sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - # "id_idx": "sequence_id", - # "charge": "precursor_charge", - # }), - # peaks_data=spectra_df.lazy(), - # filters={ - # "identification": "sequence_id", - # "file": "file_index", - # "spectrum": "scan_id", - # }, - # interactivity={"peak": "peak_id"}, - # cache_path=str(cache_dir), - # deconvolved=False, - # annotation_config={ - # "ion_types": ["b", "y"], - # "neutral_losses": True, - # "tolerance": frag_tol, - # "tolerance_ppm": frag_tol_is_ppm, - # }, - # ) - - # # Initialize LinePlot from SequenceView - # LinePlot.from_sequence_view( - # seq_view, - # cache_id=f"lineplot_{cache_id_prefix}", - # cache_path=str(cache_dir), - # title="Annotated Spectrum", - # styling={ - # "unhighlightedColor": "#CCCCCC", - # "highlightColor": "#E74C3C", - # "selectedColor": "#F3A712", - # }, - # ) - - # self.logger.log("βœ… Peptide search complete") - - # # if not Path(comet_results).exists(): - # # st.error(f"CometAdapter failed for {stem}") - # # st.stop() - - # # --- PercolatorAdapter --- - # self.logger.log("πŸ“Š Running rescoring...") - # with st.spinner(f"PercolatorAdapter ({stem})"): - # if not 
self.executor.run_topp( - # "PercolatorAdapter", - # { - # "in": comet_results, - # "out": percolator_results, - # }, - # {"decoy_pattern": decoy_string}, # Always propagated from upstream - # ): - # self.logger.log("Workflow stopped due to error") - # return False - - # # Build visualization cache for Percolator results - # for idxml_file in percolator_results: - # idxml_path = Path(idxml_file) - # cache_id_prefix = idxml_path.stem - - # # Parse idXML to DataFrame - # id_df, spectra_data = parse_idxml(idxml_path) - - # # Initialize Table component (caches itself) - # Table( - # cache_id=f"table_{cache_id_prefix}", - # data=id_df.lazy(), - # cache_path=str(cache_dir), - # interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - # column_definitions=[ - # {"field": "sequence", "title": "Sequence"}, - # {"field": "charge", "title": "Z", "sorter": "number"}, - # {"field": "mz", "title": "m/z", "sorter": "number"}, - # {"field": "rt", "title": "RT", "sorter": "number"}, - # {"field": "score", "title": "Score", "sorter": "number"}, - # {"field": "protein_accession", "title": "Proteins"}, - # ], - # initial_sort=[{"column": "score", "dir": "asc"}], - # index_field="id_idx", - # ) - - # # Initialize Heatmap component - # Heatmap( - # cache_id=f"heatmap_{cache_id_prefix}", - # data=id_df.lazy(), - # cache_path=str(cache_dir), - # x_column="rt", - # y_column="mz", - # intensity_column="score", - # interactivity={"identification": "id_idx"}, - # ) - - # # Initialize SequenceView component - # seq_view = SequenceView( - # cache_id=f"seqview_{cache_id_prefix}", - # sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - # "id_idx": "sequence_id", - # "charge": "precursor_charge", - # }), - # peaks_data=spectra_df.lazy(), - # filters={ - # "identification": "sequence_id", - # "file": "file_index", - # "spectrum": "scan_id", - # }, - # interactivity={"peak": "peak_id"}, - # cache_path=str(cache_dir), - # deconvolved=False, - # annotation_config={ - # "ion_types": ["b", "y"], - # "neutral_losses": True, - # "tolerance": frag_tol, - # "tolerance_ppm": frag_tol_is_ppm, - # }, - # ) - - # # Initialize LinePlot from SequenceView - # LinePlot.from_sequence_view( - # seq_view, - # cache_id=f"lineplot_{cache_id_prefix}", - # cache_path=str(cache_dir), - # title="Annotated Spectrum", - # styling={ - # "unhighlightedColor": "#CCCCCC", - # "highlightColor": "#E74C3C", - # "selectedColor": "#F3A712", - # }, - # ) - - # self.logger.log("βœ… Rescoring complete") - - # # if not Path(percolator_results[i]).exists(): - # # st.error(f"PercolatorAdapter failed for {stem}") - # # st.stop() - - # # --- IDFilter --- - # self.logger.log("πŸ”§ Filtering identifications...") - # with st.spinner(f"IDFilter ({stem})"): - # if not self.executor.run_topp( - # "IDFilter", - # { - # "in": percolator_results, - # "out": filter_results, - # }, - # ): - # self.logger.log("Workflow stopped due to error") - # return False - - # # Build visualization cache for Filter results - # for idxml_file in filter_results: - # idxml_path = Path(idxml_file) - # cache_id_prefix = idxml_path.stem - - # # Parse idXML to DataFrame - # id_df, spectra_data = parse_idxml(idxml_path) - - # # Initialize Table component (caches itself) - # Table( - # cache_id=f"table_{cache_id_prefix}", - # data=id_df.lazy(), - # cache_path=str(cache_dir), - # interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, - # column_definitions=[ - # {"field": "sequence", 
"title": "Sequence"}, - # {"field": "charge", "title": "Z", "sorter": "number"}, - # {"field": "mz", "title": "m/z", "sorter": "number"}, - # {"field": "rt", "title": "RT", "sorter": "number"}, - # {"field": "score", "title": "Score", "sorter": "number"}, - # {"field": "protein_accession", "title": "Proteins"}, - # ], - # initial_sort=[{"column": "score", "dir": "asc"}], - # index_field="id_idx", - # ) - - # # Initialize Heatmap component - # Heatmap( - # cache_id=f"heatmap_{cache_id_prefix}", - # data=id_df.lazy(), - # cache_path=str(cache_dir), - # x_column="rt", - # y_column="mz", - # intensity_column="score", - # interactivity={"identification": "id_idx"}, - # ) - - # # Initialize SequenceView component - # seq_view = SequenceView( - # cache_id=f"seqview_{cache_id_prefix}", - # sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ - # "id_idx": "sequence_id", - # "charge": "precursor_charge", - # }), - # peaks_data=spectra_df.lazy(), - # filters={ - # "identification": "sequence_id", - # "file": "file_index", - # "spectrum": "scan_id", - # }, - # interactivity={"peak": "peak_id"}, - # cache_path=str(cache_dir), - # deconvolved=False, - # annotation_config={ - # "ion_types": ["b", "y"], - # "neutral_losses": True, - # "tolerance": frag_tol, - # "tolerance_ppm": frag_tol_is_ppm, - # }, - # ) - - # # Initialize LinePlot from SequenceView - # LinePlot.from_sequence_view( - # seq_view, - # cache_id=f"lineplot_{cache_id_prefix}", - # cache_path=str(cache_dir), - # title="Annotated Spectrum", - # styling={ - # "unhighlightedColor": "#CCCCCC", - # "highlightColor": "#E74C3C", - # "selectedColor": "#F3A712", - # }, - # ) - - # self.logger.log("βœ… Filtering complete") - - # # if not Path(filter_results[i]).exists(): - # # st.error(f"IDFilter failed for {stem}") - # # st.stop() + # ================================ + # 3️⃣ Per-file processing + # ================================ + for i, mz in enumerate(in_mzML): + stem = Path(mz).stem + st.info(f"Processing sample: {stem}") - # # ================================ - # # EasyPQP Spectral Library Generation (optional) - # # ================================ - # if self.params.get("generate-library", False): - # self.logger.log("πŸ“š Building spectral library with EasyPQP...") - # st.info("Building spectral library with EasyPQP...") - # library_dir = Path(self.workflow_dir, "results", "library") - # library_dir.mkdir(parents=True, exist_ok=True) - - # psms_files, peaks_files = [], [] - - # for filter_idxml in filter_results: - # original_stem = Path(filter_idxml).stem.replace("_filter", "") - # matching_mzml = next((m for m in in_mzML if Path(m).stem == original_stem), None) - # if not matching_mzml: - # self.logger.log(f"Warning: No matching mzML found for {filter_idxml}") - # continue - - # # easypqp library requires specific extensions for file recognition: - # # - PSM files must contain 'psmpkl' β†’ use .psmpkl extension - # # - Peak files must contain 'peakpkl' β†’ use .peakpkl extension - # # After splitext(), stem will be just "{mzML_stem}" matching PSM base_name - # psms_out = str(library_dir / f"{original_stem}.psmpkl") - # peaks_out = str(library_dir / f"{original_stem}.peakpkl") - - # convert_cmd = [ - # "easypqp", "convert", - # "--pepxml", filter_idxml, - # "--spectra", matching_mzml, - # "--psms", psms_out, - # "--peaks", peaks_out - # ] - # if self.executor.run_command(convert_cmd): - # psms_files.append(psms_out) - # peaks_files.append(peaks_out) - - # if psms_files: - # # easypqp library 
outputs TSV format (despite common .pqp extension) - # library_tsv = str(library_dir / "spectral_library.tsv") - # library_cmd = ["easypqp", "library", "--out", library_tsv] - - # if not self.params.get("library-use-fdr", False): - # # --nofdr only skips FDR recalculation, NOT threshold filtering - # # Set all thresholds to 1.0 to bypass filtering for pre-filtered input - # library_cmd.extend([ - # "--nofdr", - # "--psm_fdr_threshold", "1.0", - # "--peptide_fdr_threshold", "1.0", - # "--protein_fdr_threshold", "1.0" - # ]) - # else: - # # Apply user-specified FDR filtering - # library_cmd.extend([ - # "--psm_fdr_threshold", - # str(self.params.get("library-psm-fdr", 0.01)), - # "--peptide_fdr_threshold", - # str(self.params.get("library-peptide-fdr", 0.01)), - # "--protein_fdr_threshold", - # str(self.params.get("library-protein-fdr", 0.01)) - # ]) - - # for psms, peaks in zip(psms_files, peaks_files): - # library_cmd.extend([psms, peaks]) - - # if self.executor.run_command(library_cmd): - # self.logger.log("βœ… Spectral library created") - # st.success("Spectral library created") - # else: - # self.logger.log("Warning: Failed to build spectral library") - # else: - # self.logger.log("Warning: No PSMs converted for library generation") - - # st.success(f"βœ“ {stem} identification completed") + self.logger.log("πŸ”¬ Starting per-sample processing...") - # # ================================ - # # 4️⃣ ProteomicsLFQ (cross-sample) - # # ================================ + # --- CometAdapter --- + self.logger.log("πŸ”Ž Running peptide search...") + with st.spinner(f"CometAdapter ({stem})"): + comet_extra_params = {"database": str(database_fasta)} + if self.params.get("generate-decoys", True): + # Propagate decoy_string from DecoyDatabase + comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string + + if not self.executor.run_topp( + "CometAdapter", + { + "in": in_mzML, + "out": comet_results, + }, + comet_extra_params, + ): + self.logger.log("Workflow stopped due to error") + return False + + # Get fragment tolerance from CometAdapter parameters for visualization + comet_params = self.parameter_manager.get_topp_parameters("CometAdapter") + frag_tol = comet_params.get("fragment_mass_tolerance", 0.02) + frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da" + + # Build visualization cache for Comet results + results_dir_path = Path(self.workflow_dir, "results") + cache_dir = results_dir_path / "insight_cache" + cache_dir.mkdir(parents=True, exist_ok=True) + + # Get mzML directory + mzml_dir = Path(in_mzML[0]).parent + + # Build spectra cache (once, shared by all stages) + spectra_df = None + filename_to_index = {} + + for idxml_file in comet_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Build spectra cache (only once) + if spectra_df is None: + filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)} + spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": 
"number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) + + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) + + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) + + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) + + self.logger.log("βœ… Peptide search complete") + + # --- PercolatorAdapter --- + self.logger.log("πŸ“Š Running rescoring...") + with st.spinner(f"PercolatorAdapter ({stem})"): + if not self.executor.run_topp( + "PercolatorAdapter", + { + "in": comet_results, + "out": percolator_results, + }, + {"decoy_pattern": decoy_string}, # Always propagated from upstream + ): + self.logger.log("Workflow stopped due to error") + return False + + # Build visualization cache for Percolator results + for idxml_file in percolator_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) + + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) + + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], 
+ "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) + + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) + + self.logger.log("βœ… Rescoring complete") + + # if not Path(percolator_results[i]).exists(): + # st.error(f"PercolatorAdapter failed for {stem}") + # st.stop() + + # --- IDFilter --- + self.logger.log("πŸ”§ Filtering identifications...") + with st.spinner(f"IDFilter ({stem})"): + if not self.executor.run_topp( + "IDFilter", + { + "in": percolator_results, + "out": filter_results, + }, + ): + self.logger.log("Workflow stopped due to error") + return False + + # Build visualization cache for Filter results + for idxml_file in filter_results: + idxml_path = Path(idxml_file) + cache_id_prefix = idxml_path.stem + + # Parse idXML to DataFrame + id_df, spectra_data = parse_idxml(idxml_path) + + # Initialize Table component (caches itself) + Table( + cache_id=f"table_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"}, + column_definitions=[ + {"field": "sequence", "title": "Sequence"}, + {"field": "charge", "title": "Z", "sorter": "number"}, + {"field": "mz", "title": "m/z", "sorter": "number"}, + {"field": "rt", "title": "RT", "sorter": "number"}, + {"field": "score", "title": "Score", "sorter": "number"}, + {"field": "protein_accession", "title": "Proteins"}, + ], + initial_sort=[{"column": "score", "dir": "asc"}], + index_field="id_idx", + ) + + # Initialize Heatmap component + Heatmap( + cache_id=f"heatmap_{cache_id_prefix}", + data=id_df.lazy(), + cache_path=str(cache_dir), + x_column="rt", + y_column="mz", + intensity_column="score", + interactivity={"identification": "id_idx"}, + ) + + # Initialize SequenceView component + seq_view = SequenceView( + cache_id=f"seqview_{cache_id_prefix}", + sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({ + "id_idx": "sequence_id", + "charge": "precursor_charge", + }), + peaks_data=spectra_df.lazy(), + filters={ + "identification": "sequence_id", + "file": "file_index", + "spectrum": "scan_id", + }, + interactivity={"peak": "peak_id"}, + cache_path=str(cache_dir), + deconvolved=False, + annotation_config={ + "ion_types": ["b", "y"], + "neutral_losses": True, + "tolerance": frag_tol, + "tolerance_ppm": frag_tol_is_ppm, + }, + ) + + # Initialize LinePlot from SequenceView + LinePlot.from_sequence_view( + seq_view, + cache_id=f"lineplot_{cache_id_prefix}", + cache_path=str(cache_dir), + title="Annotated Spectrum", + styling={ + "unhighlightedColor": "#CCCCCC", + "highlightColor": "#E74C3C", + "selectedColor": "#F3A712", + }, + ) + + self.logger.log("βœ… Filtering complete") + + # if not Path(filter_results[i]).exists(): + # st.error(f"IDFilter failed for {stem}") + # st.stop() + + # ================================ + # EasyPQP Spectral Library Generation (optional) + # ================================ + if self.params.get("generate-library", False): + self.logger.log("πŸ“š Building spectral library with EasyPQP...") + st.info("Building spectral library with EasyPQP...") + library_dir = Path(self.workflow_dir, "results", "library") + library_dir.mkdir(parents=True, exist_ok=True) + + psms_files, 
peaks_files = [], [] + + for filter_idxml in filter_results: + original_stem = Path(filter_idxml).stem.replace("_filter", "") + matching_mzml = next((m for m in in_mzML if Path(m).stem == original_stem), None) + if not matching_mzml: + self.logger.log(f"Warning: No matching mzML found for {filter_idxml}") + continue + + # easypqp library requires specific extensions for file recognition: + # - PSM files must contain 'psmpkl' β†’ use .psmpkl extension + # - Peak files must contain 'peakpkl' β†’ use .peakpkl extension + # After splitext(), stem will be just "{mzML_stem}" matching PSM base_name + psms_out = str(library_dir / f"{original_stem}.psmpkl") + peaks_out = str(library_dir / f"{original_stem}.peakpkl") + + convert_cmd = [ + "easypqp", "convert", + "--pepxml", filter_idxml, + "--spectra", matching_mzml, + "--psms", psms_out, + "--peaks", peaks_out + ] + if self.executor.run_command(convert_cmd): + psms_files.append(psms_out) + peaks_files.append(peaks_out) + + if psms_files: + # easypqp library outputs TSV format (despite common .pqp extension) + library_tsv = str(library_dir / "spectral_library.tsv") + library_cmd = ["easypqp", "library", "--out", library_tsv] + + if not self.params.get("library-use-fdr", False): + # --nofdr only skips FDR recalculation, NOT threshold filtering + # Set all thresholds to 1.0 to bypass filtering for pre-filtered input + library_cmd.extend([ + "--nofdr", + "--psm_fdr_threshold", "1.0", + "--peptide_fdr_threshold", "1.0", + "--protein_fdr_threshold", "1.0" + ]) + else: + # Apply user-specified FDR filtering + library_cmd.extend([ + "--psm_fdr_threshold", + str(self.params.get("library-psm-fdr", 0.01)), + "--peptide_fdr_threshold", + str(self.params.get("library-peptide-fdr", 0.01)), + "--protein_fdr_threshold", + str(self.params.get("library-protein-fdr", 0.01)) + ]) + + for psms, peaks in zip(psms_files, peaks_files): + library_cmd.extend([psms, peaks]) + + if self.executor.run_command(library_cmd): + self.logger.log("βœ… Spectral library created") + st.success("Spectral library created") + else: + self.logger.log("Warning: Failed to build spectral library") + else: + self.logger.log("Warning: No PSMs converted for library generation") + + st.success(f"βœ“ {stem} identification completed") + + # ================================ + # 4️⃣ ProteomicsLFQ (cross-sample) + # ================================ self.logger.log("πŸ“ˆ Running cross-sample quantification...") st.info("Running ProteomicsLFQ (cross-sample quantification)") @@ -887,6 +883,7 @@ def extract_go_terms(go_data, go_type): annotated_ids = set(res_go["query"].astype(str)) fg_set = annotated_ids.intersection(fg_ids) bg_set = annotated_ids + self.logger.log(f"βœ… fg_set bg_set are set") def run_go(go_type): go2fg = defaultdict(set) @@ -926,7 +923,7 @@ def run_go(go_type): df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10)) df = df.sort_values("p_value").head(20) - # βœ… Plotly Figure 생성 + # βœ… Plotly Figure fig = px.bar( df, x="-log10(p)", @@ -935,6 +932,8 @@ def run_go(go_type): title=f"GO Enrichment ({go_type})", ) + self.logger.log(f"βœ… Plotly Figure generated") + fig.update_layout( yaxis=dict(autorange="reversed"), height=500, @@ -952,9 +951,31 @@ def run_go(go_type): "fig": fig, "df": df_go } + self.logger.log(f"βœ… go_type generated") + + results_dir = Path(self.workflow_dir) / "results" / "go-terms" + results_dir.mkdir(parents=True, exist_ok=True) + import json + go_data = {} + + for go_type in ["BP", "CC", "MF"]: + if go_type in go_results: + fig = go_results[go_type]["fig"] + 
df = go_results[go_type]["df"] + + go_data[go_type] = { + "fig_json": fig.to_json(), # Figure β†’ JSON string + "df_dict": df.to_dict(orient="records") # DataFrame β†’ list of dicts + } + + go_json_file = results_dir / "go_results.json" + with open(go_json_file, "w") as f: + json.dump(go_data, f) + st.session_state["go_results"] = go_results - st.session_state["go_ready"] = True + st.session_state["go_ready"] = True if go_data else False + self.logger.log("βœ… GO enrichment analysis complete") # ================================ @@ -965,10 +986,10 @@ def run_go(go_type): st.code(str(results_dir)) - st.write("πŸ“„ Generated files:") - st.write(f"- mzTab: {quant_mztab}") - st.write(f"- consensusXML: {quant_cxml}") - st.write(f"- MSstats CSV: {quant_msstats}") + # st.write("πŸ“„ Generated files:") + # st.write(f"- mzTab: {quant_mztab}") + # st.write(f"- consensusXML: {quant_cxml}") + # st.write(f"- MSstats CSV: {quant_msstats}") return True diff --git a/src/workflow/WorkflowManager.py b/src/workflow/WorkflowManager.py index f15d21d..c3343dc 100644 --- a/src/workflow/WorkflowManager.py +++ b/src/workflow/WorkflowManager.py @@ -98,9 +98,9 @@ def workflow_process(self) -> None: try: self.logger.log("STARTING WORKFLOW") results_dir = Path(self.workflow_dir, "results") - # if results_dir.exists(): - # shutil.rmtree(results_dir) - # results_dir.mkdir(parents=True) + if results_dir.exists(): + shutil.rmtree(results_dir) + results_dir.mkdir(parents=True) success = self.execution() if success: self.logger.log("WORKFLOW FINISHED") From c1b5bc97092ecef470f0fa3cbc1d1a83b8718d67 Mon Sep 17 00:00:00 2001 From: Yoo HoJun Date: Tue, 10 Feb 2026 12:43:55 +0900 Subject: [PATCH 4/6] refactor: extract GO enrichment analysis into a separate method --- src/WorkflowTest.py | 322 ++++++++++++++++++++++---------------------- 1 file changed, 158 insertions(+), 164 deletions(-) diff --git a/src/WorkflowTest.py b/src/WorkflowTest.py index 3663cd9..d005791 100644 --- a/src/WorkflowTest.py +++ b/src/WorkflowTest.py @@ -817,166 +817,14 @@ def execution(self) -> bool: # ====================================================== # ⚠️ 5️⃣ GO Enrichment Analysis (INLINE IN EXECUTION) # ====================================================== - st.session_state["workspace"] = Path(self.workflow_dir).parent - res = get_abundance_data(st.session_state["workspace"]) - if res is None: - st.warning("GO enrichment skipped: abundance data not available.") + workspace_path = Path(self.workflow_dir).parent + res = get_abundance_data(workspace_path) + if res is not None: + pivot_df, _, _ = res + self.logger.log("βœ… pivot_df loaded, starting GO enrichment...") + self._run_go_enrichment(pivot_df, results_dir) else: - pivot_df, expr_df, group_map = res - self.logger.log("βœ… pivot_df loaded") - self.logger.log(f"pivot_df columns: {pivot_df.columns.tolist()}") - - p_cutoff = 0.05 - fc_cutoff = 1.0 - - analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy() - - if analysis_df.empty: - st.error("No valid statistical data found for GO enrichment.") - self.logger.log("❗ analysis_df is empty") - else: - with st.spinner("Fetching GO terms from MyGene.info API..."): - mg = mygene.MyGeneInfo() - - def get_clean_uniprot(name): - parts = str(name).split("|") - return parts[1] if len(parts) >= 2 else parts[0] - - analysis_df["UniProt"] = analysis_df["ProteinName"].apply(get_clean_uniprot) - - bg_ids = analysis_df["UniProt"].dropna().astype(str).unique().tolist() - fg_ids = analysis_df[ - (analysis_df["p-value"] < p_cutoff) & - 
(analysis_df["log2FC"].abs() >= fc_cutoff) - ]["UniProt"].dropna().astype(str).unique().tolist() - self.logger.log("βœ… get_clean_uniprot applied") - - if len(fg_ids) < 3: - st.warning( - f"Not enough significant proteins " - f"(p < {p_cutoff}, |log2FC| β‰₯ {fc_cutoff}). " - f"Found: {len(fg_ids)}" - ) - self.logger.log("❗ Not enough significant proteins") - else: - res_list = mg.querymany( - bg_ids, scopes="uniprot", fields="go", as_dataframe=False - ) - res_go = pd.DataFrame(res_list) - if "notfound" in res_go.columns: - res_go = res_go[res_go["notfound"] != True] - self.logger.log("❗ res_go filtered for notfound entries") - - def extract_go_terms(go_data, go_type): - if not isinstance(go_data, dict) or go_type not in go_data: - return [] - terms = go_data[go_type] - if isinstance(terms, dict): - terms = [terms] - return list({t.get("term") for t in terms if "term" in t}) - - for go_type in ["BP", "CC", "MF"]: - res_go[f"{go_type}_terms"] = res_go["go"].apply( - lambda x: extract_go_terms(x, go_type) - ) - - annotated_ids = set(res_go["query"].astype(str)) - fg_set = annotated_ids.intersection(fg_ids) - bg_set = annotated_ids - self.logger.log(f"βœ… fg_set bg_set are set") - - def run_go(go_type): - go2fg = defaultdict(set) - go2bg = defaultdict(set) - - for _, row in res_go.iterrows(): - uid = str(row["query"]) - for term in row[f"{go_type}_terms"]: - go2bg[term].add(uid) - if uid in fg_set: - go2fg[term].add(uid) - - records = [] - N_fg = len(fg_set) - N_bg = len(bg_set) - - for term, fg_genes in go2fg.items(): - a = len(fg_genes) - if a == 0: - continue - b = N_fg - a - c = len(go2bg[term]) - a - d = N_bg - (a + b + c) - - _, p = fisher_exact([[a, b], [c, d]], alternative="greater") - records.append({ - "GO_Term": term, - "Count": a, - "GeneRatio": f"{a}/{N_fg}", - "p_value": p, - }) - - df = pd.DataFrame(records) - if df.empty: - return None, None - - df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10)) - df = df.sort_values("p_value").head(20) - - # βœ… Plotly Figure - fig = px.bar( - df, - x="-log10(p)", - y="GO_Term", - orientation="h", - title=f"GO Enrichment ({go_type})", - ) - - self.logger.log(f"βœ… Plotly Figure generated") - - fig.update_layout( - yaxis=dict(autorange="reversed"), - height=500, - margin=dict(l=10, r=10, t=40, b=10), - ) - - return fig, df - - go_results = {} - - for go_type in ["BP", "CC", "MF"]: - fig, df_go = run_go(go_type) - if fig is not None: - go_results[go_type] = { - "fig": fig, - "df": df_go - } - self.logger.log(f"βœ… go_type generated") - - results_dir = Path(self.workflow_dir) / "results" / "go-terms" - results_dir.mkdir(parents=True, exist_ok=True) - - import json - go_data = {} - - for go_type in ["BP", "CC", "MF"]: - if go_type in go_results: - fig = go_results[go_type]["fig"] - df = go_results[go_type]["df"] - - go_data[go_type] = { - "fig_json": fig.to_json(), # Figure β†’ JSON string - "df_dict": df.to_dict(orient="records") # DataFrame β†’ list of dicts - } - - go_json_file = results_dir / "go_results.json" - with open(go_json_file, "w") as f: - json.dump(go_data, f) - - st.session_state["go_results"] = go_results - st.session_state["go_ready"] = True if go_data else False - self.logger.log("βœ… GO enrichment analysis complete") - + st.warning("GO enrichment skipped: abundance data not available.") # ================================ # 5️⃣ Final report @@ -985,13 +833,159 @@ def run_go(go_type): st.write("πŸ“ Results directory:") st.code(str(results_dir)) + return True + + def _run_go_enrichmnet(self, pivot_df: pd.DataFrame, 
results_dir: Path):
+        p_cutoff = 0.05
+        fc_cutoff = 1.0
 
-        st.write("πŸ“„ Generated files:")
-        st.write(f"- mzTab: {quant_mztab}")
-        st.write(f"- consensusXML: {quant_cxml}")
-        st.write(f"- MSstats CSV: {quant_msstats}")
+        analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy()
 
-        return True
+        if analysis_df.empty:
+            st.error("No valid statistical data found for GO enrichment.")
+            self.logger.log("❗ analysis_df is empty")
+        else:
+            with st.spinner("Fetching GO terms from MyGene.info API..."):
+                mg = mygene.MyGeneInfo()
+
+                def get_clean_uniprot(name):
+                    parts = str(name).split("|")
+                    return parts[1] if len(parts) >= 2 else parts[0]
+
+                analysis_df["UniProt"] = analysis_df["ProteinName"].apply(get_clean_uniprot)
+
+                bg_ids = analysis_df["UniProt"].dropna().astype(str).unique().tolist()
+                fg_ids = analysis_df[
+                    (analysis_df["p-value"] < p_cutoff) &
+                    (analysis_df["log2FC"].abs() >= fc_cutoff)
+                ]["UniProt"].dropna().astype(str).unique().tolist()
+                self.logger.log("βœ… UniProt accessions extracted")
+
+                if len(fg_ids) < 3:
+                    st.warning(
+                        f"Not enough significant proteins "
+                        f"(p < {p_cutoff}, |log2FC| β‰₯ {fc_cutoff}). "
+                        f"Found: {len(fg_ids)}"
+                    )
+                    self.logger.log("❗ Not enough significant proteins")
+                else:
+                    res_list = mg.querymany(
+                        bg_ids, scopes="uniprot", fields="go", as_dataframe=False
+                    )
+                    res_go = pd.DataFrame(res_list)
+                    if "notfound" in res_go.columns:
+                        # Keep only accessions that MyGene.info could resolve
+                        res_go = res_go[res_go["notfound"] != True]
+
+                    def extract_go_terms(go_data, go_type):
+                        if not isinstance(go_data, dict) or go_type not in go_data:
+                            return []
+                        terms = go_data[go_type]
+                        if isinstance(terms, dict):
+                            terms = [terms]
+                        return list({t.get("term") for t in terms if "term" in t})
+
+                    for go_type in ["BP", "CC", "MF"]:
+                        res_go[f"{go_type}_terms"] = res_go["go"].apply(
+                            lambda x: extract_go_terms(x, go_type)
+                        )
+
+                    annotated_ids = set(res_go["query"].astype(str))
+                    fg_set = annotated_ids.intersection(fg_ids)
+                    bg_set = annotated_ids
+                    self.logger.log("βœ… Foreground and background protein sets prepared")
+
+                    def run_go(go_type):
+                        go2fg = defaultdict(set)
+                        go2bg = defaultdict(set)
+
+                        for _, row in res_go.iterrows():
+                            uid = str(row["query"])
+                            for term in row[f"{go_type}_terms"]:
+                                go2bg[term].add(uid)
+                                if uid in fg_set:
+                                    go2fg[term].add(uid)
+
+                        records = []
+                        N_fg = len(fg_set)
+                        N_bg = len(bg_set)
+
+                        for term, fg_genes in go2fg.items():
+                            a = len(fg_genes)
+                            if a == 0:
+                                continue
+                            b = N_fg - a
+                            c = len(go2bg[term]) - a
+                            d = N_bg - (a + b + c)
+
+                            _, p = fisher_exact([[a, b], [c, d]], alternative="greater")
+                            records.append({
+                                "GO_Term": term,
+                                "Count": a,
+                                "GeneRatio": f"{a}/{N_fg}",
+                                "p_value": p,
+                            })
+
+                        df = pd.DataFrame(records)
+                        if df.empty:
+                            return None, None
+
+                        df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10))
+                        df = df.sort_values("p_value").head(20)
+
+                        # βœ… Plotly figure: top enriched terms as a horizontal bar chart
+                        fig = px.bar(
+                            df,
+                            x="-log10(p)",
+                            y="GO_Term",
+                            orientation="h",
+                            title=f"GO Enrichment ({go_type})",
+                        )
+
+                        self.logger.log(f"βœ… Plotly figure generated for {go_type}")
+
+                        fig.update_layout(
+                            yaxis=dict(autorange="reversed"),
+                            height=500,
+                            margin=dict(l=10, r=10, t=40, b=10),
+                        )
+
+                        return fig, df
+
+                    go_results = {}
+
+                    for go_type in ["BP", "CC", "MF"]:
+                        fig, df_go = run_go(go_type)
+                        if fig is not None:
+                            go_results[go_type] = {
+                                "fig": fig,
+                                "df": df_go
+                            }
+                            self.logger.log(f"βœ… Enrichment results stored for {go_type}")
+
+                    go_dir = results_dir / "go-terms"
+                    go_dir.mkdir(parents=True, exist_ok=True)
+
+                    import json
+                    go_data = {}
+
+                    for go_type in ["BP", "CC", "MF"]:
+                        if go_type in go_results:
+                            fig = go_results[go_type]["fig"]
+                            df = go_results[go_type]["df"]
+
+                            go_data[go_type] = {
+                                "fig_json": fig.to_json(),  # Figure β†’ JSON string
+                                "df_dict": df.to_dict(orient="records")  # DataFrame β†’ list of dicts
+                            }
+
+        go_json_file = go_dir / "go_results.json"
+        with open(go_json_file, "w") as f:
+            json.dump(go_data, f)
+
+        st.session_state["go_results"] = go_results
+        st.session_state["go_ready"] = bool(go_data)
+        self.logger.log("βœ… GO enrichment analysis complete")
+
 
     @st.fragment
     def results(self) -> None:

From ebeb023b8815dbbc9d47642f68de03db28f4e8db Mon Sep 17 00:00:00 2001
From: Yoo HoJun
Date: Tue, 10 Feb 2026 15:03:14 +0900
Subject: [PATCH 5/6] fix: correct method name typo _run_go_enrichmnet ->
 _run_go_enrichment

---
 src/WorkflowTest.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/WorkflowTest.py b/src/WorkflowTest.py
index d005791..781bcbe 100644
--- a/src/WorkflowTest.py
+++ b/src/WorkflowTest.py
@@ -835,7 +835,7 @@ def execution(self) -> bool:
 
         return True
 
-    def _run_go_enrichmnet(self, pivot_df: pd.DataFrame, results_dir: Path):
+    def _run_go_enrichment(self, pivot_df: pd.DataFrame, results_dir: Path):
         p_cutoff = 0.05
         fc_cutoff = 1.0
 
@@ -981,7 +981,6 @@ def run_go(go_type):
         go_json_file = go_dir / "go_results.json"
         with open(go_json_file, "w") as f:
             json.dump(go_data, f)
-
         st.session_state["go_results"] = go_results
         st.session_state["go_ready"] = bool(go_data)
         self.logger.log("βœ… GO enrichment analysis complete")

From b754582a3eb81c7d13ccb2b4f04e91c32f17e753 Mon Sep 17 00:00:00 2001
From: Yoo HoJun
Date: Tue, 10 Feb 2026 16:43:36 +0900
Subject: [PATCH 6/6] fix: remove incorrect reassignment of results_dir to
 input-files

---
 src/WorkflowTest.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/WorkflowTest.py b/src/WorkflowTest.py
index 781bcbe..4abbf92 100644
--- a/src/WorkflowTest.py
+++ b/src/WorkflowTest.py
@@ -359,8 +359,6 @@ def execution(self) -> bool:
         filter_dir = results_dir / "filter_results"
         quant_dir = results_dir / "quant_results"
 
-        results_dir = Path(self.workflow_dir, "input-files")
-
        for d in [comet_dir, perc_dir, filter_dir, quant_dir]:
             d.mkdir(parents=True, exist_ok=True)
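
Note on the MyGene.info annotation step in the GO enrichment patches: mg.querymany(bg_ids, scopes="uniprot", fields="go") returns one record per queried accession, and the "go" field nests BP/CC/MF annotations either as a single dict or as a list of dicts. The sketch below exercises the patch's own extract_go_terms helper against a hypothetical record to make that normalization explicit; the record contents are illustrative, not real API output.

    # Shape of a MyGene.info hit as handled by extract_go_terms in PATCH 4/6.
    # The record below is a hypothetical illustration of the API's "go" field,
    # which may hold a single dict or a list of dicts per ontology branch.
    hit = {
        "query": "P12345",
        "go": {
            "BP": [{"id": "GO:0006915", "term": "apoptotic process"}],
            "CC": {"id": "GO:0005737", "term": "cytoplasm"},  # single annotation: dict, not list
        },
    }

    def extract_go_terms(go_data, go_type):
        # Mirrors the helper in PATCH 4/6: normalize dict -> [dict], dedupe term names
        if not isinstance(go_data, dict) or go_type not in go_data:
            return []
        terms = go_data[go_type]
        if isinstance(terms, dict):
            terms = [terms]
        return list({t.get("term") for t in terms if "term" in t})

    print(extract_go_terms(hit["go"], "CC"))  # ['cytoplasm']
    print(extract_go_terms(hit["go"], "MF"))  # [] (no MF annotations in this record)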
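The per-term statistic in run_go reduces to a 2x2 contingency table tested with a one-sided Fisher's exact test: significant (foreground) proteins with and without the term versus the remaining annotated background with and without it. A minimal, self-contained sketch with hypothetical counts:

    # Minimal sketch of the per-term test used in run_go (PATCH 4/6).
    # The counts are hypothetical; in the patch, a/b/c/d are derived from
    # fg_set (significant proteins) and bg_set (all annotated proteins).
    from scipy.stats import fisher_exact

    N_fg, N_bg = 40, 1200          # foreground / background sizes (hypothetical)
    term_total = 50                # background proteins annotated with this GO term
    a = 8                          # foreground proteins carrying the term
    b = N_fg - a                   # foreground proteins without the term
    c = term_total - a             # background-only proteins carrying the term
    d = N_bg - (a + b + c)         # everything else

    # One-sided test: is the term over-represented in the foreground?
    odds_ratio, p_value = fisher_exact([[a, b], [c, d]], alternative="greater")
    print(f"odds ratio = {odds_ratio:.2f}, p = {p_value:.3g}")

Note that the patch ranks terms by raw p-value and keeps the top 20; no multiple-testing correction is applied, which is a deliberate simplification rather than an oversight of the sketch above.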
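PATCH 4/6 persists each figure as fig.to_json() and each result table as a list of records in results/go-terms/go_results.json, so the GO results can outlive the Streamlit session. The patches shown here only read results back from st.session_state, so the loader below is a hypothetical sketch of how a results page could rebuild the objects from that file; load_go_results is not part of this series.

    # Hypothetical loader for the go_results.json written in PATCH 4/6
    # (an assumption, not code from the series).
    import json
    from pathlib import Path

    import pandas as pd
    import plotly.io as pio

    def load_go_results(go_json_file: Path) -> dict:
        """Rebuild {go_type: {"fig": Figure, "df": DataFrame}} from disk."""
        with open(go_json_file) as f:
            go_data = json.load(f)
        return {
            go_type: {
                "fig": pio.from_json(entry["fig_json"]),  # JSON string -> plotly Figure
                "df": pd.DataFrame(entry["df_dict"]),     # list of records -> DataFrame
            }
            for go_type, entry in go_data.items()
        }

Reloading from this file in content/results_proteomicslfq.py would make the GO tabs survive a fresh session, instead of depending on st.session_state["go_results"] still being populated from the execution step.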