diff --git a/app.py b/app.py
index d9b3d73d..cec8d4e9 100644
--- a/app.py
+++ b/app.py
@@ -8,33 +8,35 @@
with open("settings.json", "r") as f:
st.session_state.settings = json.load(f)
+# Initialize session state for workspace
+if "chosen-workspace" not in st.session_state:
+ if "workspace" in st.session_state:
+ st.session_state["chosen-workspace"] = str(st.session_state.workspace.stem)
+ else:
+ st.session_state["chosen-workspace"] = "default"
+
if __name__ == '__main__':
pages = {
str(st.session_state.settings["app-name"]) : [
st.Page(Path("content", "quickstart.py"), title="Quickstart", icon="π"),
st.Page(Path("content", "documentation.py"), title="Documentation", icon="π"),
],
- "pyOpenMS Toolbox": [
- st.Page(Path("content", "digest.py"), title="In Silico Digest", icon="βοΈ"),
- st.Page(Path("content", "peptide_mz_calculator.py"), title="m/z Calculator", icon="βοΈ"),
- st.Page(Path("content", "isotope_pattern_generator.py"), title="Isotopic Pattern Calculator", icon="πΆ"),
- st.Page(Path("content", "fragmentation.py"), title="Fragment Ion Generation", icon="π₯"),
- ],
"TOPP Workflow Framework": [
st.Page(Path("content", "topp_workflow_file_upload.py"), title="File Upload", icon="π"),
st.Page(Path("content", "topp_workflow_parameter.py"), title="Configure", icon="βοΈ"),
st.Page(Path("content", "topp_workflow_execution.py"), title="Run", icon="π"),
- st.Page(Path("content", "topp_workflow_results.py"), title="Results", icon="π"),
- ],
- "pyOpenMS Workflow" : [
- st.Page(Path("content", "file_upload.py"), title="File Upload", icon="π"),
- st.Page(Path("content", "raw_data_viewer.py"), title="View MS data", icon="π"),
- st.Page(Path("content", "run_example_workflow.py"), title="Run Workflow", icon="βοΈ"),
- st.Page(Path("content", "download_section.py"), title="Download Results", icon="β¬οΈ"),
+ # st.Page(Path("content", "topp_workflow_results.py"), title="Results", icon="π"),
],
- "Others Topics": [
- st.Page(Path("content", "simple_workflow.py"), title="Simple Workflow", icon="βοΈ"),
- st.Page(Path("content", "run_subprocess.py"), title="Run Subprocess", icon="π₯οΈ"),
+ "Results": [
+ st.Page(Path("content", "results_database_search.py"), title="Database Search", icon="π¬"),
+ st.Page(Path("content", "results_rescoring.py"), title="Rescoring", icon="π"),
+ st.Page(Path("content", "results_filtered.py"), title="Filtered PSMs", icon="π―"),
+ st.Page(Path("content", "results_abundance.py"), title="Abundance", icon="π"),
+ st.Page(Path("content", "results_volcano.py"), title="Volcano", icon="π"),
+ st.Page(Path("content", "results_pca.py"), title="PCA", icon="π"),
+ st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="π₯"),
+ # st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="π"),
+ st.Page(Path("content", "results_pathway.py"), title="Pathway Analysis", icon="π§ͺ"),
]
}
diff --git a/content/results_abundance.py b/content/results_abundance.py
new file mode 100644
index 00000000..baf5cb9f
--- /dev/null
+++ b/content/results_abundance.py
@@ -0,0 +1,132 @@
+"""Abundance (ProteomicsLFQ) Results Page."""
+import streamlit as st
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from scipy.stats import ttest_ind
+from src.common.common import page_setup
+from statsmodels.stats.multitest import multipletests
+from src.common.results_helpers import get_workflow_dir, get_abundance_data
+
+params = page_setup()
+st.title("Abundance Quantification")
+
+st.markdown(
+ """
+View protein and PSM-level quantification from **ProteomicsLFQ**.
+This page calculates differential expression statistics between sample groups.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+results_dir = Path(st.session_state["workspace"]) / "topp-workflow" / "results" / "quant_results"
+consensus_out = results_dir / "openms_design_protein_openms.csv"
+
+@st.cache_data
+def load_data(file_path):
+ return pd.read_csv(file_path, sep="\t", comment="#", engine="python")
+
+if consensus_out.exists():
+
+ # df = load_data(consensus_out)
+ # # ratio column removal
+ # df = df.loc[:, ~df.columns.str.contains('ratio', case=False)]
+
+ pre_processing_tab, protein_tab = st.tabs(["Pre-processing", "Protein Table"])
+
+ with pre_processing_tab:
+ # result = get_abundance_data(st.session_state["workspace"])
+ # DEBUG: μμΈ μμΈ μΆλ ₯ (μμ)
+ try:
+ result = get_abundance_data(st.session_state["workspace"])
+ except Exception as e:
+ st.exception(e)
+ result = None
+
+ if result is None:
+ ws = st.session_state.get("workspace")
+ st.error("Debug: get_abundance_data returned None")
+ st.write("workspace:", ws)
+ wf = Path(ws) / "topp-workflow"
+ st.write("workflow dir exists:", wf.exists(), "->", wf)
+ qdir = wf / "results" / "quant_results"
+ st.write("quant_dir exists:", qdir.exists(), "->", qdir)
+ if qdir.exists():
+ st.write("csv files:", sorted([p.name for p in qdir.glob('*.csv')]))
+ # show cached param snapshot if available
+ try:
+ from src.workflow.ParameterManager import ParameterManager
+ pm = ParameterManager(wf)
+ st.write("parameters keys (sample):", list(pm.get_parameters_from_json().items())[:20])
+ except Exception as e:
+ st.write("Param manager error:", e)
+ st.stop()
+
+ if result is None:
+ st.info("π‘ Please complete the configuration in the 'Configure' page to see results.")
+ st.stop()
+
+ pivot_df, expr_df, group_map = result
+
+ st.write("### Final Results (Group row removed, Stats added)")
+ st.dataframe(pivot_df.head(10))
+
+
+ with protein_tab:
+ st.markdown("### Protein-Level Abundance Table")
+
+ st.info(
+ "This protein-level table is generated by grouping all PSMs that map to the "
+ "same protein and aggregating their intensities across samples.\n\n"
+ "Additionally, log2 fold change and p-values are calculated between sample groups."
+ )
+
+ # Display group comparison info
+ groups = sorted(set(group_map.values()))
+ if len(groups) >= 2:
+ group1, group2 = sorted(groups)[:2]
+ st.info(f"Statistical comparison: **{group2} vs {group1}**")
+
+ exclude_cols = ["protein", "log2FC", "p-value", "p-adj",
+ "n_proteins", "n_peptides", "protein_score"]
+
+ # Get sample columns (between stats and PeptideSequence)
+ sample_cols = [c for c in pivot_df.columns if c
+ not in exclude_cols and "ratio" not in c.lower()]
+
+ # Create bar chart column with log2-transformed values
+ pivot_df["Intensity"] = pivot_df[sample_cols].apply(
+ lambda row: [np.log2(v + 1) for v in row], axis=1
+ )
+
+ # Reorder columns: place Intensity after p-value
+ display_cols = ["protein", "log2FC", "p-value", "Intensity"] + sample_cols
+ available_cols = [c for c in display_cols if c in pivot_df.columns]
+
+ st.dataframe(
+ pivot_df[available_cols].sort_values("p-value"),
+ column_config={
+ "Intensity": st.column_config.BarChartColumn(
+ "Intensity",
+ help="Sample intensities (log2 scale)",
+ width="small",
+ y_min=0,
+ ),
+ },
+ width="stretch"
+ )
+else:
+ st.warning(f"File not found: {consensus_out}")
+
+st.markdown("---")
+st.markdown("**Next steps:** Explore statistical visualizations")
+col1, col2, col3 = st.columns(3)
+with col1:
+ st.page_link("content/results_volcano.py", label="Volcano Plot", icon="π")
+with col2:
+ st.page_link("content/results_pca.py", label="PCA", icon="π")
+with col3:
+ st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
diff --git a/content/results_database_search.py b/content/results_database_search.py
new file mode 100644
index 00000000..cd04e69e
--- /dev/null
+++ b/content/results_database_search.py
@@ -0,0 +1,79 @@
+"""Database Search (Comet) Results Page."""
+import streamlit as st
+from pathlib import Path
+from src.common.common import page_setup
+from src.common.results_helpers import get_workflow_dir
+from openms_insight import Table, Heatmap, LinePlot, SequenceView, StateManager
+
+params = page_setup()
+st.title("Database Search Results")
+
+st.markdown(
+ """
+View peptide-spectrum matches (PSMs) identified by **Comet** database search.
+Click on a PSM to view the annotated spectrum and peptide sequence.
+"""
+)
+
+st.info(
+ "**Score:** The e-value (expectation value) represents the expected number of random PSMs "
+ "with an equal or better score. Lower values indicate higher confidence identifications."
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+workflow_dir = get_workflow_dir(st.session_state["workspace"])
+comet_dir = workflow_dir / "results" / "comet_results"
+cache_dir = workflow_dir / "results" / "insight_cache"
+
+if not comet_dir.exists():
+ st.info("No database search results available yet. Please run the workflow first.")
+ st.page_link("content/workflow_run.py", label="Go to Run", icon="π")
+ st.stop()
+
+comet_files = sorted(comet_dir.glob("*.idXML"))
+
+if not comet_files:
+ st.warning("No identification output files found.")
+ st.stop()
+
+selected_file = st.selectbox(
+ "Select identification result file",
+ comet_files,
+ format_func=lambda x: x.name
+)
+
+cache_id_prefix = selected_file.stem
+
+# Check if cache exists
+if not (cache_dir / f"table_{cache_id_prefix}").is_dir():
+ st.warning("Visualization cache not found. Please re-run the workflow.")
+ st.stop()
+
+# Initialize state manager for cross-component linking
+state_manager = StateManager()
+
+# Load components from cache (no data parameter needed)
+table = Table(cache_id=f"table_{cache_id_prefix}", cache_path=str(cache_dir))
+heatmap = Heatmap(cache_id=f"heatmap_{cache_id_prefix}", cache_path=str(cache_dir))
+seq_view = SequenceView(cache_id=f"seqview_{cache_id_prefix}", cache_path=str(cache_dir))
+line_plot = LinePlot(cache_id=f"lineplot_{cache_id_prefix}", cache_path=str(cache_dir))
+
+# Render components
+st.subheader("PSM Overview")
+heatmap(state_manager=state_manager, height=350)
+
+st.subheader("PSM Table")
+table(state_manager=state_manager, height=533)
+
+st.subheader("Peptide Sequence")
+seq_view(key=f"seqview_{cache_id_prefix}", state_manager=state_manager, height=533)
+
+st.subheader("MS2 Spectrum")
+line_plot(key=f"lineplot_{cache_id_prefix}", state_manager=state_manager, height=450, sequence_view_key=f"seqview_{cache_id_prefix}")
+
+st.markdown("---")
+st.markdown("**Next step:** View rescoring results")
+st.page_link("content/results_rescoring.py", label="Go to Rescoring", icon="π")
diff --git a/content/results_filtered.py b/content/results_filtered.py
new file mode 100644
index 00000000..a108ebde
--- /dev/null
+++ b/content/results_filtered.py
@@ -0,0 +1,79 @@
+"""Filtered PSMs Results Page."""
+import streamlit as st
+from pathlib import Path
+from src.common.common import page_setup
+from src.common.results_helpers import get_workflow_dir
+from openms_insight import Table, Heatmap, LinePlot, SequenceView, StateManager
+
+params = page_setup()
+st.title("Filtered PSMs")
+
+st.markdown(
+ """
+View FDR-controlled peptide identifications after **IDFilter** processing.
+These are high-confidence PSMs that passed the specified FDR threshold.
+Click on a PSM to view the annotated spectrum and peptide sequence.
+"""
+)
+
+st.info(
+ "**Score:** The q-value represents the minimum false discovery rate (FDR) at which this PSM "
+ "would be accepted. Lower values indicate higher confidence identifications."
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+workflow_dir = get_workflow_dir(st.session_state["workspace"])
+filter_dir = workflow_dir / "results" / "psm_filter"
+cache_dir = workflow_dir / "results" / "insight_cache"
+
+if not filter_dir.exists():
+ st.info("No filtered results available yet. Please run the workflow first.")
+ st.stop()
+
+filter_files = sorted(filter_dir.glob("*.idXML"))
+
+if not filter_files:
+ st.warning("No filtering output files found.")
+ st.stop()
+
+selected_file = st.selectbox(
+ "Select filtering result file",
+ filter_files,
+ format_func=lambda x: x.name
+)
+
+cache_id_prefix = selected_file.stem
+
+# Check if cache exists
+if not (cache_dir / f"table_{cache_id_prefix}").is_dir():
+ st.warning("Visualization cache not found. Please re-run the workflow.")
+ st.stop()
+
+# Initialize state manager for cross-component linking
+state_manager = StateManager()
+
+# Load components from cache (no data parameter needed)
+table = Table(cache_id=f"table_{cache_id_prefix}", cache_path=str(cache_dir))
+heatmap = Heatmap(cache_id=f"heatmap_{cache_id_prefix}", cache_path=str(cache_dir))
+seq_view = SequenceView(cache_id=f"seqview_{cache_id_prefix}", cache_path=str(cache_dir))
+line_plot = LinePlot(cache_id=f"lineplot_{cache_id_prefix}", cache_path=str(cache_dir))
+
+# Render components
+st.subheader("PSM Overview")
+heatmap(state_manager=state_manager, height=350)
+
+st.subheader("PSM Table")
+table(state_manager=state_manager, height=533)
+
+st.subheader("Peptide Sequence")
+seq_view(key=f"seqview_{cache_id_prefix}", state_manager=state_manager, height=533)
+
+st.subheader("MS2 Spectrum")
+line_plot(key=f"lineplot_{cache_id_prefix}", state_manager=state_manager, height=450, sequence_view_key=f"seqview_{cache_id_prefix}")
+
+st.markdown("---")
+st.markdown("**Next step:** View abundance quantification")
+st.page_link("content/results_abundance.py", label="Go to Abundance", icon="π")
diff --git a/content/results_heatmap.py b/content/results_heatmap.py
new file mode 100644
index 00000000..5a7ba470
--- /dev/null
+++ b/content/results_heatmap.py
@@ -0,0 +1,76 @@
+"""Heatmap Results Page."""
+import streamlit as st
+import numpy as np
+import plotly.express as px
+from scipy.cluster.hierarchy import linkage, leaves_list
+from scipy.spatial.distance import pdist
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+
+params = page_setup()
+st.title("Heatmap")
+
+st.markdown(
+ """
+Hierarchically clustered heatmap of protein-level abundance (Z-score normalized).
+Proteins and samples are ordered by similarity.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+result = get_abundance_data(st.session_state["workspace"])
+if result is None:
+ st.info("Abundance data not available. Please run the workflow and configure sample groups first.")
+ st.page_link("content/results_abundance.py", label="Go to Abundance", icon="π")
+ st.stop()
+
+pivot_df, expr_df, group_map = result
+
+top_n = st.slider("Number of proteins", 20, 200, 50, key="heatmap_top_n")
+
+var_series = expr_df.var(axis=1)
+top_proteins = var_series.sort_values(ascending=False).head(top_n).index
+heatmap_df = expr_df.loc[top_proteins]
+heatmap_z = heatmap_df.sub(heatmap_df.mean(axis=1), axis=0).div(heatmap_df.std(axis=1), axis=0)
+heatmap_z = heatmap_z.replace([np.inf, -np.inf], np.nan).dropna()
+
+if not heatmap_z.empty:
+ row_linkage = linkage(pdist(heatmap_z.values), method="average")
+ row_order = leaves_list(row_linkage)
+
+ col_linkage = linkage(pdist(heatmap_z.T.values), method="average")
+ col_order = leaves_list(col_linkage)
+
+ heatmap_clustered = heatmap_z.iloc[row_order, col_order]
+
+ fig_heatmap = px.imshow(
+ heatmap_clustered,
+ labels=dict(x="Sample", y="Protein", color="Z-score"),
+ aspect="auto",
+ color_continuous_scale=[[0.0, "#3b6fb6"], [0.5, "white"], [1.0, "#b40426"]],
+ zmin=-3, zmax=3
+ )
+
+ fig_heatmap.update_layout(
+ height=700,
+ xaxis={'side': 'bottom'},
+ yaxis={'side': 'left'}
+ )
+
+ fig_heatmap.update_xaxes(tickfont=dict(size=10))
+ fig_heatmap.update_yaxes(tickfont=dict(size=8))
+
+ st.plotly_chart(fig_heatmap, width="stretch")
+else:
+ st.warning("Insufficient data to generate the heatmap.")
+
+st.markdown("---")
+st.markdown("**Other visualizations:**")
+col1, col2 = st.columns(2)
+with col1:
+ st.page_link("content/results_volcano.py", label="Volcano Plot", icon="π")
+with col2:
+ st.page_link("content/results_pca.py", label="PCA", icon="π")
diff --git a/content/results_pathway.py b/content/results_pathway.py
new file mode 100644
index 00000000..e6b13760
--- /dev/null
+++ b/content/results_pathway.py
@@ -0,0 +1,253 @@
+import mygene
+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+from pathlib import Path
+from collections import defaultdict
+from scipy.stats import fisher_exact
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+
+
+# ================================
+# Page setup
+# ================================
+params = page_setup()
+st.title("ProteomicsLFQ Results")
+
+# ================================
+# Workspace check
+# ================================
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+# ================================
+# _run_go_enrichment function
+# ================================
+def _run_go_enrichment(pivot_df: pd.DataFrame, results_dir: Path):
+ p_cutoff = 0.05
+ fc_cutoff = 1.0
+
+ analysis_df = pivot_df.dropna(subset=["p-value", "log2FC"]).copy()
+
+ if analysis_df.empty:
+ st.error("No valid statistical data found for GO enrichment.")
+ st.write("β analysis_df is empty")
+ else:
+ with st.spinner("Fetching GO terms from MyGene.info API..."):
+ mg = mygene.MyGeneInfo()
+
+ def get_clean_uniprot(name):
+ parts = str(name).split("|")
+ return parts[1] if len(parts) >= 2 else parts[0]
+
+ analysis_df["UniProt"] = analysis_df["protein"].apply(get_clean_uniprot)
+
+ bg_ids = analysis_df["UniProt"].dropna().astype(str).unique().tolist()
+ fg_ids = analysis_df[
+ (analysis_df["p-value"] < p_cutoff) &
+ (analysis_df["log2FC"].abs() >= fc_cutoff)
+ ]["UniProt"].dropna().astype(str).unique().tolist()
+ # st.write("β
get_clean_uniprot applied")
+
+ if len(fg_ids) < 3:
+ st.warning(
+ f"Not enough significant proteins "
+ f"(p < {p_cutoff}, |log2FC| β₯ {fc_cutoff}). "
+ f"Found: {len(fg_ids)}"
+ )
+ st.write("β Not enough significant proteins")
+ else:
+ res_list = mg.querymany(
+ bg_ids, scopes="uniprot", fields="go", as_dataframe=False
+ )
+ res_go = pd.DataFrame(res_list)
+ if "notfound" in res_go.columns:
+ res_go = res_go[res_go["notfound"] != True]
+
+ def extract_go_terms(go_data, go_type):
+ if not isinstance(go_data, dict) or go_type not in go_data:
+ return []
+ terms = go_data[go_type]
+ if isinstance(terms, dict):
+ terms = [terms]
+ return list({t.get("term") for t in terms if "term" in t})
+
+ for go_type in ["BP", "CC", "MF"]:
+ res_go[f"{go_type}_terms"] = res_go["go"].apply(
+ lambda x: extract_go_terms(x, go_type)
+ )
+
+ annotated_ids = set(res_go["query"].astype(str))
+ fg_set = annotated_ids.intersection(fg_ids)
+ bg_set = annotated_ids
+ # st.write(f"β
fg_set bg_set are set")
+
+ def run_go(go_type):
+ go2fg = defaultdict(set)
+ go2bg = defaultdict(set)
+
+ for _, row in res_go.iterrows():
+ uid = str(row["query"])
+ for term in row[f"{go_type}_terms"]:
+ go2bg[term].add(uid)
+ if uid in fg_set:
+ go2fg[term].add(uid)
+
+ records = []
+ N_fg = len(fg_set)
+ N_bg = len(bg_set)
+
+ for term, fg_genes in go2fg.items():
+ a = len(fg_genes)
+ if a == 0:
+ continue
+ b = N_fg - a
+ c = len(go2bg[term]) - a
+ d = N_bg - (a + b + c)
+
+ _, p = fisher_exact([[a, b], [c, d]], alternative="greater")
+ records.append({
+ "GO_Term": term,
+ "Count": a,
+ "GeneRatio": f"{a}/{N_fg}",
+ "p_value": p,
+ })
+
+ df = pd.DataFrame(records)
+ if df.empty:
+ return None, None
+
+ df["-log10(p)"] = -np.log10(df["p_value"].replace(0, 1e-10))
+ df = df.sort_values("p_value").head(20)
+
+ # β
Plotly Figure
+ fig = px.bar(
+ df,
+ x="-log10(p)",
+ y="GO_Term",
+ orientation="h",
+ title=f"GO Enrichment ({go_type})",
+ )
+
+ # st.write(f"β
Plotly Figure generated")
+
+ fig.update_layout(
+ yaxis=dict(autorange="reversed"),
+ height=500,
+ margin=dict(l=10, r=10, t=40, b=10),
+ )
+
+ return fig, df
+
+ go_results = {}
+
+ for go_type in ["BP", "CC", "MF"]:
+ fig, df_go = run_go(go_type)
+ if fig is not None:
+ go_results[go_type] = {
+ "fig": fig,
+ "df": df_go
+ }
+ # st.write(f"β
go_type generated")
+
+ go_dir = results_dir / "go-terms"
+ go_dir.mkdir(parents=True, exist_ok=True)
+
+ import json
+ go_data = {}
+
+ for go_type in ["BP", "CC", "MF"]:
+ if go_type in go_results:
+ fig = go_results[go_type]["fig"]
+ df = go_results[go_type]["df"]
+
+ go_data[go_type] = {
+ "fig_json": fig.to_json(), # Figure β JSON string
+ "df_dict": df.to_dict(orient="records") # DataFrame β list of dicts
+ }
+
+ go_json_file = go_dir / "go_results.json"
+ with open(go_json_file, "w") as f:
+ json.dump(go_data, f)
+ st.session_state["go_results"] = go_results
+ st.session_state["go_ready"] = True if go_data else False
+ # st.write("β
GO enrichment analysis complete")
+
+results_dir = Path(st.session_state["workspace"]) / "topp-workflow" / "results" / "quant"
+result = get_abundance_data(st.session_state["workspace"])
+if result is None:
+ st.info("Abundance data not available. Please run the workflow and configure sample groups first.")
+ st.page_link("content/results_abundance.py", label="Go to Abundance", icon="π")
+ st.stop()
+
+pivot_df, expr_df, group_map = result
+_run_go_enrichment(pivot_df, results_dir)
+
+# ================================
+# Tabs
+# ================================
+protein_tab, = st.tabs(["𧬠Protein Table"])
+
+# ================================
+# Protein-level results
+# ================================
+with protein_tab:
+ st.markdown("### 𧬠Protein-Level Abundance Table")
+ st.info(
+ "This protein-level table is generated by grouping all PSMs that map to the "
+ "same protein and aggregating their intensities across samples.\n\n"
+ "Additionally, log2 fold change and p-values are calculated between sample groups."
+ )
+
+ if pivot_df.empty:
+ st.info("No protein-level data available.")
+ else:
+ st.session_state["pivot_df"] = pivot_df
+ st.dataframe(pivot_df.sort_values("p-value"), width="stretch")
+
+# ======================================================
+# GO Enrichment Results
+# ======================================================
+st.markdown("---")
+st.subheader("𧬠GO Enrichment Analysis")
+
+go_json_file = results_dir / "go-terms" / "go_results.json"
+
+if not go_json_file.exists():
+ st.info("GO Enrichment results are not available yet. Please run the analysis first.")
+else:
+ import json
+ import plotly.io as pio
+
+ with open(go_json_file, "r") as f:
+ go_data = json.load(f)
+
+ bp_tab, cc_tab, mf_tab = st.tabs([
+ "𧬠Biological Process",
+ "π Cellular Component",
+ "βοΈ Molecular Function",
+ ])
+
+ for tab, go_type in zip([bp_tab, cc_tab, mf_tab], ["BP", "CC", "MF"]):
+ with tab:
+ if go_type not in go_data:
+ st.info(f"No enriched {go_type} terms found.")
+ continue
+
+ fig_json = go_data[go_type]["fig_json"]
+ df_dict = go_data[go_type]["df_dict"]
+
+ fig = pio.from_json(fig_json)
+
+ df_go = pd.DataFrame(df_dict)
+
+ if df_go.empty:
+ st.info(f"No enriched {go_type} terms found.")
+ else:
+ st.plotly_chart(fig, width="stretch")
+
+ st.markdown(f"#### {go_type} Enrichment Results")
+ st.dataframe(df_go, width="stretch")
\ No newline at end of file
diff --git a/content/results_pca.py b/content/results_pca.py
new file mode 100644
index 00000000..d15475da
--- /dev/null
+++ b/content/results_pca.py
@@ -0,0 +1,103 @@
+"""PCA Results Page."""
+import streamlit as st
+import pandas as pd
+import plotly.express as px
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+
+params = page_setup()
+st.title("PCA Analysis")
+
+st.markdown(
+ """
+Principal Component Analysis (PCA) of protein-level abundance.
+Samples are colored by group assignment to visualize clustering.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+result = get_abundance_data(st.session_state["workspace"])
+if result is None:
+ st.info("Abundance data not available. Please run the workflow and configure sample groups first.")
+ st.page_link("content/results_abundance.py", label="Go to Abundance", icon="π")
+ st.stop()
+
+pivot_df, expr_df, group_map = result
+
+top_n = 500
+
+top_proteins = (
+ pivot_df
+ .dropna(subset=["p-adj"])
+ .sort_values("p-adj", ascending=True)
+ .head(top_n)["protein"]
+)
+
+expr_df_pca = expr_df.loc[
+ expr_df.index.intersection(top_proteins)
+]
+
+if expr_df_pca.shape[0] < 2:
+ st.info("Not enough proteins after p-value filtering for PCA.")
+ st.stop()
+
+X = expr_df_pca.T
+X_scaled = StandardScaler().fit_transform(X)
+
+pca = PCA(n_components=2)
+pcs = pca.fit_transform(X_scaled)
+
+pca_df = pd.DataFrame(
+ pcs,
+ columns=["PC1", "PC2"],
+ index=X.index
+)
+
+actual_sample_names = pca_df.index.tolist()
+
+norm_map = {}
+
+for k, v in group_map.items():
+ try:
+ sample_idx = int(k) + 1
+ target_substring = f"sample{sample_idx}["
+ real_full_name = next((name for name in actual_sample_names if target_substring in name), None)
+
+ if real_full_name:
+ norm_map[real_full_name] = v if v and v.strip() else "Unassigned"
+ except ValueError:
+ continue
+
+pca_df["Group"] = pca_df.index.map(norm_map)
+
+fig_pca = px.scatter(
+ pca_df,
+ x="PC1",
+ y="PC2",
+ color="Group",
+ text=pca_df.index,
+)
+
+fig_pca.update_traces(textposition="top center")
+fig_pca.update_layout(
+ xaxis_title=f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)",
+ yaxis_title=f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)",
+ height=600,
+)
+
+st.plotly_chart(fig_pca, width="stretch")
+
+st.markdown(f"**Proteins used:** {expr_df_pca.shape[0]} (top {top_n} by p-adj)")
+
+st.markdown("---")
+st.markdown("**Other visualizations:**")
+col1, col2 = st.columns(2)
+with col1:
+ st.page_link("content/results_volcano.py", label="Volcano Plot", icon="π")
+with col2:
+ st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
diff --git a/content/results_rescoring.py b/content/results_rescoring.py
new file mode 100644
index 00000000..ed8ec5f0
--- /dev/null
+++ b/content/results_rescoring.py
@@ -0,0 +1,80 @@
+"""Rescoring (Percolator) Results Page."""
+import streamlit as st
+from pathlib import Path
+from src.common.common import page_setup
+from src.common.results_helpers import get_workflow_dir
+from openms_insight import Table, Heatmap, LinePlot, SequenceView, StateManager
+
+params = page_setup()
+st.title("Rescoring Results")
+
+st.markdown(
+ """
+View PSMs after **Percolator** statistical validation. Percolator uses machine learning
+to re-score PSMs and estimate false discovery rates (FDR) for more accurate results.
+Click on a PSM to view the annotated spectrum and peptide sequence.
+"""
+)
+
+st.info(
+ "**Score:** The Posterior Error Probability (PEP) represents the probability that this PSM "
+ "is incorrect. Lower values indicate higher confidence identifications."
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+workflow_dir = get_workflow_dir(st.session_state["workspace"])
+perc_dir = workflow_dir / "results" / "percolator"
+cache_dir = workflow_dir / "results" / "insight_cache"
+
+if not perc_dir.exists():
+ st.info("No rescoring results available yet. Please run the workflow first.")
+ # st.page_link("content/workflow_run.py", label="Go to Run", icon="π")
+ st.stop()
+
+perc_files = sorted(perc_dir.glob("*.idXML"))
+
+if not perc_files:
+ st.warning("No rescoring output files found.")
+ st.stop()
+
+selected_file = st.selectbox(
+ "Select rescoring result file",
+ perc_files,
+ format_func=lambda x: x.name
+)
+
+cache_id_prefix = selected_file.stem
+
+# Check if cache exists
+if not (cache_dir / f"table_{cache_id_prefix}").is_dir():
+ st.warning("Visualization cache not found. Please re-run the workflow.")
+ st.stop()
+
+# Initialize state manager for cross-component linking
+state_manager = StateManager()
+
+# Load components from cache (no data parameter needed)
+table = Table(cache_id=f"table_{cache_id_prefix}", cache_path=str(cache_dir))
+heatmap = Heatmap(cache_id=f"heatmap_{cache_id_prefix}", cache_path=str(cache_dir))
+seq_view = SequenceView(cache_id=f"seqview_{cache_id_prefix}", cache_path=str(cache_dir))
+line_plot = LinePlot(cache_id=f"lineplot_{cache_id_prefix}", cache_path=str(cache_dir))
+
+# Render components
+st.subheader("PSM Overview")
+heatmap(state_manager=state_manager, height=350)
+
+st.subheader("PSM Table")
+table(state_manager=state_manager, height=533)
+
+st.subheader("Peptide Sequence")
+seq_view(key=f"seqview_{cache_id_prefix}", state_manager=state_manager, height=533)
+
+st.subheader("MS2 Spectrum")
+line_plot(key=f"lineplot_{cache_id_prefix}", state_manager=state_manager, height=450, sequence_view_key=f"seqview_{cache_id_prefix}")
+
+st.markdown("---")
+st.markdown("**Next step:** View filtered PSMs")
+st.page_link("content/results_filtered.py", label="Go to Filtered PSMs", icon="π―")
diff --git a/content/results_volcano.py b/content/results_volcano.py
new file mode 100644
index 00000000..17518075
--- /dev/null
+++ b/content/results_volcano.py
@@ -0,0 +1,109 @@
+"""Volcano Plot Results Page."""
+import streamlit as st
+import plotly.express as px
+import numpy as np
+import pandas as pd
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+from statsmodels.stats.multitest import multipletests
+from scipy.stats import ttest_ind
+from pathlib import Path
+
+params = page_setup()
+st.title("Volcano Plot")
+
+st.markdown(
+ """
+Visualize differential expression analysis with a volcano plot.
+Points represent proteins colored by significance status.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+result = get_abundance_data(st.session_state["workspace"])
+if result is None:
+ st.info("Abundance data not available. Please run the workflow and configure sample groups first.")
+ st.page_link("content/results_abundance.py", label="Go to Abundance", icon="π")
+ st.stop()
+
+# Load the processed dataframe from session state
+pivot_df, expr_df, group_map = result
+
+# Threshold Selection UI
+st.divider()
+c1, c2 = st.columns(2)
+with c1:
+ fc_thresh = st.slider(
+ "log2 Fold Change threshold",
+ min_value=0.1,
+ max_value=3.0,
+ value=1.0,
+ step=0.1,
+ )
+with c2:
+ p_thresh = st.slider(
+ "p-adj (FDR) threshold",
+ min_value=0.001,
+ max_value=0.1,
+ value=0.05,
+ step=0.001,
+ )
+
+volcano_df = pivot_df.dropna(subset=["log2FC", "p-adj"]).copy()
+volcano_df["neg_log10_padj"] = -np.log10(volcano_df["p-adj"])
+
+volcano_df["Significance"] = "Not significant"
+volcano_df.loc[
+ (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh),
+ "Significance",
+] = "Up-regulated"
+
+volcano_df.loc[
+ (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh),
+ "Significance",
+] = "Down-regulated"
+
+fig_volcano = px.scatter(
+ volcano_df,
+ x="log2FC",
+ y="neg_log10_padj",
+ color="Significance",
+ hover_data=["protein", "log2FC", "p-value", "p-adj"],
+ color_discrete_map={
+ "Up-regulated": "red",
+ "Down-regulated": "blue",
+ "Not significant": "lightgrey",
+ }
+)
+
+fig_volcano.add_vline(x=fc_thresh, line_dash="dash")
+fig_volcano.add_vline(x=-fc_thresh, line_dash="dash")
+fig_volcano.add_hline(y=-np.log10(p_thresh), line_dash="dash")
+
+# Make x-axis symmetric around zero
+max_abs_fc = volcano_df["log2FC"].abs().max()
+x_range = [-max_abs_fc * 1.1, max_abs_fc * 1.1] # 10% padding
+
+fig_volcano.update_layout(
+ xaxis_title="log2 Fold Change",
+ yaxis_title="-log10(p-adj)",
+ xaxis_range=x_range,
+ height=600,
+)
+
+st.plotly_chart(fig_volcano, width="stretch")
+
+up_count = (volcano_df["Significance"] == "Up-regulated").sum()
+down_count = (volcano_df["Significance"] == "Down-regulated").sum()
+st.markdown(f"**Up-regulated:** {up_count} | **Down-regulated:** {down_count}")
+
+st.markdown("---")
+st.markdown("**Other visualizations:**")
+col1, col2 = st.columns(2)
+with col1:
+ st.page_link("content/results_pca.py", label="PCA", icon="π")
+with col2:
+ st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
diff --git a/src/Workflow.py b/src/Workflow.py
index 7523a0cb..2764c8c1 100644
--- a/src/Workflow.py
+++ b/src/Workflow.py
@@ -1,11 +1,17 @@
+from altair import value
import streamlit as st
-from src.workflow.WorkflowManager import WorkflowManager
+import pandas as pd
+import numpy as np
+import plotly.express as px
+from src.workflow.WorkflowManager import WorkflowManager
+from scipy.stats import ttest_ind
+from pyopenms import IdXMLFile
# for result section:
from pathlib import Path
-import pandas as pd
-import plotly.express as px
from src.common.common import show_fig
+from openms_insight import Table, Heatmap, LinePlot, SequenceView
+from src.common.results_helpers import parse_idxml, build_spectra_cache
class Workflow(WorkflowManager):
@@ -16,7 +22,7 @@ def __init__(self) -> None:
super().__init__("TOPP Workflow", st.session_state["workspace"])
def upload(self) -> None:
- t = st.tabs(["MS data"])
+ t = st.tabs(["MS data", "FASTA database"])
with t[0]:
# Use the upload method from StreamlitUI to handle mzML file uploads.
self.ui.upload_widget(
@@ -26,98 +32,1008 @@ def upload(self) -> None:
fallback=[str(f) for f in Path("example-data", "mzML").glob("*.mzML")],
)
+ with t[1]:
+ self.ui.upload_widget(
+ key="fasta-file",
+ name="Protein FASTA database",
+ file_types=("fasta", "fa"),
+ fallback=[str(f) for f in Path("example-data", "db").glob("*.fasta")],
+ )
+
@st.fragment
def configure(self) -> None:
# Allow users to select mzML files for the analysis.
self.ui.select_input_file("mzML-files", multiple=True)
+ self.ui.select_input_file("fasta-file", multiple=False)
# Create tabs for different analysis steps.
t = st.tabs(
- ["**Feature Detection**", "**Feature Linking**", "**Python Custom Tool**"]
+ ["**IsobaricAnalyzer**", "**CometAdapter**", "**PercolatorAdapter**", "**IDFilter**", "**IDMapper**", "**FileMerger**",
+ "**ProteinInference**", "**IDFilter**", "**IDConflictResolver**", "**ProteinQuantifier**", "**Group Selection**"]
)
with t[0]:
- # Parameters for FeatureFinderMetabo TOPP tool.
+ # Checkbox for decoy generation
+ # reactive=True ensures the parent configure() fragment re-runs when checkbox changes,
+ # so conditional UI (DecoyDatabase settings) updates immediately
+ self.ui.input_widget(
+ key="generate-decoys",
+ default=True,
+ name="Generate Decoy Database",
+ widget_type="checkbox",
+ help="Generate reversed decoy sequences for FDR calculation. Disable if your FASTA already contains decoys.",
+ reactive=True,
+ )
+
+ # Reload params to get current checkbox value after it was saved
+ self.params = self.parameter_manager.get_parameters_from_json()
+
+ # Show DecoyDatabase settings if generating decoys
+ if self.params.get("generate-decoys", True):
+ st.info("""
+ **Decoy Database Settings:**
+ * **method**: How decoy sequences are generated from target protein sequences.
+ *Reverse* creates decoys by reversing each sequence, while *shuffle* randomly
+ rearranges the amino acids. Both methods preserve the amino acid composition
+ of the original protein, ensuring decoys have similar properties to real sequences
+ for accurate false discovery rate (FDR) estimation.
+ """)
+ self.ui.input_TOPP(
+ "DecoyDatabase",
+ custom_defaults={
+ "decoy_string": "rev_",
+ "decoy_string_position": "prefix",
+ "method": "reverse",
+ },
+ include_parameters=["method"],
+ )
+
+ comet_info = """
+ **Identification (Comet):**
+ * **enzyme**: The enzyme used for peptide digestion.
+ * **missed_cleavages**: Number of possible cleavage sites missed by the enzyme. It has no effect if enzyme is unspecific cleavage.
+ * **fixed_modifications**: Fixed modifications, specified using Unimod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'
+ * **variable_modifications**: Variable modifications, specified using Unimod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)' or 'Oxidation (M)'
+ * **instrument**: Type of instrument (high_res or low_res). Use 'high_res' for high-resolution MS2 (Orbitrap, TOF), 'low_res' for ion trap.
+ * **fragment_mass_tolerance**: Fragment mass tolerance for MS2 matching.
+ * **fragment_bin_offset**: Offset for binning MS2 spectra. Typically 0.0 for high-res, 0.4 for low-res instruments.
+ """
+ if not self.params.get("generate-decoys", True):
+ comet_info += """* **PeptideIndexing:decoy_string**: String that was appended (or prefixed - see 'decoy_string_position' flag below) to the accessions
+ in the protein database to indicate decoy proteins.
+ """
+ st.info(comet_info)
+
+ st.write(Path(self.workflow_dir, "results"))
+
+ comet_include = [":enzyme", "missed_cleavages", "fixed_modifications", "variable_modifications",
+ "instrument", "fragment_mass_tolerance", "fragment_error_units", "fragment_bin_offset"]
+ if not self.params.get("generate-decoys", True):
+ # Only show decoy_string when not generating decoys
+ comet_include.append("PeptideIndexing:decoy_string")
+
self.ui.input_TOPP(
- "FeatureFinderMetabo",
- custom_defaults={"algorithm:common:noise_threshold_int": 1000.0},
+ "IsobaricAnalyzer",
+ custom_defaults={
+ "tmt11plex:reference_channel": 126,
+ "type": "tmt11plex",
+ "extraction:select_activation": "auto",
+ "extraction:reporter_mass_shift": 0.002,
+ "extraction:min_reporter_intensity": 0.0,
+ "extraction:min_precursor_purity": 0.0,
+ "extraction:precursor_isotope_deviation": 10.0,
+ "quantification:isotope_correction": "false",
+ }
)
with t[1]:
- # Parameters for MetaboliteAdductDecharger TOPP tool.
- self.ui.input_TOPP("FeatureLinkerUnlabeledKD")
+ # Parameters for FeatureFinderMetabo TOPP tool.
+ # self.ui.input_TOPP(
+ # "FeatureFinderMetabo",
+ # custom_defaults={"algorithm:common:noise_threshold_int": 1000.0},
+ # )
+ comet_include = [":enzyme", "missed_cleavages", "fixed_modifications", "variable_modifications",
+ "instrument", "fragment_mass_tolerance", "fragment_error_units", "fragment_bin_offset", "PeptideIndexing:IL_equivalent"]
+ self.ui.input_TOPP(
+ "CometAdapter",
+ custom_defaults={
+ "PeptideIndexing:IL_equivalent": True,
+ "clip_nterm_methionine": "true",
+ "instrument": "high_res",
+ "missed_cleavages": 2,
+ "min_peptide_length": 6,
+ "max_peptide_length": 40,
+ "enzyme": "Trypsin/P",
+ "PeptideIndexing:unmatched_action": "warn",
+ "max_variable_mods_in_peptide": 3,
+ "precursor_mass_tolerance": 4.5,
+ "isotope_error": "0/1",
+ "precursor_error_units": "ppm",
+ "num_hits": 1,
+ "num_enzyme_termini": "fully",
+ "fragment_bin_offset": 0.0,
+ "minimum_peaks": 10,
+ "precursor_charge": "2:4",
+ "fragment_mass_tolerance": 0.015,
+ "PeptideIndexing:unmatched_action": "warn",
+ "variable_modifications": "Oxidation (M)\nAcetyl (Protein N-term)\nTMT6plex (K)\nTMT6plex (N-term)",
+ "debug": 0,
+ "force": True,
+ },
+ include_parameters=comet_include,
+ flag_parameters=["PeptideIndexing:IL_equivalent", "force"],
+ exclude_parameters=["second_enzyme"],
+ )
with t[2]:
+ st.info("""
+ **Filtering (IDFilter):**
+ * **score:type_peptide**: Score used for filtering. If empty, the main score is used.
+ * **score:psm**: The score which should be reached by a peptide hit to be kept. (use 'NAN' to disable this filter)
+ """)
+ self.ui.input_TOPP(
+ "PercolatorAdapter",
+ custom_defaults={
+ "subset_max_train": 300000,
+ "decoy_pattern": "DECOY_",
+ "score_type": "pep",
+ "post_processing_tdc": True,
+ "debug": 0,
+ },
+ flag_parameters=["post_processing_tdc"],
+ )
+
+ with t[3]:
+ # Parameters for MetaboliteAdductDecharger TOPP tool.
+ # self.ui.input_TOPP("FeatureLinkerUnlabeledKD")
+ self.ui.input_TOPP(
+ "IDFilter",
+ custom_defaults={
+ "score:type_peptide": "q-value",
+ "score:psm": 0.10,
+ },
+ tool_instance_name="IDFilter-strict",
+ )
+ with t[4]:
+ st.info("""
+ **Quantification (ProteomicsLFQ):**
+ * **intThreshold**: Peak intensity threshold applied in seed detection.
+ * **psmFDR**: FDR threshold for sub-protein level (e.g. 0.05=5%). Use -FDR_type to choose the level. Cutoff is applied at the highest level. If Bayesian inference was chosen, it is equivalent with a peptide FDR
+ * **proteinFDR**: Protein FDR threshold (0.05=5%).
+ """)
+ self.ui.input_TOPP(
+ "IDMapper",
+ custom_defaults={
+ "threads": 8,
+ "debug": 0,
+ }
+ )
+ with t[5]:
+ self.ui.input_TOPP(
+ "FileMerger",
+ custom_defaults={
+ "in_type": "consensusXML",
+ "append_method": "append_cols",
+ "annotate_file_origin": True,
+ "threads": 8,
+ },
+ flag_parameters=["annotate_file_origin"]
+ )
+ with t[6]:
+ self.ui.input_TOPP(
+ "ProteinInference",
+ custom_defaults={
+ "threads": 8,
+ "picked_decoy_string": "DECOY_",
+ "picked_fdr": "true",
+ "protein_fdr": "true",
+ "Algorithm:use_shared_peptides": "true",
+ "Algorithm:annotate_indistinguishable_groups": "true",
+ "Algorithm:score_type": "PEP",
+ "Algorithm:score_aggregation_method": "best",
+ "Algorithm:min_peptides_per_protein": 1,
+ }
+ )
+ with t[7]:
# A single checkbox widget for workflow logic.
- self.ui.input_widget("run-python-script", False, "Run custom Python script")
+ # self.ui.input_widget("run-python-script", False, "Run custom Python script") *
# Generate input widgets for a custom Python tool, located at src/python-tools.
# Parameters are specified within the file in the DEFAULTS dictionary.
- self.ui.input_python("example")
+ # self.ui.input_python("example") *
+ self.ui.input_TOPP(
+ "IDFilter",
+ custom_defaults={
+ "score:type_protein": "q-value",
+ "score:proteingroup": 0.01,
+ "score:psm": 0.01,
+ "delete_unreferenced_peptide_hits": True,
+ "remove_decoys": True
+ },
+ flag_parameters=["delete_unreferenced_peptide_hits", "remove_decoys"],
+ tool_instance_name="IDFilter-lenient",
+ )
+ with t[8]:
+ self.ui.input_TOPP(
+ "IDConflictResolver",
+ custom_defaults={
+ "threads": 4,
+ }
+ )
+
+ with t[9]:
+ self.ui.input_TOPP(
+ "ProteinQuantifier",
+ custom_defaults={
+ "method": "top",
+ "top:N": 3,
+ "top:aggregate": "median",
+ "top:include_all": True,
+ "ratios": True,
+ "threads": 8,
+ "debug": 0,
+ },
+ flag_parameters=["top:include_all", "ratios"]
+ )
+ with t[10]:
+ st.markdown("### π§ͺ TMT Sample Group Assignment")
+
+ # 1. Determine TMT type (e.g., tmt10plex, tmt16plex)
+ target_key = f"{self.parameter_manager.topp_param_prefix}IsobaricAnalyzer:1:type"
+ selected_tmt = st.session_state.get(target_key, "tmt12plex")
+
+ if "tmt" in selected_tmt:
+ import re
+ # Extract the number to determine the plex count
+ num_plex_match = re.search(r'\d+', selected_tmt)
+ if num_plex_match:
+ num_plex = int(num_plex_match.group())
+ all_channels = [f"sample{i+1}" for i in range(num_plex)]
+
+ st.info(
+ "Enter a group name for each TMT channel.\n\n"
+ "Type **'skip'** for channels you wish to skip. (e.g., control, case, skip)"
+ )
+
+ # 2. Create an input_widget for each channel (automatically saved to params.json)
+ cols = st.columns(2)
+ for i, ch in enumerate(all_channels):
+ with cols[i % 2]:
+ self.ui.input_widget(
+ key=f"TMT-group-{ch}",
+ default="",
+ name=f"Group for {ch}",
+ widget_type="text",
+ help="Enter group name or 'skip' to ignore this channel.",
+ )
+
+ # 3. Read values from params.json and construct a dictionary in tmt_group_map format
+ # (This can be used later to filter DataFrames in subsequent logic)
+ self.params = self.parameter_manager.get_parameters_from_json()
+
+ tmt_group_map = {}
+ for i, ch in enumerate(all_channels):
+ # Retrieve stored value (default is empty string)
+ group_val = self.params.get(f"TMT-group-{ch}", "")
+ tmt_group_map[str(i)] = group_val
+
+ # For data inspection (remove if not needed)
+ if st.checkbox("Show current TMT mapping"):
+ st.json(tmt_group_map)
+
+ # 4. Clean up parameters from unused/previous TMT settings
+ all_possible_channel_keys = {f"TMT-group-{ch}" for ch in all_channels}
+ orphaned_keys = [
+ k for k in self.params.keys()
+ if k.startswith("TMT-group-") and k not in all_possible_channel_keys
+ ]
+
+ if orphaned_keys:
+ for key in orphaned_keys:
+ del self.params[key]
+ self.parameter_manager.save_parameters()
+
+ else:
+ st.warning("Please select a TMT type in the parameters first.")
+ # with t[10]:
+ # st.markdown("### π§ͺ Sample Group Assignment")
+ # target_key = f"{self.parameter_manager.topp_param_prefix}IsobaricAnalyzer:1:type"
+ # selected_tmt = st.session_state.get(target_key, "tmt12plex")
+
+ # if "tmt" in selected_tmt:
+ # import re
+ # num_plex = int(re.search(r'\d+', selected_tmt).group())
+ # all_channels = [f"sample{i+1}" for i in range(num_plex)]
+
+ # @st.fragment
+ # def render_group_assignment():
+ # exclude_channels = st.multiselect(
+ # "Select samples to exclude.",
+ # options=all_channels,
+ # key="exclude_selector",
+ # help="Samples selected here will be excluded from the dataframe."
+ # )
+ # st.write("exclude_channels:", exclude_channels)
+
+ # exclude_indices = [i for i, ch in enumerate(all_channels) if ch in exclude_channels]
+
+ # st.write("exclude_indices:", exclude_indices)
+ # st.session_state["tmt_exclude_indices"] = exclude_indices
+
+ # keep_channels = [ch for ch in all_channels if ch not in exclude_channels]
+
+ # st.info("Enter group names for remaining samples.")
+ # group_mapping = {}
+
+ # if keep_channels:
+ # cols = st.columns(2)
+ # for idx, ch in enumerate(keep_channels):
+ # original_idx = all_channels.index(ch)
+ # with cols[idx % 2]:
+ # group_name = st.text_input(
+ # f"Group for {ch}",
+ # value=st.session_state.get("tmt_group_map", {}).get(original_idx, ""),
+ # placeholder="e.g. Control or Case",
+ # key=f"input_{ch}_{len(keep_channels)}"
+ # )
+ # group_mapping[original_idx] = group_name
+ # else:
+ # st.warning("All samples were selected for exclusion.")
+
+ # st.session_state["tmt_group_map"] = group_mapping
+
+ # if st.checkbox("Saved session data check"):
+ # st.write("Indices to exclude:", st.session_state["tmt_exclude_indices"])
+ # st.write("Groups for remaining indices:", st.session_state["tmt_group_map"])
+
+ # render_group_assignment()
+
+ # else:
+ # st.warning("Please select a TMT type in the parameters first.")
+
def execution(self) -> None:
# Any parameter checks, here simply checking if mzML files are selected
if not self.params["mzML-files"]:
self.logger.log("ERROR: No mzML files selected.")
return
+
+ if not self.params.get("fasta-file"):
+ st.error("No FASTA file selected.")
+ return False
# Get mzML files with FileManager
in_mzML = self.file_manager.get_files(self.params["mzML-files"])
+ fasta_file = self.file_manager.get_files([self.params["fasta-file"]])[0]
+
+ if len(in_mzML) < 1:
+ st.error("At least one mzML file is required.")
+ return False
+
+ fasta_path = Path(fasta_file)
+
+ self.logger.log(f"π Loaded {len(in_mzML)} sample(s)")
+
+ if self.params.get("generate-decoys", True):
+ decoy_fasta = fasta_path.with_suffix(".decoy.fasta")
+ # Get decoy_string from DecoyDatabase params
+ decoy_string = self.params.get("DecoyDatabase", {}).get("decoy_string", "rev_")
+
+ if not decoy_fasta.exists():
+ self.logger.log("𧬠Generating decoy database...")
+ st.info("Generating decoy FASTA database...")
+ if not self.executor.run_topp(
+ "DecoyDatabase",
+ {"in": [str(fasta_path)], "out": [str(decoy_fasta)]},
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
Decoy database ready")
+ st.success(f"Using decoy FASTA: {decoy_fasta.name}")
+ database_fasta = decoy_fasta
+ else:
+ # Get decoy_string from CometAdapter params
+ decoy_string = self.params.get("CometAdapter", {}).get("PeptideIndexing:decoy_string", "rev_")
+ self.logger.log("π Using existing FASTA database")
+ st.info(f"Using original FASTA: {fasta_path.name}")
+ database_fasta = fasta_path
# Log any messages.
self.logger.log(f"Number of input mzML files: {len(in_mzML)}")
- # Prepare output files for feature detection.
- out_ffm = self.file_manager.get_files(
- in_mzML, "featureXML", "feature-detection"
- )
+ results_dir = Path(self.workflow_dir, "results")
+ iso_dir = results_dir / "isobaric_consensusXML"
+ comet_dir = results_dir / "comet_results"
+ perc_dir = results_dir / "percolator"
+ psm_filter_dir = results_dir / "psm_filter"
+ map_dir = results_dir / "idmapper"
+ merge_dir = results_dir / "merged"
+ protein_dir = results_dir / "protein"
+ msstats_dir = results_dir / "msstats"
+ quant_dir = results_dir / "quant_results"
- # Run FeatureFinderMetabo tool with input and output files.
- self.logger.log("Detecting features...")
- self.executor.run_topp(
- "FeatureFinderMetabo", input_output={"in": in_mzML, "out": out_ffm}
- )
+ iso_consensus = []
+ comet_results = []
+ percolator_results = []
+ psm_filtered = []
+ mapped_ids = []
- # Prepare input and output files for feature linking
- in_fl = self.file_manager.get_files(out_ffm, collect=True)
- out_fl = self.file_manager.get_files(
- "feature_matrix.consensusXML", set_results_dir="feature-linking"
- )
+ for d in [
+ iso_dir, comet_dir, perc_dir, psm_filter_dir,
+ map_dir, merge_dir, protein_dir, msstats_dir, quant_dir
+ ]:
+ d.mkdir(parents=True, exist_ok=True)
- # Run FeatureLinkerUnlabaeledKD with all feature maps passed at once
- self.logger.log("Linking features...")
- self.executor.run_topp(
- "FeatureLinkerUnlabeledKD", input_output={"in": in_fl, "out": out_fl}
- )
- self.logger.log("Exporting consensus features to pandas DataFrame...")
- self.executor.run_python(
- "export_consensus_feature_df", input_output={"in": out_fl[0]}
- )
- # Check if adduct detection should be run.
- if self.params["run-python-script"]:
- # Example for a custom Python tool, which is located in src/python-tools.
- self.executor.run_python("example", {"in": in_mzML})
+ for mz in in_mzML:
+ stem = Path(mz).stem
+ iso_consensus.append(str(iso_dir / f"{stem}_iso.consensusXML"))
+ comet_results.append(str(comet_dir / f"{stem}_comet.idXML"))
+ percolator_results.append(str(perc_dir / f"{stem}_comet_perc.idXML"))
+ psm_filtered.append(str(psm_filter_dir / f"{stem}_comet_perc_filter.idXML"))
+ mapped_ids.append(str(map_dir / f"{stem}_comet_perc_filter_map.consensusXML"))
+
+ merged_id = str(merge_dir / "ID_mapper_merge.consensusXML")
+ protein_id = str(protein_dir / "ID_mapper_merge_epi.consensusXML")
+ protein_filter = str(protein_dir / "ID_mapper_merge_epi_filter.consensusXML")
+ protein_resolved = str(protein_dir / "ID_mapper_merge_epi_filter_resconf.consensusXML")
+ # msstats_input = str(msstats_dir / "msstats_input.csv")
+ consensus_out = str(quant_dir / "openms_design_protein_openms.csv")
+
+ # --- IsobaricAnalyzer ---
+ self.logger.log("π·οΈ Running isobaric labeling analysis...")
+ with st.spinner("IsobaricAnalyzer"):
+ if not self.executor.run_topp(
+ "IsobaricAnalyzer",
+ {
+ "in": in_mzML,
+ "out": iso_consensus,
+ },
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
IsobaricAnalyzer complete")
+
+ # --- CometAdapter ---
+ self.logger.log("π Running peptide search...")
+ with st.spinner(f"CometAdapter ({stem})"):
+ comet_extra_params = {"database": str(database_fasta)}
+ if self.params.get("generate-decoys", True):
+ # Propagate decoy_string from DecoyDatabase
+ comet_extra_params["PeptideIndexing:decoy_string"] = decoy_string
+ if not self.executor.run_topp(
+ "CometAdapter",
+ {
+ "in": in_mzML,
+ "out": comet_results,
+ },
+ comet_extra_params,
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
CometAdapter complete")
+
+ # Get fragment tolerance from CometAdapter parameters for visualization
+ comet_params = self.parameter_manager.get_topp_parameters("CometAdapter")
+ frag_tol = comet_params.get("fragment_mass_tolerance", 0.02)
+ frag_tol_is_ppm = comet_params.get("fragment_error_units", "Da") != "Da"
+
+ # Build visualization cache for Comet results
+ results_dir_path = Path(self.workflow_dir, "results")
+ cache_dir = results_dir_path / "insight_cache"
+ cache_dir.mkdir(parents=True, exist_ok=True)
+
+ # Get mzML directory
+ mzml_dir = Path(in_mzML[0]).parent
+
+ # Build spectra cache (once, shared by all stages)
+ spectra_df = None
+ filename_to_index = {}
+
+ for idxml_file in comet_results:
+ idxml_path = Path(idxml_file)
+ cache_id_prefix = idxml_path.stem
+
+ # Parse idXML to DataFrame
+ id_df, spectra_data = parse_idxml(idxml_path)
+
+ # Build spectra cache (only once)
+ if spectra_df is None:
+ filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)}
+ spectra_df, filename_to_index = build_spectra_cache(mzml_dir, filename_to_index)
+
+ # Initialize Table component (caches itself)
+ Table(
+ cache_id=f"table_{cache_id_prefix}",
+ data=id_df.lazy(),
+ cache_path=str(cache_dir),
+ interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"},
+ column_definitions=[
+ {"field": "sequence", "title": "Sequence"},
+ {"field": "charge", "title": "Z", "sorter": "number"},
+ {"field": "mz", "title": "m/z", "sorter": "number"},
+ {"field": "rt", "title": "RT", "sorter": "number"},
+ {"field": "score", "title": "Score", "sorter": "number"},
+ {"field": "protein_accession", "title": "Proteins"},
+ ],
+ initial_sort=[{"column": "score", "dir": "asc"}],
+ index_field="id_idx",
+ )
+
+ # Initialize Heatmap component
+ Heatmap(
+ cache_id=f"heatmap_{cache_id_prefix}",
+ data=id_df.lazy(),
+ cache_path=str(cache_dir),
+ x_column="rt",
+ y_column="mz",
+ intensity_column="score",
+ interactivity={"identification": "id_idx"},
+ )
+
+ # Initialize SequenceView component
+ seq_view = SequenceView(
+ cache_id=f"seqview_{cache_id_prefix}",
+ sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({
+ "id_idx": "sequence_id",
+ "charge": "precursor_charge",
+ }),
+ peaks_data=spectra_df.lazy(),
+ filters={
+ "identification": "sequence_id",
+ "file": "file_index",
+ "spectrum": "scan_id",
+ },
+ interactivity={"peak": "peak_id"},
+ cache_path=str(cache_dir),
+ deconvolved=False,
+ annotation_config={
+ "ion_types": ["b", "y"],
+ "neutral_losses": True,
+ "tolerance": frag_tol,
+ "tolerance_ppm": frag_tol_is_ppm,
+ },
+ )
+
+ # Initialize LinePlot from SequenceView
+ LinePlot.from_sequence_view(
+ seq_view,
+ cache_id=f"lineplot_{cache_id_prefix}",
+ cache_path=str(cache_dir),
+ title="Annotated Spectrum",
+ styling={
+ "unhighlightedColor": "#CCCCCC",
+ "highlightColor": "#E74C3C",
+ "selectedColor": "#F3A712",
+ },
+ )
+
+ self.logger.log("β
Peptide search complete")
+
+ # --- PercolatorAdapter ---
+ self.logger.log("π Running rescoring...")
+ with st.spinner(f"PercolatorAdapter"):
+ if not self.executor.run_topp(
+ "PercolatorAdapter",
+ {
+ "in": comet_results,
+ "out": percolator_results,
+ },
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ # Build visualization cache for Percolator results
+ for idxml_file in percolator_results:
+ idxml_path = Path(idxml_file)
+ cache_id_prefix = idxml_path.stem
+
+ # Parse idXML to DataFrame
+ id_df, spectra_data = parse_idxml(idxml_path)
+
+ # Initialize Table component (caches itself)
+ Table(
+ cache_id=f"table_{cache_id_prefix}",
+ data=id_df.lazy(),
+ cache_path=str(cache_dir),
+ interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"},
+ column_definitions=[
+ {"field": "sequence", "title": "Sequence"},
+ {"field": "charge", "title": "Z", "sorter": "number"},
+ {"field": "mz", "title": "m/z", "sorter": "number"},
+ {"field": "rt", "title": "RT", "sorter": "number"},
+ {"field": "score", "title": "Score", "sorter": "number"},
+ {"field": "protein_accession", "title": "Proteins"},
+ ],
+ initial_sort=[{"column": "score", "dir": "asc"}],
+ index_field="id_idx",
+ )
+
+ # Initialize Heatmap component
+ Heatmap(
+ cache_id=f"heatmap_{cache_id_prefix}",
+ data=id_df.lazy(),
+ cache_path=str(cache_dir),
+ x_column="rt",
+ y_column="mz",
+ intensity_column="score",
+ interactivity={"identification": "id_idx"},
+ )
+
+ # Initialize SequenceView component
+ seq_view = SequenceView(
+ cache_id=f"seqview_{cache_id_prefix}",
+ sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({
+ "id_idx": "sequence_id",
+ "charge": "precursor_charge",
+ }),
+ peaks_data=spectra_df.lazy(),
+ filters={
+ "identification": "sequence_id",
+ "file": "file_index",
+ "spectrum": "scan_id",
+ },
+ interactivity={"peak": "peak_id"},
+ cache_path=str(cache_dir),
+ deconvolved=False,
+ annotation_config={
+ "ion_types": ["b", "y"],
+ "neutral_losses": True,
+ "tolerance": frag_tol,
+ "tolerance_ppm": frag_tol_is_ppm,
+ },
+ )
+
+ # Initialize LinePlot from SequenceView
+ LinePlot.from_sequence_view(
+ seq_view,
+ cache_id=f"lineplot_{cache_id_prefix}",
+ cache_path=str(cache_dir),
+ title="Annotated Spectrum",
+ styling={
+ "unhighlightedColor": "#CCCCCC",
+ "highlightColor": "#E74C3C",
+ "selectedColor": "#F3A712",
+ },
+ )
+
+ self.logger.log("β
PercolatorAdapter complete")
+
+ # --- IDFilter ---
+ self.logger.log("π§ Filtering identifications...")
+ with st.spinner(f"IDFilter"):
+ if not self.executor.run_topp(
+ "IDFilter",
+ {
+ "in": percolator_results,
+ "out": psm_filtered,
+ },
+ tool_instance_name="IDFilter-strict"
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
IDFilter-strict complete")
+
+ # Build visualization cache for Filter results
+ for idxml_file in psm_filtered:
+ idxml_path = Path(idxml_file)
+ cache_id_prefix = idxml_path.stem
+
+ # Parse idXML to DataFrame
+ id_df, spectra_data = parse_idxml(idxml_path)
+
+ # Initialize Table component (caches itself)
+ Table(
+ cache_id=f"table_{cache_id_prefix}",
+ data=id_df.lazy(),
+ cache_path=str(cache_dir),
+ interactivity={"file": "file_index", "spectrum": "scan_id", "identification": "id_idx"},
+ column_definitions=[
+ {"field": "sequence", "title": "Sequence"},
+ {"field": "charge", "title": "Z", "sorter": "number"},
+ {"field": "mz", "title": "m/z", "sorter": "number"},
+ {"field": "rt", "title": "RT", "sorter": "number"},
+ {"field": "score", "title": "Score", "sorter": "number"},
+ {"field": "protein_accession", "title": "Proteins"},
+ ],
+ initial_sort=[{"column": "score", "dir": "asc"}],
+ index_field="id_idx",
+ )
+
+ # Initialize Heatmap component
+ Heatmap(
+ cache_id=f"heatmap_{cache_id_prefix}",
+ data=id_df.lazy(),
+ cache_path=str(cache_dir),
+ x_column="rt",
+ y_column="mz",
+ intensity_column="score",
+ interactivity={"identification": "id_idx"},
+ )
+
+ # Initialize SequenceView component
+ seq_view = SequenceView(
+ cache_id=f"seqview_{cache_id_prefix}",
+ sequence_data=id_df.lazy().select(["id_idx", "sequence", "charge", "file_index", "scan_id"]).rename({
+ "id_idx": "sequence_id",
+ "charge": "precursor_charge",
+ }),
+ peaks_data=spectra_df.lazy(),
+ filters={
+ "identification": "sequence_id",
+ "file": "file_index",
+ "spectrum": "scan_id",
+ },
+ interactivity={"peak": "peak_id"},
+ cache_path=str(cache_dir),
+ deconvolved=False,
+ annotation_config={
+ "ion_types": ["b", "y"],
+ "neutral_losses": True,
+ "tolerance": frag_tol,
+ "tolerance_ppm": frag_tol_is_ppm,
+ },
+ )
+
+ # Initialize LinePlot from SequenceView
+ LinePlot.from_sequence_view(
+ seq_view,
+ cache_id=f"lineplot_{cache_id_prefix}",
+ cache_path=str(cache_dir),
+ title="Annotated Spectrum",
+ styling={
+ "unhighlightedColor": "#CCCCCC",
+ "highlightColor": "#E74C3C",
+ "selectedColor": "#F3A712",
+ },
+ )
+
+ # --- IDMapper ---
+ self.logger.log("πΊοΈ Mapping IDs to isobaric consensus features...")
+ for iso, psm, mapped in zip(iso_consensus, psm_filtered, mapped_ids):
+ iso_stem = Path(iso).stem
+ with st.spinner(f"IDMapper ({iso_stem})"):
+ if not self.executor.run_topp(
+ "IDMapper",
+ {
+ "in": [iso],
+ "id": [psm],
+ "out": [mapped],
+ },
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
IDMapper complete")
+
+ # --- FileMerger ---
+ self.logger.log("π Merging mapped consensus files...")
+ with st.spinner("FileMerger"):
+ if not self.executor.run_topp(
+ "FileMerger",
+ {
+ "in": mapped_ids,
+ "out": [merged_id],
+ },
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
FileMerger complete")
+
+ # --- ProteinInference ---
+ self.logger.log("π§© Running protein inference...")
+ with st.spinner("ProteinInference"):
+ if not self.executor.run_topp(
+ "ProteinInference",
+ {
+ "in": [merged_id],
+ "out": [protein_id],
+ },
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
ProteinInference complete")
+
+ # --- IDFilter-lenient (Protein) ---
+ self.logger.log("π¬ Filtering proteins...")
+ with st.spinner("IDFilter (Protein)"):
+ if not self.executor.run_topp(
+ "IDFilter",
+ {
+ "in": [protein_id],
+ "out": [protein_filter],
+ },
+ tool_instance_name="IDFilter-lenient"
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
IDFilter-lenient (Protein) complete")
+
+ # ================================
+ # β¨ NEW: 8οΈβ£ IDConflictResolver (protein_filter β protein_resolved)
+ # ================================
+ self.logger.log("βοΈ Resolving ID conflicts...")
+ with st.spinner("IDConflictResolver"):
+ if not self.executor.run_topp(
+ "IDConflictResolver",
+ {
+ "in": [protein_filter],
+ "out": [protein_resolved],
+ },
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
IDConflictResolver complete")
+
+ # ================================
+ # β¨ NEW: π ProteinQuantifier (protein_resolved β consensus_out)
+ # ================================
+ self.logger.log("π Running protein quantification...")
+ with st.spinner("ProteinQuantifier"):
+ if not self.executor.run_topp(
+ "ProteinQuantifier",
+ {
+ "in": [protein_resolved],
+ "out": [consensus_out],
+ },
+ ):
+ self.logger.log("Workflow stopped due to error")
+ return False
+ self.logger.log("β
ProteinQuantifier complete")
+ self.logger.log("π WORKFLOW FINISHED")
@st.fragment
def results(self) -> None:
- @st.fragment
- def show_consensus_features():
- df = pd.read_csv(file, sep="\t", index_col=0)
- st.metric("number of consensus features", df.shape[0])
- c1, c2 = st.columns(2)
- rows = c1.dataframe(df, selection_mode="multi-row", on_select="rerun")[
- "selection"
- ]["rows"]
- if rows:
- df = df.iloc[rows, 4:]
- fig = px.bar(df, barmode="group", labels={"value": "intensity"})
- with c2:
- show_fig(fig, "consensus-feature-intensities")
+ st.title("π DDA-TMT Analysis Results")
+
+ # Tab configuration (TMT-specific)
+ tabs = st.tabs([
+ "π Identification",
+ "π Rescoring & Filter",
+ "π TMT Reporter Intensity",
+ "𧬠Protein Grouping",
+ "π Statistical Analysis"
+ ])
+
+ id_tab, filter_tab, tmt_tab, prot_tab, stat_tab = tabs
+
+ # Helper: idXML to DataFrame (Maintain existing code)
+ def idxml_to_df(idxml_file):
+ proteins, peptides = [], []
+ IdXMLFile().load(str(idxml_file), proteins, peptides)
+ records = []
+ for pep in peptides:
+ rt, mz = pep.getRT(), pep.getMZ()
+ for h in pep.getHits():
+ records.append({
+ "RT": rt, "m/z": mz, "Sequence": h.getSequence().toString(),
+ "Charge": h.getCharge(), "Score": h.getScore(),
+ "Proteins": ",".join([ev.getProteinAccession() for ev in h.getPeptideEvidences()])
+ })
+ return pd.DataFrame(records)
+
+ # 1. Identification (Comet)
+ with id_tab:
+ comet_files = sorted(Path(self.workflow_dir, "results", "comet").glob("*.idXML"))
+ if comet_files:
+ selected_file = st.selectbox("Select Identification file", comet_files, key="comet_sb")
+ df_comet = idxml_to_df(selected_file)
+ st.dataframe(df_comet, use_container_width=True)
+ # Scatter plot code can remain the same
+
+ # 2. Filtering (Percolator + IDFilter)
+ with filter_tab:
+ filter_files = sorted(Path(self.workflow_dir, "results", "psm_filter").glob("*.idXML"))
+ if filter_files:
+ selected_f = st.selectbox("Select Filtered file", filter_files, key="filter_sb")
+ df_filter = idxml_to_df(selected_f)
+ st.success(f"Identified {len(df_filter)} PSMs after filtering (FDR < 0.01)")
+ st.dataframe(df_filter, use_container_width=True)
+
+ # 3. TMT Reporter Intensity (IsobaricAnalyzer)
+ with tmt_tab:
+ st.subheader("TMT Reporter Ion Intensity Distribution")
+ # Extract channel-specific intensities from IsobaricAnalyzer consensusXML
+ iso_files = sorted(Path(self.workflow_dir, "results", "isobaric_consensusXML").glob("*.consensusXML"))
+ if iso_files:
+ sel_iso = st.selectbox("Select TMT result", iso_files)
+ # For simple visualization, it is better to use the quantitative results CSV if available
+ # Example distribution using the final CSV (openms_design_protein_openms.csv)
+ quant_file = Path(self.workflow_dir, "results", "quant", "openms_design_protein_openms.csv")
+ if quant_file.exists():
+ df_q = pd.read_csv(quant_file)
+ # Filter channel columns (usually prefixed with 'Abundance_' or specific tags)
+ # Column names need to be verified based on the OpenMS output structure
+ intensity_cols = [c for c in df_q.columns if 'intensity' in c.lower() or 'abundance' in c.lower()]
+ if intensity_cols:
+ fig_box = px.box(df_q.melt(value_vars=intensity_cols), x="variable", y="value", log_y=True,
+ title="Log-scaled Reporter Intensity Distribution per Channel")
+ st.plotly_chart(fig_box)
+
+ # 4. Protein Grouping & Quantification
+ with prot_tab:
+ st.subheader("𧬠Final Protein-Level Results")
+ final_csv = Path(self.workflow_dir, "results", "quant", "openms_design_protein_openms.csv")
+
+ if final_csv.exists():
+ df_final = pd.read_csv(final_csv)
+ st.info(f"Total Protein Groups: {df_final['ProteinName'].nunique() if 'ProteinName' in df_final.columns else len(df_final)}")
+ st.dataframe(df_final, use_container_width=True)
+
+ # CSV Download Button
+ st.download_button("Download Results", df_final.to_csv(index=False), "TMT_Results.csv")
else:
- st.info(
- "π‘ Select one ore more rows in the table to show a barplot with intensities."
- )
+ st.warning("Final quantification CSV not found.")
- file = Path(
- self.workflow_dir, "results", "feature-linking", "feature_matrix.tsv"
- )
- if file.exists():
- show_consensus_features()
- else:
- st.warning("No consensus feature file found. Please run workflow first.")
+ # 5. Statistical Analysis (Volcano Plot etc.)
+ with stat_tab:
+ final_csv = Path(self.workflow_dir, "results", "quant", "openms_design_protein_openms.csv")
+
+ if not final_csv.exists():
+ st.warning("Analysis results not found. Please run the workflow first.")
+ return
+
+ try:
+ # 1οΈβ£ Data loading and preprocessing
+ df_quant = pd.read_csv(final_csv)
+
+ # Identify intensity (abundance) columns from TMT results
+ # Typically starts with 'abundance_' or 'intensity_' depending on OpenMS output format
+ intensity_cols = [c for c in df_quant.columns if 'abundance' in c.lower() or 'intensity' in c.lower()]
+
+ if len(intensity_cols) < 2:
+ st.error("Not enough intensity columns found for comparison.")
+ return
+
+ # 2οΈβ£ Group setup (use existing source's group_map)
+ # In TMT, multiple channels (samples) exist within a single file,
+ # so map which column belongs to which group (Control/Case).
+ st.subheader("Group Comparison Setup")
+ col1, col2 = st.columns(2)
+ with col1:
+ group_a_cols = st.multiselect("Select Control Group Channels", intensity_cols, default=[intensity_cols[0]])
+ with col2:
+ group_b_cols = st.multiselect("Select Case Group Channels", intensity_cols, default=[intensity_cols[-1]])
+
+ if st.button("Run Statistical Analysis"):
+ stats_results = []
+
+ for _, row in df_quant.iterrows():
+ g1 = row[group_a_cols].values.astype(float)
+ g2 = row[group_b_cols].values.astype(float)
+
+ # Calculate Log2 Fold Change
+ log2fc = np.log2(np.mean(g2) / np.mean(g1)) if np.mean(g1) > 0 else 0
+
+ # T-test (p-value)
+ _, pval = ttest_ind(g1, g2, nan_policy='omit')
+
+ stats_results.append({
+ "Protein": row.get("ProteinName", "Unknown"),
+ "log2FC": log2fc,
+ "pvalue": pval,
+ "-log10_pvalue": -np.log10(pval) if pval > 0 else 0
+ })
+
+ df_stats = pd.DataFrame(stats_results)
+
+ # 3οΈβ£ Volcano plot visualization
+ st.divider()
+ st.subheader("Volcano Plot")
+
+ # Define colors to highlight significant proteins
+ df_stats['Significance'] = 'Normal'
+ df_stats.loc[(df_stats['log2FC'] > 1) & (df_stats['pvalue'] < 0.05), 'Significance'] = 'Up'
+ df_stats.loc[(df_stats['log2FC'] < -1) & (df_stats['pvalue'] < 0.05), 'Significance'] = 'Down'
+
+ fig_volcano = px.scatter(
+ df_stats, x="log2FC", y="-log10_pvalue",
+ color="Significance",
+ hover_data=["Protein"],
+ color_discrete_map={'Up': 'red', 'Down': 'blue', 'Normal': 'gray'},
+ title=f"Comparison: {', '.join(group_b_cols)} vs {', '.join(group_a_cols)}"
+ )
+
+ # Guidelines (p=0.05, FC=2)
+ fig_volcano.add_hline(y=-np.log10(0.05), line_dash="dash", line_color="black")
+ fig_volcano.add_vline(x=1, line_dash="dash", line_color="black")
+ fig_volcano.add_vline(x=-1, line_dash="dash", line_color="black")
+
+ st.plotly_chart(fig_volcano, use_container_width=True)
+ st.dataframe(df_stats.sort_values("pvalue"), use_container_width=True)
+
+ except Exception as e:
+ st.error(f"Error during statistical analysis: {e}")
\ No newline at end of file
diff --git a/src/common/results_helpers.py b/src/common/results_helpers.py
new file mode 100644
index 00000000..8ff3110d
--- /dev/null
+++ b/src/common/results_helpers.py
@@ -0,0 +1,483 @@
+"""Helper functions for results pages."""
+import re
+import pandas as pd
+import polars as pl
+import numpy as np
+import streamlit as st
+from pathlib import Path
+from scipy.stats import ttest_ind
+from pyopenms import IdXMLFile, MSExperiment, MzMLFile
+from src.workflow.ParameterManager import ParameterManager
+from statsmodels.stats.multitest import multipletests
+
+def get_workflow_dir(workspace):
+ """Get the workflow directory path."""
+ return Path(workspace, "topp-workflow")
+
+
+def idxml_to_df(idxml_file):
+ """Parse idXML file and return DataFrame with peptide hits."""
+ proteins = []
+ peptides = []
+ IdXMLFile().load(str(idxml_file), proteins, peptides)
+
+ records = []
+ for pep in peptides:
+ rt = pep.getRT()
+ mz = pep.getMZ()
+ for h in pep.getHits():
+ protein_refs = [ev.getProteinAccession() for ev in h.getPeptideEvidences()]
+ records.append({
+ "RT": rt,
+ "m/z": mz,
+ "Sequence": h.getSequence().toString(),
+ "Charge": h.getCharge(),
+ "Score": h.getScore(),
+ "Proteins": ",".join(protein_refs) if protein_refs else None,
+ })
+
+ df = pd.DataFrame(records)
+ if not df.empty:
+ df["Charge"] = df["Charge"].astype(str)
+ df["Charge_num"] = df["Charge"].astype(int)
+ return df
+
+
+def create_psm_scatter_plot(df_plot):
+ """Create a scatter plot for PSM visualization."""
+ import plotly.express as px
+
+ fig = px.scatter(
+ df_plot,
+ x="RT",
+ y="m/z",
+ color="Score",
+ custom_data=["index", "Sequence", "Proteins"],
+ color_continuous_scale=["#a6cee3", "#1f78b4", "#08519c", "#08306b"],
+ )
+ fig.update_traces(
+ marker=dict(size=6, opacity=0.8),
+ hovertemplate='Index: %{customdata[0]}
'
+ + 'RT: %{x:.2f}
'
+ + 'm/z: %{y:.4f}
'
+ + 'Score: %{marker.color:.3f}
'
+ + 'Sequence: %{customdata[1]}
'
+ + 'Proteins: %{customdata[2]}
'
+ + ''
+ )
+ fig.update_layout(
+ coloraxis_colorbar=dict(title="Score"),
+ hovermode="closest"
+ )
+ return fig
+
+
+def extract_scan_from_ref(spec_ref: str) -> int:
+ """Extract scan number from spectrum reference string.
+
+ Format: "controllerType=0 controllerNumber=1 scan=1234"
+ """
+ match = re.search(r'scan=(\d+)', spec_ref)
+ return int(match.group(1)) if match else 0
+
+
+def extract_scan_number(native_id: str) -> int:
+ """Extract scan number from native ID."""
+ match = re.search(r'scan=(\d+)', native_id)
+ return int(match.group(1)) if match else 0
+
+
+def extract_filename_from_idxml(idxml_path: Path) -> str:
+ """Derive mzML filename from idXML filename."""
+ stem = idxml_path.stem
+ for suffix in ['_comet', '_per', '_filter']:
+ stem = stem.replace(suffix, '')
+ return f"{stem}.mzML"
+
+
+def parse_idxml(idxml_path: Path) -> tuple[pl.DataFrame, list[str]]:
+ """Parse idXML and return DataFrame for openms_insight.
+
+ Returns:
+ Tuple of (id_df, spectra_data list of source filenames)
+ """
+ proteins = []
+ peptides = []
+ IdXMLFile().load(str(idxml_path), proteins, peptides)
+
+ # Derive mzML filename from idXML filename (e.g., 02COVID_filter.idXML -> 02COVID.mzML)
+ spectra_data = [extract_filename_from_idxml(idxml_path)]
+
+ # Build filename to index mapping
+ filename_to_index = {Path(f).name: i for i, f in enumerate(spectra_data)}
+
+ records = []
+ for pep in peptides:
+ # Get spectrum reference from meta value (key may be bytes or string)
+ spec_ref = ""
+ if pep.metaValueExists("spectrum_reference"):
+ spec_ref = pep.getMetaValue("spectrum_reference")
+ if isinstance(spec_ref, bytes):
+ spec_ref = spec_ref.decode()
+ scan_id = extract_scan_from_ref(spec_ref)
+
+ # Get file index from id_merge_index or derive from filename
+ file_index = pep.getMetaValue("id_merge_index") if pep.metaValueExists("id_merge_index") else 0
+ filename = spectra_data[file_index] if file_index < len(spectra_data) else ""
+
+ for h in pep.getHits():
+ records.append({
+ "id_idx": len(records),
+ "scan_id": scan_id,
+ "file_index": file_index,
+ "filename": Path(filename).name if filename else "",
+ "sequence": h.getSequence().toString(),
+ "charge": h.getCharge(),
+ "mz": pep.getMZ(),
+ "rt": pep.getRT(),
+ "score": h.getScore(),
+ "protein_accession": ";".join([ev.getProteinAccession() for ev in h.getPeptideEvidences()]),
+ })
+
+ return pl.DataFrame(records), spectra_data
+
+
+def build_spectra_cache(mzml_dir: Path, filename_to_index: dict) -> tuple[pl.DataFrame, dict]:
+ """Extract MS2 spectra from mzML files and return DataFrame.
+
+ Args:
+ mzml_dir: Directory containing mzML files
+ filename_to_index: Dict mapping filename to file_index
+
+ Returns:
+ Tuple of (spectra_df, updated filename_to_index)
+ """
+ records = []
+ peak_id = 0
+
+ for mzml_path in sorted(mzml_dir.glob("*.mzML")):
+ # Get or create file index
+ if mzml_path.name not in filename_to_index:
+ filename_to_index[mzml_path.name] = len(filename_to_index)
+ file_index = filename_to_index[mzml_path.name]
+
+ exp = MSExperiment()
+ MzMLFile().load(str(mzml_path), exp)
+
+ for spec in exp:
+ if spec.getMSLevel() != 2:
+ continue
+ scan_id = extract_scan_number(spec.getNativeID())
+ mz_array, int_array = spec.get_peaks()
+
+ for mz, intensity in zip(mz_array, int_array):
+ records.append({
+ "peak_id": peak_id,
+ "file_index": file_index,
+ "scan_id": scan_id,
+ "mass": float(mz), # Use "mass" not "mz"
+ "intensity": float(intensity),
+ })
+ peak_id += 1
+
+ return pl.DataFrame(records), filename_to_index
+
+
+@st.cache_data
+def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None:
+ """Load CSV, compute stats (log2FC, p-value), build pivot_df and expr_df.
+
+ Args:
+ workspace_path: Path to the workspace directory
+ csv_mtime: Modification time of CSV file (used as cache key)
+
+ Returns:
+ Tuple of (pivot_df, expr_df, group_map) or None if data unavailable
+ """
+ workflow_dir = get_workflow_dir(Path(workspace_path))
+ quant_dir = workflow_dir / "results" / "quant_results"
+
+ if not quant_dir.exists():
+ return None
+
+ csv_files = sorted(quant_dir.glob("*.csv"))
+ if not csv_files:
+ return None
+
+ csv_file = csv_files[0]
+
+ try:
+ df = pd.read_csv(csv_file, sep="\t", comment="#", engine="python")
+ except Exception:
+ return None
+
+ st.write(f"Loaded quantification data from {csv_file.name} (mtime: {csv_mtime})")
+
+ if df.empty:
+ return None
+
+ # ratio column removal
+ df = df.loc[:, ~df.columns.str.contains('ratio', case=False)]
+
+ # exclude_indices = st.session_state.get("tmt_exclude_indices", [])
+ # group_map = st.session_state.get("tmt_group_map", {})
+ # Get group mapping from parameters
+ param_manager = ParameterManager(Path(workflow_dir))
+ params = param_manager.get_parameters_from_json()
+ group_map = {}
+ for key, value in params.items():
+ if key.startswith("TMT-group-") and value:
+ # Extract the numeric part from keys like "TMT-group-sample1"
+ match = re.search(r'sample(\d+)', key)
+ if match:
+ # Subtract 1 to convert to a 0-based index (0, 1, 2...).
+ # If your samples are already 0-based, remove the -1 adjustment.
+ index = str(int(match.group(1)) - 1)
+ group_map[index] = value
+
+ # 1. Extract keys labeled as "skip" from group_map as integer list
+ exclude_indices = [
+ int(k) for k, v in group_map.items() if v.lower() == "skip"
+ ]
+
+ # 2. Remove "skip" entries from group_map (keep only actual group info)
+ group_map = {
+ int(k): v for k, v in group_map.items() if v.lower() != "skip"
+ }
+
+ start_column_offset = 4
+
+ # st.write("exclude_indices:", exclude_indices)
+ # st.write("group_map:", group_map)
+
+ if not group_map:
+ st.warning("β οΈ Group mapping information is missing. Please configure sample groups in the Setup page.")
+ return None
+
+ if exclude_indices:
+ # st.write("Current columns:", df.columns.tolist())
+ # st.write("Number of columns:", len(df.columns))
+ # st.write("Exclude indices:", exclude_indices)
+ # st.write("Offset:", start_column_offset)
+ cols_to_drop = [df.columns[i + start_column_offset] for i in exclude_indices]
+ df_cleaned = df.drop(columns=cols_to_drop)
+ else:
+ df_cleaned = df.copy()
+
+ if group_map:
+ # Create new row data (defaulting to empty strings)
+ # Create a list with the same length as the column order of df_cleaned
+ new_row = [""] * len(df_cleaned.columns)
+ new_row[0] = "Group"
+
+ # Get the column names of the current dataframe as a list
+ current_cols = df_cleaned.columns.tolist()
+ original_cols = df.columns.tolist()
+
+ for col_name in current_cols[start_column_offset:]:
+ # Check the original index position of this column
+ original_idx = original_cols.index(col_name) - start_column_offset
+ col_pos = current_cols.index(col_name)
+ new_row[col_pos] = group_map.get(original_idx, "NA")
+
+ # Insert the row at the top of the dataframe
+ # Create a new DF and concatenate to prepend the row to existing data
+ group_df = pd.DataFrame([new_row], columns=df_cleaned.columns)
+ df_with_groups = pd.concat([group_df, df_cleaned], ignore_index=True)
+
+ # drop_msg = f"{len(exclude_indices)} channels dropped" if exclude_indices else "No channels dropped"
+ # st.success(f"β
{drop_msg} and Group names have been inserted at the top of the data.")
+
+ # st.write("### Data Preview with Group Information")
+ # st.dataframe(df_with_groups.head(10))
+
+ if group_map and len(set(group_map.values())) >= 2:
+ # Prepare data for calculation
+ # Extract group information from row 0 of df_with_groups (the newly added Group row)
+ # Actual sample data starts from the 5th column (index 4)
+ group_info_row = df_with_groups.iloc[0]
+
+ # Get unique group names (excluding NA)
+ unique_groups = sorted([g for g in set(group_map.values()) if g != "NA"])
+ g1_name, g2_name = unique_groups[0], unique_groups[1]
+
+ # Extract numerical data for statistical calculation (from row 1 and column index 4 onwards)
+ # Convert to numeric type (to prevent calculation errors)
+ numeric_data = df_with_groups.iloc[1:, 4:].apply(pd.to_numeric, errors='coerce')
+
+ # Column indexing by group
+ # Categorize columns based on the values in the Group row
+ g1_cols = [col for col in numeric_data.columns if group_info_row[col] == g1_name]
+ g2_cols = [col for col in numeric_data.columns if group_info_row[col] == g2_name]
+
+ # Calculate log2FC and p-value for each row
+ def run_stats(row):
+ v1 = row[g1_cols].dropna()
+ v2 = row[g2_cols].dropna()
+
+ # log2FC (Group2 / Group1)
+ m1, m2 = v1.mean(), v2.mean()
+ l2fc = np.log2(m2 / m1) if m1 > 0 and m2 > 0 else np.nan
+
+ # p-value (T-test)
+ if len(v1) > 1 and len(v2) > 1:
+ _, pval = ttest_ind(v1, v2, equal_var=False)
+ else:
+ pval = np.nan
+ return pd.Series([l2fc, pval])
+
+ stats_results = numeric_data.apply(run_stats, axis=1)
+ stats_results.columns = ['log2FC', 'p-value']
+ # Add Adjusted p-value (FDR) calculation
+ if not stats_results['p-value'].isna().all():
+ # Select only rows that contain p-values
+ mask = stats_results['p-value'].notna()
+ # Apply Benjamini-Hochberg (BH) correction
+ _, p_adj, _, _ = multipletests(stats_results.loc[mask, 'p-value'], method='fdr_bh')
+ stats_results.loc[mask, 'p-adj'] = p_adj
+ else:
+ stats_results['p-adj'] = np.nan
+
+ # Construct the final dataframe (Based on df_cleaned - excluding the group row)
+ # Insert calculation results into the 2nd and 3rd column positions
+ pivot_df = df_cleaned.copy()
+ pivot_df.insert(1, "log2FC", stats_results['log2FC'].values)
+ pivot_df.insert(2, "p-value", stats_results['p-value'].values)
+ pivot_df.insert(3, "p-adj", stats_results['p-adj'].values)
+
+ # st.success(f"Analysis Complete: {g1_name} (n={len(g1_cols)}) vs {g2_name} (n={len(g2_cols)})")
+
+ # Set the first column ('protein') of final_df as the index
+ protein_col = pivot_df.columns[0]
+ sample_cols = current_cols[start_column_offset:] # Identify actual sample column names
+
+ # Select sample columns and create a matrix
+ expr_df = pivot_df.set_index(protein_col)[sample_cols]
+
+ # Replace 0 with NaN (to prevent log transformation errors)
+ expr_df = expr_df.replace(0, np.nan)
+
+ # Log2 transformation (data normalization)
+ expr_df = np.log2(expr_df + 1)
+
+ # Remove proteins (rows) with any missing values
+ expr_df = expr_df.dropna()
+
+ return pivot_df, expr_df, group_map
+ else:
+ st.warning("β οΈ At least two distinct groups are required for statistical analysis.")
+ else:
+ st.warning("β οΈ No group mapping information is set. Please check the Configure page.")
+ return None
+
+ # Get group mapping from parameters
+ # param_manager = ParameterManager(workflow_dir)
+ # params = param_manager.get_parameters_from_json()
+ # group_map = {
+ # key[11:]: value # Remove "mzML-group-" prefix
+ # for key, value in params.items()
+ # if key.startswith("mzML-group-") and value
+ # }
+
+ # if not group_map:
+ # return None
+
+ # df["Sample"] = df["Reference"].str.replace(".mzML", "", regex=False)
+ # df["Group"] = df["Reference"].map(group_map)
+ # df = df.dropna(subset=["Group"])
+
+ # groups = sorted(df["Group"].unique())
+
+ # if len(groups) < 2:
+ # return None
+
+ # group1, group2 = groups[:2]
+
+ # # Compute statistics per protein
+ # stats_rows = []
+ # for protein, protein_df in df.groupby("ProteinName"):
+ # g1_vals = protein_df[protein_df["Group"] == group1]["Intensity"].values
+ # g2_vals = protein_df[protein_df["Group"] == group2]["Intensity"].values
+
+ # if len(g1_vals) < 2 or len(g2_vals) < 2:
+ # pval = np.nan
+ # else:
+ # _, pval = ttest_ind(g1_vals, g2_vals, equal_var=False)
+
+ # mean_g1 = np.mean(g1_vals) if len(g1_vals) > 0 else np.nan
+ # mean_g2 = np.mean(g2_vals) if len(g2_vals) > 0 else np.nan
+
+ # log2fc = np.log2(mean_g2 / mean_g1) if mean_g1 > 0 else np.nan
+
+ # stats_rows.append({
+ # "ProteinName": protein,
+ # "log2FC": log2fc,
+ # "p-value": pval,
+ # })
+
+ # stats_df = pd.DataFrame(stats_rows)
+
+ # if not stats_df.empty:
+ # mask = stats_df["p-value"].notna()
+ # if mask.any():
+ # _, p_adj, _, _ = multipletests(stats_df.loc[mask, "p-value"], method="fdr_bh")
+ # stats_df.loc[mask, "p-adj"] = p_adj
+ # else:
+ # stats_df["p-adj"] = np.nan
+
+ # # Order samples by group (group2 first, then group1)
+ # sample_group_df = df[["Sample", "Group"]].drop_duplicates()
+ # group2_samples = sample_group_df[sample_group_df["Group"] == group2]["Sample"].tolist()
+ # group1_samples = sample_group_df[sample_group_df["Group"] == group1]["Sample"].tolist()
+ # all_samples = group2_samples + group1_samples
+
+ # # Build pivot table
+ # pivot_list = []
+ # for protein, group_df in df.groupby("ProteinName"):
+ # peptides = ";".join(group_df["PeptideSequence"].unique())
+ # intensity_dict = group_df.groupby("Sample")["Intensity"].sum().to_dict()
+ # intensity_dict_complete = {
+ # sample: intensity_dict.get(sample, 0)
+ # for sample in all_samples
+ # }
+ # row = {
+ # "ProteinName": protein,
+ # **intensity_dict_complete,
+ # "PeptideSequence": peptides,
+ # }
+ # pivot_list.append(row)
+
+ # pivot_df = pd.DataFrame(pivot_list)
+ # pivot_df = pivot_df.merge(stats_df, on="ProteinName", how="left")
+ # pivot_df = pivot_df[["ProteinName", "log2FC", "p-value", "p-adj"] + all_samples + ["PeptideSequence"]]
+
+ # # Build expression matrix (log2-transformed)
+ # expr_df = pivot_df.set_index("ProteinName")[all_samples]
+ # expr_df = expr_df.replace(0, np.nan)
+ # expr_df = np.log2(expr_df + 1)
+ # expr_df = expr_df.dropna()
+
+ # return pivot_df, expr_df, group_map
+
+
+def get_abundance_data(workspace: Path) -> tuple | None:
+ """Wrapper that handles cache key (workspace + CSV mtime).
+
+ Args:
+ workspace: Path to the workspace directory
+
+ Returns:
+ Tuple of (pivot_df, expr_df, group_map) or None if data unavailable
+ """
+ workflow_dir = get_workflow_dir(workspace)
+ quant_dir = workflow_dir / "results" / "quant_results"
+
+ if not quant_dir.exists():
+ return None
+
+ csv_files = sorted(quant_dir.glob("*.csv"))
+ if not csv_files:
+ return None
+
+ csv_mtime = csv_files[0].stat().st_mtime
+ return load_abundance_data(str(workspace), csv_mtime)
diff --git a/src/workflow/CommandExecutor.py b/src/workflow/CommandExecutor.py
index 14d28a32..4c1d4337 100644
--- a/src/workflow/CommandExecutor.py
+++ b/src/workflow/CommandExecutor.py
@@ -21,6 +21,7 @@ class CommandExecutor:
for execution.
"""
# Methods for running commands and logging
+
def __init__(self, workflow_dir: Path, logger: Logger, parameter_manager: ParameterManager):
self.pid_dir = Path(workflow_dir, "pids")
self.logger = logger
@@ -268,6 +269,11 @@ def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}, tool
# Load merged parameters (_defaults + user overrides) for this tool instance
merged_params = self.parameter_manager.get_merged_params(params_key)
+ flag_map = self.parameter_manager.get_parameters_from_json().get("_flag_params", {})
+ if not flag_map:
+ flag_map = st.session_state.get("_topp_flag_params", {})
+ flag_params = set(flag_map.get(params_key, []))
+
# Construct commands for each process
for i in range(n_processes):
command = [tool]
@@ -288,28 +294,55 @@ def run_topp(self, tool: str, input_output: dict, custom_params: dict = {}, tool
command += [value[i]]
# Add merged TOPP tool parameters (_defaults + user overrides)
for k, v in merged_params.items():
- command += [f"-{k}"]
- # Skip only empty strings (pass flag with no value)
- # Note: 0 and 0.0 are valid values, so use explicit check
- if v != "" and v is not None:
- if isinstance(v, str) and "\n" in v:
- command += v.split("\n")
+ if k in flag_params:
+ # CLI flag: include "-k" only when enabled
+ if isinstance(v, str):
+ is_enabled = v.lower() in {"true", "1", "yes", "on"}
else:
- command += [str(v)]
+ is_enabled = bool(v)
+ if is_enabled:
+ command += [f"-{k}"]
+ continue
+ # For non-flag parameters, skip entirely if empty.
+ # Note: 0 and 0.0 are valid values, so use explicit checks.
+ if v == "" or v is None:
+ continue
+ command += [f"-{k}"]
+ if isinstance(v, str) and "\n" in v:
+ command += v.split("\n")
+ elif isinstance(v, bool):
+ command += [str(v).lower()]
+ else:
+ command += [str(v)]
# Add custom parameters
for k, v in custom_params.items():
- command += [f"-{k}"]
- # Skip only empty strings (pass flag with no value)
- # Note: 0 and 0.0 are valid values, so use explicit check
- if v != "" and v is not None:
- if isinstance(v, list):
- command += [str(x) for x in v]
+ if k in flag_params:
+ if isinstance(v, str):
+ is_enabled = v.lower() in {"true", "1", "yes", "on"}
else:
- command += [str(v)]
+ is_enabled = bool(v)
+ if is_enabled:
+ command += [f"-{k}"]
+ continue
+ if v == "" or v is None:
+ continue
+ command += [f"-{k}"]
+ if isinstance(v, list):
+ command += [str(x) for x in v]
+ elif isinstance(v, bool):
+ command += [str(v).lower()]
+ else:
+ command += [str(v)]
# Add threads parameter for TOPP tools
command += ["-threads", str(threads_per_command)]
commands.append(command)
+ for idx, cmd in enumerate(commands):
+ # Print list-form command joined into a single string for readability
+ print(f" πΉ Command {idx + 1}: {' '.join(cmd)}")
+ print("==========================================================\n")
+
+
# Run command(s)
if len(commands) == 1:
return self.run_command(commands[0])
diff --git a/src/workflow/StreamlitUI.py b/src/workflow/StreamlitUI.py
index c0880966..ff3dd05e 100644
--- a/src/workflow/StreamlitUI.py
+++ b/src/workflow/StreamlitUI.py
@@ -31,6 +31,7 @@ class StreamlitUI:
"""
# Methods for Streamlit UI components
+
def __init__(self, workflow_dir, logger, executor, parameter_manager):
self.workflow_dir = workflow_dir
self.logger = logger
@@ -612,6 +613,7 @@ def input_TOPP(
num_cols: int = 4,
exclude_parameters: List[str] = [],
include_parameters: List[str] = [],
+ flag_parameters: List[str] = [],
display_tool_name: bool = True,
display_subsections: bool = True,
display_subsection_tabs: bool = False,
@@ -628,6 +630,8 @@ def input_TOPP(
num_cols (int, optional): Number of columns to use for the layout. Defaults to 3.
exclude_parameters (List[str], optional): List of parameter names to exclude from the widget. Defaults to an empty list.
include_parameters (List[str], optional): List of parameter names to include in the widget. Defaults to an empty list.
+ flag_parameters (List[str], optional): List of parameter names that should
+ be treated as no-value CLI flags during command construction.
display_tool_name (bool, optional): Whether to display the TOPP tool name. Defaults to True.
display_subsections (bool, optional): Whether to split parameters into subsections based on the prefix. Defaults to True.
display_subsection_tabs (bool, optional): Whether to display main subsections in separate tabs (if more than one main section). Defaults to False.
@@ -647,6 +651,16 @@ def input_TOPP(
if "_topp_tool_instance_map" not in st.session_state:
st.session_state["_topp_tool_instance_map"] = {}
st.session_state["_topp_tool_instance_map"][tool_instance_name] = topp_tool_name
+ if "_topp_flag_params" not in st.session_state:
+ st.session_state["_topp_flag_params"] = {}
+ st.session_state["_topp_flag_params"][tool_instance_name] = list(flag_parameters)
+ # Persist flag metadata so execution still sees it outside UI reruns/session context.
+ params = self.parameter_manager.get_parameters_from_json()
+ if "_flag_params" not in params:
+ params["_flag_params"] = {}
+ params["_flag_params"][tool_instance_name] = list(flag_parameters)
+ with open(self.parameter_manager.params_file, "w", encoding="utf-8") as f:
+ json.dump(params, f, indent=4)
if not display_subsections:
display_subsection_tabs = False
@@ -738,6 +752,7 @@ def _matches_parameter(pattern: str, key: bytes) -> bool:
":".join(key.decode().split(":")[:-1])
),
}
+ p["is_flag"] = (b"flag" in param.getTags(key))
# Parameter sections and subsections as string (e.g. "section:subsection")
if display_subsections:
p["sections"] = ":".join(
@@ -817,27 +832,58 @@ def display_TOPP_params(params: dict, num_cols):
"""Displays individual TOPP parameters in given number of columns"""
cols = st.columns(num_cols)
i = 0
+ # st.write("--------------------------------display_TOPP_params------------------------------------")
+ # st.write(params)
for p in params:
# get key and name β use tool_instance_name in session state key
key_str = p['key'].decode()
if tool_instance_name != topp_tool_name:
key_str = key_str.replace(f"{topp_tool_name}:1:", f"{tool_instance_name}:1:", 1)
key = f"{self.parameter_manager.topp_param_prefix}{key_str}"
+ # st.write("--------------------------------p['key'](display_TOPP_params loop)------------------------------------")
+ # st.write(key)
name = p["name"]
try:
# sometimes strings with newline, handle as list
if isinstance(p["value"], str) and "\n" in p["value"]:
p["value"] = p["value"].split("\n")
+ # no-value CLI flag parameters should be shown as checkboxes
+ if p.get("is_flag", False):
+ flag_default = p["value"]
+ if isinstance(flag_default, str):
+ flag_default = flag_default.lower() in {"true", "1", "yes", "on"}
+ else:
+ flag_default = bool(flag_default)
+ # Streamlit widget keys persist in session_state and can override
+ # updated custom_defaults. Normalize and seed key explicitly.
+ if key in st.session_state:
+ current = st.session_state[key]
+ if isinstance(current, str):
+ st.session_state[key] = current.lower() in {"true", "1", "yes", "on"}
+ else:
+ st.session_state[key] = bool(current)
+ else:
+ st.session_state[key] = flag_default
+ cols[i].selectbox(
+ name,
+ options=[True, False],
+ index=0 if st.session_state[key] else 1,
+ format_func=lambda x: "True" if x else "False",
+ help=p["description"],
+ key=key,
+ )
# bools
- if isinstance(p["value"], bool):
- cols[i].markdown("##")
- cols[i].checkbox(
+ elif isinstance(p["value"], bool):
+ bool_value = (
+ (p["value"] == "true")
+ if type(p["value"]) == str
+ else p["value"]
+ )
+ cols[i].selectbox(
name,
- value=(
- (p["value"] == "true")
- if type(p["value"]) == str
- else p["value"]
- ),
+ options=[True, False],
+ index=0 if bool_value else 1,
+ format_func=lambda x: "True" if x else "False",
help=p["description"],
key=key,
)
@@ -922,7 +968,6 @@ def on_multiselect_change(dk=display_key, tk=key):
cols[i].error(f"Error in parameter **{p['name']}**.")
print('Error parsing "' + p["name"] + '": ' + str(e))
-
for section, params in param_sections.items():
if tabs is None:
show_subsection_header(section, display_subsections)
diff --git a/src/workflow/WorkflowManager.py b/src/workflow/WorkflowManager.py
index a87fb339..c35ecc55 100644
--- a/src/workflow/WorkflowManager.py
+++ b/src/workflow/WorkflowManager.py
@@ -194,6 +194,8 @@ def _stop_local_workflow(self) -> bool:
"""Stop locally running workflow process"""
import os
import signal
+ import platform
+ import subprocess
pid_dir = self.executor.pid_dir
if not pid_dir.exists():
@@ -203,11 +205,17 @@ def _stop_local_workflow(self) -> bool:
for pid_file in pid_dir.iterdir():
try:
pid = int(pid_file.name)
- os.kill(pid, signal.SIGTERM)
+ if platform.system() == "Windows":
+ subprocess.call(["taskkill", "/F", "/T", "/PID", str(pid)],
+ capture_output=True, text=True)
+ else:
+ os.kill(pid, signal.SIGTERM)
pid_file.unlink()
stopped = True
- except (ValueError, ProcessLookupError, PermissionError):
- pid_file.unlink() # Clean up stale PID file
+ except (ValueError, ProcessLookupError, PermissionError, Exception) as e:
+ print(f"Error stopping process {pid}: {e}")
+ if pid_file.exists():
+ pid_file.unlink() # Clean up stale PID file
# Clean up the pid directory
shutil.rmtree(pid_dir, ignore_errors=True)