From e310fa8526e031fd59a9455addbd92b72b6e85f9 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol
Date: Tue, 19 May 2026 10:02:35 +0100
Subject: [PATCH] fix(diann2msstats): keep raw sequence when pyopenms can't resolve a mod
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`diann2msstats` crashes the conversion when DIA-NN emits a peptide with
a modification that pyopenms cannot resolve, e.g.:

    RuntimeError: the value 'Met-loss' was used but is not valid;
    Cannot convert string to peptide modification. No modification
    matches in our database.

This fires in production whenever the runtime container ships pyopenms
without the OpenMS share directory (UniMod XML) — visible via the
warning that always precedes the traceback:

    UserWarning: OPENMS_DATA_PATH environment variable not found and no
    share directory was installed.

In that state only a small set of common mods (Carbamidomethyl,
Oxidation, Phospho, …) resolves from the compiled-in fallback, so
Acetyl, Met-loss, and other valid UniMod entries crash the whole job.
Any reanalysis whose SDRF declares such a mod is bricked.

Fix the source side: wrap the `AASequence.fromString` round-trip in
`_to_openms_sequence`, catch `RuntimeError`, log a warning, and pass the
raw DIA-NN string through unchanged. The raw string is still a stable
peptide identifier for MSstats (different mod forms => distinct IDs), so
downstream consumers are unaffected.

Also dedupe the conversion across rows: build a map over the unique
PeptideSequence values and re-map back. The old `.apply(... axis=1)`
re-parsed identical strings millions of times on real datasets.

Adds two unit tests against `_to_openms_sequence`:

  * unknown mod falls back to the raw input
  * `^` N-term anchor is preserved on both code paths

Note: the deeper fix is the container/recipe packaging — the bioconda
recipe for quantms-utils should ship pyopenms with its OpenMS share
directory so the parser has the full UniMod database.
This source patch is the resilient companion: even after the recipe is fixed, an unexpected mod string should not crash the converter. --- quantmsutils/diann/diann2msstats.py | 38 +++++++++++++++++++++++------ tests/test_commands.py | 29 ++++++++++++++++++++++ 2 files changed, 59 insertions(+), 8 deletions(-) diff --git a/quantmsutils/diann/diann2msstats.py b/quantmsutils/diann/diann2msstats.py index 0d238ea..6da9f2c 100644 --- a/quantmsutils/diann/diann2msstats.py +++ b/quantmsutils/diann/diann2msstats.py @@ -73,14 +73,8 @@ def diann2msstats( out_msstats = out_msstats[out_msstats["Intensity"] != 0] out_msstats["PeptideSequence"] = out_msstats["PeptideSequence"].apply(_sanitize_sequence) - out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply( - lambda x: ( - AASequence.fromString(x["PeptideSequence"]).toString() - if "^" not in x["PeptideSequence"] - else "^" + AASequence.fromString(x["PeptideSequence"].replace("^", "")).toString() - ), - axis=1, - ) + seq_map = {s: _to_openms_sequence(s) for s in out_msstats["PeptideSequence"].unique()} + out_msstats["PeptideSequence"] = out_msstats["PeptideSequence"].map(seq_map) out_msstats["FragmentIon"] = "NA" out_msstats["ProductCharge"] = "0" @@ -267,3 +261,31 @@ def load_report(report_path, qvalue_threshold: float) -> pd.DataFrame: def _sanitize_sequence(seq): seq = seq.replace("(SILAC)", "") return seq + + +def _to_openms_sequence(seq: str) -> str: + """Canonicalize a DIA-NN peptide+mod string via pyopenms. + + Preserves the leading ``^`` anchor used by DIA-NN to mark N-terminal + cleavage peptides. When pyopenms raises a ``RuntimeError`` — typically + because the runtime container ships pyopenms without the OpenMS share + directory (UniMod XML), leaving only a small set of common modifications + resolvable from the compiled-in fallback — the input is returned + unchanged so downstream conversion can proceed. A warning is logged + once per unique input string. 
+ """ + has_anchor = "^" in seq + body = seq.replace("^", "") if has_anchor else seq + try: + canonical = AASequence.fromString(body).toString() + except RuntimeError as err: + logger.warning( + "pyopenms could not parse peptide %r (%s); keeping the raw " + "DIA-NN sequence. If this affects many peptides, the runtime " + "container is likely missing the OpenMS share directory " + "(OPENMS_DATA_PATH).", + body, + err, + ) + canonical = body + return ("^" + canonical) if has_anchor else canonical diff --git a/tests/test_commands.py b/tests/test_commands.py index 79a1e89..45fdf24 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -103,6 +103,35 @@ def test_dianncfg_example(self): assert result.exit_code == 0 + def test_to_openms_sequence_falls_back_on_unknown_mod(self): + """Unknown modifications should not crash the conversion. + + The runtime container may ship pyopenms without the OpenMS share + directory, leaving only common mods resolvable. In that case we + keep the raw DIA-NN sequence so downstream MSstats conversion can + proceed instead of crashing with a RuntimeError. + """ + from quantmsutils.diann.diann2msstats import _to_openms_sequence + + bogus = "M(NoSuchModXYZ)PEPTIDE" + assert _to_openms_sequence(bogus) == bogus + + def test_to_openms_sequence_preserves_n_term_anchor(self): + """The leading ``^`` anchor used by DIA-NN for N-terminal cleavage + peptides must survive both the pyopenms round-trip and the + fallback path. + """ + from quantmsutils.diann.diann2msstats import _to_openms_sequence + + # known mod -> canonical form retains anchor + anchored = "^M(Oxidation)PEPTIDE" + out = _to_openms_sequence(anchored) + assert out.startswith("^") + + # unknown mod -> raw passthrough retains anchor + bogus_anchored = "^M(NoSuchModXYZ)PEPTIDE" + assert _to_openms_sequence(bogus_anchored) == bogus_anchored + class TestSamplesheetCommands: """Test class for samplesheet related commands"""