From 40624b8962e38a899273bb64e6d562aa3088d79b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 21 Mar 2026 08:03:47 +0000 Subject: [PATCH 1/5] making some changes in the SDRF validation for the workflow --- Dockerfile.dev | 5 + pyproject.toml | 2 +- quantmsutils/diann/dianncfg.py | 2 +- quantmsutils/sdrf/check_samplesheet.py | 154 +++--------------- tests/test_commands.py | 91 ++++++++++- .../diann2msstats/PXD026600_diann_design.tsv | 5 + 6 files changed, 121 insertions(+), 138 deletions(-) create mode 100644 Dockerfile.dev create mode 100644 tests/test_data/diann2msstats/PXD026600_diann_design.tsv diff --git a/Dockerfile.dev b/Dockerfile.dev new file mode 100644 index 0000000..5b8109e --- /dev/null +++ b/Dockerfile.dev @@ -0,0 +1,5 @@ +FROM python:3.11-slim +RUN apt-get update && apt-get install -y --no-install-recommends git procps libglib2.0-0t64 && rm -rf /var/lib/apt/lists/* +WORKDIR /src +COPY . . +RUN pip install --no-cache-dir . diff --git a/pyproject.toml b/pyproject.toml index fa2b47d..da5a252 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ packages = [ [tool.poetry.dependencies] python = "*" click = "*" -sdrf-pipelines = "==0.0.33" +sdrf-pipelines = ">=0.1.1" pyopenms = ">=3.3.0" pandas = "*" pyarrow = ">=16.1.0" diff --git a/quantmsutils/diann/dianncfg.py b/quantmsutils/diann/dianncfg.py index 0634ff1..db41635 100644 --- a/quantmsutils/diann/dianncfg.py +++ b/quantmsutils/diann/dianncfg.py @@ -9,7 +9,7 @@ from typing import List, Tuple from collections import defaultdict import click -from sdrf_pipelines.openms.unimod import UnimodDatabase +from sdrf_pipelines.converters.openms.unimod import UnimodDatabase logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) logger = logging.getLogger(__name__) diff --git a/quantmsutils/sdrf/check_samplesheet.py b/quantmsutils/sdrf/check_samplesheet.py index cd788bb..13ed882 100644 --- a/quantmsutils/sdrf/check_samplesheet.py +++ b/quantmsutils/sdrf/check_samplesheet.py @@ -1,15 +1,8 @@ -# nf-core: Update the script to check the sdrf -# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - -import errno import logging -import os import sys import click -import pandas as pd -from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame -from sdrf_pipelines.sdrf.sdrf_schema import DEFAULT_TEMPLATE, MASS_SPECTROMETRY +from sdrf_pipelines.sdrf.sdrf import read_sdrf logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) logger = logging.getLogger(__name__) @@ -36,36 +29,25 @@ def print_error(error, context="Line", context_str=""): def check_sdrf( input_sdrf: str, - skip_ms_validation: bool = False, - skip_factor_validation: bool = False, - skip_experimental_design_validation: bool = False, - use_ols_cache_only: bool = False, skip_sdrf_validation: bool = False, + use_ols_cache_only: bool = False, ): """ Check the SDRF file for errors. If any errors are found, print them and exit with a non-zero status code. - @param input_sdrf: Path to the SDRF file to check - @param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications) - @param skip_factor_validation: Disable the validation of factor values in SDRF - @param skip_experimental_design_validation: Disable the validation of experimental design - @param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service - @param skip_sdrf_validation: Disable the validation of SDRF + + :param input_sdrf: Path to the SDRF file to check + :param skip_sdrf_validation: Skip all SDRF validation + :param use_ols_cache_only: Use OLS cache instead of live OLS service """ if skip_sdrf_validation: print("No SDRF validation was performed.") sys.exit(0) - df = SdrfDataFrame.parse(input_sdrf) - errors = df.validate(DEFAULT_TEMPLATE, use_ols_cache_only) - - if not skip_ms_validation: - errors = errors + df.validate(MASS_SPECTROMETRY, use_ols_cache_only) - - if not skip_factor_validation: - errors = errors + df.validate_factor_values() - - if not skip_experimental_design_validation: - errors = errors + df.validate_experimental_design() + df = read_sdrf(input_sdrf) + errors = df.validate_sdrf( + template="ms-proteomics", + use_ols_cache_only=use_ols_cache_only, + ) for error in errors: print(error) @@ -73,120 +55,26 @@ def check_sdrf( sys.exit(bool(errors)) -def check_expdesign(expdesign): - """ - Check the expdesign file for errors. If any errors are found, print them and exit with a non-zero status code. - @param expdesign: Path to the expdesign file to check - """ - data = pd.read_csv(expdesign, sep="\t", header=0, dtype=str) - data = data.dropna() - schema_file = ["Fraction_Group", "Fraction", "Spectra_Filepath", "Label", "Sample"] - schema_sample = ["Sample", "MSstats_Condition", "MSstats_BioReplicate"] - - # check table format: two table - with open(expdesign, "r") as f: - lines = f.readlines() - try: - empty_row = lines.index("\n") - except ValueError: - print( - "the one-table format parser is broken in OpenMS2.5, please use one-table or sdrf" - ) - sys.exit(1) - - s_table = [i.replace("\n", "").split("\t") for i in lines[empty_row + 1 :]][1:] - s_header = lines[empty_row + 1].replace("\n", "").split("\t") - s_data_frame = pd.DataFrame(s_table, columns=s_header) - - # check missed mandatory column - missed_columns = set(schema_file) - set(data.columns) - if len(missed_columns) != 0: - print("{0} column missed".format(" ".join(missed_columns))) - sys.exit(1) - - missed_columns = set(schema_sample) - set(s_data_frame.columns) - if len(missed_columns) != 0: - print("{0} column missed".format(" ".join(missed_columns))) - sys.exit(1) - - if len(set(data.Label)) != 1 and "MSstats_Mixture" not in s_data_frame.columns: - print("MSstats_Mixture column missed in ISO experiments") - sys.exit(1) - - # check logical problem: may be improved - check_expdesign_logic(data, s_data_frame) - - -def check_expdesign_logic(f_table, s_table): - fg_ints = f_table["Fraction_Group"].astype(int) - if fg_ints.max() > fg_ints.nunique(): - print("Fraction_Group discontinuous!") - sys.exit(1) - f_table_d = f_table.drop_duplicates(["Fraction_Group", "Fraction", "Label", "Sample"]) - if f_table_d.shape[0] < f_table.shape[0]: - print("Existing duplicate entries in Fraction_Group, Fraction, Label and Sample") - sys.exit(1) - if len(set(s_table.Sample)) < s_table.shape[0]: - print("Existing duplicate Sample in sample table!") - sys.exit(1) - @click.command( "checksamplesheet", - short_help="Reformat nf-core/quantms sdrf file and check its contents.", -) -@click.option("--exp_design", help="SDRF/Expdesign file to be validated") -@click.option("--is_sdrf", help="SDRF file or Expdesign file", is_flag=True) -@click.option("--skip_sdrf_validation", help="Disable the validation of SDRF", is_flag=True) -@click.option( - "--skip_ms_validation", - help="Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications)", - is_flag=True, -) -@click.option( - "--skip_factor_validation", - help="Disable the validation of factor values in SDRF", - is_flag=True, -) -@click.option( - "--skip_experimental_design_validation", - help="Disable the validation of experimental design", - is_flag=True, + short_help="Validate an SDRF file for quantms pipelines.", ) +@click.option("--exp_design", help="SDRF file to be validated", required=True) +@click.option("--skip_sdrf_validation", help="Skip all SDRF validation", is_flag=True) @click.option( "--use_ols_cache_only", - help="Use ols cache for validation of the terms and not OLS internet service", + help="Use OLS cache for ontology validation instead of the live OLS service", is_flag=True, ) def checksamplesheet( exp_design: str, - is_sdrf: bool = False, skip_sdrf_validation: bool = False, - skip_ms_validation: bool = False, - skip_factor_validation: bool = False, - skip_experimental_design_validation: bool = False, use_ols_cache_only: bool = False, ): - """ - Reformat nf-core/quantms sdrf file and check its contents. - @param exp_design: SDRF/Expdesign file to be validated - @param is_sdrf: SDRF file or Expdesign file - @param skip_sdrf_validation: Disable the validation of SDRF - @param skip_ms_validation: Disable the validation of mass spectrometry fields in SDRF (e.g. posttranslational modifications) - @param skip_factor_validation: Disable the validation of factor values in SDRF - @param skip_experimental_design_validation: Disable the validation of experimental design - @param use_ols_cache_only: Use ols cache for validation of the terms and not OLS internet service - - """ - # TODO validate expdesign file - if is_sdrf: - check_sdrf( - input_sdrf=exp_design, - skip_sdrf_validation=skip_sdrf_validation, - skip_ms_validation=skip_ms_validation, - skip_factor_validation=skip_factor_validation, - skip_experimental_design_validation=skip_experimental_design_validation, - use_ols_cache_only=use_ols_cache_only, - ) - else: - check_expdesign(exp_design) + """Validate an SDRF file for quantms pipelines.""" + check_sdrf( + input_sdrf=exp_design, + skip_sdrf_validation=skip_sdrf_validation, + use_ols_cache_only=use_ols_cache_only, + ) diff --git a/tests/test_commands.py b/tests/test_commands.py index fae6f66..3d340db 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -107,10 +107,10 @@ def test_dianncfg_example(self): class TestSamplesheetCommands: """Test class for samplesheet related commands""" - def test_check_samplesheet_sdrf(self): - """Test the validation of an SDRF file""" + def test_check_samplesheet_sdrf_skip_validation(self): + """Test the SDRF check command with skip_sdrf_validation (smoke test).""" args = [ - "--is_sdrf", + "--skip_sdrf_validation", "--exp_design", str(TEST_DATA_DIR / "PXD000001.sdrf.tsv"), ] @@ -267,6 +267,91 @@ def test_nterm_modification(self): assert result == ["0-Acetyl"] +class TestDiannUnifiedDesign: + """Tests for unified design file format parsing (from convert-diann)""" + + def test_diann2msstats_unified_format(self): + """Test DIA-NN to MSstats conversion with the unified design file format.""" + report_path = (DIANN_TEST_DIR / "diann_report.tsv").resolve() + design_path = (DIANN_TEST_DIR / "PXD026600_diann_design.tsv").resolve() + assert report_path.exists(), f"Test report missing: {report_path}" + assert design_path.exists(), f"Test design missing: {design_path}" + + args = [ + "--report", str(report_path), + "--exp_design", str(design_path), + "--qvalue_threshold", "0.01", + ] + result = run_cli_command("diann2msstats", args) + if result.exit_code != 0: + raise AssertionError( + f"diann2msstats with unified format failed (exit {result.exit_code}). " + f"stdout: {result.output!r}, stderr: {result.stderr!r}" + ) + + def test_unified_format_parsed_correctly(self): + """Test that the unified format produces the correct sample/file tables.""" + from quantmsutils.diann.diann2msstats import get_exp_design_dfs + + design_path = str((DIANN_TEST_DIR / "PXD026600_diann_design.tsv").resolve()) + s_df, f_table = get_exp_design_dfs(design_path) + + # Sample table has correct columns and 2 unique samples + assert "MSstats_Condition" in s_df.columns + assert "MSstats_BioReplicate" in s_df.columns + assert len(s_df) == 2 + + # File table has 4 rows with run names + assert "run" in f_table.columns + assert "Fraction" in f_table.columns + assert "Sample" in f_table.columns + assert len(f_table) == 4 + + # Run names are file stems without extension + runs = f_table["run"].tolist() + assert "RD139_Narrow_UPS1_0_1fmol_inj1" in runs + assert "RD139_Narrow_UPS1_0_25fmol_inj2" in runs + + def test_legacy_format_still_works(self): + """Test that the legacy two-table format is still parsed correctly.""" + from quantmsutils.diann.diann2msstats import get_exp_design_dfs + + design_path = str((DIANN_TEST_DIR / "PXD026600.sdrf_openms_design.tsv").resolve()) + s_df, f_table = get_exp_design_dfs(design_path) + + assert "MSstats_Condition" in s_df.columns + assert "MSstats_BioReplicate" in s_df.columns + assert len(s_df) == 2 + assert "run" in f_table.columns + assert len(f_table) == 4 + + def test_unified_format_validates_required_columns(self): + """Test that missing required columns in unified format raise ValueError.""" + from quantmsutils.diann.diann2msstats import get_exp_design_dfs + + with tempfile.TemporaryDirectory() as tmpdir: + bad_file = os.path.join(tmpdir, "bad_design.tsv") + with open(bad_file, "w") as f: + # Has Filename+Condition+BioReplicate (triggers unified) but missing Fraction and Sample + f.write("Filename\tCondition\tBioReplicate\n") + f.write("file1.raw\tA\t1\n") + with pytest.raises(ValueError, match="missing required columns"): + get_exp_design_dfs(bad_file) + + def test_unified_format_validates_sample_consistency(self): + """Test that inconsistent Sample->Condition mapping raises ValueError.""" + from quantmsutils.diann.diann2msstats import get_exp_design_dfs + + with tempfile.TemporaryDirectory() as tmpdir: + bad_file = os.path.join(tmpdir, "inconsistent_design.tsv") + with open(bad_file, "w") as f: + f.write("Filename\tSample\tFraction\tCondition\tBioReplicate\n") + f.write("file1.raw\t1\t1\tCondA\t1\n") + f.write("file2.raw\t1\t1\tCondB\t2\n") # Same Sample, different Condition + with pytest.raises(ValueError, match="Inconsistent"): + get_exp_design_dfs(bad_file) + + class TestExtractSampleMixture: """Test extract_sample with MSstats_Mixture column (covers DataFrame.append fix)""" diff --git a/tests/test_data/diann2msstats/PXD026600_diann_design.tsv b/tests/test_data/diann2msstats/PXD026600_diann_design.tsv new file mode 100644 index 0000000..0bc8b83 --- /dev/null +++ b/tests/test_data/diann2msstats/PXD026600_diann_design.tsv @@ -0,0 +1,5 @@ +Filename URI Sample FractionGroup Fraction Label LabelType AcquisitionMethod DissociationMethod Condition BioReplicate Enzyme FixedModifications VariableModifications PrecursorMassTolerance PrecursorMassToleranceUnit FragmentMassTolerance FragmentMassToleranceUnit MS1MinMz MS1MaxMz MS2MinMz MS2MaxMz +RD139_Narrow_UPS1_0_1fmol_inj1.raw https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/MSV000087597/RD139_Narrow_UPS1_0_1fmol_inj1.raw 1 1 1 label free sample label free Data-Independent Acquisition HCD CT=Mixture;CN=UPS1;QY=0.1 fmol 1 Trypsin NT=Carbamidomethyl;TA=C;mt=fixed;AC=UNIMOD:4 NT=Oxidation;mt=variable;TA=M;AC=Unimod:35 10 ppm 0.02 Da +RD139_Narrow_UPS1_0_1fmol_inj2.raw https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/MSV000087597/RD139_Narrow_UPS1_0_1fmol_inj2.raw 1 2 1 label free sample label free Data-Independent Acquisition HCD CT=Mixture;CN=UPS1;QY=0.1 fmol 1 Trypsin NT=Carbamidomethyl;TA=C;mt=fixed;AC=UNIMOD:4 NT=Oxidation;mt=variable;TA=M;AC=Unimod:35 10 ppm 0.02 Da +RD139_Narrow_UPS1_0_25fmol_inj1.raw https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/MSV000087597/RD139_Narrow_UPS1_0_25fmol_inj1.raw 2 3 1 label free sample label free Data-Independent Acquisition HCD CT=Mixture;CN=UPS1;QY=0.25 fmol 2 Trypsin NT=Carbamidomethyl;TA=C;mt=fixed;AC=UNIMOD:4 NT=Oxidation;mt=variable;TA=M;AC=Unimod:35 10 ppm 0.02 Da +RD139_Narrow_UPS1_0_25fmol_inj2.raw https://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/quantms-ci-github/MSV000087597/RD139_Narrow_UPS1_0_25fmol_inj2.raw 2 4 1 label free sample label free Data-Independent Acquisition HCD CT=Mixture;CN=UPS1;QY=0.25 fmol 2 Trypsin NT=Carbamidomethyl;TA=C;mt=fixed;AC=UNIMOD:4 NT=Oxidation;mt=variable;TA=M;AC=Unimod:35 10 ppm 0.02 Da From 92c92ff9a231c59491ab3956d5ed7fa33b0540cf Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 21 Mar 2026 08:21:00 +0000 Subject: [PATCH 2/5] minor changes --- quantmsutils/sdrf/check_samplesheet.py | 32 +++++++++----------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/quantmsutils/sdrf/check_samplesheet.py b/quantmsutils/sdrf/check_samplesheet.py index 13ed882..8d876ef 100644 --- a/quantmsutils/sdrf/check_samplesheet.py +++ b/quantmsutils/sdrf/check_samplesheet.py @@ -2,33 +2,16 @@ import sys import click + from sdrf_pipelines.sdrf.sdrf import read_sdrf logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) logger = logging.getLogger(__name__) -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def print_error(error, context="Line", context_str=""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) - - def check_sdrf( input_sdrf: str, + template: str = "ms-proteomics", skip_sdrf_validation: bool = False, use_ols_cache_only: bool = False, ): @@ -36,6 +19,7 @@ def check_sdrf( Check the SDRF file for errors. If any errors are found, print them and exit with a non-zero status code. :param input_sdrf: Path to the SDRF file to check + :param template: Schema template to validate against (e.g. 'ms-proteomics', 'dia-acquisition') :param skip_sdrf_validation: Skip all SDRF validation :param use_ols_cache_only: Use OLS cache instead of live OLS service """ @@ -45,7 +29,7 @@ def check_sdrf( df = read_sdrf(input_sdrf) errors = df.validate_sdrf( - template="ms-proteomics", + template=template, use_ols_cache_only=use_ols_cache_only, ) @@ -55,12 +39,16 @@ def check_sdrf( sys.exit(bool(errors)) - @click.command( "checksamplesheet", short_help="Validate an SDRF file for quantms pipelines.", ) @click.option("--exp_design", help="SDRF file to be validated", required=True) +@click.option( + "--template", "-t", + help="Schema template to validate against (e.g. ms-proteomics, dia-acquisition)", + default="ms-proteomics", +) @click.option("--skip_sdrf_validation", help="Skip all SDRF validation", is_flag=True) @click.option( "--use_ols_cache_only", @@ -69,12 +57,14 @@ def check_sdrf( ) def checksamplesheet( exp_design: str, + template: str = "ms-proteomics", skip_sdrf_validation: bool = False, use_ols_cache_only: bool = False, ): """Validate an SDRF file for quantms pipelines.""" check_sdrf( input_sdrf=exp_design, + template=template, skip_sdrf_validation=skip_sdrf_validation, use_ols_cache_only=use_ols_cache_only, ) From fdc415b79f186385acc51d8c7027fd90ca5f07a1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 21 Mar 2026 08:46:08 +0000 Subject: [PATCH 3/5] minor changes remove setuptools --- quantmsutils/sdrf/check_samplesheet.py | 97 +++++++++++++++++++++----- recipe/meta.yaml | 9 +-- tests/test_commands.py | 39 ++++++++++- 3 files changed, 119 insertions(+), 26 deletions(-) diff --git a/quantmsutils/sdrf/check_samplesheet.py b/quantmsutils/sdrf/check_samplesheet.py index 8d876ef..01af482 100644 --- a/quantmsutils/sdrf/check_samplesheet.py +++ b/quantmsutils/sdrf/check_samplesheet.py @@ -2,36 +2,63 @@ import sys import click +import pandas as pd from sdrf_pipelines.sdrf.sdrf import read_sdrf logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) logger = logging.getLogger(__name__) +# Minimal columns required to run quantms/quantmsdiann pipelines. +# These are checked in --minimal mode instead of full schema validation. +MINIMAL_REQUIRED_COLUMNS = [ + "source name", + "assay name", + "comment[data file]", + "comment[label]", + "comment[cleavage agent details]", + "comment[instrument]", + "comment[proteomics data acquisition method]", + "technology type", +] + +# Columns with at least one modification parameters column (pattern match) +MINIMAL_PATTERN_COLUMNS = [ + "comment[modification parameters", # prefix match — multiple columns allowed +] + +# Recommended columns: warn if missing but don't fail +MINIMAL_RECOMMENDED_COLUMNS = [ + "comment[precursor mass tolerance]", + "comment[fragment mass tolerance]", + "comment[dissociation method]", + "comment[technical replicate]", + "comment[fraction identifier]", +] + def check_sdrf( input_sdrf: str, template: str = "ms-proteomics", - skip_sdrf_validation: bool = False, + minimal: bool = False, use_ols_cache_only: bool = False, ): """ - Check the SDRF file for errors. If any errors are found, print them and exit with a non-zero status code. + Check the SDRF file for errors. :param input_sdrf: Path to the SDRF file to check - :param template: Schema template to validate against (e.g. 'ms-proteomics', 'dia-acquisition') - :param skip_sdrf_validation: Skip all SDRF validation + :param template: Schema template for full validation (e.g. 'ms-proteomics', 'dia-acquisition') + :param minimal: Only validate columns required to run the pipeline (skip organism, etc.) :param use_ols_cache_only: Use OLS cache instead of live OLS service """ - if skip_sdrf_validation: - print("No SDRF validation was performed.") - sys.exit(0) - - df = read_sdrf(input_sdrf) - errors = df.validate_sdrf( - template=template, - use_ols_cache_only=use_ols_cache_only, - ) + if minimal: + errors = _validate_minimal(input_sdrf) + else: + df = read_sdrf(input_sdrf) + errors = df.validate_sdrf( + template=template, + use_ols_cache_only=use_ols_cache_only, + ) for error in errors: print(error) @@ -39,6 +66,38 @@ def check_sdrf( sys.exit(bool(errors)) +def _validate_minimal(input_sdrf: str) -> list[str]: + """Validate only the columns required to run the pipeline. + + Returns a list of error strings. Only missing required columns + produce errors; missing recommended columns produce warnings (non-blocking). + """ + df = pd.read_csv(input_sdrf, sep="\t", nrows=0) + columns_lower = [c.lower() for c in df.columns] + errors = [] + + # Check required columns (case-insensitive) + for col in MINIMAL_REQUIRED_COLUMNS: + if col.lower() not in columns_lower: + errors.append(f"ERROR: Required column '{col}' is missing from the SDRF file.") + + # Check at least one modification parameters column exists + has_mod_col = any(c.startswith("comment[modification parameters") for c in columns_lower) + if not has_mod_col: + errors.append( + "ERROR: At least one 'comment[modification parameters]' column is required." + ) + + # Warn about recommended columns (non-blocking) + for col in MINIMAL_RECOMMENDED_COLUMNS: + if col.lower() not in columns_lower: + logger.warning( + f"Recommended column '{col}' is missing. Pipeline will use default parameters." + ) + + return errors + + @click.command( "checksamplesheet", short_help="Validate an SDRF file for quantms pipelines.", @@ -46,10 +105,14 @@ def check_sdrf( @click.option("--exp_design", help="SDRF file to be validated", required=True) @click.option( "--template", "-t", - help="Schema template to validate against (e.g. ms-proteomics, dia-acquisition)", + help="Schema template for full validation (e.g. ms-proteomics, dia-acquisition)", default="ms-proteomics", ) -@click.option("--skip_sdrf_validation", help="Skip all SDRF validation", is_flag=True) +@click.option( + "--minimal", + help="Only validate columns required to run the pipeline (skip organism, metadata, etc.)", + is_flag=True, +) @click.option( "--use_ols_cache_only", help="Use OLS cache for ontology validation instead of the live OLS service", @@ -58,13 +121,13 @@ def check_sdrf( def checksamplesheet( exp_design: str, template: str = "ms-proteomics", - skip_sdrf_validation: bool = False, + minimal: bool = False, use_ols_cache_only: bool = False, ): """Validate an SDRF file for quantms pipelines.""" check_sdrf( input_sdrf=exp_design, template=template, - skip_sdrf_validation=skip_sdrf_validation, + minimal=minimal, use_ols_cache_only=use_ols_cache_only, ) diff --git a/recipe/meta.yaml b/recipe/meta.yaml index d31b5be..b318cac 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,7 +1,7 @@ # recipe/meta.yaml package: name: quantms-utils - version: "0.0.25" + version: "0.0.26" source: path: ../ @@ -20,19 +20,16 @@ requirements: - python - pip - poetry-core >=1.2.0 - - setuptools <78 + run: - python >=3.9,<3.13 - click - - setuptools <78 - - sdrf-pipelines >=0.0.33,<0.1.0 + - sdrf-pipelines >=0.1.1 - pyopenms>=3.3.0 - pandas - pyarrow>=16.1.0 - scipy test: - requires: - - setuptools <78 imports: - quantmsutils commands: diff --git a/tests/test_commands.py b/tests/test_commands.py index 3d340db..0d35f54 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -107,16 +107,49 @@ def test_dianncfg_example(self): class TestSamplesheetCommands: """Test class for samplesheet related commands""" - def test_check_samplesheet_sdrf_skip_validation(self): - """Test the SDRF check command with skip_sdrf_validation (smoke test).""" + def test_check_samplesheet_minimal_pxd000001(self): + """Test minimal validation on PXD000001 (legacy SDRF without acquisition method).""" + # PXD000001 is a TMT dataset without comment[proteomics data acquisition method] + # Minimal validation should flag it as missing a required column args = [ - "--skip_sdrf_validation", + "--minimal", "--exp_design", str(TEST_DATA_DIR / "PXD000001.sdrf.tsv"), ] result = run_cli_command("checksamplesheet", args) + assert result.exit_code != 0 + assert "proteomics data acquisition method" in result.output.lower() + + def test_check_samplesheet_minimal_valid(self): + """Test minimal validation passes for a valid SDRF with all required columns.""" + import tempfile + with tempfile.NamedTemporaryFile(mode="w", suffix=".sdrf.tsv", delete=False) as f: + f.write("source name\tassay name\tcomment[data file]\tcomment[label]\t" + "comment[instrument]\tcomment[proteomics data acquisition method]\t" + "technology type\tcomment[cleavage agent details]\t" + "comment[modification parameters]\n") + f.write("S1\trun1\tfile1.raw\tlabel free sample\tOrbitrap\t" + "Data-Independent Acquisition\tMS\tTrypsin\tOxidation\n") + tmp_path = f.name + args = ["--minimal", "--exp_design", tmp_path] + result = run_cli_command("checksamplesheet", args) assert result.exit_code == 0 + def test_check_samplesheet_minimal_missing_column(self): + """Test minimal validation fails when a required column is missing.""" + import tempfile + with tempfile.NamedTemporaryFile(mode="w", suffix=".sdrf.tsv", delete=False) as f: + # Missing comment[cleavage agent details] + f.write("source name\tassay name\tcomment[data file]\tcomment[label]\t" + "comment[instrument]\tcomment[proteomics data acquisition method]\t" + "technology type\tcomment[modification parameters]\n") + f.write("S1\trun1\tfile1.raw\tlabel free sample\tOrbitrap\tDIA\tMS\tOxidation\n") + f.name + args = ["--minimal", "--exp_design", f.name] + result = run_cli_command("checksamplesheet", args) + assert result.exit_code != 0 + assert "cleavage agent" in result.output.lower() + def test_extract_sample_from_expdesign(self): """Test extracting sample information from experiment design""" args = ["--expdesign", str(TEST_DATA_DIR / "BSA_design_urls.tsv")] From 11ccc4c5fb6850657c2ca898eea07f4d741de5d5 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 21 Mar 2026 09:58:03 +0000 Subject: [PATCH 4/5] minor changes --- tests/test_commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_commands.py b/tests/test_commands.py index 0d35f54..021d5a1 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -144,8 +144,8 @@ def test_check_samplesheet_minimal_missing_column(self): "comment[instrument]\tcomment[proteomics data acquisition method]\t" "technology type\tcomment[modification parameters]\n") f.write("S1\trun1\tfile1.raw\tlabel free sample\tOrbitrap\tDIA\tMS\tOxidation\n") - f.name - args = ["--minimal", "--exp_design", f.name] + tmp_path = f.name + args = ["--minimal", "--exp_design", tmp_path] result = run_cli_command("checksamplesheet", args) assert result.exit_code != 0 assert "cleavage agent" in result.output.lower() From 45f3cffd466c56f032c91bd34f691ff25880921f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 21 Mar 2026 10:51:55 +0000 Subject: [PATCH 5/5] minor changes --- quantmsutils/sdrf/check_samplesheet.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/quantmsutils/sdrf/check_samplesheet.py b/quantmsutils/sdrf/check_samplesheet.py index 01af482..8754ea3 100644 --- a/quantmsutils/sdrf/check_samplesheet.py +++ b/quantmsutils/sdrf/check_samplesheet.py @@ -22,11 +22,6 @@ "technology type", ] -# Columns with at least one modification parameters column (pattern match) -MINIMAL_PATTERN_COLUMNS = [ - "comment[modification parameters", # prefix match — multiple columns allowed -] - # Recommended columns: warn if missing but don't fail MINIMAL_RECOMMENDED_COLUMNS = [ "comment[precursor mass tolerance]", @@ -72,10 +67,16 @@ def _validate_minimal(input_sdrf: str) -> list[str]: Returns a list of error strings. Only missing required columns produce errors; missing recommended columns produce warnings (non-blocking). """ - df = pd.read_csv(input_sdrf, sep="\t", nrows=0) - columns_lower = [c.lower() for c in df.columns] + df_header = pd.read_csv(input_sdrf, sep="\t", nrows=0) + columns_lower = [c.lower() for c in df_header.columns] errors = [] + # Reject header-only files + df_rows = pd.read_csv(input_sdrf, sep="\t", nrows=1) + if len(df_rows) == 0: + errors.append("ERROR: SDRF file contains a header but no data rows.") + return errors + # Check required columns (case-insensitive) for col in MINIMAL_REQUIRED_COLUMNS: if col.lower() not in columns_lower: