Skip to content

Commit a4c1cd3

Browse files
Improve SageAdapter parameter documentation and add tolerance validation (OpenMS#8259)
* Initial plan
* Improve SageAdapter parameter documentation with detailed descriptions and validation

  Co-authored-by: timosachsenberg <5803621+timosachsenberg@users.noreply.github.com>
* Update src/topp/SageAdapter.cpp
* Update src/topp/SageAdapter.cpp
* Update src/topp/SageAdapter.cpp
* Update src/topp/SageAdapter.cpp
* Update src/topp/SageAdapter.cpp
* Update src/topp/SageAdapter.cpp

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: timosachsenberg <5803621+timosachsenberg@users.noreply.github.com>
Co-authored-by: Timo Sachsenberg <timo.sachsenberg@uni-tuebingen.de>
1 parent 05e5ad2 commit a4c1cd3

1 file changed

Lines changed: 61 additions & 30 deletions

File tree

src/topp/SageAdapter.cpp

Lines changed: 61 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -436,37 +436,36 @@ class TOPPSageAdapter :
436436
#endif
437437
"The Sage executable. Provide a full or relative path, or make sure it can be found in your PATH environment.", true, false, {"is_executable"});
438438

439-
registerStringOption_("decoy_prefix", "<prefix>", "DECOY_", "Prefix on protein accession used to distinguish decoy from target proteins. NOTE: Decoy suffix is currently not supported by sage.", false, false);
440-
registerIntOption_("batch_size", "<int>", 0, "Number of files to load and search in parallel (default = # of CPUs/2)", false, false);
439+
registerStringOption_("decoy_prefix", "<prefix>", "DECOY_", "Prefix on protein accession used to distinguish decoy from target proteins. Decoy proteins in the FASTA file should have this prefix in their accession. NOTE: Decoy suffix is currently not supported by Sage.", false, false);
440+
registerIntOption_("batch_size", "<int>", 0, "Number of files to load and search in parallel. Setting this to 0 (default) uses an automatic value (typically number of CPUs/2). Default: 0", false, false);
441441

442-
registerDoubleOption_("precursor_tol_left", "<double>", -6.0, "Start (left side) of the precursor tolerance window w.r.t. precursor location. Usually used with negative values smaller or equal to the 'right' counterpart.", false, false);
443-
registerDoubleOption_("precursor_tol_right", "<double>", 6.0, "End (right side) of the precursor tolerance window w.r.t. precursor location. Usually used with positive values larger or equal to the 'left' counterpart.", false, false);
442+
registerDoubleOption_("precursor_tol_left", "<double>", -6.0, "Start (left side) of the precursor tolerance window w.r.t. precursor location. This value is relative to the experimental precursor mass and used to define the lower bound of the search window. Must be negative (e.g., -6 ppm means 6 ppm below the observed mass).", false, false);
443+
registerDoubleOption_("precursor_tol_right", "<double>", 6.0, "End (right side) of the precursor tolerance window w.r.t. precursor location. This value is added to the experimental precursor mass to define the upper bound of the search window. Must be positive (e.g., 6 ppm means 6 ppm above the observed mass).", false, false);
444444
registerStringOption_("precursor_tol_unit", "<unit>", "ppm", "Unit of precursor tolerance (ppm or Da)", false, false);
445445
setValidStrings_("precursor_tol_unit", ListUtils::create<String>("ppm,Da"));
446446

447-
registerDoubleOption_("fragment_tol_left", "<double>", -20.0, "Start (left side) of the fragment tolerance window w.r.t. precursor location. Usually used with negative values smaller or equal to the 'right' counterpart.", false, false);
448-
registerDoubleOption_("fragment_tol_right", "<double>", 20.0, "End (right side) of the fragment tolerance window w.r.t. precursor location. Usually used with positive values larger or equal to the 'left' counterpart.", false, false);
447+
registerDoubleOption_("fragment_tol_left", "<double>", -20.0, "Start (left side) of the fragment tolerance window w.r.t. fragment location. This value reduces the experimental fragment mass to define the lower bound of the search window. Must be negative (e.g., -20 ppm means 20 ppm below the observed mass).", false, false);
448+
registerDoubleOption_("fragment_tol_right", "<double>", 20.0, "End (right side) of the fragment tolerance window w.r.t. fragment location. This value is added to the experimental fragment mass to define the upper bound of the search window. Must be positive (e.g., 20 ppm means 20 ppm above the observed mass).", false, false);
449449
registerStringOption_("fragment_tol_unit", "<unit>", "ppm", "Unit of fragment tolerance (ppm or Da)", false, false);
450450
setValidStrings_("fragment_tol_unit", ListUtils::create<String>("ppm,Da"));
451451

452452
// add advanced options
453-
registerIntOption_("min_matched_peaks", "<int>", min_matched_peaks, "Minimum number of b+y ions required to match for PSM to be reported", false, true);
454-
registerIntOption_("min_peaks", "<int>", min_peaks, "Minimum number of peaks required for a spectrum to be considered", false, true);
455-
registerIntOption_("max_peaks", "<int>", max_peaks, "Take the top N most intense MS2 peaks only for matching", false, true);
456-
registerIntOption_("report_psms", "<int>", report_psms, "Number of hits (PSMs) to report for each spectrum", false, true);
457-
registerIntOption_("bucket_size", "<int>", bucket_size, "How many fragments are in each internal mass bucket (default: 8192 for hi-res data). Try increasing it to 32k or 64k for low-res. See also: fragment_tol_*", false, true);
458-
registerIntOption_("min_len", "<int>", min_len, "Minimum peptide length", false, true);
459-
registerIntOption_("max_len", "<int>", max_len, "Maximum peptide length", false, true);
460-
registerIntOption_("missed_cleavages", "<int>", missed_cleavages, "Number of missed cleavages", false, true);
461-
registerDoubleOption_("fragment_min_mz", "<double>", fragment_min_mz, "Minimum fragment m/z", false, true);
462-
registerDoubleOption_("fragment_max_mz", "<double>", fragment_max_mz, "Maximum fragment m/z", false, true);
463-
registerDoubleOption_("peptide_min_mass", "<double>", peptide_min_mass, "Minimum monoisotopic peptide mass to consider a peptide from the DB", false, true);
464-
registerDoubleOption_("peptide_max_mass", "<double>", peptide_max_mass, "Maximum monoisotopic peptide mass to consider a peptide from the DB", false, true);
465-
registerIntOption_("min_ion_index", "<int>", min_ion_index, "Minimum ion index to consider for preliminary scoring. Default = 2 to skip b1/y1 AND (sic) b2/y2 ions that are often missing.", false, true);
466-
registerIntOption_("max_variable_mods", "<int>", max_variable_mods, "Maximum number of variable modifications", false, true);
467-
registerStringOption_("isotope_error_range", "<start,end>", isotope_errors, "Range of (C13) isotope errors to consider for precursor."
468-
"Can be negative. E.g. '-1,3' for considering '-1/0/1/2/3'", false, true);
469-
registerStringOption_("charges", "<start,end>", charges_if_not_annotated, "Range of precursor charges to consider if not annotated in the file."
453+
registerIntOption_("min_matched_peaks", "<int>", min_matched_peaks, "Minimum number of b+y ions required to match for PSM to be reported. Default: 6", false, true);
454+
registerIntOption_("min_peaks", "<int>", min_peaks, "Minimum number of peaks required for a spectrum to be considered. Spectra with fewer peaks will be ignored. Default: 15", false, true);
455+
registerIntOption_("max_peaks", "<int>", max_peaks, "Take the top N most intense MS2 peaks to search. Default: 150", false, true);
456+
registerIntOption_("report_psms", "<int>", report_psms, "Number of peptide-spectrum matches (PSMs) to report for each spectrum. The top N scoring PSMs will be reported. Values higher than 1 can be useful for chimeric spectra but may affect downstream statistical analysis. Default: 1", false, true);
457+
registerIntOption_("bucket_size", "<int>", bucket_size, "How many fragments are in each internal mass bucket. Default: 8192 (optimal for high-resolution data). Try increasing it to 32768 or 65536 for low-resolution data. See also: fragment_tol_*", false, true);
458+
registerIntOption_("min_len", "<int>", min_len, "Minimum peptide length (in amino acids). Default: 5", false, true);
459+
registerIntOption_("max_len", "<int>", max_len, "Maximum peptide length (in amino acids). Default: 50", false, true);
460+
registerIntOption_("missed_cleavages", "<int>", missed_cleavages, "Maximum number of missed enzymatic cleavages to allow in peptide generation. Default: 2", false, true);
461+
registerDoubleOption_("fragment_min_mz", "<double>", fragment_min_mz, "Minimum fragment m/z to consider. Fragment ions below this m/z will be ignored. Default: 200.0", false, true);
462+
registerDoubleOption_("fragment_max_mz", "<double>", fragment_max_mz, "Maximum fragment m/z to consider. Fragment ions above this m/z will be ignored. Default: 2000.0", false, true);
463+
registerDoubleOption_("peptide_min_mass", "<double>", peptide_min_mass, "Minimum monoisotopic peptide mass to consider for in silico digestion. Peptides below this mass will be excluded from the search database. Default: 500.0", false, true);
464+
registerDoubleOption_("peptide_max_mass", "<double>", peptide_max_mass, "Maximum monoisotopic peptide mass to consider for in silico digestion. Peptides above this mass will be excluded from the search database. Default: 5000.0", false, true);
465+
registerIntOption_("min_ion_index", "<int>", min_ion_index, "Minimum ion index to consider for preliminary scoring. This parameter controls which fragment ions are used in preliminary scoring. Default: 2 (skips b1/b2/y1/y2 ions, which are often missing or unreliable). Setting this to 1 would only skip b1/y1 ions. Does not affect the final scoring of PSMs.", false, true);
466+
registerIntOption_("max_variable_mods", "<int>", max_variable_mods, "Maximum number of variable modifications allowed per peptide. Default: 2", false, true);
467+
registerStringOption_("isotope_error_range", "<start,end>", isotope_errors, "Range of C13 isotope errors to consider for precursor matching, specified as 'start,end' (e.g., '-1,3'). For a range of '-1,3', Sage will consider all isotope errors from -1 to +3 (i.e., -1, 0, 1, 2, 3). This is useful when the monoisotopic peak may not be selected. Can include negative values. Default: '-1,3'. Note: Searching with isotope errors is slower than using a wider precursor tolerance.", false, true);
468+
registerStringOption_("charges", "<start,end>", charges_if_not_annotated, "Range of precursor charge states to consider if not annotated in the file, specified as 'start,end' (e.g., '2,5'). For a range of '2,5', Sage will consider charge states 2, 3, 4, and 5. This is only used when charge state information is missing from the input file. Default: '2,5'"
470469
, false, true);
471470

472471

@@ -486,13 +485,13 @@ class TOPPSageAdapter :
486485

487486
//FDR and misc
488487

489-
registerDoubleOption_("q_value_threshold", "<double>", 1, "The FDR threshhold for filtering peptides", false, false);
490-
registerStringOption_("annotate_matches", "<bool>", "true", "If the matches should be annotated (default: false),", false, false);
491-
registerStringOption_("deisotope", "<bool>", "false", "Sets deisotope option (true or false), default: false", false, false );
492-
registerStringOption_("chimera", "<bool>", "false", "Sets chimera option (true or false), default: false", false, false );
493-
registerStringOption_("predict_rt", "<bool>", "false", "Sets predict_rt option (true or false), default: false", false, false );
494-
registerStringOption_("wide_window", "<bool>", "false", "Sets wide_window option (true or false), default: false", false, false);
495-
registerStringOption_("smoothing", "<bool>", "true", "Should the PTM histogram be smoothed and local maxima be picked. If false, uses raw data, default: false", false, false);
488+
registerDoubleOption_("q_value_threshold", "<double>", 1, "The FDR (False Discovery Rate) threshold for filtering peptides. PSMs with q-values above this threshold will be excluded. Default: 1 (no filtering)", false, false);
489+
registerStringOption_("annotate_matches", "<bool>", "true", "Whether fragment ion matches should be annotated in the output. This provides additional information about which theoretical ions matched experimental peaks. Default: true", false, false);
490+
registerStringOption_("deisotope", "<bool>", "false", "Perform deisotoping and charge state deconvolution on MS2 spectra. Recommended for high-resolution MS2 data. May interfere with TMT-MS2 quantification. Default: false", false, false );
491+
registerStringOption_("chimera", "<bool>", "false", "Enable chimeric spectra search mode. When enabled, multiple peptide identifications can be reported for each MS2 scan, useful for co-fragmenting peptides. Default: false", false, false );
492+
registerStringOption_("predict_rt", "<bool>", "false", "Use retention time prediction model as a feature for machine learning scoring. Note: This is incompatible with label-free quantification (LFQ). Default: false", false, false );
493+
registerStringOption_("wide_window", "<bool>", "false", "Enable wide-window/DIA search mode. When enabled, the precursor_tol parameter is ignored and a dynamic precursor tolerance is used. Default: false", false, false);
494+
registerStringOption_("smoothing", "<bool>", "true", "Whether to smooth the PTM (post-translational modification) mass histogram and pick local maxima. If false, uses raw histogram data. Default: true", false, false);
496495
registerIntOption_("threads", "<int>", 1, "Amount of threads available to the program", false, false);
497496

498497
// register peptide indexing parameter (with defaults for this search engine)
@@ -506,6 +505,38 @@ class TOPPSageAdapter :
506505
// parsing parameters
507506
//-------------------------------------------------------------
508507

508+
// Validate tolerance parameters
509+
double precursor_tol_left = getDoubleOption_("precursor_tol_left");
510+
double precursor_tol_right = getDoubleOption_("precursor_tol_right");
511+
double fragment_tol_left = getDoubleOption_("fragment_tol_left");
512+
double fragment_tol_right = getDoubleOption_("fragment_tol_right");
513+
514+
// Warn if tolerance parameters seem incorrect
515+
if (precursor_tol_left > 0)
516+
{
517+
OPENMS_LOG_WARN << "WARNING: precursor_tol_left is positive (" << precursor_tol_left << "). "
518+
<< "This parameter is used to reduce the experimental mass, so it should typically be negative. "
519+
<< "A positive value will likely produce an incorrect search window." << std::endl;
520+
}
521+
if (precursor_tol_right < 0)
522+
{
523+
OPENMS_LOG_WARN << "WARNING: precursor_tol_right is negative (" << precursor_tol_right << "). "
524+
<< "This parameter is ADDED to the experimental mass, so it should typically be positive. "
525+
<< "A negative value will likely produce an incorrect search window." << std::endl;
526+
}
527+
if (fragment_tol_left > 0)
528+
{
529+
OPENMS_LOG_WARN << "WARNING: fragment_tol_left is positive (" << fragment_tol_left << "). "
530+
<< "This parameter is used to reduce the experimental mass, so it should typically be negative. "
531+
<< "A positive value will likely produce an incorrect search window." << std::endl;
532+
}
533+
if (fragment_tol_right < 0)
534+
{
535+
OPENMS_LOG_WARN << "WARNING: fragment_tol_right is negative (" << fragment_tol_right << "). "
536+
<< "This parameter is ADDED to the experimental mass, so it should typically be positive. "
537+
<< "A negative value will likely produce an incorrect search window." << std::endl;
538+
}
539+
509540
// do this early, to see if Sage is installed
510541
String sage_executable = getStringOption_("sage_executable");
511542
std::cout << sage_executable << " sage executable" << std::endl;

0 commit comments

Comments
 (0)