From 76a601de7a6a74c64dcb0e38e151b95ba6d7a7a4 Mon Sep 17 00:00:00 2001 From: shafighi Date: Mon, 1 Jun 2026 15:44:21 +0200 Subject: [PATCH] Add pre-flight BAM integrity check before scAbsolute calling New validate_bams rule runs samtools quickcheck on every BAM in the sample sheet before any qc / scale_scAbsolute job is scheduled. If any file is truncated or otherwise unreadable, the workflow aborts up front with the full list of bad files so the user can fix them all in one pass (typically by re-copying from the source) rather than discovering failures one cell at a time deep into a multi-hour run. Failures are recorded in the SAME canonical CSV used by the downstream combine / merge step: results//__failed_cells.csv with a new failure_reason value "truncated_bam" alongside the existing "missing_output" and "process_crash". One file, one schema, one place to look regardless of which stage caught the failure. Also updates README.md to document the failure_reason vocabulary and recommends running snakemake with --keep-going so that single-cell failures later in the run do not abort the entire batch. --- README.md | 32 +++++++++++++++++ workflow/Snakefile_absolute | 5 +-- workflow/rules/qc.smk | 3 +- workflow/rules/validate.smk | 70 +++++++++++++++++++++++++++++++++++++ 4 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 workflow/rules/validate.smk diff --git a/README.md b/README.md index d35b3fd..0fee48a 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,38 @@ This command should automatically bind your current working directory into the c this repository's code and data which it needs run scAbsolute. If errors are generated related to lacking access to needed code or scripts this problem could be resolved by appending ` -B ${PWD}:${PWD}` to `--singularity-args` in the above command. +We also recommend passing `--keep-going` so that a single-cell failure mid-run +does not abort the whole batch. + +### Failure handling and the failed_cells.csv report + +The workflow runs a pre-flight `samtools quickcheck` against every BAM listed +in your sample sheet before any copy-number calling starts. If any BAM is +truncated or otherwise unreadable, the workflow aborts immediately with the +full list of offending files, so you can fix them all in one pass (typically +by re-copying from the source filesystem) instead of discovering failures one +cell at a time deep into a multi-hour run. + +All cell failures across the whole pipeline are recorded in a single canonical +CSV: + +``` +results//__failed_cells.csv +``` + +Schema: `name,failure_reason`. Possible `failure_reason` values are: + +| Value | Stage | Meaning | +|---|---|---| +| `truncated_bam` | pre-flight `validate_bams` | `samtools quickcheck` rejected the BAM (most often an incomplete copy) | +| `missing_output` | `combine` | The `.rds` for this cell was never produced | +| `process_crash` | `combine` | The `.rds` exists but is empty / unreadable | +| (other) | `scAbsolute` | Per-cell QC failures recorded by `scAbsolute` (low coverage, fit failure, etc.) | + +After a validation failure, the workflow stops with no `.rds` results. After a +successful validation but partial downstream failure, the CSV lists only the +cells that failed in scAbsolute / combine — everything else has a result. + ### QC analysis Please take the time to analyze the data (the qc script to be used for this step is available at scripts/qc-script.R) diff --git a/workflow/Snakefile_absolute b/workflow/Snakefile_absolute index 69120f7..824a12a 100644 --- a/workflow/Snakefile_absolute +++ b/workflow/Snakefile_absolute @@ -35,15 +35,16 @@ def helper_dirname(wildcards): return os.path.join("results", str(config["binSize"]), filename) +include: "rules/validate.smk" include: "rules/qc.smk" include: "rules/scale_scAbsolute.smk" include: "rules/combine.smk" if config["estimateReadDensity"]: include: "rules/density.smk" - ruleorder: qc > density > scale_scAbsolute > combine + ruleorder: validate_bams > qc > density > scale_scAbsolute > combine else: - ruleorder: qc > scale_scAbsolute > combine + ruleorder: validate_bams > qc > scale_scAbsolute > combine rule all: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index cf1e4cd..bd8095d 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -1,6 +1,7 @@ rule qc: input: - bam="data/aligned/" + str(config["sampleName"]) + "/{sample}.bam" + bam="data/aligned/" + str(config["sampleName"]) + "/{sample}.bam", + validated="data/aligned/" + str(config["sampleName"]) + "/.bam_validation.passed" output: flagstat="data/aligned/" + str(config["sampleName"]) + "/{sample}.flagstat", bai="data/aligned/" + str(config["sampleName"]) + "/{sample}.bam.bai" diff --git a/workflow/rules/validate.smk b/workflow/rules/validate.smk new file mode 100644 index 0000000..d6cdf5a --- /dev/null +++ b/workflow/rules/validate.smk @@ -0,0 +1,70 @@ +rule validate_bams: + """Pre-flight BAM integrity check. Runs samtools quickcheck on every BAM + listed in the sample sheet and fails fast with a full list of bad files + before any qc / scale_scAbsolute job is scheduled. Common failure mode + this catches: BAMs that were truncated during copy from the source path. + + On failure, writes the bad cells into the SAME failed_cells.csv that the + combine / merge step uses downstream, with failure_reason = "truncated_bam". + This gives a single canonical place to look for any failed cell across the + whole pipeline (truncated input, missing output, process crash, or QC + failure during scAbsolute). + """ + input: + bams=expand("data/aligned/" + str(config["sampleName"]) + "/{sample}.bam", + sample=SAMPLE_FILES) + output: + report="data/aligned/" + str(config["sampleName"]) + "/.bam_validation.passed" + params: + failed_csv="results/" + str(config["binSize"]) + "/" + str(config["sampleName"]) + "_" + str(config["binSize"]) + "_failed_cells.csv" + container: + config["qc_img"] + message: + "Pre-flight BAM integrity check (samtools quickcheck)" + threads: + 1 + shell: + r""" + set -uo pipefail + bad_tmp=$(mktemp) + n_total=0 + n_bad=0 + for bam in {input.bams}; do + n_total=$((n_total+1)) + if ! samtools quickcheck "$bam" 2>/dev/null; then + # Strip .bam extension to match cell-name convention used downstream + base=$(basename "$bam" .bam) + echo "$base" >> "$bad_tmp" + n_bad=$((n_bad+1)) + fi + done + + if [ "$n_bad" -gt 0 ]; then + # Write failed cells into the canonical failed_cells.csv (same file + # merge.R uses) so the user has ONE place to look for failures. + mkdir -p "$(dirname {params.failed_csv})" + {{ + echo '"name","failure_reason"' + while IFS= read -r cell; do + printf '"%s","truncated_bam"\n' "$cell" + done < "$bad_tmp" + }} > {params.failed_csv} + + echo "" >&2 + echo "============================================================" >&2 + echo "ERROR: $n_bad / $n_total BAM file(s) failed integrity check:" >&2 + echo "============================================================" >&2 + cat "$bad_tmp" >&2 + echo "============================================================" >&2 + echo "Full failure report written to:" >&2 + echo " {params.failed_csv}" >&2 + echo "(same CSV that downstream failures use; failure_reason = truncated_bam)" >&2 + echo "Common cause: incomplete copy from source filesystem." >&2 + echo "Action: re-copy listed files from source and re-run." >&2 + echo "============================================================" >&2 + rm -f "$bad_tmp" + exit 1 + fi + rm -f "$bad_tmp" + echo "All $n_total BAMs passed samtools quickcheck on $(date)" > {output.report} + """