diff --git a/README.md b/README.md index d35b3fd..0fee48a 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,38 @@ This command should automatically bind your current working directory into the c this repository's code and data which it needs run scAbsolute. If errors are generated related to lacking access to needed code or scripts this problem could be resolved by appending ` -B ${PWD}:${PWD}` to `--singularity-args` in the above command. +We also recommend passing `--keep-going` so that a single-cell failure mid-run +does not abort the whole batch. + +### Failure handling and the failed_cells.csv report + +The workflow runs a pre-flight `samtools quickcheck` against every BAM listed +in your sample sheet before any copy-number calling starts. If any BAM is +truncated or otherwise unreadable, the workflow aborts immediately with the +full list of offending files, so you can fix them all in one pass (typically +by re-copying from the source filesystem) instead of discovering failures one +cell at a time deep into a multi-hour run. + +All cell failures across the whole pipeline are recorded in a single canonical +CSV: + +``` +results//__failed_cells.csv +``` + +Schema: `name,failure_reason`. Possible `failure_reason` values are: + +| Value | Stage | Meaning | +|---|---|---| +| `truncated_bam` | pre-flight `validate_bams` | `samtools quickcheck` rejected the BAM (most often an incomplete copy) | +| `missing_output` | `combine` | The `.rds` for this cell was never produced | +| `process_crash` | `combine` | The `.rds` exists but is empty / unreadable | +| (other) | `scAbsolute` | Per-cell QC failures recorded by `scAbsolute` (low coverage, fit failure, etc.) | + +After a validation failure, the workflow stops with no `.rds` results. After a +successful validation but partial downstream failure, the CSV lists only the +cells that failed in scAbsolute / combine — everything else has a result. + ### QC analysis Please take the time to analyze the data (the qc script to be used for this step is available at scripts/qc-script.R) diff --git a/workflow/Snakefile_absolute b/workflow/Snakefile_absolute index 69120f7..824a12a 100644 --- a/workflow/Snakefile_absolute +++ b/workflow/Snakefile_absolute @@ -35,15 +35,16 @@ def helper_dirname(wildcards): return os.path.join("results", str(config["binSize"]), filename) +include: "rules/validate.smk" include: "rules/qc.smk" include: "rules/scale_scAbsolute.smk" include: "rules/combine.smk" if config["estimateReadDensity"]: include: "rules/density.smk" - ruleorder: qc > density > scale_scAbsolute > combine + ruleorder: validate_bams > qc > density > scale_scAbsolute > combine else: - ruleorder: qc > scale_scAbsolute > combine + ruleorder: validate_bams > qc > scale_scAbsolute > combine rule all: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index cf1e4cd..bd8095d 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -1,6 +1,7 @@ rule qc: input: - bam="data/aligned/" + str(config["sampleName"]) + "/{sample}.bam" + bam="data/aligned/" + str(config["sampleName"]) + "/{sample}.bam", + validated="data/aligned/" + str(config["sampleName"]) + "/.bam_validation.passed" output: flagstat="data/aligned/" + str(config["sampleName"]) + "/{sample}.flagstat", bai="data/aligned/" + str(config["sampleName"]) + "/{sample}.bam.bai" diff --git a/workflow/rules/validate.smk b/workflow/rules/validate.smk new file mode 100644 index 0000000..d6cdf5a --- /dev/null +++ b/workflow/rules/validate.smk @@ -0,0 +1,70 @@ +rule validate_bams: + """Pre-flight BAM integrity check. Runs samtools quickcheck on every BAM + listed in the sample sheet and fails fast with a full list of bad files + before any qc / scale_scAbsolute job is scheduled. Common failure mode + this catches: BAMs that were truncated during copy from the source path. + + On failure, writes the bad cells into the SAME failed_cells.csv that the + combine / merge step uses downstream, with failure_reason = "truncated_bam". + This gives a single canonical place to look for any failed cell across the + whole pipeline (truncated input, missing output, process crash, or QC + failure during scAbsolute). + """ + input: + bams=expand("data/aligned/" + str(config["sampleName"]) + "/{sample}.bam", + sample=SAMPLE_FILES) + output: + report="data/aligned/" + str(config["sampleName"]) + "/.bam_validation.passed" + params: + failed_csv="results/" + str(config["binSize"]) + "/" + str(config["sampleName"]) + "_" + str(config["binSize"]) + "_failed_cells.csv" + container: + config["qc_img"] + message: + "Pre-flight BAM integrity check (samtools quickcheck)" + threads: + 1 + shell: + r""" + set -uo pipefail + bad_tmp=$(mktemp) + n_total=0 + n_bad=0 + for bam in {input.bams}; do + n_total=$((n_total+1)) + if ! samtools quickcheck "$bam" 2>/dev/null; then + # Strip .bam extension to match cell-name convention used downstream + base=$(basename "$bam" .bam) + echo "$base" >> "$bad_tmp" + n_bad=$((n_bad+1)) + fi + done + + if [ "$n_bad" -gt 0 ]; then + # Write failed cells into the canonical failed_cells.csv (same file + # merge.R uses) so the user has ONE place to look for failures. + mkdir -p "$(dirname {params.failed_csv})" + {{ + echo '"name","failure_reason"' + while IFS= read -r cell; do + printf '"%s","truncated_bam"\n' "$cell" + done < "$bad_tmp" + }} > {params.failed_csv} + + echo "" >&2 + echo "============================================================" >&2 + echo "ERROR: $n_bad / $n_total BAM file(s) failed integrity check:" >&2 + echo "============================================================" >&2 + cat "$bad_tmp" >&2 + echo "============================================================" >&2 + echo "Full failure report written to:" >&2 + echo " {params.failed_csv}" >&2 + echo "(same CSV that downstream failures use; failure_reason = truncated_bam)" >&2 + echo "Common cause: incomplete copy from source filesystem." >&2 + echo "Action: re-copy listed files from source and re-run." >&2 + echo "============================================================" >&2 + rm -f "$bad_tmp" + exit 1 + fi + rm -f "$bad_tmp" + echo "All $n_total BAMs passed samtools quickcheck on $(date)" > {output.report} + """