Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a0f3473
updated to use tmpdir
erflynn Jun 27, 2025
a09f110
addeded to RRNA step
erflynn Jun 27, 2025
c081bbc
added TMPDIR piece for RRNA removal step
erflynn Jun 27, 2025
a01d023
modify dynamic memory allocation to allocate a minimum of 8Gbs to pre…
AlaaALatif Jul 1, 2025
cae5acf
fixes minor instruction errors to run and resume pipeline
AlaaALatif Jul 1, 2025
06204ff
change dynamic mem allocation to account for empty files
AlaaALatif Jul 10, 2025
9b2bc61
added TMPDIR piece for STAR align step
AlaaALatif Jul 14, 2025
cc37332
fix bug with tmp dir getting permission denied errors
AlaaALatif Jul 17, 2025
e8a338e
Publish results from adapter trimming and ribosomal rna removal to fa…
AlaaALatif Jul 30, 2025
e17c055
initialize snakemake rules for bulk RNAseq pipeline
AlaaALatif Sep 16, 2025
87ea985
initialize snakemake Snakefile
AlaaALatif Sep 16, 2025
6cba518
initialize snakemake configuration files
AlaaALatif Sep 16, 2025
25e64bc
initialize snakemake executable bash script
AlaaALatif Sep 16, 2025
4c31aa0
Fixes bug with mounting home dirs
AlaaALatif Sep 16, 2025
2fa62ef
remove conda statement from all rules
AlaaALatif Sep 16, 2025
bf46d9b
fixes based on c4 testing
AlaaALatif Sep 16, 2025
2d9b77b
fixes list concat bug
AlaaALatif Sep 16, 2025
5dcb986
updates to Snakefile
AlaaALatif Sep 16, 2025
c4696c8
Fix formatting issues
AlaaALatif Sep 19, 2025
d7a2aa7
Fix to handle empty input reads gracefully, without errors
AlaaALatif Sep 19, 2025
e1f3314
Fix to handle empty input reads gracefully, without errors
AlaaALatif Sep 19, 2025
af67f0c
Modify pipeline to accommodate params.call_snps, optional argument to…
AlaaALatif Sep 19, 2025
868eff8
Added params.call_snps to specify whether or not to skip SNP calling …
AlaaALatif Sep 19, 2025
f76c484
update for debugging
AlaaALatif Sep 22, 2025
41cb267
initialize configuration file for Snakemake pipeline
AlaaALatif Sep 25, 2025
c662df2
Fixes syntax error with kallisto_paired for empty file inputs
AlaaALatif Sep 25, 2025
1ceffa0
Incorporates /dscolab into bind mounts
AlaaALatif Sep 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
491 changes: 491 additions & 0 deletions bulk_RNASeq/Snakefile

Large diffs are not rendered by default.

100 changes: 29 additions & 71 deletions bulk_RNASeq/bulk_rna_seq.nf
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ workflow {
// CUSTOM_MERGE_COUNTS (
// counts
// )
// === SNP Calling branch: run only if requested ===
if (params.call_snps) { // <- ### ADDED

//
// SUBWORKFLOW: Align FastQ reads; sort, and index BAM files
//
Expand All @@ -169,6 +172,7 @@ workflow {
ch_star_multiqc = ALIGN_READS.out.log_final
ch_reports = ch_reports.mix(ALIGN_READS.out.log_final.map{it[1]}.ifEmpty([]))
ch_star_bam_bai = ch_star_bam.join(ch_star_bai, by: [0])

//
// SUBWORKFLOW: Mark duplicate reads
//
Expand All @@ -192,6 +196,7 @@ workflow {
ch_reports = ch_reports.mix(BAM_MARKDUPLICATES_PICARD.out.stats.map{it[1]}.ifEmpty([]))
ch_reports = ch_reports.mix(BAM_MARKDUPLICATES_PICARD.out.metrics.map{it[1]}.ifEmpty([]))
ch_genome_bam_bai = ch_genome_bam.join(ch_genome_bai, by: [0])

//
// MODULE: SplitNCigarReads and reassign mapping qualities
//
Expand All @@ -205,6 +210,7 @@ workflow {
)
ch_split_bam = GATK4_SPLITNCIGARREADS.out.bam
ch_split_bai = GATK4_SPLITNCIGARREADS.out.bai

//
// MODULE: Base Recalibration table generation
//
Expand All @@ -220,6 +226,7 @@ workflow {
)
ch_recal_table = GATK4_BASE_RECALIBRATOR.out.table
ch_reports = ch_reports.mix(ch_recal_table.map{ meta, table -> table})

//
// MODULE: Apply BQSR using recalibration table, then index
//
Expand All @@ -238,6 +245,7 @@ workflow {
)
ch_bam_variant_calling = GATK4_APPLY_BQSR.out.bam
ch_bai_variant_calling = SAMTOOLS_INDEX_BQSR.out.bai

//
// MODULE: Call SNPs and Indels using HaplotypeCaller
//
Expand All @@ -255,6 +263,7 @@ workflow {
ch_haplotype_vcf = GATK4_HAPLOTYPECALLER.out.vcf
ch_haplotype_tbi = GATK4_HAPLOTYPECALLER.out.tbi
ch_haplotype_vcf_tbi = ch_haplotype_vcf.join(ch_haplotype_tbi, by: [0])

//
// MODULE: Filter variants using VariantFiltration
//
Expand All @@ -271,11 +280,12 @@ workflow {
// MODULE: Convert VCF contigs to desired naming format (e.g. ucsc)
//
BCFTOOLS_CONTIG_CONVERSION (
ch_filtered_vcf,
params.contig_format_map
ch_filtered_vcf,
params.contig_format_map
)
ch_filtered_vcf = BCFTOOLS_CONTIG_CONVERSION.out.formatted_vcf
}

//
// MODULE: Sort and index VCFs
//
Expand All @@ -284,48 +294,18 @@ workflow {
ch_filtered_vcf
)
ch_sorted_vcf = BCFTOOLS_SORT_VCF.out.sorted_vcf

//
// MODULE: Index VCFs
//
ch_vcf_index = Channel.empty()
BCFTOOLS_INDEX_VCF (
ch_sorted_vcf
)
// ch_sorted_vcf = BCFTOOLS_INDEX_VCF.out.sorted_vcf
ch_vcf_index = BCFTOOLS_INDEX_VCF.out.vcf_index
ch_vcf = ch_sorted_vcf.join(ch_vcf_index, by: [0])
// Collect all VCFs and index files from upstream process
// meta = ch_vcf
// .map { tuple -> tuple[0]}
// .collect()
// vcfs = ch_vcf
// .map { tuple -> tuple[1]}
// .collect()
// tbis = ch_vcf
// .map { tuple -> tuple[2]}
// .collect()
// //
// // MODULE: Merge VCFs
// //
// BCFTOOLS_MERGE_VCF (
// meta,
// vcfs,
// tbis
// )
// Collect VCFs and TBIs, filtering out any nulls or missing files
// Filter ch_vcf to samples with both VCF and TBI files
// ch_vcf_success = ch_vcf.filter { meta, vcf, tbi -> vcf && tbi }

// // Collect the VCF files into a list within a channel
// ch_vcf_lists = ch_vcf_success
// .collect()
// .map { vcf_tuples ->
// def metaList = vcf_tuples.collect { it[0] }
// def vcfList = vcf_tuples.collect { it[1] }
// def tbiList = vcf_tuples.collect { it[2] }
// return [ metaList, vcfList, tbiList ]
// }
// Split the combined channel into three separate channels
// Merge samples
meta = ch_vcf
.map { tuple -> tuple[0]}
.collect()
Expand All @@ -335,42 +315,20 @@ workflow {
tbis = ch_vcf
.map { tuple -> tuple[2]}
.collect()
// Now, invoke the process outside of any closure
BCFTOOLS_MERGE_VCF(
meta,
vcfs,
tbis
)
// // Filter ch_vcf to samples with both VCF and TBI files
// ch_vcf_success = ch_vcf.filter { meta, vcf, tbi -> vcf && tbi }

// // Collect the VCF files into a list
// vcf_list = ch_vcf_success
// .map { meta, vcf, tbi -> vcf }
// .collect()
BCFTOOLS_MERGE_VCF(
meta,
vcfs,
tbis
)
} // <--- END IF

// // Subscribe to the vcf_list when it's ready
// vcf_list.subscribe { list ->
// if (!list.isEmpty()) {
// BCFTOOLS_MERGE_VCF(list)
// } else {
// println "No VCF files to merge."
// }
// }
//
// MODULE: Generate QC reports using MULTIQC
//
// After correcting all instances, you can now filter and use ch_reports
// ch_multiqc_files = ch_reports.filter { it.exists() }
// MULTIQC(ch_reports)
// multiqc_report = MULTIQC.out.report.toList()
// ch_multiqc_files = ch_reports
// .filter { it.exists() }
// MULTIQC (ch_multiqc_files.collect())
// multiqc_report = MULTIQC.out.report.toList()
ch_multiqc_files = Channel
.empty()
.mix(ch_reports.collect())
MULTIQC (ch_multiqc_files.collect())
multiqc_report = MULTIQC.out.report.toList()
//
// MODULE: Generate QC reports using MULTIQC
//
// Always run MultiQC after quant and, conditionally, SNP calling
ch_multiqc_files = Channel
.empty()
.mix(ch_reports.collect())
MULTIQC (ch_multiqc_files.collect())
multiqc_report = MULTIQC.out.report.toList()
}
2 changes: 2 additions & 0 deletions bulk_RNASeq/config/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ profiles {
process.executor = 'slurm'
executor.queueSize = 60
process.cache = 'lenient'
process.scratch = true
trace.enabled = true
trace.taskMemory = true
withLabel: 'per_sample' {
Expand All @@ -23,6 +24,7 @@ profiles {
process.cache = 'lenient'
process.executor = 'sge'
process.penv = 'smp'
process.scratch = true
clusterOptions = '-S /bin/bash'
withLabel: 'per_sample' {
errorStrategy = 'finish'
Expand Down
5 changes: 3 additions & 2 deletions bulk_RNASeq/config/parameters.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ params {
gatk_vf_qd_filter = 1.0
umitools_dedup_stats= false
filter_rrna = true
call_snps = false
format_contigs = false
adapter_sequence_1 = "CTGTCTCTTATACACATCT"
adapter_sequence_2 = "CTGTCTCTTATACACATCT"
adapter_sequence_1 = ""
adapter_sequence_2 = ""

// STAR custom arguments for SNP calling sensitivity
star_outfilter_mismatch_n_over_lmax = 0.07
Expand Down
38 changes: 38 additions & 0 deletions bulk_RNASeq/config/parameters.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Pipeline-wide parameters (analogous to Nextflow parameters.config)
salmon_quant_libtype: null
fragment_length_mean: 200
fragment_length_std: 20
gtf_extra_attributes: "gene_name"
gtf_group_features: "gene_id"
gatk_vf_cluster_size: 3
gatk_vf_window_size: 35
gatk_vf_fs_filter: 60.0
gatk_vf_qd_filter: 1.0
umitools_dedup_stats: false
filter_rrna: true
format_contigs: false
adapter_sequence_1: "CTGTCTCTTATACACATCT"
adapter_sequence_2: "CTGTCTCTTATACACATCT"

# STAR sensitivity
star_outfilter_mismatch_n_over_lmax: 0.07
star_align_sjoverhang_min: 8
star_outfilter_multimap_nmax: 50
star_seed_search_start_lmax: 30
star_additional: "--outSAMattributes NH HI AS nM XS"

# GATK HaplotypeCaller RNA-seq params
gatk_dont_use_soft_clipped_bases: true
gatk_standard_min_confidence: 10
gatk_min_pruning: 1
gatk_recover_all_dangling_branches: true
gatk_allow_nonunique_kmer: true
gatk_max_mnp_distance: 0

# VariantFiltration thresholds
gatk_vf_cluster_size: 3
gatk_vf_window_size: 35
gatk_vf_fs_filter: 60.0
gatk_vf_qd_filter: 1.0

emit_unfiltered_vcf: true
14 changes: 14 additions & 0 deletions bulk_RNASeq/config/references/hg38_p13.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Reference set (matches hg38_p13_references.config)
reference_directory: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq"
genome: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39_GRCh38.p13_genomic.fna"
genome_idx: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39_GRCh38.p13_genomic.fna.fai"
genome_dict: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39_GRCh38.p13_genomic.dict"
genome_dir: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/star_index"
gtf: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39_GRCh38.p13_genomic.gtf"
transcript_fasta: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/rsem_genome.transcripts.fa"
transcript_index: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/kallisto_index"
dbsnp: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39.gz"
dbsnp_tbi: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39.gz.tbi"
gene_mapper: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39.gz.tbi"
contig_format_map: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/hg38.p13.chromAlias.txt"
rrna_db_file: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/rrna_db.txt"
14 changes: 14 additions & 0 deletions bulk_RNASeq/config/references/hg38_p14.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Example alternative reference set (matches references.config)
reference_directory: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq"
genome: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40_GRCh38.p14_genomic.fna"
genome_idx: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40_GRCh38.p14_genomic.fna.fai"
genome_dict: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40_GRCh38.p14_genomic.dict"
genome_dir: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/star_index"
gtf: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/genomic.gtf"
transcript_fasta: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/rsem_genome.transcripts.fa"
transcript_index: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/kallisto_index"
dbsnp: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40.gz"
dbsnp_tbi: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40.gz.tbi"
gene_mapper: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40.gz.tbi"
contig_format_map: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/chromAlias.txt"
rrna_db_file: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/rrna_db.txt"
22 changes: 22 additions & 0 deletions bulk_RNASeq/config/snakemake_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# -------------------------------------------------------------------------
# Configuration for the Snakemake “light” RNA-seq pipeline
# -------------------------------------------------------------------------
# All paths are absolute so that Snakemake sees the same files inside/outside
# the Singularity container (the host directories are bind-mounted).
# -------------------------------------------------------------------------

input_sample_sheet: "/krummellab/data1/alaa/data/tests/bulk_rnaseq/pipeline_tests/merlin_round2_test4_snakemake/sample_sheet.csv"
results_directory : "/krummellab/data1/alaa/data/tests/bulk_rnaseq/pipeline_tests/merlin_round2_test4_snakemake"

# Kallisto index (same as Nextflow ‘params.transcript_index’)
transcript_index : "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/kallisto_index"

# rRNA database list file (one FASTA path per line)
rrna_db_file : "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/rrna_db.txt"

# ---- optional parameters -------------------------------------------------
filter_rrna : true
fragment_length_mean : 200
fragment_length_std : 20
adapter_sequence_1 : ""
adapter_sequence_2 : ""
6 changes: 6 additions & 0 deletions bulk_RNASeq/config/user.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# User-level settings (override as needed)
reference_profile: "hg38_p13"

tmp_dir: "/c4/scratch/${USER}"
results_directory: "/krummellab/data1/alaa/data/tests/bulk_rnaseq/pipeline_tests/emily_mini_dataset7_p13"
input_sample_sheet: "/krummellab/data1/alaa/data/tests/bulk_rnaseq/pipeline_tests/emily_mini_dataset7_p13/sample_sheet.csv"
3 changes: 2 additions & 1 deletion bulk_RNASeq/modules/bcftools_sort_vcf.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ process BCFTOOLS_SORT_VCF {
fileSize = vcf.size() / (1024 * 1024 * 1024)
return 1.GB + (1.GB * fileSize * 0.01)
}
containerOptions "-B /scratch/"

input:
tuple val(meta), path(vcf)
Expand All @@ -24,7 +25,7 @@ process BCFTOOLS_SORT_VCF {
"""
bcftools sort \\
--output ${prefix}.sorted.vcf.gz -Oz \\
--temp-dir \$PWD \\
--temp-dir \$TMPDIR/ \\
$vcf
"""
}
8 changes: 5 additions & 3 deletions bulk_RNASeq/modules/fastp_trim_adapters.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ process FASTP_TRIM_ADAPTERS {
// File size in GB
fileSize = reads[0].size() / (1024 * 1024 * 1024)
}
if (fileSize > 5){
fileSize = 5
}
if (fileSize > 5){
fileSize = 5
}
return 10.GB * (1 + (fileSize * 2))
}
publishDir "${params.results_directory}/trimmed_reads", mode: 'copy'
containerOptions "-B /scratch/"

input:
tuple val(meta), path(reads)
Expand Down
4 changes: 3 additions & 1 deletion bulk_RNASeq/modules/gatk4_apply_bqsr.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ process GATK4_APPLY_BQSR {
return 1.GB + (2.GB * fileSize * 0.1)
}

containerOptions "-B /scratch/"

input:
tuple val(meta), path(input), path(input_index), path(bqsr_table)
path genome
Expand All @@ -30,7 +32,7 @@ process GATK4_APPLY_BQSR {
--output ${prefix}_bqsr.bam \\
--reference $genome \\
--bqsr-recal-file $bqsr_table \\
--tmp-dir \$PWD \\
--tmp-dir \$TMPDIR \\
$args
"""
}
5 changes: 4 additions & 1 deletion bulk_RNASeq/modules/gatk4_haplotype_caller.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ process GATK4_HAPLOTYPECALLER {
return 17.GB + (1.GB * fileSize * 3)
}

containerOptions "-B /scratch/"


input:
tuple val(meta), path(input), path(input_index)
path fasta
Expand Down Expand Up @@ -41,7 +44,7 @@ process GATK4_HAPLOTYPECALLER {
--output ${prefix}.vcf.gz \\
$reference_command \\
$dbsnp_command \\
--tmp-dir \$PWD \\
--tmp-dir \$TMPDIR \\
$soft_clipped \\
$min_conf \\
$min_pruning \\
Expand Down
Loading