UCSF-DSCOLAB · AlaaALatif · Jun 27, 2025 · Jun 27, 2025 · Jun 27, 2025 · Jul 1, 2025
diff --git a/bulk_RNASeq/Snakefile b/bulk_RNASeq/Snakefile
diff --git a/bulk_RNASeq/bulk_rna_seq.nf b/bulk_RNASeq/bulk_rna_seq.nf
@@ -145,6 +145,9 @@ workflow {
     // CUSTOM_MERGE_COUNTS (
     //     counts
     // )
+    // === SNP Calling branch: run only if requested ===
+if (params.call_snps) {   // <- ### ADDED
+
     //
     // SUBWORKFLOW: Align FastQ reads; sort, and index BAM files
     //
@@ -169,6 +172,7 @@ workflow {
     ch_star_multiqc  = ALIGN_READS.out.log_final
     ch_reports = ch_reports.mix(ALIGN_READS.out.log_final.map{it[1]}.ifEmpty([]))
     ch_star_bam_bai = ch_star_bam.join(ch_star_bai, by: [0])
+
     //
     // SUBWORKFLOW: Mark duplicate reads
     //
@@ -192,6 +196,7 @@ workflow {
     ch_reports = ch_reports.mix(BAM_MARKDUPLICATES_PICARD.out.stats.map{it[1]}.ifEmpty([]))
     ch_reports = ch_reports.mix(BAM_MARKDUPLICATES_PICARD.out.metrics.map{it[1]}.ifEmpty([]))
     ch_genome_bam_bai = ch_genome_bam.join(ch_genome_bai, by: [0])
+
     //
     // MODULE: SplitNCigarReads and reassign mapping qualities
     //
@@ -205,6 +210,7 @@ workflow {
     )
     ch_split_bam = GATK4_SPLITNCIGARREADS.out.bam
     ch_split_bai = GATK4_SPLITNCIGARREADS.out.bai
+
     //
     // MODULE: Base Recalibration table generation
     //
@@ -220,6 +226,7 @@ workflow {
     )
     ch_recal_table = GATK4_BASE_RECALIBRATOR.out.table
     ch_reports = ch_reports.mix(ch_recal_table.map{ meta, table -> table})
+
     //
     // MODULE: Apply BQSR using recalibration table, then index
     //
@@ -238,6 +245,7 @@ workflow {
     )
     ch_bam_variant_calling = GATK4_APPLY_BQSR.out.bam
     ch_bai_variant_calling = SAMTOOLS_INDEX_BQSR.out.bai
+
     //
     // MODULE: Call SNPs and Indels using HaplotypeCaller
     //
@@ -255,6 +263,7 @@ workflow {
     ch_haplotype_vcf = GATK4_HAPLOTYPECALLER.out.vcf
     ch_haplotype_tbi = GATK4_HAPLOTYPECALLER.out.tbi
     ch_haplotype_vcf_tbi = ch_haplotype_vcf.join(ch_haplotype_tbi, by: [0])
+
     //
     // MODULE: Filter variants using VariantFiltration
     //
@@ -271,11 +280,12 @@ workflow {
         // MODULE: Convert VCF contigs to desired naming format (e.g. ucsc)
         //
         BCFTOOLS_CONTIG_CONVERSION (
-           ch_filtered_vcf,
-           params.contig_format_map
+        ch_filtered_vcf,
+        params.contig_format_map
         )
         ch_filtered_vcf = BCFTOOLS_CONTIG_CONVERSION.out.formatted_vcf
     }
+
     //
     // MODULE: Sort and index VCFs
     //
@@ -284,48 +294,18 @@ workflow {
         ch_filtered_vcf
     )
     ch_sorted_vcf = BCFTOOLS_SORT_VCF.out.sorted_vcf
+
     //
     // MODULE: Index VCFs
     //
     ch_vcf_index = Channel.empty()
     BCFTOOLS_INDEX_VCF (
         ch_sorted_vcf
     )
-    // ch_sorted_vcf = BCFTOOLS_INDEX_VCF.out.sorted_vcf
     ch_vcf_index = BCFTOOLS_INDEX_VCF.out.vcf_index
     ch_vcf = ch_sorted_vcf.join(ch_vcf_index, by: [0])
-    // Collect all VCFs and index files from upstream process
-    // meta = ch_vcf
-    // .map { tuple -> tuple[0]}
-    // .collect()
-    // vcfs = ch_vcf
-    // .map { tuple -> tuple[1]}
-    // .collect()
-    // tbis = ch_vcf
-    // .map { tuple -> tuple[2]}
-    // .collect()
-    // //
-    // // MODULE: Merge VCFs
-    // //
-    // BCFTOOLS_MERGE_VCF (
-    //     meta, 
-    //     vcfs, 
-    //     tbis
-    // )
-    // Collect VCFs and TBIs, filtering out any nulls or missing files
-    // Filter ch_vcf to samples with both VCF and TBI files
-    // ch_vcf_success = ch_vcf.filter { meta, vcf, tbi -> vcf && tbi }
 
-    // // Collect the VCF files into a list within a channel
-    // ch_vcf_lists = ch_vcf_success
-    //     .collect()
-    //     .map { vcf_tuples ->
-    //         def metaList = vcf_tuples.collect { it[0] }
-    //         def vcfList = vcf_tuples.collect { it[1] }
-    //         def tbiList = vcf_tuples.collect { it[2] }
-    //         return [ metaList, vcfList, tbiList ]
-    //     }
-    // Split the combined channel into three separate channels
+    // Merge samples
     meta = ch_vcf
         .map { tuple -> tuple[0]}
         .collect()
@@ -335,42 +315,20 @@ workflow {
     tbis = ch_vcf
         .map { tuple -> tuple[2]}
         .collect()
-    // Now, invoke the process outside of any closure
-    BCFTOOLS_MERGE_VCF( 
-        meta, 
-        vcfs, 
-        tbis 
-        )
-    // // Filter ch_vcf to samples with both VCF and TBI files
-    // ch_vcf_success = ch_vcf.filter { meta, vcf, tbi -> vcf && tbi }
-
-    // // Collect the VCF files into a list
-    // vcf_list = ch_vcf_success
-    //     .map { meta, vcf, tbi -> vcf }
-    //     .collect()
+    BCFTOOLS_MERGE_VCF(
+        meta,
+        vcfs,
+        tbis
+    )
+}   // <--- END IF
 
-    // // Subscribe to the vcf_list when it's ready
-    // vcf_list.subscribe { list ->
-    //     if (!list.isEmpty()) {
-    //         BCFTOOLS_MERGE_VCF(list)
-    //     } else {
-    //         println "No VCF files to merge."
-    //     }
-    // }
-    //
-    // MODULE: Generate QC reports using MULTIQC
-    //
-    // After correcting all instances, you can now filter and use ch_reports
-    // ch_multiqc_files = ch_reports.filter { it.exists() }
-    // MULTIQC(ch_reports)
-    // multiqc_report = MULTIQC.out.report.toList()
-    // ch_multiqc_files = ch_reports
-    //     .filter { it.exists() }
-    // MULTIQC (ch_multiqc_files.collect())
-    // multiqc_report = MULTIQC.out.report.toList()
-    ch_multiqc_files = Channel
-                            .empty()
-                            .mix(ch_reports.collect())
-    MULTIQC (ch_multiqc_files.collect())
-    multiqc_report = MULTIQC.out.report.toList()
+//
+// MODULE: Generate QC reports using MULTIQC
+//
+// Always run MultiQC after quant and, conditionally, SNP calling
+ch_multiqc_files = Channel
+                        .empty()
+                        .mix(ch_reports.collect())
+MULTIQC (ch_multiqc_files.collect())
+multiqc_report = MULTIQC.out.report.toList()
 }
diff --git a/bulk_RNASeq/config/base.config b/bulk_RNASeq/config/base.config
@@ -10,6 +10,7 @@ profiles {
         process.executor = 'slurm'
 	    executor.queueSize = 60
         process.cache = 'lenient'
+	   process.scratch = true
 	    trace.enabled = true
         trace.taskMemory = true
         withLabel: 'per_sample' {
@@ -23,6 +24,7 @@ profiles {
         process.cache = 'lenient'
         process.executor = 'sge'
         process.penv = 'smp'
+	   process.scratch = true
         clusterOptions = '-S /bin/bash'
         withLabel: 'per_sample' {
             errorStrategy = 'finish'

diff --git a/bulk_RNASeq/config/parameters.config b/bulk_RNASeq/config/parameters.config
@@ -11,9 +11,10 @@ params {
     gatk_vf_qd_filter   = 1.0
     umitools_dedup_stats= false
     filter_rrna         = true
+    call_snps           = false
     format_contigs      = false
-    adapter_sequence_1  = "CTGTCTCTTATACACATCT"
-    adapter_sequence_2  = "CTGTCTCTTATACACATCT"
+    adapter_sequence_1  = ""
+    adapter_sequence_2  = ""
 
     // STAR custom arguments for SNP calling sensitivity
     star_outfilter_mismatch_n_over_lmax   = 0.07

diff --git a/bulk_RNASeq/config/parameters.yaml b/bulk_RNASeq/config/parameters.yaml
@@ -0,0 +1,35 @@
+# Pipeline-wide parameters (analogous to Nextflow parameters.config)
+salmon_quant_libtype: null
+fragment_length_mean: 200
+fragment_length_std: 20
+gtf_extra_attributes: "gene_name"
+gtf_group_features: "gene_id"
+umitools_dedup_stats: false
+filter_rrna: true
+format_contigs: false
+# NOTE(review): parameters.config now defaults both adapters to ""; confirm these values are intended.
+adapter_sequence_1: "CTGTCTCTTATACACATCT"
+adapter_sequence_2: "CTGTCTCTTATACACATCT"
+
+# STAR sensitivity
+star_outfilter_mismatch_n_over_lmax: 0.07
+star_align_sjoverhang_min: 8
+star_outfilter_multimap_nmax: 50
+star_seed_search_start_lmax: 30
+star_additional: "--outSAMattributes NH HI AS nM XS"
+
+# GATK HaplotypeCaller RNA-seq params
+gatk_dont_use_soft_clipped_bases: true
+gatk_standard_min_confidence: 10
+gatk_min_pruning: 1
+gatk_recover_all_dangling_branches: true
+gatk_allow_nonunique_kmer: true
+gatk_max_mnp_distance: 0
+
+# VariantFiltration thresholds (defined once here; YAML mapping keys must be unique)
+gatk_vf_cluster_size: 3
+gatk_vf_window_size: 35
+gatk_vf_fs_filter: 60.0
+gatk_vf_qd_filter: 1.0
+
+emit_unfiltered_vcf: true
diff --git a/bulk_RNASeq/config/references/hg38_p13.yaml b/bulk_RNASeq/config/references/hg38_p13.yaml
@@ -0,0 +1,14 @@
+# Reference set for hg38 p13 / GRCh38.p13, NCBI RefSeq (matches hg38_p13_references.config)
+reference_directory: "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq"
+genome:              "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39_GRCh38.p13_genomic.fna"
+genome_idx:          "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39_GRCh38.p13_genomic.fna.fai"
+genome_dict:         "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39_GRCh38.p13_genomic.dict"
+genome_dir:          "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/star_index"
+gtf:                 "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39_GRCh38.p13_genomic.gtf"
+transcript_fasta:    "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/rsem_genome.transcripts.fa"
+transcript_index:    "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/kallisto_index"
+dbsnp:               "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39.gz"
+dbsnp_tbi:           "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39.gz.tbi"
+gene_mapper:         "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/GCF_000001405.39.gz.tbi"  # NOTE(review): identical to dbsnp_tbi above -- looks like a copy-paste placeholder; confirm the intended gene-mapper file
+contig_format_map:   "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/hg38.p13.chromAlias.txt"
+rrna_db_file:        "/krummellab/data1/DSCoLab/references/human/hg38_p13/ncbi_refseq/rrna_db.txt"
diff --git a/bulk_RNASeq/config/references/hg38_p14.yaml b/bulk_RNASeq/config/references/hg38_p14.yaml
@@ -0,0 +1,14 @@
+# Example alternative reference set: hg38 p14 / GRCh38.p14, NCBI RefSeq (matches references.config)
+reference_directory: "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq"
+genome:              "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40_GRCh38.p14_genomic.fna"
+genome_idx:          "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40_GRCh38.p14_genomic.fna.fai"
+genome_dict:         "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40_GRCh38.p14_genomic.dict"
+genome_dir:          "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/star_index"
+gtf:                 "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/genomic.gtf"
+transcript_fasta:    "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/rsem_genome.transcripts.fa"
+transcript_index:    "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/kallisto_index"
+dbsnp:               "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40.gz"
+dbsnp_tbi:           "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40.gz.tbi"
+gene_mapper:         "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/GCF_000001405.40.gz.tbi"  # NOTE(review): identical to dbsnp_tbi above -- looks like a copy-paste placeholder; confirm the intended gene-mapper file
+contig_format_map:   "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/chromAlias.txt"
+rrna_db_file:        "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/rrna_db.txt"
diff --git a/bulk_RNASeq/config/snakemake_config.yaml b/bulk_RNASeq/config/snakemake_config.yaml
@@ -0,0 +1,22 @@
+# -------------------------------------------------------------------------
+# Configuration for the Snakemake "light" RNA-seq pipeline
+# -------------------------------------------------------------------------
+# All paths are absolute so that Snakemake sees the same files inside/outside
+# the Singularity container (the host directories are bind-mounted).
+# -------------------------------------------------------------------------
+
+input_sample_sheet: "/krummellab/data1/alaa/data/tests/bulk_rnaseq/pipeline_tests/merlin_round2_test4_snakemake/sample_sheet.csv"  # NOTE(review): user-specific test path -- override per run
+results_directory : "/krummellab/data1/alaa/data/tests/bulk_rnaseq/pipeline_tests/merlin_round2_test4_snakemake"
+
+# Kallisto index (same as Nextflow 'params.transcript_index')
+transcript_index  : "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/kallisto_index"  # NOTE(review): pinned to the hg38_p14 reference directory
+
+# rRNA database list file (one FASTA path per line)
+rrna_db_file      : "/krummellab/data1/DSCoLab/references/human/hg38_p14/ncbi_refseq/rrna_db.txt"
+
+# ---- optional parameters -------------------------------------------------
+filter_rrna           : true
+fragment_length_mean  : 200
+fragment_length_std   : 20
+adapter_sequence_1    : ""  # empty string; NOTE(review): presumably means "no explicit adapter" -- confirm how the Snakefile treats ""
+adapter_sequence_2    : ""
diff --git a/bulk_RNASeq/config/user.yaml b/bulk_RNASeq/config/user.yaml
@@ -0,0 +1,6 @@
+# User-level settings (override as needed)
+reference_profile: "hg38_p13"  # NOTE(review): presumably selects config/references/<name>.yaml -- confirm against the Snakefile
+
+tmp_dir: "/c4/scratch/${USER}"  # NOTE(review): YAML does not expand ${USER}; confirm the consumer expands environment variables
+results_directory: "/krummellab/data1/alaa/data/tests/bulk_rnaseq/pipeline_tests/emily_mini_dataset7_p13"
+input_sample_sheet: "/krummellab/data1/alaa/data/tests/bulk_rnaseq/pipeline_tests/emily_mini_dataset7_p13/sample_sheet.csv"
diff --git a/bulk_RNASeq/modules/bcftools_sort_vcf.nf b/bulk_RNASeq/modules/bcftools_sort_vcf.nf
@@ -8,6 +8,7 @@ process BCFTOOLS_SORT_VCF {
         fileSize = vcf.size() / (1024 * 1024 * 1024)
         return 1.GB + (1.GB * fileSize * 0.01)
     }
+    containerOptions "-B /scratch/"
 
     input:
     tuple val(meta), path(vcf)
@@ -24,7 +25,7 @@ process BCFTOOLS_SORT_VCF {
     """
     bcftools sort \\
             --output ${prefix}.sorted.vcf.gz -Oz \\
-            --temp-dir \$PWD \\
+            --temp-dir \$TMPDIR/ \\
             $vcf
     """
 }
diff --git a/bulk_RNASeq/modules/fastp_trim_adapters.nf b/bulk_RNASeq/modules/fastp_trim_adapters.nf
@@ -10,11 +10,13 @@ process FASTP_TRIM_ADAPTERS {
           // File size in GB
           fileSize = reads[0].size() / (1024 * 1024 * 1024)
         }
-	if (fileSize > 5){
-	   fileSize = 5
-	}
+        if (fileSize > 5){
+          fileSize = 5
+        }
         return 10.GB * (1 + (fileSize * 2))
     }
+    publishDir "${params.results_directory}/trimmed_reads", mode: 'copy'
+    containerOptions "-B /scratch/"
 
     input:
     tuple val(meta), path(reads)

diff --git a/bulk_RNASeq/modules/gatk4_apply_bqsr.nf b/bulk_RNASeq/modules/gatk4_apply_bqsr.nf
@@ -8,6 +8,8 @@ process GATK4_APPLY_BQSR {
         return 1.GB + (2.GB * fileSize * 0.1)
     }
 
+    containerOptions "-B /scratch/"
+
     input:
     tuple val(meta), path(input), path(input_index), path(bqsr_table)
     path  genome
@@ -30,7 +32,7 @@ process GATK4_APPLY_BQSR {
         --output ${prefix}_bqsr.bam \\
         --reference $genome \\
         --bqsr-recal-file $bqsr_table \\
-        --tmp-dir \$PWD \\
+        --tmp-dir \$TMPDIR \\
         $args
     """
 }
diff --git a/bulk_RNASeq/modules/gatk4_haplotype_caller.nf b/bulk_RNASeq/modules/gatk4_haplotype_caller.nf
@@ -9,6 +9,9 @@ process GATK4_HAPLOTYPECALLER {
         return 17.GB + (1.GB * fileSize * 3)
     }
 
+    containerOptions "-B /scratch/"
+
+
     input:
     tuple val(meta), path(input), path(input_index)
     path  fasta
@@ -41,7 +44,7 @@ process GATK4_HAPLOTYPECALLER {
         --output ${prefix}.vcf.gz \\
         $reference_command \\
         $dbsnp_command \\
-        --tmp-dir \$PWD \\
+        --tmp-dir \$TMPDIR \\
         $soft_clipped \\
         $min_conf \\
         $min_pruning \\