HPCBio
diff --git a/‎bin/assign_taxa_species.R‎
Lines changed: 89 additions & 0 deletions b/‎bin/assign_taxa_species.R‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎bin/check_fastq_qualities.R‎
Lines changed: 42 additions & 0 deletions b/‎bin/check_fastq_qualities.R‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎bin/dada2_biom.R‎
Lines changed: 11 additions & 0 deletions b/‎bin/dada2_biom.R‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎bin/dada2_derep_seqs.R‎
Lines changed: 27 additions & 0 deletions b/‎bin/dada2_derep_seqs.R‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎bin/dada2_pooled_infer.R‎
Lines changed: 65 additions & 0 deletions b/‎bin/dada2_pooled_infer.R‎
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,89 @@
+#!/usr/bin/env Rscript
+# Assign taxonomy (and optionally species) with dada2 to the sequences of a
+# readmap RDS, either in one call or in batches of --tax_batch sequences.
+#
+# Outputs (written to the working directory):
+#   taxtab.original.RDS     - taxonomy table, row names set to readmap ids
+#   bootstraps.original.RDS - bootstrap table, row names set to readmap ids
+suppressPackageStartupMessages(library(dada2))
+suppressPackageStartupMessages(library(optparse))
+
+option_list <- list(
+    make_option("--seqtab", type = "character", help = "Input readmap RDS (with id and seq columns)"),
+    make_option("--ref", type = "character", help = "Reference FASTA for assignTaxonomy"),
+    make_option("--species_ref", type = "character", default = "null",
+        help = "Reference FASTA for addSpecies(); omit or 'null' to skip species assignment"),
+    make_option("--tax_batch", type = "integer", default = 0,
+        help = "Batch size for taxonomy assignment (0 = no batching) [default %default]"),
+    make_option("--min_boot", type = "integer", default = 50,
+        help = "Minimum bootstrap confidence for taxonomy assignment [default %default]"),
+    make_option("--ncpus", type = "integer", default = 1,
+        help = "Number of processors to use [default %default]")
+)
+
+opt <- parse_args(OptionParser(option_list = option_list))
+for (arg in c("seqtab", "ref")) {
+    if (is.null(opt[[arg]])) stop(paste0("--", arg, " is required"))
+}
+# Reject bad batch sizes up front; seq() below would otherwise fail obscurely.
+if (is.na(opt$tax_batch) || opt$tax_batch < 0) {
+    stop("--tax_batch must be a non-negative integer")
+}
+
+# Species assignment is skipped when --species_ref is omitted or 'null'.
+runSpecies <- !is.null(opt$species_ref) && opt$species_ref != "null"
+
+# The readmap supplies the sequences (seq) and their identifiers (id).
+seqs <- readRDS(opt$seqtab)
+seqtab <- seqs$seq
+
+# Run assignTaxonomy() (plus addSpecies() when requested) on one set of
+# sequences.
+# Returns list(tax = <taxonomy table>, boot = <bootstrap table>).
+assign_chunk <- function(chunk) {
+    res <- assignTaxonomy(chunk, opt$ref,
+        multithread = opt$ncpus,
+        tryRC = TRUE,
+        outputBootstraps = TRUE,
+        minBoot = opt$min_boot,
+        verbose = TRUE)
+    tax <- res$tax
+    if (runSpecies) {
+        tax <- addSpecies(tax, refFasta = opt$species_ref, tryRC = TRUE, verbose = TRUE)
+    }
+    list(tax = tax, boot = res$boot)
+}
+
+# Assign taxonomy
+n_seqs <- length(seqtab)
+# "<=" so that an input of exactly one batch is also run in a single call.
+if (opt$tax_batch == 0 || n_seqs <= opt$tax_batch) { # no batch, run normally
+    cat("Running all samples\n")
+    chunks <- list(assign_chunk(seqtab))
+} else {
+    # see https://github.com/benjjneb/dada2/issues/1429 for this
+    # Batch i covers starts[i]..ends[i]; pmin() clamps the last batch to n_seqs
+    # instead of deriving the ends by shifting the starts.
+    starts <- seq(1, n_seqs, by = opt$tax_batch)
+    ends <- pmin(starts + opt$tax_batch - 1, n_seqs)
+    chunks <- lapply(seq_along(starts), function(i) {
+        cat(paste("Running all samples from", starts[i], "to", ends[i], "\n"))
+        assign_chunk(seqtab[starts[i]:ends[i]])
+    })
+}
+
+# Stack the per-batch tables once, in batch order, rather than growing them
+# inside the loop.
+tax <- do.call(rbind, lapply(chunks, `[[`, "tax"))
+boots <- do.call(rbind, lapply(chunks, `[[`, "boot"))
+
+# make sure these are the same order
+# NOTE(review): seqs is indexed by the row names of the result tables, which
+# presumably are the sequences; this only works if seqs uses the same row
+# names -- confirm against the step that writes the readmap.
+rownames(tax) <- seqs[rownames(tax), ]$id
+rownames(boots) <- seqs[rownames(boots), ]$id
+
+# Write original data
+saveRDS(tax, "taxtab.original.RDS")
+saveRDS(boots, "bootstraps.original.RDS")
@@ -0,0 +1,42 @@
+#!/usr/bin/env Rscript
+# Sample up to --n_reads reads from a forward FASTQ file and save the distinct
+# quality scores they contain to quality_bins.RDS.
+suppressPackageStartupMessages(library(ShortRead))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(optparse))
+
+option_list <- list(
+    make_option("--fwd", type = "character", help = "Forward (R1) FASTQ file to sample"),
+    make_option("--sample_id", type = "character", help = "Sample ID (used for output naming)"),
+    make_option("--n_reads", type = "integer", default = 1000000,
+        help = "Number of reads to sample [default %default]")
+)
+
+opt <- parse_args(OptionParser(option_list = option_list))
+for (arg in c("fwd", "sample_id")) {
+    if (is.null(opt[[arg]])) stop(paste0("--", arg, " is required"))
+}
+
+# Read just R1
+sampler <- FastqSampler(opt$fwd,
+    n = opt$n_reads,
+    readerBlockSize = 1e4)
+# Close the sampler once the reads are in memory, even if yield() fails.
+fq <- tryCatch(yield(sampler), finally = close(sampler))
+
+# Rows are reads, columns are base positions.
+qual_matrix <- as(quality(fq), "matrix")
+
+# Only the set of distinct scores is needed, so take it straight from the
+# matrix instead of reshaping every (read, position) cell into a long table.
+# sort() orders the scores ascending and drops any NA entries.
+binned_quals <- qual_matrix %>%
+    as.vector() %>%
+    unique() %>%
+    sort() %>%
+    as.integer()
+
+# this assumes there are 10 or fewer bins
+stopifnot(length(binned_quals) <= 10)
+
+saveRDS(binned_quals, "quality_bins.RDS")
@@ -0,0 +1,11 @@
+#!/usr/bin/env Rscript
+# Build final.biom from a sequence table RDS and a taxonomy table RDS.
+suppressPackageStartupMessages(library(biomformat))
+
+cli <- commandArgs(trailingOnly = TRUE)
+if (length(cli) < 2) stop("Usage: dada2_biom.R <seqtab.RDS> <taxtab.RDS>")
+counts <- readRDS(cli[[1]])
+taxonomy <- readRDS(cli[[2]])
+packageVersion("biomformat")  # top-level value is auto-printed: logs the version
+# The sequence table is transposed before it is handed to make_biom().
+write_biom(make_biom(t(counts), observation_metadata = taxonomy), "final.biom")
@@ -0,0 +1,27 @@
+#!/usr/bin/env Rscript
+# Dereplicate the filtered reads of one sample and save one derep RDS per read.
+suppressPackageStartupMessages(library(dada2))
+suppressPackageStartupMessages(library(optparse))
+
+cli_spec <- list(
+    make_option("--fwd", type = "character", help = "Forward (R1) filtered FASTQ file"),
+    make_option("--rev", type = "character", default = "null",
+        help = "Reverse (R2) filtered FASTQ file; omit or pass 'null' for single-end"),
+    make_option("--sample_id", type = "character", help = "Sample ID used for output file naming"),
+    make_option("--maxrecords", type = "integer", default = 100000,
+        help = "Max records to read per derep call [default %default]")
+)
+opt <- parse_args(OptionParser(option_list = cli_spec))
+for (required in c("fwd", "sample_id")) {
+    if (is.null(opt[[required]])) stop(paste0("--", required, " is required"))
+}
+
+# Dereplicate one FASTQ, tag the result with the file's base name, and save it
+# as <sample_id>.<read>.derep.RDS in the working directory.
+derep_to_rds <- function(fastq, read) {
+    dereps <- derepFastq(fastq, n = opt$maxrecords, verbose = TRUE)
+    dereps$file <- basename(fastq)
+    saveRDS(dereps, paste0(opt$sample_id, ".", read, ".derep.RDS"))
+}
+derep_to_rds(opt$fwd, "R1")
+if (!(is.null(opt$rev) || opt$rev == "null")) derep_to_rds(opt$rev, "R2")
@@ -0,0 +1,65 @@
+#!/usr/bin/env Rscript
+# Denoise the trimmed FASTQ files of one read mode with dada(); writes
+# all.dd.<readmode>.RDS and dada2.denoised.pooled.<readmode>.csv.
+suppressPackageStartupMessages(library(dada2))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(optparse))
+
+option_list <- list(
+    make_option("--readmode", type = "character", help = "Read mode label (e.g. R1, R2)"),
+    make_option("--err", type = "character", help = "Error model RDS file"),
+    make_option("--pool", type = "character", default = "FALSE",
+        help = "Pooling mode: TRUE, FALSE, or pseudo [default %default]"),
+    make_option("--dada_opts", type = "character", default = "",
+        help = "Additional dada options as a key=value string passed to setDadaOpt()"),
+    make_option("--platform", type = "character", default = "illumina",
+        help = "Sequencing platform: illumina or pacbio [default %default]"),
+    make_option("--ncpus", type = "integer", default = 1,
+        help = "Number of processors to use [default %default]")
+)
+
+opt <- parse_args(OptionParser(option_list = option_list))
+for (arg in c("readmode", "err")) {
+    if (is.null(opt[[arg]])) stop(paste0("--", arg, " is required"))
+}
+
+# SECURITY NOTE(review): --dada_opts is pasted into R source and evaluated, so
+# it can run arbitrary code; only pass values from a trusted configuration.
+if (nzchar(opt$dada_opts)) {
+    eval(parse(text = paste0("setDadaOpt(", opt$dada_opts, ")")))
+    cat("dada Options:\n", opt$dada_opts, "\n")
+}
+
+getN <- function(x) sum(getUniques(x))  # sums the values from getUniques()
+set.seed(100)
+cat("Processing all samples\n")
+err <- readRDS(opt$err)
+
+# 'pool' is a weird flag: either 'pseudo' (string), or TRUE/FALSE (logical)
+pool <- opt$pool
+if (pool != "pseudo") {
+    pool <- as.logical(pool)
+    if (is.na(pool)) stop("--pool must be TRUE, FALSE, or pseudo")
+}
+
+# Determine trim pattern from readmode (R1 -> _1, R2 -> _2, else no suffix)
+# Dots are escaped and the pattern is end-anchored. NOTE(review): the read suffix
+# is optional, so other read modes' trimmed files here also match -- confirm.
+trimmode <- sub("R", "", opt$readmode)  # "1" or "2"
+trim_pattern <- paste0("(_", trimmode, ")?\\.trim\\.fastq\\.gz$")
+filts <- list.files('.', pattern = trim_pattern)
+names(filts) <- sub(trim_pattern, "", filts)
+
+cat(paste0("Denoising ", opt$readmode, " reads: pool:", pool, "\n"))
+dada_args <- list(filts, err = err, multithread = opt$ncpus, pool = pool)
+if (opt$platform == "pacbio") dada_args$BAND_SIZE <- 32L
+dds <- do.call(dada, dada_args)
+saveRDS(dds, paste0("all.dd.", opt$readmode, ".RDS"))
+
+# dada() returns a bare dada object (not a list) for one sample; wrap it.
+dds_list <- if (methods::is(dds, "dada")) setNames(list(dds), names(filts)) else dds
+tracking_dds <- as.data.frame(unlist(lapply(dds_list, getN)))
+colnames(tracking_dds) <- paste0("dada2.denoised.pooled.", opt$readmode)
+tracking_dds <- as_tibble(tracking_dds) %>%
+    mutate(SampleID = rownames(tracking_dds), .before = 1)
+write_csv(tracking_dds, paste0("dada2.denoised.pooled.", opt$readmode, ".csv"))