Skip to content

Commit 1dead19

Browse files
cjfieldsclaude
andcommitted
Migrate inline R code from Nextflow modules to external bin/ scripts
Extract all inline R code from 35 modules/local/*.nf files into standalone R scripts under bin/. Each script uses optparse for parameter handling where Nextflow variables were previously interpolated directly. Modules updated to call the external scripts with appropriate CLI flags. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 5b5fe27 commit 1dead19

70 files changed

Lines changed: 1885 additions & 1353 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

bin/assign_taxa_species.R

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#!/usr/bin/env Rscript
2+
suppressPackageStartupMessages(library(dada2))
3+
suppressPackageStartupMessages(library(optparse))
4+
5+
option_list <- list(
6+
make_option("--seqtab", type = "character", help = "Input readmap RDS (with id and seq columns)"),
7+
make_option("--ref", type = "character", help = "Reference FASTA for assignTaxonomy"),
8+
make_option("--species_ref", type = "character", default = "null",
9+
help = "Reference FASTA for addSpecies(); omit or 'null' to skip species assignment"),
10+
make_option("--tax_batch", type = "integer", default = 0,
11+
help = "Batch size for taxonomy assignment (0 = no batching) [default %default]"),
12+
make_option("--min_boot", type = "integer", default = 50,
13+
help = "Minimum bootstrap confidence for taxonomy assignment [default %default]"),
14+
make_option("--ncpus", type = "integer", default = 1,
15+
help = "Number of processors to use [default %default]")
16+
)
17+
18+
opt <- parse_args(OptionParser(option_list = option_list))
19+
for (arg in c("seqtab", "ref")) {
20+
if (is.null(opt[[arg]])) stop(paste("--", arg, " is required", sep = ""))
21+
}
22+
23+
runSpecies <- !is.null(opt$species_ref) && opt$species_ref != "null"
24+
25+
seqs <- readRDS(opt$seqtab)
26+
seqtab <- seqs$seq
27+
28+
# Assign taxonomy
29+
tax <- NULL
30+
boots <- NULL
31+
32+
if (opt$tax_batch == 0 | length(seqtab) < opt$tax_batch) { # no batch, run normally
33+
cat("Running all samples\n")
34+
tax <- assignTaxonomy(seqtab, opt$ref,
35+
multithread = opt$ncpus,
36+
tryRC = TRUE,
37+
outputBootstraps = TRUE,
38+
minBoot = opt$min_boot,
39+
verbose = TRUE)
40+
boots <- tax$boot
41+
if (runSpecies) {
42+
tax <- addSpecies(tax$tax, opt$species_ref, tryRC = TRUE, verbose = TRUE)
43+
} else {
44+
tax <- tax$tax
45+
}
46+
} else {
47+
# see https://github.com/benjjneb/dada2/issues/1429 for this
48+
to_split <- seq(1, length(seqtab), by = opt$tax_batch)
49+
to_split2 <- c(to_split[2:length(to_split)] - 1, length(seqtab))
50+
51+
for (i in 1:length(to_split)) {
52+
cat(paste("Running all samples from", to_split[i], "to", to_split2[i], "\n"))
53+
seqtab2 <- seqtab[to_split[i]:to_split2[i]]
54+
tax2 <- assignTaxonomy(seqtab2, opt$ref,
55+
multithread = opt$ncpus,
56+
tryRC = TRUE,
57+
outputBootstraps = TRUE,
58+
minBoot = opt$min_boot,
59+
verbose = TRUE)
60+
61+
if (is.null(boots)) {
62+
boots <- tax2$boot
63+
} else {
64+
boots <- rbind(boots, tax2$boot)
65+
}
66+
67+
if (runSpecies) {
68+
tax2 <- addSpecies(tax2$tax,
69+
refFasta = opt$species_ref,
70+
tryRC = TRUE,
71+
verbose = TRUE)
72+
} else {
73+
tax2 <- tax2$tax
74+
}
75+
if (is.null(tax)) {
76+
tax <- tax2
77+
} else {
78+
tax <- rbind(tax, tax2)
79+
}
80+
}
81+
}
82+
83+
# make sure these are the same order
84+
rownames(tax) <- seqs[rownames(tax), ]$id
85+
rownames(boots) <- seqs[rownames(boots), ]$id
86+
87+
# Write original data
88+
saveRDS(tax, "taxtab.original.RDS")
89+
saveRDS(boots, "bootstraps.original.RDS")

bin/check_fastq_qualities.R

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env Rscript
2+
suppressPackageStartupMessages(library(ShortRead))
3+
suppressPackageStartupMessages(library(tidyverse))
4+
suppressPackageStartupMessages(library(optparse))
5+
6+
option_list <- list(
7+
make_option("--fwd", type = "character", help = "Forward (R1) FASTQ file to sample"),
8+
make_option("--sample_id", type = "character", help = "Sample ID (used for output naming)"),
9+
make_option("--n_reads", type = "integer", default = 1000000,
10+
help = "Number of reads to sample [default %default]")
11+
)
12+
13+
opt <- parse_args(OptionParser(option_list = option_list))
14+
for (arg in c("fwd", "sample_id")) {
15+
if (is.null(opt[[arg]])) stop(paste("--", arg, " is required", sep = ""))
16+
}
17+
18+
# Read just R1
19+
l <- FastqSampler(opt$fwd,
20+
n = opt$n_reads,
21+
readerBlockSize = 1e4)
22+
23+
fq <- yield(l)
24+
qual_matrix <- as(quality(fq), "matrix")
25+
qual_df <- as.data.frame(qual_matrix)
26+
qual_df$ReadID <- rownames(qual_df)
27+
qual_long <- qual_df %>%
28+
pivot_longer(
29+
cols = -ReadID,
30+
names_to = "BasePosition",
31+
values_to = "QualityScore"
32+
) %>%
33+
mutate(BasePosition = as.integer(gsub("V", "", BasePosition)))
34+
35+
binned_quals <- factor(qual_long$QualityScore) %>%
36+
levels() %>%
37+
as.integer()
38+
39+
# this assumes there are 10 or fewer bins
40+
stopifnot(length(binned_quals) <= 10)
41+
42+
saveRDS(binned_quals, "quality_bins.RDS")

bin/dada2_biom.R

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/usr/bin/env Rscript
2+
suppressPackageStartupMessages(library(biomformat))
3+
4+
args <- commandArgs(trailingOnly = TRUE)
5+
if (length(args) < 2) stop("Usage: dada2_biom.R <seqtab.RDS> <taxtab.RDS>")
6+
7+
seqtab <- readRDS(args[1])
8+
taxtab <- readRDS(args[2])
9+
packageVersion("biomformat")
10+
st.biom <- make_biom(t(seqtab), observation_metadata = taxtab)
11+
write_biom(st.biom, "final.biom")

bin/dada2_derep_seqs.R

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env Rscript
2+
suppressPackageStartupMessages(library(dada2))
3+
suppressPackageStartupMessages(library(optparse))
4+
5+
option_list <- list(
6+
make_option("--fwd", type = "character", help = "Forward (R1) filtered FASTQ file"),
7+
make_option("--rev", type = "character", default = "null",
8+
help = "Reverse (R2) filtered FASTQ file; omit or pass 'null' for single-end"),
9+
make_option("--sample_id", type = "character", help = "Sample ID used for output file naming"),
10+
make_option("--maxrecords", type = "integer", default = 100000,
11+
help = "Max records to read per derep call [default %default]")
12+
)
13+
14+
opt <- parse_args(OptionParser(option_list = option_list))
15+
for (arg in c("fwd", "sample_id")) {
16+
if (is.null(opt[[arg]])) stop(paste("--", arg, " is required", sep = ""))
17+
}
18+
19+
derepsF <- derepFastq(opt$fwd, n = opt$maxrecords, verbose = TRUE)
20+
derepsF$file <- basename(opt$fwd)
21+
saveRDS(derepsF, paste0(opt$sample_id, ".R1.derep.RDS"))
22+
23+
if (!is.null(opt$rev) && opt$rev != "null") {
24+
derepsR <- derepFastq(opt$rev, n = opt$maxrecords, verbose = TRUE)
25+
derepsR$file <- basename(opt$rev)
26+
saveRDS(derepsR, paste0(opt$sample_id, ".R2.derep.RDS"))
27+
}

bin/dada2_pooled_infer.R

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/bin/env Rscript
2+
suppressPackageStartupMessages(library(dada2))
3+
suppressPackageStartupMessages(library(tidyverse))
4+
suppressPackageStartupMessages(library(optparse))
5+
6+
option_list <- list(
7+
make_option("--readmode", type = "character", help = "Read mode label (e.g. R1, R2)"),
8+
make_option("--err", type = "character", help = "Error model RDS file"),
9+
make_option("--pool", type = "character", default = "FALSE",
10+
help = "Pooling mode: TRUE, FALSE, or pseudo [default %default]"),
11+
make_option("--dada_opts", type = "character", default = "",
12+
help = "Additional dada options as a key=value string passed to setDadaOpt()"),
13+
make_option("--platform", type = "character", default = "illumina",
14+
help = "Sequencing platform: illumina or pacbio [default %default]"),
15+
make_option("--ncpus", type = "integer", default = 1,
16+
help = "Number of processors to use [default %default]")
17+
)
18+
19+
opt <- parse_args(OptionParser(option_list = option_list))
20+
for (arg in c("readmode", "err")) {
21+
if (is.null(opt[[arg]])) stop(paste("--", arg, " is required", sep = ""))
22+
}
23+
24+
if (nzchar(opt$dada_opts)) {
25+
eval(parse(text = paste0("setDadaOpt(", opt$dada_opts, ")")))
26+
cat("dada Options:\n", opt$dada_opts, "\n")
27+
}
28+
29+
getN <- function(x) sum(getUniques(x))
30+
31+
set.seed(100)
32+
33+
cat("Processing all samples\n")
34+
35+
err <- readRDS(opt$err)
36+
37+
# 'pool' is a weird flag: either 'pseudo' (string), or T/F (bool)
38+
pool <- opt$pool
39+
if (pool != "pseudo") {
40+
pool <- as.logical(pool)
41+
}
42+
43+
# Determine trim pattern from readmode (R1 -> _1, R2 -> _2, else no suffix)
44+
trimmode <- sub("R", "", opt$readmode) # "1" or "2"
45+
filts <- list.files('.', pattern = paste0("(_", trimmode, ")?.trim.fastq.gz"))
46+
names(filts) <- gsub(paste0("(_", trimmode, ")?.trim.fastq.gz"), "", filts)
47+
48+
cat(paste0("Denoising ", opt$readmode, " reads: pool:", pool, "\n"))
49+
50+
dada_args <- list(filts, err = err, multithread = opt$ncpus, pool = pool)
51+
52+
if (opt$platform == "pacbio") {
53+
dada_args$BAND_SIZE <- 32L
54+
}
55+
56+
dds <- do.call(dada, dada_args)
57+
58+
saveRDS(dds, paste0("all.dd.", opt$readmode, ".RDS"))
59+
60+
tracking_dds <- as.data.frame(sapply(dds, getN))
61+
colnames(tracking_dds) <- c(paste0("dada2.denoised.pooled.", opt$readmode))
62+
tracking_dds <- tracking_dds %>%
63+
as_tibble() %>%
64+
mutate(SampleID = rownames(tracking_dds), .before = 1)
65+
write_csv(tracking_dds, paste0("dada2.denoised.pooled.", opt$readmode, ".csv"))

0 commit comments

Comments
 (0)