From bf32099f48ec796c22ffb37fd0b40e749ee4f2da Mon Sep 17 00:00:00 2001 From: lingminhao Date: Wed, 15 Apr 2026 20:10:40 +0800 Subject: [PATCH 01/29] convert quantData into an object storing list of S4 object --- NAMESPACE | 2 ++ R/bambu-assignDist.R | 44 +++++++++++++++++++++----------------------- R/bambu-classes.R | 25 +++++++++++++++++++++++++ R/bambu.R | 22 +++++++++++----------- 4 files changed, 59 insertions(+), 34 deletions(-) create mode 100644 R/bambu-classes.R diff --git a/NAMESPACE b/NAMESPACE index 3adb4e37..e51b129b 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,8 @@ export(writeAnnotationsToGTF) export(trainBambu) export(setNDR) export(compareTranscripts) +exportClass(quantData) +exportMethods("$") importFrom(stats,predict) importFrom(BiocGenerics,basename) importFrom(BiocGenerics,unstrand) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index 750c1e87..a5319290 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -18,32 +18,30 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame data.table() #return non-em counts ColData <- generateColData(readClassList, sampleMetadata, demultiplexed) - quantData <- SummarizedExperiment(assays = SimpleList( - counts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations)), - rowRanges = annotations, - colData = ColData) - colnames(quantData) <- ColData$id - if(sum(metadata(readClassList)$incompatibleCountMatrix)==0){ - metadata(quantData)$incompatibleCounts <- NULL - }else{ - metadata(quantData)$incompatibleCounts <- generateIncompatibleCounts(metadata(readClassList)$incompatibleCountMatrix, annotations) - } - metadata(quantData)$nonuniqueCounts <- generateNonUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations) - metadata(quantData)$readClassDt <- readClassDt - metadata(quantData)$countMatrix <- metadata(readClassList)$countMatrix - metadata(quantData)$incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix - metadata(quantData)$sampleName <- metadata(readClassList)$sampleData$sampleName - if(returnDistTable) - metadata(quantData)$distTable <- metadata(metadata(readClassList)$readClassDist)$distTableOld - - if(trackReads) - metadata(quantData)$readToTranscriptMap <- - generateReadToTranscriptMap(readClassList, + + incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix + incompatibleCounts <- if(sum(incompatibleCountMatrix)==0) NULL else generateIncompatibleCounts(incompatibleCountMatrix, annotations) + + distTable <- if(returnDistTable) metadata(metadata(readClassList)$readClassDist)$distTableOld else NULL + + readToTranscriptMap <- if(trackReads) generateReadToTranscriptMap(readClassList, metadata(readClassList)$readClassDist, - annotations) + annotations) else NULL - return(quantData) + quantData <- new("quantData", + sampleData = data.frame(ColData), + uniqueCounts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations), + readClassDt = readClassDt, + countMatrix = metadata(readClassList)$countMatrix, + incompatibleCountMatrix = incompatibleCountMatrix, + sampleNames = as.character(metadata(readClassList)$sampleData$sampleName), + incompatibleCounts = incompatibleCounts, + nonuniqueCounts = generateNonUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations), + distTable = distTable, + readToTranscriptMap = readToTranscriptMap + ) + return(quantData) } #' Generate unique counts diff --git a/R/bambu-classes.R b/R/bambu-classes.R new file mode 100644 index 00000000..80c66ac7 --- /dev/null +++ b/R/bambu-classes.R @@ -0,0 +1,25 @@ +#' @import methods +#' @importFrom Matrix Matrix +#' @importFrom data.table data.table +#' @importClassesFrom Matrix Matrix +#' @importClassesFrom data.table data.table +#' @export +setClass("quantData", + slots = list( + sampleData = "data.frame", + uniqueCounts = "Matrix", + readClassDt = "data.table", + countMatrix = "Matrix", + incompatibleCountMatrix = "Matrix", + sampleNames = "character", + incompatibleCounts = "ANY", + nonuniqueCounts = "Matrix", + distTable = "ANY", + readToTranscriptMap = "ANY" + ) +) + +#' @export +setMethod("$", signature(x = "quantData"), function(x, name) { + slot(x, name) +}) diff --git a/R/bambu.R b/R/bambu.R index cf61a555..d6bbefbd 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -274,17 +274,17 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, for(i in seq_along(quantData)){ quantData_i <- quantData[[i]] #load in the barcode clustering from file if provided - iter <- seq_len(ncol(metadata(quantData_i)$countMatrix)) # iter is integer + iter <- seq_len(ncol(quantData_i$countMatrix)) # iter is integer if(!is.null(clusters)){ if(class(clusters[[i]])!="CompressedCharacterList"){ # !is.list(clusters) is FALSE for CompressedCharacterList clusterMaps <- NULL - for(j in seq_along(metadata(quantData_i)$sampleNames)){ #load in a file per sample name provided + for(j in seq_along(quantData_i$sampleNames)){ #load in a file per sample name provided clusterMap <- fread(clusters[[j]], header = FALSE, data.table = FALSE) # read.table(clusters[[j]], # sep = ifelse(grepl(".tsv$",clusters[[j]]), "\t", ","), # header = FALSE) - clusterMap[,1] <- paste0(metadata(quantData_i)$sampleNames[j], + clusterMap[,1] <- paste0(quantData_i$sampleNames[j], "_",clusterMap[,1]) clusterMaps <- rbind(clusterMaps, clusterMap) } @@ -299,14 +299,14 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } countsSeCompressed <- bplapply(iter, FUN = function(j){ # previous i changed to j to avoid duplicated assignment #i = iter[i %in% colnames(metadata(quantData_i)$countMatrix)] #bug, after assignment, i become emptyprint(i) - countMatrix <- unname(metadata(quantData_i)$countMatrix[,j]) # same here - incompatibleCountMatrix <- unname(metadata(quantData_i)$incompatibleCountMatrix[,j]) # same here + countMatrix <- unname(quantData_i$countMatrix[,j]) # same here + incompatibleCountMatrix <- unname(quantData_i$incompatibleCountMatrix[,j]) # same here if(!is.null(dim(countMatrix))){ countMatrix <- rowSums(countMatrix) - incompatibleCountMatrix <- rowSums(metadata(quantData_i)$incompatibleCountMatrix[,j]) # same here + incompatibleCountMatrix <- rowSums(quantData_i$incompatibleCountMatrix[,j]) # same here } - return(bambu.quantify(readClassDt = metadata(quantData_i)$readClassDt, countMatrix = countMatrix, - incompatibleCountMatrix = data.table(GENEID.i = as.numeric(rownames(metadata(quantData_i)$incompatibleCountMatrix)), counts = incompatibleCountMatrix), + return(bambu.quantify(readClassDt = quantData_i$readClassDt, countMatrix = countMatrix, + incompatibleCountMatrix = data.table(GENEID.i = as.numeric(rownames(quantData_i$incompatibleCountMatrix)), counts = incompatibleCountMatrix), txid.index = mcols(annotations)$txid, GENEIDs = GENEIDs.i, isoreParameters = isoreParameters, emParameters = emParameters, trackReads = trackReads, verbose = verbose))}, @@ -321,8 +321,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, row.names = names(countsSeCompressed) ) } else{ - ColNames <- c(ColNames, colnames(quantData_i)) - colData.all[[i]] <- data.frame(colData(quantData_i)) + ColNames <- c(ColNames, rownames(quantData_i$sampleData)) + colData.all[[i]] <- data.frame(quantData_i$sampleData) } countsSeCompressed.all <- c(countsSeCompressed.all, countsSeCompressed) } @@ -332,7 +332,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if(returnDistTable){ distTables = list() for(i in seq_along(quantData)){ - distTables[[i]] <- metadata(quantData[[i]])$distTable + distTables[[i]] <- quantData[[i]]$distTable } metadata(countsSe)$distTables <- distTables } From abc7411ee5b9e9851580e90bd6c7226ae69f804e Mon Sep 17 00:00:00 2001 From: lingminhao Date: Wed, 15 Apr 2026 22:34:00 +0800 Subject: [PATCH 02/29] Refactor quantData: remove countMatrix and replace with list columns [columnIds, columnCounts] in readClassDt --- R/bambu-assignDist.R | 55 ++++++++++++------- R/bambu-classes.R | 1 - R/bambu-processReads.R | 21 +++---- ...processReads_utilityConstructReadClasses.R | 47 ++++++++-------- R/bambu-quantify.R | 19 ++++++- R/bambu-quantify_utilityFunctions.R | 14 ++--- R/bambu.R | 17 +++--- 7 files changed, 100 insertions(+), 74 deletions(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index a5319290..d43a6826 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -10,6 +10,13 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame readClassList <- splitReadClassFiles(readClassList) readClassDt <- genEquiRCs(metadata(readClassList)$readClassDist, annotations, verbose) readClassDt$eqClass.match = match(readClassDt$eqClassById,metadata(readClassList)$eqClassById) + + # Add columnIds and columnCounts columns + readClassDt[, `:=`(columnIds = as.list(rep(NA, .N)), columnCounts = as.list(rep(NA, .N)))] + observedIdx <- which(!is.na(readClassDt$eqClass.match)) + readClassDt$columnIds[observedIdx] <- metadata(readClassList)$columnIds[readClassDt$eqClass.match[observedIdx]] + readClassDt$columnCounts[observedIdx] <- metadata(readClassList)$columnCounts[readClassDt$eqClass.match[observedIdx]] + readClassDt <- simplifyNames(readClassDt) readClassDt <- readClassDt %>% group_by(eqClassId, gene_sid) %>% mutate(multi_align = length(unique(txid))>1) %>% @@ -30,13 +37,12 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame quantData <- new("quantData", sampleData = data.frame(ColData), - uniqueCounts = generateUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations), + uniqueCounts = generateUniqueCounts(readClassDt, annotations, nrow(metadata(readClassList)$sampleData)), readClassDt = readClassDt, - countMatrix = metadata(readClassList)$countMatrix, incompatibleCountMatrix = incompatibleCountMatrix, sampleNames = as.character(metadata(readClassList)$sampleData$sampleName), incompatibleCounts = incompatibleCounts, - nonuniqueCounts = generateNonUniqueCounts(readClassDt, metadata(readClassList)$countMatrix, annotations), + nonuniqueCounts = generateNonUniqueCounts(readClassDt, annotations, nrow(metadata(readClassList)$sampleData)), distTable = distTable, readToTranscriptMap = readToTranscriptMap ) @@ -46,22 +52,20 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame #' Generate unique counts #' @noRd -generateUniqueCounts <- function(readClassDt, countMatrix, annotations){ +generateUniqueCounts <- function(readClassDt, annotations, nSamples){ x <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) - uniqueCounts <- countMatrix[x$eqClass.match,] - uniqueCounts.tx <- sparse.model.matrix(~ factor(x$txid) - 1) - uniqueCounts <- t(uniqueCounts.tx) %*% uniqueCounts - rownames(uniqueCounts) <- names(annotations)[match(as.numeric(levels(factor(x$txid))),mcols(annotations)$txid)] - counts <- sparseMatrix(length(annotations), ncol(uniqueCounts), x = 0) - rownames(counts) <- names(annotations) - counts[rownames(uniqueCounts),] <- uniqueCounts - return(counts) - # these three lines appear after return, so it's not used, is this used for debug only? - # counts.total = colSums(countMatrix) + colSums(incompatibleCountMatrix) - # counts.total[counts.total==0] = 1 - # counts.CPM = counts/counts.total * 10^6 - + uniqueCounts <- if(nrow(x) == 0) { + sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nSamples)) + } else { + txids <- mcols(annotations)$txid + i <- rep(match(x$txid, txids), lengths(x$columnIds)) + j <- unlist(x$columnIds) + x_vals <- unlist(x$columnCounts) + sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nSamples)) + } + rownames(uniqueCounts) <- names(annotations) + return(uniqueCounts) } @@ -79,11 +83,22 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ #' Generate non-unique counts #' @noRd -generateNonUniqueCounts <- function(readClassDt, countMatrix, annotations){ +generateNonUniqueCounts <- function(readClassDt, annotations, nSamples){ #fuse multi align RCs by gene x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) x <- x %>% distinct(eqClassId, .keep_all = TRUE) - nonuniqueCounts <- countMatrix[x$eqClass.match,, drop = FALSE] + + if(nrow(x) == 0) { + genes <- levels(factor(unique(mcols(annotations)$GENEID))) + return(sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(genes), nSamples), dimnames = list(genes, NULL))) + } + + # x has columnIds and columnCounts + i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) + j <- unlist(x$columnIds) + x_vals <- unlist(x$columnCounts) + nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) + if(nrow(x)>1 & length(unique(x$gene_sid))>1){ nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts @@ -99,7 +114,7 @@ generateNonUniqueCounts <- function(readClassDt, countMatrix, annotations){ rownames(nonuniqueCounts) <- geneids #create matrix for all annotated genes genes <- levels(factor(unique(mcols(annotations)$GENEID))) - geneMat <- sparseMatrix(length(genes), ncol(nonuniqueCounts), x = 0) + geneMat <- sparseMatrix(length(genes), nSamples, x = 0) rownames(geneMat) <- genes if(!is.null(rownames(nonuniqueCounts))){ geneMat[rownames(nonuniqueCounts),] <- nonuniqueCounts diff --git a/R/bambu-classes.R b/R/bambu-classes.R index 80c66ac7..ae5241b5 100644 --- a/R/bambu-classes.R +++ b/R/bambu-classes.R @@ -9,7 +9,6 @@ setClass("quantData", sampleData = "data.frame", uniqueCounts = "Matrix", readClassDt = "data.table", - countMatrix = "Matrix", incompatibleCountMatrix = "Matrix", sampleNames = "character", incompatibleCounts = "ANY", diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R index dd988584..7c59590f 100644 --- a/R/bambu-processReads.R +++ b/R/bambu-processReads.R @@ -174,9 +174,9 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, mcols(readGrgList)$id <- seq_along(readGrgList) if(!isFALSE(demultiplexed)){ - mcols(readGrgList)$sampleID <- as.numeric(mcols(readGrgList)$CB) + mcols(readGrgList)$columnID <- as.numeric(mcols(readGrgList)$CB) } else { - mcols(readGrgList)$sampleID <- index + mcols(readGrgList)$columnID <- index } # construct read classes for each chromosome seperately @@ -398,10 +398,12 @@ splitReadClassFiles = function(readClassFile){ distTable <- metadata(metadata(readClassFile)$readClassDist)$distTable eqClasses <- distTable %>% group_by(eqClassById) %>% distinct(eqClassById, readCount,GENEID, totalWidth, firstExonWidth, .keep_all = TRUE) - eqClasses$sampleIDs <- rowData(readClassFile)$sampleIDs[match(eqClasses$readClassId, rownames(readClassFile))] + eqClasses$columnIds <- rowData(readClassFile)$columnIds[match(eqClasses$readClassId, rownames(readClassFile))] eqClasses <- eqClasses %>% summarise(nobs = sum(readCount), - sampleIDs = list(unlist(sampleIDs))) - counts.table <- tableFunction(eqClasses$sampleIDs) + columnIds = list(unlist(columnIds))) + counts.table <- tableFunction(eqClasses$columnIds) + metadata(readClassFile)$columnIds <- lapply(counts.table, function(x) as.numeric(names(x))) + metadata(readClassFile)$columnCounts <- lapply(counts.table, function(x) as.numeric(x)) counts <- sparseMatrix( i = rep(seq_along(counts.table), lengths(counts.table)), j = as.numeric(names(unlist(counts.table))), @@ -414,10 +416,10 @@ splitReadClassFiles = function(readClassFile){ dims = c(1, length(metadata(readClassFile)$sampleData$id))) rownames(counts.incompatible) <- "TODO" } else{ - distTable$sampleIDs <- rowData(readClassFile)$sampleIDs[match(distTable$readClassId, rownames(readClassFile))] + distTable$columnIds <- rowData(readClassFile)$columnIds[match(distTable$readClassId, rownames(readClassFile))] distTable <- distTable %>% group_by(GENEID.i) %>% summarise(counts = sum(readCount), - sampleIDs = list(unlist(sampleIDs))) - counts.table <- lapply(distTable$sampleIDs, FUN = function(x){table(x)}) + columnIds = list(unlist(columnIds))) + counts.table <- lapply(distTable$columnIds, FUN = function(x){table(x)}) counts.incompatible <- sparseMatrix( i = rep(seq_along(counts.table), lengths(counts.table)), j = as.numeric(names(unlist(counts.table))), @@ -429,7 +431,6 @@ splitReadClassFiles = function(readClassFile){ colnames(counts) <- metadata(readClassFile)$sampleData$id metadata(readClassFile)$eqClassById <- eqClasses$eqClassById #rownames(counts) = eqClasses$eqClassById - metadata(readClassFile)$countMatrix <- counts metadata(readClassFile)$incompatibleCountMatrix <- counts.incompatible return(readClassFile) } @@ -439,7 +440,7 @@ splitReadClassFiles = function(readClassFile){ #' @importFrom Matrix #' @noRd splitReadClassFilesByRC <- function(readClassFile){ - counts.table <- tableFunction(rowData(readClassFile)$sampleIDs) + counts.table <- tableFunction(rowData(readClassFile)$columnIds) counts <- sparseMatrix( i = rep(seq_along(counts.table), lengths(counts.table)), j = as.numeric(names(unlist(counts.table))), diff --git a/R/bambu-processReads_utilityConstructReadClasses.R b/R/bambu-processReads_utilityConstructReadClasses.R index a9610045..4d50c9a9 100644 --- a/R/bambu-processReads_utilityConstructReadClasses.R +++ b/R/bambu-processReads_utilityConstructReadClasses.R @@ -16,8 +16,8 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions, use.names = FALSE) mcols(reads.singleExon)$id <- mcols(readGrgList[ elementNROWS(readGrgList) == 1])$id - mcols(reads.singleExon)$sampleID <- mcols(readGrgList[ - elementNROWS(readGrgList) == 1])$sampleID + mcols(reads.singleExon)$columnID <- mcols(readGrgList[ + elementNROWS(readGrgList) == 1])$columnID #only keep multi exons reads in readGrgList readGrgList <- readGrgList[elementNROWS(readGrgList) > 1] if (!identical(mcols(readGrgList)$id,unique(mcols(unlisted_junctions)$id))) @@ -100,7 +100,7 @@ constructSplicedReadClasses <- function(uniqueJunctions, unlisted_junctions, readTable <- readTable %>% dplyr::select(chr.rc = chr, strand.rc = strand, startSD = startSD, endSD = endSD, readCount.posStrand = readCount.posStrand, intronStarts, intronEnds, - confidenceType, readCount, readIds, sampleIDs) + confidenceType, readCount, readIds, columnIds) mcols(exonsByReadClass) <- readTable options(scipen = 0) return(exonsByReadClass) @@ -183,7 +183,7 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, strand = readStrand, confidenceType = readConfidence, alignmentStrand = as.character(getStrandFromGrList(readGrgList))=='+', readId = mcols(readGrgList)$id, - sampleID = mcols(readGrgList)$sampleID) + columnID = mcols(readGrgList)$columnID) rm(readRanges, readStrand, unlisted_junctions_start, unlisted_junctions_end, unlisted_junctions_id, readConfidence, intronStartCoordinatesInt, intronEndCoordinatesInt) @@ -194,7 +194,7 @@ createReadTable <- function(unlisted_junctions_start, unlisted_junctions_end, start = nth(x = start, n = ceiling(readCount / 5), order_by = start), end = nth(x = end, n = ceiling(readCount / 1.25), order_by = end), readCount.posStrand = sum(alignmentStrand, na.rm = TRUE), - readIds = list(readId), sampleIDs = list(sampleID), + readIds = list(readId), columnIds = list(columnID), .groups = 'drop') %>% arrange(chr, start, end) %>% mutate(readClassId = paste("rc", row_number(), sep = ".")) @@ -252,11 +252,11 @@ constructUnsplicedReadClasses <- function(reads.singleExon, annotations, # by their minimum read class coordinates #remove duplicate ranges counts = as.data.frame(reads.singleExon) %>% - mutate(id = mcols(reads.singleExon)$id, - sampleID = mcols(reads.singleExon)$sampleID) %>% + mutate(id = mcols(reads.singleExon)$id, + columnID = mcols(reads.singleExon)$columnID) %>% group_by(seqnames,start,end,strand) %>% - mutate(counts=n(), id = list(id), sampleID = list(sampleID)) %>% - ungroup() %>% + mutate(counts=n(), id = list(id), columnID = list(columnID)) %>% + ungroup() %>% as.data.frame() reads.singleExon = GRanges(counts) reads.singleExon = unique(reads.singleExon) @@ -274,7 +274,7 @@ constructUnsplicedReadClasses <- function(reads.singleExon, annotations, exonsByReadClass <- rcUnsplicedAnnotation }else{ referenceExons <- reduce(reads.singleExon, ignore.strand = !stranded) - #(2) reads do not fall within a annotated exon/high confidence read class + #(2) reads do not fall within a annotated exon/high confidence read class # exon are summarised based on the union of overlapping unspliced reads rcUnsplicedReduced <- getUnsplicedReadClassByReference( granges = reads.singleExon, grangesReference = referenceExons, @@ -286,17 +286,17 @@ constructUnsplicedReadClasses <- function(reads.singleExon, annotations, if (verbose) message("Finished create single exon transcript models ", "(read classes) in ", round((end.ptm - start.ptm)[3] / 60, 1), " mins.") return(exonsByReadClass) -} + } -#' reconstruct read classes using unspliced reads that fall -#' within exons from annotations -#' @importFrom GenomicRanges GRanges relist -#' @importFrom dplyr %>% select group_by summarise .groups mutate cut_group_id -#' ungroup distinct -#' @noRd -getUnsplicedReadClassByReference <- function(granges, grangesReference, + #' reconstruct read classes using unspliced reads that fall + #' within exons from annotations + #' @importFrom GenomicRanges GRanges relist + #' @importFrom dplyr %>% select group_by summarise .groups mutate cut_group_id + #' ungroup distinct + #' @noRd + getUnsplicedReadClassByReference <- function(granges, grangesReference, confidenceType = "unspliced", stranded = TRUE) { if (is.null(mcols(granges)$id)) stop("ID column is missing from mcols(granges)") @@ -319,17 +319,17 @@ getUnsplicedReadClassByReference <- function(granges, grangesReference, readEnd = end(granges)[queryHits], counts = mcols(granges)$counts[queryHits], readId = mcols(granges[queryHits])$id, - sampleID = mcols(granges[queryHits])$sampleID) + columnID = mcols(granges[queryHits])$columnID) hitsDF <- hitsDF %>% dplyr::select(chr, start, end, readStart, readEnd, strand, readClassId, alignmentStrand, - counts, readId, sampleID) %>% + counts, readId, columnID) %>% group_by(readClassId) %>% summarise(start = start[1], end = end[1], strand = strand[1], chr = chr[1], readCount = sum(counts), startSD = sd(rep(readStart,counts)), endSD = sd(rep(readEnd,counts)), readCount.posStrand = sum(rep(alignmentStrand,counts)), - readIds = list(unlist(readId)), sampleIDs = list(unlist(sampleID))) %>% + readIds = list(unlist(readId)), columnIds = list(unlist(columnID))) %>% mutate(confidenceType = confidenceType, intronStarts = NA, intronEnds = NA) if(nrow(hitsDF)==0){ @@ -348,11 +348,10 @@ getUnsplicedReadClassByReference <- function(granges, grangesReference, hitsDF <- dplyr::select(hitsDF, chr.rc = chr, strand.rc = strand, intronStarts, intronEnds, confidenceType, readCount, startSD, endSD, - readCount.posStrand, readIds, sampleIDs) + readCount.posStrand, readIds, columnIds) mcols(exByReadClassUnspliced) <- hitsDF return(exByReadClassUnspliced) -} - + } #' initiate the hits dataframe #' @param hitsWithin hitsWithin #' @param grangesReference grangesReference diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index 68b41395..ccc7d4a3 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -2,12 +2,25 @@ #' @inheritParams bambu #' @import data.table #' @noRd -bambu.quantify <- function(readClassDt, countMatrix, incompatibleCountMatrix, txid.index, GENEIDs, emParameters, +bambu.quantify <- function(readClassDt, columnIdx, incompatibleCountMatrix, txid.index, GENEIDs, emParameters, trackReads = FALSE, returnDistTable = FALSE, verbose = FALSE, isoreParameters = setIsoreParameters(NULL)) { start.ptm <- proc.time() - readClassDt$nobs = countMatrix[readClassDt$eqClass.match] - readClassDt$nobs[is.na(readClassDt$nobs)] = 0 + + # Calculate nobs for sample(s) columnIdx + # Use data.table syntax for in-place creation of nobs column for the scope of this function + readClassDt[, nobs := { + ids <- columnIds[[1]] + if (is.null(ids) || length(ids) == 0 || is.na(ids[1])) { + 0L + } else if (length(columnIdx) == 1) { + match_idx <- match(columnIdx, ids) + if (is.na(match_idx)) 0L else columnCounts[[1]][match_idx] + } else { + sum(columnCounts[[1]][ids %in% columnIdx]) + } + }, by = eqClassId] + compatibleCounts <- bambu.quantDT(readClassDt, emParameters = emParameters,verbose = verbose) incompatibleCounts <- incompatibleCountMatrix[data.table(GENEID.i = GENEIDs), on = "GENEID.i"] incompatibleCounts[is.na(counts), counts := 0] diff --git a/R/bambu-quantify_utilityFunctions.R b/R/bambu-quantify_utilityFunctions.R index 936763f1..5122577d 100644 --- a/R/bambu-quantify_utilityFunctions.R +++ b/R/bambu-quantify_utilityFunctions.R @@ -106,10 +106,9 @@ getUniCountPerEquiRC <- function(distTable){ mutate(anyEqual = any(equal)) %>% select(eqClassById, firstExonWidth,totalWidth, readCount,GENEID,anyEqual) %>% #eqClassByIdTemp, distinct() %>% - mutate(nobs = sum(readCount), - rcWidth = ifelse(anyEqual, max(totalWidth), - max(firstExonWidth))) %>% - select(eqClassById,GENEID,nobs,rcWidth) %>% #eqClassByIdTemp, + mutate(rcWidth = ifelse(anyEqual, max(totalWidth), + max(firstExonWidth))) %>% + select(eqClassById,GENEID,rcWidth) %>% #eqClassByIdTemp, ungroup() %>% distinct() return(eqClassCount) @@ -124,11 +123,10 @@ addEmptyRC <- function(eqClassCount, annotations){ eqClassCount <- createEqClassToTxMapping(eqClassCount) eqClassCountJoin <- full_join(eqClassCount, minEquiRC, by = c("eqClassById","GENEID","txid","equal")) eqClassCountJoin[is.na(eqClassCountJoin)] <- 0 - eqClassCount_final <- eqClassCountJoin %>% + eqClassCount_final <- eqClassCountJoin %>% group_by(eqClassById) %>% - mutate(nobs = max(nobs), - rcWidth = max(rcWidth), - minRC = max(minRC)) %>% + mutate(rcWidth = max(rcWidth, na.rm = TRUE), + minRC = max(minRC, na.rm = TRUE)) %>% ungroup() %>% distinct() return(eqClassCount_final) diff --git a/R/bambu.R b/R/bambu.R index d6bbefbd..cf45da6d 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -274,7 +274,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, for(i in seq_along(quantData)){ quantData_i <- quantData[[i]] #load in the barcode clustering from file if provided - iter <- seq_len(ncol(quantData_i$countMatrix)) # iter is integer + iter <- seq_len(nrow(quantData_i$sampleData)) # iter is integer if(!is.null(clusters)){ if(class(clusters[[i]])!="CompressedCharacterList"){ # !is.list(clusters) is FALSE for CompressedCharacterList clusterMaps <- NULL @@ -297,15 +297,16 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, iter <- clusters[[i]] } } - countsSeCompressed <- bplapply(iter, FUN = function(j){ # previous i changed to j to avoid duplicated assignment + countsSeCompressed <- bplapply(iter, FUN = function(columnIdx){ # previous i changed to j to avoid duplicated assignment #i = iter[i %in% colnames(metadata(quantData_i)$countMatrix)] #bug, after assignment, i become emptyprint(i) - countMatrix <- unname(quantData_i$countMatrix[,j]) # same here - incompatibleCountMatrix <- unname(quantData_i$incompatibleCountMatrix[,j]) # same here - if(!is.null(dim(countMatrix))){ - countMatrix <- rowSums(countMatrix) - incompatibleCountMatrix <- rowSums(quantData_i$incompatibleCountMatrix[,j]) # same here + if(is.character(columnIdx)) { + columnIdx <- match(columnIdx, quantData_i$sampleData$id) } - return(bambu.quantify(readClassDt = quantData_i$readClassDt, countMatrix = countMatrix, + incompatibleCountMatrix <- unname(quantData_i$incompatibleCountMatrix[,columnIdx]) # same here + if(!is.null(dim(incompatibleCountMatrix))){ + incompatibleCountMatrix <- rowSums(incompatibleCountMatrix) + } + return(bambu.quantify(readClassDt = quantData_i$readClassDt, columnIdx = columnIdx, incompatibleCountMatrix = data.table(GENEID.i = as.numeric(rownames(quantData_i$incompatibleCountMatrix)), counts = incompatibleCountMatrix), txid.index = mcols(annotations)$txid, GENEIDs = GENEIDs.i, isoreParameters = isoreParameters, emParameters = emParameters, trackReads = trackReads, From ce1259e373158af65cf58c6315e32670f7f22280 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Thu, 16 Apr 2026 09:48:40 +0800 Subject: [PATCH 03/29] remove redundant stored data in quantData --- R/bambu-assignDist.R | 11 ++++------- R/bambu-classes.R | 4 ---- R/bambu-processReads.R | 6 +++--- R/bambu-quantify.R | 4 ++-- R/bambu.R | 13 ++++++------- 5 files changed, 15 insertions(+), 23 deletions(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index d43a6826..5bf5b704 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -27,7 +27,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame ColData <- generateColData(readClassList, sampleMetadata, demultiplexed) incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix - incompatibleCounts <- if(sum(incompatibleCountMatrix)==0) NULL else generateIncompatibleCounts(incompatibleCountMatrix, annotations) + incompatibleCounts <- generateIncompatibleCounts(incompatibleCountMatrix, annotations) distTable <- if(returnDistTable) metadata(metadata(readClassList)$readClassDist)$distTableOld else NULL @@ -37,12 +37,8 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame quantData <- new("quantData", sampleData = data.frame(ColData), - uniqueCounts = generateUniqueCounts(readClassDt, annotations, nrow(metadata(readClassList)$sampleData)), readClassDt = readClassDt, - incompatibleCountMatrix = incompatibleCountMatrix, - sampleNames = as.character(metadata(readClassList)$sampleData$sampleName), incompatibleCounts = incompatibleCounts, - nonuniqueCounts = generateNonUniqueCounts(readClassDt, annotations, nrow(metadata(readClassList)$sampleData)), distTable = distTable, readToTranscriptMap = readToTranscriptMap ) @@ -76,7 +72,8 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ rownames(incompatibleCountMatrix) <- genes[as.numeric(rownames(incompatibleCountMatrix))] geneMat <- sparseMatrix(length(genes), ncol(incompatibleCountMatrix), x = 0) rownames(geneMat) <- genes - geneMat[rownames(incompatibleCountMatrix),] <- incompatibleCountMatrix + colnames(geneMat) <- colnames(incompatibleCountMatrix) + geneMat[match(rownames(incompatibleCountMatrix), rownames(geneMat)), ] <- incompatibleCountMatrix return(geneMat) } @@ -117,7 +114,7 @@ generateNonUniqueCounts <- function(readClassDt, annotations, nSamples){ geneMat <- sparseMatrix(length(genes), nSamples, x = 0) rownames(geneMat) <- genes if(!is.null(rownames(nonuniqueCounts))){ - geneMat[rownames(nonuniqueCounts),] <- nonuniqueCounts + geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts } return(geneMat) } diff --git a/R/bambu-classes.R b/R/bambu-classes.R index ae5241b5..28ac6532 100644 --- a/R/bambu-classes.R +++ b/R/bambu-classes.R @@ -7,12 +7,8 @@ setClass("quantData", slots = list( sampleData = "data.frame", - uniqueCounts = "Matrix", readClassDt = "data.table", - incompatibleCountMatrix = "Matrix", - sampleNames = "character", incompatibleCounts = "ANY", - nonuniqueCounts = "Matrix", distTable = "ANY", readToTranscriptMap = "ANY" ) diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R index 7c59590f..c44604a4 100644 --- a/R/bambu-processReads.R +++ b/R/bambu-processReads.R @@ -412,9 +412,9 @@ splitReadClassFiles = function(readClassFile){ #incompatible counts distTable <- metadata(metadata(readClassFile)$readClassDist)$distTable.incompatible if(nrow(distTable)==0) { - counts.incompatible <- sparseMatrix(i= 1, j = 1, x = 0, - dims = c(1, length(metadata(readClassFile)$sampleData$id))) - rownames(counts.incompatible) <- "TODO" + counts.incompatible <- sparseMatrix(i= integer(0), j = integer(0), x = numeric(0), + dims = c(0, length(metadata(readClassFile)$sampleData$id))) + rownames(counts.incompatible) <- character(0) } else{ distTable$columnIds <- rowData(readClassFile)$columnIds[match(distTable$readClassId, rownames(readClassFile))] distTable <- distTable %>% group_by(GENEID.i) %>% summarise(counts = sum(readCount), diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index ccc7d4a3..a9c28f36 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -2,7 +2,7 @@ #' @inheritParams bambu #' @import data.table #' @noRd -bambu.quantify <- function(readClassDt, columnIdx, incompatibleCountMatrix, txid.index, GENEIDs, emParameters, +bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, txid.index, GENEIDs, emParameters, trackReads = FALSE, returnDistTable = FALSE, verbose = FALSE, isoreParameters = setIsoreParameters(NULL)) { start.ptm <- proc.time() @@ -22,7 +22,7 @@ bambu.quantify <- function(readClassDt, columnIdx, incompatibleCountMatrix, txid }, by = eqClassId] compatibleCounts <- bambu.quantDT(readClassDt, emParameters = emParameters,verbose = verbose) - incompatibleCounts <- incompatibleCountMatrix[data.table(GENEID.i = GENEIDs), on = "GENEID.i"] + incompatibleCounts <- incompatibleCounts[data.table(GENEID.i = GENEIDs), on = "GENEID.i"] incompatibleCounts[is.na(counts), counts := 0] compatibleCounts <- calculateCPM(compatibleCounts, incompatibleCounts) counts <- compatibleCounts[match(txid.index, txid)] diff --git a/R/bambu.R b/R/bambu.R index cf45da6d..09d21015 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -266,7 +266,6 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if(length(annotations)==0) stop("No valid annotations, if running de novo please try less stringent parameters") if(is.null(quantData)) stop("quantData must be provided or assignDist = TRUE") - GENEIDs.i <- as.numeric(factor(unique(mcols(annotations)$GENEID))) start.ptm <- proc.time() countsSeCompressed.all <- NULL ColNames <- c() @@ -302,13 +301,13 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if(is.character(columnIdx)) { columnIdx <- match(columnIdx, quantData_i$sampleData$id) } - incompatibleCountMatrix <- unname(quantData_i$incompatibleCountMatrix[,columnIdx]) # same here - if(!is.null(dim(incompatibleCountMatrix))){ - incompatibleCountMatrix <- rowSums(incompatibleCountMatrix) - } + + incompatibleCounts_i <- quantData_i$incompatibleCounts[, columnIdx, drop = FALSE] + incompatibleCounts_i <- rowSums(incompatibleCounts_i) + return(bambu.quantify(readClassDt = quantData_i$readClassDt, columnIdx = columnIdx, - incompatibleCountMatrix = data.table(GENEID.i = as.numeric(rownames(quantData_i$incompatibleCountMatrix)), counts = incompatibleCountMatrix), - txid.index = mcols(annotations)$txid, GENEIDs = GENEIDs.i, isoreParameters = isoreParameters, + incompatibleCounts = data.table(GENEID.i = names(incompatibleCounts_i), counts = incompatibleCounts_i), + txid.index = mcols(annotations)$txid, GENEIDs = names(incompatibleCounts_i), isoreParameters = isoreParameters, emParameters = emParameters, trackReads = trackReads, verbose = verbose))}, BPPARAM = bpParameters) From 9ce39c1b62409993acfa1b65092067df58021dbf Mon Sep 17 00:00:00 2001 From: lingminhao Date: Thu, 16 Apr 2026 10:10:10 +0800 Subject: [PATCH 04/29] add readToTranscriptMaps to final se when trackReads is TRUE --- R/bambu.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/bambu.R b/R/bambu.R index 09d21015..01a76cf4 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -329,6 +329,13 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, names(countsSeCompressed.all) <- ColNames countsSe <- combineCountSes(countsSeCompressed.all, colData.all, annotations) + if(trackReads){ + readToTranscriptMaps = list() + for(i in seq_along(quantData)){ + readToTranscriptMaps[[i]] <- quantData[[i]]$readToTranscriptMap + } + metadata(countsSe)$readToTranscriptMaps <- readToTranscriptMaps + } if(returnDistTable){ distTables = list() for(i in seq_along(quantData)){ From c7ed6d3f7d885db654f2ee32b5c92a2e02279adb Mon Sep 17 00:00:00 2001 From: lingminhao Date: Thu, 16 Apr 2026 11:07:07 +0800 Subject: [PATCH 05/29] Add gene and unique count matrix helpers from quantData --- R/bambu-assignDist.R | 141 +++++++++++++++++++++++++++---------------- 1 file changed, 89 insertions(+), 52 deletions(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index 5bf5b704..c13a2b3d 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -46,25 +46,6 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame return(quantData) } -#' Generate unique counts -#' @noRd -generateUniqueCounts <- function(readClassDt, annotations, nSamples){ - x <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) - - uniqueCounts <- if(nrow(x) == 0) { - sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nSamples)) - } else { - txids <- mcols(annotations)$txid - i <- rep(match(x$txid, txids), lengths(x$columnIds)) - j <- unlist(x$columnIds) - x_vals <- unlist(x$columnCounts) - sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nSamples)) - } - rownames(uniqueCounts) <- names(annotations) - return(uniqueCounts) -} - - #' Generate incompatible counts #' @noRd generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ @@ -77,44 +58,100 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ return(geneMat) } +#' Generate unique counts +#' @noRd +generateUniqueCountsFromQuantData <- function(quantData, annotations){ + uniqueCountsList <- lapply(quantData, function(x) { + readClassDt <- x$readClassDt + x_filtered <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) + + uniqueCounts <- if(nrow(x_filtered) == 0) { + sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nrow(x$sampleData))) + } else { + txids <- mcols(annotations)$txid + i <- rep(match(x_filtered$txid, txids), lengths(x_filtered$columnIds)) + j <- unlist(x_filtered$columnIds) + x_vals <- unlist(x_filtered$columnCounts) + sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nrow(x$sampleData))) + } + rownames(uniqueCounts) <- names(annotations) + colnames(uniqueCounts) <- rownames(x$sampleData) + return(uniqueCounts) + }) + return(do.call(cbind, uniqueCountsList)) +} #' Generate non-unique counts #' @noRd -generateNonUniqueCounts <- function(readClassDt, annotations, nSamples){ - #fuse multi align RCs by gene - x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) - x <- x %>% distinct(eqClassId, .keep_all = TRUE) - - if(nrow(x) == 0) { - genes <- levels(factor(unique(mcols(annotations)$GENEID))) - return(sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(genes), nSamples), dimnames = list(genes, NULL))) - } +generateNonUniqueCountsFromQuantData <- function(quantData, annotations){ + nonuniqueCountsList <- lapply(quantData, function(x_obj) { + readClassDt <- x_obj$readClassDt + #fuse multi align RCs by gene + x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) + x <- x %>% distinct(eqClassId, .keep_all = TRUE) + + if(nrow(x) == 0) { + genes <- levels(factor(unique(mcols(annotations)$GENEID))) + geneMat <- sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(genes), nrow(x_obj$sampleData)), dimnames = list(genes, NULL)) + colnames(geneMat) <- rownames(x_obj$sampleData) + return(geneMat) + } - # x has columnIds and columnCounts - i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) - j <- unlist(x$columnIds) - x_vals <- unlist(x$columnCounts) - nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) + # x has columnIds and columnCounts + i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) + j <- unlist(x$columnIds) + x_vals <- unlist(x$columnCounts) + nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nrow(x_obj$sampleData))) - if(nrow(x)>1 & length(unique(x$gene_sid))>1){ - nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } else{ - warning("The factor variable 'gene_sid' has only one level. Adjusting output.") - nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + if(nrow(x)>1 & length(unique(x$gene_sid))>1){ + nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } else{ + warning("The factor variable 'gene_sid' has only one level. Adjusting output.") + nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } + #covert ids into gene ids + geneids <- as.numeric(levels(factor(x$gene_sid))) + geneids <- x$txid[match(geneids, x$gene_sid)] + geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] + rownames(nonuniqueCounts) <- geneids + #create matrix for all annotated genes + genes <- levels(factor(unique(mcols(annotations)$GENEID))) + geneMat <- sparseMatrix(length(genes), nrow(x_obj$sampleData), x = 0) + rownames(geneMat) <- genes + if(!is.null(rownames(nonuniqueCounts))){ + geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts + } + colnames(geneMat) <- rownames(x_obj$sampleData) + return(geneMat) + }) + return(do.call(cbind, nonuniqueCountsList)) +} + +#' Generate gene counts directly from quantData +#' @noRd +generateGeneCountsFromQuantData <- function(quantData, annotations){ + # 1. Get unique transcript counts and aggregate them to gene level + uniqueCounts <- generateUniqueCountsFromQuantData(quantData, annotations) + + geneIDs <- factor(mcols(annotations)$GENEID, levels = unique(mcols(annotations)$GENEID)) + geneCounts <- Matrix::fac2sparse(geneIDs) %*% uniqueCounts + + # 2. Get non-unique counts and incompatible counts + nonuniqueCounts <- generateNonUniqueCountsFromQuantData(quantData, annotations) + incompatibleCounts <- do.call(cbind, lapply(quantData, function(x) x$incompatibleCounts)) + + # 3. Align the rows/dimensions and sum them up + if(!is.null(incompatibleCounts)){ + incompatibleCounts <- Matrix(incompatibleCounts[match(rownames(geneCounts), rownames(incompatibleCounts)), ], sparse = TRUE) + geneCounts <- geneCounts + incompatibleCounts } - #covert ids into gene ids - geneids <- as.numeric(levels(factor(x$gene_sid))) - geneids <- x$txid[match(geneids, x$gene_sid)] - geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] - rownames(nonuniqueCounts) <- geneids - #create matrix for all annotated genes - genes <- levels(factor(unique(mcols(annotations)$GENEID))) - geneMat <- sparseMatrix(length(genes), nSamples, x = 0) - rownames(geneMat) <- genes - if(!is.null(rownames(nonuniqueCounts))){ - geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts + + if(!is.null(nonuniqueCounts)){ + nonuniqueCounts <- Matrix(nonuniqueCounts[match(rownames(geneCounts), rownames(nonuniqueCounts)), ], sparse = TRUE) + geneCounts <- geneCounts + nonuniqueCounts } - return(geneMat) + + return(geneCounts) } From ce96f7b2e62beceb2dcb6313805e694da5f20ab8 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Thu, 16 Apr 2026 11:42:52 +0800 Subject: [PATCH 06/29] add colnames and colData to readClassList --- R/bambu-processReads.R | 19 +++++++++++-------- ...processReads_utilityConstructReadClasses.R | 4 ++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R index c44604a4..48fe500d 100644 --- a/R/bambu-processReads.R +++ b/R/bambu-processReads.R @@ -96,7 +96,8 @@ bambu.processReads <- function(reads, annotations, genomeSequence, stranded = stranded, min.readCount = min.readCount, fitReadClassModel = fitReadClassModel, min.exonOverlap = min.exonOverlap, defaultModels = defaultModels, returnModel = returnModel, verbose = verbose, - processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode) + processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode, + runName = "combinedSamples") metadata(readClassList)$samples <- names(reads) metadata(readClassList)$sampleNames <- names(reads) if(!isFALSE(demultiplexed)) metadata(readClassList)$samples <- levels(mcols(readGrgList)$CB) @@ -179,16 +180,17 @@ bambu.processReadsByFile <- function(bam.file, genomeSequence, annotations, mcols(readGrgList)$columnID <- index } + runName <- names(bam.file)[1] # construct read classes for each chromosome seperately if(processByChromosome){ se <- lowMemoryConstructReadClasses(readGrgList, genomeSequence, - annotations, stranded, verbose,bam.file) + annotations, stranded, verbose, bam.file) } else{ unlisted_junctions <- unlistIntrons(readGrgList, use.ids = TRUE) uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions, annotations,genomeSequence, stranded = stranded, verbose = verbose) se <- isore.constructReadClasses(readGrgList, - unlisted_junctions, uniqueJunctions, runName = "TODO", + unlisted_junctions, uniqueJunctions, runName = runName, annotations, stranded, verbose) } @@ -310,18 +312,18 @@ bambu.readsByFile <- function(bam.file, genomeSequence, annotations, constructReadClasses <- function(readGrgList, genomeSequence, annotations, stranded = FALSE, min.readCount = 2, fitReadClassModel = TRUE, min.exonOverlap = 10, defaultModels = NULL, returnModel = FALSE, - verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE){ + verbose = FALSE, processByChromosome = FALSE, trackReads = FALSE, fusionMode = FALSE, runName = "sample"){ if(processByChromosome){ # construct read classes for each chromosome seperately se <- lowMemoryConstructReadClasses(readGrgList, genomeSequence, - annotations, stranded, verbose,"TODO", fusionMode) + annotations, stranded, verbose, runName, fusionMode) } else{ unlisted_junctions <- unlistIntrons(readGrgList, use.ids = TRUE) uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions, annotations,genomeSequence, stranded = stranded, verbose = verbose) se <- isore.constructReadClasses(readGrgList, - unlisted_junctions, uniqueJunctions, runName = "TODO", + unlisted_junctions, uniqueJunctions, runName = runName, annotations, stranded, verbose) } @@ -349,13 +351,14 @@ constructReadClasses <- function(readGrgList, genomeSequence, annotations, #' Low memory mode for construct read classes (processByChromosome) #' @noRd lowMemoryConstructReadClasses <- function(readGrgList, genomeSequence, - annotations, stranded, verbose,bam.file, fusionMode = FALSE){ + annotations, stranded, verbose, bam.file, fusionMode = FALSE){ if(fusionMode){ readGrgList <- list(readGrgList) names(readGrgList) <- c("fusion") } else{ readGrgList <- split(readGrgList, getChrFromGrList(readGrgList)) } + runName <- names(bam.file)[1] se <- lapply(names(readGrgList),FUN = function(i){ if(length(readGrgList[[i]]) == 0) return(NULL) # create error and strand corrected junction tables @@ -363,7 +366,7 @@ lowMemoryConstructReadClasses <- function(readGrgList, genomeSequence, uniqueJunctions <- isore.constructJunctionTables(unlisted_junctions, annotations,genomeSequence, stranded = stranded, verbose = verbose) se.temp <- isore.constructReadClasses(readGrgList[[i]], - unlisted_junctions, uniqueJunctions, runName = "TODO", + unlisted_junctions, uniqueJunctions, runName = runName, annotations, stranded, verbose) return(se.temp) }) diff --git a/R/bambu-processReads_utilityConstructReadClasses.R b/R/bambu-processReads_utilityConstructReadClasses.R index 4d50c9a9..6b05202d 100644 --- a/R/bambu-processReads_utilityConstructReadClasses.R +++ b/R/bambu-processReads_utilityConstructReadClasses.R @@ -9,7 +9,7 @@ #' @inheritParams bambu #' @noRd isore.constructReadClasses <- function(readGrgList, unlisted_junctions, - uniqueJunctions, runName = "sample1", + uniqueJunctions, runName = "sample", annotations, stranded = FALSE, verbose = FALSE) { #split reads into single exon and multi exon reads reads.singleExon <- unlist(readGrgList[elementNROWS(readGrgList) == 1], @@ -45,7 +45,7 @@ isore.constructReadClasses <- function(readGrgList, unlisted_junctions, annotations, exonsByRC.spliced, stranded, verbose) } exonsByRC <- c(exonsByRC.spliced, exonsByRC.unspliced) - colDataDf <- DataFrame(name = runName, row.names = runName) + colDataDf <- DataFrame(sampleName = runName, row.names = runName) counts <- matrix(mcols(exonsByRC)$readCount, dimnames = list(names(exonsByRC), runName)) From ed08825e83a468cf91ded94376c4dec7c4c749c3 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Thu, 16 Apr 2026 11:52:05 +0800 Subject: [PATCH 07/29] name readClassList, quantData, readToTranscriptMaps, and distTables lists by sample names --- R/bambu-processReads.R | 4 ++-- R/bambu.R | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R/bambu-processReads.R b/R/bambu-processReads.R index 48fe500d..0a33b47d 100644 --- a/R/bambu-processReads.R +++ b/R/bambu-processReads.R @@ -64,6 +64,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence, processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode, demultiplexed = demultiplexed, cleanReads = cleanReads, dedupUMI = dedupUMI, index = 1, barcodesToFilter = barcodesToFilter)}, BPPARAM = bpParameters) + names(readClassList) <- names(reads) } else { readGrgList <- bplapply(seq_along(reads), function(i) { bambu.readsByFile(bam.file = reads[i], @@ -96,8 +97,7 @@ bambu.processReads <- function(reads, annotations, genomeSequence, stranded = stranded, min.readCount = min.readCount, fitReadClassModel = fitReadClassModel, min.exonOverlap = min.exonOverlap, defaultModels = defaultModels, returnModel = returnModel, verbose = verbose, - processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode, - runName = "combinedSamples") + processByChromosome = processByChromosome, trackReads = trackReads, fusionMode = fusionMode) metadata(readClassList)$samples <- names(reads) metadata(readClassList)$sampleNames <- names(reads) if(!isFALSE(demultiplexed)) metadata(readClassList)$samples <- levels(mcols(readGrgList)$CB) diff --git a/R/bambu.R b/R/bambu.R index 01a76cf4..d6f1a76e 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -252,6 +252,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, trackReads = trackReads ) }, BPPARAM = bpParameters) + names(quantData) <- names(readClassList) if (!quant) return(quantData) } } @@ -334,6 +335,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, for(i in seq_along(quantData)){ readToTranscriptMaps[[i]] <- quantData[[i]]$readToTranscriptMap } + names(readToTranscriptMaps) <- names(quantData) metadata(countsSe)$readToTranscriptMaps <- readToTranscriptMaps } if(returnDistTable){ @@ -341,6 +343,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, for(i in seq_along(quantData)){ distTables[[i]] <- quantData[[i]]$distTable } + names(distTables) <- names(quantData) metadata(countsSe)$distTables <- distTables } return(countsSe) From 7f56d214718377b2a1b6e410d2e3e38ec8f7e489 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Thu, 16 Apr 2026 22:39:54 +0800 Subject: [PATCH 08/29] add nonuniqueCounts to quantData & se. Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-assignDist.R | 121 +++++++++++++------------------------ R/bambu-classes.R | 1 + R/bambu-quantify.R | 3 +- R/bambu.R | 7 ++- R/bambu_utilityFunctions.R | 5 +- 5 files changed, 53 insertions(+), 84 deletions(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index c13a2b3d..b6137aa5 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -28,17 +28,19 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix incompatibleCounts <- generateIncompatibleCounts(incompatibleCountMatrix, annotations) - + nonuniqueCounts <- generateNonUniqueCountMatrix(readClassDt, annotations, metadata(readClassList)$sampleData$id) + distTable <- if(returnDistTable) metadata(metadata(readClassList)$readClassDist)$distTableOld else NULL - - readToTranscriptMap <- if(trackReads) generateReadToTranscriptMap(readClassList, - metadata(readClassList)$readClassDist, + + readToTranscriptMap <- if(trackReads) generateReadToTranscriptMap(readClassList, + metadata(readClassList)$readClassDist, annotations) else NULL quantData <- new("quantData", sampleData = data.frame(ColData), readClassDt = readClassDt, incompatibleCounts = incompatibleCounts, + nonuniqueCounts = nonuniqueCounts, distTable = distTable, readToTranscriptMap = readToTranscriptMap ) @@ -58,6 +60,42 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ return(geneMat) } +#' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt +#' @noRd +generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ + genes <- levels(factor(unique(mcols(annotations)$GENEID))) + nSamples <- length(sampleIds) + geneMat <- sparseMatrix(length(genes), nSamples, x = 0) + rownames(geneMat) <- genes + colnames(geneMat) <- sampleIds + + x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) + x <- x %>% distinct(eqClassId, .keep_all = TRUE) + if (nrow(x) == 0) return(geneMat) + + i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) + j <- unlist(x$columnIds) + x_vals <- unlist(x$columnCounts) + nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) + + if (nrow(x) > 1 & length(unique(x$gene_sid)) > 1) { + nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } else { + nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } + + geneids <- as.numeric(levels(factor(x$gene_sid))) + geneids <- x$txid[match(geneids, x$gene_sid)] + geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] + rownames(nonuniqueCounts) <- geneids + colnames(nonuniqueCounts) <- sampleIds + + geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts + return(geneMat) +} + #' Generate unique counts #' @noRd generateUniqueCountsFromQuantData <- function(quantData, annotations){ @@ -80,78 +118,3 @@ generateUniqueCountsFromQuantData <- function(quantData, annotations){ }) return(do.call(cbind, uniqueCountsList)) } - -#' Generate non-unique counts -#' @noRd -generateNonUniqueCountsFromQuantData <- function(quantData, annotations){ - nonuniqueCountsList <- lapply(quantData, function(x_obj) { - readClassDt <- x_obj$readClassDt - #fuse multi align RCs by gene - x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) - x <- x %>% distinct(eqClassId, .keep_all = TRUE) - - if(nrow(x) == 0) { - genes <- levels(factor(unique(mcols(annotations)$GENEID))) - geneMat <- sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(genes), nrow(x_obj$sampleData)), dimnames = list(genes, NULL)) - colnames(geneMat) <- rownames(x_obj$sampleData) - return(geneMat) - } - - # x has columnIds and columnCounts - i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) - j <- unlist(x$columnIds) - x_vals <- unlist(x$columnCounts) - nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nrow(x_obj$sampleData))) - - if(nrow(x)>1 & length(unique(x$gene_sid))>1){ - nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } else{ - warning("The factor variable 'gene_sid' has only one level. Adjusting output.") - nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } - #covert ids into gene ids - geneids <- as.numeric(levels(factor(x$gene_sid))) - geneids <- x$txid[match(geneids, x$gene_sid)] - geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] - rownames(nonuniqueCounts) <- geneids - #create matrix for all annotated genes - genes <- levels(factor(unique(mcols(annotations)$GENEID))) - geneMat <- sparseMatrix(length(genes), nrow(x_obj$sampleData), x = 0) - rownames(geneMat) <- genes - if(!is.null(rownames(nonuniqueCounts))){ - geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts - } - colnames(geneMat) <- rownames(x_obj$sampleData) - return(geneMat) - }) - return(do.call(cbind, nonuniqueCountsList)) -} - -#' Generate gene counts directly from quantData -#' @noRd -generateGeneCountsFromQuantData <- function(quantData, annotations){ - # 1. Get unique transcript counts and aggregate them to gene level - uniqueCounts <- generateUniqueCountsFromQuantData(quantData, annotations) - - geneIDs <- factor(mcols(annotations)$GENEID, levels = unique(mcols(annotations)$GENEID)) - geneCounts <- Matrix::fac2sparse(geneIDs) %*% uniqueCounts - - # 2. Get non-unique counts and incompatible counts - nonuniqueCounts <- generateNonUniqueCountsFromQuantData(quantData, annotations) - incompatibleCounts <- do.call(cbind, lapply(quantData, function(x) x$incompatibleCounts)) - - # 3. Align the rows/dimensions and sum them up - if(!is.null(incompatibleCounts)){ - incompatibleCounts <- Matrix(incompatibleCounts[match(rownames(geneCounts), rownames(incompatibleCounts)), ], sparse = TRUE) - geneCounts <- geneCounts + incompatibleCounts - } - - if(!is.null(nonuniqueCounts)){ - nonuniqueCounts <- Matrix(nonuniqueCounts[match(rownames(geneCounts), rownames(nonuniqueCounts)), ], sparse = TRUE) - geneCounts <- geneCounts + nonuniqueCounts - } - - return(geneCounts) -} diff --git a/R/bambu-classes.R b/R/bambu-classes.R index 28ac6532..3825740c 100644 --- a/R/bambu-classes.R +++ b/R/bambu-classes.R @@ -9,6 +9,7 @@ setClass("quantData", sampleData = "data.frame", readClassDt = "data.table", incompatibleCounts = "ANY", + nonuniqueCounts = "ANY", distTable = "ANY", readToTranscriptMap = "ANY" ) diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index a9c28f36..0287020b 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -2,7 +2,7 @@ #' @inheritParams bambu #' @import data.table #' @noRd -bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, txid.index, GENEIDs, emParameters, +bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, nonuniqueCounts, txid.index, GENEIDs, emParameters, trackReads = FALSE, returnDistTable = FALSE, verbose = FALSE, isoreParameters = setIsoreParameters(NULL)) { start.ptm <- proc.time() @@ -28,6 +28,7 @@ bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, txid.inde counts <- compatibleCounts[match(txid.index, txid)] sig.digit <- emParameters[["sig.digit"]] seOutput <- list(incompatibleCounts = as(incompatibleCounts$counts, "sparseVector"), + nonuniqueCounts = as(nonuniqueCounts, "sparseVector"), counts = as(round(counts$counts,sig.digit), "sparseVector"), CPM = as(round(counts$CPM,sig.digit), "sparseVector"), fullLengthCounts = as(round(counts$fullLengthCounts,sig.digit), "sparseVector"), diff --git a/R/bambu.R b/R/bambu.R index d6f1a76e..7baf1916 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -305,11 +305,14 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, incompatibleCounts_i <- quantData_i$incompatibleCounts[, columnIdx, drop = FALSE] incompatibleCounts_i <- rowSums(incompatibleCounts_i) + nonuniqueCounts_i <- quantData_i$nonuniqueCounts[, columnIdx, drop = FALSE] + nonuniqueCounts_i <- rowSums(nonuniqueCounts_i) - return(bambu.quantify(readClassDt = quantData_i$readClassDt, columnIdx = columnIdx, + return(bambu.quantify(readClassDt = quantData_i$readClassDt, columnIdx = columnIdx, incompatibleCounts = data.table(GENEID.i = names(incompatibleCounts_i), counts = incompatibleCounts_i), + nonuniqueCounts = nonuniqueCounts_i, txid.index = mcols(annotations)$txid, GENEIDs = names(incompatibleCounts_i), isoreParameters = isoreParameters, - emParameters = emParameters, trackReads = trackReads, + emParameters = emParameters, trackReads = trackReads, verbose = verbose))}, BPPARAM = bpParameters) end.ptm <- proc.time() diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index 44c2d747..6c6a029c 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -263,7 +263,7 @@ calculateDistTable <- function(readClassList, annotations, isoreParameters, verb #' Combine combined count se object from multiple samples, cells or spatial locations #' @noRd combineCountSes <- function(countsSe, colDataList, annotations){ - countsData <- c("counts", "CPM", "fullLengthCounts", "uniqueCounts", "incompatibleCounts") + countsData <- c("counts", "CPM", "fullLengthCounts", "uniqueCounts", "incompatibleCounts", "nonuniqueCounts") sampleNames <- names(countsSe) countsDataMat <- lapply(countsData, FUN = function(k){ countsVecList <- lapply(countsSe, function(j){j[[k]]}) @@ -276,7 +276,7 @@ combineCountSes <- function(countsSe, colDataList, annotations){ colnames(countsMat) <- sampleNames - if (k == "incompatibleCounts") + if (k %in% c("incompatibleCounts", "nonuniqueCounts")) rownames(countsMat) <- unique(mcols(annotations)$GENEID) return(countsMat) @@ -287,6 +287,7 @@ combineCountSes <- function(countsSe, colDataList, annotations){ fullLengthCounts = countsDataMat$fullLengthCounts, uniqueCounts = countsDataMat$uniqueCounts)) metadata(combinedCountsSe)$incompatibleCounts <- countsDataMat$incompatibleCounts + metadata(combinedCountsSe)$nonuniqueCounts <- countsDataMat$nonuniqueCounts rowRanges(combinedCountsSe) <- annotations colData(combinedCountsSe) <- DataFrame(bind_rows(colDataList)) From a30a5fd24b8e8d501540c61b77ae09280300c0a2 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Thu, 16 Apr 2026 23:25:05 +0800 Subject: [PATCH 09/29] refactor generateUniqueCountsFromQuantData to return SummarizedExperiment Replace generateUniqueCountsFromQuantData (returning a sparse matrix) with generateUniqueCountsSEFromQuantData, which returns a transcript-level SE with assays$uniqueCounts, metadata$incompatibleCounts, and metadata$nonuniqueCounts. This makes the output directly compatible with transcriptToGeneExpression. Move both functions into a new summarizeExpression.R (renamed from transcriptToGeneExpression.R) to reflect the broader scope of the file. Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-assignDist.R | 23 -------- R/summarizeExpression.R | 100 +++++++++++++++++++++++++++++++++ R/transcriptToGeneExpression.R | 53 ----------------- 3 files changed, 100 insertions(+), 76 deletions(-) create mode 100644 R/summarizeExpression.R delete mode 100644 R/transcriptToGeneExpression.R diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index b6137aa5..4426ad17 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -95,26 +95,3 @@ generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts return(geneMat) } - -#' Generate unique counts -#' @noRd -generateUniqueCountsFromQuantData <- function(quantData, annotations){ - uniqueCountsList <- lapply(quantData, function(x) { - readClassDt <- x$readClassDt - x_filtered <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) - - uniqueCounts <- if(nrow(x_filtered) == 0) { - sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nrow(x$sampleData))) - } else { - txids <- mcols(annotations)$txid - i <- rep(match(x_filtered$txid, txids), lengths(x_filtered$columnIds)) - j <- unlist(x_filtered$columnIds) - x_vals <- unlist(x_filtered$columnCounts) - sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nrow(x$sampleData))) - } - rownames(uniqueCounts) <- names(annotations) - colnames(uniqueCounts) <- rownames(x$sampleData) - return(uniqueCounts) - }) - return(do.call(cbind, uniqueCountsList)) -} diff --git a/R/summarizeExpression.R b/R/summarizeExpression.R new file mode 100644 index 00000000..855bf8b1 --- /dev/null +++ b/R/summarizeExpression.R @@ -0,0 +1,100 @@ +#' Reduce transcript expression to gene expression +#' @title transcript to gene expression +#' @param se a summarizedExperiment object from \code{\link{bambu}} +#' @return A SummarizedExperiment object +#' @import data.table +#' @export +#' @examples +#' se <- readRDS(system.file("extdata", +#' "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds", +#' package = "bambu" +#' )) +#' transcriptToGeneExpression(se) +transcriptToGeneExpression <- function(se) { + counts <- assays(se)$counts + runnames <- colnames(counts)[-1] + rowDataSe <- as.data.table(rowData(se)) + + counts = fac2sparse(factor(rowData(se)$GENEID, levels = unique(rowData(se)$GENEID))) %*% counts + if(!is.null(metadata(se)$incompatibleCounts)){ + incompatibleCounts <- metadata(se)$incompatibleCounts + if("nonuniqueCounts" %in% names(metadata(se))){ + incompatibleCounts = incompatibleCounts + metadata(se)$nonuniqueCounts + } + incompatibleCounts = Matrix(incompatibleCounts[match(rownames(counts), rownames(incompatibleCounts)),], sparse = TRUE) + counts = counts + incompatibleCounts + } + counts.total = colSums(counts) + counts.total[counts.total==0] = 1 + counts.CPM = counts/counts.total * 10^6 + + ## geneRanges + exByGene <- reducedRangesByGenes(rowRanges(se)) + if ("txClassDescription" %in% colnames(rowDataSe)) { + rowDataSe <- rowDataSe[, .(TXNAME, GENEID, txClassDescription)] + rowDataSe[, newGeneClass := ifelse(grepl("ENSG", GENEID), + "annotation", unique(txClassDescription)), by = GENEID] + mcols(exByGene) <- unique(rowDataSe[, .(GENEID, + newGeneClass)])[match(names(exByGene), GENEID)] + } + ## SE + RowNames <- rownames(counts) + ColNames <- colnames(counts) + ColData <- colData(se) + ColData@rownames <- ColNames + ColData@listData$name <- ColNames + seOutput <- SummarizedExperiment( + assays = SimpleList(counts = counts, + CPM = counts.CPM), + rowRanges = exByGene[RowNames], + colData = ColData) + + return(seOutput) +} + +#' Generate a SummarizedExperiment of unique counts from quantData +#' @description This function is intended to be used after the transcript +#' discovery and \code{assignDist} steps in \code{\link{bambu}}. It builds a +#' transcript-level SummarizedExperiment containing raw unique counts (reads +#' uniquely assigned to a single transcript) without EM estimation, which can +#' be passed directly to \code{\link{transcriptToGeneExpression}} to obtain +#' gene-level counts as uniqueCounts + nonuniqueCounts + incompatibleCounts. +#' @param quantData a list of quantData objects produced by the assignDist step +#' @param annotations a GRangesList of transcript annotations +#' @return A SummarizedExperiment object with \code{assays$uniqueCounts}, +#' \code{metadata$incompatibleCounts}, and \code{metadata$nonuniqueCounts} +#' @import data.table +#' @noRd +generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { + uniqueCountsList <- lapply(quantData, function(x) { + readClassDt <- x$readClassDt + x_filtered <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) + + uniqueCounts <- if (nrow(x_filtered) == 0) { + sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nrow(x$sampleData))) + } else { + txids <- mcols(annotations)$txid + i <- rep(match(x_filtered$txid, txids), lengths(x_filtered$columnIds)) + j <- unlist(x_filtered$columnIds) + x_vals <- unlist(x_filtered$columnCounts) + sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nrow(x$sampleData))) + } + rownames(uniqueCounts) <- names(annotations) + colnames(uniqueCounts) <- rownames(x$sampleData) + return(uniqueCounts) + }) + uniqueCounts <- do.call(cbind, uniqueCountsList) + + incompatibleCounts <- do.call(cbind, lapply(quantData, function(x) x$incompatibleCounts)) + nonuniqueCounts <- do.call(cbind, lapply(quantData, function(x) x$nonuniqueCounts)) + + colData <- do.call(rbind, lapply(quantData, function(x) x$sampleData)) + + se <- SummarizedExperiment(assays = SimpleList(uniqueCounts = uniqueCounts)) + rowRanges(se) <- annotations + colData(se) <- DataFrame(colData) + metadata(se)$incompatibleCounts <- incompatibleCounts + metadata(se)$nonuniqueCounts <- nonuniqueCounts + + return(se) +} diff --git a/R/transcriptToGeneExpression.R b/R/transcriptToGeneExpression.R deleted file mode 100644 index 963a9b00..00000000 --- a/R/transcriptToGeneExpression.R +++ /dev/null @@ -1,53 +0,0 @@ -#' Reduce transcript expression to gene expression -#' @title transcript to gene expression -#' @param se a summarizedExperiment object from \code{\link{bambu}} -#' @return A SummarizedExperiment object -#' @import data.table -#' @export -#' @examples -#' se <- readRDS(system.file("extdata", -#' "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds", -#' package = "bambu" -#' )) -#' transcriptToGeneExpression(se) -transcriptToGeneExpression <- function(se) { - counts <- assays(se)$counts - runnames <- colnames(counts)[-1] - rowDataSe <- as.data.table(rowData(se)) - - counts = fac2sparse(factor(rowData(se)$GENEID, levels = unique(rowData(se)$GENEID))) %*% counts - if(!is.null(metadata(se)$incompatibleCounts)){ - incompatibleCounts <- metadata(se)$incompatibleCounts - if("nonuniqueCounts" %in% names(metadata(se))){ - incompatibleCounts = incompatibleCounts + metadata(se)$nonuniqueCounts - } - incompatibleCounts = Matrix(incompatibleCounts[match(rownames(counts), rownames(incompatibleCounts)),], sparse = TRUE) - counts = counts + incompatibleCounts - } - counts.total = colSums(counts) - counts.total[counts.total==0] = 1 - counts.CPM = counts/counts.total * 10^6 - - ## geneRanges - exByGene <- reducedRangesByGenes(rowRanges(se)) - if ("txClassDescription" %in% colnames(rowDataSe)) { - rowDataSe <- rowDataSe[, .(TXNAME, GENEID, txClassDescription)] - rowDataSe[, newGeneClass := ifelse(grepl("ENSG", GENEID), - "annotation", unique(txClassDescription)), by = GENEID] - mcols(exByGene) <- unique(rowDataSe[, .(GENEID, - newGeneClass)])[match(names(exByGene), GENEID)] - } - ## SE - RowNames <- rownames(counts) - ColNames <- colnames(counts) - ColData <- colData(se) - ColData@rownames <- ColNames - ColData@listData$name <- ColNames - seOutput <- SummarizedExperiment( - assays = SimpleList(counts = counts, - CPM = counts.CPM), - rowRanges = exByGene[RowNames], - colData = ColData) - - return(seOutput) -} \ No newline at end of file From acf4a45a503bda2e05cf33a1aacfe5582c7549a8 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Thu, 16 Apr 2026 23:25:56 +0800 Subject: [PATCH 10/29] refactor transcriptToGeneExpression to compute gene counts as uniqueCounts + incompatibleCounts + nonuniqueCounts Previously transcriptToGeneExpression aggregated EM counts by gene then added incompatibleCounts and nonuniqueCounts, which double-counted nonuniqueCounts since EM counts already incorporate them at the gene level. Now use assays$uniqueCounts as the base, aggregate by gene, then add incompatibleCounts and nonuniqueCounts explicitly. This gives the correct total: reads uniquely assigned to a transcript + reads multi-aligning within a gene + reads incompatible with any transcript. Also fix generateIncompatibleCounts and generateNonUniqueCountMatrix to use unique(mcols(annotations)$GENEID) (appearance order) instead of levels(factor(unique(...))), consistent with combineCountSes, preventing gene ID mislabelling. Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-assignDist.R | 4 ++-- R/summarizeExpression.R | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index 4426ad17..f2049a9d 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -51,7 +51,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame #' Generate incompatible counts #' @noRd generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ - genes <- levels(factor(unique(mcols(annotations)$GENEID))) + genes <- unique(mcols(annotations)$GENEID) rownames(incompatibleCountMatrix) <- genes[as.numeric(rownames(incompatibleCountMatrix))] geneMat <- sparseMatrix(length(genes), ncol(incompatibleCountMatrix), x = 0) rownames(geneMat) <- genes @@ -63,7 +63,7 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ #' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt #' @noRd generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ - genes <- levels(factor(unique(mcols(annotations)$GENEID))) + genes <- unique(mcols(annotations)$GENEID) nSamples <- length(sampleIds) geneMat <- sparseMatrix(length(genes), nSamples, x = 0) rownames(geneMat) <- genes diff --git a/R/summarizeExpression.R b/R/summarizeExpression.R index 855bf8b1..b4f78145 100644 --- a/R/summarizeExpression.R +++ b/R/summarizeExpression.R @@ -11,19 +11,13 @@ #' )) #' transcriptToGeneExpression(se) transcriptToGeneExpression <- function(se) { - counts <- assays(se)$counts - runnames <- colnames(counts)[-1] + uniqueCounts <- assays(se)$uniqueCounts rowDataSe <- as.data.table(rowData(se)) - counts = fac2sparse(factor(rowData(se)$GENEID, levels = unique(rowData(se)$GENEID))) %*% counts - if(!is.null(metadata(se)$incompatibleCounts)){ - incompatibleCounts <- metadata(se)$incompatibleCounts - if("nonuniqueCounts" %in% names(metadata(se))){ - incompatibleCounts = incompatibleCounts + metadata(se)$nonuniqueCounts - } - incompatibleCounts = Matrix(incompatibleCounts[match(rownames(counts), rownames(incompatibleCounts)),], sparse = TRUE) - counts = counts + incompatibleCounts - } + uniqueCounts = fac2sparse(factor(rowData(se)$GENEID, levels = unique(rowData(se)$GENEID))) %*% uniqueCounts + incompatibleCounts <- metadata(se)$incompatibleCounts + nonuniqueCounts <- metadata(se)$nonuniqueCounts + counts = uniqueCounts + incompatibleCounts + nonuniqueCounts counts.total = colSums(counts) counts.total[counts.total==0] = 1 counts.CPM = counts/counts.total * 10^6 From 5366795020b9ca49f3c91659aac23d980c2a457b Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 00:10:53 +0800 Subject: [PATCH 11/29] refactor constructQuantData class using claude Replace catch-all $ method with typed accessors (getSampleData, getReadClassDt, getIncompatibleCounts, getNonuniqueCounts, getDistTable, getReadToTranscriptMap), add prototype, validity checks, and exported constructor constructQuantData. Type incompatibleCounts and nonuniqueCounts slots as sparseMatrix. Update all call sites in bambu.R, bambu-assignDist.R, and summarizeExpression.R. Also fixes pre-existing sampleNames typo bug in bambu.R. Drop exportClass(quantData) and exportMethods("$") from NAMESPACE. Co-Authored-By: Claude Sonnet 4.6 --- NAMESPACE | 2 -- R/bambu-assignDist.R | 12 +++---- R/bambu-classes.R | 75 ++++++++++++++++++++++++++++++++++------- R/bambu.R | 22 ++++++------ R/summarizeExpression.R | 14 ++++---- 5 files changed, 86 insertions(+), 39 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e51b129b..3adb4e37 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,8 +11,6 @@ export(writeAnnotationsToGTF) export(trainBambu) export(setNDR) export(compareTranscripts) -exportClass(quantData) -exportMethods("$") importFrom(stats,predict) importFrom(BiocGenerics,basename) importFrom(BiocGenerics,unstrand) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index f2049a9d..b680821e 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -36,12 +36,12 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame metadata(readClassList)$readClassDist, annotations) else NULL - quantData <- new("quantData", - sampleData = data.frame(ColData), - readClassDt = readClassDt, - incompatibleCounts = incompatibleCounts, - nonuniqueCounts = nonuniqueCounts, - distTable = distTable, + quantData <- constructQuantData( + sampleData = data.frame(ColData), + readClassDt = readClassDt, + incompatibleCounts = incompatibleCounts, + nonuniqueCounts = nonuniqueCounts, + distTable = distTable, readToTranscriptMap = readToTranscriptMap ) diff --git a/R/bambu-classes.R b/R/bambu-classes.R index 3825740c..4eebae94 100644 --- a/R/bambu-classes.R +++ b/R/bambu-classes.R @@ -3,19 +3,68 @@ #' @importFrom data.table data.table #' @importClassesFrom Matrix Matrix #' @importClassesFrom data.table data.table -#' @export setClass("quantData", - slots = list( - sampleData = "data.frame", - readClassDt = "data.table", - incompatibleCounts = "ANY", - nonuniqueCounts = "ANY", - distTable = "ANY", + slots = c( + sampleData = "data.frame", + readClassDt = "data.table", + incompatibleCounts = "sparseMatrix", + nonuniqueCounts = "sparseMatrix", + distTable = "ANY", readToTranscriptMap = "ANY" - ) -) + ), + prototype = list( + sampleData = data.frame(id = integer(), sampleName = character()), + readClassDt = data.table::data.table() + ), + validity = function(object) { + errs <- character() + if (!all(c("id", "sampleName") %in% names(object@sampleData))) + errs <- c(errs, "sampleData must have columns 'id' and 'sampleName'") + if (!is.null(object@distTable) && !is(object@distTable, "DataFrame")) + errs <- c(errs, "distTable must be a DataFrame or NULL") + if (!is.null(object@readToTranscriptMap) && !is(object@readToTranscriptMap, "tbl_df")) + errs <- c(errs, "readToTranscriptMap must be a tibble or NULL") + if (length(errs)) errs else TRUE + }) -#' @export -setMethod("$", signature(x = "quantData"), function(x, name) { - slot(x, name) -}) +#' Construct a quantData object +#' @noRd +constructQuantData <- function(sampleData, readClassDt, + incompatibleCounts, + nonuniqueCounts, + distTable = NULL, + readToTranscriptMap = NULL) { + new("quantData", + sampleData = sampleData, + readClassDt = readClassDt, + incompatibleCounts = incompatibleCounts, + nonuniqueCounts = nonuniqueCounts, + distTable = distTable, + readToTranscriptMap = readToTranscriptMap) +} + +#' @noRd +setGeneric("getSampleData", function(x) standardGeneric("getSampleData")) +#' @noRd +setGeneric("getReadClassDt", function(x) standardGeneric("getReadClassDt")) +#' @noRd +setGeneric("getIncompatibleCounts", function(x) standardGeneric("getIncompatibleCounts")) +#' @noRd +setGeneric("getNonuniqueCounts", function(x) standardGeneric("getNonuniqueCounts")) +#' @noRd +setGeneric("getDistTable", function(x) standardGeneric("getDistTable")) +#' @noRd +setGeneric("getReadToTranscriptMap", function(x) standardGeneric("getReadToTranscriptMap")) + +#' @noRd +setMethod("getSampleData", "quantData", function(x) x@sampleData) +#' @noRd +setMethod("getReadClassDt", "quantData", function(x) x@readClassDt) +#' @noRd +setMethod("getIncompatibleCounts", "quantData", function(x) x@incompatibleCounts) +#' @noRd +setMethod("getNonuniqueCounts", "quantData", function(x) x@nonuniqueCounts) +#' @noRd +setMethod("getDistTable", "quantData", function(x) x@distTable) +#' @noRd +setMethod("getReadToTranscriptMap", "quantData", function(x) x@readToTranscriptMap) diff --git a/R/bambu.R b/R/bambu.R index 7baf1916..4a0c6667 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -274,17 +274,17 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, for(i in seq_along(quantData)){ quantData_i <- quantData[[i]] #load in the barcode clustering from file if provided - iter <- seq_len(nrow(quantData_i$sampleData)) # iter is integer + iter <- seq_len(nrow(getSampleData(quantData_i))) # iter is integer if(!is.null(clusters)){ if(class(clusters[[i]])!="CompressedCharacterList"){ # !is.list(clusters) is FALSE for CompressedCharacterList clusterMaps <- NULL - for(j in seq_along(quantData_i$sampleNames)){ #load in a file per sample name provided + for(j in seq_along(getSampleData(quantData_i)$sampleName)){ #load in a file per sample name provided clusterMap <- fread(clusters[[j]], header = FALSE, data.table = FALSE) # read.table(clusters[[j]], # sep = ifelse(grepl(".tsv$",clusters[[j]]), "\t", ","), # header = FALSE) - clusterMap[,1] <- paste0(quantData_i$sampleNames[j], + clusterMap[,1] <- paste0(getSampleData(quantData_i)$sampleName[j], "_",clusterMap[,1]) clusterMaps <- rbind(clusterMaps, clusterMap) } @@ -300,15 +300,15 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, countsSeCompressed <- bplapply(iter, FUN = function(columnIdx){ # previous i changed to j to avoid duplicated assignment #i = iter[i %in% colnames(metadata(quantData_i)$countMatrix)] #bug, after assignment, i become emptyprint(i) if(is.character(columnIdx)) { - columnIdx <- match(columnIdx, quantData_i$sampleData$id) + columnIdx <- match(columnIdx, getSampleData(quantData_i)$id) } - incompatibleCounts_i <- quantData_i$incompatibleCounts[, columnIdx, drop = FALSE] + incompatibleCounts_i <- getIncompatibleCounts(quantData_i)[, columnIdx, drop = FALSE] incompatibleCounts_i <- rowSums(incompatibleCounts_i) - nonuniqueCounts_i <- quantData_i$nonuniqueCounts[, columnIdx, drop = FALSE] + nonuniqueCounts_i <- getNonuniqueCounts(quantData_i)[, columnIdx, drop = FALSE] nonuniqueCounts_i <- rowSums(nonuniqueCounts_i) - return(bambu.quantify(readClassDt = quantData_i$readClassDt, columnIdx = columnIdx, + return(bambu.quantify(readClassDt = getReadClassDt(quantData_i), columnIdx = columnIdx, incompatibleCounts = data.table(GENEID.i = names(incompatibleCounts_i), counts = incompatibleCounts_i), nonuniqueCounts = nonuniqueCounts_i, txid.index = mcols(annotations)$txid, GENEIDs = names(incompatibleCounts_i), isoreParameters = isoreParameters, @@ -325,8 +325,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, row.names = names(countsSeCompressed) ) } else{ - ColNames <- c(ColNames, rownames(quantData_i$sampleData)) - colData.all[[i]] <- data.frame(quantData_i$sampleData) + ColNames <- c(ColNames, rownames(getSampleData(quantData_i))) + colData.all[[i]] <- data.frame(getSampleData(quantData_i)) } countsSeCompressed.all <- c(countsSeCompressed.all, countsSeCompressed) } @@ -336,7 +336,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if(trackReads){ readToTranscriptMaps = list() for(i in seq_along(quantData)){ - readToTranscriptMaps[[i]] <- quantData[[i]]$readToTranscriptMap + readToTranscriptMaps[[i]] <- getReadToTranscriptMap(quantData[[i]]) } names(readToTranscriptMaps) <- names(quantData) metadata(countsSe)$readToTranscriptMaps <- readToTranscriptMaps @@ -344,7 +344,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if(returnDistTable){ distTables = list() for(i in seq_along(quantData)){ - distTables[[i]] <- quantData[[i]]$distTable + distTables[[i]] <- getDistTable(quantData[[i]]) } names(distTables) <- names(quantData) metadata(countsSe)$distTables <- distTables diff --git a/R/summarizeExpression.R b/R/summarizeExpression.R index b4f78145..619c58a6 100644 --- a/R/summarizeExpression.R +++ b/R/summarizeExpression.R @@ -61,28 +61,28 @@ transcriptToGeneExpression <- function(se) { #' @noRd generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { uniqueCountsList <- lapply(quantData, function(x) { - readClassDt <- x$readClassDt + readClassDt <- getReadClassDt(x) x_filtered <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) uniqueCounts <- if (nrow(x_filtered) == 0) { - sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nrow(x$sampleData))) + sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nrow(getSampleData(x)))) } else { txids <- mcols(annotations)$txid i <- rep(match(x_filtered$txid, txids), lengths(x_filtered$columnIds)) j <- unlist(x_filtered$columnIds) x_vals <- unlist(x_filtered$columnCounts) - sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nrow(x$sampleData))) + sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nrow(getSampleData(x)))) } rownames(uniqueCounts) <- names(annotations) - colnames(uniqueCounts) <- rownames(x$sampleData) + colnames(uniqueCounts) <- rownames(getSampleData(x)) return(uniqueCounts) }) uniqueCounts <- do.call(cbind, uniqueCountsList) - incompatibleCounts <- do.call(cbind, lapply(quantData, function(x) x$incompatibleCounts)) - nonuniqueCounts <- do.call(cbind, lapply(quantData, function(x) x$nonuniqueCounts)) + incompatibleCounts <- do.call(cbind, lapply(quantData, getIncompatibleCounts)) + nonuniqueCounts <- do.call(cbind, lapply(quantData, getNonuniqueCounts)) - colData <- do.call(rbind, lapply(quantData, function(x) x$sampleData)) + colData <- do.call(rbind, lapply(quantData, getSampleData)) se <- SummarizedExperiment(assays = SimpleList(uniqueCounts = uniqueCounts)) rowRanges(se) <- annotations From e895bee74e30d0c496f0112c54fa284a72453486 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 10:05:55 +0800 Subject: [PATCH 12/29] not storing nonuniqueCountMatrix in quantData nonuniqueCountMatrix is now computed on demand in bambu.R and generateUniqueCountsSEFromQuantData, and stored only in the uniqueCountsSe and final SE metadata. Moved generateNonUniqueCountMatrix to bambu_utilityFunctions.R as it is shared across both call sites. Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-assignDist.R | 38 -------------------------------------- R/bambu-classes.R | 7 ------- R/bambu.R | 7 ++++--- R/bambu_utilityFunctions.R | 38 +++++++++++++++++++++++++++++++++++++- R/summarizeExpression.R | 4 +++- 5 files changed, 44 insertions(+), 50 deletions(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index b680821e..638b7211 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -28,7 +28,6 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix incompatibleCounts <- generateIncompatibleCounts(incompatibleCountMatrix, annotations) - nonuniqueCounts <- generateNonUniqueCountMatrix(readClassDt, annotations, metadata(readClassList)$sampleData$id) distTable <- if(returnDistTable) metadata(metadata(readClassList)$readClassDist)$distTableOld else NULL @@ -40,7 +39,6 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame sampleData = data.frame(ColData), readClassDt = readClassDt, incompatibleCounts = incompatibleCounts, - nonuniqueCounts = nonuniqueCounts, distTable = distTable, readToTranscriptMap = readToTranscriptMap ) @@ -59,39 +57,3 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ geneMat[match(rownames(incompatibleCountMatrix), rownames(geneMat)), ] <- incompatibleCountMatrix return(geneMat) } - -#' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt -#' @noRd -generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ - genes <- unique(mcols(annotations)$GENEID) - nSamples <- length(sampleIds) - geneMat <- sparseMatrix(length(genes), nSamples, x = 0) - rownames(geneMat) <- genes - colnames(geneMat) <- sampleIds - - x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) - x <- x %>% distinct(eqClassId, .keep_all = TRUE) - if (nrow(x) == 0) return(geneMat) - - i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) - j <- unlist(x$columnIds) - x_vals <- unlist(x$columnCounts) - nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) - - if (nrow(x) > 1 & length(unique(x$gene_sid)) > 1) { - nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } else { - nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } - - geneids <- as.numeric(levels(factor(x$gene_sid))) - geneids <- x$txid[match(geneids, x$gene_sid)] - geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] - rownames(nonuniqueCounts) <- geneids - colnames(nonuniqueCounts) <- sampleIds - - geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts - return(geneMat) -} diff --git a/R/bambu-classes.R b/R/bambu-classes.R index 4eebae94..b4b54cee 100644 --- a/R/bambu-classes.R +++ b/R/bambu-classes.R @@ -8,7 +8,6 @@ setClass("quantData", sampleData = "data.frame", readClassDt = "data.table", incompatibleCounts = "sparseMatrix", - nonuniqueCounts = "sparseMatrix", distTable = "ANY", readToTranscriptMap = "ANY" ), @@ -31,14 +30,12 @@ setClass("quantData", #' @noRd constructQuantData <- function(sampleData, readClassDt, incompatibleCounts, - nonuniqueCounts, distTable = NULL, readToTranscriptMap = NULL) { new("quantData", sampleData = sampleData, readClassDt = readClassDt, incompatibleCounts = incompatibleCounts, - nonuniqueCounts = nonuniqueCounts, distTable = distTable, readToTranscriptMap = readToTranscriptMap) } @@ -50,8 +47,6 @@ setGeneric("getReadClassDt", function(x) standardGeneric("getReadClassDt #' @noRd setGeneric("getIncompatibleCounts", function(x) standardGeneric("getIncompatibleCounts")) #' @noRd -setGeneric("getNonuniqueCounts", function(x) standardGeneric("getNonuniqueCounts")) -#' @noRd setGeneric("getDistTable", function(x) standardGeneric("getDistTable")) #' @noRd setGeneric("getReadToTranscriptMap", function(x) standardGeneric("getReadToTranscriptMap")) @@ -63,8 +58,6 @@ setMethod("getReadClassDt", "quantData", function(x) x@readClassDt) #' @noRd setMethod("getIncompatibleCounts", "quantData", function(x) x@incompatibleCounts) #' @noRd -setMethod("getNonuniqueCounts", "quantData", function(x) x@nonuniqueCounts) -#' @noRd setMethod("getDistTable", "quantData", function(x) x@distTable) #' @noRd setMethod("getReadToTranscriptMap", "quantData", function(x) x@readToTranscriptMap) diff --git a/R/bambu.R b/R/bambu.R index 4a0c6667..9f78d588 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -297,15 +297,16 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, iter <- clusters[[i]] } } - countsSeCompressed <- bplapply(iter, FUN = function(columnIdx){ # previous i changed to j to avoid duplicated assignment + nonuniqueCountMat_i <- generateNonUniqueCountMatrix(getReadClassDt(quantData_i), annotations, getSampleData(quantData_i)$id) + countsSeCompressed <- bplapply(iter, FUN = function(columnIdx){ # previous i changed to j to avoid duplicated assignment #i = iter[i %in% colnames(metadata(quantData_i)$countMatrix)] #bug, after assignment, i become emptyprint(i) if(is.character(columnIdx)) { columnIdx <- match(columnIdx, getSampleData(quantData_i)$id) } - + incompatibleCounts_i <- getIncompatibleCounts(quantData_i)[, columnIdx, drop = FALSE] incompatibleCounts_i <- rowSums(incompatibleCounts_i) - nonuniqueCounts_i <- getNonuniqueCounts(quantData_i)[, columnIdx, drop = FALSE] + nonuniqueCounts_i <- nonuniqueCountMat_i[, columnIdx, drop = FALSE] nonuniqueCounts_i <- rowSums(nonuniqueCounts_i) return(bambu.quantify(readClassDt = getReadClassDt(quantData_i), columnIdx = columnIdx, diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index 6c6a029c..72ac9f07 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -331,8 +331,44 @@ generateColData <- function(readClassList, sampleMetadata, demultiplexed) { } # Quick wrapper function (https://stackoverflow.com/questions/13273833/merging-multiple-data-tables) -#' @noRd +#' @noRd merge_wrapper <- function(x,y){ merge.data.table(x,y,by = "GENEID",all=TRUE) } +#' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt +#' @noRd +generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ + genes <- unique(mcols(annotations)$GENEID) + nSamples <- length(sampleIds) + geneMat <- sparseMatrix(length(genes), nSamples, x = 0) + rownames(geneMat) <- genes + colnames(geneMat) <- sampleIds + + x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) + x <- x %>% distinct(eqClassId, .keep_all = TRUE) + if (nrow(x) == 0) return(geneMat) + + i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) + j <- unlist(x$columnIds) + x_vals <- unlist(x$columnCounts) + nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) + + if (nrow(x) > 1 & length(unique(x$gene_sid)) > 1) { + nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } else { + nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } + + geneids <- as.numeric(levels(factor(x$gene_sid))) + geneids <- x$txid[match(geneids, x$gene_sid)] + geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] + rownames(nonuniqueCounts) <- geneids + colnames(nonuniqueCounts) <- sampleIds + + geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts + return(geneMat) +} + diff --git a/R/summarizeExpression.R b/R/summarizeExpression.R index 619c58a6..767ede01 100644 --- a/R/summarizeExpression.R +++ b/R/summarizeExpression.R @@ -80,7 +80,9 @@ generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { uniqueCounts <- do.call(cbind, uniqueCountsList) incompatibleCounts <- do.call(cbind, lapply(quantData, getIncompatibleCounts)) - nonuniqueCounts <- do.call(cbind, lapply(quantData, getNonuniqueCounts)) + nonuniqueCounts <- do.call(cbind, lapply(quantData, function(x) { + generateNonUniqueCountMatrix(getReadClassDt(x), annotations, getSampleData(x)$id) + })) colData <- do.call(rbind, lapply(quantData, getSampleData)) From 572ef70f1d9c4c07a88cd478f7199a619c17b39f Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 11:50:17 +0800 Subject: [PATCH 13/29] revert transcriptToGeneExpression & remove nonuniqueCounts from final SE - revert transcriptToGeneExpression to use assays$counts + incompatibleCounts (+ nonuniqueCounts if present in metadata, for uniqueCounts SE path) - rename summarizeExpression.R to transcriptToGeneExpression.R - move generateNonUniqueCountMatrix to transcriptToGeneExpression.R - remove nonuniqueCounts from bambu.quantify and combineCountSes Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-quantify.R | 3 +- R/bambu.R | 4 - R/bambu_utilityFunctions.R | 41 +--------- R/transcriptToGeneExpression.R | 134 +++++++++++++++++++++++++++++++++ 4 files changed, 137 insertions(+), 45 deletions(-) create mode 100644 R/transcriptToGeneExpression.R diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index 0287020b..686f8fd3 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -2,7 +2,7 @@ #' @inheritParams bambu #' @import data.table #' @noRd -bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, nonuniqueCounts, txid.index, GENEIDs, emParameters, +bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, txid.index, GENEIDs, emParameters, trackReads = FALSE, returnDistTable = FALSE, verbose = FALSE, isoreParameters = setIsoreParameters(NULL)) { start.ptm <- proc.time() @@ -28,7 +28,6 @@ bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, nonunique counts <- compatibleCounts[match(txid.index, txid)] sig.digit <- emParameters[["sig.digit"]] seOutput <- list(incompatibleCounts = as(incompatibleCounts$counts, "sparseVector"), - nonuniqueCounts = as(nonuniqueCounts, "sparseVector"), counts = as(round(counts$counts,sig.digit), "sparseVector"), CPM = as(round(counts$CPM,sig.digit), "sparseVector"), fullLengthCounts = as(round(counts$fullLengthCounts,sig.digit), "sparseVector"), diff --git a/R/bambu.R b/R/bambu.R index 9f78d588..e3e13d91 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -297,7 +297,6 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, iter <- clusters[[i]] } } - nonuniqueCountMat_i <- generateNonUniqueCountMatrix(getReadClassDt(quantData_i), annotations, getSampleData(quantData_i)$id) countsSeCompressed <- bplapply(iter, FUN = function(columnIdx){ # previous i changed to j to avoid duplicated assignment #i = iter[i %in% colnames(metadata(quantData_i)$countMatrix)] #bug, after assignment, i become emptyprint(i) if(is.character(columnIdx)) { @@ -306,12 +305,9 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, incompatibleCounts_i <- getIncompatibleCounts(quantData_i)[, columnIdx, drop = FALSE] incompatibleCounts_i <- rowSums(incompatibleCounts_i) - nonuniqueCounts_i <- nonuniqueCountMat_i[, columnIdx, drop = FALSE] - nonuniqueCounts_i <- rowSums(nonuniqueCounts_i) return(bambu.quantify(readClassDt = getReadClassDt(quantData_i), columnIdx = columnIdx, incompatibleCounts = data.table(GENEID.i = names(incompatibleCounts_i), counts = incompatibleCounts_i), - nonuniqueCounts = nonuniqueCounts_i, txid.index = mcols(annotations)$txid, GENEIDs = names(incompatibleCounts_i), isoreParameters = isoreParameters, emParameters = emParameters, trackReads = trackReads, verbose = verbose))}, diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index 72ac9f07..889f0342 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -263,7 +263,7 @@ calculateDistTable <- function(readClassList, annotations, isoreParameters, verb #' Combine combined count se object from multiple samples, cells or spatial locations #' @noRd combineCountSes <- function(countsSe, colDataList, annotations){ - countsData <- c("counts", "CPM", "fullLengthCounts", "uniqueCounts", "incompatibleCounts", "nonuniqueCounts") + countsData <- c("counts", "CPM", "fullLengthCounts", "uniqueCounts", "incompatibleCounts") sampleNames <- names(countsSe) countsDataMat <- lapply(countsData, FUN = function(k){ countsVecList <- lapply(countsSe, function(j){j[[k]]}) @@ -276,7 +276,7 @@ combineCountSes <- function(countsSe, colDataList, annotations){ colnames(countsMat) <- sampleNames - if (k %in% c("incompatibleCounts", "nonuniqueCounts")) + if (k == "incompatibleCounts") rownames(countsMat) <- unique(mcols(annotations)$GENEID) return(countsMat) @@ -287,7 +287,6 @@ combineCountSes <- function(countsSe, colDataList, annotations){ fullLengthCounts = countsDataMat$fullLengthCounts, uniqueCounts = countsDataMat$uniqueCounts)) metadata(combinedCountsSe)$incompatibleCounts <- countsDataMat$incompatibleCounts - metadata(combinedCountsSe)$nonuniqueCounts <- countsDataMat$nonuniqueCounts rowRanges(combinedCountsSe) <- annotations colData(combinedCountsSe) <- DataFrame(bind_rows(colDataList)) @@ -336,39 +335,3 @@ merge_wrapper <- function(x,y){ merge.data.table(x,y,by = "GENEID",all=TRUE) } -#' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt -#' @noRd -generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ - genes <- unique(mcols(annotations)$GENEID) - nSamples <- length(sampleIds) - geneMat <- sparseMatrix(length(genes), nSamples, x = 0) - rownames(geneMat) <- genes - colnames(geneMat) <- sampleIds - - x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) - x <- x %>% distinct(eqClassId, .keep_all = TRUE) - if (nrow(x) == 0) return(geneMat) - - i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) - j <- unlist(x$columnIds) - x_vals <- unlist(x$columnCounts) - nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) - - if (nrow(x) > 1 & length(unique(x$gene_sid)) > 1) { - nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } else { - nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } - - geneids <- as.numeric(levels(factor(x$gene_sid))) - geneids <- x$txid[match(geneids, x$gene_sid)] - geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] - rownames(nonuniqueCounts) <- geneids - colnames(nonuniqueCounts) <- sampleIds - - geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts - return(geneMat) -} - diff --git a/R/transcriptToGeneExpression.R b/R/transcriptToGeneExpression.R new file mode 100644 index 00000000..ebac146e --- /dev/null +++ b/R/transcriptToGeneExpression.R @@ -0,0 +1,134 @@ +#' Reduce transcript expression to gene expression +#' @title transcript to gene expression +#' @param se a summarizedExperiment object from \code{\link{bambu}} +#' @return A SummarizedExperiment object +#' @import data.table +#' @export +#' @examples +#' se <- readRDS(system.file("extdata", +#' "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds", +#' package = "bambu" +#' )) +#' transcriptToGeneExpression(se) +transcriptToGeneExpression <- function(se) { + counts <- assays(se)$counts + rowDataSe <- as.data.table(rowData(se)) + + counts = fac2sparse(factor(rowData(se)$GENEID, levels = unique(rowData(se)$GENEID))) %*% counts + if (!is.null(metadata(se)$incompatibleCounts)) { + incompatibleCounts <- metadata(se)$incompatibleCounts + if ("nonuniqueCounts" %in% names(metadata(se))) + incompatibleCounts <- incompatibleCounts + metadata(se)$nonuniqueCounts + incompatibleCounts <- Matrix(incompatibleCounts[match(rownames(counts), rownames(incompatibleCounts)), ], sparse = TRUE) + counts <- counts + incompatibleCounts + } + counts.total = colSums(counts) + counts.total[counts.total==0] = 1 + counts.CPM = counts/counts.total * 10^6 + + ## geneRanges + exByGene <- reducedRangesByGenes(rowRanges(se)) + if ("txClassDescription" %in% colnames(rowDataSe)) { + rowDataSe <- rowDataSe[, .(TXNAME, GENEID, txClassDescription)] + rowDataSe[, newGeneClass := ifelse(grepl("ENSG", GENEID), + "annotation", unique(txClassDescription)), by = GENEID] + mcols(exByGene) <- unique(rowDataSe[, .(GENEID, + newGeneClass)])[match(names(exByGene), GENEID)] + } + ## SE + RowNames <- rownames(counts) + ColNames <- colnames(counts) + ColData <- colData(se) + ColData@rownames <- ColNames + ColData@listData$name <- ColNames + seOutput <- SummarizedExperiment( + assays = SimpleList(counts = counts, + CPM = counts.CPM), + rowRanges = exByGene[RowNames], + colData = ColData) + return(seOutput) +} + +#' Generate a SummarizedExperiment of unique counts from quantData +#' @description This function is intended to be used after the transcript +#' discovery and \code{assignDist} steps in \code{\link{bambu}}. It builds a +#' transcript-level SummarizedExperiment containing raw unique counts (reads +#' uniquely assigned to a single transcript) without EM estimation, which can +#' be passed directly to \code{\link{transcriptToGeneExpression}} to obtain +#' gene-level counts as uniqueCounts + nonuniqueCounts + incompatibleCounts. +#' @param quantData a list of quantData objects produced by the assignDist step +#' @param annotations a GRangesList of transcript annotations +#' @return A SummarizedExperiment object with \code{assays$uniqueCounts}, +#' \code{metadata$incompatibleCounts}, and \code{metadata$nonuniqueCounts} +#' @import data.table +#' @noRd +generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { + uniqueCountsList <- lapply(quantData, function(x) { + readClassDt <- getReadClassDt(x) + x_filtered <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) + + uniqueCounts <- if (nrow(x_filtered) == 0) { + sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nrow(getSampleData(x)))) + } else { + txids <- mcols(annotations)$txid + i <- rep(match(x_filtered$txid, txids), lengths(x_filtered$columnIds)) + j <- unlist(x_filtered$columnIds) + x_vals <- unlist(x_filtered$columnCounts) + sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nrow(getSampleData(x)))) + } + rownames(uniqueCounts) <- names(annotations) + colnames(uniqueCounts) <- rownames(getSampleData(x)) + return(uniqueCounts) + }) + uniqueCounts <- do.call(cbind, uniqueCountsList) + + incompatibleCounts <- do.call(cbind, lapply(quantData, getIncompatibleCounts)) + nonuniqueCounts <- do.call(cbind, lapply(quantData, function(x) { + generateNonUniqueCountMatrix(getReadClassDt(x), annotations, getSampleData(x)$id) + })) + + colData <- do.call(rbind, lapply(quantData, getSampleData)) + + se <- SummarizedExperiment(assays = SimpleList(counts = uniqueCounts)) + rowRanges(se) <- annotations + colData(se) <- DataFrame(colData) + metadata(se)$incompatibleCounts <- incompatibleCounts + metadata(se)$nonuniqueCounts <- nonuniqueCounts + return(se) +} + +#' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt +#' @noRd +generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ + genes <- unique(mcols(annotations)$GENEID) + nSamples <- length(sampleIds) + geneMat <- sparseMatrix(length(genes), nSamples, x = 0) + rownames(geneMat) <- genes + colnames(geneMat) <- sampleIds + + x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) + x <- x %>% distinct(eqClassId, .keep_all = TRUE) + if (nrow(x) == 0) return(geneMat) + + i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) + j <- unlist(x$columnIds) + x_vals <- unlist(x$columnCounts) + nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) + + if (nrow(x) > 1 & length(unique(x$gene_sid)) > 1) { + nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } else { + nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } + + geneids <- as.numeric(levels(factor(x$gene_sid))) + geneids <- x$txid[match(geneids, x$gene_sid)] + geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] + rownames(nonuniqueCounts) <- geneids + colnames(nonuniqueCounts) <- sampleIds + + geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts + return(geneMat) +} From 4277d65be83f279ee8051da68f9514eda1f3fdc5 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 11:51:06 +0800 Subject: [PATCH 14/29] add seType tag to SE objects tag SE objects with metadata$seType to distinguish their content: "uniqueCounts", "EMCounts", and "geneCounts" Co-Authored-By: Claude Sonnet 4.6 --- R/bambu_utilityFunctions.R | 1 + R/transcriptToGeneExpression.R | 2 ++ 2 files changed, 3 insertions(+) diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index 889f0342..dc66fac3 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -287,6 +287,7 @@ combineCountSes <- function(countsSe, colDataList, annotations){ fullLengthCounts = countsDataMat$fullLengthCounts, uniqueCounts = countsDataMat$uniqueCounts)) metadata(combinedCountsSe)$incompatibleCounts <- countsDataMat$incompatibleCounts + metadata(combinedCountsSe)$seType <- "EMCounts" rowRanges(combinedCountsSe) <- annotations colData(combinedCountsSe) <- DataFrame(bind_rows(colDataList)) diff --git a/R/transcriptToGeneExpression.R b/R/transcriptToGeneExpression.R index ebac146e..99fd49be 100644 --- a/R/transcriptToGeneExpression.R +++ b/R/transcriptToGeneExpression.R @@ -46,6 +46,7 @@ transcriptToGeneExpression <- function(se) { CPM = counts.CPM), rowRanges = exByGene[RowNames], colData = ColData) + metadata(seOutput)$seType <- "geneCounts" return(seOutput) } @@ -94,6 +95,7 @@ generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { colData(se) <- DataFrame(colData) metadata(se)$incompatibleCounts <- incompatibleCounts metadata(se)$nonuniqueCounts <- nonuniqueCounts + metadata(se)$seType <- "uniqueCounts" return(se) } From 8a6cb1eab2b170415584bfb201574b537bb19ab8 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 12:02:55 +0800 Subject: [PATCH 15/29] remove summarizeExpression.R Co-Authored-By: Claude Sonnet 4.6 --- R/summarizeExpression.R | 96 ----------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 R/summarizeExpression.R diff --git a/R/summarizeExpression.R b/R/summarizeExpression.R deleted file mode 100644 index 767ede01..00000000 --- a/R/summarizeExpression.R +++ /dev/null @@ -1,96 +0,0 @@ -#' Reduce transcript expression to gene expression -#' @title transcript to gene expression -#' @param se a summarizedExperiment object from \code{\link{bambu}} -#' @return A SummarizedExperiment object -#' @import data.table -#' @export -#' @examples -#' se <- readRDS(system.file("extdata", -#' "seOutput_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds", -#' package = "bambu" -#' )) -#' transcriptToGeneExpression(se) -transcriptToGeneExpression <- function(se) { - uniqueCounts <- assays(se)$uniqueCounts - rowDataSe <- as.data.table(rowData(se)) - - uniqueCounts = fac2sparse(factor(rowData(se)$GENEID, levels = unique(rowData(se)$GENEID))) %*% uniqueCounts - incompatibleCounts <- metadata(se)$incompatibleCounts - nonuniqueCounts <- metadata(se)$nonuniqueCounts - counts = uniqueCounts + incompatibleCounts + nonuniqueCounts - counts.total = colSums(counts) - counts.total[counts.total==0] = 1 - counts.CPM = counts/counts.total * 10^6 - - ## geneRanges - exByGene <- reducedRangesByGenes(rowRanges(se)) - if ("txClassDescription" %in% colnames(rowDataSe)) { - rowDataSe <- rowDataSe[, .(TXNAME, GENEID, txClassDescription)] - rowDataSe[, newGeneClass := ifelse(grepl("ENSG", GENEID), - "annotation", unique(txClassDescription)), by = GENEID] - mcols(exByGene) <- unique(rowDataSe[, .(GENEID, - newGeneClass)])[match(names(exByGene), GENEID)] - } - ## SE - RowNames <- rownames(counts) - ColNames <- colnames(counts) - ColData <- colData(se) - ColData@rownames <- ColNames - ColData@listData$name <- ColNames - seOutput <- SummarizedExperiment( - assays = SimpleList(counts = counts, - CPM = counts.CPM), - rowRanges = exByGene[RowNames], - colData = ColData) - - return(seOutput) -} - -#' Generate a SummarizedExperiment of unique counts from quantData -#' @description This function is intended to be used after the transcript -#' discovery and \code{assignDist} steps in \code{\link{bambu}}. It builds a -#' transcript-level SummarizedExperiment containing raw unique counts (reads -#' uniquely assigned to a single transcript) without EM estimation, which can -#' be passed directly to \code{\link{transcriptToGeneExpression}} to obtain -#' gene-level counts as uniqueCounts + nonuniqueCounts + incompatibleCounts. -#' @param quantData a list of quantData objects produced by the assignDist step -#' @param annotations a GRangesList of transcript annotations -#' @return A SummarizedExperiment object with \code{assays$uniqueCounts}, -#' \code{metadata$incompatibleCounts}, and \code{metadata$nonuniqueCounts} -#' @import data.table -#' @noRd -generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { - uniqueCountsList <- lapply(quantData, function(x) { - readClassDt <- getReadClassDt(x) - x_filtered <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match)) - - uniqueCounts <- if (nrow(x_filtered) == 0) { - sparseMatrix(i = 1, j = 1, x = 0, dims = c(length(annotations), nrow(getSampleData(x)))) - } else { - txids <- mcols(annotations)$txid - i <- rep(match(x_filtered$txid, txids), lengths(x_filtered$columnIds)) - j <- unlist(x_filtered$columnIds) - x_vals <- unlist(x_filtered$columnCounts) - sparseMatrix(i = i, j = j, x = x_vals, dims = c(length(annotations), nrow(getSampleData(x)))) - } - rownames(uniqueCounts) <- names(annotations) - colnames(uniqueCounts) <- rownames(getSampleData(x)) - return(uniqueCounts) - }) - uniqueCounts <- do.call(cbind, uniqueCountsList) - - incompatibleCounts <- do.call(cbind, lapply(quantData, getIncompatibleCounts)) - nonuniqueCounts <- do.call(cbind, lapply(quantData, function(x) { - generateNonUniqueCountMatrix(getReadClassDt(x), annotations, getSampleData(x)$id) - })) - - colData <- do.call(rbind, lapply(quantData, getSampleData)) - - se <- SummarizedExperiment(assays = SimpleList(uniqueCounts = uniqueCounts)) - rowRanges(se) <- annotations - colData(se) <- DataFrame(colData) - metadata(se)$incompatibleCounts <- incompatibleCounts - metadata(se)$nonuniqueCounts <- nonuniqueCounts - - return(se) -} From 192de76431893bf42ab9018ffa90e107f8ac7409 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 15:52:10 +0800 Subject: [PATCH 16/29] fix gene index mapping in generateIncompatibleCounts Use levels(factor(unique(...))) to ensure genes vector is alphabetically sorted, matching the GENEID.i index scheme assigned in bambu_utilityFunctions.R. Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-assignDist.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index 638b7211..b7e57858 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -49,7 +49,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame #' Generate incompatible counts #' @noRd generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ - genes <- unique(mcols(annotations)$GENEID) + genes <- levels(factor(unique(mcols(annotations)$GENEID))) rownames(incompatibleCountMatrix) <- genes[as.numeric(rownames(incompatibleCountMatrix))] geneMat <- sparseMatrix(length(genes), ncol(incompatibleCountMatrix), x = 0) rownames(geneMat) <- genes From 4368f7ace54e833e14f795df914f21f8b2da864a Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 16:04:19 +0800 Subject: [PATCH 17/29] reorder incompatibleCounts to annotation order After decoding numeric indices to gene names, reorder the output matrix to match annotation order so values align with the rownames assigned in combineCountSes. Use drop=FALSE to preserve sparse matrix class for single-column inputs. Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-assignDist.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index b7e57858..a9c00ebe 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -55,5 +55,5 @@ generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){ rownames(geneMat) <- genes colnames(geneMat) <- colnames(incompatibleCountMatrix) geneMat[match(rownames(incompatibleCountMatrix), rownames(geneMat)), ] <- incompatibleCountMatrix - return(geneMat) + return(geneMat[unique(mcols(annotations)$GENEID), , drop = FALSE]) } From b7a33ba5fd4555258697afab5ff0fd4a3aa246e7 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 16:41:04 +0800 Subject: [PATCH 18/29] pass incompatibleCounts as single-column matrix to bambu.quantify Replace rowSums+names with direct rownames/as.vector to avoid collapsing multi-column subsets and to keep rownames attached as gene IDs for the GENEID.i join in bambu.quantify. Co-Authored-By: Claude Sonnet 4.6 --- R/bambu.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/bambu.R b/R/bambu.R index e3e13d91..9515e782 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -304,11 +304,10 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, } incompatibleCounts_i <- getIncompatibleCounts(quantData_i)[, columnIdx, drop = FALSE] - incompatibleCounts_i <- rowSums(incompatibleCounts_i) return(bambu.quantify(readClassDt = getReadClassDt(quantData_i), columnIdx = columnIdx, - incompatibleCounts = data.table(GENEID.i = names(incompatibleCounts_i), counts = incompatibleCounts_i), - txid.index = mcols(annotations)$txid, GENEIDs = names(incompatibleCounts_i), isoreParameters = isoreParameters, + incompatibleCounts = data.table(GENEID.i = rownames(incompatibleCounts_i), counts = as.vector(incompatibleCounts_i)), + txid.index = mcols(annotations)$txid, GENEIDs = rownames(incompatibleCounts_i), isoreParameters = isoreParameters, emParameters = emParameters, trackReads = trackReads, verbose = verbose))}, BPPARAM = bpParameters) From 219b2b1e697f672cdf0301812545f364498fa277 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Fri, 17 Apr 2026 17:26:50 +0800 Subject: [PATCH 19/29] convert clusters barcode string to integer Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-quantify.R | 2 ++ R/bambu.R | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index 686f8fd3..c4dd3cca 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -14,9 +14,11 @@ bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, txid.inde if (is.null(ids) || length(ids) == 0 || is.na(ids[1])) { 0L } else if (length(columnIdx) == 1) { + # single-cell: look up the single barcode's read count directly match_idx <- match(columnIdx, ids) if (is.na(match_idx)) 0L else columnCounts[[1]][match_idx] } else { + # cluster: sum read counts across all barcodes in the cluster sum(columnCounts[[1]][ids %in% columnIdx]) } }, by = eqClassId] diff --git a/R/bambu.R b/R/bambu.R index 9515e782..5fb732de 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -274,7 +274,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, for(i in seq_along(quantData)){ quantData_i <- quantData[[i]] #load in the barcode clustering from file if provided - iter <- seq_len(nrow(getSampleData(quantData_i))) # iter is integer + # single-cell mode: iter is an integer vector of column indices, one per barcode + iter <- seq_len(nrow(getSampleData(quantData_i))) if(!is.null(clusters)){ if(class(clusters[[i]])!="CompressedCharacterList"){ # !is.list(clusters) is FALSE for CompressedCharacterList clusterMaps <- NULL @@ -288,20 +289,19 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, "_",clusterMap[,1]) clusterMaps <- rbind(clusterMaps, clusterMap) } - clustering <- splitAsList(clusterMaps[,1], clusterMaps[,2]) + clustering <- splitAsList(clusterMaps[,1], clusterMaps[,2]) rm(clusterMaps) rm(clusterMap) iter <- clustering - + } else{ #if clusters is a list - iter <- clusters[[i]] + iter <- clusters[[i]] } + # cluster mode: convert barcode strings to integer column indices; + # iter becomes a named list of integer vectors, one per cluster + iter <- lapply(iter, match, getSampleData(quantData_i)$id) } countsSeCompressed <- bplapply(iter, FUN = function(columnIdx){ # previous i changed to j to avoid duplicated assignment - #i = iter[i %in% colnames(metadata(quantData_i)$countMatrix)] #bug, after assignment, i become emptyprint(i) - if(is.character(columnIdx)) { - columnIdx <- match(columnIdx, getSampleData(quantData_i)$id) - } incompatibleCounts_i <- getIncompatibleCounts(quantData_i)[, columnIdx, drop = FALSE] From f0e565814356ae7b4fe42d76632bea04d0d58cb3 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Mon, 20 Apr 2026 10:50:41 +0800 Subject: [PATCH 20/29] define SE_TYPES vocabulary and document quantData slots Co-Authored-By: Claude Sonnet 4.6 --- R/bambu-classes.R | 25 ++++++++++++++++++++----- R/bambu_utilityFunctions.R | 2 +- R/transcriptToGeneExpression.R | 4 ++-- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/R/bambu-classes.R b/R/bambu-classes.R index b4b54cee..e9ccaf43 100644 --- a/R/bambu-classes.R +++ b/R/bambu-classes.R @@ -3,13 +3,28 @@ #' @importFrom data.table data.table #' @importClassesFrom Matrix Matrix #' @importClassesFrom data.table data.table + +# Valid values for metadata(se)$seType, which identifies the SE variant: +# EMCounts — transcript-level SE with EM-estimated counts (main bambu output) +# geneCounts — gene-level SE derived by collapsing transcript counts +# uniqueCounts — transcript-level SE with uniquely-assigned counts only, no EM +SE_TYPES <- c( + EMCounts = "EMCounts", + geneCounts = "geneCounts", + uniqueCounts = "uniqueCounts" +) + +# quantData holds per-sample intermediate results from the assignDist step. +# distTable and readToTranscriptMap are optional: populated only when +# returnDistTable=TRUE or trackReads=TRUE respectively, and lifted into +# metadata(countsSe) at the end of bambu(). setClass("quantData", slots = c( - sampleData = "data.frame", - readClassDt = "data.table", - incompatibleCounts = "sparseMatrix", - distTable = "ANY", - readToTranscriptMap = "ANY" + sampleData = "data.frame", # per-sample metadata (id, sampleName) + readClassDt = "data.table", # read-class-level count and assignment data + incompatibleCounts = "sparseMatrix", # counts of reads incompatible with any annotation + distTable = "ANY", # read-class-to-transcript compatibility table (DataFrame or NULL) + readToTranscriptMap = "ANY" # per-read assignment to transcripts (tibble or NULL) ), prototype = list( sampleData = data.frame(id = integer(), sampleName = character()), diff --git a/R/bambu_utilityFunctions.R b/R/bambu_utilityFunctions.R index dc66fac3..94908891 100644 --- a/R/bambu_utilityFunctions.R +++ b/R/bambu_utilityFunctions.R @@ -287,7 +287,7 @@ combineCountSes <- function(countsSe, colDataList, annotations){ fullLengthCounts = countsDataMat$fullLengthCounts, uniqueCounts = countsDataMat$uniqueCounts)) metadata(combinedCountsSe)$incompatibleCounts <- countsDataMat$incompatibleCounts - metadata(combinedCountsSe)$seType <- "EMCounts" + metadata(combinedCountsSe)$seType <- SE_TYPES[["EMCounts"]] rowRanges(combinedCountsSe) <- annotations colData(combinedCountsSe) <- DataFrame(bind_rows(colDataList)) diff --git a/R/transcriptToGeneExpression.R b/R/transcriptToGeneExpression.R index 99fd49be..49840b83 100644 --- a/R/transcriptToGeneExpression.R +++ b/R/transcriptToGeneExpression.R @@ -46,7 +46,7 @@ transcriptToGeneExpression <- function(se) { CPM = counts.CPM), rowRanges = exByGene[RowNames], colData = ColData) - metadata(seOutput)$seType <- "geneCounts" + metadata(seOutput)$seType <- SE_TYPES[["geneCounts"]] return(seOutput) } @@ -95,7 +95,7 @@ generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { colData(se) <- DataFrame(colData) metadata(se)$incompatibleCounts <- incompatibleCounts metadata(se)$nonuniqueCounts <- nonuniqueCounts - metadata(se)$seType <- "uniqueCounts" + metadata(se)$seType <- SE_TYPES[["uniqueCounts"]] return(se) } From bf99de572c7fb0b40a9f635d850341252be3322f Mon Sep 17 00:00:00 2001 From: lingminhao Date: Mon, 20 Apr 2026 10:57:58 +0800 Subject: [PATCH 21/29] tidy up code --- R/bambu-assignDist.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index a9c00ebe..31efcac6 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -12,7 +12,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame readClassDt$eqClass.match = match(readClassDt$eqClassById,metadata(readClassList)$eqClassById) # Add columnIds and columnCounts columns - readClassDt[, `:=`(columnIds = as.list(rep(NA, .N)), columnCounts = as.list(rep(NA, .N)))] + readClassDt[, `:=`(columnIds = vector("list", .N), columnCounts = vector("list", .N))] observedIdx <- which(!is.na(readClassDt$eqClass.match)) readClassDt$columnIds[observedIdx] <- metadata(readClassList)$columnIds[readClassDt$eqClass.match[observedIdx]] readClassDt$columnCounts[observedIdx] <- metadata(readClassList)$columnCounts[readClassDt$eqClass.match[observedIdx]] @@ -29,11 +29,17 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParame incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix incompatibleCounts <- generateIncompatibleCounts(incompatibleCountMatrix, annotations) - distTable <- if(returnDistTable) metadata(metadata(readClassList)$readClassDist)$distTableOld else NULL + distTable <- if(returnDistTable) { + metadata(metadata(readClassList)$readClassDist)$distTableOld + } else{ + NULL + } - readToTranscriptMap <- if(trackReads) generateReadToTranscriptMap(readClassList, - metadata(readClassList)$readClassDist, - annotations) else NULL + readToTranscriptMap <- if(trackReads) { + generateReadToTranscriptMap(readClassList, metadata(readClassList)$readClassDist,annotations) + } else{ + NULL + } quantData <- constructQuantData( sampleData = data.frame(ColData), From 97018f71279151f8fdce521272077d7f7f38db94 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Mon, 20 Apr 2026 11:37:18 +0800 Subject: [PATCH 22/29] fix duplicated sample name prefix in SE colnames for demultiplexed data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lapply on a named quantData list preserves names, causing do.call(cbind/rbind) to prepend the list element name (sampleName) to each column/row name with a period separator — producing e.g. demultiplexed2.demultiplexed2_TCAGATGTCACCAGGC instead of demultiplexed2_TCAGATGTCACCAGGC. unname() the lapply results before cbind/rbind in generateUniqueCountsSEFromQuantData. Co-Authored-By: Claude Sonnet 4.6 --- R/transcriptToGeneExpression.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/transcriptToGeneExpression.R b/R/transcriptToGeneExpression.R index 49840b83..9773f60b 100644 --- a/R/transcriptToGeneExpression.R +++ b/R/transcriptToGeneExpression.R @@ -81,14 +81,14 @@ generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { colnames(uniqueCounts) <- rownames(getSampleData(x)) return(uniqueCounts) }) - uniqueCounts <- do.call(cbind, uniqueCountsList) + uniqueCounts <- do.call(cbind, unname(uniqueCountsList)) - incompatibleCounts <- do.call(cbind, lapply(quantData, getIncompatibleCounts)) - nonuniqueCounts <- do.call(cbind, lapply(quantData, function(x) { + incompatibleCounts <- do.call(cbind, unname(lapply(quantData, getIncompatibleCounts))) + nonuniqueCounts <- do.call(cbind, unname(lapply(quantData, function(x) { generateNonUniqueCountMatrix(getReadClassDt(x), annotations, getSampleData(x)$id) - })) + }))) - colData <- do.call(rbind, lapply(quantData, getSampleData)) + colData <- do.call(rbind, unname(lapply(quantData, getSampleData))) se <- SummarizedExperiment(assays = SimpleList(counts = uniqueCounts)) rowRanges(se) <- annotations From ac7cd4fce3fe4f91d8f0ca8ebf0a5156ec059b0a Mon Sep 17 00:00:00 2001 From: lingminhao Date: Mon, 20 Apr 2026 11:40:16 +0800 Subject: [PATCH 23/29] move generateNonUniqueCountMatrix to transcriptToGeneExpression_utilityFunctions.R Co-Authored-By: Claude Sonnet 4.6 --- R/transcriptToGeneExpression.R | 36 ------------------- ...nscriptToGeneExpression_utilityFunctions.R | 36 +++++++++++++++++++ 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/R/transcriptToGeneExpression.R b/R/transcriptToGeneExpression.R index 9773f60b..a76aa7d0 100644 --- a/R/transcriptToGeneExpression.R +++ b/R/transcriptToGeneExpression.R @@ -98,39 +98,3 @@ generateUniqueCountsSEFromQuantData <- function(quantData, annotations) { metadata(se)$seType <- SE_TYPES[["uniqueCounts"]] return(se) } - -#' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt -#' @noRd -generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ - genes <- unique(mcols(annotations)$GENEID) - nSamples <- length(sampleIds) - geneMat <- sparseMatrix(length(genes), nSamples, x = 0) - rownames(geneMat) <- genes - colnames(geneMat) <- sampleIds - - x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) - x <- x %>% distinct(eqClassId, .keep_all = TRUE) - if (nrow(x) == 0) return(geneMat) - - i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) - j <- unlist(x$columnIds) - x_vals <- unlist(x$columnCounts) - nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) - - if (nrow(x) > 1 & length(unique(x$gene_sid)) > 1) { - nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } else { - nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) - nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts - } - - geneids <- as.numeric(levels(factor(x$gene_sid))) - geneids <- x$txid[match(geneids, x$gene_sid)] - geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] - rownames(nonuniqueCounts) <- geneids - colnames(nonuniqueCounts) <- sampleIds - - geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts - return(geneMat) -} diff --git a/R/transcriptToGeneExpression_utilityFunctions.R b/R/transcriptToGeneExpression_utilityFunctions.R index c0e61165..aff2eb50 100644 --- a/R/transcriptToGeneExpression_utilityFunctions.R +++ b/R/transcriptToGeneExpression_utilityFunctions.R @@ -1,4 +1,40 @@ +#' Generate nonunique count matrix (gene x sample) from multi-align reads in readClassDt +#' @noRd +generateNonUniqueCountMatrix <- function(readClassDt, annotations, sampleIds){ + genes <- unique(mcols(annotations)$GENEID) + nSamples <- length(sampleIds) + geneMat <- sparseMatrix(length(genes), nSamples, x = 0) + rownames(geneMat) <- genes + colnames(geneMat) <- sampleIds + + x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match)) + x <- x %>% distinct(eqClassId, .keep_all = TRUE) + if (nrow(x) == 0) return(geneMat) + + i <- rep(seq_along(x$gene_sid), lengths(x$columnIds)) + j <- unlist(x$columnIds) + x_vals <- unlist(x$columnCounts) + nonuniqueCounts <- sparseMatrix(i = i, j = j, x = x_vals, dims = c(nrow(x), nSamples)) + + if (nrow(x) > 1 & length(unique(x$gene_sid)) > 1) { + nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } else { + nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE) + nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts + } + + geneids <- as.numeric(levels(factor(x$gene_sid))) + geneids <- x$txid[match(geneids, x$gene_sid)] + geneids <- mcols(annotations)$GENEID[as.numeric(geneids)] + rownames(nonuniqueCounts) <- geneids + colnames(nonuniqueCounts) <- sampleIds + + geneMat[match(rownames(nonuniqueCounts), rownames(geneMat)), ] <- nonuniqueCounts + return(geneMat) +} + #' rename runnames when there are duplicated names #' @title rename_duplicatedNames #' @param runnames sample names From 53d3728b63d8d82030fa45ef8b3da43597787dc0 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Mon, 20 Apr 2026 21:21:47 +0800 Subject: [PATCH 24/29] fix: correct wrong variable names from merge conflict --- R/bambu-assignDist.R | 2 +- R/bambu.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/bambu-assignDist.R b/R/bambu-assignDist.R index a259ed1b..7b5cb122 100644 --- a/R/bambu-assignDist.R +++ b/R/bambu-assignDist.R @@ -24,7 +24,7 @@ assignReadClasstoTranscripts <- function(readClassList, annotations, rcAssignmen mutate(aval = 1) %>% data.table() #return non-em counts - ColData <- generateColData(readClassList, sampleMetadata, demultiplexed) + ColData <- generateColData(readClassList, sampleMetadata, extractBarcodeUMI) incompatibleCountMatrix <- metadata(readClassList)$incompatibleCountMatrix incompatibleCounts <- generateIncompatibleCounts(incompatibleCountMatrix, annotations) diff --git a/R/bambu.R b/R/bambu.R index 318f2e81..759707b6 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -331,7 +331,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, return(bambu.quantify(readClassDt = getReadClassDt(quantData_i), columnIdx = columnIdx, incompatibleCounts = data.table(GENEID.i = rownames(incompatibleCounts_i), counts = as.vector(incompatibleCounts_i)), txid.index = mcols(annotations)$txid, GENEIDs = rownames(incompatibleCounts_i), - emParameters = emParameters, trackReads = trackReads, + emParameters = opt.em, trackReads = trackReads, verbose = verbose))}, BPPARAM = bpParameters) end.ptm <- proc.time() From dfbe4d70ad04c2980ee2ae0234258915a489585c Mon Sep 17 00:00:00 2001 From: lingminhao Date: Mon, 27 Apr 2026 15:47:43 +0800 Subject: [PATCH 25/29] fix bug for summing incompatibleCounts during clustering --- R/bambu-quantify.R | 2 +- R/bambu.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/bambu-quantify.R b/R/bambu-quantify.R index 99f9eaaa..65998cad 100644 --- a/R/bambu-quantify.R +++ b/R/bambu-quantify.R @@ -10,7 +10,7 @@ bambu.quantify <- function(readClassDt, columnIdx, incompatibleCounts, txid.inde # Calculate nobs for sample(s) columnIdx # Use data.table syntax for in-place creation of nobs column for the scope of this function readClassDt[, nobs := { - ids <- columnIds[[1]] + ids <- columnIdx[[1]] if (is.null(ids) || length(ids) == 0 || is.na(ids[1])) { 0L } else if (length(columnIdx) == 1) { diff --git a/R/bambu.R b/R/bambu.R index 759707b6..ffa2467e 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -329,7 +329,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, incompatibleCounts_i <- getIncompatibleCounts(quantData_i)[, columnIdx, drop = FALSE] return(bambu.quantify(readClassDt = getReadClassDt(quantData_i), columnIdx = columnIdx, - incompatibleCounts = data.table(GENEID.i = rownames(incompatibleCounts_i), counts = as.vector(incompatibleCounts_i)), + incompatibleCounts = data.table(GENEID.i = rownames(incompatibleCounts_i), counts = rowSums(incompatibleCounts_i)), txid.index = mcols(annotations)$txid, GENEIDs = rownames(incompatibleCounts_i), emParameters = opt.em, trackReads = trackReads, verbose = verbose))}, From c3055495ee2567d9e6b2b938878bdae0038fe596 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Mon, 27 Apr 2026 17:28:16 +0800 Subject: [PATCH 26/29] update colData for pseudobulk se object --- R/bambu.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/bambu.R b/R/bambu.R index ffa2467e..8301777c 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -275,7 +275,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, trackReads = trackReads ) }, BPPARAM = bpParameters) - names(quantData) <- names(readClassList) + names(quantData) <- unname(sapply(readClassFile, colnames)) if (!quant) return(quantData) } } @@ -340,7 +340,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, ColNames <- c(ColNames, names(iter)) colData.all[[i]] <- data.frame( id = names(countsSeCompressed), - sampleName = names(countsSeCompressed), + sampleName = sapply(strsplit(names(countsSeCompressed), "_"), `[`, 1), + cluster = sapply(strsplit(names(countsSeCompressed), "_"), `[`, 2), row.names = names(countsSeCompressed) ) } else{ From 2b58a237c68219e975e8d47e7fdbab9883a1fe6d Mon Sep 17 00:00:00 2001 From: lingminhao Date: Tue, 28 Apr 2026 10:39:07 +0800 Subject: [PATCH 27/29] update sample name retrieval for quantData to align with Bambu-Pipe implementation --- R/bambu.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bambu.R b/R/bambu.R index 8301777c..d1dc2b3a 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -275,7 +275,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, trackReads = trackReads ) }, BPPARAM = bpParameters) - names(quantData) <- unname(sapply(readClassFile, colnames)) + names(quantData) <- unname(sapply(readClassList, colnames)) if (!quant) return(quantData) } } From 44e86d91666066486449f83165d7a0cc0e43ddbc Mon Sep 17 00:00:00 2001 From: lingminhao Date: Tue, 28 Apr 2026 11:04:40 +0800 Subject: [PATCH 28/29] revert: undo last two commits, retaining colData sampleName/cluster changes Reverts 2b58a23 and c305549. Restores names(readClassList) for quantData naming. Retains the sampleName/cluster split in colData from c305549. Co-Authored-By: Claude Sonnet 4.6 --- R/bambu.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/bambu.R b/R/bambu.R index d1dc2b3a..b6a1bdd9 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -275,7 +275,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, trackReads = trackReads ) }, BPPARAM = bpParameters) - names(quantData) <- unname(sapply(readClassList, colnames)) + names(quantData) <- names(readClassList) if (!quant) return(quantData) } } @@ -339,7 +339,7 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, if(!is.null(clusters)){ ColNames <- c(ColNames, names(iter)) colData.all[[i]] <- data.frame( - id = names(countsSeCompressed), + id = names(countsSeCompressed), sampleName = sapply(strsplit(names(countsSeCompressed), "_"), `[`, 1), cluster = sapply(strsplit(names(countsSeCompressed), "_"), `[`, 2), row.names = names(countsSeCompressed) From d112d3a29fcb735a997c00d72162c3991922eef3 Mon Sep 17 00:00:00 2001 From: lingminhao Date: Mon, 4 May 2026 14:19:04 +0800 Subject: [PATCH 29/29] update colData for pseudobulk se object --- R/bambu.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/bambu.R b/R/bambu.R index b6a1bdd9..46034eaa 100644 --- a/R/bambu.R +++ b/R/bambu.R @@ -340,8 +340,8 @@ bambu <- function(reads, annotations = NULL, genome = NULL, NDR = NULL, ColNames <- c(ColNames, names(iter)) colData.all[[i]] <- data.frame( id = names(countsSeCompressed), - sampleName = sapply(strsplit(names(countsSeCompressed), "_"), `[`, 1), - cluster = sapply(strsplit(names(countsSeCompressed), "_"), `[`, 2), + sampleName = sub("_[^_]+$", "", names(countsSeCompressed)), + cluster = sub(".*_", "", names(countsSeCompressed)), row.names = names(countsSeCompressed) ) } else{