From 6be897484a6541f5e46fc0cdb480829ce83fd19a Mon Sep 17 00:00:00 2001 From: shafighi Date: Tue, 2 Jun 2026 12:08:46 +0200 Subject: [PATCH] Guard MannKendall against segments with <3 non-NA values The two cellcycle classification functions (cellcycleMetadata, cellcycle_gctest) loop over genome segments and call Kendall::MannKendall(co[idx]) on each segment's copy-number values. The existing gates skip a segment if it has fewer than 20 bins (segment_size_cutoff) or if all values are zero, but neither gate accounts for NA values in co. With the new mouse blacklist + pericentromeric + segmental duplication masks, segments often clear the size gate but have most of their bins NA-filtered. The non-zero gate also misfires because all(co == 0) returns NA (not FALSE) when co contains NAs, so the gate's if-condition errors -- which the outer tryCatch in run_scAbsolute.R catches as a hard failure, producing an empty .rds. Reproducibly observed on SLX-27548 mouse cells: A1_2 (65M reads) fails with stop("length(x)<3") fired by Kendall internals when most of a segment's bins are NA. Fix: 1. all(co == 0) -> all(co == 0, na.rm=TRUE) so NA-only and zero+NA segments are skipped instead of erroring. 2. New gate: sum(!is.na(co)) < 3 returns NA for that segment instead of feeding a degenerate vector to MannKendall. Same fix applied to both cellcycleMetadata (lines ~62-66) and cellcycle_gctest (lines ~192-196). --- R/cellcycle.R | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/R/cellcycle.R b/R/cellcycle.R index 125c180..3bcd9f6 100644 --- a/R/cellcycle.R +++ b/R/cellcycle.R @@ -59,11 +59,24 @@ cellcycleMetadata <- function(segmentedCounts,segment_size_cutoff=20){ return(list("pval"=NA, "tau"=NA, "tau_corrected"=NA, "cor_corrected"=NA)) } co = counts[index == x] - if(all(co == 0)){ + if(all(co == 0, na.rm=TRUE)){ return(list("pval"=NA, "tau"=NA, "tau_corrected"=NA, "cor_corrected"=NA)) } time = repTime[index == x] covar = gc_map_covariate[index == x] + # Mann-Kendall (and the partial-MK / partial-spearman calls below) all + # need >=3 jointly-non-NA values. With aggressive mouse filtering + # (blacklist + pericentromeric + segdup) and mm10-liftover gaps in + # the replication-timing track, a long segment can clear the size + # cutoff but still have <3 bins that are non-NA across co, time and + # covar simultaneously. base::sort(time, ...) drops NAs from time + # silently, so the resulting idx is shorter than the segment and + # co[idx] feeds a degenerate vector into Kendall::MannKendall, + # triggering stop("length(x)<3") inside the package. + jointly_valid = !is.na(co) & !is.na(time) & !is.na(covar) + if(sum(jointly_valid) < 3){ + return(list("pval"=NA, "tau"=NA, "tau_corrected"=NA, "cor_corrected"=NA)) + } idx = base::sort(time, index.return=TRUE)$ix # ggplot(data=dplyr::tibble(co=co[idx], covar=covar[idx], time=time[idx])) + geom_point(aes(x=time, y=co, color=covar)) trend.kendall_test = Kendall::MannKendall(co[idx]) @@ -186,6 +199,12 @@ cellcycle_gctest <- function(segmentedCounts, segment_size_cutoff=20){ } co = counts[index == x] time = gc[index == x] + # See cellcycleMetadata above for the same guard. base::sort(time, ...) + # silently drops NA time values, so the segment can clear the size + # gate but still feed a <3-length vector to Kendall::MannKendall. + if(sum(!is.na(co) & !is.na(time)) < 3){ + return(list("pval"=NA, "tau"=NA)) + } idx = base::sort(time, index.return=TRUE)$ix trend.kendall_test = Kendall::MannKendall(co[idx]) pv = trend.kendall_test[["sl"]][[1]]