Continue inching further.

SingleR-inc · Sep 6, 2024 · d1cd66f · d1cd66f
1 parent b8bd376
commit d1cd66f
Show file tree

Hide file tree

Showing 7 changed files with 44 additions and 32 deletions.
diff --git a/R/SingleR.R b/R/SingleR.R
@@ -103,10 +103,24 @@ SingleR <- function(
         on.exit(bpstop(BPPARAM))
     }
 
-    # We have to clean it at the start to remove NAs before we do the build,
+    # We have to do all this row-subsetting at the start before trainSingleR,
     # otherwise 'test.genes' won't match up to the filtered 'test'.
     test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)
 
+    tmp.ref <- ref
+    if (!is.list(tmp.ref)) {
+        tmp.ref <- list(ref)
+    }
+    for (rr in tmp.ref) {
+        keep <- rownames(test) %in% rownames(rr)
+        if (!all(keep)) {
+            test <- DelayedArray(test)[keep,,drop=FALSE]
+        }
+    }
+    if (nrow(test) == 0) {
+        stop("no common genes between 'test' and 'ref")
+    }
+
     trained <- trainSingleR(
         ref, 
         labels, 

diff --git a/R/classifySingleR.R b/R/classifySingleR.R
@@ -12,8 +12,7 @@
 #' @param tune.thresh A numeric scalar specifying the maximum difference from the maximum correlation to use in fine-tuning.
 #' @param sd.thresh Deprecated and ignored.
 #' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
-#' @param check.missing Logical scalar indicating whether rows should be checked for missing values.
-#' If true and any missing values are found, the rows containing these values are silently removed.
+#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
 #' @param prune A logical scalar indicating whether label pruning should be performed.
 #' @param num.threads Integer scalar specifying the number of threads to use for classification.
 #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for \code{NA} scanning, when \code{check.missing=TRUE}.
@@ -98,11 +97,11 @@ classifySingleR <- function(
     sd.thresh=NULL, 
     prune=TRUE, 
     assay.type="logcounts", 
-    check.missing=TRUE,
+    check.missing=FALSE,
     num.threads = bpnworkers(BPPARAM),
     BPPARAM=SerialParam()) 
 {
-    test <- .to_clean_matrix(test, assay.type, check.missing, msg="test", BPPARAM=BPPARAM)
+    test <- .to_clean_matrix(test, assay.type, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)
 
     solo <- .is_solo(trained)
     if (solo) { 
@@ -140,8 +139,8 @@ classifySingleR <- function(
         if (!identical(trained$options$test.genes, rownames(test))) {
             stop("expected 'rownames(test)' to be the same as 'test.genes' in 'trained'")
         }
-    } else if (nrow(trained$ref) != nrow(test)) {
-        stop("expected 'test' to have the same number of rows as the reference dataset")
+    } else if (!identical(rownames(trained$ref), rownames(test))) {
+        stop("expected 'rownames(test)' to be the same as 'rownames(ref)' in 'trained'")
     }
 }
 

diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R
@@ -9,7 +9,7 @@
 #' @param trained A list of \linkS4class{List}s containing the trained outputs of multiple references,
 #' equivalent to either (i) the output of \code{\link{trainSingleR}} on multiple references with \code{recompute=TRUE},
 #' or (ii) running \code{trainSingleR} on each reference separately and manually making a list of the trained outputs.
-#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references.
+#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references.
 #' @param quantile Numeric scalar specifying the quantile of the correlation distribution to use for computing the score, see \code{\link{classifySingleR}}.
 #' @param allow.lost Deprecated.
 #'
@@ -108,8 +108,8 @@ combineRecomputedResults <- function(
     quantile=0.8, 
     assay.type.test="logcounts", 
     check.missing=TRUE, 
-    allow.lost=FALSE, 
     warn.lost=TRUE,
+    allow.lost=FALSE, 
     num.threads = bpnworkers(BPPARAM),
     BPPARAM=SerialParam())
 {
@@ -133,8 +133,18 @@ combineRecomputedResults <- function(
         .check_test_genes(test, curtrain)
     }
 
-    # Applying the integration.
+    # Checking the genes.
     all.refnames <- lapply(trained, function(x) rownames(x$ref))
+    if (warn.lost) {
+        intersected <- Reduce(intersect, all.refnames)
+        for (i in seq_along(trained)) {
+            if (!all(trained[[i]]$markers$unique %in% intersected)) {
+                warning("not all markers in 'trained' are available in each reference")
+            }
+        }
+    }
+
+    # Applying the integration.
     universe <- Reduce(union, c(list(rownames(test)), all.refnames))
     ibuilt <- train_integrated(
         test_features=match(rownames(test), universe) - 1L,

diff --git a/R/trainSingleR.R b/R/trainSingleR.R
@@ -293,9 +293,6 @@ trainSingleR <- function(
     }
     if (!is.null(test.genes)) {
         ref <- DelayedArray(ref)[rownames(ref) %in% test.genes,,drop=FALSE]
-        if (nrow(ref) == 0L) {
-            stop("no common genes between 'test' and 'ref'")
-        }
     }
 
     if (.is_list(genes)) {

diff --git a/man/classifySingleR.Rd b/man/classifySingleR.Rd
diff --git a/man/combineRecomputedResults.Rd b/man/combineRecomputedResults.Rd
diff --git a/tests/testthat/test-recomputed.R b/tests/testthat/test-recomputed.R
@@ -19,11 +19,11 @@ test <- .mockTestData(ref)
 test <- scuttle::logNormCounts(test)
 
 ref1 <- scuttle::logNormCounts(ref1)
-train1 <- trainSingleR(ref1, labels=ref1$label)
+train1 <- trainSingleR(ref1, labels=ref1$label, test.genes=rownames(test))
 pred1 <- classifySingleR(test, train1)
 
 ref2 <- scuttle::logNormCounts(ref2)
-train2 <- trainSingleR(ref2, labels=ref2$label)
+train2 <- trainSingleR(ref2, labels=ref2$label, test.genes=rownames(test))
 pred2 <- classifySingleR(test, train2)
 
 test_that("combineRecomputedResults works as expected (light check)", {
@@ -100,18 +100,12 @@ test_that("combineRecomputedResults handles mismatches to rows and cells", {
         trained=list(train1, train2)), "not identical")
     colnames(test) <- NULL
 
-    # Correctly reorders the gene universes.
-    ref <- combineRecomputedResults(
-        results=list(pred1, pred2),
-        test=test,
-        trained=list(train1, train2))
-
+    # Responds to mismatches in the genes.
     s <- sample(nrow(test))
-    out <- combineRecomputedResults(
+    expect_error(combineRecomputedResults(
         results=list(pred1, pred2),
         test=test[s,],
-        trained=list(train1, train2))
-    expect_equal(ref, out)
+        trained=list(train1, train2)), "test.genes")
 })
 
 test_that("combineRecomputedResults emits warnings when missing genes are present", {
@@ -120,13 +114,13 @@ test_that("combineRecomputedResults emits warnings when missing genes are presen
     rownames(ref1b)[1] <- "BLAH"
     markers1 <- train1$markers$full
     markers1$A$B <- c(markers1$A$B, "BLAH")
-    train1b <- trainSingleR(ref1b, labels=ref1$label, genes=markers1)
+    train1b <- trainSingleR(ref1b, labels=ref1$label, genes=markers1, test.genes=rownames(test))
 
     ref2b <- ref2[c(1, seq_len(nrow(ref2))),]
     rownames(ref2b)[1] <- "WHEE"
     markers2 <- train2$markers$full
     markers2$A$B <- c(markers2$a$b, "WHEE")
-    train2b <- trainSingleR(ref2b, labels=ref2$label, genes=markers2)
+    train2b <- trainSingleR(ref2b, labels=ref2$label, genes=markers2, test.genes=rownames(test))
 
     expect_error(out <- combineRecomputedResults(
         results=list(pred1, pred2),