diff --git a/R/SingleR.R b/R/SingleR.R index 246a22a..6ed93fc 100644 --- a/R/SingleR.R +++ b/R/SingleR.R @@ -103,10 +103,24 @@ SingleR <- function( on.exit(bpstop(BPPARAM)) } - # We have to clean it at the start to remove NAs before we do the build, + # We have to do all this row-subsetting at the start before trainSingleR, # otherwise 'test.genes' won't match up to the filtered 'test'. test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM) + tmp.ref <- ref + if (!is.list(tmp.ref)) { + tmp.ref <- list(ref) + } + for (rr in tmp.ref) { + keep <- rownames(test) %in% rownames(rr) + if (!all(keep)) { + test <- DelayedArray(test)[keep,,drop=FALSE] + } + } + if (nrow(test) == 0) { + stop("no common genes between 'test' and 'ref'") + } + trained <- trainSingleR( ref, labels, diff --git a/R/classifySingleR.R b/R/classifySingleR.R index 12e4b4b..52987cc 100644 --- a/R/classifySingleR.R +++ b/R/classifySingleR.R @@ -12,8 +12,7 @@ #' @param tune.thresh A numeric scalar specifying the maximum difference from the maximum correlation to use in fine-tuning. #' @param sd.thresh Deprecated and ignored. #' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}. -#' @param check.missing Logical scalar indicating whether rows should be checked for missing values. -#' If true and any missing values are found, the rows containing these values are silently removed. +#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}. #' @param prune A logical scalar indicating whether label pruning should be performed. #' @param num.threads Integer scalar specifying the number of threads to use for classification. #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for \code{NA} scanning, when \code{check.missing=TRUE}. 
@@ -98,11 +97,11 @@ classifySingleR <- function( sd.thresh=NULL, prune=TRUE, assay.type="logcounts", - check.missing=TRUE, + check.missing=FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM=SerialParam()) { - test <- .to_clean_matrix(test, assay.type, check.missing, msg="test", BPPARAM=BPPARAM) + test <- .to_clean_matrix(test, assay.type, check.missing=FALSE, msg="test", BPPARAM=BPPARAM) solo <- .is_solo(trained) if (solo) { @@ -140,8 +139,8 @@ classifySingleR <- function( if (!identical(trained$options$test.genes, rownames(test))) { stop("expected 'rownames(test)' to be the same as 'test.genes' in 'trained'") } - } else if (nrow(trained$ref) != nrow(test)) { - stop("expected 'test' to have the same number of rows as the reference dataset") + } else if (!identical(rownames(trained$ref), rownames(test))) { + stop("expected 'rownames(test)' to be the same as 'rownames(ref)' in 'trained'") } } diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R index 6c32c10..f783bb3 100644 --- a/R/combineRecomputedResults.R +++ b/R/combineRecomputedResults.R @@ -9,7 +9,7 @@ #' @param trained A list of \linkS4class{List}s containing the trained outputs of multiple references, #' equivalent to either (i) the output of \code{\link{trainSingleR}} on multiple references with \code{recompute=TRUE}, #' or (ii) running \code{trainSingleR} on each reference separately and manually making a list of the trained outputs. -#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references. +#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references. #' @param quantile Numeric scalar specifying the quantile of the correlation distribution to use for computing the score, see \code{\link{classifySingleR}}. #' @param allow.lost Deprecated. 
#' @@ -108,8 +108,8 @@ combineRecomputedResults <- function( quantile=0.8, assay.type.test="logcounts", check.missing=TRUE, - allow.lost=FALSE, warn.lost=TRUE, + allow.lost=FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM=SerialParam()) { @@ -133,8 +133,18 @@ combineRecomputedResults <- function( .check_test_genes(test, curtrain) } - # Applying the integration. + # Checking the genes. all.refnames <- lapply(trained, function(x) rownames(x$ref)) + if (warn.lost) { + intersected <- Reduce(intersect, all.refnames) + for (i in seq_along(trained)) { + if (!all(trained[[i]]$markers$unique %in% intersected)) { + warning("not all markers in 'trained' are available in each reference") + } + } + } + + # Applying the integration. universe <- Reduce(union, c(list(rownames(test)), all.refnames)) ibuilt <- train_integrated( test_features=match(rownames(test), universe) - 1L, diff --git a/R/trainSingleR.R b/R/trainSingleR.R index 28437fd..2f6d81f 100644 --- a/R/trainSingleR.R +++ b/R/trainSingleR.R @@ -293,9 +293,6 @@ trainSingleR <- function( } if (!is.null(test.genes)) { ref <- DelayedArray(ref)[rownames(ref) %in% test.genes,,drop=FALSE] - if (nrow(ref) == 0L) { - stop("no common genes between 'test' and 'ref'") - } } if (.is_list(genes)) { diff --git a/man/classifySingleR.Rd b/man/classifySingleR.Rd index 51a71f8..0f9dfba 100644 --- a/man/classifySingleR.Rd +++ b/man/classifySingleR.Rd @@ -13,7 +13,7 @@ classifySingleR( sd.thresh = NULL, prune = TRUE, assay.type = "logcounts", - check.missing = TRUE, + check.missing = FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM = SerialParam() ) @@ -38,8 +38,7 @@ Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multip \item{assay.type}{Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.} -\item{check.missing}{Logical scalar indicating whether rows should be checked for missing values. 
-If true and any missing values are found, the rows containing these values are silently removed.} +\item{check.missing}{Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.} \item{num.threads}{Integer scalar specifying the number of threads to use for classification.} diff --git a/man/combineRecomputedResults.Rd b/man/combineRecomputedResults.Rd index 19920e9..9ff4a95 100644 --- a/man/combineRecomputedResults.Rd +++ b/man/combineRecomputedResults.Rd @@ -12,7 +12,6 @@ combineRecomputedResults( assay.type.test = "logcounts", check.missing = TRUE, allow.lost = FALSE, - warn.lost = TRUE, num.threads = bpnworkers(BPPARAM), BPPARAM = SerialParam() ) @@ -36,12 +35,12 @@ if \code{test} is a \linkS4class{SummarizedExperiment} object.} \item{allow.lost}{Deprecated.} -\item{warn.lost}{Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references.} - \item{num.threads}{Integer scalar specifying the number of threads to use for index building and classification.} \item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed in other steps, see \code{?\link{trainSingleR}} and \code{?\link{classifySingleR}} for more details.} + +\item{warn.lost}{Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references.} } \value{ A \linkS4class{DataFrame} is returned containing the annotation statistics for each cell or cluster (row). 
diff --git a/tests/testthat/test-recomputed.R b/tests/testthat/test-recomputed.R index 4c75c3f..18f7fa6 100644 --- a/tests/testthat/test-recomputed.R +++ b/tests/testthat/test-recomputed.R @@ -19,11 +19,11 @@ test <- .mockTestData(ref) test <- scuttle::logNormCounts(test) ref1 <- scuttle::logNormCounts(ref1) -train1 <- trainSingleR(ref1, labels=ref1$label) +train1 <- trainSingleR(ref1, labels=ref1$label, test.genes=rownames(test)) pred1 <- classifySingleR(test, train1) ref2 <- scuttle::logNormCounts(ref2) -train2 <- trainSingleR(ref2, labels=ref2$label) +train2 <- trainSingleR(ref2, labels=ref2$label, test.genes=rownames(test)) pred2 <- classifySingleR(test, train2) test_that("combineRecomputedResults works as expected (light check)", { @@ -100,18 +100,12 @@ test_that("combineRecomputedResults handles mismatches to rows and cells", { trained=list(train1, train2)), "not identical") colnames(test) <- NULL - # Correctly reorders the gene universes. - ref <- combineRecomputedResults( - results=list(pred1, pred2), - test=test, - trained=list(train1, train2)) - + # Responds to mismatches in the genes. 
s <- sample(nrow(test)) - out <- combineRecomputedResults( + expect_error(combineRecomputedResults( results=list(pred1, pred2), test=test[s,], - trained=list(train1, train2)) - expect_equal(ref, out) + trained=list(train1, train2)), "test.genes") }) test_that("combineRecomputedResults emits warnings when missing genes are present", { @@ -120,13 +114,13 @@ test_that("combineRecomputedResults emits warnings when missing genes are presen rownames(ref1b)[1] <- "BLAH" markers1 <- train1$markers$full markers1$A$B <- c(markers1$A$B, "BLAH") - train1b <- trainSingleR(ref1b, labels=ref1$label, genes=markers1) + train1b <- trainSingleR(ref1b, labels=ref1$label, genes=markers1, test.genes=rownames(test)) ref2b <- ref2[c(1, seq_len(nrow(ref2))),] rownames(ref2b)[1] <- "WHEE" markers2 <- train2$markers$full markers2$A$B <- c(markers2$a$b, "WHEE") - train2b <- trainSingleR(ref2b, labels=ref2$label, genes=markers2) + train2b <- trainSingleR(ref2b, labels=ref2$label, genes=markers2, test.genes=rownames(test)) expect_error(out <- combineRecomputedResults( results=list(pred1, pred2),