Update to the latest version of the singlepp library in assorthead. (#…

…274) This allows us to get rid of our internally vendored copy of the library. The latest version of singlepp greatly simplifies management of gene IDs, forcing us to perform intersections if we are not completely sure that the test and reference row identities are the same. This is much easier to reason about than the previous logic; now, our test and reference features are essentially fixed, and the C++ code handles all of the intersections. A consequence of this change is that we need to know the test features during training (or we assume that they're the same as the reference). This involves some minor changes to the trained output to record the test features, so that we can check that they're the same as 'rownames(test)' during classification. If they're not, we throw an error, which is more stringent than the previous behavior where we would only throw if the subset of markers could not be found; but in practice, the two are probably the same anyway. Another consequence is that we can't do any filtering of the test matrix after training, as that would cause mismatches between the trained output's record of the test features and the test features used during classificaiton. This is probably fine as most people shouldn't have NAs in the test matrix, and even then, most people are using SingleR() and the filtering + intersections are handled automatically before training anyway. We also return to using BiocNeighbors. In this case, it provides C++-level builder objects for the neighbor index construction, allowing us to avoid having to compile for all of the different NN search algorithms.
SingleR-inc · Sep 6, 2024 · 74a6d59 · 74a6d59
1 parent 84a6e55
commit 74a6d59
Show file tree

Hide file tree

Showing 48 changed files with 524 additions and 4,122 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: SingleR
 Title: Reference-Based Single-Cell RNA-Seq Annotation
-Version: 2.7.1
-Date: 2024-05-22
+Version: 2.7.2
+Date: 2024-09-06
 Authors@R: c(person("Dvir", "Aran", email="[email protected]", role=c("aut", "cph")),
     person("Aaron", "Lun", email="[email protected]", role=c("ctb", "cre")),
     person("Daniel", "Bunis", role="ctb"),
@@ -20,6 +20,7 @@ Imports:
     DelayedMatrixStats,
     BiocParallel,
     BiocSingular,
+    BiocNeighbors,
     stats,
     utils,
     Rcpp,
@@ -28,6 +29,7 @@ Imports:
 LinkingTo:
     Rcpp,
     beachmat,
+    assorthead,
     BiocNeighbors
 Suggests:
     testthat,
@@ -57,6 +59,6 @@ biocViews:
 SystemRequirements: C++17
 VignetteBuilder: knitr
 Encoding: UTF-8
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 URL: https://github.com/LTLA/SingleR
 BugReports: https://support.bioconductor.org/
diff --git a/NAMESPACE b/NAMESPACE
@@ -27,6 +27,9 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(S4Vectors,DataFrame)
 importClassesFrom(S4Vectors,List)
 importClassesFrom(SummarizedExperiment,SummarizedExperiment)
+importFrom(BiocNeighbors,AnnoyParam)
+importFrom(BiocNeighbors,KmknnParam)
+importFrom(BiocNeighbors,defineBuilder)
 importFrom(BiocParallel,SerialParam)
 importFrom(BiocParallel,bpisup)
 importFrom(BiocParallel,bpmapply)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,6 +1,16 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
+classify_integrated <- function(test, results, integrated_build, quantile, nthreads) {
+    .Call('_SingleR_classify_integrated', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads)
+}
+
+#' @importFrom Rcpp sourceCpp
+#' @useDynLib SingleR
+classify_single <- function(test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) {
+    .Call('_SingleR_classify_single', PACKAGE = 'SingleR', test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)
+}
+
 find_classic_markers <- function(nlabels, ngenes, labels, ref, de_n, nthreads) {
     .Call('_SingleR_find_classic_markers', PACKAGE = 'SingleR', nlabels, ngenes, labels, ref, de_n, nthreads)
 }
@@ -9,31 +19,21 @@ grouped_medians <- function(ref, groups, ngroups, nthreads) {
     .Call('_SingleR_grouped_medians', PACKAGE = 'SingleR', ref, groups, ngroups, nthreads)
 }
 
-integrate_build <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) {
-    .Call('_SingleR_integrate_build', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads)
-}
-
-integrate_run <- function(test, results, integrated_build, quantile, nthreads) {
-    .Call('_SingleR_integrate_run', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads)
+train_integrated <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) {
+    .Call('_SingleR_train_integrated', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads)
 }
 
 #' @importFrom Rcpp sourceCpp
 #' @useDynLib SingleR
-prebuild <- function(ref, labels, markers, approximate, nthreads) {
-    .Call('_SingleR_prebuild', PACKAGE = 'SingleR', ref, labels, markers, approximate, nthreads)
+train_single <- function(test_features, ref, ref_features, labels, markers, builder, nthreads) {
+    .Call('_SingleR_train_single', PACKAGE = 'SingleR', test_features, ref, ref_features, labels, markers, builder, nthreads)
 }
 
-get_subset <- function(built) {
-    .Call('_SingleR_get_subset', PACKAGE = 'SingleR', built)
+get_ref_subset <- function(built) {
+    .Call('_SingleR_get_ref_subset', PACKAGE = 'SingleR', built)
 }
 
 is_valid_built <- function(built) {
     .Call('_SingleR_is_valid_built', PACKAGE = 'SingleR', built)
 }
 
-#' @importFrom Rcpp sourceCpp
-#' @useDynLib SingleR
-run <- function(test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) {
-    .Call('_SingleR_run', PACKAGE = 'SingleR', test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)
-}
-
diff --git a/R/SingleR.R b/R/SingleR.R
@@ -102,33 +102,23 @@ SingleR <- function(
         bpstart(BPPARAM)
         on.exit(bpstop(BPPARAM))
     }
-    test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)
-
-    # Converting to a common list format for ease of data munging.
-    if (single.ref <- !.is_list(ref)) {
-        ref <- list(ref)
-    }
 
-    ref <- lapply(ref, FUN=.to_clean_matrix, assay.type=assay.type.ref, 
-        check.missing=check.missing, msg="ref", BPPARAM=BPPARAM)
-    refnames <- Reduce(intersect, lapply(ref, rownames))
+    # We have to do all this row-subsetting at the start before trainSingleR,
+    # otherwise 'test.genes' won't match up to the filtered 'test'.
+    test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)
 
-    keep <- intersect(rownames(test), refnames)
-    if (length(keep) == 0) {
-        stop("no common genes between 'test' and 'ref'")
+    tmp.ref <- ref
+    if (!is.list(tmp.ref) || is.data.frame(tmp.ref)) {
+        tmp.ref <- list(ref)
     }
-    if (!identical(keep, rownames(test))) {
-        test <- test[keep,]
-    }
-    for (i in seq_along(ref)) {
-        if (!identical(keep, rownames(ref[[i]]))) {
-            ref[[i]] <- ref[[i]][keep,,drop=FALSE]
+    for (rr in tmp.ref) {
+        keep <- rownames(test) %in% rownames(rr)
+        if (!all(keep)) {
+            test <- DelayedArray(test)[keep,,drop=FALSE] # only keeping the intersection, for safety's sake - see ?combineRecomputedResults.
         }
     }
-
-    # Converting back.
-    if (single.ref) {
-        ref <- ref[[1]]
+    if (nrow(test) == 0) {
+        stop("no common genes between 'test' and 'ref")
     }
 
     trained <- trainSingleR(
@@ -143,7 +133,8 @@ SingleR <- function(
         aggr.args = aggr.args, 
         recompute=recompute,
         restrict = restrict, 
-        check.missing=FALSE, 
+        test.genes=rownames(test),
+        check.missing=check.missing, 
         BNPARAM=BNPARAM, 
         num.threads = num.threads, 
         BPPARAM=BPPARAM

diff --git a/R/classifySingleR.R b/R/classifySingleR.R
@@ -3,17 +3,19 @@
 #' Assign labels to each cell in a test dataset, using a pre-trained classifier combined with an iterative fine-tuning approach.
 #' 
 #' @param test A numeric matrix of single-cell expression values where rows are genes and columns are cells.
+#' Each row should be named with the gene name.
 #'
 #' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.
 #' @param trained A \linkS4class{List} containing the output of the \code{\link{trainSingleR}} function.
+#' If the row names of \code{test} are not exactly the same as the reference dataset, the call to \code{trainSingleR} should set \code{test.genes=rownames(test)}.
+#'
 #' Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multiple references.
 #' @param quantile A numeric scalar specifying the quantile of the correlation distribution to use to compute the score for each label.
 #' @param fine.tune A logical scalar indicating whether fine-tuning should be performed. 
 #' @param tune.thresh A numeric scalar specifying the maximum difference from the maximum correlation to use in fine-tuning.
 #' @param sd.thresh Deprecated and ignored.
 #' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
-#' @param check.missing Logical scalar indicating whether rows should be checked for missing values.
-#' If true and any missing values are found, the rows containing these values are silently removed.
+#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
 #' @param prune A logical scalar indicating whether label pruning should be performed.
 #' @param num.threads Integer scalar specifying the number of threads to use for classification.
 #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for \code{NA} scanning, when \code{check.missing=TRUE}.
@@ -98,25 +100,30 @@ classifySingleR <- function(
     sd.thresh=NULL, 
     prune=TRUE, 
     assay.type="logcounts", 
-    check.missing=TRUE,
+    check.missing=FALSE,
     num.threads = bpnworkers(BPPARAM),
     BPPARAM=SerialParam()) 
 {
-    test <- .to_clean_matrix(test, assay.type, check.missing, msg="test", BPPARAM=BPPARAM)
+    test <- .to_clean_matrix(test, assay.type, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)
 
     solo <- .is_solo(trained)
     if (solo) { 
         trained <- list(trained)
     }
 
-    results <- lapply(trained, FUN=.classify_internals, 
-        test=test, 
-        quantile=quantile, 
-        fine.tune=fine.tune, 
-        tune.thresh=tune.thresh, 
-        prune=prune, 
-        num.threads=num.threads
-    )
+    results <- vector("list", length(trained))
+    names(results) <- names(trained)
+    for (l in seq_along(results)) {
+        results[[l]] <- .classify_internals(
+            test=test, 
+            trained=trained[[l]],
+            quantile=quantile, 
+            fine.tune=fine.tune, 
+            tune.thresh=tune.thresh, 
+            prune=prune, 
+            num.threads=num.threads
+        )
+    }
 
     if (solo) {
         results[[1]]
@@ -131,21 +138,30 @@ classifySingleR <- function(
     } 
 }
 
-#' @importFrom S4Vectors DataFrame metadata metadata<- I
-.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) {
-    m <- match(trained$markers$unique, rownames(test))
-    if (anyNA(m)) {
-        stop("'rownames(test)' does not contain all genes used in 'trained'")
+.check_test_genes <- function(test, trained) {
+    if (!is.null(trained$options$test.genes)) {
+        if (!identical(trained$options$test.genes, rownames(test))) {
+            stop("expected 'rownames(test)' to be the same as 'test.genes' in 'trained'")
+        }
+    } else if (!identical(rownames(trained$ref), rownames(test))) {
+        stop("expected 'rownames(test)' to be the same as 'rownames(ref)' in 'trained'")
     }
+}
 
+#' @importFrom S4Vectors DataFrame metadata metadata<- I
+.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) {
+    .check_test_genes(test, trained)
     trained <- rebuildIndex(trained, num.threads = num.threads)
 
     parsed <- initializeCpp(test)
-    out <- run(parsed, m - 1L, trained$built, 
+    out <- classify_single(
+        test = parsed, 
+        prebuilt = trained$built, 
         quantile = quantile, 
         use_fine_tune = fine.tune, 
         fine_tune_threshold = tune.thresh, 
-        nthreads = num.threads)
+        nthreads = num.threads
+    )
 
     colnames(out$scores) <- trained$labels$unique
     output <- DataFrame(

diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R
@@ -6,10 +6,11 @@
 #'
 #' @param results A list of \linkS4class{DataFrame} prediction results as returned by \code{\link{classifySingleR}} when run on each reference separately.
 #' @inheritParams SingleR
+#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
 #' @param trained A list of \linkS4class{List}s containing the trained outputs of multiple references,
 #' equivalent to either (i) the output of \code{\link{trainSingleR}} on multiple references with \code{recompute=TRUE},
 #' or (ii) running \code{trainSingleR} on each reference separately and manually making a list of the trained outputs.
-#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references.
+#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references.
 #' @param quantile Numeric scalar specifying the quantile of the correlation distribution to use for computing the score, see \code{\link{classifySingleR}}.
 #' @param allow.lost Deprecated.
 #'
@@ -20,7 +21,7 @@
 #' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference;
 #' scores are not recomputed for the other labels.
 #' \item \code{labels}, a character vector containing the per-cell combined label across references.
-#' \item \code{references}, an integer vector specifying the reference from which the combined label was derived.
+#' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived.
 #' \item \code{orig.results}, a DataFrame containing \code{results}.
 #' }
 #' It may also contain \code{pruned.labels} if these were also present in \code{results}.
@@ -100,66 +101,82 @@
 #'
 #' @export
 #' @importFrom S4Vectors DataFrame metadata<-
+#' @importFrom beachmat initializeCpp
 combineRecomputedResults <- function(
     results, 
     test, 
     trained, 
     quantile=0.8, 
     assay.type.test="logcounts", 
-    check.missing=TRUE, 
-    allow.lost=FALSE, 
+    check.missing=FALSE,
     warn.lost=TRUE,
+    allow.lost=FALSE, 
     num.threads = bpnworkers(BPPARAM),
     BPPARAM=SerialParam())
 {
-    all.names <- c(list(colnames(test)), lapply(results, rownames))
-    if (length(unique(all.names)) != 1) {
-        stop("cell/cluster names in 'results' are not identical")
-    }
-    all.nrow <- c(ncol(test), vapply(results, nrow, 0L))
-    if (length(unique(all.nrow)) != 1) {
-        stop("numbers of cells/clusters in 'results' are not identical")
+    test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)
+
+    # Applying the sanity checks.
+    stopifnot(length(results) == length(trained))
+    for (i in seq_along(results)) {
+        curres <- results[[i]]
+        if (ncol(test) != nrow(curres)) {
+            stop("numbers of cells/clusters in 'results' are not identical")
+        }
+        if (!identical(rownames(curres), colnames(test))) {
+            stop("cell/cluster names in 'results' are not identical")
+        }
+
+        curtrain <- trained[[i]]
+        if (!all(curres$labels %in% curtrain$labels$unique)) {
+            stop("not all labels in 'results' are present in 'trained'")
+        }
+        .check_test_genes(test, curtrain)
     }
 
-    # Checking the marker consistency.
+    # Checking the genes.
     all.refnames <- lapply(trained, function(x) rownames(x$ref))
-    intersected <- Reduce(intersect, all.refnames)
-    for (i in seq_along(trained)) {
-        if (!all(trained[[i]]$markers$unique %in% rownames(test))) {
-            stop("all markers stored in 'results' should be present in 'test'")
-        } else if (warn.lost && !all(trained[[i]]$markers$unique %in% intersected)) {
-            warning("entries of 'trained' differ in the universe of available markers")
+    if (warn.lost) {
+        intersected <- Reduce(intersect, all.refnames)
+        for (i in seq_along(trained)) {
+            if (!all(trained[[i]]$markers$unique %in% intersected)) {
+                warning("not all markers in 'trained' are available in each reference")
+            }
         }
     }
-    
+
     # Applying the integration.
     universe <- Reduce(union, c(list(rownames(test)), all.refnames))
-    ibuilt <- integrate_build(
-        match(rownames(test), universe) - 1L,
-        lapply(trained, function(x) initializeCpp(x$ref)),
-        lapply(trained, function(x) match(rownames(x$ref), universe) - 1L), 
-        lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L),
-        lapply(trained, function(x) x$built),
+    ibuilt <- train_integrated(
+        test_features=match(rownames(test), universe) - 1L,
+        references=lapply(trained, function(x) initializeCpp(x$ref)),
+        ref_ids=lapply(all.refnames, function(x) match(x, universe) - 1L), 
+        labels=lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L),
+        prebuilt=lapply(trained, function(x) rebuildIndex(x)$built),
         nthreads = num.threads
     )
 
-    test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=check.missing, msg="test", BPPARAM=BPPARAM)
     collated <- vector("list", length(trained))
     for (i in seq_along(collated)) {
         collated[[i]] <- match(results[[i]]$labels, trained[[i]]$labels$unique) - 1L
     }
 
     parsed <- initializeCpp(test)
-    irun <- integrate_run(parsed, collated, ibuilt, quantile = quantile, nthreads = num.threads) 
-    scores <- irun$scores
+    irun <- classify_integrated(
+        test=parsed,
+        results=collated,
+        integrated_build=ibuilt,
+        quantile=quantile,
+        nthreads=num.threads
+    ) 
 
     # Organizing the outputs.
     base.scores <- vector("list", length(results))
     for (r in seq_along(base.scores)) {
         mat <- results[[r]]$scores   
         mat[] <- NA_real_
         idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L)
-        mat[idx] <- scores[,r]
+        mat[idx] <- irun$scores[,r]
         base.scores[[r]] <- mat
     }