Skip to content

Commit

Permalink
Update to the latest version of the singlepp library in assorthead. (#…
Browse files Browse the repository at this point in the history
…274)

This allows us to get rid of our internally vendored copy of the library. 

The latest version of singlepp greatly simplifies management of gene IDs,
forcing us to perform intersections if we are not completely sure that the test
and reference row identities are the same. This is much easier to reason about
than the previous logic; now, our test and reference features are essentially
fixed, and the C++ code handles all of the intersections.

A consequence of this change is that we need to know the test features during
training (or we assume that they're the same as the reference). This involves
some minor changes to the trained output to record the test features, so that
we can check that they're the same as 'rownames(test)' during classification.
If they're not, we throw an error, which is more stringent than the previous
behavior where we would only throw if the subset of markers could not be found;
but in practice, the two are probably the same anyway.

Another consequence is that we can't do any filtering of the test matrix after
training, as that would cause mismatches between the trained output's record of
the test features and the test features used during classificaiton. This is
probably fine as most people shouldn't have NAs in the test matrix, and even
then, most people are using SingleR() and the filtering + intersections are
handled automatically before training anyway.

We also return to using BiocNeighbors. In this case, it provides C++-level
builder objects for the neighbor index construction, allowing us to avoid 
having to compile for all of the different NN search algorithms.
  • Loading branch information
LTLA authored Sep 6, 2024
1 parent 84a6e55 commit 74a6d59
Show file tree
Hide file tree
Showing 48 changed files with 524 additions and 4,122 deletions.
8 changes: 5 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: SingleR
Title: Reference-Based Single-Cell RNA-Seq Annotation
Version: 2.7.1
Date: 2024-05-22
Version: 2.7.2
Date: 2024-09-06
Authors@R: c(person("Dvir", "Aran", email="[email protected]", role=c("aut", "cph")),
person("Aaron", "Lun", email="[email protected]", role=c("ctb", "cre")),
person("Daniel", "Bunis", role="ctb"),
Expand All @@ -20,6 +20,7 @@ Imports:
DelayedMatrixStats,
BiocParallel,
BiocSingular,
BiocNeighbors,
stats,
utils,
Rcpp,
Expand All @@ -28,6 +29,7 @@ Imports:
LinkingTo:
Rcpp,
beachmat,
assorthead,
BiocNeighbors
Suggests:
testthat,
Expand Down Expand Up @@ -57,6 +59,6 @@ biocViews:
SystemRequirements: C++17
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
URL: https://github.com/LTLA/SingleR
BugReports: https://support.bioconductor.org/
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(S4Vectors,DataFrame)
importClassesFrom(S4Vectors,List)
importClassesFrom(SummarizedExperiment,SummarizedExperiment)
importFrom(BiocNeighbors,AnnoyParam)
importFrom(BiocNeighbors,KmknnParam)
importFrom(BiocNeighbors,defineBuilder)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bpisup)
importFrom(BiocParallel,bpmapply)
Expand Down
32 changes: 16 additions & 16 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

classify_integrated <- function(test, results, integrated_build, quantile, nthreads) {
.Call('_SingleR_classify_integrated', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads)
}

#' @importFrom Rcpp sourceCpp
#' @useDynLib SingleR
classify_single <- function(test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) {
.Call('_SingleR_classify_single', PACKAGE = 'SingleR', test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)
}

find_classic_markers <- function(nlabels, ngenes, labels, ref, de_n, nthreads) {
.Call('_SingleR_find_classic_markers', PACKAGE = 'SingleR', nlabels, ngenes, labels, ref, de_n, nthreads)
}
Expand All @@ -9,31 +19,21 @@ grouped_medians <- function(ref, groups, ngroups, nthreads) {
.Call('_SingleR_grouped_medians', PACKAGE = 'SingleR', ref, groups, ngroups, nthreads)
}

integrate_build <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) {
.Call('_SingleR_integrate_build', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads)
}

integrate_run <- function(test, results, integrated_build, quantile, nthreads) {
.Call('_SingleR_integrate_run', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads)
train_integrated <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) {
.Call('_SingleR_train_integrated', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads)
}

#' @importFrom Rcpp sourceCpp
#' @useDynLib SingleR
prebuild <- function(ref, labels, markers, approximate, nthreads) {
.Call('_SingleR_prebuild', PACKAGE = 'SingleR', ref, labels, markers, approximate, nthreads)
train_single <- function(test_features, ref, ref_features, labels, markers, builder, nthreads) {
.Call('_SingleR_train_single', PACKAGE = 'SingleR', test_features, ref, ref_features, labels, markers, builder, nthreads)
}

get_subset <- function(built) {
.Call('_SingleR_get_subset', PACKAGE = 'SingleR', built)
get_ref_subset <- function(built) {
.Call('_SingleR_get_ref_subset', PACKAGE = 'SingleR', built)
}

is_valid_built <- function(built) {
.Call('_SingleR_is_valid_built', PACKAGE = 'SingleR', built)
}

#' @importFrom Rcpp sourceCpp
#' @useDynLib SingleR
run <- function(test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) {
.Call('_SingleR_run', PACKAGE = 'SingleR', test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)
}

37 changes: 14 additions & 23 deletions R/SingleR.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,33 +102,23 @@ SingleR <- function(
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM))
}
test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)

# Converting to a common list format for ease of data munging.
if (single.ref <- !.is_list(ref)) {
ref <- list(ref)
}

ref <- lapply(ref, FUN=.to_clean_matrix, assay.type=assay.type.ref,
check.missing=check.missing, msg="ref", BPPARAM=BPPARAM)
refnames <- Reduce(intersect, lapply(ref, rownames))
# We have to do all this row-subsetting at the start before trainSingleR,
# otherwise 'test.genes' won't match up to the filtered 'test'.
test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)

keep <- intersect(rownames(test), refnames)
if (length(keep) == 0) {
stop("no common genes between 'test' and 'ref'")
tmp.ref <- ref
if (!is.list(tmp.ref) || is.data.frame(tmp.ref)) {
tmp.ref <- list(ref)
}
if (!identical(keep, rownames(test))) {
test <- test[keep,]
}
for (i in seq_along(ref)) {
if (!identical(keep, rownames(ref[[i]]))) {
ref[[i]] <- ref[[i]][keep,,drop=FALSE]
for (rr in tmp.ref) {
keep <- rownames(test) %in% rownames(rr)
if (!all(keep)) {
test <- DelayedArray(test)[keep,,drop=FALSE] # only keeping the intersection, for safety's sake - see ?combineRecomputedResults.
}
}

# Converting back.
if (single.ref) {
ref <- ref[[1]]
if (nrow(test) == 0) {
stop("no common genes between 'test' and 'ref")
}

trained <- trainSingleR(
Expand All @@ -143,7 +133,8 @@ SingleR <- function(
aggr.args = aggr.args,
recompute=recompute,
restrict = restrict,
check.missing=FALSE,
test.genes=rownames(test),
check.missing=check.missing,
BNPARAM=BNPARAM,
num.threads = num.threads,
BPPARAM=BPPARAM
Expand Down
54 changes: 35 additions & 19 deletions R/classifySingleR.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
#' Assign labels to each cell in a test dataset, using a pre-trained classifier combined with an iterative fine-tuning approach.
#'
#' @param test A numeric matrix of single-cell expression values where rows are genes and columns are cells.
#' Each row should be named with the gene name.
#'
#' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.
#' @param trained A \linkS4class{List} containing the output of the \code{\link{trainSingleR}} function.
#' If the row names of \code{test} are not exactly the same as the reference dataset, the call to \code{trainSingleR} should set \code{test.genes=rownames(test)}.
#'
#' Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multiple references.
#' @param quantile A numeric scalar specifying the quantile of the correlation distribution to use to compute the score for each label.
#' @param fine.tune A logical scalar indicating whether fine-tuning should be performed.
#' @param tune.thresh A numeric scalar specifying the maximum difference from the maximum correlation to use in fine-tuning.
#' @param sd.thresh Deprecated and ignored.
#' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
#' @param check.missing Logical scalar indicating whether rows should be checked for missing values.
#' If true and any missing values are found, the rows containing these values are silently removed.
#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
#' @param prune A logical scalar indicating whether label pruning should be performed.
#' @param num.threads Integer scalar specifying the number of threads to use for classification.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for \code{NA} scanning, when \code{check.missing=TRUE}.
Expand Down Expand Up @@ -98,25 +100,30 @@ classifySingleR <- function(
sd.thresh=NULL,
prune=TRUE,
assay.type="logcounts",
check.missing=TRUE,
check.missing=FALSE,
num.threads = bpnworkers(BPPARAM),
BPPARAM=SerialParam())
{
test <- .to_clean_matrix(test, assay.type, check.missing, msg="test", BPPARAM=BPPARAM)
test <- .to_clean_matrix(test, assay.type, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)

solo <- .is_solo(trained)
if (solo) {
trained <- list(trained)
}

results <- lapply(trained, FUN=.classify_internals,
test=test,
quantile=quantile,
fine.tune=fine.tune,
tune.thresh=tune.thresh,
prune=prune,
num.threads=num.threads
)
results <- vector("list", length(trained))
names(results) <- names(trained)
for (l in seq_along(results)) {
results[[l]] <- .classify_internals(
test=test,
trained=trained[[l]],
quantile=quantile,
fine.tune=fine.tune,
tune.thresh=tune.thresh,
prune=prune,
num.threads=num.threads
)
}

if (solo) {
results[[1]]
Expand All @@ -131,21 +138,30 @@ classifySingleR <- function(
}
}

#' @importFrom S4Vectors DataFrame metadata metadata<- I
.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) {
m <- match(trained$markers$unique, rownames(test))
if (anyNA(m)) {
stop("'rownames(test)' does not contain all genes used in 'trained'")
.check_test_genes <- function(test, trained) {
if (!is.null(trained$options$test.genes)) {
if (!identical(trained$options$test.genes, rownames(test))) {
stop("expected 'rownames(test)' to be the same as 'test.genes' in 'trained'")
}
} else if (!identical(rownames(trained$ref), rownames(test))) {
stop("expected 'rownames(test)' to be the same as 'rownames(ref)' in 'trained'")
}
}

#' @importFrom S4Vectors DataFrame metadata metadata<- I
.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) {
.check_test_genes(test, trained)
trained <- rebuildIndex(trained, num.threads = num.threads)

parsed <- initializeCpp(test)
out <- run(parsed, m - 1L, trained$built,
out <- classify_single(
test = parsed,
prebuilt = trained$built,
quantile = quantile,
use_fine_tune = fine.tune,
fine_tune_threshold = tune.thresh,
nthreads = num.threads)
nthreads = num.threads
)

colnames(out$scores) <- trained$labels$unique
output <- DataFrame(
Expand Down
75 changes: 46 additions & 29 deletions R/combineRecomputedResults.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
#'
#' @param results A list of \linkS4class{DataFrame} prediction results as returned by \code{\link{classifySingleR}} when run on each reference separately.
#' @inheritParams SingleR
#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
#' @param trained A list of \linkS4class{List}s containing the trained outputs of multiple references,
#' equivalent to either (i) the output of \code{\link{trainSingleR}} on multiple references with \code{recompute=TRUE},
#' or (ii) running \code{trainSingleR} on each reference separately and manually making a list of the trained outputs.
#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references.
#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references.
#' @param quantile Numeric scalar specifying the quantile of the correlation distribution to use for computing the score, see \code{\link{classifySingleR}}.
#' @param allow.lost Deprecated.
#'
Expand All @@ -20,7 +21,7 @@
#' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference;
#' scores are not recomputed for the other labels.
#' \item \code{labels}, a character vector containing the per-cell combined label across references.
#' \item \code{references}, an integer vector specifying the reference from which the combined label was derived.
#' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived.
#' \item \code{orig.results}, a DataFrame containing \code{results}.
#' }
#' It may also contain \code{pruned.labels} if these were also present in \code{results}.
Expand Down Expand Up @@ -100,66 +101,82 @@
#'
#' @export
#' @importFrom S4Vectors DataFrame metadata<-
#' @importFrom beachmat initializeCpp
combineRecomputedResults <- function(
results,
test,
trained,
quantile=0.8,
assay.type.test="logcounts",
check.missing=TRUE,
allow.lost=FALSE,
check.missing=FALSE,
warn.lost=TRUE,
allow.lost=FALSE,
num.threads = bpnworkers(BPPARAM),
BPPARAM=SerialParam())
{
all.names <- c(list(colnames(test)), lapply(results, rownames))
if (length(unique(all.names)) != 1) {
stop("cell/cluster names in 'results' are not identical")
}
all.nrow <- c(ncol(test), vapply(results, nrow, 0L))
if (length(unique(all.nrow)) != 1) {
stop("numbers of cells/clusters in 'results' are not identical")
test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)

# Applying the sanity checks.
stopifnot(length(results) == length(trained))
for (i in seq_along(results)) {
curres <- results[[i]]
if (ncol(test) != nrow(curres)) {
stop("numbers of cells/clusters in 'results' are not identical")
}
if (!identical(rownames(curres), colnames(test))) {
stop("cell/cluster names in 'results' are not identical")
}

curtrain <- trained[[i]]
if (!all(curres$labels %in% curtrain$labels$unique)) {
stop("not all labels in 'results' are present in 'trained'")
}
.check_test_genes(test, curtrain)
}

# Checking the marker consistency.
# Checking the genes.
all.refnames <- lapply(trained, function(x) rownames(x$ref))
intersected <- Reduce(intersect, all.refnames)
for (i in seq_along(trained)) {
if (!all(trained[[i]]$markers$unique %in% rownames(test))) {
stop("all markers stored in 'results' should be present in 'test'")
} else if (warn.lost && !all(trained[[i]]$markers$unique %in% intersected)) {
warning("entries of 'trained' differ in the universe of available markers")
if (warn.lost) {
intersected <- Reduce(intersect, all.refnames)
for (i in seq_along(trained)) {
if (!all(trained[[i]]$markers$unique %in% intersected)) {
warning("not all markers in 'trained' are available in each reference")
}
}
}

# Applying the integration.
universe <- Reduce(union, c(list(rownames(test)), all.refnames))
ibuilt <- integrate_build(
match(rownames(test), universe) - 1L,
lapply(trained, function(x) initializeCpp(x$ref)),
lapply(trained, function(x) match(rownames(x$ref), universe) - 1L),
lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L),
lapply(trained, function(x) x$built),
ibuilt <- train_integrated(
test_features=match(rownames(test), universe) - 1L,
references=lapply(trained, function(x) initializeCpp(x$ref)),
ref_ids=lapply(all.refnames, function(x) match(x, universe) - 1L),
labels=lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L),
prebuilt=lapply(trained, function(x) rebuildIndex(x)$built),
nthreads = num.threads
)

test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=check.missing, msg="test", BPPARAM=BPPARAM)
collated <- vector("list", length(trained))
for (i in seq_along(collated)) {
collated[[i]] <- match(results[[i]]$labels, trained[[i]]$labels$unique) - 1L
}

parsed <- initializeCpp(test)
irun <- integrate_run(parsed, collated, ibuilt, quantile = quantile, nthreads = num.threads)
scores <- irun$scores
irun <- classify_integrated(
test=parsed,
results=collated,
integrated_build=ibuilt,
quantile=quantile,
nthreads=num.threads
)

# Organizing the outputs.
base.scores <- vector("list", length(results))
for (r in seq_along(base.scores)) {
mat <- results[[r]]$scores
mat[] <- NA_real_
idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L)
mat[idx] <- scores[,r]
mat[idx] <- irun$scores[,r]
base.scores[[r]] <- mat
}

Expand Down
Loading

0 comments on commit 74a6d59

Please sign in to comment.