From 74a6d59199bebfe0f898900aafee21d8f0e836e9 Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Fri, 6 Sep 2024 16:41:44 -0700 Subject: [PATCH] Update to the latest version of the singlepp library in assorthead. (#274) This allows us to get rid of our internally vendored copy of the library. The latest version of singlepp greatly simplifies management of gene IDs, forcing us to perform intersections if we are not completely sure that the test and reference row identities are the same. This is much easier to reason about than the previous logic; now, our test and reference features are essentially fixed, and the C++ code handles all of the intersections. A consequence of this change is that we need to know the test features during training (or we assume that they're the same as the reference). This involves some minor changes to the trained output to record the test features, so that we can check that they're the same as 'rownames(test)' during classification. If they're not, we throw an error, which is more stringent than the previous behavior where we would only throw if the subset of markers could not be found; but in practice, the two are probably the same anyway. Another consequence is that we can't do any filtering of the test matrix after training, as that would cause mismatches between the trained output's record of the test features and the test features used during classificaiton. This is probably fine as most people shouldn't have NAs in the test matrix, and even then, most people are using SingleR() and the filtering + intersections are handled automatically before training anyway. We also return to using BiocNeighbors. In this case, it provides C++-level builder objects for the neighbor index construction, allowing us to avoid having to compile for all of the different NN search algorithms. --- DESCRIPTION | 8 +- NAMESPACE | 3 + R/RcppExports.R | 32 +- R/SingleR.R | 37 +- R/classifySingleR.R | 54 +- R/combineRecomputedResults.R | 75 +- R/getClassicMarkers.R | 6 +- R/rebuildIndices.R | 3 +- R/trainSingleR.R | 184 ++--- R/utils.R | 6 +- inst/NEWS.Rd | 12 + inst/include/singlepp/BasicBuilder.hpp | 317 --------- inst/include/singlepp/BasicScorer.hpp | 292 -------- .../include/singlepp/ChooseClassicMarkers.hpp | 292 -------- inst/include/singlepp/Classifier.hpp | 312 --------- inst/include/singlepp/IntegratedBuilder.hpp | 578 ---------------- inst/include/singlepp/IntegratedScorer.hpp | 344 --------- inst/include/singlepp/Markers.hpp | 43 -- inst/include/singlepp/annotate_cells.hpp | 116 ---- inst/include/singlepp/build_indices.hpp | 110 --- inst/include/singlepp/compute_scores.hpp | 60 -- inst/include/singlepp/fine_tune.hpp | 170 ----- inst/include/singlepp/load_references.hpp | 650 ------------------ inst/include/singlepp/macros.hpp | 42 -- inst/include/singlepp/process_features.hpp | 153 ----- inst/include/singlepp/scaled_ranks.hpp | 155 ----- inst/include/singlepp/singlepp.hpp | 22 - inst/include/source.sh | 16 - man/SingleR.Rd | 2 +- man/classifySingleR.Rd | 8 +- man/combineRecomputedResults.Rd | 12 +- man/getClassicMarkers.Rd | 5 +- man/trainSingleR.Rd | 25 +- src/Makevars | 1 - src/RcppExports.cpp | 97 +-- ...egrate_run.cpp => classify_integrated.cpp} | 28 +- src/classify_single.cpp | 44 ++ src/find_classic_markers.cpp | 7 +- src/integrate_build.cpp | 36 - src/prebuild.cpp | 47 -- src/run.cpp | 45 -- src/train_integrated.cpp | 39 ++ src/train_single.cpp | 57 ++ src/utils.h | 13 +- tests/testthat/test-SingleR.R | 9 +- tests/testthat/test-classify.R | 22 +- tests/testthat/test-recomputed.R | 45 +- tests/testthat/test-train.R | 12 +- 48 files changed, 524 insertions(+), 4122 deletions(-) delete mode 100644 inst/include/singlepp/BasicBuilder.hpp delete mode 100644 inst/include/singlepp/BasicScorer.hpp delete mode 100644 inst/include/singlepp/ChooseClassicMarkers.hpp delete mode 100644 inst/include/singlepp/Classifier.hpp delete mode 100644 inst/include/singlepp/IntegratedBuilder.hpp delete mode 100644 inst/include/singlepp/IntegratedScorer.hpp delete mode 100644 inst/include/singlepp/Markers.hpp delete mode 100644 inst/include/singlepp/annotate_cells.hpp delete mode 100644 inst/include/singlepp/build_indices.hpp delete mode 100644 inst/include/singlepp/compute_scores.hpp delete mode 100644 inst/include/singlepp/fine_tune.hpp delete mode 100644 inst/include/singlepp/load_references.hpp delete mode 100644 inst/include/singlepp/macros.hpp delete mode 100644 inst/include/singlepp/process_features.hpp delete mode 100644 inst/include/singlepp/scaled_ranks.hpp delete mode 100644 inst/include/singlepp/singlepp.hpp delete mode 100755 inst/include/source.sh delete mode 100644 src/Makevars rename src/{integrate_run.cpp => classify_integrated.cpp} (61%) create mode 100644 src/classify_single.cpp delete mode 100644 src/integrate_build.cpp delete mode 100644 src/prebuild.cpp delete mode 100644 src/run.cpp create mode 100644 src/train_integrated.cpp create mode 100644 src/train_single.cpp diff --git a/DESCRIPTION b/DESCRIPTION index 7e96163..ef584aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: SingleR Title: Reference-Based Single-Cell RNA-Seq Annotation -Version: 2.7.1 -Date: 2024-05-22 +Version: 2.7.2 +Date: 2024-09-06 Authors@R: c(person("Dvir", "Aran", email="dvir.aran@ucsf.edu", role=c("aut", "cph")), person("Aaron", "Lun", email="infinite.monkeys.with.keyboards@gmail.com", role=c("ctb", "cre")), person("Daniel", "Bunis", role="ctb"), @@ -20,6 +20,7 @@ Imports: DelayedMatrixStats, BiocParallel, BiocSingular, + BiocNeighbors, stats, utils, Rcpp, @@ -28,6 +29,7 @@ Imports: LinkingTo: Rcpp, beachmat, + assorthead, BiocNeighbors Suggests: testthat, @@ -57,6 +59,6 @@ biocViews: SystemRequirements: C++17 VignetteBuilder: knitr Encoding: UTF-8 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 URL: https://github.com/LTLA/SingleR BugReports: https://support.bioconductor.org/ diff --git a/NAMESPACE b/NAMESPACE index 91f111f..5cbaed2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,9 @@ importClassesFrom(Matrix,dgCMatrix) importClassesFrom(S4Vectors,DataFrame) importClassesFrom(S4Vectors,List) importClassesFrom(SummarizedExperiment,SummarizedExperiment) +importFrom(BiocNeighbors,AnnoyParam) +importFrom(BiocNeighbors,KmknnParam) +importFrom(BiocNeighbors,defineBuilder) importFrom(BiocParallel,SerialParam) importFrom(BiocParallel,bpisup) importFrom(BiocParallel,bpmapply) diff --git a/R/RcppExports.R b/R/RcppExports.R index 78fc7a3..d4c7890 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,16 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +classify_integrated <- function(test, results, integrated_build, quantile, nthreads) { + .Call('_SingleR_classify_integrated', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads) +} + +#' @importFrom Rcpp sourceCpp +#' @useDynLib SingleR +classify_single <- function(test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) { + .Call('_SingleR_classify_single', PACKAGE = 'SingleR', test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) +} + find_classic_markers <- function(nlabels, ngenes, labels, ref, de_n, nthreads) { .Call('_SingleR_find_classic_markers', PACKAGE = 'SingleR', nlabels, ngenes, labels, ref, de_n, nthreads) } @@ -9,31 +19,21 @@ grouped_medians <- function(ref, groups, ngroups, nthreads) { .Call('_SingleR_grouped_medians', PACKAGE = 'SingleR', ref, groups, ngroups, nthreads) } -integrate_build <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) { - .Call('_SingleR_integrate_build', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads) -} - -integrate_run <- function(test, results, integrated_build, quantile, nthreads) { - .Call('_SingleR_integrate_run', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads) +train_integrated <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) { + .Call('_SingleR_train_integrated', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads) } #' @importFrom Rcpp sourceCpp #' @useDynLib SingleR -prebuild <- function(ref, labels, markers, approximate, nthreads) { - .Call('_SingleR_prebuild', PACKAGE = 'SingleR', ref, labels, markers, approximate, nthreads) +train_single <- function(test_features, ref, ref_features, labels, markers, builder, nthreads) { + .Call('_SingleR_train_single', PACKAGE = 'SingleR', test_features, ref, ref_features, labels, markers, builder, nthreads) } -get_subset <- function(built) { - .Call('_SingleR_get_subset', PACKAGE = 'SingleR', built) +get_ref_subset <- function(built) { + .Call('_SingleR_get_ref_subset', PACKAGE = 'SingleR', built) } is_valid_built <- function(built) { .Call('_SingleR_is_valid_built', PACKAGE = 'SingleR', built) } -#' @importFrom Rcpp sourceCpp -#' @useDynLib SingleR -run <- function(test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) { - .Call('_SingleR_run', PACKAGE = 'SingleR', test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) -} - diff --git a/R/SingleR.R b/R/SingleR.R index 6f6f4e7..a8bf931 100644 --- a/R/SingleR.R +++ b/R/SingleR.R @@ -102,33 +102,23 @@ SingleR <- function( bpstart(BPPARAM) on.exit(bpstop(BPPARAM)) } - test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM) - - # Converting to a common list format for ease of data munging. - if (single.ref <- !.is_list(ref)) { - ref <- list(ref) - } - ref <- lapply(ref, FUN=.to_clean_matrix, assay.type=assay.type.ref, - check.missing=check.missing, msg="ref", BPPARAM=BPPARAM) - refnames <- Reduce(intersect, lapply(ref, rownames)) + # We have to do all this row-subsetting at the start before trainSingleR, + # otherwise 'test.genes' won't match up to the filtered 'test'. + test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM) - keep <- intersect(rownames(test), refnames) - if (length(keep) == 0) { - stop("no common genes between 'test' and 'ref'") + tmp.ref <- ref + if (!is.list(tmp.ref) || is.data.frame(tmp.ref)) { + tmp.ref <- list(ref) } - if (!identical(keep, rownames(test))) { - test <- test[keep,] - } - for (i in seq_along(ref)) { - if (!identical(keep, rownames(ref[[i]]))) { - ref[[i]] <- ref[[i]][keep,,drop=FALSE] + for (rr in tmp.ref) { + keep <- rownames(test) %in% rownames(rr) + if (!all(keep)) { + test <- DelayedArray(test)[keep,,drop=FALSE] # only keeping the intersection, for safety's sake - see ?combineRecomputedResults. } } - - # Converting back. - if (single.ref) { - ref <- ref[[1]] + if (nrow(test) == 0) { + stop("no common genes between 'test' and 'ref") } trained <- trainSingleR( @@ -143,7 +133,8 @@ SingleR <- function( aggr.args = aggr.args, recompute=recompute, restrict = restrict, - check.missing=FALSE, + test.genes=rownames(test), + check.missing=check.missing, BNPARAM=BNPARAM, num.threads = num.threads, BPPARAM=BPPARAM diff --git a/R/classifySingleR.R b/R/classifySingleR.R index 3a3b747..27e055d 100644 --- a/R/classifySingleR.R +++ b/R/classifySingleR.R @@ -3,17 +3,19 @@ #' Assign labels to each cell in a test dataset, using a pre-trained classifier combined with an iterative fine-tuning approach. #' #' @param test A numeric matrix of single-cell expression values where rows are genes and columns are cells. +#' Each row should be named with the gene name. #' #' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix. #' @param trained A \linkS4class{List} containing the output of the \code{\link{trainSingleR}} function. +#' If the row names of \code{test} are not exactly the same as the reference dataset, the call to \code{trainSingleR} should set \code{test.genes=rownames(test)}. +#' #' Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multiple references. #' @param quantile A numeric scalar specifying the quantile of the correlation distribution to use to compute the score for each label. #' @param fine.tune A logical scalar indicating whether fine-tuning should be performed. #' @param tune.thresh A numeric scalar specifying the maximum difference from the maximum correlation to use in fine-tuning. #' @param sd.thresh Deprecated and ignored. #' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}. -#' @param check.missing Logical scalar indicating whether rows should be checked for missing values. -#' If true and any missing values are found, the rows containing these values are silently removed. +#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}. #' @param prune A logical scalar indicating whether label pruning should be performed. #' @param num.threads Integer scalar specifying the number of threads to use for classification. #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for \code{NA} scanning, when \code{check.missing=TRUE}. @@ -98,25 +100,30 @@ classifySingleR <- function( sd.thresh=NULL, prune=TRUE, assay.type="logcounts", - check.missing=TRUE, + check.missing=FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM=SerialParam()) { - test <- .to_clean_matrix(test, assay.type, check.missing, msg="test", BPPARAM=BPPARAM) + test <- .to_clean_matrix(test, assay.type, check.missing=FALSE, msg="test", BPPARAM=BPPARAM) solo <- .is_solo(trained) if (solo) { trained <- list(trained) } - results <- lapply(trained, FUN=.classify_internals, - test=test, - quantile=quantile, - fine.tune=fine.tune, - tune.thresh=tune.thresh, - prune=prune, - num.threads=num.threads - ) + results <- vector("list", length(trained)) + names(results) <- names(trained) + for (l in seq_along(results)) { + results[[l]] <- .classify_internals( + test=test, + trained=trained[[l]], + quantile=quantile, + fine.tune=fine.tune, + tune.thresh=tune.thresh, + prune=prune, + num.threads=num.threads + ) + } if (solo) { results[[1]] @@ -131,21 +138,30 @@ classifySingleR <- function( } } -#' @importFrom S4Vectors DataFrame metadata metadata<- I -.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) { - m <- match(trained$markers$unique, rownames(test)) - if (anyNA(m)) { - stop("'rownames(test)' does not contain all genes used in 'trained'") +.check_test_genes <- function(test, trained) { + if (!is.null(trained$options$test.genes)) { + if (!identical(trained$options$test.genes, rownames(test))) { + stop("expected 'rownames(test)' to be the same as 'test.genes' in 'trained'") + } + } else if (!identical(rownames(trained$ref), rownames(test))) { + stop("expected 'rownames(test)' to be the same as 'rownames(ref)' in 'trained'") } +} +#' @importFrom S4Vectors DataFrame metadata metadata<- I +.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) { + .check_test_genes(test, trained) trained <- rebuildIndex(trained, num.threads = num.threads) parsed <- initializeCpp(test) - out <- run(parsed, m - 1L, trained$built, + out <- classify_single( + test = parsed, + prebuilt = trained$built, quantile = quantile, use_fine_tune = fine.tune, fine_tune_threshold = tune.thresh, - nthreads = num.threads) + nthreads = num.threads + ) colnames(out$scores) <- trained$labels$unique output <- DataFrame( diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R index 773baa4..f9b1fe9 100644 --- a/R/combineRecomputedResults.R +++ b/R/combineRecomputedResults.R @@ -6,10 +6,11 @@ #' #' @param results A list of \linkS4class{DataFrame} prediction results as returned by \code{\link{classifySingleR}} when run on each reference separately. #' @inheritParams SingleR +#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}. #' @param trained A list of \linkS4class{List}s containing the trained outputs of multiple references, #' equivalent to either (i) the output of \code{\link{trainSingleR}} on multiple references with \code{recompute=TRUE}, #' or (ii) running \code{trainSingleR} on each reference separately and manually making a list of the trained outputs. -#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references. +#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references. #' @param quantile Numeric scalar specifying the quantile of the correlation distribution to use for computing the score, see \code{\link{classifySingleR}}. #' @param allow.lost Deprecated. #' @@ -20,7 +21,7 @@ #' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference; #' scores are not recomputed for the other labels. #' \item \code{labels}, a character vector containing the per-cell combined label across references. -#' \item \code{references}, an integer vector specifying the reference from which the combined label was derived. +#' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived. #' \item \code{orig.results}, a DataFrame containing \code{results}. #' } #' It may also contain \code{pruned.labels} if these were also present in \code{results}. @@ -100,58 +101,74 @@ #' #' @export #' @importFrom S4Vectors DataFrame metadata<- +#' @importFrom beachmat initializeCpp combineRecomputedResults <- function( results, test, trained, quantile=0.8, assay.type.test="logcounts", - check.missing=TRUE, - allow.lost=FALSE, + check.missing=FALSE, warn.lost=TRUE, + allow.lost=FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM=SerialParam()) { - all.names <- c(list(colnames(test)), lapply(results, rownames)) - if (length(unique(all.names)) != 1) { - stop("cell/cluster names in 'results' are not identical") - } - all.nrow <- c(ncol(test), vapply(results, nrow, 0L)) - if (length(unique(all.nrow)) != 1) { - stop("numbers of cells/clusters in 'results' are not identical") + test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=FALSE, msg="test", BPPARAM=BPPARAM) + + # Applying the sanity checks. + stopifnot(length(results) == length(trained)) + for (i in seq_along(results)) { + curres <- results[[i]] + if (ncol(test) != nrow(curres)) { + stop("numbers of cells/clusters in 'results' are not identical") + } + if (!identical(rownames(curres), colnames(test))) { + stop("cell/cluster names in 'results' are not identical") + } + + curtrain <- trained[[i]] + if (!all(curres$labels %in% curtrain$labels$unique)) { + stop("not all labels in 'results' are present in 'trained'") + } + .check_test_genes(test, curtrain) } - # Checking the marker consistency. + # Checking the genes. all.refnames <- lapply(trained, function(x) rownames(x$ref)) - intersected <- Reduce(intersect, all.refnames) - for (i in seq_along(trained)) { - if (!all(trained[[i]]$markers$unique %in% rownames(test))) { - stop("all markers stored in 'results' should be present in 'test'") - } else if (warn.lost && !all(trained[[i]]$markers$unique %in% intersected)) { - warning("entries of 'trained' differ in the universe of available markers") + if (warn.lost) { + intersected <- Reduce(intersect, all.refnames) + for (i in seq_along(trained)) { + if (!all(trained[[i]]$markers$unique %in% intersected)) { + warning("not all markers in 'trained' are available in each reference") + } } } - + # Applying the integration. universe <- Reduce(union, c(list(rownames(test)), all.refnames)) - ibuilt <- integrate_build( - match(rownames(test), universe) - 1L, - lapply(trained, function(x) initializeCpp(x$ref)), - lapply(trained, function(x) match(rownames(x$ref), universe) - 1L), - lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L), - lapply(trained, function(x) x$built), + ibuilt <- train_integrated( + test_features=match(rownames(test), universe) - 1L, + references=lapply(trained, function(x) initializeCpp(x$ref)), + ref_ids=lapply(all.refnames, function(x) match(x, universe) - 1L), + labels=lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L), + prebuilt=lapply(trained, function(x) rebuildIndex(x)$built), nthreads = num.threads ) - test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=check.missing, msg="test", BPPARAM=BPPARAM) collated <- vector("list", length(trained)) for (i in seq_along(collated)) { collated[[i]] <- match(results[[i]]$labels, trained[[i]]$labels$unique) - 1L } parsed <- initializeCpp(test) - irun <- integrate_run(parsed, collated, ibuilt, quantile = quantile, nthreads = num.threads) - scores <- irun$scores + irun <- classify_integrated( + test=parsed, + results=collated, + integrated_build=ibuilt, + quantile=quantile, + nthreads=num.threads + ) # Organizing the outputs. base.scores <- vector("list", length(results)) @@ -159,7 +176,7 @@ combineRecomputedResults <- function( mat <- results[[r]]$scores mat[] <- NA_real_ idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L) - mat[idx] <- scores[,r] + mat[idx] <- irun$scores[,r] base.scores[[r]] <- mat } diff --git a/R/getClassicMarkers.R b/R/getClassicMarkers.R index 5ab6c93..4a95dee 100644 --- a/R/getClassicMarkers.R +++ b/R/getClassicMarkers.R @@ -64,6 +64,10 @@ getClassicMarkers <- function(ref, labels, assay.type="logcounts", check.missing labels <- list(labels) } + for (i in seq_along(ref)) { + ref[[i]] <- .to_clean_matrix(ref[[i]], assay.type, check.missing, msg="ref", BPPARAM=BPPARAM) + } + # Setting up references. common <- Reduce(intersect, lapply(ref, rownames)) if (length(common)==0L && any(vapply(ref, nrow, 0L) > 0L)) { @@ -73,9 +77,7 @@ getClassicMarkers <- function(ref, labels, assay.type="logcounts", check.missing for (i in seq_along(ref)) { current <- ref[[i]][common,,drop=FALSE] - current <- .to_clean_matrix(current, assay.type, check.missing, msg="ref", BPPARAM=BPPARAM) curptr <- initializeCpp(current) - flabels <- factor(labels[[i]]) gm <- grouped_medians(curptr, as.integer(flabels) - 1L, nlevels(flabels), nthreads = num.threads) colnames(gm) <- levels(flabels) diff --git a/R/rebuildIndices.R b/R/rebuildIndices.R index 87566ab..72e3261 100644 --- a/R/rebuildIndices.R +++ b/R/rebuildIndices.R @@ -46,7 +46,8 @@ rebuildIndex <- function(trained, num.threads=1) { markers=trained$markers$full, labels=trained$labels$full, ulabels=trained$labels$unique, - approximate=trained$options$approximate, + BNPARAM=trained$options$BNPARAM, + test.genes=trained$options$test.genes, num.threads=num.threads) } trained diff --git a/R/trainSingleR.R b/R/trainSingleR.R index fe56ec5..e74ea41 100644 --- a/R/trainSingleR.R +++ b/R/trainSingleR.R @@ -8,12 +8,11 @@ #' #' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix. #' -#' Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references, -#' in which case the row names are expected to be the same across all objects. +#' Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references. #' @param labels A character vector or factor of known labels for all samples in \code{ref}. #' #' Alternatively, if \code{ref} is a list, \code{labels} should be a list of the same length. -#' Each element should contain a character vector or factor specifying the label for the corresponding entry of \code{ref}. +#' Each element should contain a character vector or factor specifying the labels for the columns of the corresponding element of \code{ref}. #' @param genes A string containing \code{"de"}, indicating that markers should be calculated from \code{ref}. #' For back compatibility, other string values are allowed but will be ignored with a deprecation warning. #' @@ -45,13 +44,15 @@ #' if \code{ref} is a \linkS4class{SummarizedExperiment} object (or is a list that contains one or more such objects). #' @param check.missing Logical scalar indicating whether rows should be checked for missing values. #' If true and any missing values are found, the rows containing these values are silently removed. -#' @param BNPARAM Deprecated and ignored. -#' @param approximate Logical scalar indicating whether a faster approximate method should be used to compute the quantile. +#' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying how the neighbor search index should be constructed. +#' @param approximate Deprecated, use \code{BNPARAM} instead. #' @param num.threads Integer scalar specifying the number of threads to use for index building. #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed. #' Relevant for marker detection if \code{genes = NULL}, aggregation if \code{aggr.ref = TRUE}, and \code{NA} checking if \code{check.missing = TRUE}. #' @param restrict A character vector of gene names to use for marker selection. #' By default, all genes in \code{ref} are used. +#' @param test.genes Character vector of the names of the genes in the test dataset, i.e., the row names of \code{test} in \code{\link{classifySingleR}}. +#' If \code{NULL}, it is assumed that the test dataset and \code{ref} have the same genes in the same row order. #' #' @return #' For a single reference, a \linkS4class{List} is returned containing: @@ -78,9 +79,15 @@ #' For each label, the top \code{de.n} genes with the largest differences compared to another label are chosen as markers to distinguish the two labels. #' The expression values are expected to be log-transformed and normalized. #' -#' If \code{restrict} is specified, \code{ref} is subsetted to only include the rows with names that are in \code{restrict}. -#' Marker selection and all subsequent classification will be performed using this restrictive subset of genes. -#' This can be convenient for ensuring that only appropriate genes are used (e.g., not pseudogenes or predicted genes). +#' Classification with \code{classifySingleR} assumes that the test dataset contains all marker genes that were detected from the reference. +#' If the test and reference datasets do not have the same genes in the same order, we can set \code{test.genes} to the row names of the test dataset. +#' This will instruct \code{trainSingleR} to only consider markers that are present in the test dataset. +#' Any subsequent call to \code{classifySingleR} will also check that \code{test.genes} is consistent with \code{rownames(test)}. +#' +#' On a similar note, if \code{restrict} is specified, marker selection will only be performed using the specified subset of genes. +#' This can be convenient for ignoring inappropriate genes like pseudogenes or predicted genes. +#' It has the same effect as filtering out undesirable rows from \code{ref} prior to calling \code{trainSingleR}. +#' Unlike \code{test.genes}, setting \code{restrict} does not introduce further checks on \code{rownames(test)} in \code{classifySingleR}. #' #' @section Custom feature specification: #' Rather than relying on the in-built feature selection, users can pass in their own features of interest to \code{genes}. @@ -164,11 +171,15 @@ #' #' @export #' @importFrom S4Vectors List isSingleString metadata metadata<- +#' @importFrom BiocNeighbors defineBuilder AnnoyParam KmknnParam #' @importFrom BiocParallel SerialParam bpisup bpstart bpstop #' @importFrom beachmat initializeCpp +#' @importFrom S4Vectors List +#' @importFrom SummarizedExperiment assay trainSingleR <- function( ref, labels, + test.genes=NULL, genes="de", sd.thresh=NULL, de.method=c("classic", "wilcox", "t"), @@ -195,70 +206,98 @@ trainSingleR <- function( } } - if (!bpisup(BPPARAM) && !is(BPPARAM, "MulticoreParam")) { - bpstart(BPPARAM) - on.exit(bpstop(BPPARAM)) - } - - ref <- lapply(ref, FUN=.to_clean_matrix, assay.type=assay.type, - check.missing=check.missing, msg="ref", BPPARAM=BPPARAM) - - # Cleaning the genes. - gns <- lapply(ref, rownames) - if (length(unique(gns))!=1L) { - stop("row names are not identical across references") - } - - if (!is.null(restrict)) { - keep <- gns[[1]] %in% restrict - ref <- lapply(ref, FUN="[", i=keep, , drop=FALSE) - } - if (isSingleString(genes)) { genes <- rep(genes, length(ref)) } else if (length(genes)!=length(ref)) { stop("list-like 'genes' should be the same length as 'ref'") } - # Cleaning the labels. - labels <- lapply(labels, as.character) - if (length(labels)!=length(ref)) { - stop("lists in 'labels' and 'ref' should be of the same length") + if (is.null(BNPARAM)) { + if (approximate) { + BNPARAM <- AnnoyParam() + } else { + BNPARAM <- KmknnParam() + } } - for (l in seq_along(labels)) { - keep <- !is.na(labels[[l]]) + if (!bpisup(BPPARAM) && !is(BPPARAM, "MulticoreParam")) { + bpstart(BPPARAM) + on.exit(bpstop(BPPARAM)) + } + + output <- vector("list", length(ref)) + names(output) <- names(ref) + for (l in seq_along(ref)) { + curref <- .to_clean_matrix(ref[[l]], assay.type, check.missing, msg="ref", BPPARAM=BPPARAM) + + curlabels <- as.character(labels[[l]]) + stopifnot(length(curlabels) == ncol(curref)) + keep <- !is.na(curlabels) if (!all(keep)) { - labels[[l]] <- labels[[l]][keep] - ref[[l]] <- ref[[l]][,keep,drop=FALSE] + curref <- curref[,keep,drop=FALSE] + curlabels <- curlabels[keep] } - } - gene.info <- mapply(FUN=.identify_genes, ref=ref, labels=labels, genes=genes, - MoreArgs=list(de.method=de.method, de.n=de.n, de.args=de.args, BPPARAM=BPPARAM), - SIMPLIFY=FALSE) + markers <- .identify_genes( + ref=curref, + labels=curlabels, + genes=genes[[l]], + de.method=de.method, + de.n=de.n, + de.args=de.args, + restrict=restrict, + test.genes=test.genes, + BPPARAM=BPPARAM + ) + + if (aggr.ref) { + aggr <- do.call(aggregateReference, c(list(ref=quote(curref), label=curlabels, check.missing=FALSE, BPPARAM=BPPARAM), aggr.args)) + curref <- assay(aggr) + curlabels <- aggr$label + } - if (!solo && !recompute) { - .Deprecated(old="recompute = FALSE") + ulabels <- .get_levels(curlabels) + built <- .build_index( + ref=curref, + labels=curlabels, + ulabels=ulabels, + test.genes=test.genes, + markers=markers, + BNPARAM=BNPARAM, + num.threads=num.threads + ) + + output[[l]] <- List( + built = built, + ref = curref, + labels = list(full = curlabels, unique = ulabels), + markers = list(full = markers, unique = rownames(curref)[get_ref_subset(built) + 1]), + options = list(BNPARAM = BNPARAM, test.genes = test.genes) + ) } - output <- mapply(FUN=.build_trained_index, ref=ref, labels=labels, markers=gene.info, - MoreArgs = list(aggr.ref=aggr.ref, aggr.args=aggr.args, BPPARAM=BPPARAM, approximate = approximate, num.threads = num.threads), - SIMPLIFY=FALSE) if (solo) { output[[1]] } else { - final <- List(output) - metadata(final)$recompute <- recompute - final + List(output) } } -.identify_genes <- function(ref, labels, genes="de", de.method="classic", de.n=NULL, de.args=list(), BPPARAM=BPPARAM) { +.identify_genes <- function(ref, labels, genes, de.method, de.n, test.genes, restrict, de.args, BPPARAM) { if (length(labels)!=ncol(ref)) { stop("number of labels must be equal to number of cells") } + # Note that the genes are reported as names rather than indexing, so these + # row-subsetting operations don't require re-indexing of the output. We use + # DelayedArrays to avoid making copies of the data. + if (!is.null(restrict)) { + ref <- DelayedArray(ref)[rownames(ref) %in% restrict,,drop=FALSE] + } + if (!is.null(test.genes)) { + ref <- DelayedArray(ref)[rownames(ref) %in% test.genes,,drop=FALSE] + } + if (.is_list(genes)) { is.char <- vapply(genes, is.character, TRUE) if (all(is.char)) { @@ -284,36 +323,8 @@ trainSingleR <- function( genes } -#' @importFrom S4Vectors List -#' @importFrom SummarizedExperiment assay -.build_trained_index <- function(ref, labels, markers, aggr.ref, aggr.args, search.info, approximate = FALSE, BPPARAM = SerialParam(), num.threads = 1) { - if (aggr.ref) { - aggr <- do.call(aggregateReference, c(list(ref=quote(ref), label=labels, check.missing=FALSE, BPPARAM=BPPARAM), aggr.args)) - ref <- assay(aggr) - labels <- aggr$label - } - - if (anyNA(labels)) { - keep <- !is.na(labels) - ref <- ref[,keep,drop=FALSE] - labels <- labels[keep] - } - - ulabels <- .get_levels(labels) - - built <- .build_index(ref, markers=markers, labels=labels, ulabels=ulabels, approximate=approximate, num.threads=num.threads) - - List( - built = built, - ref = ref, - labels = list(full = labels, unique = ulabels), - markers = list(full = markers, unique = rownames(ref)[get_subset(built) + 1]), - options = list(approximate = approximate) - ) -} - #' @importFrom beachmat initializeCpp -.build_index <- function(ref, markers, labels, ulabels, approximate, num.threads) { +.build_index <- function(ref, labels, ulabels, markers, test.genes, BNPARAM, num.threads) { for (m in seq_along(markers)) { current <- markers[[m]] for (n in seq_along(current)) { @@ -326,8 +337,25 @@ trainSingleR <- function( markers[[m]] <- current } + if (is.null(test.genes)) { + test.genes <- ref.genes <- seq_len(nrow(ref)) + } else { + universe <- union(test.genes, rownames(ref)) + test.genes <- match(test.genes, universe) + ref.genes <- match(rownames(ref), universe) + } + + builder <- defineBuilder(BNPARAM) parsed <- initializeCpp(ref) - prebuild(parsed, match(labels, ulabels) - 1L, markers, approximate = approximate, nthreads = num.threads) + train_single( + test_features=test.genes - 1L, + ref=parsed, + ref_features=ref.genes - 1L, + labels=match(labels, ulabels) - 1L, + markers=markers, + builder=builder, + nthreads=num.threads + ) } .get_levels <- function(labels) sort(unique(labels)) diff --git a/R/utils.R b/R/utils.R index 39ea03d..80bfdec 100644 --- a/R/utils.R +++ b/R/utils.R @@ -23,10 +23,12 @@ old <- getAutoBPPARAM() setAutoBPPARAM(BPPARAM) on.exit(setAutoBPPARAM(old)) - discard <- rowAnyNAs(DelayedArray(x)) + y <- DelayedArray(x) + discard <- rowAnyNAs(y) if (any(discard)) { - x <- x[!discard,,drop=FALSE] + # Returning a DelayedArray to avoid making an actual subset. + x <- y[!discard,,drop=FALSE] } } diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index e6bc6dc..264922e 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -2,6 +2,18 @@ \title{SingleR News} \encoding{UTF-8} +\section{Version 2.8.0}{\itemize{ +\item Added the \code{test.genes=} argument to \code{trainSingleR()}, to restrict marker detection to only those genes in the test dataset. +This is also checked against \code{rownames(test)} in \code{classifySingleR()} to ensure that the test's feature space is consistent with the space used during training. + +\item Restored the \code{BNPARAM=} argument in \code{trainSingleR()}, to enable more fine-grained specification of neighbor search algorithms. +The \code{approximate=} argument is deprecated. + +\item Soft-deprecated \code{check.missing=} in \code{classifySingleR()} and \code{combineRecomputedResults()}. +This is because any filtering will cause a mismatch between the row names of \code{tests} and the \code{test.genes} in \code{trained}. +Rather, filtering should be done prior to \code{trainSingleR()}, as is done in the main \code{SingleR()} function. +}} + \section{Version 2.0.0}{\itemize{ \item The format of the output of \code{trainSingleR()} has changed and is no longer back-compatible. \item \code{recompute=FALSE} in \code{trainSingleR()} does nothing; all integrated analyses are now done with \code{recompute=TRUE}. diff --git a/inst/include/singlepp/BasicBuilder.hpp b/inst/include/singlepp/BasicBuilder.hpp deleted file mode 100644 index 111f9bf..0000000 --- a/inst/include/singlepp/BasicBuilder.hpp +++ /dev/null @@ -1,317 +0,0 @@ -#ifndef SINGLEPP_BASIC_BUILDER_HPP -#define SINGLEPP_BASIC_BUILDER_HPP - -#include "macros.hpp" - -#include "knncolle/knncolle.hpp" -#include "tatami/tatami.hpp" - -#include "build_indices.hpp" -#include "process_features.hpp" - -#include - -/** - * @file BasicBuilder.hpp - * - * @brief Defines the `BasicBuilder` class. - */ - -namespace singlepp { - -/** - * @brief Construct a single pre-built reference for further classification. - * - * This class prepares a single labelled reference dataset for classification of a test dataset. - * We select the top marker genes to use for each pairwise comparison between labels, - * and we use them to pre-compute the ranks in advance of computing the Spearman correlations. - * The pre-built reference can then be used in `BasicScorer::run()`. - */ -class BasicBuilder { -public: - /** - * @brief Default parameters for reference building. - */ - struct Defaults { - /** - * See `set_top()` for details. - */ - static constexpr int top = -1; - - /** - * See `set_approximate()` for details. - */ - static constexpr bool approximate = false; - - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - -private: - int top = Defaults::top; - bool approximate = Defaults::approximate; - int nthreads = Defaults::num_threads; - -public: - /** - * @param t Number of top markers to use from each pairwise comparison between labels, see `Classifier::set_top()`. - * - * @return A reference to this `BasicBuilder` object. - */ - BasicBuilder& set_top(int t = Defaults::top) { - top = t; - return *this; - } - - /** - * @param a Whether to use an approximate method to quickly find the quantile, see `Classifier::set_approximate()`. - * - * @return A reference to this `BasicBuilder` object. - */ - BasicBuilder& set_approximate(bool a = Defaults::approximate) { - approximate = a; - return *this; - } - - /** - * @param n Number of threads to use. - * - * @return A reference to this `BasicBuilder` object. - */ - BasicBuilder& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -private: - std::vector build_internal(const tatami::Matrix* ref, const int* labels, const std::vector& subset) const { - std::vector subref; - if (approximate) { - subref = build_indices( - ref, - labels, - subset, - [](size_t nr, size_t nc, const double* ptr) { - return std::shared_ptr >(new knncolle::AnnoyEuclidean(nr, nc, ptr)); - }, - nthreads - ); - } else { - subref = build_indices( - ref, - labels, - subset, - [](size_t nr, size_t nc, const double* ptr) { - return std::shared_ptr >(new knncolle::KmknnEuclidean(nr, nc, ptr)); - }, - nthreads - ); - } - return subref; - } - -public: - /** - * @brief Prebuilt reference that can be directly used for annotation. - */ - struct Prebuilt { - /** - * @cond - */ - Prebuilt(Markers m, std::vector s, std::vector r) : - markers(std::move(m)), subset(std::move(s)), references(std::move(r)) {} - /** - * @endcond - */ - - /** - * A vector of vectors of ranked marker genes to be used in the classification. - * Values are indices into the `subset` vector. - * The set of marker genes is typically a subset of those in the input `markers` in `build()`. - */ - Markers markers; - - /** - * The subset of features in the test/reference datasets that were used in the classification. - * Values are row indices into the relevant matrices. - */ - std::vector subset; - - /** - * @return Number of labels in this reference. - */ - size_t num_labels() const { - return references.size(); - } - - /** - * @return Number of profiles in this reference. - */ - size_t num_profiles() const { - size_t n = 0; - for (const auto& ref : references) { - n += ref.ranked.size(); - } - return n; - } - - /** - * @cond - */ - std::vector references; - /** - * @endcond - */ - }; - - /** - * @param ref Matrix for the reference expression profiles. - * Rows are genes while columns are samples. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `Prebuilt` instance that can be used in `run()` for annotation of a test dataset. - */ - Prebuilt run(const tatami::Matrix* ref, const int* labels, Markers markers) const { - auto subset = subset_markers(markers, top); - auto subref = build_internal(ref, labels, subset); - return Prebuilt(std::move(markers), std::move(subset), std::move(subref)); - } - -public: - /** - * @brief Prebuilt reference requiring an intersection of features. - */ - struct PrebuiltIntersection { - /** - * @cond - */ - PrebuiltIntersection(Markers m, std::vector mats, std::vector refs, std::vector r) : - markers(std::move(m)), mat_subset(std::move(mats)), ref_subset(std::move(refs)), references(std::move(r)) {} - /** - * @endcond - */ - - /** - * A vector of vectors of ranked marker genes to be used in the classification. - * Values are indices into the `mat_subset` and `ref_subset` vectors for the respective matrices. - * The set of marker genes is typically a subset of those in the input `markers` in `build()`. - */ - Markers markers; - - /** - * Row indices of test dataset, specifying the features in the intersection. - * This has the same length as `ref_subset`, where corresponding entries refer to the same features in the respective datasets. - */ - std::vector mat_subset; - - /** - * Row indices of reference dataset, specifying the features in the intersection. - * This has the same length as `mat_subset`, where corresponding entries refer to the same features in the respective datasets. - */ - std::vector ref_subset; - - /** - * @return Number of labels in this reference. - */ - size_t num_labels() const { - return references.size(); - } - - /** - * @return Number of profiles in this reference. - */ - size_t num_profiles() const { - size_t n = 0; - for (const auto& ref : references) { - n += ref.ranked.size(); - } - return n; - } - - /** - * @cond - */ - std::vector references; - /** - * @endcond - */ - }; - - /** - * @param intersection Vector defining the intersection of features betweent the test and reference datasets. - * Each entry is a pair where the first element is the row index in the test matrix, - * and the second element is the row index for the corresponding feature in the reference matrix. - * Each row index for either matrix should occur no more than once in `intersection`. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `PrebuiltIntersection` instance that can be used in `run()` for annotation of a test dataset with the same order of genes as specified in `mat_id`. - * - * This method deals with the case where the genes are not in the same order and number across the test and reference datasets. - * It finds the intersection of genes and then prepares the references accordingly. - */ - PrebuiltIntersection run( - std::vector > intersection, - const tatami::Matrix* ref, - const int* labels, - Markers markers) - const { - // Sorting it if it wasn't already. - for (size_t i = 1, end = intersection.size(); i < end; ++i) { - if (intersection[i] < intersection[i-1]) { - std::sort(intersection.begin(), intersection.end()); - break; - } - } - - subset_markers(intersection, markers, top); - auto pairs = unzip(intersection); - auto subref = build_internal(ref, labels, pairs.second); - return PrebuiltIntersection(std::move(markers), std::move(pairs.first), std::move(pairs.second), std::move(subref)); - } - - /** - * @tparam Id Gene identifier for each row. - * - * @param mat_nrow Number of rows (genes) in the test dataset. - * @param[in] mat_id Pointer to an array of identifiers of length equal to `mat_nrow`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * If any duplicate IDs are present, only the first occurrence is used. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * If any duplicate IDs are present, only the first occurrence is used. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `PrebuiltIntersection` instance that can be used in `run()` for annotation of a test dataset with the same order of genes as specified in `mat_id`. - * - * This method deals with the case where the genes are not in the same order and number across the test and reference datasets. - * It finds the intersection of genes and then prepares the references accordingly. - */ - template - PrebuiltIntersection run( - size_t mat_nrow, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - Markers markers) - const { - auto intersection = intersect_features(mat_nrow, mat_id, ref->nrow(), ref_id); - return run(std::move(intersection), ref, labels, std::move(markers)); - } -}; - -} - -#endif diff --git a/inst/include/singlepp/BasicScorer.hpp b/inst/include/singlepp/BasicScorer.hpp deleted file mode 100644 index af4db05..0000000 --- a/inst/include/singlepp/BasicScorer.hpp +++ /dev/null @@ -1,292 +0,0 @@ -#ifndef SINGLEPP_BASIC_SCORER_HPP -#define SINGLEPP_BASIC_SCORER_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "annotate_cells.hpp" -#include "process_features.hpp" -#include "BasicBuilder.hpp" - -#include -#include - -/** - * @file BasicScorer.hpp - * - * @brief Defines the `BasicScorer` class. - */ - -namespace singlepp { - -/** - * @brief Classify cells from a single pre-built reference dataset. - * - * This class uses the pre-built reference from `BasicBuilder` to classify each cell in a test dataset. - * The algorithm and parameters are the same as described for the `Classifier` class; - * in fact, `Classifier` just calls `BasicBuilder::run()` and then `BasicScorer::run()`. - * - * It is occasionally useful to call these two functions separately if the same reference dataset is to be used multiple times, - * e.g., on different test datasets or with different parameters. - * In such cases, we can save time by avoiding redundant builds; - * we just have to call `BasicScorer::run()` in all subsequent uses of the pre-built reference. - */ -class BasicScorer { -public: - /** - * @brief Default parameters for classification. - */ - struct Defaults { - /** - * See `set_quantile()` for details. - */ - static constexpr double quantile = 0.8; - - /** - * See `set_fine_tune_threshold()` for details. - */ - static constexpr double fine_tune_threshold = 0.05; - - /** - * See `set_fine_tune()` for details. - */ - static constexpr bool fine_tune = true; - - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - -private: - double quantile = Defaults::quantile; - double fine_tune_threshold = Defaults::fine_tune_threshold; - bool fine_tune = Defaults::fine_tune; - int nthreads = Defaults::num_threads; - -public: - /** - * @param q Quantile to use to compute a per-label score from the correlations, see `Classifier::set_quantile()`. - * - * @return A reference to this `BasicScorer` object. - */ - BasicScorer& set_quantile(double q = Defaults::quantile) { - quantile = q; - return *this; - } - - /** - * @param t Threshold to use to select the top-scoring subset of labels during fine-tuning, see `Classifier::set_fine_tune_threshold()`. - * - * @return A reference to this `BasicScorer` object. - */ - BasicScorer& set_fine_tune_threshold(double t = Defaults::fine_tune_threshold) { - fine_tune_threshold = t; - return *this; - } - - /** - * @param f Whether to perform fine-tuning, see `Classifier::set_fine_tune()`. - * - * @return A reference to this `BasicScorer` object. - */ - BasicScorer& set_fine_tune(bool f = Defaults::fine_tune) { - fine_tune = f; - return *this; - } - - /** - * @param n Number of threads to use. - * - * @return A reference to this `BasicScorer` object. - */ - BasicScorer& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -public: - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * This may have a different ordering of genes compared to the reference matrix used to create `built`, - * provided that all genes corresponding to `built.subset` are present. - * @param built An object produced by `BasicBuilder::build()`. - * @param[in] mat_subset Pointer to an array of length equal to that of `built.subset`, - * containing the index of the row of `mat` corresponding to each gene in `built.subset`. - * That is, row `mat_subset[i]` in `mat` should be the same gene as row `built.subset[i]` in the reference matrix. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers to arrays of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for each label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run(const tatami::Matrix* mat, const BasicBuilder::Prebuilt& built, const int* mat_subset, int* best, std::vector& scores, double* delta) const { - annotate_cells_simple( - mat, - built.subset.size(), - mat_subset, - built.references, - built.markers, - quantile, - fine_tune, - fine_tune_threshold, - best, - scores, - delta, - nthreads - ); - return; - } - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * This should have the same order and identity of genes as the reference matrix used to create `built`. - * @param built An object produced by `BasicBuilder::build()`. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers to arrays of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for each label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run(const tatami::Matrix* mat, const BasicBuilder::Prebuilt& built, int* best, std::vector& scores, double* delta) const { - run(mat, built, built.subset.data(), best, scores, delta); - return; - } - -public: - /** - * @brief Automated classification results. - */ - struct Results { - /** - * @cond - */ - Results(size_t ncells, size_t nlabels) : best(ncells), scores(nlabels, std::vector(ncells)), delta(ncells) {} - - std::vector scores_to_pointers() { - std::vector output(scores.size()); - for (size_t s = 0; s < scores.size(); ++s) { - output[s] = scores[s].data(); - } - return output; - }; - /** - * @endcond - */ - - /** - * Vector of length equal to the number of cells in the test dataset, - * containing the index of the assigned label for each cell. - */ - std::vector best; - - /** - * Vector of length equal to the number of labels, - * containing vectors of length equal to the number of cells in the test dataset. - * Each vector corresponds to a label and contains the (non-fine-tuned) score for each cell. - */ - std::vector > scores; - - /** - * Vector of length equal to the number of cells in the test dataset. - * This contains the difference between the highest and second-highest scores for each cell, possibly after fine-tuning. - */ - std::vector delta; - }; - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * Each row should correspond to an element of `Prebuilt::subset`. - * @param built An object produced by `BasicBuilder::build()`. - * - * @return A `Results` object containing the assigned labels and scores. - */ - Results run(const tatami::Matrix* mat, const BasicBuilder::Prebuilt& built) const { - size_t nlabels = built.references.size(); - Results output(mat->ncol(), nlabels); - auto scores = output.scores_to_pointers(); - run(mat, built, output.best.data(), scores, output.delta.data()); - return output; - } - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * This may have a different ordering of genes compared to the reference matrix used in `build()`, - * provided that all genes corresponding to `Prebuilt::subset` are present. - * @param built An object produced by `BasicBuilder::build()`. - * @param[in] mat_subset Pointer to an array of length equal to that of `Prebuilt::subset`, - * containing the index of the row of `mat` corresponding to each gene in `Prebuilt::subset`. - * - * @return A `Results` object containing the assigned labels and scores. - */ - Results run(const tatami::Matrix* mat, const BasicBuilder::Prebuilt& built, const int* mat_subset) const { - size_t nlabels = built.references.size(); - Results output(mat->ncol(), nlabels); - auto scores = output.scores_to_pointers(); - run(mat, built, mat_subset, output.best.data(), scores, output.delta.data()); - return output; - } - -public: - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param built An object produced by `build()` with intersections. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers to arrays of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for each label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, tkkhis is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run( - const tatami::Matrix* mat, - const BasicBuilder::PrebuiltIntersection& built, - int* best, - std::vector& scores, - double* delta) - const { - annotate_cells_simple(mat, - built.mat_subset.size(), - built.mat_subset.data(), - built.references, - built.markers, - quantile, - fine_tune, - fine_tune_threshold, - best, - scores, - delta, - nthreads - ); - return; - } - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param built An object produced by `build()` with intersections. - * - * @return A `Results` object containing the assigned labels and scores. - */ - Results run(const tatami::Matrix* mat, const BasicBuilder::PrebuiltIntersection& built) const { - size_t nlabels = built.references.size(); - Results output(mat->ncol(), nlabels); - auto scores = output.scores_to_pointers(); - - run(mat, built, output.best.data(), scores, output.delta.data()); - return output; - } -}; - -} - - -#endif diff --git a/inst/include/singlepp/ChooseClassicMarkers.hpp b/inst/include/singlepp/ChooseClassicMarkers.hpp deleted file mode 100644 index c84feea..0000000 --- a/inst/include/singlepp/ChooseClassicMarkers.hpp +++ /dev/null @@ -1,292 +0,0 @@ -#ifndef SINGLEPP_CHOOSE_CLASSIC_MARKERS_HPP -#define SINGLEPP_CHOOSE_CLASSIC_MARKERS_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "Markers.hpp" - -#include -#include -#include - -/** - * @file ChooseClassicMarkers.hpp - * - * @brief Classic method for choosing markers. - */ - -namespace singlepp { - -/** - * @brief Classic method for choosing markers. - * - * This class implements the classic **SingleR** method for choosing markers from (typically bulk) reference dataasets. - * We assume that we have a matrix of representative log-expression profiles for each label, typically computed by taking some average across all reference profiles for that label. - * For the comparison between labels A and B, we define the marker set as the top genes with the largest positive differences in A's profile over B (i.e., the log-fold change). - * The number of top genes can either be explicitly specified or it can be automatically determined from the number of labels. - * - * If multiple references are present, we compute the sum of the log-fold changes across all references with both labels A and B. - * The ordering of the top genes is then performed using the sum across references. - * It is assumed that all references have the same number and ordering of features in their rows. - */ -class ChooseClassicMarkers { -public: - /** - * @brief Default parameter settings. - */ - struct Defaults { - /** - * See `set_number()` for more details. - */ - static constexpr int number = -1; - - /** - * See `set_num_threads()` for more details. - */ - static constexpr int num_threads = 1; - }; - -private: - int number = Defaults::number; - int nthreads = Defaults::num_threads; - -public: - /** - * @param n Number of top genes to use as the marker set in each pairwise comparison. - * If -1, this is automatically determined from the number of labels, see `number_of_markers()`. - * - * @return A reference to this `ChooseClassicMarkers` object. - */ - ChooseClassicMarkers& set_number(int n = Defaults::number) { - number = n; - return *this; - } - - /** - * @param nlabels Number of labels in the reference(s). - * - * @return An appropriate number of markers for each pairwise comparison. - * - * The exact expression is defined as \f$500 (\frac{2}{3})^{\log_2{N}}\f$ for \f$N\f$ labels, - * which steadily decreases the markers per comparison as the number of labels increases. - * This aims to avoid an excessive number of features when dealing with references with many labels. - */ - static int number_of_markers(int nlabels) { - return std::round(500.0 * std::pow(2.0/3.0, std::log(static_cast(nlabels)) / std::log(2.0))); - } - - /** - * @param n Number of threads to use. - * - * @return A reference to this `ChooseClassicMarkers` object. - */ - ChooseClassicMarkers& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -public: - /** - * @tparam Matrix A **tatami** matrix. - * @tparam Label Integer type for the label identity. - * - * @param representatives Vector of representative matrices. - * Each matrix should contain one column per label; each column should have a representative log-expression profile for that label. - * All matrices should have the same number of rows, corresponding to the same features. - * @param labels Vector of pointers of length equal to `representatives`. - * Each array should be of length equal to the number of columns of the corresponding entry of `representatives`. - * Each value of the array should specify the label for the corresponding column in its matrix. - * Values should lie in $[0, N)$ for $N$ unique labels across all entries of `labels`. - * - * @return A `Markers` object containing the top markers for each pairwise comparison between labels. - */ - template - Markers run(const std::vector& representatives, const std::vector& labels) const { - /** - * @cond - */ - size_t nrefs = representatives.size(); - if (nrefs != labels.size()) { - throw std::runtime_error("'representatives' and 'labels' should have the same length"); - } - if (nrefs == 0) { - throw std::runtime_error("'representatives' should contain at least one entry"); - } - size_t ngenes = representatives.front()->nrow(); - - // Determining the total number of labels. - int nlabels = 0; - for (size_t r = 0; r < nrefs; ++r) { - size_t ncols = representatives[r]->ncol(); - auto curlab = labels[r]; - for (size_t c = 0; c < ncols; ++c) { - if (nlabels <= curlab[c]) { - nlabels = curlab[c] + 1; - } - } - } - - // Generating mappings. - std::vector > labels_to_index(nrefs, std::vector(nlabels, -1)); - for (size_t r = 0; r < nrefs; ++r) { - size_t ncols = representatives[r]->ncol(); - auto curlab = labels[r]; - auto& current = labels_to_index[r]; - for (size_t c = 0; c < ncols; ++c) { - auto& dest = current[curlab[c]]; - if (dest != -1) { - throw std::runtime_error("each label should correspond to no more than one column in each reference"); - } - current[curlab[c]] = c; - } - } - - // Generating pairs for compute; this sacrifices some memory for convenience. - std::vector > pairs; - { - std::set > pairs0; - for (size_t r = 0; r < nrefs; ++r) { - size_t ncols = representatives[r]->ncol(); - auto curlab = labels[r]; - for (size_t c1 = 0; c1 < ncols; ++c1) { - for (size_t c2 = 0; c2 < c1; ++c2) { - pairs0.emplace(curlab[c1], curlab[c2]); - } - } - } - pairs.insert(pairs.end(), pairs0.begin(), pairs0.end()); - std::sort(pairs.begin(), pairs.end()); - } - size_t npairs = pairs.size(); - - Markers output(nlabels, std::vector >(nlabels)); - - int actual_number = number; - if (number < 0) { - actual_number = number_of_markers(nlabels); - } - if (actual_number > static_cast(ngenes)) { - actual_number = ngenes; - } - -#ifndef SINGLEPP_CUSTOM_PARALLEL - #pragma omp parallel num_threads(nthreads) - { -#else - SINGLEPP_CUSTOM_PARALLEL([&](int, size_t start, size_t len) -> void { -#endif - - std::vector > sorter(ngenes), sorted_copy(ngenes); - typedef typename Matrix::value_type Value_; - typedef typename Matrix::index_type Index_; - std::vector rbuffer(ngenes), lbuffer(ngenes); - std::vector > > rworks(nrefs), lworks(nrefs); - -#ifndef SINGLEPP_CUSTOM_PARALLEL - #pragma omp for - for (size_t p = 0; p < npairs; ++p) { -#else - for (size_t p = start, end = start + len; p < end; ++p) { -#endif - - auto curleft = pairs[p].first; - auto curright = pairs[p].second; - - auto sIt = sorter.begin(); - for (int g = 0; g < ngenes; ++g, ++sIt) { - sIt->first = 0; - sIt->second = g; - } - - for (size_t i = 0; i < nrefs; ++i) { - const auto& curavail = labels_to_index[i]; - auto lcol = curavail[curleft]; - auto rcol = curavail[curright]; - if (lcol == -1 || rcol == -1) { - continue; - } - - // Initialize extractors as needed. - auto& lwrk = lworks[i]; - if (!lwrk) { - lwrk = representatives[i]->dense_column(); - } - auto lptr = lwrk->fetch(lcol, lbuffer.data()); - - auto& rwrk = rworks[i]; - if (!rwrk) { - rwrk = representatives[i]->dense_column(); - } - auto rptr = rwrk->fetch(rcol, rbuffer.data()); - - auto sIt = sorter.begin(); - for (int g = 0; g < ngenes; ++g, ++lptr, ++rptr, ++sIt) { - sIt->first += *lptr - *rptr; - } - } - - for (int flip = 0; flip < 2; ++flip) { - // Flipping the log-fold changes so we do the reverse. - // We do a copy so that the treatment of tries would be the same as if - // we had sorted on the reversed log-fold changes in the first place; - // otherwise the first sort might change the order of ties. - if (flip) { - sorter = sorted_copy; - for (auto& s : sorter) { - s.first *= -1; - } - } else { - sorted_copy = sorter; - } - - // partial sort is guaranteed to be stable due to the second index resolving ties. - std::partial_sort(sorter.begin(), sorter.begin() + actual_number, sorter.end()); - - std::vector stuff; - stuff.reserve(actual_number); - for (int g = 0; g < actual_number && sorter[g].first < 0; ++g) { // only keeping those with positive log-fold changes (negative after reversing). - stuff.push_back(sorter[g].second); - } - - if (flip) { - output[curleft][curright] = std::move(stuff); - } else { - output[curright][curleft] = std::move(stuff); - } - } - } - -#ifndef SINGLEPP_CUSTOM_PARALLEL - } -#else - }, npairs, nthreads); -#endif - /** - * @endcond - */ - - return output; - } - - /** - * @tparam Matrix A **tatami** matrix. - * - * @param representative A representative matrix, containing one column per label. - * Each column should have a representative log-expression profile for that label. - * @param labels Pointer to an array of length equal to the number of columns in `representative`. - * Each value of the array should specify the label for the corresponding column. - * Values should lie in $[0, N)$ for $N$ unique labels. - * - * @return A `Markers` object containing the top markers for each pairwise comparison between labels. - */ - template - Markers run(const Matrix* representative, const int* labels) const { - return run(std::vector{ representative }, std::vector{ labels }); - } -}; - -} - -#endif diff --git a/inst/include/singlepp/Classifier.hpp b/inst/include/singlepp/Classifier.hpp deleted file mode 100644 index 8d784fc..0000000 --- a/inst/include/singlepp/Classifier.hpp +++ /dev/null @@ -1,312 +0,0 @@ -#ifndef SINGLEPP_CLASSIFIER_HPP -#define SINGLEPP_CLASSIFIER_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "BasicBuilder.hpp" -#include "BasicScorer.hpp" - -#include -#include - -/** - * @file Classifier.hpp - * - * @brief Defines the `Classifier` class. - */ - -namespace singlepp { - -/** - * @brief Automatically assign cell type labels based on an expression matrix. - * - * This implements the [**SingleR**](https://bioconductor.org/packages/SingleR) algorithm for automated annotation of single-cell RNA-seq data. - * For each cell, we compute the Spearman rank correlation between that cell and the reference expression profiles. - * This is done using only the subset of genes that are label-specific markers, - * most typically the top genes from pairwise comparisons between each label's expression profiles. - * For each label, we take the correlations involving that label's reference profiles and convert it into a score. - * The label with the highest score is used as an initial label for that cell. - * - * For each cell, we apply fine-tuning iterations to improve the label accuracy by refining the feature space. - * At each iteration, we find the subset of labels with scores that are close to the maximum score according to some threshold. - * We recompute the scores based on the markers for this label subset, and we repeat the process until only one label is left in the subset or the subset is unchanged. - * At the end of the iterations, the label with the highest score (or the only label, if just one is left) is used as the label for the cell. - * This process aims to remove noise by eliminating irrelevant genes when attempting to distinguish closely related labels. - * - * Each label's score is defined as a user-specified quantile of the distribution of correlations across all reference profiles assigned to that label. - * (We typically consider a large quantile, e.g., the 80% percentile of the correlations.) - * The use of a quantile avoids problems with differences in the number of reference profiles per label; - * in contrast, just using the "top X correlations" would implicitly favor labels with more reference profiles. - * - * The choice of Spearman's correlation provides some robustness against batch effects when comparing reference and test datasets. - * Only the relative expression _within_ each cell needs to be comparable, not their relative expression across cells. - * As a result, it does not matter whether raw counts are supplied or log-transformed expression values, as the latter is a monotonic transformation of the latter (within each cell). - * The algorithm is also robust to differences in technologies between reference and test profiles, though it is preferable to have like-for-like comparisons. - * - * @see - * Aran D et al. (2019). - * Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage. - * _Nat. Immunol._ 20, 163-172 - */ -class Classifier { -public: - /** - * @brief Default parameters for annotation. - */ - struct Defaults { - /** - * See `set_quantile()` for details. - */ - static constexpr double quantile = BasicScorer::Defaults::quantile; - - /** - * See `set_fine_tune_threshold()` for details. - */ - static constexpr double fine_tune_threshold = BasicScorer::Defaults::fine_tune_threshold; - - /** - * See `set_fine_tune()` for details. - */ - static constexpr bool fine_tune = BasicScorer::Defaults::fine_tune; - - /** - * See `set_top()` for details. - */ - static constexpr int top = BasicBuilder::Defaults::top; - - /** - * See `set_approximate()` for details. - */ - static constexpr bool approximate = BasicBuilder::Defaults::approximate; - - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - -private: - int top = Defaults::top; - bool approximate = Defaults::approximate; - double quantile = Defaults::quantile; - double fine_tune_threshold = Defaults::fine_tune_threshold; - bool fine_tune = Defaults::fine_tune; - int nthreads = Defaults::num_threads; - -public: - /** - * @param q Quantile to use to compute a per-label score from the correlations. - * - * @return A reference to this `Classifier` object. - * - * Values of `q` closer to 0.5 focus on the behavior of the majority of a label's reference profiles. - * Smaller values will be more sensitive to the presence of a subset of profiles that are more similar to the test cell, - * which can be useful when the reference profiles themselves are heterogeneous. - */ - Classifier& set_quantile(double q = Defaults::quantile) { - quantile = q; - return *this; - } - - /** - * @param t Threshold to use to select the top-scoring subset of labels during fine-tuning. - * Larger values increase the chance of recovering the correct label at the cost of computational time. - * - * @return A reference to this `Classifier` object. - * - * Needless to say, one should not set `t` to a value that is too large. - * Otherwise, the first fine-tuning iteration would just contain all labels and there would be no reduction of the marker space. - */ - Classifier& set_fine_tune_threshold(double t = Defaults::fine_tune_threshold) { - fine_tune_threshold = t; - return *this; - } - - /** - * @param f Whether to perform fine-tuning. - * This can be disabled to improve speed at the cost of accuracy. - * - * @return A reference to this `Classifier` object. - */ - Classifier& set_fine_tune(bool f = Defaults::fine_tune) { - fine_tune = f; - return *this; - } - - /** - * @param t Number of top markers to use from each pairwise comparison between labels. - * Larger values improve the stability of the correlations at the cost of increasing noise and computational work. - * - * Setting it to a negative value will instruct `run()` to use all supplied markers. - * This is useful in situations where the supplied markers have already been curated. - * - * @return A reference to this `Classifier` object. - */ - Classifier& set_top(int t = Defaults::top) { - top = t; - return *this; - } - - /** - * @param a Whether to use an approximate method to quickly find the quantile. - * This sacrifices some accuracy for speed when labels have many reference profiles. - * - * @return A reference to this `Classifier` object. - */ - Classifier& set_approximate(bool a = Defaults::approximate) { - approximate = a; - return *this; - } - - /** - * @param n Number of threads to use. - * - * @return A reference to this `Classifier` object. - */ - Classifier& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -private: - BasicBuilder::Prebuilt build_reference(const tatami::Matrix* ref, const int* labels, Markers markers) const { - BasicBuilder builder; - builder - .set_top(top) - .set_approximate(approximate) - .set_num_threads(nthreads); - return builder.run(ref, labels, std::move(markers)); - } - - template - BasicBuilder::PrebuiltIntersection build_reference(size_t mat_nrow, const Id* mat_id, const tatami::Matrix* ref, const Id* ref_id, const int* labels, Markers markers) const { - BasicBuilder builder; - builder - .set_top(top) - .set_approximate(approximate) - .set_num_threads(nthreads); - return builder.run(mat_nrow, mat_id, ref, ref_id, labels, std::move(markers)); - } - - BasicScorer set_up_scorer() const { - BasicScorer scorer; - scorer - .set_quantile(quantile) - .set_fine_tune(fine_tune) - .set_fine_tune_threshold(fine_tune_threshold) - .set_num_threads(nthreads); - return scorer; - } - -public: - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param ref An expression matrix for the reference expression profiles. - * This should have non-zero columns and the same number of rows (i.e., genes) at `mat`. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers of length equal to the number of labels. - * Each pointer should point to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for that label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run(const tatami::Matrix* mat, const tatami::Matrix* ref, const int* labels, Markers markers, int* best, std::vector& scores, double* delta) const { - auto prebuilt = build_reference(ref, labels, std::move(markers)); - set_up_scorer().run(mat, prebuilt, best, scores, delta); - return; - } - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param ref An expression matrix for the reference expression profiles. - * This should have non-zero columns and the same number of rows (i.e., genes) at `mat`. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `BasicScorer::Results` object containing the assigned labels and scores. - */ - BasicScorer::Results run(const tatami::Matrix* mat, const tatami::Matrix* ref, const int* labels, Markers markers) const { - auto prebuilt = build_reference(ref, labels, std::move(markers)); - return set_up_scorer().run(mat, prebuilt); - } - -public: - /** - * @tparam Id Gene identifier for each row. - * - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param[in] mat_id Pointer to an array of identifiers of length equal to the number of rows of `mat`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers of length equal to the number of labels. - * Each pointer should point to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for that label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - * - * This version of `run()` applies an intersection to find the common genes between `mat` and `ref`, based on their shared values in `mat_id` and `ref_id`. - * The annotation is then performed using only the subset of common genes. - * The aim is to easily accommodate differences in feature annotation between the test and reference profiles. - */ - template - void run( - const tatami::Matrix* mat, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - Markers markers, - int* best, - std::vector& scores, - double* delta) - const { - auto built = build_reference(mat->nrow(), mat_id, ref, ref_id, labels, std::move(markers)); - set_up_scorer().run(mat, built, best, scores, delta); - return; - } - - /** - * @tparam Id Gene identifier for each row. - * - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param[in] mat_id Pointer to an array of identifiers of length equal to the number of rows of `mat`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `BasicScorer::Results` object containing the assigned labels and scores. - */ - template - BasicScorer::Results run(const tatami::Matrix* mat, const Id* mat_id, const tatami::Matrix* ref, const Id* ref_id, const int* labels, Markers markers) const { - auto built = build_reference(mat->nrow(), mat_id, ref, ref_id, labels, std::move(markers)); - return set_up_scorer().run(mat, built); - } -}; - -} - -#endif diff --git a/inst/include/singlepp/IntegratedBuilder.hpp b/inst/include/singlepp/IntegratedBuilder.hpp deleted file mode 100644 index 1ba3509..0000000 --- a/inst/include/singlepp/IntegratedBuilder.hpp +++ /dev/null @@ -1,578 +0,0 @@ -#ifndef SINGLEPP_INTEGRATED_BUILDER_HPP -#define SINGLEPP_INTEGRATED_BUILDER_HPP - -#include "macros.hpp" - -#include "scaled_ranks.hpp" -#include "BasicBuilder.hpp" - -#include -#include -#include -#include - -/** - * @file IntegratedBuilder.hpp - * - * @brief Prepare for integrated classification across references. - */ - -namespace singlepp { - -/** - * @brief Reference datasets prepared for integrated classification. - */ -struct IntegratedReferences { - /** - * @return Number of reference datasets. - * Each object corresponds to the reference used in an `IntegratedBuilder::add()` call, in the same order. - */ - size_t num_references() const { - return markers.size(); - } - - /** - * @param r Reference dataset of interest. - * @return Number of labels in this reference. - */ - size_t num_labels(size_t r) const { - return markers[r].size(); - } - - /** - * @param r Reference dataset of interest. - * @return Number of profiles in this reference. - */ - size_t num_profiles(size_t r) const { - size_t n = 0; - for (const auto& ref : ranked[r]) { - n += ref.size(); - } - return n; - } - - /** - * @cond - */ - std::vector universe; // To be used by IntegratedScorer for indexed extraction. - - std::vector check_availability; - std::vector > available; // indices to 'universe' - std::vector > > markers; // indices to 'universe' - std::vector > > > ranked; // .second contains indices to 'universe' - - void resize(size_t n) { - check_availability.resize(n); - available.resize(n); - markers.resize(n); - ranked.resize(n); - } - /** - * @endcond - */ -}; - -/** - * @brief Factory to prepare multiple references for integrated classification. - * - * For each reference dataset, we expect a `BasicBuilder::Prebuilt` or `BasicBuilder::PrebuiltIntersection` object, - * as well as the original data structures (matrix, labels, etc.) used to construct that object. - * These values are passed into `add()` to register that dataset, which can be repeated multiple times for different references. - * Finally, calling `finish()` will return a vector of integrated references that can be used in `IntegratedScorer::run()`. - * - * The preparation process mostly involves checking that the gene indices are consistent across references. - * This is especially true when each reference contains a different set of features that must be intersected with the features in the test dataset. - * See the documentation for `IntegratedScorer` for more details on the classification based on the integrated references. - */ -class IntegratedBuilder { -private: - std::vector*> stored_matrices; - std::vector stored_labels; - IntegratedReferences references; - std::vector > gene_mapping; - int nthreads = Defaults::num_threads; - -public: - /** - * @brief Default parameters. - */ - struct Defaults { - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - - /** - * @param n Number of threads to use. - * - * @return A reference to this `IntegratedBuilder` object. - */ - IntegratedBuilder& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -private: - void add_internal_base(const tatami::Matrix* ref, const int* labels) { - stored_matrices.push_back(ref); - stored_labels.push_back(labels); - references.resize(stored_matrices.size()); - gene_mapping.resize(stored_matrices.size()); - } - - // Adding a reference without requiring any pruning of markers. - template - void add_internal_direct(const tatami::Matrix* ref, const int* labels, const Markers& old_markers, const Subset& mat_subset) { - add_internal_base(ref, labels); - - // Adding the markers for each label, indexed according to their - // position in the test matrix. This assumes that 'mat_subset' is - // appropriately specified to contain the test's row indices. - auto& new_markers = references.markers.back(); - new_markers.reserve(old_markers.size()); - - for (size_t i = 0; i < old_markers.size(); ++i) { - const auto& cur_old_markers = old_markers[i]; - - std::unordered_set unified; - for (const auto& x : cur_old_markers) { - unified.insert(x.begin(), x.end()); - } - - new_markers.emplace_back(unified.begin(), unified.end()); - - if constexpr(!std::is_same::value) { - auto& cur_new_markers = new_markers.back(); - for (auto& y : cur_new_markers) { - y = mat_subset[y]; - } - } - } - return; - } - - // Adding a reference with different features from the test, requiring some - // pruning to remove markers that are not present in the intersection. - template - void add_internal_intersect( - const std::vector >& intersection, - const tatami::Matrix* ref, - const int* labels, - const Markers& old_markers, - const Subset& ref_subset) - { - add_internal_base(ref, labels); - - // Manually constructing the markers. This involves (i) pruning out the - // markers that aren't present in the intersection, and (ii) updating - // their indices so that they point to rows of 'mat', not 'ref'. - std::unordered_map reverse_map; - for (const auto& i : intersection) { - reverse_map[i.second] = i.first; - } - - auto subindex = [&](int i) -> int { - if constexpr(!std::is_same::value) { - return ref_subset[i]; - } else { - return i; - } - }; - - auto& new_markers = references.markers.back(); - new_markers.resize(old_markers.size()); - - for (size_t i = 0; i < old_markers.size(); ++i) { - const auto& cur_old_markers = old_markers[i]; - - std::unordered_set unified; - for (const auto& x : cur_old_markers) { - unified.insert(x.begin(), x.end()); - } - - auto& cur_new_markers = new_markers[i]; - cur_new_markers.reserve(unified.size()); - - for (auto y : unified) { - auto it = reverse_map.find(subindex(y)); - if (it != reverse_map.end()) { - cur_new_markers.push_back(it->second); - } - } - } - - // Constructing the mapping of mat's rows to the reference rows. - references.check_availability.back() = true; - auto& mapping = gene_mapping.back(); - for (const auto& i : intersection) { - mapping[i.first] = i.second; - } - return; - } - -public: - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload assumes that the reference and test datasets have the same features. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * - * @param ref Matrix containing the reference expression values. - * Rows are features and columns are reference samples. - * The number and identity of features should be identical to the test dataset to be classified in `IntegratedScorer`. - * @param[in] labels Pointer to an array of label assignments. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels in `ref`, see `Markers` for more details. - */ - void add(const tatami::Matrix* ref, const int* labels, const Markers& markers) { - add_internal_direct(ref, labels, markers, false); - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload automatically identifies the intersection of features between the test and reference datasets. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * `mat_id` and `mat_nrow` should also be constant for all invocations to `add()`. - * - * @tparam Id Type of the gene identifier for each row. - * - * @param mat_nrow Number of rows (genes) in the test dataset. - * @param[in] mat_id Pointer to an array of identifiers of length equal to `mat_nrow`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * If any duplicate IDs are present, only the first occurrence is used. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * If any duplicate IDs are present, only the first occurrence is used. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels in `ref`, see `Markers` for more details. - */ - template - void add(size_t mat_nrow, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - const Markers& markers) - { - auto intersection = intersect_features(mat_nrow, mat_id, ref->nrow(), ref_id); - add_internal_intersect(intersection, ref, labels, markers, false); - } - -public: - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload assumes that the reference and test datasets have the same features, - * and that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * - * @param ref Matrix containing the reference expression values. - * Rows are features and columns are reference samples. - * The number and identity of features should be identical to the test dataset to be classified in `IntegratedScorer`. - * @param[in] labels Pointer to an array of label assignments. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on `ref` and `labels`. - */ - void add(const tatami::Matrix* ref, const int* labels, const BasicBuilder::Prebuilt& built) { - add_internal_direct(ref, labels, built.markers, built.subset); - return; - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload requires an existing intersection between the test and reference datasets, - * and assumes that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * - * @param intersection Vector defining the intersection of features between the test and reference datasets. - * Each entry is a pair where the first element is the row index in the test matrix, - * and the second element is the row index for the corresponding feature in the reference matrix. - * Each row index for either matrix should occur no more than once in `intersection`. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on all preceding arguments. - */ - void add(const std::vector >& intersection, - const tatami::Matrix* ref, - const int* labels, - const BasicBuilder::PrebuiltIntersection& built) - { - add_internal_direct(ref, labels, built.markers, built.mat_subset); - references.check_availability.back() = true; - - // Constructing the mapping of mat's rows to the reference rows. - auto& mapping = gene_mapping.back(); - for (const auto& i : intersection) { - mapping[i.first] = i.second; - } - return; - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload automatically identifies the intersection of features between the test and reference datasets, - * and assumes that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * `mat_id` and `mat_nrow` should also be constant for all invocations to `add()`. - * - * @tparam Id Type of the gene identifier for each row. - * - * @param mat_nrow Number of rows (genes) in the test dataset. - * @param[in] mat_id Pointer to an array of identifiers of length equal to `mat_nrow`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * If any duplicate IDs are present, only the first occurrence is used. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * If any duplicate IDs are present, only the first occurrence is used. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on all preceding arguments. - */ - template - void add(size_t mat_nrow, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - const BasicBuilder::PrebuiltIntersection& built) - { - auto intersection = intersect_features(mat_nrow, mat_id, ref->nrow(), ref_id); - add(intersection, ref, labels, built); - return; - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload assumes that the reference and test datasets have the same features, - * and that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * - * @param intersection Vector defining the intersection of features betweent the test and reference datasets. - * Each entry is a pair where the first element is the row index in the test matrix, - * and the second element is the row index for the corresponding feature in the reference matrix. - * Each row index for either matrix should occur no more than once in `intersection`. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on `ref` and `labels`. - */ - void add(const std::vector >& intersection, - const tatami::Matrix* ref, - const int* labels, - const BasicBuilder::Prebuilt& built) - { - add_internal_intersect(intersection, ref, labels, built.markers, built.subset); - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload automatically identifies the intersection of features between the test and reference datasets, - * and assumes that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * `mat_id` and `mat_nrow` should also be constant for all invocations to `add()`. - * - * @tparam Id Type of the gene identifier for each row. - * - * @param mat_nrow Number of rows (genes) in the test dataset. - * @param[in] mat_id Pointer to an array of identifiers of length equal to `mat_nrow`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * If any duplicate IDs are present, only the first occurrence is used. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * If any duplicate IDs are present, only the first occurrence is used. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on `ref` and `labels`. - */ - template - void add(size_t mat_nrow, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - const BasicBuilder::Prebuilt& built) - { - auto intersection = intersect_features(mat_nrow, mat_id, ref->nrow(), ref_id); - add(intersection, ref, labels, built); - } - -private: - /* Here, we've split out some of the functions for easier reading. - * Otherwise everything would land in a single mega-function. - */ - - static void fill_ranks( - const tatami::Matrix* curmat, - const int* curlab, - const std::vector& in_use, - const std::vector& positions, - std::vector > >& cur_ranked, - int nthreads) - { - // If we don't need to check availability, this implies that - // the reference has 1:1 feature mapping to the test dataset. - // In that case, we can proceed quite simply. - tatami::parallelize([&](int, int start, int len) -> void { - RankedVector tmp_ranked; - tmp_ranked.reserve(in_use.size()); - - // 'in_use' is guaranteed to be sorted and unique, see its derivation in finish(). - // This means we can directly use it for indexed extraction. - auto wrk = tatami::consecutive_extractor(curmat, false, start, len, in_use); - std::vector buffer(in_use.size()); - - for (int c = start, end = start + len; c < end; ++c) { - auto ptr = wrk->fetch(c, buffer.data()); - - tmp_ranked.clear(); - for (int i = 0, end = in_use.size(); i < end; ++i, ++ptr) { - tmp_ranked.emplace_back(*ptr, i); - } - std::sort(tmp_ranked.begin(), tmp_ranked.end()); - - auto& final_ranked = cur_ranked[curlab[c]][positions[c]]; - simplify_ranks(tmp_ranked, final_ranked); - } - }, curmat->ncol(), nthreads); - } - - static void fill_ranks( - const tatami::Matrix* curmat, - const int* curlab, - const std::vector& in_use, - const std::vector& positions, - const std::unordered_map& cur_mapping, - std::unordered_set& cur_available, - std::vector > >& cur_ranked, - int nthreads) - { - // If we need to check availability, then we need to check - // the mapping of test genes to row indices of the reference. - std::vector > remapping; - remapping.reserve(in_use.size()); - - for (int i = 0, end = in_use.size(); i < end; ++i) { - auto it = cur_mapping.find(in_use[i]); - if (it != cur_mapping.end()) { - remapping.emplace_back(it->second, i); // using 'i' instead of 'in_use[i]', as we want to work with indices into 'in_use', not the values of 'in_use' themselves. - cur_available.insert(i); - } - } - - std::sort(remapping.begin(), remapping.end()); - - // This section is just to enable indexed extraction by tatami. - // There's no need to consider duplicates among the - // 'remapping[i].first', 'cur_mapping->second' is guaranteed to be - // unique as a consequence of how intersect_features works. - std::vector remapped_in_use; - remapped_in_use.reserve(remapping.size()); - for (const auto& p : remapping) { - remapped_in_use.push_back(p.first); - } - - tatami::parallelize([&](int, int start, int len) -> void { - RankedVector tmp_ranked; - tmp_ranked.reserve(remapped_in_use.size()); - std::vector buffer(remapped_in_use.size()); - auto wrk = tatami::consecutive_extractor(curmat, false, start, len, remapped_in_use); - - for (size_t c = start, end = start + len; c < end; ++c) { - auto ptr = wrk->fetch(c, buffer.data()); - - tmp_ranked.clear(); - for (const auto& p : remapping) { - tmp_ranked.emplace_back(*ptr, p.second); // remember, 'p.second' corresponds to indices into 'in_use'. - ++ptr; - } - std::sort(tmp_ranked.begin(), tmp_ranked.end()); - - auto& final_ranked = cur_ranked[curlab[c]][positions[c]]; - simplify_ranks(tmp_ranked, final_ranked); - } - }, curmat->ncol(), nthreads); - } - -public: - /** - * @return The set of integrated references, for use in `IntegratedScorer`. - * - * This function should only be called once, after all reference datasets have been registered with `add()`. - * Any further invocations of this function will not be valid. - */ - IntegratedReferences finish() { - // Identify the union of all marker genes. - auto& in_use = references.universe; - std::unordered_map remap_to_universe; - { - std::unordered_set in_use_tmp; - for (const auto& refmarkers : references.markers) { - for (const auto& mrk : refmarkers) { - in_use_tmp.insert(mrk.begin(), mrk.end()); - } - } - - in_use.insert(in_use.end(), in_use_tmp.begin(), in_use_tmp.end()); - std::sort(in_use.begin(), in_use.end()); - - for (int i = 0, end = in_use.size(); i < end; ++i) { - remap_to_universe[in_use[i]] = i; - } - } - - for (size_t r = 0, end = references.num_references(); r < end; ++r) { - auto curlab = stored_labels[r]; - auto curmat = stored_matrices[r]; - - // Reindexing the markers to point to the universe. - auto& curmarkers = references.markers[r]; - for (auto& outer : curmarkers) { - for (auto& x : outer) { - x = remap_to_universe.find(x)->second; - } - } - - auto& cur_ranked = references.ranked[r]; - std::vector positions; - { - size_t nlabels = curmarkers.size(); - size_t NC = curmat->ncol(); - positions.reserve(NC); - - std::vector samples_per_label(nlabels); - for (size_t c = 0; c < NC; ++c) { - auto& pos = samples_per_label[curlab[c]]; - positions.push_back(pos); - ++pos; - } - - cur_ranked.resize(nlabels); - for (size_t l = 0; l < nlabels; ++l) { - cur_ranked[l].resize(samples_per_label[l]); - } - } - - // Finally filling the rankings. - if (!references.check_availability[r]) { - fill_ranks(curmat, curlab, in_use, positions, references.ranked[r], nthreads); - } else { - fill_ranks(curmat, curlab, in_use, positions, gene_mapping[r], references.available[r], references.ranked[r], nthreads); - } - } - - return std::move(references); - } -}; - -} - -#endif diff --git a/inst/include/singlepp/IntegratedScorer.hpp b/inst/include/singlepp/IntegratedScorer.hpp deleted file mode 100644 index e57b804..0000000 --- a/inst/include/singlepp/IntegratedScorer.hpp +++ /dev/null @@ -1,344 +0,0 @@ -#ifndef SINGLEPP_RECOMPUTE_SCORES_HPP -#define SINGLEPP_RECOMPUTE_SCORES_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "compute_scores.hpp" -#include "scaled_ranks.hpp" -#include "Classifier.hpp" -#include "IntegratedBuilder.hpp" - -#include -#include -#include - -/** - * @file IntegratedScorer.hpp - * - * @brief Integrate classifications from multiple references. - */ - -namespace singlepp { - -/** - * @brief Integrate classifications from multiple references. - * - * In situations where multiple reference datasets are available, - * we would like to obtain a single prediction for each cell from all of those references. - * This is somewhat tricky as the different references are likely to contain strong batch effects, - * complicating the calculation of marker genes between labels from different references (and thus precluding direct use of the usual `Classifier::run()`). - * The labels themselves also tend to be inconsistent, e.g., different vocabularies and resolutions, making it difficult to define sensible groups in a combined "super-reference". - * - * To avoid these issues, we first perform classification within each reference individually. - * For each test cell, we identify its predicted label from a given reference, and we collect all the marker genes for that label (across all pairwise comparisons in that reference). - * After doing this for each reference, we pool all of the collected markers to obtain a common set of interesting genes. - * We then compute the correlation-based score between the test cell's expression profile and its predicted label from each reference, using that common set of genes. - * The label with the highest score is considered the best representative across all references. - * - * This strategy is similar to using `Classifier::run()` without fine-tuning, - * except that we are choosing between the best labels from all references rather than between all labels from one reference. - * The main idea is to create a common feature set so that the correlations can be reasonably compared across references. - * Note that differences in the feature sets across references are tolerated by simply ignoring missing genes when computing the correlations. - * This reduces the comparability of the scores as the effective feature set will vary a little (or a lot, depending) across references; - * nonetheless, it is preferred to taking the intersection, which is liable to leave us with very few genes. - * - * Our approach avoids any direct comparison between the expression profiles of different references, - * allowing us to side-step the question of how to deal with the batch effects. - * Similarly, we defer responsibility on solving the issue of label heterogeneity, - * by just passing along the existing labels and leaving it to the user's interpretation. - */ -class IntegratedScorer { -public: - /** - * @brief Default parameters. - */ - struct Defaults { - /** - * See `set_quantile()` for details. - */ - static constexpr double quantile = Classifier::Defaults::quantile; - - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - - /** - * @param q Quantile to use to compute a per-label score from the correlations. - * - * @return A reference to this `IntegratedScorer` object. - * - * See `Classifier::set_quantile()` for more details. - */ - IntegratedScorer& set_quantile(double q = Defaults::quantile) { - quantile = q; - return *this; - } - - /** - * @param n Number of threads to use. - * By default, this is inherited from the parent `IntegratedBuilder` object. - * - * @return A reference to this `IntegratedScorer` object. - */ - IntegratedScorer& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -private: - double quantile = Defaults::quantile; - int nthreads = Defaults::num_threads; - -private: - /* Here, we've split out some of the functions for easier reading. - * Otherwise everything would land in a single mega-function. - */ - - static void build_miniverse(int cell, - const std::vector& assigned, - const IntegratedReferences& built, - std::unordered_set& uset, - std::vector& uvec) - { - uset.clear(); - size_t nref = built.num_references(); - - for (size_t r = 0; r < nref; ++r) { - auto best = assigned[r][cell]; - const auto& markers = built.markers[r][best]; - uset.insert(markers.begin(), markers.end()); - } - - uvec.clear(); - uvec.insert(uvec.end(), uset.begin(), uset.end()); - std::sort(uvec.begin(), uvec.end()); - return; - } - - template - static void fill_ranks( - Extractor_* wrk, - const std::vector& miniverse, - int cell, - std::vector& buffer, - RankedVector& data_ranked) - { - data_ranked.clear(); - if (miniverse.empty()) { - return; - } - - auto ptr = wrk->fetch(cell, buffer.data()); - for (auto u : miniverse) { - data_ranked.emplace_back(ptr[u], u); - } - - std::sort(data_ranked.begin(), data_ranked.end()); - return; - } - - static void prepare_mapping( - const IntegratedReferences& built, - size_t ref, - const std::vector& miniverse, - std::unordered_map& mapping) - { - mapping.clear(); - if (built.check_availability[ref]) { - const auto& cur_available = built.available[ref]; - - // If we need to check availability, we reconstruct the mapping - // for the intersection of features available in this reference. - // We then calculate the scaled ranks for the data. - int counter = 0; - for (auto c : miniverse) { - if (cur_available.find(c) != cur_available.end()) { - mapping[c] = counter; - ++counter; - } - } - } else { - // If we don't need to check availability, the mapping is - // much simpler. Technically, we could do this in the outer - // loop if none of the references required checks, but this - // seems like a niche optimization in practical settings. - for (size_t s = 0; s < miniverse.size(); ++s) { - auto u = miniverse[s]; - mapping[u] = s; - } - } - return; - } - -public: - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * The identity of the rows should be consistent with the arguments used in `IntegratedBuilder::add()`. - * @param[in] assigned Vector of pointers of length equal to the number of references. - * Each pointer should point to an array of length equal to the number of columns in `mat`, - * containing the assigned label for each column in each reference. - * @param built Set of integrated references produced by `IntegratedBuilder::finish()`. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the reference with the best label for each cell. - * @param[out] scores Vector of pointers of length equal to the number of references. - * Each pointer should point to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for the best label of that reference for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run( - const tatami::Matrix* mat, - const std::vector& assigned, - const IntegratedReferences& built, - int* best, - std::vector& scores, - double* delta) - const { - auto NR = mat->nrow(); - auto nref = built.num_references(); - - tatami::parallelize([&](int, int start, int len) -> void { - // We perform an indexed extraction, so all subsequent indices - // will refer to indices into this subset (i.e., 'built.universe'). - auto wrk = tatami::consecutive_extractor(mat, false, start, len, built.universe); - std::vector buffer(built.universe.size()); - - RankedVector data_ranked, data_ranked2; - data_ranked.reserve(NR); - data_ranked2.reserve(NR); - RankedVector ref_ranked; - ref_ranked.reserve(NR); - - std::vector scaled_data(NR); - std::vector scaled_ref(NR); - std::unordered_set miniverse_tmp; - std::vector miniverse; - std::unordered_map mapping; - std::vector all_correlations; - - for (int i = start, end = start + len; i < end; ++i) { - build_miniverse(i, assigned, built, miniverse_tmp, miniverse); - fill_ranks(wrk.get(), miniverse, i, buffer, data_ranked); - - // Scanning through each reference and computing the score for the best group. - double best_score = -1000, next_best = -1000; - int best_ref = 0; - - for (size_t r = 0; r < nref; ++r) { - prepare_mapping(built, r, miniverse, mapping); - - scaled_ref.resize(mapping.size()); - scaled_data.resize(mapping.size()); - - data_ranked2.clear(); - subset_ranks(data_ranked, data_ranked2, mapping); - scaled_ranks(data_ranked2, scaled_data.data()); - - // Now actually calculating the score for the best group for - // this cell in this reference. This assumes that 'ref.ranked' - // already contains sorted pairs where the indices refer to the - // rows of the original data matrix. - auto best = assigned[r][i]; - const auto& best_ranked = built.ranked[r][best]; - all_correlations.clear(); - - for (size_t s = 0; s < best_ranked.size(); ++s) { - ref_ranked.clear(); - subset_ranks(best_ranked[s], ref_ranked, mapping); - scaled_ranks(ref_ranked, scaled_ref.data()); - double cor = distance_to_correlation(scaled_ref.size(), scaled_data, scaled_ref); - all_correlations.push_back(cor); - } - - double score = correlations_to_scores(all_correlations, quantile); - if (scores[r]) { - scores[r][i] = score; - } - if (score > best_score) { - next_best = best_score; - best_score = score; - best_ref = r; - } else if (score > next_best) { - next_best = score; - } - } - - if (best) { - best[i] = best_ref; - } - if (delta && nref > 1) { - delta[i] = best_score - next_best; - } - } - - }, mat->ncol(), nthreads); - } - -public: - /** - * @brief Results of the integrated annotation. - */ - struct Results { - /** - * @cond - */ - Results(size_t ncells, size_t nrefs) : best(ncells), scores(nrefs, std::vector(ncells)), delta(ncells) {} - - std::vector scores_to_pointers() { - std::vector output(scores.size()); - for (size_t s = 0; s < scores.size(); ++s) { - output[s] = scores[s].data(); - } - return output; - }; - /** - * @endcond - */ - - /** - * Vector of length equal to the number of cells in the test dataset, - * containing the index of the reference with the top-scoring label for each cell. - */ - std::vector best; - - /** - * Vector of length equal to the number of references, - * containing vectors of length equal to the number of cells in the test dataset. - * Each vector corresponds to a reference and contains the score for the best label in that reference for each cell. - */ - std::vector > scores; - - /** - * Vector of length equal to the number of cells in the test dataset. - * This contains the difference between the highest and second-highest scores for each cell. - */ - std::vector delta; - }; - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * The identity of the rows should be consistent with the arguments used in `IntegratedBuilder::add()`. - * @param[in] assigned Vector of pointers of length equal to the number of references. - * Each pointer should point to an array of length equal to the number of columns in `mat`, - * containing the assigned label for each column in each reference. - * @param built Set of integrated references produced by `IntegratedBuilder::finish()`. - * - * @return A `Results` object containing the assigned labels and scores. - */ - Results run(const tatami::Matrix* mat, const std::vector& assigned, const IntegratedReferences& built) const { - Results output(mat->ncol(), built.num_references()); - auto scores = output.scores_to_pointers(); - run(mat, assigned, built, output.best.data(), scores, output.delta.data()); - return output; - } -}; - -} - -#endif diff --git a/inst/include/singlepp/Markers.hpp b/inst/include/singlepp/Markers.hpp deleted file mode 100644 index 9228ec6..0000000 --- a/inst/include/singlepp/Markers.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef SINGLEPP_MARKERS_HPP -#define SINGLEPP_MARKERS_HPP - -#include "macros.hpp" - -#include - -/** - * @file Markers.hpp - * - * @brief Define the `Markers` typedef. - */ - -namespace singlepp { - -/** - * A vector of vectors of ranked marker lists, used to determine which features should be used to compute correlations in `Classifier`. - * - * For a `Markers` object `markers`, let us consider the vector at `markers[0][1]`. - * This vector is expected to contain the ranked indices of the marker genes for label 0 compared to label 1. - * Most typically, this is generated by identifying the genes that are upregulated in label 0 compared to 1 and sorting by decreasing effect size. - * Indices should refer to the rows of the reference expression matrices (i.e., `ref` in the various `Classifier::run()` or `BasicBuilder::run()` methods). - * So, for example, `markers[0][1][0]` should contain the row index of the most upregulated gene in label 0 compared to 1. - * - * For a given reference dataset, the corresponding `Markers` object should have length equal to the number of labels in that reference. - * Each middle vector (i.e., `markers[i]` for non-negative `i` less than the number of labels) should also have length equal to the number of labels. - * The innermost vectors that are not on the diagonal (i.e., `markers[i][j]` for `i != j`) may be of any positive length and should contain unique row indices. - * Any innermost vector along the "diagonal" (i.e., `markers[i][i]`) is typically of zero length. - * - * Ideally, the non-diagonal innermost vectors should be at least as long as the setting of `Classifier::set_top()`. - * The cell type annotation will still work with shorter (non-empty) vectors but it will not be possible to achieve the user-specified `set_top()`. - * In cases involving feature intersections in `run()`, the vectors should be long enough to achieve the specified `set_top()` after removing non-shared genes. - * For longer vectors, the annotation will safely ignore the indices after the `set_top()` specification. - * - * As mentioned previously, the diagonal innermost vectors are typically empty, given that it makes little sense to identify upregulated markers in a label compared to itself. - * That said, any genes stored on the diagonal will be respected and used in all feature subsets for the corresponding label. - * This can be exploited by advanced users to efficiently store "universal" markers for a label, i.e., markers that are applicable in all comparisons to other labels. - */ -typedef std::vector > > Markers; - -} - -#endif diff --git a/inst/include/singlepp/annotate_cells.hpp b/inst/include/singlepp/annotate_cells.hpp deleted file mode 100644 index 1ae2b9d..0000000 --- a/inst/include/singlepp/annotate_cells.hpp +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef SINGLEPP_ANNOTATE_CELLS_HPP -#define SINGLEPP_ANNOTATE_CELLS_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "scaled_ranks.hpp" -#include "process_features.hpp" -#include "build_indices.hpp" -#include "fine_tune.hpp" - -#include -#include -#include - -namespace singlepp { - -inline void annotate_cells_simple( - const tatami::Matrix* mat, - size_t num_subset, - const int* subset, - const std::vector& ref, - const Markers& markers, - double quantile, - bool fine_tune, - double threshold, - int* best, - std::vector& scores, - double* delta, - int nthreads) -{ - // Figuring out how many neighbors to keep and how to compute the quantiles. - const size_t NL = ref.size(); - std::vector search_k(NL); - std::vector > coeffs(NL); - for (size_t r = 0; r < NL; ++r) { - double denom = ref[r].index->nobs() - 1; - double prod = denom * (1 - quantile); - auto k = std::ceil(prod) + 1; - search_k[r] = k; - - // `(1 - quantile) - (k - 2) / denom` represents the gap to the smaller quantile, - // while `(k - 1) / denom - (1 - quantile)` represents the gap from the larger quantile. - // The size of the gap is used as the weight for the _other_ quantile, i.e., - // the closer you are to a quantile, the higher the weight. - // We convert these into proportions by dividing by their sum, i.e., `1/denom`. - coeffs[r].first = static_cast(k - 1) - prod; - coeffs[r].second = prod - static_cast(k - 2); - } - - std::vector subcopy(subset, subset + num_subset); - SubsetSorter subsorted(subcopy); - - tatami::parallelize([&](int, int start, int length) -> void { - auto wrk = tatami::consecutive_extractor(mat, false, start, length, subsorted.extraction_subset()); - RankedVector vec(num_subset); - std::vector buffer(num_subset); - - FineTuner ft; - std::vector curscores(NL); - - for (int c = start, end = start + length; c < end; ++c) { - auto ptr = wrk->fetch(c, buffer.data()); - subsorted.fill_ranks(ptr, vec); - scaled_ranks(vec, buffer.data()); // 'buffer' can be written to, as all data is extracted to 'vec'. - - curscores.resize(NL); - for (size_t r = 0; r < NL; ++r) { - size_t k = search_k[r]; - auto current = ref[r].index->find_nearest_neighbors(buffer.data(), k); - - double last = current[k - 1].second; - last = 1 - 2 * last * last; - if (k == 1) { - curscores[r] = last; - } else { - double next = current[k - 2].second; - next = 1 - 2 * next * next; - curscores[r] = coeffs[r].first * next + coeffs[r].second * last; - } - - if (scores[r]) { - scores[r][c] = curscores[r]; - } - } - - if (!fine_tune) { - auto top = std::max_element(curscores.begin(), curscores.end()); - best[c] = top - curscores.begin(); - if (delta) { - if (curscores.size() > 1) { - double topscore = *top; - *top = -100; - delta[c] = topscore - *std::max_element(curscores.begin(), curscores.end()); - } else { - delta[c] = std::numeric_limits::quiet_NaN(); - } - } - } else { - auto tuned = ft.run(vec, ref, markers, curscores, quantile, threshold); - best[c] = tuned.first; - if (delta) { - delta[c] = tuned.second; - } - } - } - - }, mat->ncol(), nthreads); - - return; -} - -} - -#endif diff --git a/inst/include/singlepp/build_indices.hpp b/inst/include/singlepp/build_indices.hpp deleted file mode 100644 index 0dbe22b..0000000 --- a/inst/include/singlepp/build_indices.hpp +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef SINGLEPP_BUILD_INDICES_HPP -#define SINGLEPP_BUILD_INDICES_HPP - -#include "macros.hpp" - -#include "knncolle/knncolle.hpp" -#include "tatami/tatami.hpp" - -#include "process_features.hpp" -#include "scaled_ranks.hpp" - -#include -#include -#include - -namespace singlepp { - -inline size_t get_nlabels(size_t n, const int* labels) { - if (n == 0) { - throw std::runtime_error("reference dataset must have at least one column"); - } - return *std::max_element(labels, labels + n) + 1; -} - -struct Reference { - std::vector > ranked; - std::shared_ptr > index; -}; - -template -std::vector build_indices(const tatami::Matrix* ref, const int* labels, const std::vector& subset, const Builder& build, int nthreads) { - size_t NC = ref->ncol(); - size_t nlabels = get_nlabels(NC, labels); - std::vector label_count(nlabels); - for (size_t i = 0; i < NC; ++i) { - ++label_count[labels[i]]; - } - - size_t NR = subset.size(); - std::vector nnrefs(nlabels); - std::vector > nndata(nlabels); - for (size_t l = 0; l < nlabels; ++l) { - if (label_count[l] == 0) { - throw std::runtime_error(std::string("no entries for label ") + std::to_string(l)); - } - nnrefs[l].ranked.resize(label_count[l]); - nndata[l].resize(label_count[l] * NR); - } - - std::vector offsets(NC); - { - std::vector counter(nlabels); - for (size_t c = 0; c < NC; ++c) { - auto& o = counter[labels[c]]; - offsets[c] = o; - ++o; - } - } - - SubsetSorter subsorter(subset); - - tatami::parallelize([&](int, int start, int len) -> void { - RankedVector ranked(NR); - std::vector buffer(ref->nrow()); - auto wrk = tatami::consecutive_extractor(ref, false, start, len, subsorter.extraction_subset()); - - for (int c = start, end = start + len; c < end; ++c) { - auto ptr = wrk->fetch(c, buffer.data()); - subsorter.fill_ranks(ptr, ranked); - - auto curlab = labels[c]; - auto curoff = offsets[c]; - auto scaled = nndata[curlab].data() + curoff * NR; - scaled_ranks(ranked, scaled); - - // Storing as a pair of ints to save space; as long - // as we respect ties, everything should be fine. - auto& stored_ranks = nnrefs[curlab].ranked[curoff]; - stored_ranks.reserve(ranked.size()); - simplify_ranks(ranked, stored_ranks); - } - }, ref->ncol(), nthreads); - -#ifndef SINGLEPP_CUSTOM_PARALLEL - #pragma omp parallel for num_threads(nthreads) - for (size_t l = 0; l < nlabels; ++l) { -#else - SINGLEPP_CUSTOM_PARALLEL([&](int, size_t start, size_t len) -> void { - for (size_t l = start, end = start + len; l < end; ++l) { -#endif - nnrefs[l].index = build(NR, label_count[l], nndata[l].data()); - - // Trying to free the memory as we go along, to offset the copying - // of nndata into the memory store owned by the knncolle index. - nndata[l].clear(); - nndata[l].shrink_to_fit(); - -#ifndef SINGLEPP_CUSTOM_PARALLEL - } -#else - } - }, nlabels, nthreads); -#endif - - return nnrefs; -} - -} - -#endif diff --git a/inst/include/singlepp/compute_scores.hpp b/inst/include/singlepp/compute_scores.hpp deleted file mode 100644 index 8ad93fe..0000000 --- a/inst/include/singlepp/compute_scores.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef SINGLEPP_COMPUTE_SCORES_HPP -#define SINGLEPP_COMPUTE_SCORES_HPP - -#include "macros.hpp" - -#include -#include -#include -#include - -namespace singlepp { - -inline double correlations_to_scores (std::vector& correlations, double quantile) { - const size_t ncells=correlations.size(); - if (ncells==0) { - return std::numeric_limits::quiet_NaN(); - } else if (quantile==1 || ncells==1) { - return *std::max_element(correlations.begin(), correlations.end()); - } else { - const double denom = ncells - 1; - const double prod = denom * quantile; - const size_t left = std::floor(prod); - const size_t right = std::ceil(prod); - - std::nth_element(correlations.begin(), correlations.begin() + right, correlations.end()); - const double rightval=correlations[right]; - if (right == left) { - return rightval; - } - - // Do NOT be tempted to do the second nth_element with the end at begin()+right; - // this does not handle ties properly. - std::nth_element(correlations.begin(), correlations.begin() + left, correlations.end()); - const double leftval=correlations[left]; - - // `quantile - left / denom` represents the gap to the smaller quantile, - // while `right / denom - quantile` represents the gap from the larger quantile. - // The size of the gap is used as the weight for the _other_ quantile, i.e., - // the closer you are to a quantile, the higher the weight. - // We convert these into proportions by dividing by their sum, i.e., `1/denom`. - const double leftweight = right - prod; - const double rightweight = prod - left; - - return rightval * rightweight + leftval * leftweight; - } -} - -template -double distance_to_correlation(size_t n, const Vector& p1, const Vector& p2) { - double d2 = 0; - for (size_t i = 0; i < n; ++i) { - double tmp = p1[i] - p2[i]; - d2 += tmp * tmp; - } - return 1 - 2 * d2; -} - -} - -#endif diff --git a/inst/include/singlepp/fine_tune.hpp b/inst/include/singlepp/fine_tune.hpp deleted file mode 100644 index b7675c0..0000000 --- a/inst/include/singlepp/fine_tune.hpp +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef TATAMI_FINE_TUNE_HPP -#define TATAMI_FINE_TUNE_HPP - -#include "macros.hpp" - -#include "scaled_ranks.hpp" -#include "process_features.hpp" -#include "compute_scores.hpp" -#include "build_indices.hpp" - -#include -#include -#include - -namespace singlepp { - -inline std::pair fill_labels_in_use(const std::vector& scores, double threshold, std::vector& in_use) { - auto it = std::max_element(scores.begin(), scores.end()); - int best_label = it - scores.begin(); - double max_score = *it; - - in_use.clear(); - constexpr double DUMMY = -1000; - double next_score = DUMMY; - const double bound = max_score - threshold; - - for (size_t i = 0; i < scores.size(); ++i) { - const auto& val = scores[i]; - if (val >= bound) { - in_use.push_back(i); - } - if (i != best_label && next_score < val) { - next_score = val; - } - } - - return std::make_pair(best_label, max_score - next_score); -} - -inline std::pair replace_labels_in_use(const std::vector& scores, double threshold, std::vector& in_use) { - auto it = std::max_element(scores.begin(), scores.end()); - int best_index = it - scores.begin(); - double max_score = *it; - - int best_label = in_use[best_index]; - size_t counter = 0; - - constexpr double DUMMY = -1000; - double next_score = DUMMY; - const double bound = max_score - threshold; - - for (size_t i = 0; i < scores.size(); ++i) { - const auto& val = scores[i]; - if (val >= bound) { - in_use[counter] = in_use[i]; - ++counter; - } - if (i != best_index && next_score < val) { - next_score = val; - } - } - - in_use.resize(counter); - return std::make_pair(best_label, max_score - next_score); -} - -class FineTuner { - std::vector labels_in_use; - - std::unordered_map gene_subset; - - std::vector scaled_left, scaled_right; - - std::vector all_correlations; - - RankedVector input_sub; - - RankedVector ref_sub; - -public: - template - std::pair run( - const RankedVector& input, - const std::vector& ref, - const Markers& markers, - std::vector& scores, - double quantile, - double threshold) - { - if (scores.size() <= 1) { - return std::make_pair(0, std::numeric_limits::quiet_NaN()); - } - - auto candidate = fill_labels_in_use(scores, threshold, labels_in_use); - - // If there's only one top label, we don't need to do anything else. - if (labels_in_use.size() == 1) { - return candidate; - } - - // We also give up if every label is in range, because any subsequent - // calculations would use all markers and just give the same result. - // The 'test' parameter allows us to skip this bypass for testing. - if constexpr(!test) { - if (labels_in_use.size() == ref.size()) { - return candidate; - } - } - - while (labels_in_use.size() > 1) { - gene_subset.clear(); - gene_subset.reserve(input.size()); // known maximum. - for (auto l : labels_in_use) { - for (auto l2 : labels_in_use){ - const auto& current = markers[l][l2]; - for (auto c : current) { - if (gene_subset.find(c) == gene_subset.end()) { - const int counter = gene_subset.size(); - gene_subset[c] = counter; - } - } - } - } - - input_sub.clear(); - subset_ranks(input, input_sub, gene_subset); - scaled_left.resize(gene_subset.size()); - scaled_ranks(input_sub, scaled_left.data()); - - scaled_right.resize(gene_subset.size()); - scores.clear(); - - for (size_t i = 0; i < labels_in_use.size(); ++i) { - auto curlab = labels_in_use[i]; - - all_correlations.clear(); - const auto& curref = ref[curlab]; - size_t NR = curref.index->ndim(); - size_t NC = curref.index->nobs(); - - for (size_t c = 0; c < NC; ++c) { - // Technically we could be faster if we remembered the - // subset from the previous fine-tuning iteration, but this - // requires us to (possibly) make a copy of the entire - // reference set; we can't afford to do this in each thread. - ref_sub.clear(); - subset_ranks(curref.ranked[c], ref_sub, gene_subset); - scaled_ranks(ref_sub, scaled_right.data()); - - double cor = distance_to_correlation(scaled_left.size(), scaled_left, scaled_right); - all_correlations.push_back(cor); - } - - double score = correlations_to_scores(all_correlations, quantile); - scores.push_back(score); - } - - candidate = replace_labels_in_use(scores, threshold, labels_in_use); - if (labels_in_use.size() == scores.size()) { // i.e., unchanged. - break; - } - } - - return candidate; - } -}; - -} - -#endif diff --git a/inst/include/singlepp/load_references.hpp b/inst/include/singlepp/load_references.hpp deleted file mode 100644 index 412e329..0000000 --- a/inst/include/singlepp/load_references.hpp +++ /dev/null @@ -1,650 +0,0 @@ -#ifndef SINGLEPP_LOADERS_HPP -#define SINGLEPP_LOADERS_HPP - -#include "macros.hpp" - -#include "byteme/PerByte.hpp" -#include "byteme/RawFileReader.hpp" -#include "tatami/tatami.hpp" - -#ifdef SINGLEPP_USE_ZLIB -#include "byteme/GzipFileReader.hpp" -#include "byteme/ZlibBufferReader.hpp" -#endif - -#include "Markers.hpp" - -#include -#include -#include -#include - -/** - * @file load_references.hpp - * - * @brief Load reference datasets from a few expected formats. - */ - -namespace singlepp { - -/** - * @cond - */ -template -using SuperPerByte = typename std::conditional, byteme::PerByteParallel >::type; - -template -std::vector load_labels_internal(Reader_& reader) { - bool non_empty = false; - int current = 0; - std::vector labels; - bool remaining = true; - - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - char x = pb.get(); - okay = pb.advance(); - - if (x == '\n') { - if (!non_empty) { - throw std::runtime_error("label index must be an integer"); - } - labels.push_back(current); - current = 0; - non_empty = false; - } else if (std::isdigit(x)) { - non_empty = true; - current *= 10; - current += (x - '0'); - } else { - throw std::runtime_error("label index must be an integer"); - } - } - - if (non_empty) { - labels.push_back(current); - } - - return labels; -} -/** - * @endcond - */ - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the labels. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Vector containing the label index for each reference profile. - * - * The file should contain one line per profile, containing an integer label index for that profile. - * Label indices refer to another array containing the actual names of the labels. - * The total number of lines should be equal to the number of profiles in the dataset. - * The file should not contain any header. - */ -template -std::vector load_labels_from_text_file(const char* path, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_labels_internal(reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the labels. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Vector containing the label index for each reference profile. - * - * See `load_labels_from_text_file()` for details about the format. - */ -template -std::vector load_labels_from_gzip_file(const char* path, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_labels_internal(reader); -} - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string of labels. - * @param len Length of the array for `buffer`. - * @param buffer_size Size of the buffer to use when decompressing the buffer. - * - * @return Vector containing the label index for each reference profile. - * - * See `load_labels_from_text_file()` for details about the format. - */ -template -inline std::vector load_labels_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_labels_internal(reader); -} - -#endif - -/** - * @cond - */ -template -std::vector load_names_internal(Reader_& reader) { - std::string current; - std::vector names; - - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - char x = pb.get(); - okay = pb.advance(); - - if (x == '\n') { - names.emplace_back(std::move(current)); - current.clear(); - } else { - current += x; - } - } - - if (!current.empty()) { // absence of trailing newline is okay. - names.emplace_back(std::move(current)); - } - - return names; -}; -/** - * @endcond - */ - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the label names. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Vector of strings containing the name for each reference profile. - * - * The file should contain one line per label, containing a (non-quoted) string with the full name for that label. - * The total number of lines should be equal to the number of unique labels in the dataset. - * The file should not contain any header. - */ -template -std::vector load_label_names_from_text_file(const char* path, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_names_internal(reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the label names. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Vector of strings containing the name for each label. - * - * See `load_label_names_from_text_file()` for details about the format. - */ -template -std::vector load_label_names_from_gzip_file(const char* path, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_names_internal(reader); -} - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string of label names. - * @param len Length of the array for `buffer`. - * @param buffer_size Size of the buffer to use when decompressing the buffer. - * - * @return Vector of strings containing the name for each label. - * - * See `load_label_names_from_text_file()` for details about the format. - */ -template -std::vector load_label_names_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_names_internal(reader); -} - -#endif - -/** - * @cond - */ -template -std::pair, std::vector > load_features_internal(Reader_& reader) { - std::string current; - std::vector ensembl, symbols; - - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - current.clear(); - - // Pulling out the Ensembl ID. - do { - char x = pb.get(); - if (x == ',') { // don't advance yet, so that okay check below doesn't trigger if the symbol is empty and file is not newline terminated. - break; - } else if (x == '\n') { - okay = false; // hit the error below. - break; - } - current += x; - okay = pb.advance(); - } while (okay); - - if (!okay) { - throw std::runtime_error("two comma-separated fields (Ensembl ID and symbol) expected on each line"); - } - okay = pb.advance(); - - ensembl.push_back(std::move(current)); - current.clear(); - - // Now pulling out the gene symbol. - while (okay) { - char x = pb.get(); - okay = pb.advance(); - if (x == '\n') { - break; - } - current += x; - } - - symbols.push_back(std::move(current)); // still gets added if file is not newline terminated. - current.clear(); - } - - return std::make_pair(std::move(ensembl), std::move(symbols)); -} -/** - * @endcond - */ - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the feature annotation. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Pair of vectors, each of length equal to the number of features. - * The first contains Ensembl IDs while the second contains gene symbols. - * - * The file should contain one line per feature, with total number of lines equal to the number of features in the dataset. - * Each line should contain two strings separated by a comma. - * The first string should be the Ensembl ID while the second string should be the gene symbol; either string may be empty. - * The file should not contain any header. - */ -template -std::pair, std::vector > load_features_from_text_file(const char* path, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_features_internal(reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the feature annotation. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Pair of vectors, each of length equal to the number of features. - * The first contains Ensembl IDs while the second contains gene symbols. - * - * See `load_features_from_text_file()` for details about the format. - */ -template -std::pair, std::vector > load_features_from_gzip_file(const char* path, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_features_internal(reader); -} - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string containing the feature annotation. - * @param len Length of the array for `buffer`. - * @param buffer_size Size of the buffer to use when decompressing the buffer. - * - * @return Pair of vectors, each of length equal to the number of features. - * The first contains Ensembl IDs while the second contains gene symbols. - * - * See `load_features_from_text_file()` for details about the format. - */ -template -std::pair, std::vector > load_features_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_features_internal(reader); -} - -#endif - -/** - * Matrix of ranks as a dense column-major matrix. - * Each column corresponds to a sample while each row corresponds to a feature. - * Each column contains the ranked expression values for all features. - * - * @tparam Data Numeric type for data in the matrix interface. - * @tparam Index Integer type for indices in the matrix interface. - */ -template -using RankMatrix = tatami::DenseColumnMatrix >; - -/** - * @cond - */ -template -RankMatrix load_rankings_internal(Reader& reader) { - size_t nfeatures = 0; - size_t line = 0; - std::vector values; - - int field = 0; - bool non_empty = false; - int current = 0; - - bool has_nfeatures = false; - auto check_nfeatures = [&]() -> void { - if (!has_nfeatures) { - has_nfeatures = true; - nfeatures = field + 1; - } else if (field + 1 != nfeatures) { - throw std::runtime_error("number of fields on each line should be equal to the number of features"); - } - }; - - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - char x = pb.get(); - okay = pb.advance(); - - if (x == '\n') { - check_nfeatures(); - if (!non_empty) { - throw std::runtime_error("fields should not be empty"); - } - values.push_back(current); - current = 0; - field = 0; - non_empty = false; - ++line; - - } else if (x == ',') { - if (!non_empty) { - throw std::runtime_error("fields should not be empty"); - } - values.push_back(current); - current = 0; - ++field; - non_empty = false; - - } else if (std::isdigit(x)) { - non_empty = true; - current *= 10; - current += (x - '0'); - - } else { - throw std::runtime_error("fields should only contain integer ranks"); - } - } - - if (field || non_empty) { // aka no terminating newline. - check_nfeatures(); - if (!non_empty) { - throw std::runtime_error("fields should not be empty"); - } - values.push_back(current); - ++line; - } - - return RankMatrix(nfeatures, line, std::move(values)); -}; -/** - * @endcond - */ - -/** - * @tparam Data Numeric type for data in the matrix interface. - * @tparam Index Integer type for indices in the matrix interface. - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the ranking matrix. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `RankMatrix` containing the feature rankings for each reference profile. - * Each column corresponds to a reference profile while each row corresponds to a feature. - * - * The file should contain one line per reference profile, with the total number of lines equal to the number of profiles in the dataset. - * Each line should contain the rank of each feature's expression within that profile, separated by commas. - * The number of comma-separated fields on each line should be equal to the number of features. - * Ranks should be strictly integer - tied ranks should default to the minimum rank among the index set of ties. - */ -template -RankMatrix load_rankings_from_text_file(const char* path, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_rankings_internal(reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam Data Numeric type for data in the matrix interface. - * @tparam Index Integer type for indices in the matrix interface. - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the ranking matrix. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `RankMatrix` containing the feature rankings for each reference profile. - * Each column corresponds to a reference profile while each row corresponds to a feature. - * - * See `load_rankings_from_text_file()` for details about the format. - */ -template -RankMatrix load_rankings_from_gzip_file(const char* path, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_rankings_internal(reader); -} - -/** - * @tparam Data Numeric type for data in the matrix interface. - * @tparam Index Integer type for indices in the matrix interface. - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string containing the ranking matrix. - * @param len Length of the array for `buffer`. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `RankMatrix` containing the feature rankings for each reference profile. - * Each column corresponds to a reference profile while each row corresponds to a feature. - * - * See `load_rankings_from_text_file()` for details about the format. - */ -template -RankMatrix load_rankings_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_rankings_internal(reader); -} - -#endif - -/** - * @cond - */ -template -Markers load_markers_internal(size_t nfeatures, size_t nlabels, Reader& reader) { - Markers markers(nlabels); - for (auto& m : markers) { - m.resize(nlabels); - } - - std::vector values; - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - - // Processing the label IDs. - size_t first = 0, second = 0; - for (int l = 0; l < 2; ++l) { - auto& current = (l == 0 ? first : second); - bool non_empty = false; - - do { - char x = pb.get(); - okay = pb.advance(); - - if (x == '\t') { - if (!non_empty) { - throw std::runtime_error("empty field detected in the label indices"); - } - break; - } else if (x == '\n') { - okay = false; // hit the error below. - break; - } else if (!std::isdigit(x)) { - throw std::runtime_error("label indices should be integers"); - } - - non_empty = true; - current *= 10; - current += (x - '0'); - } while (okay); - - if (!okay) { - throw std::runtime_error("expected at least three tab-separated fields on each line"); - } - if (current >= markers.size()) { - throw std::runtime_error("label index out of range"); - } - } - - // Processing the actual gene indices. - bool non_empty = false; - int current = 0; - while (okay) { - char x = pb.get(); - okay = pb.advance(); - - if (std::isdigit(x)) { - non_empty = true; - current *= 10; - current += (x - '0'); - - } else if (x == '\t') { - if (!non_empty) { - throw std::runtime_error("gene index fields should not be empty"); - } - values.push_back(current); - current = 0; - non_empty = false; - - } else if (x == '\n') { - break; - - } else { - throw std::runtime_error("gene index fields should be integers"); - } - } - - // Adding the last element. We don't do this inside the newline check, - // as we need to account for cases where the file is not newline-terminated. - if (!non_empty) { - throw std::runtime_error("gene index fields should not be empty"); - } - values.push_back(current); - - for (auto v : values) { - if (static_cast(v) >= nfeatures) { - throw std::runtime_error("gene index out of range"); - } - } - - auto& store = markers[first][second]; - if (!store.empty()) { - throw std::runtime_error("multiple marker sets listed for a single pairwise comparison"); - } - store.swap(values); // implicit clear of 'values', as store is empty. - } - - return markers; -} -/** - * @endcond - */ - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the marker lists. - * @param nfeatures Total number of features in the dataset. - * @param nlabels Number of labels in the dataset. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `Markers` object containing the markers from each pairwise comparison between labels. - * - * The file should contain one line per pairwise comparison between labels. - * Each line should at least 3 tab-delimited fields - the index of the first label, the index of the second label, - * and then the indices of the features selected as marker genes for the first label relative to the second. - * Any (non-zero) number of marker indices may be reported provided they are ordered by marker strength. - * The total number of lines in this file should be equal to the total number of pairwise comparisons between different labels, including permutations. - */ -template -Markers load_markers_from_text_file(const char* path, size_t nfeatures, size_t nlabels, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_markers_internal(nfeatures, nlabels, reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the marker lists. - * @param nfeatures Total number of features in the dataset. - * @param nlabels Number of labels in the dataset. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `Markers` object containing the markers from each pairwise comparison between labels. - * - * See `load_markers_from_text_file()` for details about the format. - */ -template -Markers load_markers_from_gzip_file(const char* path, size_t nfeatures, size_t nlabels, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_markers_internal(nfeatures, nlabels, reader); -} - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string containing the marker lists. - * @param len Length of the array for `buffer`. - * @param nfeatures Total number of features in the dataset. - * @param nlabels Number of labels in the dataset. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `Markers` object containing the markers from each pairwise comparison between labels. - * - * See `load_markers_from_text_file()` for details about the format. - */ -template -Markers load_markers_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t nfeatures, size_t nlabels, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_markers_internal(nfeatures, nlabels, reader); -} - -#endif - -} - -#endif diff --git a/inst/include/singlepp/macros.hpp b/inst/include/singlepp/macros.hpp deleted file mode 100644 index 8ac1ddd..0000000 --- a/inst/include/singlepp/macros.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef SINGLEPP_MACROS_HPP -#define SINGLEPP_MACROS_HPP - -/** - * @file macros.hpp - * - * @brief Set common macros used throughout **singlepp**. - * - * @details - * The `SINGLEPP_CUSTOM_PARALLEL` macro can be set to a function that specifies a custom parallelization scheme. - * This function should be a template that accept three arguments: - * - * - `fun`, a lambda that accepts three arguments: - * - `thread`, the thread number. - * - `start`, the index of the first job for this thread. - * - `length`, the number of jobs for this thread. - * - `njobs`, an integer specifying the number of jobs. - * - `nthreads`, an integer specifying the number of threads to use. - * - * The function should split `[0, njobs)` into any number of contiguous, non-overlapping intervals, and call `fun` on each interval, possibly in different threads. - * The details of the splitting and evaluation are left to the discretion of the developer defining the macro. - * The function should only return once all evaluations of `fun` are complete. - * - * If `SINGLEPP_CUSTOM_PARALLEL` is set, the following macros are also set (if they are not already defined): - * - * - `TATAMI_CUSTOM_PARALLEL`, from the [**tatami**](https://ltla.github.io/tatami) library. - * - * This ensures that any custom parallelization scheme is propagated to all of **singlepp**'s dependencies. - * If these libraries are used outside of **singlepp**, some care is required to ensure that the macros are consistently defined through the client library/application; - * otherwise, developers may observe ODR compilation errors. - */ - -// Synchronizing all parallelization schemes. -#ifdef SINGLEPP_CUSTOM_PARALLEL - -#ifndef TATAMI_CUSTOM_PARALLEL -#define TATAMI_CUSTOM_PARALLEL SINGLEPP_CUSTOM_PARALLEL -#endif - -#endif - -#endif diff --git a/inst/include/singlepp/process_features.hpp b/inst/include/singlepp/process_features.hpp deleted file mode 100644 index a8e533e..0000000 --- a/inst/include/singlepp/process_features.hpp +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef SINGLEPP_PROCESS_FEATURES_HPP -#define SINGLEPP_PROCESS_FEATURES_HPP - -#include "macros.hpp" - -#include "Markers.hpp" - -#include -#include -#include -#include - -namespace singlepp { - -typedef std::vector > Intersection; - -template -Intersection intersect_features(size_t mat_n, const Id* mat_id, size_t ref_n, const Id* ref_id) { - std::unordered_map > intersection; - intersection.reserve(mat_n); - for (size_t i = 0; i < mat_n; ++i) { - if (intersection.find(mat_id[i]) == intersection.end()) { // only using the first occurrence of each ID in mat_id. - intersection[mat_id[i]] = std::make_pair(i, -1); - } - } - - for (size_t i = 0; i < ref_n; ++i) { - auto it = intersection.find(ref_id[i]); - if (it != intersection.end()) { // only using the first occurrence of each ID in ref_id. - auto& target = (it->second).second; - if (target < 0) { - target = i; - } - } - } - - Intersection pairings; - pairings.reserve(intersection.size()); - for (const auto& x : intersection) { - if (x.second.second >= 0) { - pairings.push_back(x.second); - } - } - - return pairings; -} - -inline void subset_markers(Intersection& intersection, Markers& markers, int top) { - std::unordered_set available; - available.reserve(intersection.size()); - for (const auto& in : intersection) { - available.insert(in.second); - } - - // Figuring out the top markers to retain, that are _also_ in the intersection. - std::unordered_set all_markers; - all_markers.reserve(intersection.size()); - - for (size_t i = 0; i < markers.size(); ++i) { - for (size_t j = 0; j < markers[i].size(); ++j) { - auto& current = markers[i][j]; - - std::vector replacement; - size_t upper_bound = static_cast(top >= 0 ? top : -1); // in effect, no upper bound if top = -1. - replacement.reserve(top >= 0 ? static_cast(top) : current.size()); - - for (size_t k = 0; k < current.size() && replacement.size() < upper_bound; ++k) { - if (available.find(current[k]) != available.end()) { - all_markers.insert(current[k]); - replacement.push_back(current[k]); - } - } - current.swap(replacement); - } - } - - // Subsetting the intersection down to the chosen set of markers. - std::unordered_map mapping; - mapping.reserve(intersection.size()); - { - size_t counter = 0; - for (size_t i = 0; i < intersection.size(); ++i) { - if (all_markers.find(intersection[i].second) != all_markers.end()) { - intersection[counter] = intersection[i]; - mapping[intersection[i].second] = counter; - ++counter; - } - } - intersection.resize(counter); - } - - // Reindexing the markers. - for (size_t i = 0; i < markers.size(); ++i) { - for (size_t j = 0; j < markers[i].size(); ++j) { - auto& current = markers[i][j]; - for (size_t k = 0; k < current.size(); ++k) { - auto it = mapping.find(current[k]); - current[k] = it->second; - } - } - } - - return; -} - -// Use this method when the feature spaces are already identical. -inline std::vector subset_markers(Markers& markers, int top) { - std::unordered_set available; - for (size_t i = 0; i < markers.size(); ++i) { - for (size_t j = 0; j < markers[i].size(); ++j) { - auto& current = markers[i][j]; - if (top >= 0) { - current.resize(std::min(current.size(), static_cast(top))); - } - available.insert(current.begin(), current.end()); - } - } - - std::vector subset(available.begin(), available.end()); - std::sort(subset.begin(), subset.end()); - - std::unordered_map mapping; - mapping.reserve(subset.size()); - for (size_t i = 0; i < subset.size(); ++i) { - mapping[subset[i]] = i; - } - - // Reindexing the markers. - for (size_t i = 0; i < markers.size(); ++i) { - for (size_t j = 0; j < markers[i].size(); ++j) { - auto& current = markers[i][j]; - for (size_t k = 0; k < current.size(); ++k) { - auto it = mapping.find(current[k]); - current[k] = it->second; - } - } - } - - return subset; -} - -inline std::pair, std::vector > unzip(const Intersection& intersection) { - std::vector left(intersection.size()), right(intersection.size()); - for (size_t i = 0; i < intersection.size(); ++i) { - left[i] = intersection[i].first; - right[i] = intersection[i].second; - } - return std::make_pair(std::move(left), std::move(right)); -} - -} - -#endif diff --git a/inst/include/singlepp/scaled_ranks.hpp b/inst/include/singlepp/scaled_ranks.hpp deleted file mode 100644 index f5eb381..0000000 --- a/inst/include/singlepp/scaled_ranks.hpp +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef SINGLEPP_SCALED_RANKS_HPP -#define SINGLEPP_SCALED_RANKS_HPP - -#include "macros.hpp" - -#include -#include -#include -#include - -namespace singlepp { - -template -using RankedVector = std::vector >; - -// This class sanitizes any user-provided subsets so that we can provide a -// sorted and unique subset to the tatami extractor. We then undo the sorting -// to use the original indices in the rank filler. This entire thing is -// necessary as the behavior of the subsets isn't something that the user can -// easily control (e.g., if the reference/test datasets do not use the same -// feature ordering, in which case the subset is necessarily unsorted). -struct SubsetSorter { - bool use_sorted_subset = false; - const std::vector* original_subset; - std::vector sorted_subset, original_indices; - - SubsetSorter(const std::vector& sub) : original_subset(&sub) { - size_t num_subset = sub.size(); - for (size_t i = 1; i < num_subset; ++i) { - if (sub[i] <= sub[i-1]) { - use_sorted_subset = true; - break; - } - } - - if (use_sorted_subset) { - std::vector > store; - store.reserve(num_subset); - for (size_t i = 0; i < num_subset; ++i) { - store.emplace_back(sub[i], i); - } - - std::sort(store.begin(), store.end()); - sorted_subset.reserve(num_subset); - original_indices.resize(num_subset); - for (const auto& s : store) { - if (sorted_subset.empty() || sorted_subset.back() != s.first) { - sorted_subset.push_back(s.first); - } - original_indices[s.second] = sorted_subset.size() - 1; - } - } - } - - const std::vector& extraction_subset() const { - if (use_sorted_subset) { - return sorted_subset; - } else { - return *original_subset; - } - } - - void fill_ranks(const double* ptr, RankedVector& vec) const { - if (use_sorted_subset) { - size_t num = original_indices.size(); - for (size_t s = 0; s < num; ++s) { - vec[s].first = ptr[original_indices[s]]; - vec[s].second = s; - } - } else { - size_t num = original_subset->size(); - for (size_t s = 0; s < num; ++s, ++ptr) { - vec[s].first = *ptr; - vec[s].second = s; - } - } - std::sort(vec.begin(), vec.end()); - } -}; - -template -void scaled_ranks(const RankedVector& collected, double* outgoing) { - // Computing tied ranks. - size_t cur_rank = 0; - auto cIt = collected.begin(); - - while (cIt != collected.end()) { - auto copy = cIt; - ++copy; - double accumulated_rank = cur_rank; - ++cur_rank; - - while (copy != collected.end() && copy->first == cIt->first) { - accumulated_rank += cur_rank; - ++cur_rank; - ++copy; - } - - double mean_rank= accumulated_rank / (copy - cIt); - while (cIt!=copy) { - outgoing[cIt->second] = mean_rank; - ++cIt; - } - } - - // Mean-adjusting and converting to cosine values. - double sum_squares = 0; - size_t N = collected.size(); - const double center_rank = static_cast(N - 1)/2; - for (size_t i = 0 ; i < N; ++i) { - auto& o = outgoing[i]; - o -= center_rank; - sum_squares += o*o; - } - - // Special behaviour for no-variance cells; these are left as all-zero scaled ranks. - sum_squares = std::max(sum_squares, 0.00000001); - sum_squares = std::sqrt(sum_squares)*2; - for (size_t i = 0; i < N; ++i) { - outgoing[i] /= sum_squares; - } - - return; -} - -template -void subset_ranks(const RankedVector& x, RankedVector& output, const std::unordered_map& subset) { - for (size_t i = 0; i < x.size(); ++i) { - auto it = subset.find(x[i].second); - if (it != subset.end()) { - output.emplace_back(x[i].first, it->second); - } - } - return; -} - -template -void simplify_ranks(const RankedVector& x, RankedVector& output) { - if (x.size()) { - Index counter = 0; - auto last = x[0].first; - for (const auto& r : x) { - if (r.first != last) { - ++counter; - last = r.first; - } - output.emplace_back(counter, r.second); - } - } - return; -} - -} - -#endif diff --git a/inst/include/singlepp/singlepp.hpp b/inst/include/singlepp/singlepp.hpp deleted file mode 100644 index 21e35fa..0000000 --- a/inst/include/singlepp/singlepp.hpp +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef SINGLEPP_UMBRELLA_HPP -#define SINGLEPP_UMBRELLA_HPP - -/** - * @file singlepp.hpp - * - * @brief Umbrella header for all includes. - */ - -#include "Classifier.hpp" -#include "Markers.hpp" -#include "IntegratedScorer.hpp" -#include "IntegratedBuilder.hpp" -#include "ChooseClassicMarkers.hpp" -#include "load_references.hpp" - -/** - * @namespace singlepp - * Cell type classification using the SingleR algorithm in C++. - */ - -#endif diff --git a/inst/include/source.sh b/inst/include/source.sh deleted file mode 100755 index b36c255..0000000 --- a/inst/include/source.sh +++ /dev/null @@ -1,16 +0,0 @@ -# Fetches all of the header files. - -if [ ! -e source-singlepp ] -then - git clone https://github.com/LTLA/singlepp source-singlepp - cd source-singlepp -else - cd source-singlepp - git pull -fi - -git checkout 53bc74819c367db9d26c785de66206fe6c4f3890 -rm -rf ../singlepp -cp -r include/singlepp/ ../singlepp -git checkout master -cd - diff --git a/man/SingleR.Rd b/man/SingleR.Rd index ddc9c18..d706e0f 100644 --- a/man/SingleR.Rd +++ b/man/SingleR.Rd @@ -45,7 +45,7 @@ Row names may be different across entries but only the intersection will be used \item{labels}{A character vector or factor of known labels for all samples in \code{ref}. Alternatively, if \code{ref} is a list, \code{labels} should be a list of the same length. -Each element should contain a character vector or factor specifying the label for the corresponding entry of \code{ref}.} +Each element should contain a character vector or factor specifying the labels for the columns of the corresponding element of \code{ref}.} \item{method}{Deprecated.} diff --git a/man/classifySingleR.Rd b/man/classifySingleR.Rd index 51a71f8..13b8b9e 100644 --- a/man/classifySingleR.Rd +++ b/man/classifySingleR.Rd @@ -13,17 +13,20 @@ classifySingleR( sd.thresh = NULL, prune = TRUE, assay.type = "logcounts", - check.missing = TRUE, + check.missing = FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM = SerialParam() ) } \arguments{ \item{test}{A numeric matrix of single-cell expression values where rows are genes and columns are cells. +Each row should be named with the gene name. Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.} \item{trained}{A \linkS4class{List} containing the output of the \code{\link{trainSingleR}} function. +If the row names of \code{test} are not exactly the same as the reference dataset, the call to \code{trainSingleR} should set \code{test.genes=rownames(test)}. + Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multiple references.} \item{quantile}{A numeric scalar specifying the quantile of the correlation distribution to use to compute the score for each label.} @@ -38,8 +41,7 @@ Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multip \item{assay.type}{Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.} -\item{check.missing}{Logical scalar indicating whether rows should be checked for missing values. -If true and any missing values are found, the rows containing these values are silently removed.} +\item{check.missing}{Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.} \item{num.threads}{Integer scalar specifying the number of threads to use for classification.} diff --git a/man/combineRecomputedResults.Rd b/man/combineRecomputedResults.Rd index 19920e9..356b9da 100644 --- a/man/combineRecomputedResults.Rd +++ b/man/combineRecomputedResults.Rd @@ -10,9 +10,9 @@ combineRecomputedResults( trained, quantile = 0.8, assay.type.test = "logcounts", - check.missing = TRUE, - allow.lost = FALSE, + check.missing = FALSE, warn.lost = TRUE, + allow.lost = FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM = SerialParam() ) @@ -32,11 +32,11 @@ or (ii) running \code{trainSingleR} on each reference separately and manually ma \item{assay.type.test}{An integer scalar or string specifying the assay of \code{test} containing the relevant expression matrix, if \code{test} is a \linkS4class{SummarizedExperiment} object.} -\item{check.missing}{Logical scalar indicating whether rows should be checked for missing values (and if found, removed).} +\item{check.missing}{Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.} -\item{allow.lost}{Deprecated.} +\item{warn.lost}{Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references.} -\item{warn.lost}{Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references.} +\item{allow.lost}{Deprecated.} \item{num.threads}{Integer scalar specifying the number of threads to use for index building and classification.} @@ -51,7 +51,7 @@ This mimics the output of \code{\link{classifySingleR}} and contains the followi For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference; scores are not recomputed for the other labels. \item \code{labels}, a character vector containing the per-cell combined label across references. -\item \code{references}, an integer vector specifying the reference from which the combined label was derived. +\item \code{reference}, an integer vector specifying the reference from which the combined label was derived. \item \code{orig.results}, a DataFrame containing \code{results}. } It may also contain \code{pruned.labels} if these were also present in \code{results}. diff --git a/man/getClassicMarkers.Rd b/man/getClassicMarkers.Rd index a04662a..54a7927 100644 --- a/man/getClassicMarkers.Rd +++ b/man/getClassicMarkers.Rd @@ -21,13 +21,12 @@ In general, the expression values are expected to be log-transformed, see Detail Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix. -Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references, -in which case the row names are expected to be the same across all objects.} +Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references.} \item{labels}{A character vector or factor of known labels for all samples in \code{ref}. Alternatively, if \code{ref} is a list, \code{labels} should be a list of the same length. -Each element should contain a character vector or factor specifying the label for the corresponding entry of \code{ref}.} +Each element should contain a character vector or factor specifying the labels for the columns of the corresponding element of \code{ref}.} \item{assay.type}{An integer scalar or string specifying the assay of \code{ref} containing the relevant expression matrix, if \code{ref} is a \linkS4class{SummarizedExperiment} object (or is a list that contains one or more such objects).} diff --git a/man/trainSingleR.Rd b/man/trainSingleR.Rd index af34530..ca17f58 100644 --- a/man/trainSingleR.Rd +++ b/man/trainSingleR.Rd @@ -7,6 +7,7 @@ trainSingleR( ref, labels, + test.genes = NULL, genes = "de", sd.thresh = NULL, de.method = c("classic", "wilcox", "t"), @@ -31,13 +32,15 @@ In general, the expression values are expected to be log-transformed, see Detail Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix. -Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references, -in which case the row names are expected to be the same across all objects.} +Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references.} \item{labels}{A character vector or factor of known labels for all samples in \code{ref}. Alternatively, if \code{ref} is a list, \code{labels} should be a list of the same length. -Each element should contain a character vector or factor specifying the label for the corresponding entry of \code{ref}.} +Each element should contain a character vector or factor specifying the labels for the columns of the corresponding element of \code{ref}.} + +\item{test.genes}{Character vector of the names of the genes in the test dataset, i.e., the row names of \code{test} in \code{\link{classifySingleR}}. +If \code{NULL}, it is assumed that the test dataset and \code{ref} have the same genes in the same row order.} \item{genes}{A string containing \code{"de"}, indicating that markers should be calculated from \code{ref}. For back compatibility, other string values are allowed but will be ignored with a deprecation warning. @@ -83,11 +86,11 @@ if \code{ref} is a \linkS4class{SummarizedExperiment} object (or is a list that \item{check.missing}{Logical scalar indicating whether rows should be checked for missing values. If true and any missing values are found, the rows containing these values are silently removed.} -\item{approximate}{Logical scalar indicating whether a faster approximate method should be used to compute the quantile.} +\item{approximate}{Deprecated, use \code{BNPARAM} instead.} \item{num.threads}{Integer scalar specifying the number of threads to use for index building.} -\item{BNPARAM}{Deprecated and ignored.} +\item{BNPARAM}{A \linkS4class{BiocNeighborParam} object specifying how the neighbor search index should be constructed.} \item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed. Relevant for marker detection if \code{genes = NULL}, aggregation if \code{aggr.ref = TRUE}, and \code{NA} checking if \code{check.missing = TRUE}.} @@ -120,9 +123,15 @@ This is done by identifying the median expression within each label, and computi For each label, the top \code{de.n} genes with the largest differences compared to another label are chosen as markers to distinguish the two labels. The expression values are expected to be log-transformed and normalized. -If \code{restrict} is specified, \code{ref} is subsetted to only include the rows with names that are in \code{restrict}. -Marker selection and all subsequent classification will be performed using this restrictive subset of genes. -This can be convenient for ensuring that only appropriate genes are used (e.g., not pseudogenes or predicted genes). +Classification with \code{classifySingleR} assumes that the test dataset contains all marker genes that were detected from the reference. +If the test and reference datasets do not have the same genes in the same order, we can set \code{test.genes} to the row names of the test dataset. +This will instruct \code{trainSingleR} to only consider markers that are present in the test dataset. +Any subsequent call to \code{classifySingleR} will also check that \code{test.genes} is consistent with \code{rownames(test)}. + +On a similar note, if \code{restrict} is specified, marker selection will only be performed using the specified subset of genes. +This can be convenient for ignoring inappropriate genes like pseudogenes or predicted genes. +It has the same effect as filtering out undesirable rows from \code{ref} prior to calling \code{trainSingleR}. +Unlike \code{test.genes}, setting \code{restrict} does not introduce further checks on \code{rownames(test)} in \code{classifySingleR}. } \section{Custom feature specification}{ diff --git a/src/Makevars b/src/Makevars deleted file mode 100644 index 4f42038..0000000 --- a/src/Makevars +++ /dev/null @@ -1 +0,0 @@ -PKG_CPPFLAGS=-I../inst/include/ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6b96b6d..f6bdc17 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -10,6 +10,35 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// classify_integrated +SEXP classify_integrated(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build, double quantile, int nthreads); +RcppExport SEXP _SingleR_classify_integrated(SEXP testSEXP, SEXP resultsSEXP, SEXP integrated_buildSEXP, SEXP quantileSEXP, SEXP nthreadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::RObject >::type test(testSEXP); + Rcpp::traits::input_parameter< Rcpp::List >::type results(resultsSEXP); + Rcpp::traits::input_parameter< SEXP >::type integrated_build(integrated_buildSEXP); + Rcpp::traits::input_parameter< double >::type quantile(quantileSEXP); + Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); + rcpp_result_gen = Rcpp::wrap(classify_integrated(test, results, integrated_build, quantile, nthreads)); + return rcpp_result_gen; +END_RCPP +} +// classify_single +SEXP classify_single(Rcpp::RObject test, SEXP prebuilt, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads); +RcppExport SEXP _SingleR_classify_single(SEXP testSEXP, SEXP prebuiltSEXP, SEXP quantileSEXP, SEXP use_fine_tuneSEXP, SEXP fine_tune_thresholdSEXP, SEXP nthreadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::RObject >::type test(testSEXP); + Rcpp::traits::input_parameter< SEXP >::type prebuilt(prebuiltSEXP); + Rcpp::traits::input_parameter< double >::type quantile(quantileSEXP); + Rcpp::traits::input_parameter< bool >::type use_fine_tune(use_fine_tuneSEXP); + Rcpp::traits::input_parameter< double >::type fine_tune_threshold(fine_tune_thresholdSEXP); + Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); + rcpp_result_gen = Rcpp::wrap(classify_single(test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)); + return rcpp_result_gen; +END_RCPP +} // find_classic_markers Rcpp::List find_classic_markers(int nlabels, int ngenes, Rcpp::List labels, Rcpp::List ref, int de_n, int nthreads); RcppExport SEXP _SingleR_find_classic_markers(SEXP nlabelsSEXP, SEXP ngenesSEXP, SEXP labelsSEXP, SEXP refSEXP, SEXP de_nSEXP, SEXP nthreadsSEXP) { @@ -38,9 +67,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// integrate_build -SEXP integrate_build(Rcpp::IntegerVector test_features, Rcpp::List references, Rcpp::List ref_ids, Rcpp::List labels, Rcpp::List prebuilt, int nthreads); -RcppExport SEXP _SingleR_integrate_build(SEXP test_featuresSEXP, SEXP referencesSEXP, SEXP ref_idsSEXP, SEXP labelsSEXP, SEXP prebuiltSEXP, SEXP nthreadsSEXP) { +// train_integrated +SEXP train_integrated(Rcpp::IntegerVector test_features, Rcpp::List references, Rcpp::List ref_ids, Rcpp::List labels, Rcpp::List prebuilt, int nthreads); +RcppExport SEXP _SingleR_train_integrated(SEXP test_featuresSEXP, SEXP referencesSEXP, SEXP ref_idsSEXP, SEXP labelsSEXP, SEXP prebuiltSEXP, SEXP nthreadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type test_features(test_featuresSEXP); @@ -49,45 +78,33 @@ BEGIN_RCPP Rcpp::traits::input_parameter< Rcpp::List >::type labels(labelsSEXP); Rcpp::traits::input_parameter< Rcpp::List >::type prebuilt(prebuiltSEXP); Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); - rcpp_result_gen = Rcpp::wrap(integrate_build(test_features, references, ref_ids, labels, prebuilt, nthreads)); + rcpp_result_gen = Rcpp::wrap(train_integrated(test_features, references, ref_ids, labels, prebuilt, nthreads)); return rcpp_result_gen; END_RCPP } -// integrate_run -SEXP integrate_run(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build, double quantile, int nthreads); -RcppExport SEXP _SingleR_integrate_run(SEXP testSEXP, SEXP resultsSEXP, SEXP integrated_buildSEXP, SEXP quantileSEXP, SEXP nthreadsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::traits::input_parameter< Rcpp::RObject >::type test(testSEXP); - Rcpp::traits::input_parameter< Rcpp::List >::type results(resultsSEXP); - Rcpp::traits::input_parameter< SEXP >::type integrated_build(integrated_buildSEXP); - Rcpp::traits::input_parameter< double >::type quantile(quantileSEXP); - Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); - rcpp_result_gen = Rcpp::wrap(integrate_run(test, results, integrated_build, quantile, nthreads)); - return rcpp_result_gen; -END_RCPP -} -// prebuild -SEXP prebuild(Rcpp::RObject ref, Rcpp::IntegerVector labels, Rcpp::List markers, bool approximate, int nthreads); -RcppExport SEXP _SingleR_prebuild(SEXP refSEXP, SEXP labelsSEXP, SEXP markersSEXP, SEXP approximateSEXP, SEXP nthreadsSEXP) { +// train_single +SEXP train_single(Rcpp::IntegerVector test_features, Rcpp::RObject ref, Rcpp::IntegerVector ref_features, Rcpp::IntegerVector labels, Rcpp::List markers, Rcpp::RObject builder, int nthreads); +RcppExport SEXP _SingleR_train_single(SEXP test_featuresSEXP, SEXP refSEXP, SEXP ref_featuresSEXP, SEXP labelsSEXP, SEXP markersSEXP, SEXP builderSEXP, SEXP nthreadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type test_features(test_featuresSEXP); Rcpp::traits::input_parameter< Rcpp::RObject >::type ref(refSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type ref_features(ref_featuresSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type labels(labelsSEXP); Rcpp::traits::input_parameter< Rcpp::List >::type markers(markersSEXP); - Rcpp::traits::input_parameter< bool >::type approximate(approximateSEXP); + Rcpp::traits::input_parameter< Rcpp::RObject >::type builder(builderSEXP); Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); - rcpp_result_gen = Rcpp::wrap(prebuild(ref, labels, markers, approximate, nthreads)); + rcpp_result_gen = Rcpp::wrap(train_single(test_features, ref, ref_features, labels, markers, builder, nthreads)); return rcpp_result_gen; END_RCPP } -// get_subset -Rcpp::IntegerVector get_subset(SEXP built); -RcppExport SEXP _SingleR_get_subset(SEXP builtSEXP) { +// get_ref_subset +Rcpp::IntegerVector get_ref_subset(SEXP built); +RcppExport SEXP _SingleR_get_ref_subset(SEXP builtSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::traits::input_parameter< SEXP >::type built(builtSEXP); - rcpp_result_gen = Rcpp::wrap(get_subset(built)); + rcpp_result_gen = Rcpp::wrap(get_ref_subset(built)); return rcpp_result_gen; END_RCPP } @@ -101,32 +118,16 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// run -SEXP run(Rcpp::RObject test, Rcpp::IntegerVector subset, SEXP prebuilt, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads); -RcppExport SEXP _SingleR_run(SEXP testSEXP, SEXP subsetSEXP, SEXP prebuiltSEXP, SEXP quantileSEXP, SEXP use_fine_tuneSEXP, SEXP fine_tune_thresholdSEXP, SEXP nthreadsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::traits::input_parameter< Rcpp::RObject >::type test(testSEXP); - Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type subset(subsetSEXP); - Rcpp::traits::input_parameter< SEXP >::type prebuilt(prebuiltSEXP); - Rcpp::traits::input_parameter< double >::type quantile(quantileSEXP); - Rcpp::traits::input_parameter< bool >::type use_fine_tune(use_fine_tuneSEXP); - Rcpp::traits::input_parameter< double >::type fine_tune_threshold(fine_tune_thresholdSEXP); - Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); - rcpp_result_gen = Rcpp::wrap(run(test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { + {"_SingleR_classify_integrated", (DL_FUNC) &_SingleR_classify_integrated, 5}, + {"_SingleR_classify_single", (DL_FUNC) &_SingleR_classify_single, 6}, {"_SingleR_find_classic_markers", (DL_FUNC) &_SingleR_find_classic_markers, 6}, {"_SingleR_grouped_medians", (DL_FUNC) &_SingleR_grouped_medians, 4}, - {"_SingleR_integrate_build", (DL_FUNC) &_SingleR_integrate_build, 6}, - {"_SingleR_integrate_run", (DL_FUNC) &_SingleR_integrate_run, 5}, - {"_SingleR_prebuild", (DL_FUNC) &_SingleR_prebuild, 5}, - {"_SingleR_get_subset", (DL_FUNC) &_SingleR_get_subset, 1}, + {"_SingleR_train_integrated", (DL_FUNC) &_SingleR_train_integrated, 6}, + {"_SingleR_train_single", (DL_FUNC) &_SingleR_train_single, 7}, + {"_SingleR_get_ref_subset", (DL_FUNC) &_SingleR_get_ref_subset, 1}, {"_SingleR_is_valid_built", (DL_FUNC) &_SingleR_is_valid_built, 1}, - {"_SingleR_run", (DL_FUNC) &_SingleR_run, 7}, {NULL, NULL, 0} }; diff --git a/src/integrate_run.cpp b/src/classify_integrated.cpp similarity index 61% rename from src/integrate_run.cpp rename to src/classify_integrated.cpp index b787468..9fbc20d 100644 --- a/src/integrate_run.cpp +++ b/src/classify_integrated.cpp @@ -3,9 +3,9 @@ #include //[[Rcpp::export(rng=false)]] -SEXP integrate_run(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build, double quantile, int nthreads) { +SEXP classify_integrated(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build, double quantile, int nthreads) { Rtatami::BoundNumericPointer curtest(test); - IntegratedXPtr iptr(integrated_build); + TrainedIntegratedPointer iptr(integrated_build); // Setting up the previous results. std::vector previous_results_vec; @@ -25,27 +25,25 @@ SEXP integrate_run(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build Rcpp::IntegerVector best(ncells); Rcpp::NumericVector delta(ncells); + singlepp::ClassifyIntegratedBuffers buffers; + buffers.best = static_cast(best.begin()); + buffers.delta = static_cast(delta.begin()); + size_t nrefs = iptr->num_references(); Rcpp::NumericMatrix scores(ncells, nrefs); - std::vector scores_ptr(nrefs); if (nrefs) { - scores_ptr[0] = static_cast(scores.begin()); + buffers.scores.resize(nrefs); + buffers.scores[0] = static_cast(scores.begin()); for (size_t l = 1; l < nrefs; ++l) { - scores_ptr[l] = scores_ptr[l-1] + ncells; + buffers.scores[l] = buffers.scores[l-1] + ncells; } } // Running the integrated scoring. - singlepp::IntegratedScorer scorer; - scorer.set_num_threads(nthreads).set_quantile(quantile); - scorer.run( - curtest->ptr.get(), - previous_results, - *iptr, - static_cast(best.begin()), - scores_ptr, - static_cast(delta.begin()) - ); + singlepp::ClassifyIntegratedOptions opts; + opts.num_threads = nthreads; + opts.quantile = quantile; + singlepp::classify_integrated(*(curtest->ptr), previous_results, *iptr, buffers, opts); return Rcpp::List::create( Rcpp::Named("best") = best, diff --git a/src/classify_single.cpp b/src/classify_single.cpp new file mode 100644 index 0000000..56275c1 --- /dev/null +++ b/src/classify_single.cpp @@ -0,0 +1,44 @@ +#include "utils.h" // must be before raticate, singlepp includes. + +#include + +//' @importFrom Rcpp sourceCpp +//' @useDynLib SingleR +//[[Rcpp::export(rng=false)]] +SEXP classify_single(Rcpp::RObject test, SEXP prebuilt, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads) { + Rtatami::BoundNumericPointer parsed(test); + TrainedSingleIntersectPointer built(prebuilt); + + // Setting up outputs. + size_t ncells = parsed->ptr->ncol(); + Rcpp::IntegerVector best(ncells); + Rcpp::NumericVector delta(ncells); + + singlepp::ClassifySingleBuffers buffers; + buffers.best = static_cast(best.begin()); + buffers.delta = static_cast(delta.begin()); + + size_t nlabels = built->num_labels(); + Rcpp::NumericMatrix scores(ncells, nlabels); + if (nlabels) { + buffers.scores.resize(nlabels); + buffers.scores[0] = static_cast(scores.begin()); + for (size_t l = 1; l < nlabels; ++l) { + buffers.scores[l] = buffers.scores[l-1] + ncells; + } + } + + // Running the analysis. + singlepp::ClassifySingleOptions opts; + opts.num_threads = nthreads; + opts.quantile = quantile; + opts.fine_tune = use_fine_tune; + opts.fine_tune_threshold = fine_tune_threshold; + singlepp::classify_single_intersect(*(parsed->ptr), *built, buffers, opts); + + return Rcpp::List::create( + Rcpp::Named("best") = best, + Rcpp::Named("scores") = scores, + Rcpp::Named("delta") = delta + ); +} diff --git a/src/find_classic_markers.cpp b/src/find_classic_markers.cpp index 87f8835..51e75f1 100644 --- a/src/find_classic_markers.cpp +++ b/src/find_classic_markers.cpp @@ -34,9 +34,10 @@ Rcpp::List find_classic_markers(int nlabels, int ngenes, Rcpp::List labels, Rcpp lab_ptrs.push_back(static_cast(lab_vec.back().begin())); } - singlepp::ChooseClassicMarkers mrk; - mrk.set_number(de_n).set_num_threads(nthreads); - auto store = mrk.run(ref_ptrs, lab_ptrs); + singlepp::ChooseClassicMarkersOptions opts; + opts.number = de_n; + opts.num_threads = nthreads; + auto store = singlepp::choose_classic_markers(ref_ptrs, lab_ptrs, opts); // Returning everything in R space. Rcpp::List output(nlabels); diff --git a/src/integrate_build.cpp b/src/integrate_build.cpp deleted file mode 100644 index 7f05490..0000000 --- a/src/integrate_build.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include "utils.h" // must be before all other includes. - -#include -#include - -//[[Rcpp::export(rng=false)]] -SEXP integrate_build(Rcpp::IntegerVector test_features, Rcpp::List references, Rcpp::List ref_ids, Rcpp::List labels, Rcpp::List prebuilt, int nthreads) { - singlepp::IntegratedBuilder builder; - builder.set_num_threads(nthreads); - - size_t nrefs = references.size(); - std::vector holding_labs; - holding_labs.reserve(nrefs); - - for (size_t r = 0; r < nrefs; ++r) { - Rcpp::RObject curref(references[r]); - auto parsed = Rtatami::BoundNumericPointer(curref); - - Rcpp::IntegerVector curids(ref_ids[r]); - holding_labs.emplace_back(labels[r]); - Rcpp::RObject built = prebuilt[r]; - PrebuiltXPtr curbuilt(built); - - builder.add( - test_features.size(), - static_cast(test_features.begin()), - parsed->ptr.get(), - static_cast(curids.begin()), - static_cast(holding_labs.back().begin()), - *curbuilt - ); - } - - auto finished = builder.finish(); - return IntegratedXPtr(new singlepp::IntegratedReferences(std::move(finished)), true); -} diff --git a/src/prebuild.cpp b/src/prebuild.cpp deleted file mode 100644 index 444d4a3..0000000 --- a/src/prebuild.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include "utils.h" - -#include -#include - -//' @importFrom Rcpp sourceCpp -//' @useDynLib SingleR -//[[Rcpp::export(rng=false)]] -SEXP prebuild(Rcpp::RObject ref, Rcpp::IntegerVector labels, Rcpp::List markers, bool approximate, int nthreads) { - singlepp::BasicBuilder builder; - builder.set_num_threads(nthreads); - - // Use all available markers; assume subsetting was applied on the R side. - builder.set_top(-1).set_approximate(approximate); - - // Setting up the markers. - singlepp::Markers markers2(markers.size()); - for (size_t m = 0; m < markers.size(); ++m) { - Rcpp::List curmarkers(markers[m]); - auto& curmarkers2 = markers2[m]; - curmarkers2.resize(curmarkers.size()); - - for (size_t n = 0; n < curmarkers.size(); ++n) { - Rcpp::IntegerVector seq(curmarkers[n]); - auto& seq2 = curmarkers2[n]; - seq2.insert(seq2.end(), seq.begin(), seq.end()); - } - } - - // Building the indices. - auto parsed = Rtatami::BoundNumericPointer(ref); - auto built = builder.run(parsed->ptr.get(), static_cast(labels.begin()), std::move(markers2)); - - // Moving it into the external pointer. - return PrebuiltXPtr(new singlepp::BasicBuilder::Prebuilt(std::move(built)), true); -} - -//[[Rcpp::export(rng=false)]] -Rcpp::IntegerVector get_subset(SEXP built) { - PrebuiltXPtr ptr(built); - return Rcpp::IntegerVector(ptr->subset.begin(), ptr->subset.end()); -} - -//[[Rcpp::export(rng=false)]] -Rcpp::LogicalVector is_valid_built(SEXP built) { - return Rf_ScalarLogical(!!R_ExternalPtrAddr(built)); -} diff --git a/src/run.cpp b/src/run.cpp deleted file mode 100644 index b490897..0000000 --- a/src/run.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "utils.h" // must be before raticate, singlepp includes. - -#include - -//' @importFrom Rcpp sourceCpp -//' @useDynLib SingleR -//[[Rcpp::export(rng=false)]] -SEXP run(Rcpp::RObject test, Rcpp::IntegerVector subset, SEXP prebuilt, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads) { - auto parsed = Rtatami::BoundNumericPointer(test); - PrebuiltXPtr built(prebuilt); - - // Setting up outputs. - size_t ncells = parsed->ptr->ncol(); - Rcpp::IntegerVector best(ncells); - Rcpp::NumericVector delta(ncells); - - size_t nlabels = built->num_labels(); - Rcpp::NumericMatrix scores(ncells, nlabels); - std::vector scores_ptr(nlabels); - if (nlabels) { - scores_ptr[0] = static_cast(scores.begin()); - for (size_t l = 1; l < nlabels; ++l) { - scores_ptr[l] = scores_ptr[l-1] + ncells; - } - } - - // Running the analysis. - singlepp::BasicScorer runner; - runner.set_num_threads(nthreads); - runner.set_quantile(quantile).set_fine_tune(use_fine_tune).set_fine_tune_threshold(fine_tune_threshold); - runner.run( - parsed->ptr.get(), - *built, - static_cast(subset.begin()), - static_cast(best.begin()), - scores_ptr, - static_cast(delta.begin()) - ); - - return Rcpp::List::create( - Rcpp::Named("best") = best, - Rcpp::Named("scores") = scores, - Rcpp::Named("delta") = delta - ); -} diff --git a/src/train_integrated.cpp b/src/train_integrated.cpp new file mode 100644 index 0000000..0705b33 --- /dev/null +++ b/src/train_integrated.cpp @@ -0,0 +1,39 @@ +#include "utils.h" // must be before all other includes. + +#include +#include + +//[[Rcpp::export(rng=false)]] +SEXP train_integrated(Rcpp::IntegerVector test_features, Rcpp::List references, Rcpp::List ref_ids, Rcpp::List labels, Rcpp::List prebuilt, int nthreads) { + size_t nrefs = references.size(); + + std::vector > inputs; + inputs.reserve(nrefs); + std::vector holding_labs; + holding_labs.reserve(nrefs); + + for (size_t r = 0; r < nrefs; ++r) { + Rcpp::RObject curref(references[r]); + Rtatami::BoundNumericPointer parsed(curref); + + Rcpp::IntegerVector curids(ref_ids[r]); + holding_labs.emplace_back(labels[r]); + Rcpp::RObject built = prebuilt[r]; + TrainedSingleIntersectPointer curbuilt(built); + + inputs.push_back(singlepp::prepare_integrated_input_intersect( + static_cast(test_features.size()), + static_cast(test_features.begin()), + *(parsed->ptr), + static_cast(curids.begin()), + static_cast(holding_labs.back().begin()), + *curbuilt + )); + } + + singlepp::TrainIntegratedOptions opts; + opts.num_threads = nthreads; + auto finished = singlepp::train_integrated(std::move(inputs), opts); + + return TrainedIntegratedPointer(new TrainedIntegrated(std::move(finished)), true); +} diff --git a/src/train_single.cpp b/src/train_single.cpp new file mode 100644 index 0000000..77aa41a --- /dev/null +++ b/src/train_single.cpp @@ -0,0 +1,57 @@ +#include "utils.h" +#include "BiocNeighbors.h" + +#include +#include + +//' @importFrom Rcpp sourceCpp +//' @useDynLib SingleR +//[[Rcpp::export(rng=false)]] +SEXP train_single(Rcpp::IntegerVector test_features, Rcpp::RObject ref, Rcpp::IntegerVector ref_features, Rcpp::IntegerVector labels, Rcpp::List markers, Rcpp::RObject builder, int nthreads) { + singlepp::TrainSingleOptions opts; + opts.num_threads = nthreads; + opts.top = -1; // Use all available markers; assume subsetting was applied on the R side. + + BiocNeighbors::BuilderPointer bptr(builder); + opts.trainer = std::shared_ptr(std::shared_ptr{}, bptr.get()); // make a no-op shared pointer. + + // Setting up the markers. We assume that these are already 0-indexed on the R side. + singlepp::Markers markers2(markers.size()); + for (size_t m = 0; m < markers.size(); ++m) { + Rcpp::List curmarkers(markers[m]); + auto& curmarkers2 = markers2[m]; + curmarkers2.resize(curmarkers.size()); + + for (size_t n = 0; n < curmarkers.size(); ++n) { + Rcpp::IntegerVector seq(curmarkers[n]); + auto& seq2 = curmarkers2[n]; + seq2.insert(seq2.end(), seq.begin(), seq.end()); + } + } + + // Building the indices. + Rtatami::BoundNumericPointer parsed(ref); + auto built = singlepp::train_single_intersect( + static_cast(test_features.size()), + static_cast(test_features.begin()), + *(parsed->ptr), + static_cast(ref_features.begin()), + static_cast(labels.begin()), + std::move(markers2), + opts + ); + + return TrainedSingleIntersectPointer(new TrainedSingleIntersect(std::move(built)), true); +} + +//[[Rcpp::export(rng=false)]] +Rcpp::IntegerVector get_ref_subset(SEXP built) { + TrainedSingleIntersectPointer ptr(built); + const auto& rsub = ptr->get_ref_subset(); + return Rcpp::IntegerVector(rsub.begin(), rsub.end()); +} + +//[[Rcpp::export(rng=false)]] +Rcpp::LogicalVector is_valid_built(SEXP built) { + return Rf_ScalarLogical(!!R_ExternalPtrAddr(built)); +} diff --git a/src/utils.h b/src/utils.h index 4cd1173..3e9f97e 100644 --- a/src/utils.h +++ b/src/utils.h @@ -2,16 +2,15 @@ #define UTILS_H #include "Rcpp.h" -#include "Rtatami.h" +#include "Rtatami.h" // before singlepp includes to ensure the tatami_r::parallelize() override is set. +#include "singlepp/singlepp.hpp" -// must be before singlepp includes. -#define SINGLEPP_CUSTOM_PARALLEL tatami_r::parallelize -#define __ERROR_PRINTER_OVERRIDE__ REprintf // avoid R CMD check warnings about stderr in Annoy. +typedef singlepp::TrainedSingleIntersect TrainedSingleIntersect; -#include "singlepp/singlepp.hpp" +typedef Rcpp::XPtr TrainedSingleIntersectPointer; -typedef Rcpp::XPtr PrebuiltXPtr; +typedef singlepp::TrainedIntegrated TrainedIntegrated; -typedef Rcpp::XPtr IntegratedXPtr; +typedef Rcpp::XPtr TrainedIntegratedPointer; #endif diff --git a/tests/testthat/test-SingleR.R b/tests/testthat/test-SingleR.R index 5b0b06a..37f6e43 100644 --- a/tests/testthat/test-SingleR.R +++ b/tests/testthat/test-SingleR.R @@ -70,10 +70,11 @@ test_that("SingleR handles DelayedArray inputs", { }) test_that("SingleR works with multiple references", { - # Handles mismatching row names. - chosen0 <- sample(rownames(training), 900) - chosen1 <- sample(rownames(training), 900) - chosen2 <- sample(rownames(training), 900) + # Handles mismatching row names. Note that the sorting is necessary + # to ensure that tied genes are handled in a consistent way. + chosen0 <- sort(sample(rownames(training), 900)) + chosen1 <- sort(sample(rownames(training), 900)) + chosen2 <- sort(sample(rownames(training), 900)) # Works with recomputation. out <- SingleR(test[chosen0,], list(training[chosen1,], training[chosen2,]), diff --git a/tests/testthat/test-classify.R b/tests/testthat/test-classify.R index 4ad16e0..2537ef5 100644 --- a/tests/testthat/test-classify.R +++ b/tests/testthat/test-classify.R @@ -79,18 +79,6 @@ test_that("classifySingleR behaves with no-variance cells", { expect_identical(out$labels[-(1:10)], ref$labels[-(1:10)]) }) -test_that("classifySingleR behaves with missing values", { - # Can't just set the first entry to NA, as we need to ensure - # that the test set contains a superset of genes in the training set. - sce <- BiocGenerics::rbind(test[1,], test) - logcounts(sce)[1,1] <- NA - - Q <- 0.8 - out <- classifySingleR(sce, trained, fine.tune=FALSE, quantile=Q) - ref <- classifySingleR(test, trained, fine.tune=FALSE, quantile=Q) - expect_identical(out, ref) -}) - test_that("classifySingleR works with multiple references", { training1 <- training2 <- training training1 <- training1[sample(nrow(training1)),] @@ -98,15 +86,23 @@ test_that("classifySingleR works with multiple references", { mtrain <- trainSingleR(list(training1, training2), list(training1$label, training2$label)) out <- classifySingleR(test, mtrain) + expect_identical(names(out$orig.results), c("ref1", "ref2")) + expect_true(all(out$reference %in% 1:2)) ref1 <- classifySingleR(test, mtrain[[1]]) ref2 <- classifySingleR(test, mtrain[[2]]) expect_identical(out, combineRecomputedResults(list(ref1, ref2), test, mtrain)) + + # Preserves names of the references themselves. + mtrain <- trainSingleR(list(foo=training1, bar=training2), list(training1$label, training2$label)) + out <- classifySingleR(test, mtrain) + expect_identical(names(out$orig.results), c("foo", "bar")) + expect_true(all(out$reference %in% 1:2)) }) test_that("classifySingleR behaves with silly inputs", { out <- classifySingleR(test[,0], trained, fine.tune=FALSE) expect_identical(nrow(out$scores), 0L) expect_identical(length(out$labels), 0L) - expect_error(classifySingleR(test[0,], trained, fine.tune=FALSE), "does not contain") + expect_error(classifySingleR(test[0,], trained, fine.tune=FALSE), "expected 'rownames(test)' to be the same", fixed=TRUE) }) diff --git a/tests/testthat/test-recomputed.R b/tests/testthat/test-recomputed.R index 4c75c3f..be75d4c 100644 --- a/tests/testthat/test-recomputed.R +++ b/tests/testthat/test-recomputed.R @@ -19,11 +19,11 @@ test <- .mockTestData(ref) test <- scuttle::logNormCounts(test) ref1 <- scuttle::logNormCounts(ref1) -train1 <- trainSingleR(ref1, labels=ref1$label) +train1 <- trainSingleR(ref1, labels=ref1$label, test.genes=rownames(test)) pred1 <- classifySingleR(test, train1) ref2 <- scuttle::logNormCounts(ref2) -train2 <- trainSingleR(ref2, labels=ref2$label) +train2 <- trainSingleR(ref2, labels=ref2$label, test.genes=rownames(test)) pred2 <- classifySingleR(test, train2) test_that("combineRecomputedResults works as expected (light check)", { @@ -100,47 +100,28 @@ test_that("combineRecomputedResults handles mismatches to rows and cells", { trained=list(train1, train2)), "not identical") colnames(test) <- NULL - # Correctly reorders the gene universes. - ref <- combineRecomputedResults( - results=list(pred1, pred2), - test=test, - trained=list(train1, train2)) - + # Responds to mismatches in the genes. s <- sample(nrow(test)) - out <- combineRecomputedResults( + expect_error(combineRecomputedResults( results=list(pred1, pred2), test=test[s,], - trained=list(train1, train2)) - expect_equal(ref, out) + trained=list(train1, train2)), "test.genes") }) test_that("combineRecomputedResults emits warnings when missing genes are present", { + half <- nrow(test) / 2 + # Spiking in some missing genes. - ref1b <- ref1[c(1, seq_len(nrow(ref1))),] - rownames(ref1b)[1] <- "BLAH" - markers1 <- train1$markers$full - markers1$A$B <- c(markers1$A$B, "BLAH") - train1b <- trainSingleR(ref1b, labels=ref1$label, genes=markers1) - - ref2b <- ref2[c(1, seq_len(nrow(ref2))),] - rownames(ref2b)[1] <- "WHEE" - markers2 <- train2$markers$full - markers2$A$B <- c(markers2$a$b, "WHEE") - train2b <- trainSingleR(ref2b, labels=ref2$label, genes=markers2) - - expect_error(out <- combineRecomputedResults( - results=list(pred1, pred2), - test=test, - trained=list(train1b, train2b)), "should be present") + ref1b <- ref1[seq_len(half),,drop=FALSE] + train1b <- trainSingleR(ref1b, labels=ref1$label, test.genes=rownames(test)) - test2 <- test[c(1,seq_len(nrow(test)),1),] - rownames(test2)[1] <- "WHEE" - rownames(test2)[length(rownames(test2))] <- "BLAH" + ref2b <- ref2[half + seq_len(half),] + train2b <- trainSingleR(ref2b, labels=ref2$label, test.genes=rownames(test)) expect_warning(out <- combineRecomputedResults( results=list(pred1, pred2), - test=test2, - trained=list(train1b, train2b)), "differ in the universe") + test=test, + trained=list(train1b, train2b)), "available in each reference") }) test_that("combineRecomputedResults is invariant to ordering", { diff --git a/tests/testthat/test-train.R b/tests/testthat/test-train.R index 03ead0f..f216fdd 100644 --- a/tests/testthat/test-train.R +++ b/tests/testthat/test-train.R @@ -128,7 +128,8 @@ test_that("trainSingleR strips out NAs", { out <- trainSingleR(sce, sce$label) ref <- trainSingleR(sce[-1,], sce$label) - expect_identical(out$ref, ref$ref) + expect_identical(as.matrix(out$ref), ref$ref) + expect_identical(out$markers, ref$markers) }) test_that("trainSingleR behaves with multiple references, plus recomputation", { @@ -136,16 +137,17 @@ test_that("trainSingleR behaves with multiple references, plus recomputation", { training1 <- training1[sample(nrow(training1)),] rownames(training1) <- rownames(training) - set.seed(1000) ref1 <- trainSingleR(training1, training1$label) ref2 <- trainSingleR(training2, training2$label) - - set.seed(1000) - out <- trainSingleR(list(training1, training2), list(training1$label, training2$label), recompute=TRUE) + out <- trainSingleR(list(training1, training2), list(training1$label, training2$label)) except.built <- setdiff(names(ref1), "built") expect_identical(ref1[except.built], out[[1]][except.built]) expect_identical(ref2[except.built], out[[2]][except.built]) + + # Same result with names. + out <- trainSingleR(list(foo=training1, bar=training2), list(training1$label, training2$label)) + expect_identical(names(out), c("foo", "bar")) }) test_that("trainSingleR behaves with aggregation turned on", {