From 3bfe934a7f8d9a6e159d94af2619e3676cf32280 Mon Sep 17 00:00:00 2001 From: Johannes Rainer Date: Wed, 25 Sep 2024 07:50:48 +0200 Subject: [PATCH] docs: restructure documentation --- R/AllGenerics.R | 10 - R/MsBackend.R | 4 +- R/Spectra-estimatePrecursorMz.R | 4 + R/Spectra-functions.R | 42 +- R/Spectra-neutralLoss.R | 4 + R/Spectra.R | 1870 +++++++++++++++++++++++++--- R/countIdentifications.R | 4 + R/peaks-functions.R | 8 +- man/MsBackend.Rd | 4 +- man/Spectra.Rd | 1889 ++--------------------------- man/countIdentifications.Rd | 3 + man/estimatePrecursorIntensity.Rd | 1 + man/estimatePrecursorMz.Rd | 3 + man/hidden_aliases.Rd | 27 - man/joinPeaks.Rd | 7 +- man/neutralLoss.Rd | 21 +- man/processingChunkSize.Rd | 10 +- 17 files changed, 1931 insertions(+), 1980 deletions(-) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 0b69bdaf..5ec6d054 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -5,42 +5,32 @@ NULL setMethod("bin", "numeric", MsCoreUtils::bin) setGeneric("combinePeaks", function(object, ...) standardGeneric("combinePeaks")) -#' @rdname hidden_aliases setGeneric("containsMz", function(object, ...) standardGeneric("containsMz")) -#' @rdname hidden_aliases setGeneric("containsNeutralLoss", function(object, ...) standardGeneric("containsNeutralLoss")) setGeneric("dataStorageBasePath", function(object, ...) standardGeneric("dataStorageBasePath")) setGeneric("dataStorageBasePath<-", function(object, ..., value) standardGeneric("dataStorageBasePath<-")) -#' @rdname hidden_aliases setGeneric("dropNaSpectraVariables", function(object, ...) standardGeneric("dropNaSpectraVariables")) -#' @rdname hidden_aliases setGeneric("entropy", function(object, ...) standardGeneric("entropy")) -#' @rdname hidden_aliases setGeneric("export", function(object, ...) standardGeneric("export")) setGeneric("filterFourierTransformArtefacts", function(object, ...) standardGeneric("filterFourierTransformArtefacts")) -#' @rdname neutralLoss setGeneric("neutralLoss", function(object, param, ...) standardGeneric("neutralLoss")) -#' @rdname hidden_aliases setGeneric("pickPeaks", function(object, ...) standardGeneric("pickPeaks")) setGeneric("plotSpectraMirror", function(x, y, ...) standardGeneric("plotSpectraMirror")) -#' @rdname hidden_aliases setGeneric("replaceIntensitiesBelow", function(object, threshold = min, ...) standardGeneric("replaceIntensitiesBelow")) -#' @rdname hidden_aliases setGeneric("reset", function(object, ...) standardGeneric("reset")) -#' @rdname hidden_aliases setGeneric("selectSpectraVariables", function(object, ...) standardGeneric("selectSpectraVariables")) setGeneric("Spectra", function(object, ...) standardGeneric("Spectra")) diff --git a/R/MsBackend.R b/R/MsBackend.R index eae122b1..b89e5303 100644 --- a/R/MsBackend.R +++ b/R/MsBackend.R @@ -702,7 +702,7 @@ #' #' The parameters are: #' - `object`: an instance of the `MsBackendMzR` class. -#' - `x`: the [Spectra-class] object to be exported. +#' - `x`: the [Spectra] object to be exported. #' - `file`: `character` with the (full) output file name(s). Should be #' of length 1 or equal `length(x)`. If a single file is specified, all #' spectra are exported to that file. Alternatively it is possible to specify @@ -715,7 +715,7 @@ #' backend and if `dataOrigin(x)` contains the original MS data file names. #' - `BPPARAM`: parallel processing settings. #' -#' See examples in [Spectra-class] or the vignette for more details and +#' See examples in [Spectra] or the vignette for more details and #' examples. 
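+#'
+#' A minimal sketch of exporting a `Spectra` with this backend (assuming
+#' the *msdata* package with its example mzML files is available; the
+#' output is written to a temporary file) could look like:
+#'
+#' ```
+#' fls <- dir(system.file("sciex", package = "msdata"), full.names = TRUE)
+#' sps <- Spectra(fls, backend = MsBackendMzR())
+#' ## Export all spectra to a single mzML file
+#' export(sps, backend = MsBackendMzR(),
+#'     file = tempfile(fileext = ".mzML"))
+#' ```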
#' #' The `MsBackendMzR` ignores parameter `columns` of the `peaksData()` diff --git a/R/Spectra-estimatePrecursorMz.R b/R/Spectra-estimatePrecursorMz.R index 72743d57..ad6ff630 100644 --- a/R/Spectra-estimatePrecursorMz.R +++ b/R/Spectra-estimatePrecursorMz.R @@ -55,6 +55,10 @@ #' #' @author Mar Garcia-Aloy, Johannes Rainer #' +#' @seealso +#' +#' [addProcessing()] for other data analysis and manipulation functions. +#' #' @export #' #' @examples diff --git a/R/Spectra-functions.R b/R/Spectra-functions.R index 517452c4..033a2b2d 100644 --- a/R/Spectra-functions.R +++ b/R/Spectra-functions.R @@ -207,7 +207,7 @@ NULL #' @export applyProcessing #' -#' @rdname Spectra +#' @rdname addProcessing applyProcessing <- function(object, f = processingChunkFactor(object), BPPARAM = bpparam(), ...) { queue <- object@processingQueue @@ -538,14 +538,14 @@ applyProcessing <- function(object, f = processingChunkFactor(object), #' @export concatenateSpectra #' -#' @rdname Spectra +#' @rdname combineSpectra concatenateSpectra <- function(x, ...) { .concatenate_spectra(unlist(unname(list(unname(x), ...)))) } #' @export combineSpectra #' -#' @rdname Spectra +#' @rdname combineSpectra combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage, FUN = combinePeaksData, ..., BPPARAM = bpparam()) { if (!is.factor(f)) @@ -622,7 +622,7 @@ combineSpectra <- function(x, f = x$dataStorage, p = x$dataStorage, #' @export joinSpectraData #' -#' @rdname Spectra +#' @rdname combineSpectra joinSpectraData <- function(x, y, by.x = "spectrumId", by.y, @@ -685,7 +685,7 @@ joinSpectraData <- function(x, y, #' @export #' -#' @rdname Spectra +#' @rdname addProcessing processingLog <- function(x) { x@processing } @@ -831,9 +831,7 @@ chunkapply <- function(x, FUN, ..., chunkSize = 1000L, chunks = factor()) { as.factor(rep(1:ceiling(len / chunkSize), each = chunkSize)[seq_len(len)]) } -#' @rdname Spectra -#' -#' @author Nir Shahaf, Johannes Rainer +#' @rdname filterMsLevel #' #' @export deisotopeSpectra <- @@ -845,9 +843,7 @@ deisotopeSpectra <- substDefinition = im, charge = charge) } -#' @rdname Spectra -#' -#' @author Nir Shahaf, Johannes Rainer +#' @rdname filterMsLevel #' #' @export reduceSpectra <- function(x, tolerance = 0, ppm = 20) { @@ -856,9 +852,7 @@ reduceSpectra <- function(x, tolerance = 0, ppm = 20) { addProcessing(x, .peaks_reduce, tolerance = tolerance, ppm = ppm) } -#' @rdname Spectra -#' -#' @author Nir Shahaf +#' @rdname filterMsLevel #' #' @export filterPrecursorMaxIntensity <- function(x, tolerance = 0, ppm = 20) { @@ -891,9 +885,7 @@ filterPrecursorMaxIntensity <- function(x, tolerance = 0, ppm = 20) { x } -#' @rdname Spectra -#' -#' @author Nir Shahaf +#' @rdname filterMsLevel #' #' @export filterPrecursorIsotopes <- @@ -926,9 +918,7 @@ filterPrecursorIsotopes <- x } -#' @rdname Spectra -#' -#' @author Johannes Rainer +#' @rdname addProcessing #' #' @export scalePeaks <- function(x, by = sum, msLevel. = uniqueMsLevels(x)) { @@ -941,7 +931,7 @@ scalePeaks <- function(x, by = sum, msLevel. = uniqueMsLevels(x)) { x } -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @export filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, @@ -992,6 +982,11 @@ filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, #' per file parallel processing if `f` or `chunkSize` is not defined. #' Other on-disk backends: only if requested by the user. #' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. +#' +#' @param object `Spectra` object. 
+#' #' @param x `Spectra` object. #' #' @param chunkSize `integer` defining the size of chunks into which `x` should @@ -1067,6 +1062,11 @@ filterPrecursorPeaks <- function(object, tolerance = 0, ppm = 20, #' For these, the `backendBpparam()` function will always return a #' `SerialParam()` independently on how parallel processing was defined. #' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. +#' +#' @param object `Spectra` object. +#' #' @param x `Spectra`. #' #' @param value `integer(1)` defining the chunk size. diff --git a/R/Spectra-neutralLoss.R b/R/Spectra-neutralLoss.R index 53f3b2b5..dc9cf32c 100644 --- a/R/Spectra-neutralLoss.R +++ b/R/Spectra-neutralLoss.R @@ -87,6 +87,10 @@ setClassUnion("functionOrNull", c("function", "NULL")) #' Analysis in METLIN. Journal of the American Society for Mass Spectrometry. #' \doi{10.1021/jasms.1c00343} #' +#' @seealso +#' +#' [addProcessing()] for other data analysis and manipulation functions. +#' #' @examples #' #' ## Create a simple example Spectra object with some MS1, MS2 and MS3 spectra. diff --git a/R/Spectra.R b/R/Spectra.R index d3da9b44..179ee58c 100644 --- a/R/Spectra.R +++ b/R/Spectra.R @@ -9,12 +9,12 @@ NULL #' @title The Spectra class to manage and access MS data #' -#' @name Spectra-class +#' @name Spectra #' +#' @aliases Spectra-class #' @aliases Spectra #' @aliases setBackend #' @aliases export -#' @aliases dataStorageBasePath #' #' @description #' @@ -29,12 +29,52 @@ NULL #' #' Documentation on other topics and functionality of `Spectra`can be found in: #' -#' LLLLLLL add links to individual documentations. +#' - [spectraData()] for accessing and using MS data through `Spectra` objects. +#' - [filterMsLevel()] to subset and filter `Spectra` objects. +#' - [plotSpectra()] for visualization of `Spectra` orbjects. #' - [processingChunkSize()] for information on parallel and chunk-wise data #' processing. -#' - [plotSpectra()] for visualization of `Spectra`. -#' - [spectraData()] for accessing and using MS data through `Spectra`. +#' - [combineSpectra()] for merging, aggregating and splitting of `Spectra` +#' objects. +#' - [combinePeaks()] for merging and aggregating `Spectra`'s mass peaks data. +#' - [addProcessing()] for data analysis functions. +#' - [compareSpectra()] for spectra similarity calculations. +#' +#' @param backend For `Spectra()`: [MsBackend-class] to be used as backend. See +#' section on creation of `Spectra` objects for details. For `setBackend()`: +#' instance of [MsBackend-class] that supports `setBackend()` (i.e. for +#' which `supportsSetBackend()` returns `TRUE`). Such backends have a +#' parameter `data` in their `backendInitialize()` function that support +#' passing the full spectra data to the initialize method. See section on +#' creation of `Spectra` objects for details. +#' For `export()`: [MsBackend-class] to be used to export the data. #' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. +#' +#' @param f For `setBackend()`: factor defining how to split the data +#' for parallelized copying of the spectra data to the new backend. For +#' some backends changing this parameter can lead to errors. Defaults to +#' [processingChunkFactor()]. +#' +#' @param metadata For `Spectra()`: optional `list` with metadata information. 
+#' +#' @param object For `Spectra()`: an object to instantiate the `Spectra` +#' object and initialize the with data.. See section on creation of +#' `Spectra` objects for details. For all other methods a `Spectra` object. +#' +#' @param processingQueue For `Spectra()`: optional `list` of +#' [ProcessingStep-class] objects. +#' +#' @param source For `Spectra()`: instance of [MsBackend-class] that can be +#' used to import spectrum data from the provided files. See section +#' *Creation of objects* for more details. +#' +#' @param value For `dataStorageBasePath()`: A `character` vector that defines +#' the base directory where the data storage files can be found. +#' +#' @param ... Additional arguments. #' #' @section Data stored in a `Spectra` object: #' @@ -175,11 +215,12 @@ NULL #' The `Spectra` class uses by default a lazy data manipulation strategy, #' i.e. data manipulations such as performed with `replaceIntensitiesBelow()` #' are not applied immediately to the data, but applied on-the-fly to the -#' spectrum data once it is retrieved. For some backends that allow to write -#' data back to the data storage (such as the [MsBackendMemory()], -#' [MsBackendDataFrame()] and [MsBackendHdf5Peaks()]) it is possible to apply -#' to queue with the `applyProcessing` function. See the *Data manipulation and -#' analysis *methods* section below for more details. +#' spectrum data once it is retrieved. This enables data manipulation +#' operations also for *read only* data representations. For some backends that +#' allow to write data back to the data storage (such as the +#' [MsBackendMemory()], [MsBackendDataFrame()] and [MsBackendHdf5Peaks()]) it +#' is possible to apply to queue with the [applyProcessing()] function (see +#' the [applyProcessing()] function for details). #' #' Clarifications regarding scan/acquisition numbers and indices: #' @@ -208,7 +249,7 @@ NULL #' #' @examples #' -#' ## ---- CREATION OF SPECTRA OBJECTS ---- +#' ## -------- CREATION OF SPECTRA OBJECTS -------- #' #' ## Create a Spectra providing a `DataFrame` containing the spectrum data. #' @@ -227,7 +268,7 @@ NULL #' sciex #' #' -#' ## ---- CHANGING DATA REPRESENTATIONS ---- +#' ## -------- CHANGING DATA REPRESENTATIONS -------- #' #' ## The MS data is on disk and will be read into memory on-demand. We can #' ## however change the backend to a MsBackendMemory backend which will @@ -264,7 +305,7 @@ NULL #' head(dataOrigin(sciex_im)) #' #' -#' ## ---- DATA EXPORT ---- +#' ## -------- DATA EXPORT -------- #' #' ## Some `MsBackend` classes provide an `export()` method to export the data #' ## to the file format supported by the backend. @@ -293,10 +334,11 @@ NULL #' #' mz(res) #' mz(data) +NULL #' The Spectra class #' -#' The [Spectra-class] encapsulates data and meta-data for mass +#' The [Spectra] class encapsulates data and meta-data for mass #' spectrometry experiments. 
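+#'
+#' Data manipulations on a `Spectra` are by default cached in the
+#' `processingQueue` slot (see below) and only applied to the peaks data
+#' on-the-fly when that data is accessed. A minimal sketch of this
+#' behaviour (assuming an in-memory backend, which also supports
+#' `applyProcessing()`):
+#'
+#' ```
+#' spd <- DataFrame(msLevel = c(1L, 1L))
+#' spd$mz <- list(c(100.1, 103.2), c(45.6, 120.4))
+#' spd$intensity <- list(c(200, 400), c(12.3, 15.2))
+#' s <- Spectra(spd)
+#'
+#' ## The filter operation is only added to the processing queue
+#' s <- filterIntensity(s, intensity = c(100, Inf))
+#' length(s@processingQueue)
+#'
+#' ## ... and gets applied when the peaks data is accessed
+#' intensity(s)
+#'
+#' ## applyProcessing() writes the modified data back to the backend
+#' ## and empties the queue
+#' s <- applyProcessing(s)
+#' length(s@processingQueue)
+#' ```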
#' #' @slot backend A derivate of [MsBackend-class] holding/controlling the spectra @@ -378,7 +420,7 @@ setMethod("show", "Spectra", } }) -#' @rdname Spectra-class +#' @rdname Spectra setMethod("Spectra", "missing", function(object, processingQueue = list(), metadata = list(), ..., backend = MsBackendMemory(), @@ -389,7 +431,7 @@ setMethod("Spectra", "missing", function(object, processingQueue = list(), else callNextMethod() }) -#' @rdname Spectra-class +#' @rdname Spectra setMethod("Spectra", "MsBackend", function(object, processingQueue = list(), metadata = list(), ..., BPPARAM = bpparam()) { @@ -397,7 +439,7 @@ setMethod("Spectra", "MsBackend", function(object, processingQueue = list(), backend = object) }) -#' @rdname Spectra-class +#' @rdname Spectra #' #' @importFrom methods callNextMethod setMethod("Spectra", "character", function(object, processingQueue = list(), @@ -413,7 +455,7 @@ setMethod("Spectra", "character", function(object, processingQueue = list(), else sp }) -#' @rdname Spectra-class +#' @rdname Spectra setMethod("Spectra", "ANY", function(object, processingQueue = list(), metadata = list(), source = MsBackendMemory(), @@ -439,7 +481,7 @@ setMethod("Spectra", "ANY", function(object, processingQueue = list(), backend = backend) } -#' @rdname Spectra-class +#' @rdname Spectra #' #' @importMethodsFrom ProtGenerics setBackend #' @@ -488,7 +530,7 @@ setMethod( object }) -#' @rdname Spectra-class +#' @rdname Spectra #' #' @export setMethod("export", "Spectra", @@ -498,12 +540,12 @@ setMethod("export", "Spectra", export(backend, object, ...) }) -#' @rdname Spectra-class +#' @rdname Spectra setMethod("dataStorageBasePath", "Spectra", function(object) { dataStorageBasePath(object@backend) }) -#' @rdname Spectra-class +#' @rdname Spectra setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { dataStorageBasePath(object@backend) <- value object @@ -524,6 +566,7 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' @aliases collisionEnergy #' @aliases dataOrigin #' @aliases dataStorage +#' @aliases intensity #' @aliases ionCount #' @aliases isCentroided #' @aliases isEmpty @@ -550,7 +593,7 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' #' @description #' -#' As detailed in the documentation of the [Spectra-class], a `Spectra` object +#' As detailed in the documentation of the [Spectra] class, a `Spectra` object #' is a container for mass spectrometry (MS) data that includes both the mass #' peaks data (or *peaks data*, generally *m/z* and intensity values) as well #' as spectra metadata (so called *spectra variables*). Spectra variables @@ -563,6 +606,52 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' used by the `Spectra` to represent the data, data can also be added or #' replaced (again, using dedicated functions or using `$<-`). #' +#' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. See also [processingChunkSize()] for more information +#' on parallel processing. +#' +#' @param columns For `spectraData()` accessor: optional `character` with +#' column names (spectra variables) that should be included in the +#' returned `DataFrame`. By default, all columns are returned. +#' For `peaksData()` accessor: optional `character` with requested columns +#' in the individual `matrix` of the returned `list`. 
Defaults to +#' `c("mz", "value")` but any values returned by `peaksVariables(object)` +#' with `object` being the `Spectra` object are supported. +#' +#' @param f For `intensity()`, `mz()` and `peaksData()`: factor defining how +#' data should be chunk-wise loaded an processed. Defaults to +#' [processingChunkFactor()]. +#' +#' @param i For `asDataFrame()`: A `numeric` indicating which scans to coerce +#' to a `DataFrame` (default is `seq_along(object)`). +#' +#' @param initial For `tic()`: `logical(1)` whether the initially +#' reported total ion current should be reported, or whether the +#' total ion current should be (re)calculated on the actual data +#' (`initial = FALSE`, same as `ionCount()`). +#' +#' @param j For `[`: not supported. +#' +#' @param name For `$` and `$<-`: the name of the spectra variable to return +#' or set. +#' +#' @param object A `Spectra` object. +#' +#' @param spectraVars `character()` indicating what spectra variables to add to +#' the `DataFrame`. Default is `spectraVariables(object)`, i.e. all +#' available variables. +#' +#' @param use.names For `lengths()`: ignored. +#' +#' @param value A vector with values to replace the respective spectra +#' variable. Needs to be of the correct data type for the spectra variable. +#' +#' @param x A `Spectra` object. +#' +#' @param ... Additional arguments. +#' +#' #' @section Spectra variables: #' #' A common set of *core spectra variables* are defined for `Spectra`. These @@ -642,7 +731,7 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' The set of available functions to extract data from, or set data in, a #' `Spectra` object are (in alphabetical order) listed below. Note that there #' are also other functions to extract information from a `Spectra` object -#' documented LLLLLLL +#' documented in [addProcessing()]. #' #' - `$`, `$<-`: gets (or sets) a spectra variable for all spectra in `object`. #' See examples for details. Note that replacing values of a peaks variable @@ -651,10 +740,16 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' [applyProcessing()] needs to be called first to apply all cached data #' operations. #' +#' - `[[`, `[[<-`: access or set/add a single spectrum variable (column) in the +#' backend. +#' #' - `acquisitionNum()`: returns the acquisition number of each #' spectrum. Returns an `integer` of length equal to the number of #' spectra (with `NA_integer_` if not available). #' +#' - `asDataFrame()`: converts the `Spectra` to a `DataFrame` (in long format) +#' contining all data. Returns a `DataFrame`. +#' #' - `centroided()`, `centroided<-`: gets or sets the centroiding #' information of the spectra. `centroided()` returns a `logical` #' vector of length equal to the number of spectra with `TRUE` if a @@ -814,9 +909,16 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' #' @md #' +#' @seealso +#' +#' - [addProcessing()] for functions to analyze `Spectra`. +#' +#' - [Spectra] for a general description of the `Spectra` object. +#' #' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail #' #' @examples +#' #' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk #' ## backend. #' sciex_file <- dir(system.file("sciex", package = "msdata"), @@ -854,17 +956,6 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' s <- Spectra(spd) #' s #' -#' ## Get the peak data (m/z and intensity values). 
-#' pks <- peaksData(s) -#' pks -#' pks[[1]] -#' pks[[2]] -#' -#' ## Note that we could get the same resulb by coercing the `Spectra` to -#' ## a `list` or `SimpleList`: -#' as(data, "list") -#' as(data, "SimpleList") -#' #' ## List all available spectra variables (i.e. spectrum data and metadata). #' spectraVariables(s) #' @@ -888,10 +979,26 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' s$spectrum_id #' #' ## Extract specific spectra variables. -#' spectraData(s columns = c("spectrum_id", "msLevel")) +#' spectraData(s, columns = c("spectrum_id", "msLevel")) +#' #' +#' ## -------- PEAKS VARIABLES AND DATA -------- +#' +#' ## Get the peak data (m/z and intensity values). +#' pks <- peaksData(s) +#' pks +#' pks[[1]] +#' pks[[2]] +#' +#' ## Note that we could get the same resulb by coercing the `Spectra` to +#' ## a `list` or `SimpleList`: +#' as(s, "list") +#' as(s, "SimpleList") #' -#' ## ---- PEAKS VARIABLES AND DATA ---- +#' ## Or use `mz()` and `intensity()` to extract the m/z and intensity values +#' ## separately +#' mz(s) +#' intensity(s) #' #' ## Some `MsBackend` classes provide support for arbitrary peaks variables #' ## (in addition to the mandatory `"mz"` and `"intensity"` values. Below @@ -919,6 +1026,8 @@ setReplaceMethod("dataStorageBasePath", "Spectra", function(object, value) { #' #' ## Access just the pk_ann variable #' sps$pk_ann +#' +#' NULL #' @importFrom methods setAs @@ -933,10 +1042,6 @@ setAs("Spectra", "SimpleList", function(from, to) { #' @export #' #' @rdname spectraData -#' -#' @param spectraVars `character()` indicating what spectra variables to add to -#' the `DataFrame`. Default is `spectraVariables(object)`, i.e. all -#' available variables. asDataFrame <- function(object, i = seq_along(object), spectraVars = spectraVariables(object)) { stopifnot(inherits(object, "Spectra")) @@ -1285,6 +1390,38 @@ setReplaceMethod("$", "Spectra", function(x, name, value) { x }) +#' @rdname spectraData +#' +#' @export +setMethod("[[", "Spectra", function(x, i, j, ...) { + if (!is.character(i)) + stop("'i' is supposed to be a character defining the spectra ", + "variable to access.") + if (!missing(j)) + stop("'j' is not supported.") + if (!(i %in% c(spectraVariables(x), "mz", "intensity"))) + stop("No spectra variable '", i, "' available") + if (i == "mz") + mz(x) + else if (i == "intensity") + intensity(x) + else + do.call("[[", list(x@backend, i)) +}) + +#' @rdname spectraData +#' +#' @export +setReplaceMethod("[[", "Spectra", function(x, i, j, ..., value) { + if (!is.character(i)) + stop("'i' is supposed to be a character defining the spectra ", + "variable to replace or create.") + if (!missing(j)) + stop("'j' is not supported.") + x@backend <- do.call("[[<-", list(x@backend, i = i, value = value)) + x +}) + ################################################################################ ## @@ -1292,18 +1429,236 @@ setReplaceMethod("$", "Spectra", function(x, name, value) { ## ################################################################################ -#' @title Merging, splitting and aggregating Spectra +#' @title Merging, aggregating and splitting Spectra #' -#' @rdname Spectra +#' @name combineSpectra +#' +#' @aliases combineSpectra +#' @aliases split +#' @aliases joinSpectraData +#' +#' @description +#' +#' Various functions are availabe to combine, aggregate or split data from one +#' of more `Spectra` objects. These are: +#' +#' - `c()` and `concatenateSpectra()`: combines several `Spectra` objects into +#' a single object. 
The resulting `Spectra` contains all data from all +#' individual `Spectra`, i.e. the union of all their spectra variables. +#' Concatenation will fail if the processing queue of any of the `Spectra` +#' objects is not empty or if different backends are used for the `Spectra` +#' objects. In such cases it is suggested to first change the backends of +#' all `Spectra` to the same type of backend (using the [setBackend()] +#' function and to eventually (if needed) apply the processing queue using +#' the [applyProcessing()] function. +#' +#' - `combineSpectra()`: combines sets of spectra (defined with parameter `f`) +#' into a single spectrum per set aggregating their MS data (i.e. their +#' *peaks data* matrices with the *m/z* and intensity values of their +#' mass peaks). The spectra variable values of the first spectrum per set +#' are reported for the combined spectrum. The peak matrices of the spectra +#' per set are combined using the function specified with parameter `FUN` +#' which uses by default the [combinePeaksData()] function. See the +#' documentation of [combinePeaksData()] for details on the aggregation of +#' the peak data and the package vignette for examples. +#' The sets of spectra can be specified with parameter `f` which is expected +#' to be a `factor` or `vector` of length equal to the length of the +#' `Spectra` specifying to which set a spectrum belongs to. The function +#' returns a `Spectra` of length equal to the unique levels of `f`. The +#' optional parameter `p` allows to define how the `Spectra` should be +#' split for potential parallel processing. The default is +#' `p = x$dataStorage` and hence a per storage file parallel processing is +#' applied for `Spectra` with on disk data representations (such as the +#' [MsBackendMzR()]). This also prevents that spectra from different data +#' files/samples are combined (eventually use e.g. `p = x$dataOrigin` or any +#' other spectra variables defining the originating samples for a spectrum). +#' Before combining the peaks data, all eventual present processing steps are +#' applied (by calling [applyProcessing()] on the `Spectra`). This function +#' will replace the original *m/z* and intensity values of a `Spectra` hence +#' it can not be called on a `Spectra` with a *read-only* backend. In such +#' cases, the backend should be changed to a *writeable* backend before +#' using the [setBackend()] function (to e.g. a [MsBackendMemory()] backend). +#' +#' - `joinSpectraData()`: Individual spectra variables can be directly +#' added with the `$<-` or `[[<-` syntax. The `joinSpectraData()` +#' function allows to merge a `DataFrame` to the existing spectra +#' data of a `Spectra`. This function diverges from the [merge()] method in +#' two main ways: +#' - The `by.x` and `by.y` column names must be of length 1. +#' - If variable names are shared in `x` and `y`, the spectra +#' variables of `x` are not modified. It's only the `y` +#' variables that are appended with the suffix defined in +#' `suffix.y`. This is to avoid modifying any core spectra +#' variables that would lead to an invalid object. +#' - Duplicated Spectra keys (i.e. `x[[by.x]]`) are not +#' allowed. Duplicated keys in the `DataFrame` (i.e `y[[by.y]]`) +#' throw a warning and only the last occurrence is kept. These +#' should be explored and ideally be removed using for +#' `QFeatures::reduceDataFrame()`, `PMS::reducePSMs()` or similar +#' functions. 
+#' +#' - `split()`: splits the `Spectra` object based on parameter `f` into a `list` +#' of `Spectra` objects. +#' +#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more +#' information. This is passed directly to the [backendInitialize()] method +#' of the [MsBackend-class]. +#' +#' @param by.x A `character(1)` specifying the spectra variable used +#' for merging. Default is `"spectrumId"`. +#' +#' @param by.y A `character(1)` specifying the column used for +#' merging. Set to `by.x` if missing. +#' +#' @param drop For `split()`: not considered. +#' +#' @param f For `split()`: factor defining how to split `x`. See [base::split()] +#' for details. +#' For `combineSpectra()`: `factor` defining the grouping of the spectra +#' that should be combined. Defaults to `x$dataStorage`. +#' +#' @param FUN For `combineSpectra()`: function to combine the (peak matrices) +#' of the spectra. Defaults to [combinePeaksData()]. +#' +#' @param p For `combineSpectra()`: `factor` defining how to split the input +#' `Spectra` for parallel processing. Defaults to `x$dataStorage`, i.e., +#' depending on the used backend, per-file parallel processing will be +#' performed. +#' +#' @param suffix.y A `character(1)` specifying the suffix to be used +#' for making the names of columns in the merged spectra variables +#' unique. This suffix will be used to amend `names(y)`, while +#' `spectraVariables(x)` will remain unchanged. +#' +#' @param x A `Spectra` object. +#' +#' @param y A `DataFrame` with the spectra variables to join/add. +#' +#' @param ... Additional arguments. +#' +#' @seealso +#' +#' - [combinePeaks()] for functions to aggregate mass peaks data. +#' +#' - [Spectra] for a general description of the `Spectra` object. #' #' @importFrom MsCoreUtils vapply1c #' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' +#' @examples +#' +#' ## Create a Spectra providing a `DataFrame` containing a MS data. +#' +#' spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2)) +#' spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2)) +#' spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8)) +#' +#' s <- Spectra(spd) +#' s +#' +#' ## Create a second Spectra from mzML files and use the `MsBackendMzR` +#' ## on-disk backend. +#' sciex_file <- dir(system.file("sciex", package = "msdata"), +#' full.names = TRUE) +#' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +#' sciex +#' +#' ## Subset to the first 100 spectra to reduce running time of the examples +#' sciex <- sciex[1:100] +#' +#' +#' ## -------- COMBINE SPECTRA -------- +#' +#' ## Combining the `Spectra` object `s` with the MS data from `sciex`. +#' ## Calling directly `c(s, sciex)` would result in an error because +#' ## both backends use a different backend. We thus have to first change +#' ## the backends to the same backend. We change the backend of the `sciex` +#' ## `Spectra` to a `MsBackendMemory`, the backend used by `s`. 
+#' +#' sciex <- setBackend(sciex, MsBackendMemory()) +#' +#' ## Combine the two `Spectra` +#' all <- c(s, sciex) +#' all +#' +#' ## The new `Spectra` objects contains the union of spectra variables from +#' ## both: +#' spectraVariables(all) +#' +#' ## The spectra variables that were not present in `s`: +#' setdiff(spectraVariables(all), spectraVariables(s)) +#' +#' ## The values for these were filled with missing values for spectra from +#' ## `s`: +#' all$peaksCount |> head() +#' +#' +#' ## -------- AGGREGATE SPECTRA -------- +#' +#' ## Sets of spectra can be combined into a single, representative spectrum +#' ## per set using `combineSpectra()`. This aggregates the peaks data (i.e. +#' ## the spectra's m/z and intensity values) while using the values for all +#' ## spectra variables from the first spectrum per set. Below we define the +#' ## sets as all spectra measured in the *same second*, i.e. rounding their +#' ## retention time to the next closer integer value. +#' f <- round(rtime(sciex)) +#' head(f) +#' +#' cmp <- combineSpectra(sciex, f = f) +#' +#' ## The length of `cmp` is now equal to the length of unique levels in `f`: +#' length(cmp) +#' +#' ## The spectra variable value from the first spectrum per set is used in +#' ## the representative/combined spectrum: +#' cmp$rtime +#' +#' ## The peaks data was aggregated: the number of mass peaks of the first six +#' ## spectra from the original `Spectra`: +#' lengths(sciex) |> head() +#' +#' ## and for the first aggreagated spectra: +#' lengths(cmp) |> head() +#' +#' ## The default peaks data aggregation method joins all mass peaks. See +#' ## documentation of the `combinePeaksData()` function for more options. +#' +#' +#' ## -------- SPLITTING DATA -------- +#' +#' ## A `Spectra` can be split into a `list` of `Spectra` objects using the +#' ## `split()` function defining the sets into which the `Spectra` should +#' ## be splitted into with parameter `f`. +#' sciex_split <- split(sciex, f) +#' +#' length(sciex_split) +#' sciex_split |> head() +#' +#' +#' ## -------- ADDING SPECTRA DATA -------- +#' +#' ## Adding new spectra variables +#' sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) +#' spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging +#' var1 = rnorm(10), +#' var2 = sample(letters, 10)) +#' spv +#' +#' sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") +#' +#' spectraVariables(sciex2) +#' spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] +NULL + +#' @rdname combineSpectra +#' #' @exportMethod c setMethod("c", "Spectra", function(x, ...) { .concatenate_spectra(unname(list(unname(x), ...))) }) -#' @rdname Spectra +#' @rdname combineSpectra setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) { bcknds <- split(x@backend, f, ...) lapply(bcknds, function(b) { @@ -1313,77 +1668,111 @@ setMethod("split", "Spectra", function(x, f, drop = FALSE, ...) { }) - ################################################################################ ## -## Filtering, subsetting Spectra: subsetting Spectra and its data content. 
+## Aggregating peaks data ## ################################################################################ -#' @title Filtering and subsetting Spectra objects +#' @title Aggregating and combining mass peaks data #' -#' @aliases [,Spectra-method - -#' @rdname Spectra -setMethod("dropNaSpectraVariables", "Spectra", function(object) { - object@backend <- dropNaSpectraVariables(object@backend) - object -}) - -#' @rdname Spectra -setMethod( - "selectSpectraVariables", "Spectra", - function(object, spectraVariables = union(spectraVariables(object), - peaksVariables(object))) { - spectraVariables <- union(spectraVariables, "dataStorage") - object@backend <- selectSpectraVariables( - object@backend, spectraVariables = spectraVariables) - object - }) - - -#' @rdname Spectra +#' @name combinePeaks #' -#' @export -setMethod("[[", "Spectra", function(x, i, j, ...) { - if (!is.character(i)) - stop("'i' is supposed to be a character defining the spectra ", - "variable to access.") - if (!missing(j)) - stop("'j' is not supported.") - if (!(i %in% c(spectraVariables(x), "mz", "intensity"))) - stop("No spectra variable '", i, "' available") - if (i == "mz") - mz(x) - else if (i == "intensity") - intensity(x) - else - do.call("[[", list(x@backend, i)) -}) - -#' @rdname Spectra +#' @description #' -#' @export -setReplaceMethod("[[", "Spectra", function(x, i, j, ..., value) { - if (!is.character(i)) - stop("'i' is supposed to be a character defining the spectra ", - "variable to replace or create.") - if (!missing(j)) - stop("'j' is not supported.") - x@backend <- do.call("[[<-", list(x@backend, i = i, value = value)) - x -}) - - -#' @rdname Spectra -setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { - if (!missing(j)) - stop("Subsetting 'Spectra' by columns is not (yet) supported") - if (missing(i)) - return(x) - slot(x, "backend", check = FALSE) <- x@backend[i = i] - x -}) +#' In addition to aggregating content of spectra variables (describe in +#' [combineSpectra()]) it is also possible to aggregate and combine mass peaks +#' data from individual spectra within a `Spectra`. These `combinePeaks()` +#' function combines mass peaks **within each spectrum** with a difference in +#' their m/z values that is smaller than the maximal acceptable difference +#' defined by `ppm` and `tolerance`. Parameters `intensityFun` and `mzFun` +#' allow to define functions to aggregate the intensity and m/z values for +#' each such group of peaks. With `weighted = TRUE` (the default), the m/z +#' value of the combined peak is calculated using an intensity-weighted mean +#' and parameter `mzFun` is ignored. The [MsCoreUtils::group()] function is +#' used for the grouping of mass peaks. Parameter `msLevel.` allows to define +#' selected MS levels for which peaks should be combined. This function +#' returns a `Spectra` with the same number of spectra than the input object, +#' but with possibly combined peaks within each spectrum. +#' Additional peak variables (other than `"mz"` and `"intensity"`) are +#' dropped (i.e. their values are replaced with `NA`) for combined peaks +#' unless they are constant across the combined peaks. See also +#' [reduceSpectra()] for a function to select a single *representative* +#' mass peak for each peak group. +#' +#' @param intensityFun Function to aggregate intensities for all peaks in +#' each peak group into a single intensity value. +#' +#' @param msLevel. 
`integer` defining the MS level(s) of the spectra to which +#' the function should be applied (defaults to all MS levels of `object`. +#' +#' @param mzFun Function to aggregate m/z values for all mass peaks within +#' each peak group into a single m/z value. This parameter is ignored if +#' `weighted = TRUE` (the default). +#' +#' @param object A `Spectra` object. +#' +#' @param ppm `numeric(1)` defining a relative, m/z-dependent, maximal +#' accepted difference between m/z values for peaks to be grouped. Default +#' is `ppm = 20`. +#' +#' @param tolerance `numeric(1)` allowing to define a constant maximal +#' accepted difference between m/z values for peaks to be grouped. Default +#' is `tolerance = 0`. +#' +#' @param weighted `logical(1)` whether m/z values of peaks within each peak +#' group should be aggregated into a single m/z value using an +#' intensity-weighted mean. Defaults to `weighted = TRUE`. +#' +#' @param ... ignored. +#' +#' @md +#' +#' @seealso +#' +#' - [combineSpectra()] for functions to combine or aggregate `Spectra`'s +#' spectra data. +#' +#' - [combinePeaksData()] for the function to combine the mass peaks data. +#' +#' - [reduceSpectra()] and similar functions to filter mass peaks data. +#' +#' - [Spectra] for a general description of the `Spectra` object. +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' +#' @examples +#' +#' ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk +#' ## backend. +#' sciex_file <- dir(system.file("sciex", package = "msdata"), +#' full.names = TRUE) +#' sciex <- Spectra(sciex_file, backend = MsBackendMzR()) +#' +#' ## Combine mass peaks per spectrum with a difference in their m/z value +#' ## that is smaller than 20 ppm. The intensity values of such peaks are +#' ## combined by summing their values, while for the m/z values the median +#' ## is reported +#' sciex_comb <- combinePeaks(sciex, ppm = 20, +#' intensityFun = sum, mzFun = median) +#' +#' ## Comparing the number of mass peaks before and after aggregation +#' lengths(sciex) |> head() +#' lengths(sciex_comb) |> head() +#' +#' ## Plotting the first spectrum before and after aggregation +#' par(mfrow = c(1, 2)) +#' plotSpectra(sciex[2L]) +#' plotSpectra(sciex_comb[2L]) +#' +#' ## Using `reduceSpectra()` to keep for each group of mass peaks with a +#' ## difference in their m/z values < 20ppm the one with the highest intensity. +#' sciex_red <- reduceSpectra(sciex, ppm = 20) +#' +#' ## Comparing the number of mass peaks before and after the operation +#' lengths(sciex) |> head() +#' lengths(sciex_red) |> head() +NULL #' @rdname hidden_aliases setMethod("combinePeaks", "list", function(object, ...) { @@ -1394,7 +1783,7 @@ setMethod("combinePeaks", "list", function(object, ...) { combinePeaksData(object, ...) }) -#' @rdname Spectra +#' @rdname combinePeaks #' #' @exportMethod combinePeaks setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, @@ -1413,7 +1802,620 @@ setMethod("combinePeaks", "Spectra", function(object, tolerance = 0, ppm = 20, object }) -#' @rdname Spectra + +################################################################################ +## +## Filtering, subsetting Spectra: subsetting Spectra and its data content. 
+## +################################################################################ + +#' @title Filter and subset Spectra objects +#' +#' @name filterMsLevel +#' +#' @aliases [,Spectra-method +#' @aliases filterAcquisitionNum +#' @aliases filterDataOrigin +#' @aliases filterDataStorage +#' @aliases filterEmptySpectra +#' @aliases filterIsolationWindow +#' @aliases filterMsLevel +#' @aliases filterPolarity +#' @aliases filterPrecursorCharge +#' @aliases filterPrecursorIsotopes +#' @aliases filterPrecursorMzRange +#' @aliases filterPrecursorMzValues +#' @aliases filterPrecursorScan +#' @aliases filterRanges +#' @aliases filterRt +#' @aliases filterValues +#' @aliases dropNaSpectraVariables +#' @aliases selectSpectraVariables +#' @aliases filterIntensity +#' @aliases filterMzRange +#' @aliases filterMzValues +#' @aliases reduceSpectra +#' +#' @description +#' +#' A variety of functions to filter or subset `Spectra` objects are available. +#' These can be generally separated into two main classes: I) *classical* +#' subset operations that immediately reduce the number of spectra in the +#' object and II) filters that reduce the **content** of the object without +#' changing its length (i.e. the number of spectra). The latter can be further +#' subdivided into functions that affect the content of the `spectraData` (i.e. +#' the general spectrum metadata) and those that reduce the content of the +#' object's `peaksData` (i.e. the m/z and intensity values of a spectrum's +#' mass peaks). +#' +#' A description of functions from these 3 different categories are given below +#' in sections *Subset `Spectra`*, *Filter content of `spectraData()`* and +#' *Filter content of `peaksData()`*, respectively. +#' +#' +#' @section Subset `Spectra`: +#' +#' These functions affect the number of spectra in a `Spectra` object creating +#' a subset of the original object without affecting its content. +#' +#' - `[`: subsets the spectra keeping only selected elements (`i`). The method +#' **always** returns a `Spectra` object. +#' +#' - `filterAcquisitionNum()`: filters the object keeping only spectra matching +#' the provided acquisition numbers (argument `n`). If `dataOrigin` or +#' `dataStorage` is also provided, `object` is subsetted to the spectra with +#' an acquisition number equal to `n` **in spectra with matching dataOrigin +#' or dataStorage values** retaining all other spectra. +#' Returns the filtered `Spectra`. +#' +#' - `filterDataOrigin()`: filters the object retaining spectra matching the +#' provided `dataOrigin`. Parameter `dataOrigin` has to be of type +#' `character` and needs to match exactly the data origin value of the +#' spectra to subset. +#' Returns the filtered `Spectra` object (with spectra ordered according to +#' the provided `dataOrigin` parameter). +#' +#' - `filterDataStorage()`: filters the object retaining spectra stored in the +#' specified `dataStorage`. Parameter `dataStorage` has to be of type +#' `character` and needs to match exactly the data storage value of the +#' spectra to subset. +#' Returns the filtered `Spectra` object (with spectra ordered according to +#' the provided `dataStorage` parameter). +#' +#' - `filterEmptySpectra()`: removes empty spectra (i.e. spectra without peaks). +#' Returns the filtered `Spectra` object (with spectra in their +#' original order). +#' +#' - `filterIsolationWindow()`: retains spectra that contain `mz` in their +#' isolation window m/z range (i.e. with an `isolationWindowLowerMz` <= `mz` +#' and `isolationWindowUpperMz` >= `mz`. 
Returns the filtered `Spectra` +#' object (with spectra in their original order). +#' +#' - `filterMsLevel()`: filters object by MS level keeping only spectra matching +#' the MS level specified with argument `msLevel`. Returns the filtered +#' `Spectra` (with spectra in their original order). +#' +#' - `filterPolarity()`: filters the object keeping only spectra matching the +#' provided polarity. Returns the filtered `Spectra` (with spectra in their +#' original order). +#' +#' - `filterPrecursorCharge()`: retains spectra with the defined precursor +#' charge(s). +#' +#' - `filterPrecursorIsotopes()`: groups MS2 spectra based on their precursor +#' m/z and precursor intensity into predicted isotope groups and keep for each +#' only the spectrum representing the monoisotopic precursor. MS1 spectra +#' are returned as is. See documentation for `deisotopeSpectra()` below for +#' details on isotope prediction and parameter description. +#' +#' - `filterPrecursorMaxIntensity()`: filters the `Spectra` keeping for groups +#' of (MS2) spectra with similar precursor m/z values (given parameters +#' `ppm` and `tolerance`) the one with the highest precursor intensity. The +#' function filters only MS2 spectra and returns all MS1 spectra. If +#' precursor intensities are `NA` for all spectra within a spectra group, the +#' first spectrum of that groups is returned. +#' Note: some manufacturers don't provide precursor intensities. These can +#' however also be estimated with [estimatePrecursorIntensity()]. +#' +#' - `filterPrecursorMzRange()` (previously `filterPrecursorMz()` which is now +#' deprecated): retains spectra with a precursor m/z within the +#' provided m/z range. See examples for details on selecting spectra with +#' a precursor m/z for a target m/z accepting a small difference in *ppm*. +#' +#' - `filterPrecursorMzValues()`: retains spectra with precursor m/z matching +#' any of the provided m/z values (given `ppm` and `tolerance`). Spectra with +#' missing precursor m/z value (e.g. MS1 spectra) are dropped. +#' +#' - `filterPrecursorScan()`: retains parent (e.g. MS1) and children scans (e.g. +#' MS2) of acquisition number `acquisitionNum`. Returns the filtered +#' `Spectra` (with spectra in their original order). Parameter `f` allows to +#' define which spectra belong to the same sample or original data file ( +#' defaults to `f = dataOrigin(object)`). +#' +#' - `filterRanges()`: allows filtering of the `Spectra` object based on user +#' defined *numeric* ranges (parameter `ranges`) for one or more available +#' spectra variables in object (spectra variable names can be specified with +#' parameter `spectraVariables`). Spectra for which the value of a spectra +#' variable is within it's defined range are retained. If multiple +#' ranges/spectra variables are defined, the `match` parameter can be used +#' to specify whether all conditions (`match = "all"`; the default) or if +#' any of the conditions must match (`match = "any"`; all spectra for which +#' values are within any of the provided ranges are retained). +#' +#' - `filterRt()`: retains spectra of MS level `msLevel` with retention +#' times (in seconds) within (`>=`) `rt[1]` and (`<=`) +#' `rt[2]`. Returns the filtered `Spectra` (with spectra in their +#' original order). 
+#' +#' - `filterValues()`: allows filtering of the `Spectra` object based on +#' similarities of *numeric* values of one or more `spectraVariables(object)` +#' (parameter `spectraVariables`) to provided values (parameter `values`) +#' given acceptable differences (parameters tolerance and ppm). If multiple +#' values/spectra variables are defined, the `match` parameter can be used +#' to specify whether all conditions (`match = "all"`; the default) or if +#' any of the conditions must match (`match = "any"`; all spectra for which +#' values are within any of the provided ranges are retained). +#' +#' +#' @section Filter content of `spectraData()`: +#' +#' The functions described in this section filter the content from a +#' `Spectra`'s spectra data, i.e. affect values of, or complete, spectra +#' variables. None of these functions reduces the object's number of spectra. +#' +#' - `dropNaSpectraVariables()`: removes spectra variables (i.e. columns in the +#' object's `spectraData` that contain only missing values (`NA`). Note that +#' while columns with only `NA`s are removed, a `spectraData()` call after +#' `dropNaSpectraVariables()` might still show columns containing `NA` values +#' for *core* spectra variables. The total number of spectra is not changed +#' by this function. +#' +#' - `selectSpectraVariables()`: reduces the information within the object to +#' the selected spectra variables: all data for variables not specified will +#' be dropped. For mandatory columns (i.e., those listed by +#' [coreSpectraVariables()], such as *msLevel*, *rtime* ...) only +#' the values will be dropped but not the variable itself. Additional (or +#' user defined) spectra variables will be completely removed. +#' Returns the filtered `Spectra`. +#' +#' +#' @section Filter content of `peaksData()`: +#' +#' The functions described in this section filter the content of the +#' `Spectra`'s peaks data, i.e. either the number or the values (*m/z* or +#' intensity values) of the mass peaks. Also, the actual operation is only +#' executed once peaks data is accessed (through `peaksData()`, +#' `mz()` or `intensity()`) or `applyProcessing()` is called. +#' These operations don't affect the number of spectra in the `Spectra` object. +#' +#' - `deisotopeSpectra()`: *deisotopes* each spectrum keeping only the +#' monoisotopic peak for groups of isotopologues. Isotopologues are +#' estimated using the [isotopologues()] function from the +#' *MetaboCoreUtils* package. Note that +#' the default parameters for isotope prediction/detection have been +#' determined using data from the Human Metabolome Database (HMDB) and +#' isotopes for elements other than CHNOPS might not be detected. See +#' parameter `substDefinition` in the documentation of [isotopologues()] for +#' more information. The approach and code to define the parameters for +#' isotope prediction is described +#' [here](https://github.com/EuracBiomedicalResearch/isotopologues). +#' +#' - `filterFourierTransformArtefacts()`: removes (Orbitrap) fast fourier +#' artefact peaks from spectra (see examples below). The function iterates +#' through all intensity ordered peaks in a spectrum and removes all peaks +#' with an m/z within +/- `halfWindowSize` of the current peak if their +#' intensity is lower than `threshold` times the current peak's intensity. 
+#' Additional parameters `keepIsotopes`, `maxCharge` and `isotopeTolerance` +#' allow to avoid removing of potential `[13]C` isotope peaks (`maxCharge` +#' being the maximum charge that should be considered and `isotopeTolerance` +#' the absolute acceptable tolerance for matching their m/z). +#' See [filterFourierTransformArtefacts()] for details and background and +#' `deisitopeSpectra()` for an alternative. +#' +#' - `filterIntensity()`: filters mass peaks in each spectrum keeping only +#' those with intensities that are within the provided range or match the +#' criteria of the provided function. For the former, parameter `intensity` +#' has to be a `numeric` defining the intensity range, for the latter a +#' `function` that takes the intensity values of the spectrum and returns +#' a `logical` whether the peak should be retained or not (see examples +#' below for details) - additional parameters to the function can be passed +#' with `...`. +#' To remove only peaks with intensities below a certain threshold, say +#' 100, use `intensity = c(100, Inf)`. Note: also a single value can be +#' passed with the `intensity` parameter in which case an upper limit of +#' `Inf` is used. +#' Note that this function removes also peaks with missing intensities +#' (i.e. an intensity of `NA`). Parameter `msLevel.` allows to restrict the +#' filtering to spectra of the specified MS level(s). +#' +#' - `filterMzRange()`: filters mass peaks in the object keeping or removing +#' those in each spectrum that are within the provided m/z range. Whether +#' peaks are retained or removed can be configured with parameter `keep` +#' (default `keep = TRUE`). +#' +#' - `filterMzValues()`: filters mass peaks in the object keeping all +#' peaks in each spectrum that match the provided m/z value(s) (for +#' `keep = TRUE`, the default) or removing all of them (for `keep = FALSE`). +#' The m/z matching considers also the absolute `tolerance` and m/z-relative +#' `ppm` values. `tolerance` and `ppm` have to be of length 1. +#' +#' - `filterPeaksRanges()`: filters mass peaks of a `Spectra` object using any +#' set of range-based filters on numeric spectra or peaks variables. See +#' [filterPeaksRanges()] for more information. +#' +#' - `filterPrecursorPeaks()`: removes peaks from each spectrum in `object` with +#' an m/z equal or larger than the m/z of the precursor, depending on the +#' value of parameter `mz`: for `mz = ==" (the default) peaks with matching +#' m/z (considering an absolute and relative acceptable difference depending +#' on `tolerance` and `ppm`, respectively) are removed. For `mz = ">="` all +#' peaks with an m/z larger or equal to the precursor m/z (minus `tolerance` +#' and the `ppm` of the precursor m/z) are removed. Parameter `msLevel.` +#' allows to restrict the filter to certain MS levels (by default the filter +#' is applied to all MS levels). Note that no peaks are removed if the +#' precursor m/z is `NA` (e.g. typically for MS1 spectra). +#' +#' - `reduceSpectra()`: keeps for groups of peaks with similar m/z values in +#' (given `ppm` and `tolerance`) in each spectrum only the mass peak with the +#' highest intensity removing all other peaks hence *reducing* each +#' spectrum to the highest intensity peaks per *peak group*. +#' Peak groups are defined using the [group()] function from the +#' *MsCoreUtils* package. See also the [combinePeaks()] function for an +#' alternative function to combine peaks within each spectrum. 
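+#'
+#' A short sketch illustrating some of the peaks data filters described
+#' above (assuming `sps_dda` is a `Spectra` object such as the one created
+#' in the examples below):
+#'
+#' ```
+#' ## Remove mass peaks with an intensity below 100 in all MS2 spectra;
+#' ## the operation is applied lazily, once peaks data is accessed
+#' tmp <- filterIntensity(sps_dda, intensity = c(100, Inf), msLevel. = 2L)
+#'
+#' ## Alternatively, define the filter criterion as a function, e.g.
+#' ## keeping only peaks with an intensity above 5% of the spectrum's
+#' ## maximum intensity
+#' tmp <- filterIntensity(sps_dda, intensity = function(i) i > 0.05 * max(i))
+#'
+#' ## Restrict the peaks data to an m/z range from 100 to 500
+#' tmp <- filterMzRange(tmp, mz = c(100, 500))
+#' ```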
+#' +#' @param acquisitionNum for `filterPrecursorScan()`: `integer` with the +#' acquisition number of the spectra to which the object should be +#' subsetted. +#' +#' @param charge For `deisotopeSpectra()`: expected charge of the ionized +#' compounds. See [isotopologues()] for details. +#' +#' @param dataOrigin For `filterDataOrigin()`: `character` to define which +#' spectra to keep. +#' For `filterAcquisitionNum()`: optionally specify if filtering should +#' occurr only for spectra of selected `dataOrigin`. +#' +#' @param dataStorage For `filterDataStorage()`: `character` to define which +#' spectra to keep. +#' For `filterAcquisitionNum()`: optionally specify if filtering should +#' occur only for spectra of selected `dataStorage`. +#' +#' @param drop For `[`: not considered. +#' +#' @param f For `filterPrecursorScan()`: defining which spectra +#' belong to the same original data file (sample): Defaults to +#' `f = dataOrigin(x)`. +#' +#' @param halfWindowSize For `filterFourierTransformArtefacts()`: `numeric(1)` +#' defining the m/z window left and right of a peak where to remove +#' fourier transform artefacts. +#' +#' @param i For `[`: `integer`, `logical` or `character` to subset the +#' object. +#' +#' @param intensity For `filterIntensity()`: `numeric` of length 1 or 2 +#' defining either the lower or the lower and upper intensity limit for the +#' filtering, or a `function` that takes the intensities as input and +#' returns a `logical` (same length then peaks in the spectrum) whether the +#' peak should be retained or not. Defaults to `intensity = c(0, Inf)` thus +#' only peaks with `NA` intensity are removed. +#' +#' @param isotopeTolerance For `filterFourierTransformArtefacts()`: the m/z +#' `tolerance` to be used to define whether peaks might be isotopes of +#' the current tested peak. +#' +#' @param j For `[`: not supported. +#' +#' @param keep For `filterMzValues()` and `filterMzRange()`: `logical(1)` +#' whether the matching peaks should be retained (`keep = TRUE`, the +#' default) or dropped (`keep = FALSE`). +#' +#' @param keepIsotopes For `filterFourierTransformArtefacts()`: whether isotope +#' peaks should not be removed as fourier artefacts. +#' +#' @param match For `filterRanges()` and `filterValues()`: `character(1) ` +#' defining whether the condition has to match for all provided +#' `ranges`/`values` (`match = "all"`; the default), or for any of them +#' (`match = "any"`) for spectra to be retained. +#' +#' @param maxCharge For `filterFourierTransformArtefacts()`: the maximum charge +#' to be considered for isotopes. +#' +#' @param msLevel. `integer` defining the MS level(s) of the spectra to which +#' the function should be applied (defaults to all MS levels of `object`. +#' For `filterMsLevel()`: the MS level to which `object` should be +#' subsetted. +#' +#' @param mz For `filterIsolationWindow()`: `numeric(1)` with the m/z value to +#' filter the object. For `filterPrecursorMz()` and `filterMzRange()`: +#' `numeric(2)` defining the lower and upper m/z boundary. +#' For `filterMzValues()` and `filterPrecursorMzValues()`: `numeric` with +#' the m/z values to match peaks or precursor m/z against. +#' For `filterPrecursorPeaks()`: `character(1)` defining whether mass peaks +#' with an m/z matching the spectrum's precursor m/z (`mz = "=="`, +#' the default) or mass peaks with a m/z that is equal or larger +#' (`mz = ">="`) should be removed. +#' +#' @param n for `filterAcquisitionNum()`: `integer` with the acquisition +#' numbers to filter for. 
+#'
+#' @param object `Spectra` object.
+#'
+#' @param polarity for `filterPolarity()`: `integer` specifying the polarity
+#'     to subset `object` to.
+#'
+#' @param ppm For `filterMzValues()` and `reduceSpectra()`: `numeric(1)`
+#'     defining a relative, m/z-dependent, maximal accepted difference between
+#'     m/z values for peaks to be matched (or grouped).
+#'     For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the relative
+#'     maximal accepted difference of precursor m/z values of spectra for
+#'     grouping them into *precursor groups*. For `filterPrecursorIsotopes()`:
+#'     passed directly to the [isotopologues()] function.
+#'     For `filterValues()`: `numeric` of any length defining a maximal
+#'     accepted difference between user input `values` and the
+#'     `spectraVariables` values. If it is not equal to the length of the
+#'     value provided with parameter `spectraVariables`, `ppm[1]` will be
+#'     recycled.
+#'
+#' @param ranges for `filterRanges()`: A `numeric` vector of paired values
+#'     (lower and upper boundary) that define the ranges to filter `object`.
+#'     These paired values need to be in the same order as the
+#'     `spectraVariables` parameter (see below).
+#'
+#' @param rt for `filterRt()`: `numeric(2)` defining the retention time range
+#'     to be used to subset/filter `object`.
+#'
+#' @param spectraVariables For `selectSpectraVariables()`: `character` with the
+#'     names of the spectra variables to which the backend should be
+#'     subsetted. For `filterRanges()` and `filterValues()`: `character`
+#'     vector specifying the column(s) from `spectraData(object)` on which
+#'     to filter the data and that correspond to the names of the
+#'     spectra variables that should be used for the filtering.
+#'
+#' @param substDefinition For `deisotopeSpectra()` and
+#'     `filterPrecursorIsotopes()`: `matrix` or `data.frame` with definitions
+#'     of isotopic substitutions. By default, isotopic substitutions defined
+#'     from all compounds in the Human Metabolome Database (HMDB) are used. See
+#'     [isotopologues()] or [isotopicSubstitutionMatrix()] in the
+#'     *MetaboCoreUtils* package for details.
+#'
+#' @param threshold For `filterFourierTransformArtefacts()`: the relative
+#'     intensity (to a peak) below which peaks are considered Fourier
+#'     artefacts. Defaults to `threshold = 0.2`, hence removing peaks that
+#'     have an intensity below 0.2 times the intensity of the tested peak
+#'     (within the selected `halfWindowSize`).
+#'
+#' @param tolerance For `filterMzValues()` and `reduceSpectra()`:
+#'     `numeric(1)` defining a constant maximal accepted difference
+#'     between m/z values for peaks to be matched (or grouped). For
+#'     `containsMz()` it can also be of length equal to `mz` to specify a
+#'     different tolerance for each m/z value.
+#'     For `filterPrecursorMaxIntensity()`: `numeric(1)` defining the
+#'     (constant) maximal accepted difference of precursor m/z values of
+#'     spectra for grouping them into *precursor groups*. For
+#'     `filterPrecursorIsotopes()`: passed directly to the [isotopologues()]
+#'     function. For `filterValues()`: `numeric` of any length defining a
+#'     maximal accepted difference between user input `values` and the
+#'     `spectraVariables` values. If it is not equal to the length of the
+#'     value provided with parameter `spectraVariables`, `tolerance[1]` will be
+#'     recycled. Default is `tolerance = 0`.
+#'
+#' @param values for `filterValues()`: A `numeric` vector defining the
+#'     values to filter the Spectra data.
These values need to be in the same +#' order as the `spectraVariables` parameter. +#' +#' @param x `Spectra` object. +#' +#' @param z For `filterPrecursorCharge()`: `integer()` with the precursor +#' charges to be used as filter. +#' +#' @param ... Additional arguments. +#' +#' @seealso +#' +#' - [combineSpectra()] for functions to combine or aggregate `Spectra`. +#' +#' - [combinePeaks()] for functions to combine or aggregate a `Spectra`'s +#' `peaksData()` +#' +#' @md +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf +#' +#' @examples +#' +#' ## Load a `Spectra` object with LC-MS/MS data. +#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", +#' package = "msdata") +#' sps_dda <- Spectra(fl) +#' sps_dda +#' +#' +#' ## -------- SUBSET SPECTRA -------- +#' +#' ## Subset to the first 3 spectra +#' tmp <- sps_dda[1:3] +#' tmp +#' length(tmp) +#' +#' ## Subset to all MS2 spectra; this could be done with [, or, more +#' ## efficiently, with the `filterMsLevel` function: +#' sps_dda[msLevel(sps_dda) == 2L] +#' filterMsLevel(sps_dda, 2L) +#' +#' ## Filter the object keeping only MS2 spectra with an precursor m/z value +#' ## between a specified range: +#' filterPrecursorMzRange(sps_dda, c(80, 90)) +#' +#' ## Filter the object to MS2 spectra with an precursor m/z matching a +#' ## pre-defined value (given ppm and tolerance) +#' filterPrecursorMzValues(sps_dda, 85, ppm = 5, tolerance = 0.1) +#' +#' ## The `filterRanges()` function allows to filter a `Spectra` based on +#' ## numerical ranges of any of its (numerical) spectra variables. +#' ## First, determine the variable(s) on which to base the filtering: +#' sv <- c("rtime", "precursorMz", "peaksCount") +#' ## Note that ANY variables can be chosen here, and as many as wanted. +#' +#' ## Define the ranges (pairs of values with lower and upper boundary) to be +#' ## used for the individual spectra variables. The first two values will be +#' ## used for the first spectra variable (e.g., `"rtime"` here), the next two +#' ## for the second (e.g. `"precursorMz"` here) and so on: +#' ranges <- c(30, 350, 200, 500, 350, 600) +#' +#' ## Input the parameters within the filterRanges function: +#' filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, +#' ranges = ranges) +#' filt_spectra +#' +#' ## `filterRanges()` can also be used to filter a `Spectra` object with +#' ## multiple ranges for the same `spectraVariable` (e.g, here `"rtime"`) +#' sv <- c("rtime", "rtime") +#' ranges <- c(30, 100, 200, 300) +#' filt_spectra <- filterRanges(sps_dda, spectraVariables = sv, +#' ranges = ranges, match = "any") +#' filt_spectra +#' +#' ## While `filterRanges()` filtered on numeric ranges, `filterValues()` +#' ## allows to filter an object matching spectra variable values to user +#' ## provided values (allowing to configure allowed differences using the +#' ## `ppm` and `tolerance` parameters). +#' ## First determine the variable(s) on which to base the filtering: +#' sv <- c("rtime", "precursorMz") +#' ## Note that ANY variables can be chosen here, and as many as wanted. +#' +#' ## Define the values that will be used to filter the spectra based on their +#' ## similarities to their respective `spectraVariables`. +#' ## The first values in the parameters values, tolerance and ppm will be +#' ## used for the first spectra variable (e.g. `"rtime"` here), the next for +#' ## the second (e.g. 
`"precursorMz"` here) and so on: +#' values <- c(350, 80) +#' tolerance <- c(100, 0.1) +#' ppm <- c(0, 50) +#' +#' ## Input the parameters within the `filterValues()` function: +#' filt_spectra <- filterValues(sps_dda, spectraVariables = sv, +#' values = values, tolerance = tolerance, ppm = ppm) +#' filt_spectra +#' +#' +#' ## -------- FILTER SPECTRA DATA -------- +#' +#' ## Remove spectra variables without content (i.e. with only missing values) +#' sps_noNA <- dropNaSpectraVariables(sps_dda) +#' +#' ## This reduced the size of the object slightly +#' print(object.size(sps_dda), unit = "MB") +#' print(object.size(sps_noNA), unit = "MB") +#' +#' ## With the `selectSpectraVariables()` function it is in addition possible +#' ## to subset the data of a `Spectra` to the selected columns/variables, +#' ## keeping only their data: +#' tmp <- selectSpectraVariables(sps_dda, c("msLevel", "mz", "intensity", +#' "scanIndex")) +#' print(object.size(tmp), units = "MB") +#' +#' ## Except the selected variables, all data is now removed. Accessing +#' ## core spectra variables still works, but returns only NA +#' rtime(tmp) |> head() +#' +#' +#' ## -------- FILTER PEAKS DATA -------- +#' +#' ## `filterMzValues()` filters the mass peaks data of a `Spectra` retaining +#' ## only those mass peaks with an m/z value matching the provided value(s). +#' sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), tolerance = 0.3) +#' +#' ## The filtered `Spectra` has the same length +#' length(sps_dda) +#' length(sps_sub) +#' +#' ## But the number of mass peaks changed +#' lengths(sps_dda) |> head() +#' lengths(sps_sub) |> head() +#' +#' ## This function can also be used to remove specific peaks from a spectrum +#' ## by setting `keep = FALSE`. +#' sps_sub <- filterMzValues(sps_dda, mz = c(103, 104), +#' tolerance = 0.3, keep = FALSE) +#' lengths(sps_sub) |> head() +#' +#' ## With the `filterMzRange()` function it is possible to keep (or remove) +#' ## mass peaks with m/z values within a specified numeric range. +#' sps_sub <- filterMzRange(sps_dda, mz = c(100, 150)) +#' lengths(sps_sub) |> head() +#' +#' ## See also the `filterPeaksRanges()` function for a more flexible framework +#' ## to filter mass peaks +#' +#' +#' ## Removing fourier transform artefacts seen in Orbitra data. +#' +#' ## Loading an Orbitrap spectrum with artefacts. +#' data(fft_spectrum) +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) +#' +#' fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) +#' fft_spectrum +#' plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) +#' +#' ## Using a few examples peaks in your data you can optimize the parameters +#' fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, +#' halfWindowSize = 0.2, +#' threshold = 0.005, +#' keepIsotopes = TRUE, +#' maxCharge = 5, +#' isotopeTolerance = 0.005 +#' ) +#' +#' fft_spectrum_filtered +#' length(mz(fft_spectrum_filtered)[[1]]) +#' plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) +#' +#' +#' ## *Reducing* a `Spectra` keeping for groups of mass peaks (characterized +#' ## by similarity of their m/z values) only one representative peak. This +#' ## function helps cleaning fragment spectra. +#' ## Filter the data set to MS2 spectra +#' ms2 <- filterMsLevel(sps_dda, 2L) +#' +#' ## For groups of fragment peaks with a difference in m/z < 0.1, keep only +#' ## the largest one. 
+#' ms2_red <- reduceSpectra(ms2, ppm = 0, tolerance = 0.1) +#' lengths(ms2) |> tail() +#' lengths(ms2_red) |> tail() +NULL + +#' @rdname filterMsLevel +setMethod("dropNaSpectraVariables", "Spectra", function(object) { + object@backend <- dropNaSpectraVariables(object@backend) + object +}) + +#' @rdname filterMsLevel +setMethod( + "selectSpectraVariables", "Spectra", + function(object, spectraVariables = union(spectraVariables(object), + peaksVariables(object))) { + spectraVariables <- union(spectraVariables, "dataStorage") + object@backend <- selectSpectraVariables( + object@backend, spectraVariables = spectraVariables) + object + }) + +#' @rdname filterMsLevel +#' +#' @export +setMethod("[", "Spectra", function(x, i, j, ..., drop = FALSE) { + if (!missing(j)) + stop("Subsetting 'Spectra' by columns is not (yet) supported") + if (missing(i)) + return(x) + slot(x, "backend", check = FALSE) <- x@backend[i = i] + x +}) + +#' @rdname filterMsLevel setMethod("filterAcquisitionNum", "Spectra", function(object, n = integer(), dataStorage = character(), dataOrigin = character()) { @@ -1431,7 +2433,7 @@ setMethod("filterAcquisitionNum", "Spectra", function(object, n = integer(), object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterEmptySpectra", "Spectra", function(object) { object@backend <- object@backend[as.logical(lengths(object))] object@processing <- .logging(object@processing, @@ -1439,7 +2441,7 @@ setMethod("filterEmptySpectra", "Spectra", function(object) { object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterDataOrigin", "Spectra", function(object, dataOrigin = character()) { if (length(dataOrigin) && !is.character(dataOrigin)) @@ -1451,7 +2453,7 @@ setMethod("filterDataOrigin", "Spectra", function(object, object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterDataStorage", "Spectra", function(object, dataStorage = character()) { if (length(dataStorage) && !is.character(dataStorage)) @@ -1463,7 +2465,7 @@ setMethod("filterDataStorage", "Spectra", function(object, object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @exportMethod filterFourierTransformArtefacts setMethod("filterFourierTransformArtefacts", "Spectra", @@ -1481,7 +2483,7 @@ setMethod("filterFourierTransformArtefacts", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @importMethodsFrom ProtGenerics filterIntensity #' @@ -1525,7 +2527,7 @@ setMethod("filterIntensity", "Spectra", }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterIsolationWindow", "Spectra", function(object, mz = numeric()) { object@backend <- filterIsolationWindow(object@backend, mz = mz) object@processing <- .logging(object@processing, @@ -1534,7 +2536,7 @@ setMethod("filterIsolationWindow", "Spectra", function(object, mz = numeric()) { object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterMsLevel", "Spectra", function(object, msLevel. = integer()) { object@backend <- filterMsLevel(object@backend, msLevel = msLevel.) object@processing <- .logging(object@processing, @@ -1543,7 +2545,7 @@ setMethod("filterMsLevel", "Spectra", function(object, msLevel. 
= integer()) { object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @importMethodsFrom ProtGenerics filterMzRange #' @@ -1566,7 +2568,7 @@ setMethod("filterMzRange", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @importMethodsFrom ProtGenerics filterMzValues #' @@ -1605,7 +2607,7 @@ setMethod("filterMzValues", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPolarity", "Spectra", function(object, polarity = integer()) { object@backend <- filterPolarity(object@backend, polarity = polarity) object@processing <- .logging(object@processing, @@ -1614,7 +2616,7 @@ setMethod("filterPolarity", "Spectra", function(object, polarity = integer()) { object }) -#' @rdname Spectra +#' @rdname filterMsLevel #' #' @export setMethod("filterPrecursorMz", "Spectra", @@ -1630,7 +2632,7 @@ setMethod("filterPrecursorMz", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPrecursorMzRange", "Spectra", function(object, mz = numeric()) { object@backend <- filterPrecursorMzRange(object@backend, mz) @@ -1641,7 +2643,7 @@ setMethod("filterPrecursorMzRange", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPrecursorMzValues", "Spectra", function(object, mz = numeric(), ppm = 20, tolerance = 0) { object@backend <- filterPrecursorMzValues( @@ -1653,7 +2655,7 @@ setMethod("filterPrecursorMzValues", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPrecursorCharge", "Spectra", function(object, z = integer()) { z <- unique(z) @@ -1665,7 +2667,7 @@ setMethod("filterPrecursorCharge", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterPrecursorScan", "Spectra", function(object, acquisitionNum = integer(), f = dataOrigin(object)) { if (!all(f %in% unique(dataOrigin(object)))) @@ -1681,7 +2683,7 @@ setMethod("filterPrecursorScan", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterRt", "Spectra", function(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) { if (!is.numeric(msLevel.)) @@ -1700,7 +2702,7 @@ setMethod("filterRt", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterRanges", "Spectra", function(object, spectraVariables = character(), ranges = numeric(), match = c("all", "any")){ @@ -1715,7 +2717,7 @@ setMethod("filterRanges", "Spectra", object }) -#' @rdname Spectra +#' @rdname filterMsLevel setMethod("filterValues", "Spectra", function(object, spectraVariables = character(), values = numeric(), ppm = 0, tolerance = 0, match = c("all", "any")){ @@ -1737,7 +2739,469 @@ setMethod("filterValues", "Spectra", #' @title Data manipulation and analysis methods #' -#' `reset()` to clean the lazy processing queue. +#' @name addProcessing +#' +#' @aliases addProcessing +#' @aliases applyProcessing +#' @aliases bin +#' @aliases containsMz +#' @aliases containsNeutralLoss +#' @aliases entropy +#' @aliases pickPeaks +#' @aliases replaceIntensitiesBelow +#' @aliases reset +#' @aliases smooth +#' @aliases spectrapply +#' +#' @description +#' +#' Various data analysis functions are available for `Spectra` objects. These +#' can be categorized into functions that either return a `Spectra` object +#' (with the manipulated data) and functions that directly return the +#' result from the calculation. 
For the former category, the data manipulations
+#' are cached in the result object's *processing queue* and only executed
+#' on-the-fly when the respective data gets extracted from the `Spectra` (see
+#' section *The processing queue* for more information).
+#'
+#' For the second category, the calculations are directly executed and the
+#' result, usually one value per spectrum, is returned. Generally, to reduce
+#' memory demand, the data is processed chunk-wise.
+#'
+#'
+#' @section Data analysis methods returning a `Spectra`:
+#'
+#' The methods listed here return a `Spectra` object as a result.
+#'
+#' - `addProcessing()`: adds an arbitrary function that should be applied to
+#'   the peaks matrix of every spectrum in `object`. The function (passed
+#'   with parameter `FUN`) is expected to take a peaks matrix as input and to
+#'   return a peaks matrix. A peaks matrix is a numeric matrix with two
+#'   columns, the first containing the m/z values of the peaks and the second
+#'   the corresponding intensities. The function has to have `...` in its
+#'   definition. Additional arguments can be passed with `...`. With parameter
+#'   `spectraVariables` it is possible to define additional spectra variables
+#'   from `object` that should be passed to the function `FUN`. These will be
+#'   passed by their name (e.g. specifying `spectraVariables = "precursorMz"`
+#'   will pass the spectra's precursor m/z as a parameter named `precursorMz`
+#'   to the function). The only exception is the spectra's MS level, which
+#'   will be passed to the function as a parameter called `spectrumMsLevel`
+#'   (i.e. with `spectraVariables = "msLevel"` the MS levels of each spectrum
+#'   will be submitted to the function as a parameter called
+#'   `spectrumMsLevel`). Examples are provided in the package vignette.
+#'
+#' - `bin()`: aggregates individual spectra into discrete (m/z) bins. Binning
+#'   is performed only on spectra of the specified MS level(s) (parameter
+#'   `msLevel`, by default all MS levels of `x`). The bins can be defined with
+#'   parameter `breaks` which by default are equally sized bins, with size
+#'   being defined by parameter `binSize`, from the minimal to the maximal m/z
+#'   of all spectra (of MS level `msLevel`) within `x`. The same bins are used
+#'   for all spectra in `x`. All intensity values for peaks falling into the
+#'   same bin are aggregated using the function provided with parameter `FUN`
+#'   (defaults to `FUN = sum`, i.e. all intensities are summed up). Note that
+#'   the binning operation is applied to the peak data on-the-fly upon data
+#'   access and it is possible to *revert* the operation with the `reset()`
+#'   function (see description of `reset()` below).
+#'
+#' - `countIdentifications()`: counts the number of identifications each scan
+#'   has led to. See [countIdentifications()] for more details.
+#'
+#' - `pickPeaks()`: picks peaks on individual spectra using a moving
+#'   window-based approach (window size = `2 * halfWindowSize`). For noisy
+#'   spectra there are currently two different noise estimators available,
+#'   the *M*edian *A*bsolute *D*eviation (`method = "MAD"`) and
+#'   Friedman's Super Smoother (`method = "SuperSmoother"`),
+#'   as implemented in [`MsCoreUtils::noise()`].
+#'   The method also supports optionally *refining* the m/z value of
+#'   the identified centroids by considering data points that belong (most
+#'   likely) to the same mass peak. For this, the m/z value is calculated as
+#'   an intensity-weighted average of the m/z values within the peak region.
+#'   The peak region is defined as the m/z values (and their respective
+#'   intensities) of the `2 * k` closest signals to the centroid or the
+#'   closest valleys (`descending = TRUE`) in the `2 * k` region. For the
+#'   latter, `k` generally has to be chosen larger. See
+#'   [`MsCoreUtils::refineCentroids()`] for details.
+#'   If the ratio of the signal to the highest intensity of the peak is below
+#'   `threshold` it will be ignored for the weighted average.
+#'
+#' - `replaceIntensitiesBelow()`: replaces intensities below a specified
+#'   threshold with the provided `value`. Parameter `threshold` can be either
+#'   a single numeric value or a function which is applied to all non-`NA`
+#'   intensities of each spectrum to determine a threshold value for each
+#'   spectrum. The default is `threshold = min` which replaces all values
+#'   which are <= the minimum intensity in a spectrum with `value` (the
+#'   default for `value` is `0`). Note that the function specified with
+#'   `threshold` is expected to have a parameter `na.rm` since `na.rm = TRUE`
+#'   will be passed to the function. If the spectrum is in profile mode,
+#'   ranges of successive non-0 peaks <= `threshold` are set to 0.
+#'   Parameter `msLevel.` allows restricting this to spectra of certain MS
+#'   level(s).
+#'
+#' - `scalePeaks()`: scales intensities of peaks within each spectrum
+#'   depending on parameter `by`. With `by = sum` (the default) peak
+#'   intensities are divided by the sum of peak intensities within each
+#'   spectrum. The sum of intensities is thus 1 for each spectrum after
+#'   scaling. Parameter `msLevel.` allows restricting the scaling to spectra
+#'   of a certain MS level. By default (`msLevel. = uniqueMsLevels(x)`)
+#'   intensities for all spectra will be scaled.
+#'
+#' - `smooth()`: smooths individual spectra using a moving window-based
+#'   approach (window size = `2 * halfWindowSize`). Currently, the
+#'   Moving-Average- (`method = "MovingAverage"`),
+#'   Weighted-Moving-Average- (`method = "WeightedMovingAverage"`, with
+#'   weights depending on the distance from the center, calculated as
+#'   `1/2^(-halfWindowSize:halfWindowSize)`) and
+#'   Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported.
+#'   For details on how to choose the correct `halfWindowSize` please see
+#'   [`MsCoreUtils::smooth()`].
+#'
+#'
+#' @section Data analysis methods returning the result from the calculation:
+#'
+#' The functions listed in this section immediately return the result of the
+#' calculation. To reduce memory demand (and allow parallel processing), the
+#' calculations are generally performed chunk-wise.
+#'
+#' - `chunkapply()`: apply an arbitrary function to chunks of spectra. See
+#'   [chunkapply()] for details and examples.
+#'
+#' - `containsMz()`: checks for each of the spectra whether they contain mass
+#'   peaks with an m/z equal to `mz` (given acceptable difference as defined
+#'   by parameters `tolerance` and `ppm` - see [common()] for details).
+#'   Parameter `which` allows defining whether any (`which = "any"`, the
+#'   default) or all (`which = "all"`) of the `mz` have to match. The function
+#'   returns `NA` if `mz` is of length 0 or is `NA`.
+#'
+#' - `containsNeutralLoss()`: checks for each spectrum in `object` if it has a
+#'   peak with an m/z value equal to its precursor m/z - `neutralLoss` (given
+#'   acceptable difference as defined by parameters `tolerance` and `ppm`).
+#'   Returns `NA` for MS1 spectra (or spectra without a precursor m/z).
+#'
+#' - `entropy()`: calculates the entropy of each spectrum based on the metrics
+#'   suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z).
+#'   See also [nentropy()] in the *MsCoreUtils* package for details.
+#'
+#' - `estimatePrecursorIntensity()`: defines the precursor intensities for MS2
+#'   spectra using the intensity of the matching MS1 peak from the
+#'   closest MS1 spectrum (i.e. the last MS1 spectrum measured before the
+#'   respective MS2 spectrum). With `method = "interpolation"` it is also
+#'   possible to calculate the precursor intensity based on an interpolation
+#'   of intensity values (and retention times) of the matching MS1 peaks from
+#'   the previous and next MS1 spectrum. See [estimatePrecursorIntensity()]
+#'   for examples and more details.
+#'
+#' - `estimatePrecursorMz()`: **for DDA data**: allows estimating a fragment
+#'   spectrum's precursor m/z based on the reported precursor m/z and the data
+#'   from the previous MS1 spectrum. See [estimatePrecursorMz()] for details.
+#'
+#' - `neutralLoss()`: calculates neutral loss spectra for fragment spectra. See
+#'   [neutralLoss()] for detailed documentation.
+#'
+#' - `spectrapply()`: applies a given function to each individual spectrum or
+#'   to sets of spectra of a `Spectra` object. By default, the `Spectra` is
+#'   split into individual spectra (i.e. `Spectra` of length 1) and the
+#'   function `FUN` is applied to each of them. An alternative splitting can
+#'   be defined with parameter `f`. Parameters for `FUN` can be passed using
+#'   `...`. The returned result and its order depend on the function `FUN` and
+#'   how `object` is split (hence on `f`, if provided). Parallel processing is
+#'   supported and can be configured with parameter `BPPARAM`; it is however
+#'   only suggested for computationally intense `FUN`.
+#'   As an alternative to the (potentially parallel) processing of the full
+#'   `Spectra`, `spectrapply()` also supports chunk-wise processing. For this,
+#'   parameter `chunkSize` needs to be specified. `object` is then split into
+#'   chunks of size `chunkSize` which are then (stepwise) processed by `FUN`.
+#'   This guarantees a lower memory demand (especially for on-disk backends)
+#'   since only the data for one chunk needs to be loaded into memory in each
+#'   iteration. Note that by specifying `chunkSize`, parameters `f` and
+#'   `BPPARAM` will be ignored.
+#'   See also `chunkapply()` above or the examples below for details on
+#'   chunk-wise processing.
+#'
+#'
+#' @section The processing queue:
+#'
+#' Operations that modify mass peak data, i.e. the m/z and intensity values of
+#' a `Spectra`, are generally not applied immediately to the data but are
+#' *cached* within the object's *processing queue*. These operations are then
+#' applied to the data only upon request, for example when m/z and/or
+#' intensity values are extracted. This lazy execution guarantees that the
+#' same functionality can be applied to any `Spectra` object, regardless of
+#' the type of backend that is used. Thus, data manipulation operations can
+#' also be applied to data that is *read only*. As a side effect, this also
+#' makes it possible to *undo* operations using the `reset()` function.
+#'
+#' Functions related to the processing queue are:
+#'
+#' - `applyProcessing()`: for `Spectra` objects that use a **writeable**
+#'   backend only: apply all steps from the lazy processing queue to the peak
+#'   data and write it back to the data storage. Parameter `f` allows
+#'   specifying how `object` should be split for parallel processing. This
+#'   should either be equal to the `dataStorage`, or
+#'   `f = rep(1, length(object))` to disable parallel processing altogether.
+#'   Other partitionings might result in errors (especially if a
+#'   `MsBackendHdf5Peaks` backend is used).
+#'
+#' - `processingLog()`: returns a `character` vector with the processing log
+#'   messages.
+#'
+#' - `reset()`: restores the data to its original state (as much as possible):
+#'   removes any processing steps from the lazy processing queue and calls
+#'   `reset()` on the backend which, depending on the backend, can also undo
+#'   e.g. data filtering operations. Note that a `reset()` call after
+#'   `applyProcessing()` will not have any effect. See examples below for more
+#'   information.
+#'
+#' @param binSize For `bin()`: `numeric(1)` defining the size for the m/z
+#'     bins. Defaults to `binSize = 1`.
+#'
+#' @param BPPARAM Parallel setup configuration. See [bpparam()] for more
+#'     information. This is passed directly to the [backendInitialize()]
+#'     method of the [MsBackend-class]. See also [processingChunkSize()] for
+#'     additional information on parallel processing.
+#'
+#' @param breaks For `bin()`: `numeric` defining the m/z breakpoints between
+#'     bins.
+#'
+#' @param by For `scalePeaks()`: function to calculate a single `numeric` from
+#'     the intensity values of a spectrum by which all intensities (of
+#'     that spectrum) should be divided. The default `by = sum` will
+#'     divide intensities of each spectrum by the sum of intensities of that
+#'     spectrum.
+#'
+#' @param chunkSize For `spectrapply()`: size of the chunks into which the
+#'     `Spectra` should be split. This parameter overrides parameters
+#'     `f` and `BPPARAM`.
+#'
+#' @param descending For `pickPeaks()`: `logical`, if `TRUE` only values
+#'     between the nearest valleys around the peak centroids are used.
+#'
+#' @param f For `spectrapply()` and `applyProcessing()`: `factor` defining
+#'     how `object` should be split for optional parallel processing.
+#'     Defaults to `factor()` for `spectrapply()`, hence the object is not
+#'     split, while it defaults to `f = processingChunkFactor(object)` for
+#'     `applyProcessing()`, thus splitting the object by default into chunks
+#'     depending on [processingChunkSize()].
+#'
+#' @param FUN For `addProcessing()`: function to be applied to the peak matrix
+#'     of each spectrum in `object`.
+#'     For `bin()`: function to aggregate intensity values of peaks falling
+#'     into the same bin. Defaults to `FUN = sum` thus summing up intensities.
+#'     For `spectrapply()` and `chunkapply()`: function to be applied to
+#'     each individual spectrum or each chunk of the `Spectra`.
+#'
+#' @param halfWindowSize For `pickPeaks()`: `integer(1)`, used in the
+#'     identification of the mass peaks: a local maximum has to be the
+#'     maximum in the window from `(i - halfWindowSize):(i + halfWindowSize)`.
+#'     For `smooth()`: `integer(1)`, used in the smoothing algorithm, the
+#'     window reaches from `(i - halfWindowSize):(i + halfWindowSize)`.
+#'
+#' @param k For `pickPeaks()`: `integer(1)`, number of values left and right
+#'     of the peak that should be considered in the weighted mean calculation.
+#'
+#' @param method For `pickPeaks()`: `character(1)`, the noise estimators that
+#'     should be used, currently the *M*edian *A*bsolute *D*eviation
+#'     (`method = "MAD"`) and Friedman's Super Smoother
+#'     (`method = "SuperSmoother"`) are supported.
+#'     For `smooth()`: `character(1)`, the smoothing function that should be
+#'     used; currently the Moving-Average- (`method = "MovingAverage"`),
+#'     Weighted-Moving-Average- (`method = "WeightedMovingAverage"`) and
+#'     Savitzky-Golay-Smoothing (`method = "SavitzkyGolay"`) are supported.
+#'
+#' @param msLevel. `integer` defining the MS level(s) of the spectra to which
+#'     the function should be applied (defaults to all MS levels of `object`).
+#'
+#' @param mz For `containsMz()`: `numeric` with the m/z value(s) of the mass
+#'     peaks to check.
+#'
+#' @param neutralLoss for `containsNeutralLoss()`: `numeric(1)` defining the
+#'     value which should be subtracted from the spectrum's precursor m/z.
+#'
+#' @param normalized for `entropy()`: `logical(1)` whether the normalized
+#'     entropy should be calculated (default). See also [nentropy()] for
+#'     details.
+#'
+#' @param object A `Spectra` object.
+#'
+#' @param ppm For `containsMz()` and `containsNeutralLoss()`: `numeric(1)`
+#'     defining a relative, m/z-dependent, maximal accepted difference between
+#'     m/z values for peaks to be matched.
+#'
+#' @param snr For `pickPeaks()`: `double(1)` defining the
+#'     *S*ignal-to-*N*oise-*R*atio. The intensity of a local maximum has to be
+#'     higher than `snr * noise` to be considered a peak.
+#'
+#' @param spectraVariables For `addProcessing()`: `character` with additional
+#'     spectra variables that should be passed along to the function defined
+#'     with `FUN`. See function description for details.
+#'
+#' @param threshold For `pickPeaks()`: a `numeric(1)` defining the proportion
+#'     of the maximal peak intensity. Only values above the threshold are
+#'     used for the weighted mean calculation.
+#'     For `replaceIntensitiesBelow()`: a `numeric(1)` defining the threshold
+#'     or a `function` to calculate the threshold for each spectrum on its
+#'     intensity values. Defaults to `threshold = min`.
+#'
+#' @param tolerance For `containsMz()` and `containsNeutralLoss()`:
+#'     `numeric(1)` defining a constant maximal accepted difference
+#'     between m/z values for peaks to be matched.
+#'
+#' @param value For `replaceIntensitiesBelow()`: `numeric(1)` defining the
+#'     value with which intensities should be replaced.
+#'
+#' @param which For `containsMz()`: either `"any"` or `"all"` defining whether
+#'     any (the default) or all provided `mz` have to be present in the
+#'     spectrum.
+#'
+#' @param x A `Spectra`.
+#'
+#' @param zero.rm For `bin()`: `logical(1)` indicating whether to remove bins
+#'     with zero intensity. Defaults to `TRUE`, meaning the function will
+#'     discard bins created with an intensity of 0 to enhance memory
+#'     efficiency.
+#'
+#' @param ... Additional arguments passed to internal and downstream functions.
+#'
+#' @return
+#'
+#' See the documentation of the individual functions for a description of the
+#' return value.
+#'
+#' @md
+#'
+#' @seealso
+#'
+#' - [compareSpectra()] for calculation of spectra similarity scores.
+#'
+#' - [processingChunkSize()] for information on parallel and chunk-wise data
+#'   processing.
+#'
+#' - [Spectra] for a general description of the `Spectra` object.
+#'
+#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail, Nir Shahaf, Mar Garcia-Aloy
+#'
+#' @examples
+#'
+#' ## Load a `Spectra` object with LC-MS/MS data.
+#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML",
+#'     package = "msdata")
+#' sps_dda <- Spectra(fl)
+#' sps_dda
+#'
+#'
+#' ## -------- FUNCTIONS RETURNING A SPECTRA --------
+#'
+#' ## Replace peak intensities below 20 with a value of 1
+#' sps_mod <- replaceIntensitiesBelow(sps_dda, threshold = 20, value = 1)
+#' sps_mod
+#'
+#' ## Get the intensities of the first spectrum before and after the
+#' ## operation
+#' intensity(sps_dda[1])
+#' intensity(sps_mod[1])
+#'
+#' ## Remove all peaks with an intensity below 5.
+#' sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf))
+#'
+#' intensity(sps_mod)
+#'
+#' ## In addition it is possible to pass a function to `filterIntensity()`: in
+#' ## the example below we want to keep only peaks that have an intensity which
+#' ## is larger than one third of the maximal peak intensity in that spectrum.
+#' keep_peaks <- function(x, prop = 3) {
+#'     x > max(x, na.rm = TRUE) / prop
+#' }
+#' sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks)
+#' intensity(sps_mod)
+#'
+#' ## We can also change the proportion by simply passing the `prop` parameter
+#' ## to the function. To keep only peaks that have an intensity which is
+#' ## larger than half of the maximum intensity:
+#' sps_mod <- filterIntensity(sps_dda, intensity = keep_peaks, prop = 2)
+#' intensity(sps_mod)
+#'
+#' ## With the `scalePeaks()` function we can alternatively scale the
+#' ## intensities of mass peaks per spectrum to relative intensities. This
+#' ## is specifically useful for fragment (MS2) spectra. Below we thus
+#' ## scale the intensities per spectrum by the total sum of intensities
+#' ## (such that the sum of all intensities per spectrum is 1).
+#' ## Here we scale the intensities of all MS2 spectra in our data set.
+#' sps_mod <- scalePeaks(sps_dda, msLevel. = 2L)
+#'
+#' ## MS1 spectra were not affected
+#' sps_mod |>
+#'     filterMsLevel(1L) |>
+#'     intensity()
+#'
+#' ## Intensities of MS2 spectra were scaled
+#' sps_mod |>
+#'     filterMsLevel(2L) |>
+#'     intensity()
+#'
+#' ## Since data manipulation operations are by default not directly applied to
+#' ## the data but only cached in the internal processing queue, it is also
+#' ## possible to remove these data manipulations with the `reset()` function:
+#' tmp <- reset(sps_mod)
+#' tmp
+#' lengths(sps_dda) |> head()
+#' lengths(sps_mod) |> head()
+#' lengths(tmp) |> head()
+#'
+#' ## Data manipulation operations cached in the processing queue can also be
+#' ## applied to the mass peaks data with the `applyProcessing()` function, if
+#' ## the `Spectra` uses a backend that supports that (i.e. allows replacing
+#' ## the mass peaks data). Below we first change the backend to a
+#' ## `MsBackendMemory()` and then use `applyProcessing()` to modify the
+#' ## mass peaks data
+#' sps_dda <- setBackend(sps_dda, MsBackendMemory())
+#' sps_mod <- filterIntensity(sps_dda, intensity = c(5, Inf))
+#' sps_mod <- applyProcessing(sps_mod)
+#' sps_mod
+#'
+#' ## While we can't *undo* this filtering operation now using the `reset()`
+#' ## function, accessing the data is now faster, because the operation
+#' ## no longer needs to be applied to the original data before returning it
+#' ## to the user.
+#'
+#'
+#' ## -------- FUNCTIONS RETURNING THE RESULT --------
+#'
+#' ## With the `spectrapply()` function it is possible to apply an
+#' ## arbitrary function to each spectrum in a Spectra.
+#' ## In the example below we calculate the mean intensity for each spectrum
+#' ## in a subset of the sps_dda data.
Note that we can access all variables +#' ## of each individual spectrum either with the `$` operator or the +#' ## corresponding method. +#' res <- spectrapply(sps_dda[1:20], FUN = function(x) mean(x$intensity[[1]])) +#' head(res) +#' +#' ## As an alternative, applying a function `FUN` to a `Spectra` can be +#' ## performed *chunk-wise*. The advantage of this is, that only the data for +#' ## one chunk at a time needs to be loaded into memory reducing the memory +#' ## demand. This type of processing can be performed by specifying the size +#' ## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` +#' ## parameter +#' spectrapply(sps_dda[1:20], lengths, chunkSize = 5L) +#' +#' ## Precursor intensity estimation. Some manufacturers don't report the +#' ## precursor intensity for MS2 spectra: +#' sps_dda |> +#' filterMsLevel(2L) |> +#' precursorIntensity() +#' +#' ## This intensity can however be estimated from the previously measured +#' ## MS1 scan with the `estimatePrecursorIntensity()` function: +#' pi <- estimatePrecursorIntensity(sps_dda) +#' +#' ## This function returned the result as a `numeric` vector with one +#' ## value per spectrum: +#' pi +#' +#' ## We can replace the precursor intensity values of the originating +#' ## object: +#' sps_dda$precursorIntensity <- pi +#' sps_dda |> +#' filterMsLevel(2L) |> +#' precursorIntensity() +#' +NULL #' @exportMethod addProcessing #' @@ -1751,7 +3215,7 @@ setMethod("filterValues", "Spectra", #' #' @importFrom BiocGenerics updateObject #' -#' @rdname Spectra +#' @rdname addProcessing setMethod("addProcessing", "Spectra", function(object, FUN, ..., spectraVariables = character()) { if (missing(FUN)) @@ -1766,12 +3230,7 @@ setMethod("addProcessing", "Spectra", function(object, FUN, ..., object }) -#' @rdname Spectra -setMethod("backendBpparam", "Spectra", function(object, BPPARAM = bpparam()) { - backendBpparam(object@backend, BPPARAM) -}) - -#' @rdname Spectra +#' @rdname addProcessing #' #' @importMethodsFrom ProtGenerics bin #' @@ -1799,7 +3258,7 @@ setMethod("bin", "Spectra", function(x, binSize = 1L, breaks = NULL, x }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod containsMz setMethod("containsMz", "Spectra", function(object, mz = numeric(), @@ -1825,7 +3284,7 @@ setMethod("containsMz", "Spectra", function(object, mz = numeric(), } }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod containsNeutralLoss setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, @@ -1848,7 +3307,7 @@ setMethod("containsNeutralLoss", "Spectra", function(object, neutralLoss = 0, } }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @importFrom MsCoreUtils entropy nentropy #' @@ -1863,12 +3322,12 @@ setMethod("entropy", "Spectra", function(object, normalized = TRUE) { ) } else numeric() }) -#' @rdname Spectra +#' @rdname addProcessing setMethod("entropy", "ANY", function(object, ...) { MsCoreUtils::entropy(object) }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod pickPeaks setMethod("pickPeaks", "Spectra", @@ -1910,7 +3369,7 @@ setMethod("pickPeaks", "Spectra", object }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @exportMethod replaceIntensitiesBelow setMethod("replaceIntensitiesBelow", "Spectra", @@ -1937,7 +3396,7 @@ setMethod("replaceIntensitiesBelow", "Spectra", object }) -#' @rdname Spectra +#' @rdname addProcessing setMethod("reset", "Spectra", function(object, ...) 
{ object@backend <- reset(object@backend) object@processingQueue <- list() @@ -1948,7 +3407,7 @@ setMethod("reset", "Spectra", function(object, ...) { object }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @importFrom ProtGenerics smooth #' @importFrom MsCoreUtils coefMA coefWMA coefSG @@ -1979,7 +3438,7 @@ setMethod("smooth", "Spectra", x }) -#' @rdname Spectra +#' @rdname addProcessing #' #' @importMethodsFrom ProtGenerics spectrapply #' @@ -2089,16 +3548,119 @@ setMethod( ################################################################################ #' @title Spectra similarity calculations - -#' @rdname Spectra #' -#' @exportMethod compareSpectra +#' @name compareSpectra +#' +#' @aliases compareSpectra +#' +#' @description +#' +#' `compareSpectra()` compares each spectrum in `x` with each spectrum in `y` +#' using the function provided with `FUN` (defaults to [ndotproduct()]). If +#' `y` is missing, each spectrum in `x` is compared with each other spectrum +#' in `x`. +#' The matching/mapping of peaks between the compared spectra is done with the +#' `MAPFUN` function. The default [joinPeaks()] matches peaks of both spectra +#' and allows to keep all peaks from the first spectrum (`type = "left"`), +#' from the second (`type = "right"`), from both (`type = "outer"`) and to +#' keep only matching peaks (`type = "inner"`); see [joinPeaks()] for more +#' information and examples). The `MAPFUN` function should have parameters +#' `x`, `y`, `xPrecursorMz` and `yPrecursorMz` as these values are passed to +#' the function. +#' +#' In addition to `joinPeaks()` also [joinPeaksGnps()] is supported for +#' GNPS-like similarity score calculations. Note that `joinPeaksGnps()` should +#' only be used in combination with `FUN = MsCoreUtils::gnps` +#' (see [joinPeaksGnps()] for more information and details). Use +#' `MAPFUN = joinPeaksNone` to disable internal peak matching/mapping if a +#' similarity scoring function is used that performs the matching internally. +#' +#' `FUN` is supposed to be a function to compare intensities of (matched) +#' peaks of the two spectra that are compared. The function needs to take two +#' matrices with columns `"mz"` and `"intensity"` as input and is supposed +#' to return a single numeric as result. In addition to the two peak matrices +#' the spectra's precursor m/z values are passed to the function as parameters +#' `xPrecursorMz` (precursor m/z of the `x` peak matrix) and `yPrecursorMz` +#' (precursor m/z of the `y` peak matrix). Additional parameters to functions +#' `FUN` and `MAPFUN` can be passed with `...`. Parameters `ppm` and +#' `tolerance` are passed to both `MAPFUN` and `FUN`. +#' The function returns a `matrix` with the results of `FUN` for each +#' comparison, number of rows equal to `length(x)` and number of columns +#' equal `length(y)` (i.e. element in row 2 and column 3 is the result from +#' the comparison of `x[2]` with `y[3]`). If `SIMPLIFY = TRUE` the `matrix` +#' is *simplified* to a `numeric` if length of `x` or `y` is one. See also +#' the vignette for additional examples, such as using spectral entropy +#' similarity in the scoring. +#' +#' @param FUN function to compare intensities of peaks between two spectra. +#' Defaults to [ndotproduct()]. +#' +#' @param MAPFUN For `compareSpectra()`: function to map/match peaks between +#' the two compared spectra. See [joinPeaks()] for more information and +#' possible functions. Defaults to [joinPeaks()]. 
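+#'     For illustration, a minimal sketch of a custom mapping function with
+#'     the interface described above (not an exported function; it simply
+#'     delegates to [joinPeaks()] and is expected to return the matched peak
+#'     matrices as [joinPeaks()] does):
+#'
+#' ```
+#' mapfun_sketch <- function(x, y, xPrecursorMz = NA_real_,
+#'                           yPrecursorMz = NA_real_,
+#'                           tolerance = 0, ppm = 20, ...) {
+#'     ## match the peaks of the two peak matrices using the default
+#'     ## peak matching, keeping all peaks from both spectra
+#'     joinPeaks(x, y, tolerance = tolerance, ppm = ppm, type = "outer")
+#' }
+#' ```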
+#' +#' @param ppm `numeric(1)` defining a relative, m/z-dependent, maximal +#' accepted difference between m/z values for peaks to be matched. This +#' parameter is directly passed to `MAPFUN`. +#' +#' @param tolerance `numeric(1)` allowing to define a constant maximal +#' accepted difference between m/z values for peaks to be matched. This +#' parameter is directly passed to `MAPFUN`. +#' +#' @param x A `Spectra` object. +#' +#' @param y A `Spectra` object. +#' +#' @param SIMPLIFY `logical(1)` defining whether the result matrix should be +#' *simplified* to a `numeric` if possible (i.e. if either `x` or `y` is +#' of length 1). +#' +#' @param ... Additional arguments passed to the internal functions. #' #' @importFrom MsCoreUtils ndotproduct #' #' @importMethodsFrom ProtGenerics compareSpectra #' #' @exportMethod compareSpectra +#' +#' @author Sebastian Gibb, Johannes Rainer, Laurent Gatto +#' +#' @examples +#' +#' ## Load a `Spectra` object with LC-MS/MS data. +#' fl <- system.file("TripleTOF-SWATH", "PestMix1_DDA.mzML", +#' package = "msdata") +#' sps_dda <- Spectra(fl) +#' sps_dda +#' +#' ## Restrict to MS2 (fragment) spectra: +#' sps_ms2 <- filterMsLevel(sps_dda, msLevel = 2L) +#' +#' ## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using +#' ## the normalized dotproduct method. +#' res <- compareSpectra(sps_ms2[2:3], sps_ms2[10:20]) +#' ## first row contains comparisons of spectrum 2 with spectra 10 to 20 and +#' ## the second row comparisons of spectrum 3 with spectra 10 to 20 +#' res +#' +#' ## We next calculate the pairwise similarity for the first 10 spectra +#' compareSpectra(sps_ms2[1:10]) +#' +#' ## Use compareSpectra to determine the number of common (matching) peaks +#' ## with a ppm of 10: +#' ## type = "inner" uses a *inner join* to match peaks, i.e. keeps only +#' ## peaks that can be mapped betwen both spectra. The provided FUN returns +#' ## simply the number of matching peaks. +#' compareSpectra(sps_ms2[2:3], sps_ms2[10:20], ppm = 10, type = "inner", +#' FUN = function(x, y, ...) nrow(x)) +#' +#' ## We repeat this calculation between all pairwise combinations +#' ## of the first 20 spectra +#' compareSpectra(sps_ms2[1:20], ppm = 10, type = "inner", +#' FUN = function(x, y, ...) 
nrow(x)) +NULL + +#' @rdname compareSpectra setMethod("compareSpectra", signature(x = "Spectra", y = "Spectra"), function(x, y, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, FUN = ndotproduct, ..., SIMPLIFY = TRUE) { @@ -2109,7 +3671,7 @@ setMethod("compareSpectra", signature(x = "Spectra", y = "Spectra"), mat <- as.vector(mat) mat }) -#' @rdname Spectra +#' @rdname compareSpectra setMethod("compareSpectra", signature(x = "Spectra", y = "missing"), function(x, y = NULL, MAPFUN = joinPeaks, tolerance = 0, ppm = 20, FUN = ndotproduct, ..., SIMPLIFY = TRUE) { @@ -2125,3 +3687,15 @@ setMethod("compareSpectra", signature(x = "Spectra", y = "missing"), mat <- as.vector(mat) mat }) + + +################################################################################ +## +## methods with documentation in Spectra-functions.R +## +################################################################################ + +#' @rdname processingChunkSize +setMethod("backendBpparam", "Spectra", function(object, BPPARAM = bpparam()) { + backendBpparam(object@backend, BPPARAM) +}) diff --git a/R/countIdentifications.R b/R/countIdentifications.R index b7ddb687..2f3e8c15 100644 --- a/R/countIdentifications.R +++ b/R/countIdentifications.R @@ -40,6 +40,10 @@ #' spectra variable `countIdentifications` with the number of #' identification for each scan. #' +#' @seealso +#' +#' [addProcessing()] for other data analysis functions. +#' #' @author Laurent Gatto #' #' @export diff --git a/R/peaks-functions.R b/R/peaks-functions.R index 7639538a..f34adde9 100644 --- a/R/peaks-functions.R +++ b/R/peaks-functions.R @@ -308,7 +308,13 @@ NULL #' #' @author Johannes Rainer, Michael Witting #' -#' @seealso [gnps()] +#' @seealso +#' +#' - [compareSpectra()] for the function to calculate similarities between +#' spectra. +#' +#' - [gnps()] in the *MsCoreUtils* package for more information on the GNPS +#' similarity score. #' #' @importFrom MsCoreUtils join ppm #' diff --git a/man/MsBackend.Rd b/man/MsBackend.Rd index 19bd8b7c..0bf98b0a 100644 --- a/man/MsBackend.Rd +++ b/man/MsBackend.Rd @@ -938,7 +938,7 @@ This backend provides an \code{export()} method to export data from a \code{Spec The parameters are: \itemize{ \item \code{object}: an instance of the \code{MsBackendMzR} class. -\item \code{x}: the \linkS4class{Spectra} object to be exported. +\item \code{x}: the \link{Spectra} object to be exported. \item \code{file}: \code{character} with the (full) output file name(s). Should be of length 1 or equal \code{length(x)}. If a single file is specified, all spectra are exported to that file. Alternatively it is possible to specify @@ -952,7 +952,7 @@ backend and if \code{dataOrigin(x)} contains the original MS data file names. \item \code{BPPARAM}: parallel processing settings. } -See examples in \linkS4class{Spectra} or the vignette for more details and +See examples in \link{Spectra} or the vignette for more details and examples. 
The \code{MsBackendMzR} ignores parameter \code{columns} of the \code{peaksData()} diff --git a/man/Spectra.Rd b/man/Spectra.Rd index b4f87b54..1116f60c 100644 --- a/man/Spectra.Rd +++ b/man/Spectra.Rd @@ -1,169 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Spectra-functions.R, R/Spectra.R -\name{applyProcessing} -\alias{applyProcessing} -\alias{concatenateSpectra} -\alias{combineSpectra} -\alias{joinSpectraData} -\alias{processingLog} -\alias{deisotopeSpectra} -\alias{reduceSpectra} -\alias{filterPrecursorMaxIntensity} -\alias{filterPrecursorIsotopes} -\alias{scalePeaks} -\alias{filterPrecursorPeaks} +% Please edit documentation in R/Spectra.R +\name{Spectra} \alias{Spectra} \alias{Spectra-class} -\alias{[,Spectra-method} -\alias{uniqueMsLevels} -\alias{uniqueMsLevels,Spectra-method} -\alias{combinePeaks} +\alias{setBackend} +\alias{export} \alias{Spectra,missing-method} \alias{Spectra,MsBackend-method} \alias{Spectra,character-method} \alias{Spectra,ANY-method} \alias{setBackend,Spectra,MsBackend-method} -\alias{c,Spectra-method} -\alias{split,Spectra,ANY-method} \alias{export,Spectra-method} -\alias{acquisitionNum,Spectra-method} -\alias{peaksData,Spectra-method} -\alias{peaksVariables,Spectra-method} -\alias{centroided,Spectra-method} -\alias{centroided<-,Spectra-method} -\alias{collisionEnergy,Spectra-method} -\alias{collisionEnergy<-,Spectra-method} -\alias{dataOrigin,Spectra-method} -\alias{dataOrigin<-,Spectra-method} -\alias{dataStorage,Spectra-method} -\alias{dropNaSpectraVariables,Spectra-method} -\alias{intensity,Spectra-method} -\alias{ionCount,Spectra-method} -\alias{isCentroided,Spectra-method} -\alias{isEmpty,Spectra-method} -\alias{isolationWindowLowerMz,Spectra-method} -\alias{isolationWindowLowerMz<-,Spectra-method} -\alias{isolationWindowTargetMz,Spectra-method} -\alias{isolationWindowTargetMz<-,Spectra-method} -\alias{isolationWindowUpperMz,Spectra-method} -\alias{isolationWindowUpperMz<-,Spectra-method} -\alias{containsMz,Spectra-method} -\alias{containsNeutralLoss,Spectra-method} -\alias{spectrapply,Spectra-method} -\alias{length,Spectra-method} -\alias{msLevel,Spectra-method} -\alias{mz,Spectra-method} -\alias{lengths,Spectra-method} -\alias{polarity,Spectra-method} -\alias{polarity<-,Spectra-method} -\alias{precScanNum,Spectra-method} -\alias{precursorCharge,Spectra-method} -\alias{precursorIntensity,Spectra-method} -\alias{precursorMz,Spectra-method} -\alias{rtime,Spectra-method} -\alias{rtime<-,Spectra-method} -\alias{scanIndex,Spectra-method} -\alias{selectSpectraVariables,Spectra-method} -\alias{smoothed,Spectra-method} -\alias{smoothed<-,Spectra-method} -\alias{spectraData,Spectra-method} -\alias{spectraData<-,Spectra-method} -\alias{spectraNames,Spectra-method} -\alias{spectraNames<-,Spectra-method} -\alias{spectraVariables,Spectra-method} -\alias{tic,Spectra-method} -\alias{$,Spectra-method} -\alias{$<-,Spectra-method} -\alias{[[,Spectra-method} -\alias{[[<-,Spectra-method} -\alias{filterAcquisitionNum,Spectra-method} -\alias{filterEmptySpectra,Spectra-method} -\alias{filterDataOrigin,Spectra-method} -\alias{filterDataStorage,Spectra-method} -\alias{filterFourierTransformArtefacts,Spectra-method} -\alias{filterIntensity,Spectra-method} -\alias{filterIsolationWindow,Spectra-method} -\alias{filterMsLevel,Spectra-method} -\alias{filterMzRange,Spectra-method} -\alias{filterMzValues,Spectra-method} -\alias{filterPolarity,Spectra-method} -\alias{filterPrecursorMz,Spectra-method} 
-\alias{filterPrecursorMzRange,Spectra-method} -\alias{filterPrecursorMzValues,Spectra-method} -\alias{filterPrecursorCharge,Spectra-method} -\alias{filterPrecursorScan,Spectra-method} -\alias{filterRt,Spectra-method} -\alias{reset,Spectra-method} -\alias{filterRanges,Spectra-method} -\alias{filterValues,Spectra-method} -\alias{bin,Spectra-method} -\alias{compareSpectra,Spectra,Spectra-method} -\alias{compareSpectra,Spectra,missing-method} -\alias{pickPeaks,Spectra-method} -\alias{replaceIntensitiesBelow,Spectra-method} -\alias{smooth,Spectra-method} -\alias{addProcessing,Spectra-method} -\alias{coreSpectraVariables} -\alias{backendBpparam,Spectra-method} -\alias{combinePeaks,Spectra-method} -\alias{entropy,Spectra-method} -\alias{entropy,ANY-method} \alias{dataStorageBasePath,Spectra-method} \alias{dataStorageBasePath<-,Spectra-method} -\alias{asDataFrame} \title{The Spectra class to manage and access MS data} \usage{ -applyProcessing( - object, - f = processingChunkFactor(object), - BPPARAM = bpparam(), - ... -) - -concatenateSpectra(x, ...) - -combineSpectra( - x, - f = x$dataStorage, - p = x$dataStorage, - FUN = combinePeaksData, - ..., - BPPARAM = bpparam() -) - -joinSpectraData(x, y, by.x = "spectrumId", by.y, suffix.y = ".y") - -processingLog(x) - -deisotopeSpectra( - x, - substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL"), - tolerance = 0, - ppm = 20, - charge = 1 -) - -reduceSpectra(x, tolerance = 0, ppm = 20) - -filterPrecursorMaxIntensity(x, tolerance = 0, ppm = 20) - -filterPrecursorIsotopes( - x, - tolerance = 0, - ppm = 20, - substDefinition = isotopicSubstitutionMatrix("HMDB_NEUTRAL") -) - -scalePeaks(x, by = sum, msLevel. = uniqueMsLevels(x)) - -filterPrecursorPeaks( - object, - tolerance = 0, - ppm = 20, - mz = c("==", ">="), - msLevel. = uniqueMsLevels(object) -) - \S4method{Spectra}{missing}( object, processingQueue = list(), @@ -209,431 +60,24 @@ filterPrecursorPeaks( BPPARAM = bpparam() ) -\S4method{c}{Spectra}(x, ...) - -\S4method{split}{Spectra,ANY}(x, f, drop = FALSE, ...) - \S4method{export}{Spectra}(object, backend, ...) -\S4method{acquisitionNum}{Spectra}(object) - -\S4method{peaksData}{Spectra}( - object, - columns = c("mz", "intensity"), - f = processingChunkFactor(object), - ..., - BPPARAM = bpparam() -) - -\S4method{peaksVariables}{Spectra}(object) - -\S4method{centroided}{Spectra}(object) - -\S4method{centroided}{Spectra}(object) <- value - -\S4method{collisionEnergy}{Spectra}(object) - -\S4method{collisionEnergy}{Spectra}(object) <- value - -\S4method{dataOrigin}{Spectra}(object) - -\S4method{dataOrigin}{Spectra}(object) <- value - -\S4method{dataStorage}{Spectra}(object) - -\S4method{dropNaSpectraVariables}{Spectra}(object) - -\S4method{intensity}{Spectra}(object, f = processingChunkFactor(object), ...) - -\S4method{ionCount}{Spectra}(object) - -\S4method{isCentroided}{Spectra}(object, ...) 
- -\S4method{isEmpty}{Spectra}(x) - -\S4method{isolationWindowLowerMz}{Spectra}(object) - -\S4method{isolationWindowLowerMz}{Spectra}(object) <- value - -\S4method{isolationWindowTargetMz}{Spectra}(object) - -\S4method{isolationWindowTargetMz}{Spectra}(object) <- value - -\S4method{isolationWindowUpperMz}{Spectra}(object) - -\S4method{isolationWindowUpperMz}{Spectra}(object) <- value - -\S4method{containsMz}{Spectra}( - object, - mz = numeric(), - tolerance = 0, - ppm = 20, - which = c("any", "all"), - BPPARAM = bpparam() -) - -\S4method{containsNeutralLoss}{Spectra}( - object, - neutralLoss = 0, - tolerance = 0, - ppm = 20, - BPPARAM = bpparam() -) - -\S4method{spectrapply}{Spectra}( - object, - FUN, - ..., - chunkSize = integer(), - f = factor(), - BPPARAM = SerialParam() -) - -\S4method{length}{Spectra}(x) - -\S4method{msLevel}{Spectra}(object) - -\S4method{mz}{Spectra}(object, f = processingChunkFactor(object), ...) - -\S4method{lengths}{Spectra}(x, use.names = FALSE) - -\S4method{polarity}{Spectra}(object) - -\S4method{polarity}{Spectra}(object) <- value - -\S4method{precScanNum}{Spectra}(object) - -\S4method{precursorCharge}{Spectra}(object) - -\S4method{precursorIntensity}{Spectra}(object) - -\S4method{precursorMz}{Spectra}(object) - -\S4method{rtime}{Spectra}(object) - -\S4method{rtime}{Spectra}(object) <- value - -\S4method{scanIndex}{Spectra}(object) - -\S4method{selectSpectraVariables}{Spectra}( - object, - spectraVariables = union(spectraVariables(object), peaksVariables(object)) -) - -\S4method{smoothed}{Spectra}(object) - -\S4method{smoothed}{Spectra}(object) <- value - -\S4method{spectraData}{Spectra}(object, columns = spectraVariables(object)) - -\S4method{spectraData}{Spectra}(object) <- value - -\S4method{spectraNames}{Spectra}(object) - -\S4method{spectraNames}{Spectra}(object) <- value - -\S4method{spectraVariables}{Spectra}(object) - -\S4method{tic}{Spectra}(object, initial = TRUE) - -\S4method{$}{Spectra}(x, name) - -\S4method{$}{Spectra}(x, name) <- value - -\S4method{[[}{Spectra}(x, i, j, ...) - -\S4method{[[}{Spectra}(x, i, j, ...) <- value - -\S4method{[}{Spectra}(x, i, j, ..., drop = FALSE) - -\S4method{filterAcquisitionNum}{Spectra}( - object, - n = integer(), - dataStorage = character(), - dataOrigin = character() -) - -\S4method{filterEmptySpectra}{Spectra}(object) - -\S4method{filterDataOrigin}{Spectra}(object, dataOrigin = character()) - -\S4method{filterDataStorage}{Spectra}(object, dataStorage = character()) - -\S4method{filterFourierTransformArtefacts}{Spectra}( - object, - halfWindowSize = 0.05, - threshold = 0.2, - keepIsotopes = TRUE, - maxCharge = 5, - isotopeTolerance = 0.005 -) - -\S4method{filterIntensity}{Spectra}( - object, - intensity = c(0, Inf), - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{filterIsolationWindow}{Spectra}(object, mz = numeric()) - -\S4method{filterMsLevel}{Spectra}(object, msLevel. = integer()) - -\S4method{filterMzRange}{Spectra}( - object, - mz = numeric(), - msLevel. = uniqueMsLevels(object), - keep = TRUE -) - -\S4method{filterMzValues}{Spectra}( - object, - mz = numeric(), - tolerance = 0, - ppm = 20, - msLevel. 
= uniqueMsLevels(object), - keep = TRUE -) - -\S4method{filterPolarity}{Spectra}(object, polarity = integer()) - -\S4method{filterPrecursorMz}{Spectra}(object, mz = numeric()) - -\S4method{filterPrecursorMzRange}{Spectra}(object, mz = numeric()) - -\S4method{filterPrecursorMzValues}{Spectra}(object, mz = numeric(), ppm = 20, tolerance = 0) - -\S4method{filterPrecursorCharge}{Spectra}(object, z = integer()) - -\S4method{filterPrecursorScan}{Spectra}(object, acquisitionNum = integer(), f = dataOrigin(object)) - -\S4method{filterRt}{Spectra}(object, rt = numeric(), msLevel. = uniqueMsLevels(object)) - -\S4method{reset}{Spectra}(object, ...) - -\S4method{filterRanges}{Spectra}( - object, - spectraVariables = character(), - ranges = numeric(), - match = c("all", "any") -) - -\S4method{filterValues}{Spectra}( - object, - spectraVariables = character(), - values = numeric(), - ppm = 0, - tolerance = 0, - match = c("all", "any") -) - -\S4method{bin}{Spectra}( - x, - binSize = 1L, - breaks = NULL, - msLevel. = uniqueMsLevels(x), - FUN = sum, - zero.rm = TRUE -) - -\S4method{compareSpectra}{Spectra,Spectra}( - x, - y, - MAPFUN = joinPeaks, - tolerance = 0, - ppm = 20, - FUN = ndotproduct, - ..., - SIMPLIFY = TRUE -) - -\S4method{compareSpectra}{Spectra,missing}( - x, - y = NULL, - MAPFUN = joinPeaks, - tolerance = 0, - ppm = 20, - FUN = ndotproduct, - ..., - SIMPLIFY = TRUE -) - -\S4method{pickPeaks}{Spectra}( - object, - halfWindowSize = 2L, - method = c("MAD", "SuperSmoother"), - snr = 0, - k = 0L, - descending = FALSE, - threshold = 0, - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{replaceIntensitiesBelow}{Spectra}( - object, - threshold = min, - value = 0, - msLevel. = uniqueMsLevels(object) -) - -\S4method{smooth}{Spectra}( - x, - halfWindowSize = 2L, - method = c("MovingAverage", "WeightedMovingAverage", "SavitzkyGolay"), - msLevel. = uniqueMsLevels(x), - ... -) - -\S4method{addProcessing}{Spectra}(object, FUN, ..., spectraVariables = character()) - -coreSpectraVariables() - -\S4method{uniqueMsLevels}{Spectra}(object, ...) - -\S4method{backendBpparam}{Spectra}(object, BPPARAM = bpparam()) - -\S4method{combinePeaks}{Spectra}( - object, - tolerance = 0, - ppm = 20, - intensityFun = base::mean, - mzFun = base::mean, - weighted = TRUE, - msLevel. = uniqueMsLevels(object), - ... -) - -\S4method{entropy}{Spectra}(object, normalized = TRUE) - -\S4method{entropy}{ANY}(object, ...) - \S4method{dataStorageBasePath}{Spectra}(object) \S4method{dataStorageBasePath}{Spectra}(object) <- value - -asDataFrame( - object, - i = seq_along(object), - spectraVars = spectraVariables(object) -) } \arguments{ -\item{object}{For \code{Spectra()}: either a \code{DataFrame} or \code{missing}. See -section on creation of \code{Spectra} objects for details. For all other -methods a \code{Spectra} object.} - -\item{f}{For \code{split()}: factor defining how to split \code{x}. See \code{\link[base:split]{base::split()}} -for details. For \code{setBackend()}: factor defining how to split the data -for parallelized copying of the spectra data to the new backend. For some -backends changing this parameter can lead to errors. -For \code{combineSpectra()}: \code{factor} defining the grouping of the spectra -that should be combined. For \code{spectrapply()}: \code{factor} how \code{object} -should be splitted. For \code{filterPrecursorScan()}: defining which spectra -belong to the same original data file (sample): Defaults to -\code{f = dataOrigin(x)}. 
-For \code{intensity()}, \code{mz()} and \code{peaksData()}: factor defining how data -should be chunk-wise loaded an processed. Defaults to -\code{\link[=processingChunkFactor]{processingChunkFactor()}}.} - -\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more -information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method -of the \linkS4class{MsBackend}.} - -\item{...}{Additional arguments.} - -\item{x}{A \code{Spectra} object.} - -\item{p}{For \code{combineSpectra()}: \code{factor} defining how to split the input -\code{Spectra} for parallel processing. Defaults to \code{x$dataStorage}, i.e., -depending on the used backend, per-file parallel processing will be -performed.} - -\item{FUN}{For \code{addProcessing()}: function to be applied to the peak matrix -of each spectrum in \code{object}. For \code{compareSpectra()}: function to compare -intensities of peaks between two spectra with each other. -For \code{combineSpectra()}: function to combine the (peak matrices) of the -spectra. See section \emph{Data manipulations} and examples below for more -details. -For \code{bin()}: function to aggregate intensity values of peaks falling -into the same bin. Defaults to \code{FUN = sum} thus summing up intensities. -For \code{spectrapply()} and \code{chunkapply()}: function to be applied to -\code{Spectra}.} - -\item{y}{A \code{Spectra} object. A \code{DataFrame} for \code{joinSpectraData()}.} - -\item{by.x}{A \code{character(1)} specifying the spectra variable used -for merging. Default is \code{"spectrumId"}.} - -\item{by.y}{A \code{character(1)} specifying the column used for -merging. Set to \code{by.x} if missing.} - -\item{suffix.y}{A \code{character(1)} specifying the suffix to be used -for making the names of columns in the merged spectra variables -unique. This suffix will be used to amend \code{names(y)}, while -\code{spectraVariables(x)} will remain unchanged.} - -\item{substDefinition}{For \code{deisotopeSpectra()} and -\code{filterPrecursorIsotopes()}: \code{matrix} or \code{data.frame} with definitions -of isotopic substitutions. Uses by default isotopic substitutions -defined from all compounds in the Human Metabolome Database (HMDB). See -\code{\link[=isotopologues]{isotopologues()}} or \code{\link[=isotopicSubstitutionMatrix]{isotopicSubstitutionMatrix()}} for details.} - -\item{tolerance}{For \code{compareSpectra()}, \code{containsMz()}, -\code{deisotopeSpectra()}, \code{filterMzValues()} and \code{reduceSpectra()}: -\code{numeric(1)} allowing to define a constant maximal accepted difference -between m/z values for peaks to be matched (or grouped). For -\code{containsMz()} it can also be of length equal \code{mz} to specify a different -tolerance for each m/z value. -For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the -(constant) maximal accepted difference of precursor m/z values of -spectra for grouping them into \emph{precursor groups}. For -\code{filterPrecursorIsotopes()}: passed directly to the \code{\link[=isotopologues]{isotopologues()}} -function. For \code{filterValues()}: \code{numeric} of any length allowing to -define a maximal accepted difference between user input \code{values} and the -\code{spectraVariables} values. If it is not equal to the length of the -value provided with parameter \code{spectraVariables}, \code{tolerance[1]} will be -recycled. 
Default is \code{tolerance = 0}}
-
-\item{ppm}{For \code{compareSpectra()}, \code{containsMz()}, \code{deisotopeSpectra()},
-\code{filterMzValues()} and \code{reduceSpectra()}: \code{numeric(1)}
-defining a relative, m/z-dependent, maximal accepted difference between
-m/z values for peaks to be matched (or grouped).
-For \code{filterPrecursorMaxIntensity()}: \code{numeric(1)} defining the relative
-maximal accepted difference of precursor m/z values of spectra for
-grouping them into \emph{precursor groups}. For \code{filterPrecursorIsotopes()}:
-passed directly to the \code{\link[=isotopologues]{isotopologues()}} function.
-For \code{filterValues()}: \code{numeric} of any length allowing to define
-a maximal accepted difference between user input \code{values} and the
-\code{spectraVariables} values. If it is not equal to the length of the
-value provided with parameter \code{spectraVariables}, \code{ppm[1]} will be
-recycled.}
-
-\item{charge}{For \code{deisotopeSpectra()}: expected charge of the ionized
-compounds. See \code{\link[=isotopologues]{isotopologues()}} for details.}
-
-\item{by}{For \code{scalePeaks()}: function to calculate a single \code{numeric} from
-intensity values of a spectrum by which all intensities (of
-that spectrum) should be divided by. The default \code{by = sum} will
-divide intensities of each spectrum by the sum of intensities of that
-spectrum.}
-
-\item{msLevel.}{\code{integer} defining the MS level(s) of the spectra to which
-the function should be applied (defaults to all MS levels of \code{object}.
-For \code{filterMsLevel()}: the MS level to which \code{object} should be
-subsetted.}
-
-\item{mz}{For \code{filterIsolationWindow()}: \code{numeric(1)} with the m/z value to
-filter the object. For \code{filterPrecursorMz()} and \code{filterMzRange()}:
-\code{numeric(2)} defining the lower and upper m/z boundary.
-For \code{filterMzValues()} and \code{filterPrecursorMzValues()}: \code{numeric} with
-the m/z values to match peaks or precursor m/z against.}
+\item{object}{For \code{Spectra()}: an object to instantiate the \code{Spectra}
+object and initialize it with data. See section on creation of
+\code{Spectra} objects for details. For all other methods a \code{Spectra} object.}

\item{processingQueue}{For \code{Spectra()}: optional \code{list} of
\linkS4class{ProcessingStep} objects.}

\item{metadata}{For \code{Spectra()}: optional \code{list} with metadata information.}

+\item{...}{Additional arguments.}
+
\item{backend}{For \code{Spectra()}: \linkS4class{MsBackend} to be used as backend. See
section on creation of \code{Spectra} objects for details. For \code{setBackend()}:
instance of \linkS4class{MsBackend} that supports \code{setBackend()} (i.e. for
@@ -643,244 +87,56 @@ passing the full spectra data to the initialize method. See section on
creation of \code{Spectra} objects for details. For \code{export()}:
\linkS4class{MsBackend} to be used to export the data.}

-\item{source}{For \code{Spectra()}: instance of \linkS4class{MsBackend} that can be used
-to import spectrum data from the provided files. See section \emph{Creation
-of objects, conversion and changing the backend} for more details.}

-\item{drop}{For \code{[}, \code{split()}: not considered.}

-\item{columns}{For \code{spectraData()} accessor: optional \code{character} with
-column names (spectra variables) that should be included in the
-returned \code{DataFrame}. By default, all columns are returned.
-For \code{peaksData()} accessor: optional \code{character} with requested columns -in the individual \code{matrix} of the returned \code{list}. Defaults to -\code{c("mz", "value")} but any values returned by \code{peaksVariables(object)} -with \code{object} being the \code{Spectra} object are supported.} - -\item{value}{replacement value for \verb{<-} methods. See individual -method description or expected data type.} - -\item{which}{for \code{containsMz()}: either \code{"any"} or \code{"all"} defining whether -any (the default) or all provided \code{mz} have to be present in the -spectrum.} - -\item{neutralLoss}{for \code{containsNeutralLoss()}: \code{numeric(1)} defining the -value which should be subtracted from the spectrum's precursor m/z.} - -\item{chunkSize}{For \code{spectrapply()}: size of the chunks into which \code{Spectra} -should be split. This parameter overrides parameters \code{f} and \code{BPPARAM}.} - -\item{use.names}{For \code{lengths()}: ignored.} - -\item{spectraVariables}{\itemize{ -\item For \code{selectSpectraVariables()}: \code{character} with the -names of the spectra variables to which the backend should be -subsetted. -\itemize{ -\item For \code{addProcessing()}: \code{character} with additional spectra variables -that should be passed along to the function defined with \code{FUN}. See -function description for details. -\item For \code{filterRanges()} and \code{filterValues()}: \code{character} vector -specifying the column(s) from \code{spectraData(object)} on which to filter -the data and that correspond to the the names of the spectra variables -that should be used for the filtering. -} -}} - -\item{initial}{For \code{tic()}: \code{logical(1)} whether the initially -reported total ion current should be reported, or whether the -total ion current should be (re)calculated on the actual data -(\code{initial = FALSE}, same as \code{ionCount()}).} - -\item{name}{For \code{$} and \verb{$<-}: the name of the spectra variable to return -or set.} - -\item{i}{For \code{[}: \code{integer}, \code{logical} or \code{character} to subset the -object. For \code{asDataFrame()} an \code{numeric} indicating which scans to coerce -to a \code{DataFrame} (default is \code{seq_along(object)}).} - -\item{j}{For \code{[}: not supported.} - -\item{n}{for \code{filterAcquisitionNum()}: \code{integer} with the acquisition -numbers to filter for.} - -\item{dataStorage}{For \code{filterDataStorage()}: \code{character} to define which -spectra to keep. -For \code{filterAcquisitionNum()}: optionally specify if filtering should -occur only for spectra of selected \code{dataStorage}.} +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information. This is passed directly to the \code{\link[=backendInitialize]{backendInitialize()}} method +of the \linkS4class{MsBackend}.} -\item{dataOrigin}{For \code{filterDataOrigin()}: \code{character} to define which -spectra to keep. -For \code{filterAcquisitionNum()}: optionally specify if filtering should -occurr only for spectra of selected \code{dataOrigin}.} +\item{source}{For \code{Spectra()}: instance of \linkS4class{MsBackend} that can be +used to import spectrum data from the provided files. See section +\emph{Creation of objects} for more details.} -\item{halfWindowSize}{\itemize{ -\item For \code{pickPeaks()}: \code{integer(1)}, used in the -identification of the mass peaks: a local maximum has to be the maximum -in the window from \code{(i - halfWindowSize):(i + halfWindowSize)}. 
-\itemize{ -\item For \code{smooth()}: \code{integer(1)}, used in the smoothing algorithm, the -window reaches from \code{(i - halfWindowSize):(i + halfWindowSize)}. -\item For \code{filterFourierTransformArtefacts()}: \code{numeric(1)} defining the m/z -window left and right of a peak where to remove fourier transform -artefacts. -} -}} +\item{f}{For \code{setBackend()}: factor defining how to split the data +for parallelized copying of the spectra data to the new backend. For +some backends changing this parameter can lead to errors. Defaults to +\code{\link[=processingChunkFactor]{processingChunkFactor()}}.} -\item{threshold}{\itemize{ -\item For \code{pickPeaks()}: a \code{double(1)} defining the proportion of the maximal -peak intensity. Just values above are used for the weighted mean -calculation. -\itemize{ -\item For \code{replaceIntensitiesBelow()}: a \code{numeric(1)} defining the threshold -or a \code{function} to calculate the threshold for each spectrum on its -intensity values. Defaults to \code{threshold = min}. -\item For \code{filterFourierTransformArtefacts()}: the relative intensity (to a -peak) below which peaks are considered fourier artefacts. Defaults to -\code{threshold = 0.2} hence removing peaks that have an intensity below 0.2 -times the intensity of the tested peak (within the selected -\code{halfWindowSize}). +\item{value}{For \code{dataStorageBasePath()}: A \code{character} vector that defines +the base directory where the data storage files can be found.} } -}} - -\item{keepIsotopes}{For \code{filterFourierTransformArtefacts()}: whether isotope -peaks should not be removed as fourier artefacts.} - -\item{maxCharge}{For \code{filterFourierTransformArtefacts()}: the maximum charge -to be considered for isotopes.} - -\item{isotopeTolerance}{For \code{filterFourierTransformArtefacts()}: the m/z -\code{tolerance} to be used to define whether peaks might be isotopes of -the current tested peak.} - -\item{intensity}{For \code{filterIntensity()}: \code{numeric} of length 1 or 2 -defining either the lower or the lower and upper intensity limit for the -filtering, or a \code{function} that takes the intensities as input and -returns a \code{logical} (same length then peaks in the spectrum) whether the -peak should be retained or not. Defaults to \code{intensity = c(0, Inf)} thus -only peaks with \code{NA} intensity are removed.} - -\item{keep}{For \code{filterMzValues()} and \code{filterMzRange()}: \code{logical(1)} -whether the matching peaks should be retained (\code{keep = TRUE}, the -default) or dropped (\code{keep = FALSE}).} - -\item{polarity}{for \code{filterPolarity()}: \code{integer} specifying the polarity to -to subset \code{object}.} - -\item{z}{For \code{filterPrecursorCharge()}: \code{integer()} with the precursor -charges to be used as filter.} - -\item{acquisitionNum}{for \code{filterPrecursorScan()}: \code{integer} with the -acquisition number of the spectra to which the object should be -subsetted.} - -\item{rt}{for \code{filterRt()}: \code{numeric(2)} defining the retention time range to -be used to subset/filter \code{object}.} - -\item{ranges}{for \code{filterRanges()}: A \code{numeric} vector of paired values -(upper and lower boundary) that define the ranges to filter the \code{object}. 
-These paired values need to be in the same order as the -\code{spectraVariables} parameter (see below).} - -\item{match}{For \code{filterRanges()} and \code{filterValues()}: \code{character(1) } -defining whether the condition has to match for all provided -\code{ranges}/\code{values} (\code{match = "all"}; the default), or for any of them -(\code{match = "any"}) for spectra to be retained.} - -\item{values}{for \code{filterValues()}: A \code{numeric} vector that define the -values to filter the Spectra data. These values need to be in the same -order as the \code{spectraVariables} parameter.} - -\item{binSize}{For \code{bin()}: \code{numeric(1)} defining the size for the m/z bins. -Defaults to \code{binSize = 1}.} - -\item{breaks}{For \code{bin()}: \code{numeric} defining the m/z breakpoints between -bins.} - -\item{zero.rm}{\code{logical}. For \code{bin()}: indicating whether to remove bins -with zero intensity. Defaults to \code{TRUE}, meaning the function will -discard bins created with an intensity of 0 to enhance memory efficiency.} - -\item{MAPFUN}{For \code{compareSpectra()}: function to map/match peaks between the -two compared spectra. See \code{\link[=joinPeaks]{joinPeaks()}} for more information and possible -functions.} - -\item{SIMPLIFY}{For \code{compareSpectra()} whether the result matrix should be -\emph{simplified} to a \code{numeric} if possible (i.e. if either \code{x} or \code{y} is -of length 1).} - -\item{method}{\itemize{ -\item For \code{pickPeaks()}: \code{character(1)}, the noise estimators that -should be used, currently the the \emph{M}edian \emph{A}bsolute \emph{D}eviation -(\code{method = "MAD"}) and Friedman's Super Smoother -(\code{method = "SuperSmoother"}) are supported. +\description{ +The \code{Spectra} class encapsules spectral mass spectrometry (MS) data and +related metadata. The MS data is represented by a \emph{backend} extending the +virual \link{MsBackend} class which provides the data to the \code{Spectra} object. +The \code{Spectra} class implements only data accessor, filtering and analysis +methods for the MS data and relies on its \emph{backend} to provide the MS data. +This allows to change data representations of a \code{Spectra} object depending +on the user's needs and properties of the data. Different backends and +their properties are explained in the \link{MsBackend} documentation. + +Documentation on other topics and functionality of \code{Spectra}can be found in: \itemize{ -\item For \code{smooth()}: \code{character(1)}, the smoothing function that should be -used, currently, the Moving-Average- (\code{method = "MovingAverage"}), -Weighted-Moving-Average- (\verb{method = "WeightedMovingAverage")}, -Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported. -} -}} - -\item{snr}{For \code{pickPeaks()}: \code{double(1)} defining the -\emph{S}ignal-to-\emph{N}oise-\emph{R}atio. 
The intensity of a local maximum has to be
-higher than \code{snr * noise} to be considered as peak.}
-
-\item{k}{For \code{pickPeaks()}: \code{integer(1)}, number of values left and right of
-the peak that should be considered in the weighted mean calculation.}
-
-\item{descending}{For \code{pickPeaks()}: \code{logical}, if \code{TRUE} just values between
-the nearest valleys around the peak centroids are used.}
-
-\item{intensityFun}{For \code{combinePeaks()}: function to be used to aggregate
-intensities for all peaks in each peak group into a single intensity
-value.}
-
-\item{mzFun}{For \code{combinePeaks()}: function to aggregate m/z values for all
-peaks within each peak group into a single m/z value. This parameter
-is ignored if \code{weighted = TRUE} (the default).}
-
-\item{weighted}{For \code{combinePeaks()}: \code{logical(1)} whether m/z values of
-peaks within each peak group should be aggregated into a single m/z
-value using an intensity-weighted mean. Defaults to \code{weighted = TRUE}.}
-
-\item{normalized}{for \code{entropy()}: \code{logical(1)} whether the normalized
-entropy should be calculated (default). See also \code{\link[=nentropy]{nentropy()}} for
-details.}
-
-\item{spectraVars}{\code{character()} indicating what spectra variables to add to
-the \code{DataFrame}. Default is \code{spectraVariables(object)}, i.e. all
-available variables.}
-}
-\value{
-See individual method description for the return value.
+\item{value}{For \code{dataStorageBasePath()}: A \code{character} vector that defines
+the base directory where the data storage files can be found.}
}
\description{
The \code{Spectra} class encapsulates spectral mass spectrometry (MS) data and
+related metadata. The MS data is represented by a \emph{backend} extending the
+virtual \link{MsBackend} class which provides the data to the \code{Spectra} object.
+The \code{Spectra} class implements only data accessor, filtering and analysis
+methods for the MS data and relies on its \emph{backend} to provide the MS data.
+This allows to change data representations of a \code{Spectra} object depending
+on the user's needs and properties of the data. Different backends and
+their properties are explained in the \link{MsBackend} documentation.
+
+Documentation on other topics and functionality of \code{Spectra} can be found in:
\itemize{
+\item \code{\link[=spectraData]{spectraData()}} for accessing and using MS data through \code{Spectra} objects.
+\item \code{\link[=filterMsLevel]{filterMsLevel()}} to subset and filter \code{Spectra} objects.
+\item \code{\link[=plotSpectra]{plotSpectra()}} for visualization of \code{Spectra} objects.
+\item \code{\link[=processingChunkSize]{processingChunkSize()}} for information on parallel and chunk-wise data
+processing.
+\item \code{\link[=combineSpectra]{combineSpectra()}} for merging, aggregating and splitting of \code{Spectra}
+objects.
+\item \code{\link[=combinePeaks]{combinePeaks()}} for merging and aggregating \code{Spectra}'s mass peaks data.
+\item \code{\link[=addProcessing]{addProcessing()}} for data analysis functions.
+\item \code{\link[=compareSpectra]{compareSpectra()}} for spectra similarity calculations.
}
-\description{
-The \code{Spectra} class encapsules spectral mass spectrometry data and
-related metadata.
-
-It supports multiple data backends, e.g. in-memory (\link{MsBackendMemory},
-\code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}}), on-disk as mzML (\code{\link[=MsBackendMzR]{MsBackendMzR()}}) or HDF5
-(\code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}).
}
\details{
The \code{Spectra} class uses by default a lazy data manipulation strategy, i.e.
data manipulations such as performed with \code{replaceIntensitiesBelow()}
are not applied immediately to the data, but applied on-the-fly to the
-spectrum data once it is retrieved. For some backends that allow to write
-data back to the data storage (such as the \code{\link[=MsBackendMemory]{MsBackendMemory()}},
-\code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} and \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) it is possible to apply
-to queue with the \code{applyProcessing} function. See the *Data manipulation and
-analysis \emph{methods} section below for more details.
-
-For more information on parallel or chunk-wise processing (especially
-helpful for very large data sets) see \code{\link[=processingChunkSize]{processingChunkSize()}}.
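A minimal sketch of the lazy evaluation strategy described above (assuming
'sps' is an existing Spectra object, e.g. created as in the construction
example further below; the intensity threshold of 100 is arbitrary):

    library(Spectra)

    ## The manipulation is only added to the lazy processing queue ...
    sps <- replaceIntensitiesBelow(sps, threshold = 100)
    processingLog(sps)

    ## ... and is applied on-the-fly when the peak data is accessed:
    intensity(sps)

    ## For backends that support writing data, the queued operations can be
    ## made persistent (and the queue emptied) with:
    sps <- applyProcessing(sps)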
-
-To apply arbitrary functions to a \code{Spectra} use the \code{spectrapply()} function
-(or directly \code{\link[=chunkapply]{chunkapply()}} for chunk-wise processing). See description of
-the \code{spectrapply()} function below for details.
-
-For details on plotting spectra, see \code{\link[=plotSpectra]{plotSpectra()}}.
+spectrum data once it is retrieved. This enables data manipulation
+operations also for \emph{read-only} data representations. For some backends that
+allow to write data back to the data storage (such as the
+\code{\link[=MsBackendMemory]{MsBackendMemory()}}, \code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} and \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) it
+is possible to apply the queue with the \code{\link[=applyProcessing]{applyProcessing()}} function (see
+\code{\link[=applyProcessing]{applyProcessing()}} for details).

Clarifications regarding scan/acquisition numbers and indices:
\itemize{
@@ -897,15 +153,33 @@ the \code{acquisitionNum})

See also \href{https://github.com/lgatto/MSnbase/issues/525}{this issue}.
}

-\section{Creation of objects, conversion, changing the backend and export}{
+\section{Data stored in a \code{Spectra} object}{
+
+
+The \code{Spectra} object is a container for MS data that includes mass peak
+data (\emph{m/z} and related intensity values, also referred to as \emph{peaks data}
+in the context of \code{Spectra}) and metadata of individual spectra (so-called
+\emph{spectra variables}). While a core set of spectra variables (the
+\code{coreSpectraVariables()}) is guaranteed to be provided by a
+\code{Spectra}, it is possible to add arbitrary additional spectra variables to
+a \code{Spectra} object.
+
+The \code{Spectra} object is designed to contain MS data of a (large) set of mass
+spectra. The data is organized \emph{linearly} and can be thought of as a list of
+mass spectra, i.e. each element in the \code{Spectra} is one spectrum.
+}
+
+\section{Creation of objects}{


\code{Spectra} classes can be created with the \code{Spectra()} constructor function
which supports the following formats:
\itemize{
\item parameter \code{object} is a \code{data.frame} or \code{DataFrame} containing the
-spectrum data. The provided \code{backend} (by default a
-\linkS4class{MsBackendMemory}) will be initialized with that data.
+full spectrum data (spectra variables in columns as well as columns
+with the individual MS peak data, \emph{m/z} and intensity). The provided
+\code{backend} (by default a \linkS4class{MsBackendMemory}) will be initialized
+with that data.
\item parameter \code{object} is a \linkS4class{MsBackend} (assumed to be already
initialized).
\item parameter \code{object} is missing, in which case it is supposed that the data
@@ -920,41 +194,79 @@ which allows to import spectra data from mzML, mzXML or CDF files.

With \code{...} additional arguments can be passed to the backend's
\code{\link[=backendInitialize]{backendInitialize()}} method. Parameter \code{backend} allows to specify which
-\linkS4class{MsBackend} should be used for data storage.
+\linkS4class{MsBackend} should be used for data representation and storage.
+}
+
+\section{Data representation of a \code{Spectra}}{
+
+
+The MS data which can be accessed through the \code{Spectra} object is
+\emph{represented} by its backend, which means that this backend defines how
+and where the data is stored (e.g. in memory or on disk). The \code{Spectra}
+object relies on the backend to provide the MS data whenever it needs it
+for data processing.
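A brief sketch of the construction options and backends described in the two
sections above (the mzML files from the msdata package and all object names
are illustrative assumptions, not part of the documented API itself):

    library(Spectra)

    ## Construction from a DataFrame with spectra variables and the m/z and
    ## intensity values of each spectrum (uses MsBackendMemory by default):
    spd <- S4Vectors::DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
    spd$mz <- list(c(100.1, 103.3), c(100.1, 200.2))
    spd$intensity <- list(c(10, 20), c(30, 40))
    sps <- Spectra(spd)

    ## Construction from raw data files using the on-disk MsBackendMzR backend:
    fls <- dir(system.file("sciex", package = "msdata"), full.names = TRUE)
    sps_mzr <- Spectra(fls, source = MsBackendMzR())

    ## Change the data representation (backend) to an in-memory backend:
    sps_mem <- setBackend(sps_mzr, MsBackendMemory())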
+Different backends with different properties, such as minimal memory
+requirement or fast data access, are defined in the \emph{Spectra} package or
+one of the MsBackend* packages. More information on backends and their
+properties is provided in the documentation of \link{MsBackend}.
+
+On-disk backends keep only a limited amount of data in memory, retrieving
+most of the data (usually the MS peak data) upon request on-the-fly from
+their on-disk data representations. Moving the on-disk data storage of such
+a backend, or a serialized object, to a different location in the file
+system will cause data corruption. The \code{dataStorageBasePath()} and
+\verb{dataStorageBasePath<-} functions allow in such cases (and if the backend
+classes support this operation) to get or change the \emph{base}
+path to the directory of the backend's data storage. In-memory backends
+such as \link{MsBackendMemory} or \link{MsBackendDataFrame}, which keep all MS data
+in memory, don't support, or need, this function, but for \link{MsBackendMzR} this
+function can be used to update/adapt the path to the directory containing
+the original data files. Thus, for \code{Spectra} objects (using this backend)
+that were moved to another file system or computer, these functions allow to
+adjust/adapt the base file path.
+}
+
+\section{Changing data representation of a \code{Spectra}}{

-The backend of a \code{Spectra} object can be changed with the \code{setBackend()}
-method that takes an instance of the new backend as second parameter
-\code{backend}. A call to \code{setBackend(sps, backend = MsBackendDataFrame())}
+
+The data representation, i.e. the backend, of a \code{Spectra} object can be
+changed with the \code{setBackend()} method that takes an instance of the new
+backend as second parameter \code{backend}. A call to
+\code{setBackend(sps, backend = MsBackendDataFrame())}
would for example change the backend of \code{sps} to the \emph{in-memory}
\code{MsBackendDataFrame}. Changing to a backend is only supported if that
backend has a \code{data} parameter in its \code{backendInitialize()} method and if
\code{supportsSetBackend()} returns \code{TRUE} for that backend. \code{setBackend()} will
-transfer the full spectra data from the originating backend as a
-\code{DataFrame} to the new backend.
-Most \emph{read-only} backends do not support \code{setBackend()}. It is for example
-not possible to change the backend to a \emph{read-only} backend (such as
-the \code{\link[=MsBackendMzR]{MsBackendMzR()}} backend).
+transfer the full spectra data from the originating backend as a \code{DataFrame}
+to the new backend.
+
+Generally, it is not possible to change \strong{to} a read-only backend such as
+the \code{\link[=MsBackendMzR]{MsBackendMzR()}} backend.

The definition of the function is:
\code{setBackend(object, backend, ..., f = dataStorage(object), BPPARAM = bpparam())}
and its parameters are:
\itemize{
-\item parameter \code{object}: the \code{Spectra} object.
-\item parameter \code{backend}: an instance of the new backend, e.g.
-\verb{[MsBackendMemory()]}.
-\item parameter \code{f}: factor allowing to parallelize the change of the backends.
-By default the process of copying the spectra data from the original to the
+\item \code{object}: the \code{Spectra} object.
+\item \code{backend}: an instance of the new backend, e.g. \verb{[MsBackendMemory()]}.
+\item \code{f}: factor allowing to parallelize the change of the backends. 
By +default the process of copying the spectra data from the original to the new backend is performed separately (and in parallel) for each file. Users are advised to use the default setting. -\item parameter \code{...}: optional additional arguments passed to the -\code{\link[=backendInitialize]{backendInitialize()}} method of the new \code{backend}. -\item parameter \code{BPPARAM}: setup for the parallel processing. See \code{\link[=bpparam]{bpparam()}} for +\item \code{...}: optional additional arguments passed to the \code{\link[=backendInitialize]{backendInitialize()}} +method of the new \code{backend}. +\item \code{BPPARAM}: setup for the parallel processing. See \code{\link[=bpparam]{bpparam()}} for details. } +} + +\section{Exporting data from a \code{Spectra} object}{ + Data from a \code{Spectra} object can be \strong{exported} to a file with the -\code{export()} function. The actual export of the data has to be performed by +\code{export()} function. The actual export of the data is performed by the \code{export} method of the \link{MsBackend} class defined with the mandatory -parameter \code{backend}. Note however that not all backend classes support +parameter \code{backend} which defines also the format in which the data +is exported. Note however that not all backend classes support export of data. From the \code{MsBackend} classes in the \code{Spectra} package currently only the \code{MsBackendMzR} backend supports data export (to mzML/mzXML file(s)); see the help page of the \linkS4class{MsBackend} for @@ -971,604 +283,12 @@ of the data (i.e. which has a defined \code{export} method). \item \code{...}: additional parameters specific for the \code{MsBackend} passed with parameter \code{backend}. } - -The \code{dataStorageBasePath()} and \verb{dataStorageBasePath<-} functions allow, for -backend classes that support this operation, to get or change the \emph{base} -path to the directory where the backend stores the data. In-memory backends -such as \link{MsBackendMemory} or \link{MsBackendDataFrame} keeping all MS data in -memory don't support, and need, this function, but for \link{MsBackendMzR} this -function can be used to update/adapt the path to the directory containing -the original data files. Thus, for \code{Spectra} objects (using this backend) -that were moved to another file system or computer, these functions allow to -adjust/adapt the base file path. -} - -\section{Accessing spectra data}{ - -\itemize{ -\item \code{$}, \verb{$<-}: gets (or sets) a spectra variable for all spectra in \code{object}. -See examples for details. Note that replacing values of a peaks variable -is not supported with a non-empty processing queue, i.e. if any filtering -or data manipulations on the peaks data was performed. In these cases -\code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all cached data -operations. -\item \code{[[}, \verb{[[<-}: access or set/add a single spectrum variable (column) in the -backend. -\item \code{acquisitionNum()}: returns the acquisition number of each -spectrum. Returns an \code{integer} of length equal to the number of -spectra (with \code{NA_integer_} if not available). -\item \code{centroided()}, \verb{centroided<-}: gets or sets the centroiding -information of the spectra. \code{centroided()} returns a \code{logical} -vector of length equal to the number of spectra with \code{TRUE} if a -spectrum is centroided, \code{FALSE} if it is in profile mode and \code{NA} -if it is undefined. 
See also \code{isCentroided()} for estimating from -the spectrum data whether the spectrum is centroided. \code{value} -for \verb{centroided<-} is either a single \code{logical} or a \code{logical} of -length equal to the number of spectra in \code{object}. -\item \code{collisionEnergy()}, \verb{collisionEnergy<-}: gets or sets the -collision energy for all spectra in \code{object}. \code{collisionEnergy()} -returns a \code{numeric} with length equal to the number of spectra -(\code{NA_real_} if not present/defined), \verb{collisionEnergy<-} takes a -\code{numeric} of length equal to the number of spectra in \code{object}. -\item \code{coreSpectraVariables()}: returns the \emph{core} spectra variables along with -their expected data type. -\item \code{dataOrigin()}, \verb{dataOrigin<-}: gets or sets the \emph{data origin} for each -spectrum. \code{dataOrigin()} returns a \code{character} vector (same length than -\code{object}) with the origin of the spectra. \verb{dataOrigin<-} expects a -\code{character} vector (same length than \code{object}) with the replacement -values for the data origin of each spectrum. -\item \code{dataStorage()}: returns a \code{character} vector (same length than \code{object}) -with the data storage location of each spectrum. -\item \code{intensity()}: gets the intensity values from the spectra. Returns -a \code{\link[=NumericList]{NumericList()}} of \code{numeric} vectors (intensity values for each -spectrum). The length of the list is equal to the number of -\code{spectra} in \code{object}. -\item \code{ionCount()}: returns a \code{numeric} with the sum of intensities for -each spectrum. If the spectrum is empty (see \code{isEmpty()}), -\code{NA_real_} is returned. -\item \code{isCentroided()}: a heuristic approach assessing if the spectra in -\code{object} are in profile or centroided mode. The function takes -the \code{qtl}th quantile top peaks, then calculates the difference -between adjacent m/z value and returns \code{TRUE} if the first -quartile is greater than \code{k}. (See \code{Spectra:::.isCentroided()} for -the code.) -\item \code{isEmpty()}: checks whether a spectrum in \code{object} is empty -(i.e. does not contain any peaks). Returns a \code{logical} vector of -length equal number of spectra. -\item \code{isolationWindowLowerMz()}, \verb{isolationWindowLowerMz<-}: gets or sets the -lower m/z boundary of the isolation window. -\item \code{isolationWindowTargetMz()}, \verb{isolationWindowTargetMz<-}: gets or sets the -target m/z of the isolation window. -\item \code{isolationWindowUpperMz()}, \verb{isolationWindowUpperMz<-}: gets or sets the -upper m/z boundary of the isolation window. -\item \code{containsMz()}: checks for each of the spectra whether they contain mass -peaks with an m/z equal to \code{mz} (given acceptable difference as defined by -parameters \code{tolerance} and \code{ppm} - see \code{\link[=common]{common()}} for details). Parameter -\code{which} allows to define whether any (\code{which = "any"}, the default) or -all (\code{which = "all"}) of the \code{mz} have to match. The function returns -\code{NA} if \code{mz} is of length 0 or is \code{NA}. -\item \code{containsNeutralLoss()}: checks for each spectrum in \code{object} if it has a -peak with an m/z value equal to its precursor m/z - \code{neutralLoss} (given -acceptable difference as defined by parameters \code{tolerance} and \code{ppm}). -Returns \code{NA} for MS1 spectra (or spectra without a precursor m/z). 
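A short, illustrative sketch of some of the accessors listed above (using
the 'sps' object from the earlier examples; the m/z value is arbitrary):

    sps$rtime                               # spectra variable via $
    spectraVariables(sps)                   # available spectra variables
    centroided(sps)                         # centroiding information
    intensity(sps)                          # per-spectrum intensity values
    containsMz(sps, mz = 100.1, ppm = 20)   # any peak matching this m/z?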
-\item \code{length()}: gets the number of spectra in the object. -\item \code{lengths()}: gets the number of peaks (m/z-intensity values) per -spectrum. Returns an \code{integer} vector (length equal to the -number of spectra). For empty spectra, \code{0} is returned. -\item \code{msLevel()}: gets the spectra's MS level. Returns an integer vector (names -being spectrum names, length equal to the number of spectra) with the MS -level for each spectrum. -\item \code{mz()}: gets the mass-to-charge ratios (m/z) from the -spectra. Returns a \code{\link[=NumericList]{NumericList()}} or length equal to the number of -spectra, each element a \code{numeric} vector with the m/z values of -one spectrum. -\item \code{peaksData()}: gets the \emph{peaks} data for all spectra in \code{object}. Peaks -data consist of the m/z and intensity values as well as possible additional -annotations (variables) of all peaks of each spectrum. The function -returns a \code{\link[=SimpleList]{SimpleList()}} of two dimensional arrays (either \code{matrix} or -\code{data.frame}), with each array providing the values for the requested -\emph{peak variables} (by default \code{"mz"} and \code{"intensity"}). Optional parameter -\code{columns} is passed to the backend's \code{peaksData()} function to allow -the selection of specific (or additional) peaks variables (columns) that -should be extracted (if available). Importantly, -it is \strong{not} guaranteed that each backend supports this parameter (while -each backend must support extraction of \code{"mz"} and \code{"intensity"} columns). -Parameter \code{columns} defaults to \code{c("mz", "intensity")} but any value -returned by \code{peaksVariables(object)} is supported. -Note also that it is possible to extract the peak data with -\code{as(x, "list")} and \code{as(x, "SimpleList")} as a \code{list} and \code{SimpleList}, -respectively. Note however that, in contrast to \code{peaksData()}, \code{as()} -does not support the parameter \code{columns}. -\item \code{peaksVariables()}: lists the available variables for mass peaks provided -by the backend. Default peak variables are \code{"mz"} and \code{"intensity"} (which -all backends need to support and provide), but some backends might provide -additional variables. -These variables correspond to the column names of the peak data array -returned by \code{peaksData()}. -\item \code{polarity()}, \verb{polarity<-}: gets or sets the polarity for each -spectrum. \code{polarity()} returns an \code{integer} vector (length equal -to the number of spectra), with \code{0} and \code{1} representing negative -and positive polarities, respectively. \verb{polarity<-} expects an -\code{integer} vector of length 1 or equal to the number of spectra. -\item \code{precursorCharge()}, \code{precursorIntensity()}, \code{precursorMz()}, -\code{precScanNum()}, \code{precAcquisitionNum()}: gets the charge (\code{integer}), -intensity (\code{numeric}), m/z (\code{numeric}), scan index (\code{integer}) -and acquisition number (\code{interger}) of the precursor for MS level > -2 spectra from the object. Returns a vector of length equal to -the number of spectra in \code{object}. \code{NA} are reported for MS1 -spectra of if no precursor information is available. -\item \code{rtime()}, \verb{rtime<-}: gets or sets the retention times (in seconds) -for each spectrum. \code{rtime()} returns a \code{numeric} vector (length -equal to the number of spectra) with the retention time for each -spectrum. 
\verb{rtime<-} expects a numeric vector with length equal -to the number of spectra. -\item \code{scanIndex()}: returns an \code{integer} vector with the \emph{scan index} -for each spectrum. This represents the relative index of the -spectrum within each file. Note that this can be different to the -\code{acquisitionNum} of the spectrum which represents the index of the -spectrum during acquisition/measurement (as reported in the mzML file). -\item \code{smoothed()},\verb{smoothed<-}: gets or sets whether a spectrum is -\emph{smoothed}. \code{smoothed()} returns a \code{logical} vector of length equal -to the number of spectra. \verb{smoothed<-} takes a \code{logical} vector -of length 1 or equal to the number of spectra in \code{object}. -\item \code{spectraData()}: gets general spectrum metadata (annotation, also called -header). \code{spectraData()} returns a \code{DataFrame}. Note that this -method does by default \strong{not} return m/z or intensity values. -\item \verb{spectraData<-}: \strong{replaces} the full spectra data of the \code{Spectra} -object with the one provided with \code{value}. The \verb{spectraData<-} function -expects a \code{DataFrame} to be passed as value with the same number of rows -as there a spectra in \code{object}. Note that replacing values of -peaks variables is not supported with a non-empty processing queue, i.e. -if any filtering or data manipulations on the peaks data was performed. -In these cases \code{\link[=applyProcessing]{applyProcessing()}} needs to be called first to apply all -cached data operations and empty the processing queue. -\item \code{spectraNames()}, \verb{spectraNames<-}: gets or sets the spectra names. -\item \code{spectraVariables()}: returns a \code{character} vector with the -available spectra variables (columns, fields or attributes of each -spectrum) available in \code{object}. Note that \code{spectraVariables()} does not -list the \emph{peak variables} (\code{"mz"}, \code{"intensity"} and eventual additional -annotations for each MS peak). Peak variables are returned by -\code{peaksVariables()}. -\item \code{tic()}: gets the total ion current/count (sum of signal of a -spectrum) for all spectra in \code{object}. By default, the value -reported in the original raw data file is returned. For an empty -spectrum, \code{0} is returned. -\item \code{uniqueMsLevels()}: get the unique MS levels available in \code{object}. This -function is supposed to be more efficient than \code{unique(msLevel(object))}. -} -} - -\section{Filter spectra data}{ - - -Filter a \code{Spectra} object based on the spectra data. This includes subset -operations that immediately reduce the number of spectra in the object as -well as filters that reduce the \emph{content} of the \code{Spectra} object. -See section \emph{Filter peaks data} below for functions that filter the peaks -data of a \code{Spectra}. -\itemize{ -\item \code{[}: subsets the spectra keeping only selected elements (\code{i}). The method -\strong{always} returns a \code{Spectra} object. -\item \code{dropNaSpectraVariables()}: removes spectra variables (i.e. columns in the -object's \code{spectraData} that contain only missing values (\code{NA}). Note that -while columns with only \code{NA}s are removed, a \code{spectraData()} call after -\code{dropNaSpectraVariables()} might still show columns containing \code{NA} values -for \emph{core} spectra variables. The total number of spectra is not changed -by this function. 
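A compact sketch of spectra-level subsetting and filtering (again assuming
'sps' from the earlier examples; the retention time limits are arbitrary):

    sps[1:2]                            # subset by index
    filterMsLevel(sps, msLevel. = 2L)   # keep only MS2 spectra
    filterRt(sps, rt = c(100, 200))     # retention time window, in seconds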
-\item \code{filterAcquisitionNum()}: filters the object keeping only spectra matching -the provided acquisition numbers (argument \code{n}). If \code{dataOrigin} or -\code{dataStorage} is also provided, \code{object} is subsetted to the spectra with -an acquisition number equal to \code{n} \strong{in spectra with matching dataOrigin -or dataStorage values} retaining all other spectra. -Returns the filtered \code{Spectra}. -\item \code{filterDataOrigin()}: filters the object retaining spectra matching the -provided \code{dataOrigin}. Parameter \code{dataOrigin} has to be of type -\code{character} and needs to match exactly the data origin value of the -spectra to subset. -Returns the filtered \code{Spectra} object (with spectra ordered according to -the provided \code{dataOrigin} parameter). -\item \code{filterDataStorage()}: filters the object retaining spectra stored in the -specified \code{dataStorage}. Parameter \code{dataStorage} has to be of type -\code{character} and needs to match exactly the data storage value of the -spectra to subset. -Returns the filtered \code{Spectra} object (with spectra ordered according to -the provided \code{dataStorage} parameter). -\item \code{filterEmptySpectra()}: removes empty spectra (i.e. spectra without peaks). -Returns the filtered \code{Spectra} object (with spectra in their -original order). -\item \code{filterIsolationWindow()}: retains spectra that contain \code{mz} in their -isolation window m/z range (i.e. with an \code{isolationWindowLowerMz} <= \code{mz} -and \code{isolationWindowUpperMz} >= \code{mz}. Returns the filtered \code{Spectra} -object (with spectra in their original order). -\item \code{filterMsLevel()}: filters object by MS level keeping only spectra matching -the MS level specified with argument \code{msLevel}. Returns the filtered -\code{Spectra} (with spectra in their original order). -\item \code{filterPolarity()}: filters the object keeping only spectra matching the -provided polarity. Returns the filtered \code{Spectra} (with spectra in their -original order). -\item \code{filterPrecursorCharge()}: retains spectra with the defined precursor -charge(s). -\item \code{filterPrecursorIsotopes()}: groups MS2 spectra based on their precursor -m/z and precursor intensity into predicted isotope groups and keep for each -only the spectrum representing the monoisotopic precursor. MS1 spectra -are returned as is. See documentation for \code{deisotopeSpectra()} below for -details on isotope prediction and parameter description. -\item \code{filterPrecursorMaxIntensity()}: filters the \code{Spectra} keeping for groups -of (MS2) spectra with similar precursor m/z values (given parameters -\code{ppm} and \code{tolerance}) the one with the highest precursor intensity. The -function filters only MS2 spectra and returns all MS1 spectra. If -precursor intensities are \code{NA} for all spectra within a spectra group, the -first spectrum of that groups is returned. -Note: some manufacturers don't provide precursor intensities. These can -however also be estimated with \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}}. -\item \code{filterPrecursorMzRange()} (previously \code{filterPrecursorMz()} which is now -deprecated): retains spectra with a precursor m/z within the -provided m/z range. See examples for details on selecting spectra with -a precursor m/z for a target m/z accepting a small difference in \emph{ppm}. 
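A hedged sketch of selecting spectra with a precursor m/z close to a target
m/z accepting a small difference in ppm, as mentioned in the item above (the
target value of 428.88 and the 10 ppm window are purely illustrative):

    target <- 428.88

    ## Keep spectra with a precursor m/z within +/- 10 ppm of the target:
    filterPrecursorMzRange(sps, mz = target * c(1 - 1e-5, 1 + 1e-5))

    ## Alternatively, match one or more precursor m/z values directly:
    filterPrecursorMzValues(sps, mz = target, ppm = 10)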
-\item \code{filterPrecursorMzValues()}: retains spectra with precursor m/z matching -any of the provided m/z values (given \code{ppm} and \code{tolerance}). Spectra with -missing precursor m/z value (e.g. MS1 spectra) are dropped. -\item \code{filterPrecursorScan()}: retains parent (e.g. MS1) and children scans (e.g. -MS2) of acquisition number \code{acquisitionNum}. Returns the filtered -\code{Spectra} (with spectra in their original order). Parameter \code{f} allows to -define which spectra belong to the same sample or original data file ( -defaults to \code{f = dataOrigin(object)}). -\item \code{filterRanges()}: allows filtering of the \code{Spectra} object based on user -defined \emph{numeric} ranges (parameter \code{ranges}) for one or more available -spectra variables in object (spectra variable names can be specified with -parameter \code{spectraVariables}). Spectra for which the value of a spectra -variable is within it's defined range are retained. If multiple -ranges/spectra variables are defined, the \code{match} parameter can be used -to specify whether all conditions (\code{match = "all"}; the default) or if -any of the conditions must match (\code{match = "any"}; all spectra for which -values are within any of the provided ranges are retained). -\item \code{filterRt()}: retains spectra of MS level \code{msLevel} with retention -times (in seconds) within (\code{>=}) \code{rt[1]} and (\code{<=}) -\code{rt[2]}. Returns the filtered \code{Spectra} (with spectra in their -original order). -\item \code{filterValues()}: allows filtering of the \code{Spectra} object based on -similarities of \emph{numeric} values of one or more \code{spectraVariables(object)} -(parameter \code{spectraVariables}) to provided values (parameter \code{values}) -given acceptable differences (parameters tolerance and ppm). If multiple -values/spectra variables are defined, the \code{match} parameter can be used -to specify whether all conditions (\code{match = "all"}; the default) or if -any of the conditions must match (\code{match = "any"}; all spectra for which -values are within any of the provided ranges are retained). -\item \code{selectSpectraVariables()}: reduces the information within the object to -the selected spectra variables: all data for variables not specified will -be dropped. For mandatory columns (i.e., those listed by -\code{\link[=coreSpectraVariables]{coreSpectraVariables()}}, such as \emph{msLevel}, \emph{rtime} ...) only -the values will be dropped but not the variable itself. Additional (or -user defined) spectra variables will be completely removed. -Returns the filtered \code{Spectra}. -} -} - -\section{Filter or aggregate mass peak data}{ - - -Operations that filter or aggregate the mass peak data from each spectrum -without changing the number of spectra in a \code{Spectra} object. Also, the -actual subsetting/aggregation operation is only executed once peaks data is -accessed (through \code{peaksData()}, \code{mz()} or \code{intensity()}) or -\code{applyProcessing()} is called. -\itemize{ -\item \code{combinePeaks()}: combines mass peaks \strong{within each spectrum} with a -difference in their m/z values that is smaller than the maximal -acceptable difference defined by \code{ppm} and \code{tolerance}. Parameters -\code{intensityFun} and \code{mzFun} allow to define functions to aggregate the -intensity and m/z values for each such group of peaks. 
With -\code{weighted = TRUE} (the default), the m/z value of the combined peak is -calculated using an intensity-weighted mean and parameter \code{mzFun} is -ignored. The \code{\link[MsCoreUtils:group]{MsCoreUtils::group()}} function is used for the grouping of -mass peaks. Parameter \code{msLevel.} allows to define selected MS levels for -which peaks should be combined. This function returns a \code{Spectra} with -the same number of spectra than the input object, but with possibly -combined peaks within each spectrum. -dropped (i.e. their values are replaced with \code{NA}) for combined peaks -unless they are constant across the combined peaks. See also -\code{reduceSpectra()} for a function to select a single \emph{representative} -mass peak for each peak group. -\item \code{deisotopeSpectra()}: \emph{deisotopes} each spectrum keeping only the -monoisotopic peak for groups of isotopologues. Isotopologues are -estimated using the \code{\link[=isotopologues]{isotopologues()}} function from the -\emph{MetaboCoreUtils} package. Note that -the default parameters for isotope prediction/detection have been -determined using data from the Human Metabolome Database (HMDB) and -isotopes for elements other than CHNOPS might not be detected. See -parameter \code{substDefinition} in the documentation of \code{\link[=isotopologues]{isotopologues()}} for -more information. The approach and code to define the parameters for -isotope prediction is described -\href{https://github.com/EuracBiomedicalResearch/isotopologues}{here}. -\item \code{filterFourierTransformArtefacts()}: removes (Orbitrap) fast fourier -artefact peaks from spectra (see examples below). The function iterates -through all intensity ordered peaks in a spectrum and removes all peaks -with an m/z within +/- \code{halfWindowSize} of the current peak if their -intensity is lower than \code{threshold} times the current peak's intensity. -Additional parameters \code{keepIsotopes}, \code{maxCharge} and \code{isotopeTolerance} -allow to avoid removing of potential \verb{[13]C} isotope peaks (\code{maxCharge} -being the maximum charge that should be considered and \code{isotopeTolerance} -the absolute acceptable tolerance for matching their m/z). -See \code{\link[=filterFourierTransformArtefacts]{filterFourierTransformArtefacts()}} for details and background and -\code{deisitopeSpectra()} for an alternative. -\item \code{filterIntensity()}: filters mass peaks in each spectrum keeping only -those with intensities that are within the provided range or match the -criteria of the provided function. For the former, parameter \code{intensity} -has to be a \code{numeric} defining the intensity range, for the latter a -\code{function} that takes the intensity values of the spectrum and returns -a \code{logical} whether the peak should be retained or not (see examples -below for details) - additional parameters to the function can be passed -with \code{...}. -To remove only peaks with intensities below a certain threshold, say -100, use \code{intensity = c(100, Inf)}. Note: also a single value can be -passed with the \code{intensity} parameter in which case an upper limit of -\code{Inf} is used. -Note that this function removes also peaks with missing intensities -(i.e. an intensity of \code{NA}). Parameter \code{msLevel.} allows to restrict the -filtering to spectra of the specified MS level(s). -\item \code{filterMzRange()}: filters mass peaks in the object keeping or removing -those in each spectrum that are within the provided m/z range. 
Whether -peaks are retained or removed can be configured with parameter \code{keep} -(default \code{keep = TRUE}). -\item \code{filterMzValues()}: filters mass peaks in the object keeping all -peaks in each spectrum that match the provided m/z value(s) (for -\code{keep = TRUE}, the default) or removing all of them (for \code{keep = FALSE}). -The m/z matching considers also the absolute \code{tolerance} and m/z-relative -\code{ppm} values. \code{tolerance} and \code{ppm} have to be of length 1. -\item \code{filterPeaksRanges()}: filters mass peaks of a \code{Spectra} object using any -set of range-based filters on numeric spectra or peaks variables. See -\code{\link[=filterPeaksRanges]{filterPeaksRanges()}} for more information. -\item \code{filterPrecursorPeaks()}: removes peaks from each spectrum in \code{object} with -an m/z equal or larger than the m/z of the precursor, depending on the -value of parameter \code{mz}: for \verb{mz = ==" (the default) peaks with matching m/z (considering an absolute and relative acceptable difference depending on }tolerance\code{and}ppm\verb{, respectively) are removed. For }mz = ">="\verb{all peaks with an m/z larger or equal to the precursor m/z (minus}tolerance\verb{and the}ppm\verb{of the precursor m/z) are removed. Parameter}msLevel.\verb{allows to restrict the filter to certain MS levels (by default the filter is applied to all MS levels). Note that no peaks are removed if the precursor m/z is}NA` (e.g. typically for MS1 spectra). -\item \code{reduceSpectra()}: keeps for groups of peaks with similar m/z values in -(given \code{ppm} and \code{tolerance}) in each spectrum only the peak with the -highest intensity removing all other peaks hence \emph{reducing} each -spectrum to the highest intensity peaks per \emph{peak group}. -Peak groups are defined using the \code{\link[=group]{group()}} function from the -\emph{MsCoreUtils} package. See also the \code{combinePeaks()} function for an -alternative function to combine peaks within each spectrum. -} -} - -\section{Merging, aggregating and splitting}{ - - -Several \code{Spectra} objects can be concatenated into a single object with the -\code{c()} or the \code{concatenateSpectra()} function. Concatenation will fail if the -processing queue of any of the \code{Spectra} objects is not empty or if -different backends are used in the \code{Spectra} objects. Thus, in these cases, -prior to merging \code{Spectra} object it is suggested to change the backend to -a \code{MsBackendMemory} using the \code{setBackend()} function, and to \emph{apply} all -data processing steps using \code{applyProcessing()}. The spectra variables -of the resulting \code{Spectra} object is the union of the spectra variables of -the individual \code{Spectra} objects. -\itemize{ -\item \code{combineSpectra()}: combines MS data (i.e. mass peaks) from sets of -spectra into a single spectrum per set (in contrast to \code{combinePeaks()} -or \code{reduceSpectra()} that combine mass peaks \strong{within each spectrum}). -For each spectrum group (set), spectra variables from the first spectrum -are used and the peak matrices are combined using the function specified -with \code{FUN}, which defaults to \code{\link[=combinePeaksData]{combinePeaksData()}}. Please refer to the -\code{\link[=combinePeaksData]{combinePeaksData()}} help page for details and options of the actual -combination of peaks across the sets of spectra and to the package -vignette for examples and alternative ways to aggregate spectra. 
-\section{Data manipulation and analysis methods}{
-
-
-Many data manipulation operations, such as those listed in this section, are
-not applied immediately to the spectra, but added to a
-\emph{lazy processing/manipulation queue}. Operations stored in this queue are
-applied on-the-fly to spectra data each time it is accessed. This lazy
-execution guarantees the same functionality for \code{Spectra} objects with
-any backend, i.e. for backends that support saving changes to spectrum data
-(\code{\link[=MsBackendMemory]{MsBackendMemory()}}, \code{\link[=MsBackendDataFrame]{MsBackendDataFrame()}} or \code{\link[=MsBackendHdf5Peaks]{MsBackendHdf5Peaks()}}) as
-well as for read-only backends (such as the \code{\link[=MsBackendMzR]{MsBackendMzR()}}).
-Note that for the former it is possible to apply the processing queue and
-write the modified peak data back to the data storage with the
-\code{applyProcessing()} function, as illustrated in the sketch below.
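A minimal sketch of the lazy evaluation queue, assuming an in-memory
\code{Spectra} and a made-up intensity scaling function:

    library(Spectra)

    ## Toy MS2 spectra with an in-memory backend.
    spd <- DataFrame(msLevel = c(2L, 2L), rtime = c(1.1, 1.2))
    spd$mz <- list(c(100.1, 200.2), c(100.1, 200.25))
    spd$intensity <- list(c(10, 20), c(12, 18))
    sps <- Spectra(spd)

    ## User-defined peaks function: divide intensities by their maximum.
    ## It takes a two-column peaks matrix, returns one, and has `...`.
    norm_max <- function(x, ...) {
        x[, "intensity"] <- x[, "intensity"] / max(x[, "intensity"])
        x
    }
    sps <- addProcessing(sps, norm_max)

    ## The function is only queued; it is evaluated when data is accessed.
    intensity(sps)
    processingLog(sps)

    ## For writeable backends the queue can be applied permanently.
    sps <- applyProcessing(sps)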
-\itemize{
-\item \code{addProcessing()}: adds an arbitrary function that should be applied to the
-peaks matrix of every spectrum in \code{object}. The function (passed
-with parameter \code{FUN}) is expected to take a peaks matrix as input and to
-return a peaks matrix. A peaks matrix is a numeric matrix with two columns,
-the first containing the m/z values of the peaks and the second the
-corresponding intensities. The function has to have \code{...} in its
-definition. Additional arguments can be passed with \code{...}. With parameter
-\code{spectraVariables} it is possible to define additional spectra variables
-from \code{object} that should be passed to the function \code{FUN}. These will be
-passed by their name (e.g. specifying \code{spectraVariables = "precursorMz"}
-will pass the spectra's precursor m/z as a parameter named \code{precursorMz}
-to the function). The only exception is the spectra's MS level, which will
-be passed to the function as a parameter called \code{spectrumMsLevel} (i.e.
-with \code{spectraVariables = "msLevel"} the MS levels of each spectrum will be
-submitted to the function as a parameter called \code{spectrumMsLevel}).
-Examples are provided in the package vignette.
-\item \code{applyProcessing()}: for \code{Spectra} objects that use a \strong{writeable} backend
-only: apply all steps from the lazy processing queue to the peak data and
-write it back to the data storage. Parameter \code{f} allows to specify how
-\code{object} should be split for parallel processing. This should either be
-equal to the \code{dataStorage}, or \code{f = rep(1, length(object))} to disable
-parallel processing altogether. Other partitionings might result in
-errors (especially if a \code{MsBackendHdf5Peaks} backend is used).
-\item \code{bin()}: aggregates individual spectra into discrete (m/z) bins. Binning is
-performed only on spectra of the specified MS level(s) (parameter
-\code{msLevel}, by default all MS levels of \code{x}). The bins can be defined with
-parameter \code{breaks} which by default are equally sized bins, with size
-being defined by parameter \code{binSize}, from the minimal to the maximal m/z
-of all spectra (of MS level \code{msLevel}) within \code{x}. The same bins are used
-for all spectra in \code{x}. All intensity values for peaks falling into the
-same bin are aggregated using the function provided with parameter \code{FUN}
-(defaults to \code{FUN = sum}, i.e. all intensities are summed up). Note that
-the binning operation is applied to the peak data on-the-fly upon data
-access and it is possible to \emph{revert} the operation with the \code{reset()}
-function (see description of \code{reset()} below).
-\item \code{compareSpectra()}: compares each spectrum in \code{x} with each spectrum in \code{y}
-using the function provided with \code{FUN} (defaults to \code{\link[=ndotproduct]{ndotproduct()}}). If
-\code{y} is missing, each spectrum in \code{x} is compared with each other spectrum
-in \code{x}.
-The matching/mapping of peaks between the compared spectra is done with the
-\code{MAPFUN} function. The default \code{\link[=joinPeaks]{joinPeaks()}} matches peaks of both spectra
-and allows to keep all peaks from the first spectrum (\code{type = "left"}),
-from the second (\code{type = "right"}), from both (\code{type = "outer"}) and to
-keep only matching peaks (\code{type = "inner"}); see \code{\link[=joinPeaks]{joinPeaks()}} for more
-information and examples. The \code{MAPFUN} function should have parameters
-\code{x}, \code{y}, \code{xPrecursorMz} and \code{yPrecursorMz} as these values are passed to
-the function. In addition to \code{joinPeaks()} also \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} is
-supported for GNPS-like similarity score calculations. Note that
-\code{joinPeaksGnps()} should only be used in combination with
-\code{FUN = MsCoreUtils::gnps} (see \code{\link[=joinPeaksGnps]{joinPeaksGnps()}} for more information and
-details).
Use \code{MAPFUN = joinPeaksNone} to disable internal peak
-matching/mapping if a similarity scoring function is used that performs
-the matching internally.
-\code{FUN} is supposed to be a function to compare intensities of (matched)
-peaks of the two spectra that are compared. The function needs to take two
-matrices with columns \code{"mz"} and \code{"intensity"} as input and is supposed
-to return a single numeric as result. In addition to the two peak matrices
-the spectra's precursor m/z values are passed to the function as parameters
-\code{xPrecursorMz} (precursor m/z of the \code{x} peak matrix) and \code{yPrecursorMz}
-(precursor m/z of the \code{y} peak matrix). Additional parameters to functions
-\code{FUN} and \code{MAPFUN} can be passed with \code{...}. Parameters \code{ppm} and
-\code{tolerance} are passed to both \code{MAPFUN} and \code{FUN}.
-The function returns a \code{matrix} with the results of \code{FUN} for each
-comparison, with the number of rows equal to \code{length(x)} and the number of
-columns equal to \code{length(y)} (i.e. the element in row 2 and column 3 is the
-result from the comparison of \code{x[2]} with \code{y[3]}). If \code{SIMPLIFY = TRUE} the
-\code{matrix} is \emph{simplified} to a \code{numeric} if the length of \code{x} or \code{y} is one.
-See also the vignette for additional examples, such as using spectral
-entropy similarity in the scoring.
-\item \code{entropy()}: calculates the entropy of each spectrum based on the metrics
-suggested by Li et al. (https://doi.org/10.1038/s41592-021-01331-z).
-See also \code{\link[=nentropy]{nentropy()}} in the \emph{MsCoreUtils} package for details.
-\item \code{estimatePrecursorIntensity()}: defines the precursor intensities for MS2
-spectra using the intensity of the matching MS1 peak from the
-closest MS1 spectrum (i.e. the last MS1 spectrum measured before the
-respective MS2 spectrum). With \code{method = "interpolation"} it is also
-possible to calculate the precursor intensity based on an interpolation of
-intensity values (and retention times) of the matching MS1 peaks from the
-previous and next MS1 spectrum. See \code{\link[=estimatePrecursorIntensity]{estimatePrecursorIntensity()}} for
-examples and more details.
-\item \code{estimatePrecursorMz()}: \strong{for DDA data}: allows to estimate a fragment
-spectrum's precursor m/z based on the reported precursor m/z and the data
-from the previous MS1 spectrum. See \code{\link[=estimatePrecursorMz]{estimatePrecursorMz()}} for details.
-\item \code{neutralLoss()}: calculates neutral loss spectra for fragment spectra. See
-\code{\link[=neutralLoss]{neutralLoss()}} for detailed documentation.
-\item \code{processingLog()}: returns a \code{character} vector with the processing log
-messages.
-\item \code{reset()}: restores the data to its original state (as much as possible):
-removes any processing steps from the lazy processing queue and calls
-\code{reset()} on the backend which, depending on the backend, can also undo
-e.g. data filtering operations. Note that a \code{reset()} call after
-\code{applyProcessing()} will not have any effect. See examples below for more
-information.
-\item \code{scalePeaks()}: scales intensities of peaks within each spectrum depending
-on parameter \code{by}. With \code{by = sum} (the default) peak intensities are
-divided by the sum of peak intensities within each spectrum. The sum of
-intensities is thus 1 for each spectrum after scaling. Parameter
-\code{msLevel.} allows to apply the scaling to spectra of a certain MS level.
-By default (\code{msLevel. = uniqueMsLevels(x)}) intensities for all
-spectra will be scaled.
-\item \code{spectrapply()}: applies a given function to each individual spectrum or
-sets of a \code{Spectra} object. By default, the \code{Spectra} is split into
-individual spectra (i.e. \code{Spectra} of length 1) and the function \code{FUN}
-is applied to each of them. An alternative splitting can be defined with
-parameter \code{f}. Parameters for \code{FUN} can be passed using \code{...}.
-The returned result and its order depend on the function \code{FUN} and how
-\code{object} is split (hence on \code{f}, if provided). Parallel processing is
-supported and can be configured with parameter \code{BPPARAM}; it is, however,
-only suggested for computationally intense \code{FUN}.
-As an alternative to the (potentially parallel) processing of the full
-\code{Spectra}, \code{spectrapply()} also supports chunk-wise processing. For this,
-parameter \code{chunkSize} needs to be specified. \code{object} is then split into
-chunks of size \code{chunkSize} which are then (stepwise) processed by \code{FUN}.
-This guarantees a lower memory demand (especially for on-disk backends)
-since only the data for one chunk needs to be loaded into memory in each
-iteration. Note that by specifying \code{chunkSize}, parameters \code{f} and
-\code{BPPARAM} will be ignored.
-See also \code{\link[=chunkapply]{chunkapply()}} or the examples below for details on chunk-wise
-processing.
-\item \code{smooth()}: smooths individual spectra using a moving window-based approach
-(window size = \code{2 * halfWindowSize}). Currently, the
-Moving-Average- (\code{method = "MovingAverage"}),
-Weighted-Moving-Average- (\code{method = "WeightedMovingAverage"}, with
-weights depending on the distance from the center, calculated as
-\code{1/2^(-halfWindowSize:halfWindowSize)}) and
-Savitzky-Golay-Smoothing (\code{method = "SavitzkyGolay"}) are supported.
-For details on how to choose the correct \code{halfWindowSize} please see
-\code{\link[MsCoreUtils:smooth]{MsCoreUtils::smooth()}}.
-\item \code{pickPeaks()}: picks peaks on individual spectra using a moving
-window-based approach (window size = \code{2 * halfWindowSize}). For noisy
-spectra there are currently two different noise estimators available,
-the \emph{M}edian \emph{A}bsolute \emph{D}eviation (\code{method = "MAD"}) and
-Friedman's Super Smoother (\code{method = "SuperSmoother"}),
-as implemented in \code{\link[MsCoreUtils:noise]{MsCoreUtils::noise()}}.
-The method also supports optionally \emph{refining} the m/z value of
-the identified centroids by considering data points that (most likely)
-belong to the same mass peak. For this, the m/z value is calculated as an
-intensity-weighted average of the m/z values within the peak region.
-The peak region is defined as the m/z values (and their respective
-intensities) of the \code{2 * k} closest signals to the centroid or the closest
-valleys (\code{descending = TRUE}) in the \code{2 * k} region. For the latter, \code{k}
-generally has to be chosen larger. See \code{\link[MsCoreUtils:refineCentroids]{MsCoreUtils::refineCentroids()}} for
-details.
-If the ratio of the signal to the highest intensity of the peak is below
-\code{threshold} it will be ignored for the weighted average.
-\item \code{replaceIntensitiesBelow()}: replaces intensities below a specified
-threshold with the provided \code{value}. Parameter \code{threshold} can be either
-a single numeric value or a function which is applied to all non-\code{NA}
-intensities of each spectrum to determine a threshold value for each
-spectrum. The default is \code{threshold = min} which replaces all values
-which are <= the minimum intensity in a spectrum with \code{value} (the
-default for \code{value} is \code{0}). Note that the function specified with
-\code{threshold} is expected to have a parameter \code{na.rm} since \code{na.rm = TRUE}
-will be passed to the function. If the spectrum is in profile mode,
-ranges of successive non-zero peaks with an intensity <= \code{threshold} are
-set to 0.
-Parameter \code{msLevel.} allows to apply this to only spectra of certain MS
-level(s).
-}
 }
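A minimal sketch of \code{scalePeaks()}, \code{bin()} and \code{reset()},
assuming a small in-memory \code{Spectra} with made-up peak values:

    library(Spectra)

    ## Toy MS2 spectra with an in-memory backend.
    spd <- DataFrame(msLevel = c(2L, 2L), rtime = c(1.1, 1.2))
    spd$mz <- list(c(100.1, 100.2, 200.2), c(100.15, 200.25))
    spd$intensity <- list(c(10, 5, 20), c(12, 18))
    sps <- Spectra(spd)

    ## Scale peak intensities; each spectrum then sums to 1.
    sum(intensity(scalePeaks(sps))[[1L]])

    ## Bin peaks into 1 m/z wide bins, summing intensities per bin.
    binned <- bin(sps, binSize = 1)
    lengths(binned)

    ## Both operations are only queued; reset() reverts them as long as
    ## applyProcessing() has not been called.
    lengths(reset(binned))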
 \examples{
+## -------- CREATION OF SPECTRA OBJECTS --------
+
 ## Create a Spectra providing a `DataFrame` containing the spectrum data.
 
 spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
@@ -1578,12 +298,6 @@ spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8))
 data <- Spectra(spd)
 data
 
-## Get the number of spectra
-length(data)
-
-## Get the number of peaks per spectrum
-lengths(data)
-
 ## Create a Spectra from mzML files and use the `MsBackendMzR` on-disk
 ## backend.
 sciex_file <- dir(system.file("sciex", package = "msdata"),
@@ -1591,6 +305,9 @@ sciex_file <- dir(system.file("sciex", package = "msdata"),
 sciex <- Spectra(sciex_file, backend = MsBackendMzR())
 sciex
 
+
+## -------- CHANGING DATA REPRESENTATIONS --------
+
 ## The MS data is on disk and will be read into memory on-demand. We can
 ## however change the backend to a MsBackendMemory backend which will
 ## keep all of the data in memory.
@@ -1626,311 +343,7 @@ head(dataOrigin(sciex))
 head(dataOrigin(sciex_im))
 
 
-## ---- ACCESSING AND ADDING DATA ----
-
-## Get the MS level for each spectrum.
-msLevel(data)
-
-## Alternatively, we could also use $ to access a specific spectra variable.
-## This could also be used to add additional spectra variables to the
-## object (see further below).
-data$msLevel
-
-## Get the intensity and m/z values.
-intensity(data)
-mz(data)
-
-## Determine whether one of the spectra has a specific m/z value
-containsMz(data, mz = 120.4)
-
-## Accessing spectra variables works for all backends:
-intensity(sciex)
-intensity(sciex_im)
-
-## Get the m/z for the first spectrum.
-mz(data)[[1]]
-
-## Get the peak data (m/z and intensity values).
-pks <- peaksData(data)
-pks
-pks[[1]]
-pks[[2]]
-
-## Note that we could get the same result by coercing the `Spectra` to
-## a `list` or `SimpleList`:
-as(data, "list")
-as(data, "SimpleList")
-
-## List all available spectra variables (i.e. spectrum data and metadata).
-spectraVariables(data)
-
-## For all *core* spectrum variables accessor functions are available. These
-## return NA if the variable was not set.
-centroided(data)
-dataStorage(data)
-rtime(data)
-precursorMz(data)
-
-## The core spectra variables are:
-coreSpectraVariables()
-
-## Add an additional metadata column.
-data$spectrum_id <- c("sp_1", "sp_2")
-
-## List spectra variables, "spectrum_id" is now also listed
-spectraVariables(data)
-
-## Get the values for the new spectra variable
-data$spectrum_id
-
-## Extract specific spectra variables.
-spectraData(data, columns = c("spectrum_id", "msLevel"))
-
-## Drop spectra variable data and/or columns.
-res <- selectSpectraVariables(data, c("mz", "intensity"))
-
-## This removed the additional column "spectrum_id" and deleted all values
-## for all spectra variables, except "mz" and "intensity".
-spectraData(res) - -## Compared to the data before selectSpectraVariables. -spectraData(data) - - -## ---- SUBSETTING, FILTERING AND COMBINING - -## Subset to all MS2 spectra. -data[msLevel(data) == 2] - -## Same with the filterMsLevel function -filterMsLevel(data, 2) - -## Below we combine the `data` and `sciex_im` objects into a single one. -data_comb <- c(data, sciex_im) - -## The combined Spectra contains a union of all spectra variables: -head(data_comb$spectrum_id) -head(data_comb$rtime) -head(data_comb$dataStorage) -head(data_comb$dataOrigin) - -## Filter a Spectra for a target precursor m/z with a tolerance of 10ppm -spd$precursorMz <- c(323.4, 543.2302) -data_filt <- Spectra(spd) -filterPrecursorMzRange(data_filt, mz = 543.23 + ppm(c(-543.23, 543.23), 10)) - -## Filter a Spectra keeping only peaks matching certain m/z values -sps_sub <- filterMzValues(data, mz = c(103, 104), tolerance = 0.3) -mz(sps_sub) - -## This function can also be used to remove specific peaks from a spectrum -## by setting `keep = FALSE`. -sps_sub <- filterMzValues(data, mz = c(103, 104), - tolerance = 0.3, keep = FALSE) -mz(sps_sub) - -## Note that `filterMzValues()` keeps or removes all peaks with a matching -## m/z given the provided `ppm` and `tolerance` parameters. - -## Filter a Spectra keeping only peaks within a m/z range -sps_sub <- filterMzRange(data, mz = c(100, 300)) -mz(sps_sub) - -## Remove empty spectra variables -sciex_noNA <- dropNaSpectraVariables(sciex) - -## Available spectra variables before and after `dropNaSpectraVariables()` -spectraVariables(sciex) -spectraVariables(sciex_noNA) - - -## Adding new spectra variables -sciex1 <- filterDataOrigin(sciex, dataOrigin(sciex)[1]) -spv <- DataFrame(spectrumId = sciex1$spectrumId[3:12], ## used for merging - var1 = rnorm(10), - var2 = sample(letters, 10)) -spv - -sciex2 <- joinSpectraData(sciex1, spv, by.y = "spectrumId") - -spectraVariables(sciex2) -spectraData(sciex2)[1:13, c("spectrumId", "var1", "var2")] - -## Removing fourier transform artefacts seen in Orbitra data. - -## Loading an Orbitrap spectrum with artefacts. -data(fft_spectrum) -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5)) -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -fft_spectrum <- filterFourierTransformArtefacts(fft_spectrum) -fft_spectrum -plotSpectra(fft_spectrum, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -## Using a few examples peaks in your data you can optimize the parameters -fft_spectrum_filtered <- filterFourierTransformArtefacts(fft_spectrum, - halfWindowSize = 0.2, - threshold = 0.005, - keepIsotopes = TRUE, - maxCharge = 5, - isotopeTolerance = 0.005 - ) - -fft_spectrum_filtered -length(mz(fft_spectrum_filtered)[[1]]) -plotSpectra(fft_spectrum_filtered, xlim = c(264.5, 265.5), ylim = c(0, 5e6)) - -## Using filterRanges to filter spectra object based on variables available -## in `spectraData`. -## First, determine the variable(s) on which to base the filtering: -sv <- c("rtime", "precursorMz", "peaksCount") -## Note that ANY variables can be chosen here, and as many as wanted. - -## Define the ranges (pairs of values with lower and upper boundary) to be -## used for the individual spectra variables. The first two values will be -## used for the first spectra variable (e.g., rtime here), the next two for -## the second (e.g. 
precursorMz here) and so on: -ranges <- c(30, 350, 200,500, 350, 600) - -## Input the parameters within the filterRanges function: -filt_spectra <- filterRanges(sciex, spectraVariables = sv, - ranges = ranges) - -## Using `filterRanges()` to filter spectra object with multiple ranges for -## the same `spectraVariable` (e.g, here rtime) -sv <- c("rtime", "rtime") -ranges <- c(30, 100, 200, 300) -filt_spectra <- filterRanges(sciex, spectraVariables = sv, - ranges = ranges, match = "any") - -## Using filterValues in a similar way to a filter spectra object based on -## variables available in `spectraData`. However, this time not based on -## ranges but similarities to user input single values with given -## tolerance/ppm -## First determine the variable(s) on which to base the filtering: -sv <- c("rtime", "precursorMz") -## Note that ANY variables can be chosen here, and as many as wanted. - -## Define the values that will be used to filter the spectra based on their -## similarities to their respective spectraVariables. -## The first values in the parameters values, tolerance and ppm will be -## used for the first spectra variable (e.g. rtime here), the next for the -## second (e.g. precursorMz here) and so on: -values <- c(350, 400) -tolerance <- c(100, 0) -ppm <- c(0,50) - -## Input the parameters within the `filterValues()` function: -filt_spectra <- filterValues(sciex, spectraVariables = sv, - values = values, tolerance = tolerance, ppm = ppm) - -## ---- DATA MANIPULATIONS AND OTHER OPERATIONS ---- - -## Set the data to be centroided -centroided(data) <- TRUE - -## Replace peak intensities below 40 with 3. -res <- replaceIntensitiesBelow(data, threshold = 40, value = 3) -res - -## Get the intensities of the first and second spectrum. -intensity(res)[[1]] -intensity(res)[[2]] - -## Remove all peaks with an intensity below 40. -res <- filterIntensity(res, intensity = c(40, Inf)) - -## Get the intensities of the first and second spectrum. -intensity(res)[[1]] -intensity(res)[[2]] - -## Lengths of spectra is now different -lengths(mz(res)) -lengths(mz(data)) - -## In addition it is possible to pass a function to `filterIntensity()`: in -## the example below we want to keep only peaks that have an intensity which -## is larger than one third of the maximal peak intensity in that spectrum. -keep_peaks <- function(x, prop = 3) { - x > max(x, na.rm = TRUE) / prop -} -res2 <- filterIntensity(data, intensity = keep_peaks) -intensity(res2)[[1L]] -intensity(data)[[1L]] - -## We can also change the proportion by simply passing the `prop` parameter -## to the function. To keep only peaks that have an intensity which is -## larger than half of the maximum intensity: -res2 <- filterIntensity(data, intensity = keep_peaks, prop = 2) -intensity(res2)[[1L]] -intensity(data)[[1L]] - -## Since data manipulation operations are by default not directly applied to -## the data but only added to the internal lazy evaluation queue, it is also -## possible to remove these data manipulations with the `reset()` function: -res_rest <- reset(res) -res_rest -lengths(mz(res_rest)) -lengths(mz(res)) -lengths(mz(data)) - -## `reset()` after a `applyProcessing()` can not restore the data, because -## the data in the backend was changed. Similarly, `reset()` after any -## filter operations can not restore data for a `Spectra` with a -## `MsBackendMemory` or `MsBackendDataFrame`. 
-res_2 <- applyProcessing(res) -res_rest <- reset(res_2) -lengths(mz(res)) -lengths(mz(res_rest)) - - -## Compare spectra: comparing spectra 2 and 3 against spectra 10:20 using -## the normalized dotproduct method. -res <- compareSpectra(sciex_im[2:3], sciex_im[10:20]) -## first row contains comparisons of spectrum 2 with spectra 10 to 20 and -## the second row comparisons of spectrum 3 with spectra 10 to 20 -res - -## To use a simple Pearson correlation instead we can define a function -## that takes the two peak matrices and calculates the correlation for -## their second columns (containing the intensity values). -correlateSpectra <- function(x, y, use = "pairwise.complete.obs", ...) { - cor(x[, 2], y[, 2], use = use) -} -res <- compareSpectra(sciex_im[2:3], sciex_im[10:20], - FUN = correlateSpectra) -res - -## Use compareSpectra to determine the number of common (matching) peaks -## with a ppm of 10: -## type = "inner" uses a *inner join* to match peaks, i.e. keeps only -## peaks that can be mapped betwen both spectra. The provided FUN returns -## simply the number of matching peaks. -compareSpectra(sciex_im[2:3], sciex_im[10:20], ppm = 10, type = "inner", - FUN = function(x, y, ...) nrow(x)) - -## Apply an arbitrary function to each spectrum in a Spectra. -## In the example below we calculate the mean intensity for each spectrum -## in a subset of the sciex_im data. Note that we can access all variables -## of each individual spectrum either with the `$` operator or the -## corresponding method. -res <- spectrapply(sciex_im[1:20], FUN = function(x) mean(x$intensity[[1]])) -head(res) - -## It is however important to note that dedicated methods to access the -## data (such as `intensity`) are much more efficient than using `lapply()`: -res <- lapply(intensity(sciex_im[1:20]), mean) -head(res) - -## As an alternative, applying a function `FUN` to a `Spectra` can be -## performed *chunk-wise*. The advantage of this is, that only the data for -## one chunk at a time needs to be loaded into memory reducing the memory -## demand. This type of processing can be performed by specifying the size -## of the chunks (i.e. number of spectra per chunk) with the `chunkSize` -## parameter -spectrapply(sciex_im[1:20], lengths, chunkSize = 5L) - -## ---- DATA EXPORT ---- +## -------- DATA EXPORT -------- ## Some `MsBackend` classes provide an `export()` method to export the data ## to the file format supported by the backend. @@ -1959,45 +372,7 @@ res <- Spectra(backendInitialize(MsBackendMzR(), fls[1])) mz(res) mz(data) - -## ---- PEAKS VARIABLES AND DATA ---- - -## Some `MsBackend` classes provide support for arbitrary peaks variables -## (in addition to the mandatory `"mz"` and `"intensity"` values. Below -## we create a simple data frame with an additional peak variable `"pk_ann"` -## and create a `Spectra` with a `MsBackendMemory` for that data. -## Importantly the number of values (per spectrum) need to be the same -## for all peak variables. - -tmp <- data.frame(msLevel = c(2L, 2L), rtime = c(123.2, 123.5)) -tmp$mz <- list(c(103.1, 110.4, 303.1), c(343.2, 453.1)) -tmp$intensity <- list(c(130.1, 543.1, 40), c(0.9, 0.45)) -tmp$pk_ann <- list(c(NA_character_, "A", "P"), c("B", "P")) - -## Create the Spectra. With parameter `peaksVariables` we can define -## the columns in `tmp` that contain peaks variables. 
-sps <- Spectra(tmp, source = MsBackendMemory(), - peaksVariables = c("mz", "intensity", "pk_ann")) -peaksVariables(sps) - -## Extract just the m/z and intensity values -peaksData(sps)[[1L]] - -## Extract the full peaks data -peaksData(sps, columns = peaksVariables(sps))[[1L]] - -## Access just the pk_ann variable -sps$pk_ann - -## Convert a subset of the Spectra object to a long DataFrame. -asDataFrame(sciex, i = 1:3, spectraVars = c("rtime", "msLevel")) } \author{ -Nir Shahaf, Johannes Rainer - -Nir Shahaf - -Johannes Rainer - Sebastian Gibb, Johannes Rainer, Laurent Gatto, Philippine Louail } diff --git a/man/countIdentifications.Rd b/man/countIdentifications.Rd index c7904ef6..08afd04b 100644 --- a/man/countIdentifications.Rd +++ b/man/countIdentifications.Rd @@ -109,6 +109,9 @@ sp <- countIdentifications(sp) ## and three PSMs respectively. table(sp$countIdentifications, sp$msLevel) } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis functions. +} \author{ Laurent Gatto } diff --git a/man/estimatePrecursorIntensity.Rd b/man/estimatePrecursorIntensity.Rd index 97a2cde2..8780aab4 100644 --- a/man/estimatePrecursorIntensity.Rd +++ b/man/estimatePrecursorIntensity.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/Spectra.R \name{estimatePrecursorIntensity,Spectra-method} \alias{estimatePrecursorIntensity,Spectra-method} +\alias{estimatePrecursorIntensity} \title{Estimate Precursor Intensities} \usage{ \S4method{estimatePrecursorIntensity}{Spectra}( diff --git a/man/estimatePrecursorMz.Rd b/man/estimatePrecursorMz.Rd index f79bfa24..7bc9e6cd 100644 --- a/man/estimatePrecursorMz.Rd +++ b/man/estimatePrecursorMz.Rd @@ -83,6 +83,9 @@ plot(precursorMz(s), precursorMz(s) - pmz, xlab = "precursor m/z", ## we could then replace the reported precursor m/z values s$precursorMz <- pmz } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis and manipulation functions. +} \author{ Mar Garcia-Aloy, Johannes Rainer } diff --git a/man/hidden_aliases.Rd b/man/hidden_aliases.Rd index a203f8c6..1249a50f 100644 --- a/man/hidden_aliases.Rd +++ b/man/hidden_aliases.Rd @@ -8,15 +8,6 @@ \alias{[,MsBackendDataFrame-method} \alias{ppm} \alias{bin,numeric-method} -\alias{containsMz} -\alias{containsNeutralLoss} -\alias{dropNaSpectraVariables} -\alias{entropy} -\alias{export} -\alias{pickPeaks} -\alias{replaceIntensitiesBelow} -\alias{reset} -\alias{selectSpectraVariables} \alias{show,MsBackendDataFrame-method} \alias{backendMerge,MsBackendDataFrame-method} \alias{acquisitionNum,MsBackendDataFrame-method} @@ -170,24 +161,6 @@ .check = TRUE ) -containsMz(object, ...) - -containsNeutralLoss(object, ...) - -dropNaSpectraVariables(object, ...) - -entropy(object, ...) - -export(object, ...) - -pickPeaks(object, ...) - -replaceIntensitiesBelow(object, threshold = min, ...) - -reset(object, ...) - -selectSpectraVariables(object, ...) - \S4method{show}{MsBackendDataFrame}(object) \S4method{backendMerge}{MsBackendDataFrame}(object, ...) diff --git a/man/joinPeaks.Rd b/man/joinPeaks.Rd index 29cabc8d..bc1fa688 100644 --- a/man/joinPeaks.Rd +++ b/man/joinPeaks.Rd @@ -142,7 +142,12 @@ joinPeaksGnps(x, y, pmz_x, pmz_y) joinPeaksGnps(x, y, pmz_x, yPrecursorMz = NA) } \seealso{ -\code{\link[=gnps]{gnps()}} +\itemize{ +\item \code{\link[=compareSpectra]{compareSpectra()}} for the function to calculate similarities between +spectra. +\item \code{\link[=gnps]{gnps()}} in the \emph{MsCoreUtils} package for more information on the GNPS +similarity score. 
+} } \author{ Johannes Rainer, Michael Witting diff --git a/man/neutralLoss.Rd b/man/neutralLoss.Rd index da1a887e..d27cd3c8 100644 --- a/man/neutralLoss.Rd +++ b/man/neutralLoss.Rd @@ -1,13 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/AllGenerics.R, R/Spectra-neutralLoss.R +% Please edit documentation in R/Spectra-neutralLoss.R \name{neutralLoss} \alias{neutralLoss} \alias{PrecursorMzParam} \alias{neutralLoss,Spectra,PrecursorMzParam-method} \title{Calculate Neutral Loss Spectra} \usage{ -neutralLoss(object, param, ...) - PrecursorMzParam( filterPeaks = c("none", "abovePrecursor", "belowPrecursor", "removePrecursor"), msLevel = c(2L, NA_integer_), @@ -18,13 +16,6 @@ PrecursorMzParam( \S4method{neutralLoss}{Spectra,PrecursorMzParam}(object, param, ...) } \arguments{ -\item{object}{\code{\link[=Spectra]{Spectra()}} object with the fragment spectra for which neutral -loss spectra should be calculated.} - -\item{param}{One of the \emph{parameter} objects discussed below.} - -\item{...}{Currently ignored.} - \item{filterPeaks}{For \code{PrecursorMzParam()}: \code{character(1)} or \code{function} defining if and how fragment peaks should be filtered before calculation. Pre-defined options are: \code{"none"} (keep all peaks), \code{"abovePrecursor"} @@ -47,6 +38,13 @@ for details.} \item{tolerance}{\code{numeric(1)} with absolute acceptable difference in m/z values to filter peaks. Defaults to \code{tolerance = 0}. See function description for details.} + +\item{object}{\code{\link[=Spectra]{Spectra()}} object with the fragment spectra for which neutral +loss spectra should be calculated.} + +\item{param}{One of the \emph{parameter} objects discussed below.} + +\item{...}{Currently ignored.} } \value{ A \code{\link[=Spectra]{Spectra()}} object with calculated neutral loss spectra. @@ -136,6 +134,9 @@ Aisporna A, Benton PH, Chen A, Derks RJE, Galano JM, Giera M and Siuzdak G Analysis in METLIN. Journal of the American Society for Mass Spectrometry. \doi{10.1021/jasms.1c00343} } +\seealso{ +\code{\link[=addProcessing]{addProcessing()}} for other data analysis and manipulation functions. +} \author{ Johannes Rainer } diff --git a/man/processingChunkSize.Rd b/man/processingChunkSize.Rd index b47d8c69..a9382611 100644 --- a/man/processingChunkSize.Rd +++ b/man/processingChunkSize.Rd @@ -1,9 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Spectra-functions.R +% Please edit documentation in R/Spectra-functions.R, R/Spectra.R \name{processingChunkSize} \alias{processingChunkSize} \alias{processingChunkSize<-} \alias{processingChunkFactor} +\alias{backendBpparam,Spectra-method} \title{Parallel and chunk-wise processing of \code{Spectra}} \usage{ processingChunkSize(x) @@ -11,11 +12,18 @@ processingChunkSize(x) processingChunkSize(x) <- value processingChunkFactor(x) + +\S4method{backendBpparam}{Spectra}(object, BPPARAM = bpparam()) } \arguments{ \item{x}{\code{Spectra}.} \item{value}{\code{integer(1)} defining the chunk size.} + +\item{object}{\code{Spectra} object.} + +\item{BPPARAM}{Parallel setup configuration. See \code{\link[=bpparam]{bpparam()}} for more +information.} } \value{ \code{processingChunkSize()} returns the currently defined processing