From 283382a92931f281ad97f5391513b0ac89b5cfa2 Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Tue, 3 Sep 2024 16:46:38 -0700 Subject: [PATCH] Enable deduplicated saving of external seeds in storeDelayedObject. (#13) This provides some opportunities for storage optimization when dealing with DelayedArrays. A user wanting to save multiple DelayedArrays backed by the same seed only needs to save the seed once and just link/copy its files afterwards. --- NAMESPACE | 1 + R/storeDelayedObject.R | 103 +++++++++++++++++++++-- inst/NEWS.Rd | 3 + man/storeDelayedObject.Rd | 60 ++++++++++++- tests/testthat/test-storeDelayedObject.R | 61 ++++++++++++++ 5 files changed, 218 insertions(+), 10 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7818a59..155dfd3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(altReloadDelayedObjectFunction) export(altStoreDelayedObject) export(altStoreDelayedObjectFunction) export(componentNames) +export(createExternalSeedDedupSession) export(extractComponents) export(loadArray) export(loadWrapperArray) diff --git a/R/storeDelayedObject.R b/R/storeDelayedObject.R index 8bb2f07..eb64c9d 100644 --- a/R/storeDelayedObject.R +++ b/R/storeDelayedObject.R @@ -8,8 +8,8 @@ #' @param ... For \code{storeDelayedObject} and \code{reloadDelayedObject}, additional arguments to be passed to specific methods. #' #' For \code{altStoreDelayedObject} and \code{altReloadDelayedObject}, arguments to be passed to the alternative functions. -#' @param version Package version of the \pkg{chihaya} format to use when loading. -#' This should be retrieved from the attributes of the outermost group, typically by \code{readDelayedArray}. +#' @param version Package version of the \pkg{chihaya} format to use when storing or reloading delayed objects. +#' When reloading, the version should be retrieved from the attributes of the outermost group, typically by \code{readDelayedArray}. #' @param reload Function to reload delayed objects from file. 
#' This should accept the same arguments as \code{reloadDelayedObject}. #' @param store Function (typically a generic) to store delayed objects to file. @@ -23,6 +23,10 @@ #' It may also be \code{NULL} to delete an existing entry in the registry. #' @param existing Logical scalar indicating the action to take if a function has already been registered for \code{type} and \code{subtype} - #' keep the old or new function, or throw an error. +#' @param save.external.array Logical scalar indicating whether to save an array-like seed as an external seed, +#' even if a dedicated \code{storeDelayedObject} method is available. +#' @param external.dedup.session Session object created by \code{createExternalSeedDedupSession}. +#' @param external.dedup.action String specifying the deduplication method to use. #' #' @section Customization: #' Developers can easily extend \pkg{alabaster.matrix} to new delayed objects by writing new methods for \code{storeDelayedObject}. @@ -38,6 +42,31 @@ #' Extension developers (i.e., those who write new methods for \code{storeDelayedObject} or new functions for \code{reloadDelayedObject}) #' should generally use \code{altStoreDelayedObject} and \code{altReloadDelayedObject} in their method/funcion bodies. #' This ensures that any custom overrides specified by application developers are still respected in the extensions to \pkg{alabaster.matrix}. +#' +#' @section External seeds: +#' Whenever \code{storeDelayedObject} encounters a delayed operation or array-like seed for which it has no methods, +#' the ANY method will save the delayed object as an \dQuote{external seed}. +#' The array is saved via \code{\link{saveObject}} into a \code{seeds} directory next to the file corresponding to \code{handle}. +#' A reference to this external location is then stored in the \code{name} group inside \code{handle}. 
+#' +#' Users can force this behavior for all array-like seeds by passing \code{save.external.array=TRUE} in the \code{...} arguments of \code{storeDelayedObject}. +#' This instructs \code{storeDelayedObject} to save everything as external seeds, including those arrays for which it has methods. +#' Doing so can be beneficial to enable deduplication, e.g., when two delayed arrays perform different operations on the same underlying seed. +#' By saving the seeds externally, file management systems can identify the redundancy to save storage space. +#' +#' Advanced users can explicitly deduplicate external seeds by setting \code{save.external.array=TRUE} and passing \code{external.dedup.session=} to \code{storeDelayedObject}. +#' The \code{external.dedup.session} object is filled up with unique seeds as \code{storeDelayedObject} is called on various DelayedArrays. +#' Whenever a duplicate seed is encountered, it is not saved again, but is instead linked or copied from the file path associated with the identical external seed. +#' For example, a new session can be created when saving a SummarizedExperiment to deduplicate seeds across its assays. +#' +#' The exact deduplication action can be chosen via the \code{external.dedup.action=} parameter. +#' By default, \code{storeDelayedObject} attempts to create hard links, falling back to copies when a link cannot be created. +#' Users can instead create copies, symbolic links to absolute paths, or even symbolic links to relative paths +#' (e.g., to link to a \dQuote{neighboring} assay of the same SummarizedExperiment). +#' +#' When external seeds are encountered by \code{reloadDelayedObject}, they are loaded as \linkS4class{ReloadedArray}s (or some variant thereof) by \code{\link{altReadObject}}. +#' Users can forcibly realize the reloaded seed into memory by passing \code{custom.takane.realize=TRUE} in \code{...} for the \code{reloadDelayedObject} call. 
+#' This is occasionally helpful for providing a more faithful roundtrip from file back into memory. #' #' @return #' For \code{storeDelayedObject} and \code{altStoreDelayedObject}, the contents of \code{x} are saved to \code{file}, and \code{NULL} is invisibly returned. @@ -284,9 +313,10 @@ save_dense_array_for_chihaya <- function(x, handle, name, extract.native, versio } #' @export +#' @rdname storeDelayedObject setMethod("storeDelayedObject", "array", function(x, handle, name, version=package_version("1.1"), save.external.array=FALSE, ...) { if (save.external.array) { - return(callNextMethod()) # calls the ANY method + return(selectMethod("storeDelayedObject", "ANY")(x, handle, name, version=version, save.external.array=save.external.array, ...)) } save_dense_array_for_chihaya(x, handle, name, extract.native=identity, version=version, ...) }) @@ -294,7 +324,7 @@ setMethod("storeDelayedObject", "array", function(x, handle, name, version=packa #' @export setMethod("storeDelayedObject", "denseMatrix", function(x, handle, name, version=package_version("1.1"), save.external.array=FALSE, ...) { if (save.external.array) { - return(callNextMethod()) # calls the ANY method. + return(selectMethod("storeDelayedObject", "ANY")(x, handle, name, version=version, save.external.array=save.external.array, ...)) } extract.native <- NULL @@ -369,7 +399,7 @@ save_sparse_matrix_for_chihaya <- function(x, handle, name, version=package_vers #' @export setMethod("storeDelayedObject", "sparseMatrix", function(x, handle, name, version=package_version("1.1"), save.external.array=FALSE, ...) { if (save.external.array) { - return(callNextMethod()) # calls the ANY method + return(selectMethod("storeDelayedObject", "ANY")(x, handle, name, version=version, save.external.array=save.external.array, ...)) } save_sparse_matrix_for_chihaya(x, handle, name, version=version, ...) 
}) @@ -377,7 +407,7 @@ setMethod("storeDelayedObject", "sparseMatrix", function(x, handle, name, versio #' @export setMethod("storeDelayedObject", "SVT_SparseMatrix", function(x, handle, name, version=package_version("1.1"), save.external.array=FALSE, ...) { if (save.external.array) { - return(callNextMethod()) # calls the ANY method + return(selectMethod("storeDelayedObject", "ANY")(x, handle, name, version=version, save.external.array=save.external.array, ...)) } save_sparse_matrix_for_chihaya(x, handle, name, version=version, ...) }) @@ -1056,7 +1086,16 @@ chihaya.registry$operation[["unary arithmetic"]] <- function(handle, version, .. #' @export #' @import rhdf5 -setMethod("storeDelayedObject", "ANY", function(x, handle, name, version=package_version("1.1"), ...) { +#' @rdname storeDelayedObject +setMethod("storeDelayedObject", "ANY", function( + x, + handle, + name, + version=package_version("1.1"), + external.dedup.session=NULL, + external.dedup.action=c("link", "copy", "symlink", "relsymlink"), + ...) +{ ghandle <- H5Gcreate(handle, name) on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE) @@ -1105,7 +1144,15 @@ setMethod("storeDelayedObject", "ANY", function(x, handle, name, version=package exdir <- file.path(dirname(H5Fget_name(handle)), "seeds") dir.create(exdir, showWarnings=FALSE) n <- length(list.files(exdir)) - saveObject(x, file.path(exdir, n), ...) + output <- file.path(exdir, n) + + dedup.path <- check_external_seed_in_dedup_session(x, session=external.dedup.session) + if (is.null(dedup.path)) { + saveObject(x, output, ...) + } else { + clone_duplicate(dedup.path, output, action=match.arg(external.dedup.action)) + } + add_external_seed_to_dedup_session(x, session=external.dedup.session, path=output) h5_write_vector(ghandle, "dimensions", dim(x), type="H5T_NATIVE_UINT32", compress=0) h5_write_vector(ghandle, "type", to_value_type(type(x)), scalar=TRUE) @@ -1236,3 +1283,43 @@ altReloadDelayedObject <- function(...) { } FUN(...) 
} + +####################################################### +####################################################### + +#' @export +#' @rdname storeDelayedObject +createExternalSeedDedupSession <- function() { + output <- new.env() + output$known <- list() + output +} + +check_external_seed_in_dedup_session <- function(x, session) { + if (is.null(session)) { + return(NULL) + } + + cls <- as.character(class(x))[1] + if (!(cls %in% names(session$known))) { + return(NULL) + } + + candidates <- session$known[[cls]] + for (y in candidates) { + if (identical(x, y$value)) { + return(y$path) + } + } + + return(NULL) +} + +add_external_seed_to_dedup_session <- function(x, session, path) { + cls <- as.character(class(x))[1] + if (!(cls %in% names(session$known))) { + session$known[[cls]] <- list() + } + path <- normalizePath(path, mustWork=TRUE) + session$known[[cls]] <- c(session$known[[cls]], list(list(value=x, path=path))) +} diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index 1e1fd0a..7f62898 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -19,4 +19,7 @@ to allow applications to override the delayed operation saving/reading process. \item Added a \code{ReloadedArray.reuse.files="relsymlink"} option to create relative symbolic links to the original array files. This is more robust to movement of the original files, provided the new files are moved in the same manner. + +\item Enable deduplication of identical seeds across multiple calls to \code{storeDelayedObject()} within a single \dQuote{session}. +This avoids making multiple copies of the same seed for different DelayedArray instances with the same seeds, e.g., in a SummarizedExperiment. 
}} diff --git a/man/storeDelayedObject.Rd b/man/storeDelayedObject.Rd index b1175fd..cee2114 100644 --- a/man/storeDelayedObject.Rd +++ b/man/storeDelayedObject.Rd @@ -23,6 +23,7 @@ \alias{altStoreDelayedObject} \alias{altReloadDelayedObjectFunction} \alias{altReloadDelayedObject} +\alias{createExternalSeedDedupSession} \title{Store/reload a DelayedArray} \usage{ storeDelayedObject(x, handle, name, ...) @@ -38,6 +39,25 @@ registerReloadDelayedObjectFunction( existing = c("old", "new", "error") ) +\S4method{storeDelayedObject}{array}( + x, + handle, + name, + version = package_version("1.1"), + save.external.array = FALSE, + ... +) + +\S4method{storeDelayedObject}{ANY}( + x, + handle, + name, + version = package_version("1.1"), + external.dedup.session = NULL, + external.dedup.action = c("link", "copy", "symlink", "relsymlink"), + ... +) + altStoreDelayedObjectFunction(store) altStoreDelayedObject(...) @@ -45,6 +65,8 @@ altStoreDelayedObject(...) altReloadDelayedObjectFunction(reload) altReloadDelayedObject(...) + +createExternalSeedDedupSession() } \arguments{ \item{x}{Any of the delayed operation/array classes from \pkg{DelayedArray}.} @@ -57,8 +79,8 @@ altReloadDelayedObject(...) For \code{altStoreDelayedObject} and \code{altReloadDelayedObject}, arguments to be passed to the alternative functions.} -\item{version}{Package version of the \pkg{chihaya} format to use when loading. -This should be retrieved from the attributes of the outermost group, typically by \code{readDelayedArray}.} +\item{version}{Package version of the \pkg{chihaya} format to use when storing or reloading delayed objects. +When reloading, the version should be retrieved from the attributes of the outermost group, typically by \code{readDelayedArray}.} \item{type}{String specifying the type of delayed object, i.e., operation or array. 
This corresponds to \code{delayed_type} type in the \pkg{chihaya} attributes.} @@ -73,6 +95,13 @@ It may also be \code{NULL} to delete an existing entry in the registry.} \item{existing}{Logical scalar indicating the action to take if a function has already been registered for \code{type} and \code{subtype} - keep the old or new function, or throw an error.} +\item{save.external.array}{Logical scalar indicating whether to save an array-like seed as an external seed, +even if a dedicated \code{storeDelayedObject} method is available.} + +\item{external.dedup.session}{Session object created by \code{createExternalSeedDedupSession}.} + +\item{external.dedup.action}{String specifying the deduplication method to use.} + \item{store}{Function (typically a generic) to store delayed objects to file. This should accept the same arguments as \code{storeDelayedObject}.} @@ -110,6 +139,33 @@ should generally use \code{altStoreDelayedObject} and \code{altReloadDelayedObje This ensures that any custom overrides specified by application developers are still respected in the extensions to \pkg{alabaster.matrix}. } +\section{External seeds}{ + +Whenever \code{storeDelayedObject} encounters a delayed operation or array-like seed for which it has no methods, +the ANY method will save the delayed object as an \dQuote{external seed}. +The array is saved via \code{\link{saveObject}} into a \code{seeds} directory next to the file corresponding to \code{handle}. +A reference to this external location is then stored in the \code{name} group inside \code{handle}. + +Users can force this behavior for all array-like seeds by passing \code{save.external.array=TRUE} in the \code{...} arguments of \code{storeDelayedObject}. +This instructs \code{storeDelayedObject} to save everything as external seeds, including those arrays for which it has methods. +Doing so can be beneficial to enable deduplication, e.g., when two delayed arrays perform different operations on the same underlying seed. 
+By saving the seeds externally, file management systems can identify the redundancy to save storage space. + +Advanced users can explicitly deduplicate external seeds by setting \code{save.external.array=TRUE} and passing \code{external.dedup.session=} to \code{storeDelayedObject}. +The \code{external.dedup.session} object is filled up with unique seeds as \code{storeDelayedObject} is called on various DelayedArrays. +Whenever a duplicate seed is encountered, it is not saved again, but is instead linked or copied from the file path associated with the identical external seed. +For example, a new session can be created when saving a SummarizedExperiment to deduplicate seeds across its assays. + +The exact deduplication action can be chosen via the \code{external.dedup.action=} parameter. +By default, \code{storeDelayedObject} attempts to create hard links, falling back to copies when a link cannot be created. +Users can instead create copies, symbolic links to absolute paths, or even symbolic links to relative paths +(e.g., to link to a \dQuote{neighboring} assay of the same SummarizedExperiment). + +When external seeds are encountered by \code{reloadDelayedObject}, they are loaded as \linkS4class{ReloadedArray}s (or some variant thereof) by \code{\link{altReadObject}}. +Users can forcibly realize the reloaded seed into memory by passing \code{custom.takane.realize=TRUE} in \code{...} for the \code{reloadDelayedObject} call. +This is occasionally helpful for providing a more faithful roundtrip from file back into memory. 
+} + \examples{ library(DelayedArray) X <- DelayedArray(matrix(runif(100), ncol=20)) diff --git a/tests/testthat/test-storeDelayedObject.R b/tests/testthat/test-storeDelayedObject.R index 3fa70d0..d847392 100644 --- a/tests/testthat/test-storeDelayedObject.R +++ b/tests/testthat/test-storeDelayedObject.R @@ -760,3 +760,64 @@ test_that("alternative delayed object function overrides work", { on.exit(altReloadDelayedObjectFunction(old), after=FALSE, add=TRUE) expect_identical(altReloadDelayedObject(), "whee") }) + +####################################################### +####################################################### + +test_that("external deduplication is done correctly", { + dedup.session <- createExternalSeedDedupSession() + + X <- DelayedArray(matrix(rpois(60, 5), ncol=20)) + temp <- saveDelayed(X, save.external.array=TRUE, external.dedup.session=dedup.session) + expect_true(file.exists(file.path(temp, "seeds", 0))) + roundtrip <- loadDelayed(temp, custom.takane.realize=TRUE) + expect_identical(X, roundtrip) + + Y <- X + 1 + temp2 <- saveDelayed(Y, save.external.array=TRUE, external.dedup.session=dedup.session, external.dedup.action="symlink") + expect_true(file.exists(file.path(temp2, "seeds", 0))) + roundtrip <- loadDelayed(temp2, custom.takane.realize=TRUE) + expect_equal(Y, roundtrip) + + Z <- DelayedArray(matrix(rpois(30, 5), ncol=5)) # checking that a different array doesn't trigger the deduplicator. 
+ temp3 <- saveDelayed(Z, save.external.array=TRUE, external.dedup.session=dedup.session) + expect_true(file.exists(file.path(temp3, "seeds", 0))) + roundtrip <- loadDelayed(temp3, custom.takane.realize=TRUE) + expect_identical(Z, roundtrip) + + if (.Platform$OS.type=="unix") { + expect_identical(Sys.readlink(file.path(temp, "seeds", "0", "OBJECT")), "") + expect_true(startsWith(Sys.readlink(file.path(temp2, "seeds", "0", "OBJECT")), "/")) + expect_identical(Sys.readlink(file.path(temp3, "seeds", "0", "OBJECT")), "") + } +}) + +test_that("external deduplication works with relative paths", { + dedup.session <- createExternalSeedDedupSession() + + staging <- tempfile() + dir.create(staging) + pwd <- getwd() + on.exit(setwd(pwd), after=FALSE, add=TRUE) + setwd(staging) + + X <- DelayedArray(matrix(rpois(60, 5), ncol=20)) + dir.create("original") + saveObject(X + 2, file.path("original", "out"), DelayedArray.preserve.ops=TRUE, + DelayedArray.store.args=list(save.external.array=TRUE, external.dedup.session=dedup.session, external.dedup.action="relsymlink")) + + Y <- X + 1 + saveObject(Y, "semiclone", DelayedArray.preserve.ops=TRUE, + DelayedArray.store.args=list(save.external.array=TRUE, external.dedup.session=dedup.session, external.dedup.action="relsymlink")) + + # Changing back and seeing whether the loader works. + setwd(pwd) + expect_true(file.exists(file.path(staging, "semiclone", "seeds", 0))) + roundtrip <- loadDelayed(file.path(staging, "semiclone"), custom.takane.realize=TRUE) + expect_equal(Y, roundtrip) + + if (.Platform$OS.type=="unix") { + expect_identical(Sys.readlink(file.path(staging, "original", "out", "seeds", "0", "OBJECT")), "") + expect_true(startsWith(Sys.readlink(file.path(staging, "semiclone", "seeds", "0", "OBJECT")), "../")) + } +})