Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switched the array savers/loaders to the simplified layout. #5

Merged
merged 2 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(extractComponents)
export(loadArray)
export(loadWrapperArray)
export(preserveDelayedOperations)
export(readArray)
export(recycleHdf5Files)
export(writeSparseMatrix)
exportClasses(AmalgamatedArray)
Expand All @@ -27,8 +28,11 @@ exportMethods(extract_sparse_array)
exportMethods(is_sparse)
exportMethods(matrixClass)
exportMethods(path)
exportMethods(saveObject)
exportMethods(stageObject)
import(alabaster.base)
import(methods)
import(rhdf5)
importClassesFrom(DelayedArray,DelayedAbind)
importClassesFrom(DelayedArray,DelayedArray)
importClassesFrom(DelayedArray,DelayedMatrix)
Expand All @@ -39,6 +43,7 @@ importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dsparseMatrix)
importClassesFrom(SparseArray,SVT_SparseMatrix)
importFrom(BiocGenerics,path)
importFrom(DelayedArray,"type<-")
importFrom(DelayedArray,DelayedArray)
importFrom(DelayedArray,OLD_extract_sparse_array)
importFrom(DelayedArray,acbind)
Expand All @@ -59,6 +64,7 @@ importFrom(DelayedArray,read_sparse_block)
importFrom(DelayedArray,rowAutoGrid)
importFrom(DelayedArray,type)
importFrom(HDF5Array,H5SparseMatrixSeed)
importFrom(HDF5Array,HDF5Array)
importFrom(HDF5Array,HDF5ArraySeed)
importFrom(HDF5Array,writeHDF5Array)
importFrom(S4Vectors,new2)
Expand All @@ -70,7 +76,6 @@ importFrom(alabaster.base,acquireFile)
importFrom(alabaster.base,acquireMetadata)
importFrom(alabaster.base,addMissingPlaceholderAttributeForHdf5)
importFrom(alabaster.base,loadObject)
importFrom(alabaster.base,stageObject)
importFrom(alabaster.base,transformVectorForHdf5)
importFrom(rhdf5,H5Fclose)
importFrom(rhdf5,H5Fopen)
Expand Down
34 changes: 0 additions & 34 deletions R/loadArray.R

This file was deleted.

85 changes: 85 additions & 0 deletions R/readArray.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#' Read a dense array from disk
#'
#' Read a dense high-dimensional array from its on-disk representation.
#'
#' @param path String containing a path to a directory, itself created by the \code{\link{saveObject}} method for a dense array.
#' @param array.file.backed Logical scalar indicating whether to return a file-backed S4 array class, or an ordinary R array in memory.
#' @param ... Further arguments, ignored.
#'
#' @return A multi-dimensional array-like object, either a \linkS4class{DelayedArray} or an ordinary R array.
#'
#' @seealso
#' \code{"\link{saveObject,array-method}"}, to create the directory and its contents.
#'
#' @author Aaron Lun
#'
#' @examples
#' arr <- array(rpois(10000, 10), c(50, 20, 10))
#' dimnames(arr) <- list(
#' paste0("GENE_", seq_len(nrow(arr))),
#' letters[1:20],
#' NULL
#' )
#'
#' dir <- tempfile()
#' saveObject(arr, dir)
#' readArray(dir)
#'
#' @export
#' @importFrom HDF5Array HDF5Array
#' @importFrom DelayedArray type<-
readArray <- function(path, array.file.backed=TRUE, ...) {
fpath <- file.path(path, "array.h5")

details <- local({
fhandle <- H5Fopen(fpath)
on.exit(H5Fclose(fhandle), add=TRUE, after=FALSE)
ghandle <- H5Gopen(fhandle, "dense_array")
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)

type <- h5_read_attribute(ghandle, "type")
transposed <- h5_read_attribute(ghandle, "transposed", check=TRUE, default=0L)

dhandle <- H5Dopen(ghandle, "data")
on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE)
placeholder <- h5_read_attribute(dhandle, "missing-value-placeholder", check=TRUE, default=NULL)

ndims <- length(H5Sget_simple_extent_dims(H5Dget_space(dhandle)))
names <- load_names(ghandle, ndims)

list(type=type, names=names, transposed=(transposed != 0L), placeholder=placeholder)
})

out <- HDF5Array(filepath=fpath, name="dense_array/data")
if (type(out) == "raw") { # ... so that placeholders are correctly substituted.
type(out) <- "integer"
}

if (!is.null(details$names)) {
dimnames(out) <- rev(details$names)
}
if (!details$transposed) { # All R-based HDF5 bindings will automatically transpose, so we need to untranspose if `transposed=FALSE`.
out <- t(out)
}
if (!is.null(details$placeholder)) {
out <- DelayedMask(out, placeholder=details$placeholder)
out <- DelayedArray(out)
}
type(out) <- from_array_type(details$type)

if (!array.file.backed) {
return(as.array(out))
} else {
return(DelayedArray(out))
}
}

##############################
######### OLD STUFF ##########
##############################

#' @export
loadArray <- function(info, project) {
seed <- .createRawArraySeed(info, project=project, names=TRUE)
DelayedArray(seed)
}
105 changes: 66 additions & 39 deletions R/stageArray.R → R/saveArray.R
Original file line number Diff line number Diff line change
@@ -1,58 +1,89 @@
#' Stage a multi-dimensional array for upload
#' Save a multi-dimensional array to disk
#'
#' Stage a high-dimensional array in preparation for upload to DataSetDB.
#' Save a high-dimensional array to its on-disk representations.
#'
#' @param x An array, almost always integer or numeric, though logical and character matrices are also supported.
#' Alternatively, a \linkS4class{DelayedArray} or any instance of a \linkS4class{Matrix} class.
#' @param dir String containing the path to the staging directory.
#' @param path String containing the relative path to a subdirectory inside the staging directory, in which \code{x} is to be saved.
#' @param child Logical scalar indicating whether \code{x} is a child of a larger object.
#' @param x An integer, numeric, logical or character array.
#' Alternatively, any of the \linkS4class{denseMatrix} subclasses from the \pkg{Matrix} package.
#' @param path String containing the path to a directory in which to save \code{x}.
#' @param ... Further arguments, currently ignored.
#'
#' @return
#' \code{x} is saved into a single file at \code{file.path(dir, path)}, possibly after appending an arbitrary file extension.
#' A named list is returned, containing at least:
#' \itemize{
#' \item \code{$schema}, a string specifying the schema to use to validate the metadata.
#' \item \code{path}, a string containing the path to the file inside the subdirectory, containing the assay contents.
#' \item \code{is_child}, a logical scalar equal to the input \code{child}.
#' }
#'
#' @details
#' For dense arrays, we save the array as a dense matrix in a HDF5 file using methods from the \pkg{HDF5Array} package.
#' For sparse matrices, we call \code{\link{writeSparseMatrix}} to save the data in the 10X sparse matrix format.
#' Other representations may have more appropriate formats, which are supported by simply writing new methods for this generic.
#' Note that specialized methods will usually require new schemas to validate any new metadata fields.
#'
#' If \code{x} itself is a child of a larger object, we suggest using the output \code{path} when referencing \code{x} from within the larger object's metadata.
#' This is because \code{stageObject} methods may add more path components, file extensions, etc. to the input \code{path} when saving the object.
#' As a result, the output \code{path} may not be the same as the input \code{path}.
#' \code{x} is saved to \code{path} and \code{NULL} is invisibly returned.
#'
#' @seealso
#' \code{\link{preserveDelayedOperations}}, to preserve the delayed'ness of a \linkS4class{DelayedMatrix} \code{x}.
#'
#' \code{\link{recycleHdf5Files}}, to re-use the existing file in a HDF5-backed \linkS4class{DelayedMatrix} \code{x}.
#' \code{\link{readArray}}, to read the directory contents back into the R session.
#'
#' @author Aaron Lun
#' @examples
#' dir <- tempfile()
#' dir.create(dir)
#'
#' mat <- array(rpois(10000, 10), c(50, 20, 10))
#' dimnames(mat) <- list(
#' paste0("GENE_", seq_len(nrow(mat))),
#' letters[1:20],
#' NULL
#' )
#'
#' path <- "whee"
#' stageObject(mat, dir, path)
#'
#' dir <- tempfile()
#' saveObject(mat, dir)
#' list.files(dir)
#'
#' @name stageArray
#' @importFrom alabaster.base stageObject
#' @name saveArray
#' @aliases
#' stageObject,array-method
#' stageObject,Matrix-method
NULL

#' @import alabaster.base rhdf5
.save_array <- function(x, path, ...) {
dir.create(path)
fpath <- file.path(path, "array.h5")
name <- "dense_array"

# This needs to be wrapped up as writeHDF5Array needs to
# take ownership of the file handle internally.
local({
fhandle <- H5Fcreate(fpath, "H5F_ACC_TRUNC")
on.exit(H5Fclose(fhandle), add=TRUE, after=FALSE)

ghandle <- H5Gcreate(fhandle, name)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)

h5_write_attribute(ghandle, "version", "1.0", scalar=TRUE)
h5_write_attribute(ghandle, "type", array_type(x), scalar=TRUE)
h5_write_attribute(ghandle, "transposed", 1L, scalar=TRUE)
})

transformed <- transformVectorForHdf5(x)
writeHDF5Array(transformed$transformed, filepath=fpath, name="dense_array/data")

# Reinitializing the handle for more downstream work.
fhandle <- H5Fopen(fpath, "H5F_ACC_RDWR")
on.exit(H5Fclose(fhandle), add=TRUE, after=FALSE)
ghandle <- H5Gopen(fhandle, name)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)

if (!is.null(transformed$placeholder)) {
dhandle <- H5Dopen(ghandle, "data")
on.exit(H5Dclose(dhandle), add=TRUE, after=FALSE)
h5_write_attribute(dhandle, missingPlaceholderName, transformed$placeholder, scalar=TRUE)
}

save_names(ghandle, x, transpose=TRUE)
write(name, file=file.path(path, "OBJECT"))
invisible(NULL)
}

#' @export
#' @rdname saveArray
setMethod("saveObject", "array", .save_array)

#' @export
#' @rdname saveArray
setMethod("saveObject", "denseMatrix", .save_array)

##############################
######### OLD STUFF ##########
##############################

#' @importFrom DelayedArray is_sparse
#' @importFrom rhdf5 h5createFile
#' @importFrom HDF5Array writeHDF5Array
Expand Down Expand Up @@ -111,7 +142,6 @@ NULL
}

#' @export
#' @rdname stageArray
setMethod("stageObject", "array", function(x, dir, path, child=FALSE) .stage_array(x, dir, path, child=child))

#' @importFrom alabaster.base .stageObject
Expand Down Expand Up @@ -141,7 +171,6 @@ setMethod("stageObject", "array", function(x, dir, path, child=FALSE) .stage_arr
}

#' @export
#' @rdname stageArray
setMethod("stageObject", "DelayedArray", function(x, dir, path, child=FALSE) .stage_delayed(x, dir, path, child = child, fallback = .stage_array))

#' @importFrom rhdf5 h5createFile h5createGroup
Expand Down Expand Up @@ -179,12 +208,10 @@ setMethod("stageObject", "DelayedArray", function(x, dir, path, child=FALSE) .st
}

#' @export
#' @rdname stageArray
#' @importClassesFrom Matrix Matrix
setMethod("stageObject", "Matrix", function(x, dir, path, child=FALSE) .stage_any_matrix(x, dir, path, child=child))

#' @export
#' @rdname stageArray
setMethod("stageObject", "DelayedMatrix", function(x, dir, path, child=FALSE) .stage_delayed(x, dir, path, child = child, fallback = .stage_any_matrix))

.link_or_copy <- function(from, to) {
Expand Down
50 changes: 49 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#' @importFrom DelayedArray type
array_type <- function(x) {
to_array_type <- function(x) {
switch(type(x),
integer="integer",
double="number",
Expand All @@ -8,3 +8,51 @@ array_type <- function(x) {
"other"
)
}

array_type <- to_array_type

from_array_type <- function(x) {
switch(x,
integer="integer",
number="double",
boolean="logical",
string="character"
)
}

save_names <- function(handle, x, group = "names", transpose=FALSE) {
d <- dimnames(x)
if (is.null(d) || all(vapply(d, is.null, TRUE))) {
return(NULL)
}

if (transpose) { # for the HDF5 array transposition.
d <- rev(d)
}

ghandle <- H5Gcreate(handle, group)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)
for (i in seq_along(d)) {
current <- d[[i]]
if (!is.null(current)) {
h5_write_vector(ghandle, as.character(i - 1L), current)
}
}
}

load_names <- function(handle, ndim, group = "names") {
if (!h5_object_exists(handle, group)) {
return(NULL)
}

ghandle <- H5Gopen(handle, group)
on.exit(H5Gclose(ghandle), add=TRUE, after=FALSE)
all.named <- h5ls(ghandle, datasetinfo=FALSE, recursive=FALSE)

names <- vector("list", ndim)
for (y in all.named$name) {
names[[as.integer(y) + 1L]] <- h5_read_vector(ghandle, y)
}

names
}
Loading
Loading