Skip to content

Commit

Permalink
Added internal utility to optimize HDF5 storage type based on array c…
Browse files Browse the repository at this point in the history
…ontents. (#6)

This also finds a suitable missing value placeholder if NAs are present in the array.
  • Loading branch information
LTLA authored Nov 27, 2023
1 parent f74651f commit 366ef6e
Show file tree
Hide file tree
Showing 9 changed files with 837 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
*.swp
*.rds
*.html
*.so
*.o
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Suggests:
knitr,
BiocStyle,
chihaya
LinkingTo: Rcpp
VignetteBuilder: knitr
RoxygenNote: 7.2.3
biocViews:
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ importFrom(HDF5Array,H5SparseMatrixSeed)
importFrom(HDF5Array,HDF5Array)
importFrom(HDF5Array,HDF5ArraySeed)
importFrom(HDF5Array,writeHDF5Array)
importFrom(Rcpp,sourceCpp)
importFrom(S4Vectors,new2)
importFrom(SparseArray,extract_sparse_array)
importFrom(alabaster.base,.loadObject)
Expand All @@ -91,3 +92,4 @@ importFrom(rhdf5,h5readAttributes)
importFrom(rhdf5,h5write)
importFrom(rhdf5,h5writeAttribute)
importFrom(rhdf5,h5writeDataset)
useDynLib(alabaster.matrix, .registration=TRUE)
15 changes: 15 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

lowest_double <- function() {
.Call(`_alabaster_matrix_lowest_double`)
}

highest_double <- function() {
.Call(`_alabaster_matrix_highest_double`)
}

collect_double_attributes <- function(x) {
.Call(`_alabaster_matrix_collect_double_attributes`, x)
}

4 changes: 4 additions & 0 deletions R/namespace.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#' @import alabaster.base rhdf5
#' @importFrom Rcpp sourceCpp
#' @useDynLib alabaster.matrix, .registration=TRUE
NULL
260 changes: 260 additions & 0 deletions R/optimize_storage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
optimize_storage <- function(x) {
tt <- type(x)
if (tt == "character") {
optimize_string_storage(x)
} else if (tt == "double") {
optimize_float_storage(x)
} else if (tt == "integer") {
optimize_integer_storage(x)
} else if (tt == "logical") {
optimize_boolean_storage(x)
} else {
stop("unsupported type '", tt, "'")
}
}

###################################################
###################################################

aggregate_range <- function(collated, name) {
range(unlist(lapply(collated, function(y) y[[name]])))
}

aggregate_any <- function(collated, name) {
any(vapply(collated, function(y) y[[name]], TRUE))
}

aggregate_max <- function(collated, name) {
max(unlist(lapply(collated, function(y) y[[name]])), na.rm=TRUE)
}

###################################################
###################################################

setGeneric("collect_integer_attributes", function(x) standardGeneric("collect_integer_attributes"))

.simple_integer_collector <- function(x) {
list(
range=suppressWarnings(range(x, na.rm=TRUE)),
missing=anyNA(x)
)
}

setMethod("collect_integer_attributes", "array", .simple_integer_collector)

setMethod("collect_integer_attributes", "ANY", function(x) {
collated <- blockApply(x, .simple_integer_collector)
list(
range=aggregate_range(collated, "range"),
missing=aggregate_any(collated, "missing")
)
})

optimize_integer_storage <- function(x) {
attr <- collect_integer_attributes(x)

if (attr$missing) {
lower <- attr$range[1]
upper <- attr$range[2]
if (is.infinite(lower)) {
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7)))
}

if (lower < 0L) {
if (lower > -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7)))
} else if (lower > -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15)))
}
} else {
if (upper < 2^8 - 1) {
return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1)))
} else if (upper < 2^16 - 1) {
return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1)))
}
}

return(list(type="H5T_NATIVE_INT32", placeholder=NA_integer_))

} else {
lower <- attr$range[1]
upper <- attr$range[2]
if (is.infinite(lower)) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL))
}

if (lower < 0L) {
if (lower >= -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL))
} else if (lower >= -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=NULL))
}
} else {
if (upper < 2^8) {
return(list(type="H5T_NATIVE_UINT8", placeholder=NULL))
} else if (upper < 2^16) {
return(list(type="H5T_NATIVE_UINT16", placeholder=NULL))
}
}

return(list(type="H5T_NATIVE_INT32", placeholder=NULL))
}
}

###################################################
###################################################

setGeneric("collect_float_attributes", function(x) standardGeneric("collect_float_attributes"))

setMethod("collect_float_attributes", "array", collect_double_attributes)

setMethod("collect_float_attributes", "ddenseMatrix", function(x) collect_double_attributes(x@x))

setMethod("collect_float_attributes", "ANY", function(x) {
collated <- blockApply(x, collect_double_attributes)

output <- list(range=aggregate_range(collated, "range"))
for (n in c("missing", "non_integer", "has_NaN", "has_Inf", "has_nInf", "has_lowest", "has_highest")) {
output[[n]] <- aggregate_any(collated, n)
}

output
})

optimize_float_storage <- function(x) {
attr <- collect_float_attributes(x)

if (attr$missing) {
if (!attr$non_integer) {
lower <- attr$range[1]
upper <- attr$range[2]
if (lower < 0L) {
if (lower > -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=-2^7))
} else if (lower > -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=-2^15))
} else if (lower > -2^31 && upper < 2^31) {
return(list(type="H5T_NATIVE_INT32", placeholder=-2^31))
}
} else {
if (upper < 2^8-1) {
return(list(type="H5T_NATIVE_UINT8", placeholder=2^8-1))
} else if (upper < 2^16-1) {
return(list(type="H5T_NATIVE_UINT16", placeholder=2^16-1))
} else if (upper < 2^32-1) {
return(list(type="H5T_NATIVE_UINT32", placeholder=2^32-1))
}
}
}

placeholder <- NULL
if (!attr$has_NaN) {
placeholder <- NaN
} else if (!attr$has_Inf) {
placeholder <- Inf
} else if (!attr$has_nInf) {
placeholder <- -Inf
} else if (!attr$has_lowest) {
placeholder <- lowest_double()
} else if (!attr$has_highest) {
placeholder <- highest_double()
}

# Fallback that just goes through and pulls out all unique values.
if (is.null(placeholder)) {
u <- Reduce(union, blockApply(x, function(y) unique(as.vector(y))))
placeholder <- chooseMissingPlaceholderForHdf5(u)
}

return(list(type="H5T_NATIVE_DOUBLE", placeholder=placeholder))

} else {
if (!attr$non_integer) {
lower <- attr$range[1]
upper <- attr$range[2]
if (lower < 0L) {
if (lower >= -2^7 && upper < 2^7) {
return(list(type="H5T_NATIVE_INT8", placeholder=NULL))
} else if (lower >= -2^15 && upper < 2^15) {
return(list(type="H5T_NATIVE_INT16", placeholder=NULL))
} else if (lower >= -2^31 && upper < 2^31) {
return(list(type="H5T_NATIVE_INT32", placeholder=NULL))
}
} else {
if (upper < 2^8) {
return(list(type="H5T_NATIVE_UINT8", placeholder=NULL))
} else if (upper < 2^16) {
return(list(type="H5T_NATIVE_UINT16", placeholder=NULL))
} else if (upper < 2^32) {
return(list(type="H5T_NATIVE_UINT32", placeholder=NULL))
}
}
}

return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL))
}
}

###################################################
###################################################

setGeneric("collect_string_attributes", function(x) standardGeneric("collect_string_attributes"))

setMethod("collect_string_attributes", "ANY", function(x) {
collected <- blockApply(x, function(y) {
list(
has_na1=any(y == "NA", na.rm=TRUE),
has_na2=any(y == "_NA", na.rm=TRUE),
max_len=suppressWarnings(max(nchar(y, "bytes"), na.rm=TRUE)),
missing=anyNA(y),
encoding=unique(Encoding(y))
)
})

list(
has_na1=aggregate_any(collected, "has_na1"),
has_na2=aggregate_any(collected, "has_na2"),
max_len=aggregate_max(collected, "max_len"),
missing=aggregate_any(collected, "missing"),
encoding=Reduce(union, lapply(collected, function(y) y$encoding))
)
})

optimize_string_storage <- function(x) {
attr <- collect_string_attributes(x)

placeholder <- NULL
if (attr$missing) {
if (!attr$has_na1) {
placeholder <- "NA"
} else if (!attr$has_na2) {
placeholder <- "_NA"
} else {
u <- Reduce(union, blockApply(x, function(y) unique(as.vector(y))))
placeholder <- chooseMissingPlaceholderForHdf5(u)
}
attr$max_len <- max(attr$max_len, nchar(placeholder, "bytes"))
}

tid <- H5Tcopy("H5T_C_S1")
H5Tset_strpad(tid, strpad = "NULLPAD")
H5Tset_size(tid, max(1L, attr$max_len))
if ("UTF-8" %in% attr$encoding) {
H5Tset_cset(tid, "UTF8")
} else {
H5Tset_cset(tid, "ASCII")
}

list(type=tid, placeholder=placeholder)
}

###################################################
###################################################

optimize_boolean_storage <- function(x) {
if (anyNA(x)) {
list(type="H5T_NATIVE_INT8", placeholder=-1L)
} else {
list(type="H5T_NATIVE_INT8", placeholder=NULL)
}
}
52 changes: 52 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <Rcpp.h>

using namespace Rcpp;

#ifdef RCPP_USE_GLOBAL_ROSTREAM
Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// lowest_double
double lowest_double();
RcppExport SEXP _alabaster_matrix_lowest_double() {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
rcpp_result_gen = Rcpp::wrap(lowest_double());
return rcpp_result_gen;
END_RCPP
}
// highest_double
double highest_double();
RcppExport SEXP _alabaster_matrix_highest_double() {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
rcpp_result_gen = Rcpp::wrap(highest_double());
return rcpp_result_gen;
END_RCPP
}
// collect_double_attributes
Rcpp::List collect_double_attributes(Rcpp::NumericVector x);
RcppExport SEXP _alabaster_matrix_collect_double_attributes(SEXP xSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::traits::input_parameter< Rcpp::NumericVector >::type x(xSEXP);
rcpp_result_gen = Rcpp::wrap(collect_double_attributes(x));
return rcpp_result_gen;
END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_alabaster_matrix_lowest_double", (DL_FUNC) &_alabaster_matrix_lowest_double, 0},
{"_alabaster_matrix_highest_double", (DL_FUNC) &_alabaster_matrix_highest_double, 0},
{"_alabaster_matrix_collect_double_attributes", (DL_FUNC) &_alabaster_matrix_collect_double_attributes, 1},
{NULL, NULL, 0}
};

RcppExport void R_init_alabaster_matrix(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}
Loading

0 comments on commit 366ef6e

Please sign in to comment.