diff --git a/NAMESPACE b/NAMESPACE index 7f6ef31..e788ce6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,10 @@ export(anyMissing) export(as.Rfc3339) export(checkValidDirectory) export(chooseMissingPlaceholderForHdf5) +export(collect_boolean_attributes) +export(collect_integer_attributes) +export(collect_number_attributes) +export(collect_string_attributes) export(createRedirection) export(customloadObjectHelper) export(h5_cast) @@ -67,6 +71,10 @@ export(loadDirectory) export(loadObject) export(missingPlaceholderName) export(moveObject) +export(optimize_boolean_storage) +export(optimize_integer_storage) +export(optimize_number_storage) +export(optimize_string_storage) export(processMcols) export(processMetadata) export(quickLoadObject) @@ -106,6 +114,10 @@ export(validateObject) export(writeMetadata) exportMethods(acquireFile) exportMethods(acquireMetadata) +exportMethods(collect_boolean_attributes) +exportMethods(collect_integer_attributes) +exportMethods(collect_number_attributes) +exportMethods(collect_string_attributes) exportMethods(saveObject) exportMethods(stageObject) import(alabaster.schemas) diff --git a/R/RcppExports.R b/R/RcppExports.R index f671ed6..693681b 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -25,6 +25,22 @@ choose_numeric_missing_placeholder <- function(x) { .Call(`_alabaster_base_choose_numeric_missing_placeholder`, x) } +collect_character_attributes <- function(x) { + .Call(`_alabaster_base_collect_character_attributes`, x) +} + +lowest_double <- function() { + .Call(`_alabaster_base_lowest_double`) +} + +highest_double <- function() { + .Call(`_alabaster_base_highest_double`) +} + +collect_numeric_attributes <- function(x) { + .Call(`_alabaster_base_collect_numeric_attributes`, x) +} + not_rfc3339 <- function(x) { .Call(`_alabaster_base_not_rfc3339`, x) } diff --git a/R/optimize_storage.R b/R/optimize_storage.R new file mode 100644 index 0000000..214483c --- /dev/null +++ b/R/optimize_storage.R @@ -0,0 +1,313 @@ +#' HDF5 storage type optimization +#' +#' Optimize HDF5 storage to use the smallest possible storage type. +#' Intended for \pkg{alabaster.*} developers only. +#' +#' @param x An atomic vector or array of the specified type, +#' e.g., integer vector/array for \code{collect_integer_attributes} and \code{optimize_integer_storage}. +#' Developers can also extend this to abstract objects containing values of the same type, e.g., matrix-like S4 classes. +#' @param fallback Function that accepts \code{x} and returns a suitable placeholder in the presence of \code{NA}s. +#' If \code{NULL}, this defaults to \code{\link{chooseMissingPlaceholderForHdf5}}. +#' +#' @return +#' For \code{collect_integer_attributes}, a named list containing: +#' \itemize{ +#' \item \code{min}, the smallest non-\code{NA} value in \code{x}. +#' This is set to Inf if all values are \code{NA}. +#' \item \code{max}, the largest non-\code{NA} value in \code{x}. +#' This is set to -Inf if all values are \code{NA}. +#' \item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}. +#' } +#' +#' For \code{collect_number_attributes}, a named list containing: +#' \itemize{ +#' \item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}. +#' \item \code{non_integer}, logical scalar indicating whether any values in \code{x} are non-integer. +#' \item \code{min}, the smallest non-\code{NA} integer value in \code{x}. +#' This is set to Inf if there are any non-integer or \code{NA} values. +#' \item \code{max}, the largest non-\code{NA} integer value in \code{x}. +#' This is set to -Inf if there are any non-integer or \code{NA} values. +#' \item \code{has_NaN}, logical scalar indicating whether NaN is present in \code{x}. +#' \item \code{has_Inf}, logical scalar indicating whether positive infinity is present in \code{x}. +#' \item \code{has_NegInf}, logical scalar indicating whether negative infinity is present in \code{x}. +#' \item \code{has_lowest}, logical scalar indicating whether the smallest double-precision value is present in \code{x}. +#' \item \code{has_highest}, logical scalar indicating whether the highest double-precision value is present in \code{x}. +#' } +#' +#' For \code{collect_string_attributes}, a named list containing: +#' \itemize{ +#' \item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}. +#' \item \code{has_NA}, logical scalar indicating whether the \code{"NA"} string is present in \code{x}. +#' \item \code{has__NA}, logical scalar indicating whether the \code{"_NA"} string is present in \code{x}. +#' \item \code{max_len}, integer scalar specifying the maximum length of the strings in \code{x}. +#' } +#' +#' For \code{collect_boolean_attributes}, a named list containing: +#' \itemize{ +#' \item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}. +#' } +#' +#' For the \code{optimize_*_storage} functions, a named list containing: +#' \itemize{ +#' \item \code{type}, string containing the HDF5 datatype for storing \code{x}. +#' \item \code{placeholder}, value of the placeholder for \code{NA} values. +#' \item \code{other}, other attributes of \code{x} (e.g., number of non-zero elements for sparse vectors). +#' These should be stored in an \code{other} field in the named list returned by \code{collect_*_ attributes}. +#' } +#' +#' @author Aaron Lun +#' @aliases +#' collect_integer_attributes,integer-method +#' collect_integer_attributes,array-method +#' collect_number_attributes,double-method +#' collect_number_attributes,array-method +#' collect_string_attributes,character-method +#' collect_string_attributes,array-method +#' collect_boolean_attributes,logical-method +#' collect_boolean_attributes,array-method +#' @name optimize_storage +NULL + +#' @export +#' @rdname optimize_storage +setGeneric("collect_integer_attributes", function(x) standardGeneric("collect_integer_attributes")) + +.collect_integer_attributes_raw <- function(x) { + range <- suppressWarnings(range(x, na.rm=TRUE)) + list( + min=range[1], + max=range[2], + missing=anyNA(x) + ) +} + +#' @export +setMethod("collect_integer_attributes", "integer", .collect_integer_attributes_raw) + +#' @export +setMethod("collect_integer_attributes", "array", .collect_integer_attributes_raw) + +#' @export +#' @rdname optimize_storage +optimize_integer_storage <- function(x) { + attr <- collect_integer_attributes(x) + lower <- attr$min + upper <- attr$max + + if (attr$missing) { + # If it's infinite, that means that there are only missing values in + # 'x', otherwise there should have been at least one finite value + # available. In any case, it means we can just do whatever we want so + # we'll just use the smallest type. + if (is.infinite(lower)) { + return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7), other=attr$other)) + } + + if (lower < 0L) { + if (lower > -2^7 && upper < 2^7) { + return(list(type="H5T_NATIVE_INT8", placeholder=as.integer(-2^7), other=attr$other)) + } else if (lower > -2^15 && upper < 2^15) { + return(list(type="H5T_NATIVE_INT16", placeholder=as.integer(-2^15), other=attr$other)) + } + } else { + if (upper < 2^8 - 1) { + return(list(type="H5T_NATIVE_UINT8", placeholder=as.integer(2^8-1), other=attr$other)) + } else if (upper < 2^16 - 1) { + return(list(type="H5T_NATIVE_UINT16", placeholder=as.integer(2^16-1), other=attr$other)) + } + } + + return(list(type="H5T_NATIVE_INT32", placeholder=NA_integer_, other=attr$other)) + + } else { + # If it's infinite, that means that 'x' is of length zero, otherwise + # there should have been at least one finite value available. Here, + # the type doesn't matter, so we'll just use the smallest. + if (is.infinite(lower)) { + return(list(type="H5T_NATIVE_INT8", placeholder=NULL, other=attr$other)) + } + + if (lower < 0L) { + if (lower >= -2^7 && upper < 2^7) { + return(list(type="H5T_NATIVE_INT8", placeholder=NULL, other=attr$other)) + } else if (lower >= -2^15 && upper < 2^15) { + return(list(type="H5T_NATIVE_INT16", placeholder=NULL, other=attr$other)) + } + } else { + if (upper < 2^8) { + return(list(type="H5T_NATIVE_UINT8", placeholder=NULL, other=attr$other)) + } else if (upper < 2^16) { + return(list(type="H5T_NATIVE_UINT16", placeholder=NULL, other=attr$other)) + } + } + + return(list(type="H5T_NATIVE_INT32", placeholder=NULL, other=attr$other)) + } +} + +#' @export +#' @rdname optimize_storage +setGeneric("collect_number_attributes", function(x) standardGeneric("collect_number_attributes")) + +#' @export +setMethod("collect_number_attributes", "double", collect_numeric_attributes) + +#' @export +setMethod("collect_number_attributes", "array", collect_numeric_attributes) + +#' @export +#' @rdname optimize_storage +optimize_number_storage <- function(x, fallback = chooseMissingPlaceholderForHdf5) { + attr <- collect_number_attributes(x) + lower <- attr$min + upper <- attr$max + + if (attr$missing) { + if (!attr$non_integer) { + if (lower < 0L) { + if (lower > -2^7 && upper < 2^7) { + return(list(type="H5T_NATIVE_INT8", placeholder=-2^7, other=attr$other)) + } else if (lower > -2^15 && upper < 2^15) { + return(list(type="H5T_NATIVE_INT16", placeholder=-2^15, other=attr$other)) + } else if (lower > -2^31 && upper < 2^31) { + return(list(type="H5T_NATIVE_INT32", placeholder=-2^31, other=attr$other)) + } + } else { + if (upper < 2^8-1) { + return(list(type="H5T_NATIVE_UINT8", placeholder=2^8-1, other=attr$other)) + } else if (upper < 2^16-1) { + return(list(type="H5T_NATIVE_UINT16", placeholder=2^16-1, other=attr$other)) + } else if (upper < 2^32-1) { + return(list(type="H5T_NATIVE_UINT32", placeholder=2^32-1, other=attr$other)) + } + } + } + + placeholder <- NULL + if (!attr$has_NaN) { + placeholder <- NaN + } else if (!attr$has_Inf) { + placeholder <- Inf + } else if (!attr$has_NegInf) { + placeholder <- -Inf + } else if (!attr$has_lowest) { + placeholder <- lowest_double() + } else if (!attr$has_highest) { + placeholder <- highest_double() + } + + if (is.null(placeholder)) { + if (is.null(fallback)) { + fallback <- chooseMissingPlaceholderForHdf5 + } + placeholder <- fallback(x) + } + + return(list(type="H5T_NATIVE_DOUBLE", placeholder=placeholder, other=attr$other)) + + } else { + if (!attr$non_integer) { + if (lower < 0L) { + if (lower >= -2^7 && upper < 2^7) { + return(list(type="H5T_NATIVE_INT8", placeholder=NULL, other=attr$other)) + } else if (lower >= -2^15 && upper < 2^15) { + return(list(type="H5T_NATIVE_INT16", placeholder=NULL, other=attr$other)) + } else if (lower >= -2^31 && upper < 2^31) { + return(list(type="H5T_NATIVE_INT32", placeholder=NULL, other=attr$other)) + } + } else { + if (upper < 2^8) { + return(list(type="H5T_NATIVE_UINT8", placeholder=NULL, other=attr$other)) + } else if (upper < 2^16) { + return(list(type="H5T_NATIVE_UINT16", placeholder=NULL, other=attr$other)) + } else if (upper < 2^32) { + return(list(type="H5T_NATIVE_UINT32", placeholder=NULL, other=attr$other)) + } + } + } + + return(list(type="H5T_NATIVE_DOUBLE", placeholder=NULL, other=attr$other)) + } +} + +#' @export +#' @rdname optimize_storage +setGeneric("collect_string_attributes", function(x) standardGeneric("collect_string_attributes")) + +.collect_string_attributes_raw <- function(x) { + attr <- collect_character_attributes(x) + + # Unfortunately, we need to throw an error, because if we need to change + # the encoding (e.g., with enc2utf8), the maximum length of each string in + # bytes may no longer be correct due to changes of the multi-byte + # characters. So, coercions should be done before 'x' enters this function. + if (attr$has_native) { + info <- l10n_info() + if (!info[["UTF-8"]]) { + stop("detected natively encoded strings in a non-UTF-8 locale") + } + } else if (attr$has_non_utf8) { + stop("detected non-UTF-8-encoded strings") + } + + attr$has_native <- NULL + attr$has_non_utf8 <- NULL + attr +} + +#' @export +setMethod("collect_string_attributes", "character", .collect_string_attributes_raw) + +#' @export +setMethod("collect_string_attributes", "array", .collect_string_attributes_raw) + +#' @export +#' @rdname optimize_storage +optimize_string_storage <- function(x, fallback = NULL) { + attr <- collect_string_attributes(x) + + placeholder <- NULL + if (attr$missing) { + if (!attr[["has_NA"]]) { + placeholder <- "NA" + } else if (!attr[["has__NA"]]) { + placeholder <- "_NA" + } else { + if (is.null(fallback)) { + fallback <- chooseMissingPlaceholderForHdf5 + } + placeholder <- fallback(x) + } + attr$max_len <- max(attr$max_len, nchar(placeholder, "bytes")) + } + + tid <- H5Tcopy("H5T_C_S1") + H5Tset_strpad(tid, strpad = "NULLPAD") + H5Tset_size(tid, max(1L, attr$max_len)) + H5Tset_cset(tid, "UTF8") + + list(type=tid, placeholder=placeholder, other=attr$other) +} + +#' @export +#' @rdname optimize_storage +setGeneric("collect_boolean_attributes", function(x) standardGeneric("collect_boolean_attributes")) + +.collect_boolean_attributes_raw <- function(x) list(missing=anyNA(x)) + +#' @export +setMethod("collect_boolean_attributes", "logical", .collect_boolean_attributes_raw) + +#' @export +setMethod("collect_boolean_attributes", "array", .collect_boolean_attributes_raw) + +#' @export +#' @rdname optimize_storage +optimize_boolean_storage <- function(x) { + attr <- collect_boolean_attributes(x) + placeholder <- NULL + if (attr$missing) { + placeholder <- -1L + } + list(type="H5T_NATIVE_INT8", placeholder=placeholder, other=attr$other) +} diff --git a/man/optimize_storage.Rd b/man/optimize_storage.Rd new file mode 100644 index 0000000..1883373 --- /dev/null +++ b/man/optimize_storage.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/optimize_storage.R +\name{optimize_storage} +\alias{optimize_storage} +\alias{collect_integer_attributes,integer-method} +\alias{collect_integer_attributes,array-method} +\alias{collect_number_attributes,double-method} +\alias{collect_number_attributes,array-method} +\alias{collect_string_attributes,character-method} +\alias{collect_string_attributes,array-method} +\alias{collect_boolean_attributes,logical-method} +\alias{collect_boolean_attributes,array-method} +\alias{collect_integer_attributes} +\alias{optimize_integer_storage} +\alias{collect_number_attributes} +\alias{optimize_number_storage} +\alias{collect_string_attributes} +\alias{optimize_string_storage} +\alias{collect_boolean_attributes} +\alias{optimize_boolean_storage} +\title{HDF5 storage type optimization} +\usage{ +collect_integer_attributes(x) + +optimize_integer_storage(x) + +collect_number_attributes(x) + +optimize_number_storage(x, fallback = chooseMissingPlaceholderForHdf5) + +collect_string_attributes(x) + +optimize_string_storage(x, fallback = NULL) + +collect_boolean_attributes(x) + +optimize_boolean_storage(x) +} +\arguments{ +\item{x}{An atomic vector or array of the specified type, +e.g., integer vector/array for \code{collect_integer_attributes} and \code{optimize_integer_storage}. +Developers can also extend this to abstract objects containing values of the same type, e.g., matrix-like S4 classes.} + +\item{fallback}{Function that accepts \code{x} and returns a suitable placeholder in the presence of \code{NA}s. +If \code{NULL}, this defaults to \code{\link{chooseMissingPlaceholderForHdf5}}.} +} +\value{ +For \code{collect_integer_attributes}, a named list containing: +\itemize{ +\item \code{min}, the smallest non-\code{NA} value in \code{x}. +This is set to Inf if all values are \code{NA}. +\item \code{max}, the largest non-\code{NA} value in \code{x}. +This is set to -Inf if all values are \code{NA}. +\item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}. +} + +For \code{collect_number_attributes}, a named list containing: +\itemize{ +\item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}. +\item \code{non_integer}, logical scalar indicating whether any values in \code{x} are non-integer. +\item \code{min}, the smallest non-\code{NA} integer value in \code{x}. +This is set to Inf if there are any non-integer or \code{NA} values. +\item \code{max}, the largest non-\code{NA} integer value in \code{x}. +This is set to -Inf if there are any non-integer or \code{NA} values. +\item \code{has_NaN}, logical scalar indicating whether NaN is present in \code{x}. +\item \code{has_Inf}, logical scalar indicating whether positive infinity is present in \code{x}. +\item \code{has_NegInf}, logical scalar indicating whether negative infinity is present in \code{x}. +\item \code{has_lowest}, logical scalar indicating whether the smallest double-precision value is present in \code{x}. +\item \code{has_highest}, logical scalar indicating whether the highest double-precision value is present in \code{x}. +} + +For \code{collect_string_attributes}, a named list containing: +\itemize{ +\item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}. +\item \code{has_NA}, logical scalar indicating whether the \code{"NA"} string is present in \code{x}. +\item \code{has__NA}, logical scalar indicating whether the \code{"_NA"} string is present in \code{x}. +\item \code{max_len}, integer scalar specifying the maximum length of the strings in \code{x}. +} + +For \code{collect_boolean_attributes}, a named list containing: +\itemize{ +\item \code{missing}, logical scalar indicating whether any values in \code{x} are \code{NA}. +} + +For the \code{optimize_*_storage} functions, a named list containing: +\itemize{ +\item \code{type}, string containing the HDF5 datatype for storing \code{x}. +\item \code{placeholder}, value of the placeholder for \code{NA} values. +\item \code{other}, other attributes of \code{x} (e.g., number of non-zero elements for sparse vectors). +These should be stored in an \code{other} field in the named list returned by \code{collect_*_ attributes}. +} +} +\description{ +Optimize HDF5 storage to use the smallest possible storage type. +Intended for \pkg{alabaster.*} developers only. +} +\author{ +Aaron Lun +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c28a895..9f1d92d 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -76,6 +76,44 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// collect_character_attributes +Rcpp::List collect_character_attributes(Rcpp::StringVector x); +RcppExport SEXP _alabaster_base_collect_character_attributes(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::StringVector >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(collect_character_attributes(x)); + return rcpp_result_gen; +END_RCPP +} +// lowest_double +double lowest_double(); +RcppExport SEXP _alabaster_base_lowest_double() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + rcpp_result_gen = Rcpp::wrap(lowest_double()); + return rcpp_result_gen; +END_RCPP +} +// highest_double +double highest_double(); +RcppExport SEXP _alabaster_base_highest_double() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + rcpp_result_gen = Rcpp::wrap(highest_double()); + return rcpp_result_gen; +END_RCPP +} +// collect_numeric_attributes +Rcpp::List collect_numeric_attributes(Rcpp::NumericVector x); +RcppExport SEXP _alabaster_base_collect_numeric_attributes(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::NumericVector >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(collect_numeric_attributes(x)); + return rcpp_result_gen; +END_RCPP +} // not_rfc3339 Rcpp::LogicalVector not_rfc3339(Rcpp::CharacterVector x); RcppExport SEXP _alabaster_base_not_rfc3339(SEXP xSEXP) { @@ -262,6 +300,10 @@ static const R_CallMethodDef CallEntries[] = { {"_alabaster_base_any_actually_numeric_na", (DL_FUNC) &_alabaster_base_any_actually_numeric_na, 1}, {"_alabaster_base_is_actually_numeric_na", (DL_FUNC) &_alabaster_base_is_actually_numeric_na, 1}, {"_alabaster_base_choose_numeric_missing_placeholder", (DL_FUNC) &_alabaster_base_choose_numeric_missing_placeholder, 1}, + {"_alabaster_base_collect_character_attributes", (DL_FUNC) &_alabaster_base_collect_character_attributes, 1}, + {"_alabaster_base_lowest_double", (DL_FUNC) &_alabaster_base_lowest_double, 0}, + {"_alabaster_base_highest_double", (DL_FUNC) &_alabaster_base_highest_double, 0}, + {"_alabaster_base_collect_numeric_attributes", (DL_FUNC) &_alabaster_base_collect_numeric_attributes, 1}, {"_alabaster_base_not_rfc3339", (DL_FUNC) &_alabaster_base_not_rfc3339, 1}, {"_alabaster_base_load_csv", (DL_FUNC) &_alabaster_base_load_csv, 4}, {"_alabaster_base_load_list_hdf5", (DL_FUNC) &_alabaster_base_load_list_hdf5, 3}, diff --git a/src/collect_character_attributes.cpp b/src/collect_character_attributes.cpp new file mode 100644 index 0000000..290bc78 --- /dev/null +++ b/src/collect_character_attributes.cpp @@ -0,0 +1,45 @@ +#include "Rcpp.h" +#include +#include + +//[[Rcpp::export(rng=false)]] +Rcpp::List collect_character_attributes(Rcpp::StringVector x) { + bool has_missing = false; + bool has_NA = false; + bool has__NA = false; + int max_len = 0; + bool has_native = false; + bool has_non_utf8 = false; + + for (auto y : x) { + Rcpp::String s(y); + if (s == NA_STRING) { + has_missing = true; + continue; + } + + if (s == "NA") { + has_NA = true; + } else if (s == "_NA") { + has__NA = true; + } + + max_len = std::max(max_len, static_cast(Rf_length(s.get_sexp()))); + + auto enc = s.get_encoding(); + if (enc == CE_NATIVE) { + has_native = true; + } else if (enc != CE_UTF8 && enc != CE_ANY) { + has_non_utf8 = true; + } + } + + return Rcpp::List::create( + Rcpp::Named("missing") = Rcpp::LogicalVector::create(has_missing), + Rcpp::Named("has_NA") = Rcpp::LogicalVector::create(has_NA), + Rcpp::Named("has__NA") = Rcpp::LogicalVector::create(has__NA), + Rcpp::Named("max_len") = Rcpp::IntegerVector::create(max_len), + Rcpp::Named("has_native") = Rcpp::LogicalVector::create(has_native), + Rcpp::Named("has_non_utf8") = Rcpp::LogicalVector::create(has_non_utf8) + ); +} diff --git a/src/collect_numeric_attributes.cpp b/src/collect_numeric_attributes.cpp new file mode 100644 index 0000000..5c0d4d0 --- /dev/null +++ b/src/collect_numeric_attributes.cpp @@ -0,0 +1,72 @@ +#include "Rcpp.h" +#include +#include +#include + +//[[Rcpp::export(rng=false)]] +double lowest_double() { + return std::numeric_limits::lowest(); +} + +//[[Rcpp::export(rng=false)]] +double highest_double() { + return std::numeric_limits::max(); +} + +//[[Rcpp::export(rng=false)]] +Rcpp::List collect_numeric_attributes(Rcpp::NumericVector x) { + uint8_t has_missing = 0; + uint8_t has_nan = 0; + uint8_t has_posinf = 0; + uint8_t has_neginf = 0; + uint8_t non_integer = 0; + uint8_t has_lowest = 0; + uint8_t has_highest = 0; + const double lowest = lowest_double(); + const double highest = highest_double(); + + for (auto y : x) { + uint8_t is_na = ISNA(y); + has_missing |= is_na; + uint8_t not_na = !is_na; + has_nan |= (not_na & static_cast(std::isnan(y))); + + uint8_t is_inf = not_na & static_cast(std::isinf(y)); + has_posinf |= (is_inf & static_cast(y > 0)); + has_neginf |= (is_inf & static_cast(y < 0)); + + uint8_t is_finite = not_na & static_cast(!is_inf); + non_integer |= (is_finite & static_cast(std::floor(y) != y)); + has_lowest |= (is_finite & static_cast(y == lowest)); + has_highest |= (is_finite & static_cast(y == highest)); + } + + double minv = R_PosInf, maxv = R_NegInf; + if (!non_integer) { + if (!has_missing) { + for (auto y : x) { + minv = std::min(y, minv); + maxv = std::max(y, maxv); + } + } else { + for (auto y : x) { + if (!ISNA(y)) { + minv = std::min(y, minv); + maxv = std::max(y, maxv); + } + } + } + } + + return Rcpp::List::create( + Rcpp::Named("min") = Rcpp::NumericVector::create(minv), + Rcpp::Named("max") = Rcpp::NumericVector::create(maxv), + Rcpp::Named("missing") = Rcpp::LogicalVector::create(has_missing), + Rcpp::Named("non_integer") = Rcpp::LogicalVector::create(non_integer), + Rcpp::Named("has_NaN") = Rcpp::LogicalVector::create(has_nan), + Rcpp::Named("has_Inf") = Rcpp::LogicalVector::create(has_posinf), + Rcpp::Named("has_NegInf") = Rcpp::LogicalVector::create(has_neginf), + Rcpp::Named("has_lowest") = Rcpp::LogicalVector::create(has_lowest), + Rcpp::Named("has_highest") = Rcpp::LogicalVector::create(has_highest) + ); +} diff --git a/tests/testthat/test-optimize_storage.R b/tests/testthat/test-optimize_storage.R new file mode 100644 index 0000000..6f3d758 --- /dev/null +++ b/tests/testthat/test-optimize_storage.R @@ -0,0 +1,325 @@ +# library(testthat); library(alabaster.base); source("test-optimize_storage.R", encoding="UTF-8") + +library(rhdf5) + +test_that("storage optimization works for integers, no missing values", { + # < 8-bit. + mat <- matrix(seq(0, 255, length.out=1000), 50, 20) + storage.mode(mat) <- "integer" + + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT8") + expect_null(out$placeholder) + out <- optimize_integer_storage(mat - 128L) + expect_equal(out$type, "H5T_NATIVE_INT8") + out <- optimize_integer_storage(mat + 1L) + expect_equal(out$type, "H5T_NATIVE_UINT16") + + # < 16-bit. + mat <- matrix(seq(0, 65535, length.out=1000), 50, 20) + storage.mode(mat) <- "integer" + + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_null(out$placeholder) + out <- optimize_integer_storage(mat - 32768L) + expect_equal(out$type, "H5T_NATIVE_INT16") + out <- optimize_integer_storage(mat + 1L) + expect_equal(out$type, "H5T_NATIVE_INT32") + + # < 32-bit. + mat <- matrix(seq(0, .Machine$integer.max, length.out=1000), 50, 20) + storage.mode(mat) <- "integer" + + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_null(out$placeholder) +}) + +test_that("storage optimization works for integers, plus missing values", { + # < 8-bit, unsigned. + mat <- matrix(seq(0, 255, length.out=1000), 50, 20) + mat[1000] <- NA + storage.mode(mat) <- "integer" # Note that cast to int does a truncation, so there's only ever one value at a non-zero extreme. + + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT8") + expect_equal(out$placeholder, 255L) + + mat[999] <- 255L + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 65535L) + + # 8-bit, signed. + mat <- matrix(seq(-128, 127, length.out=1000), 50, 20) + mat[1] <- NA + storage.mode(mat) <- "integer" + + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT8") + expect_equal(out$placeholder, -128L) + + mat[2] <- -128L + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT16") + expect_equal(out$placeholder, -32768L) + + # < 16-bit, unsigned. + mat <- matrix(seq(0, 65535, length.out=1000), 50, 20) + mat[1000] <- NA + storage.mode(mat) <- "integer" + + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 65535) + + mat[999] <- 65535L + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, NA_integer_) + + # 16-bit, signed. + mat <- matrix(seq(-32768, 32767, length.out=1000), 50, 20) + mat[1] <- NA + storage.mode(mat) <- "integer" + + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT16") + expect_equal(out$placeholder, -32768L) + + mat[2] <- -32768L + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, NA_integer_) + + # Everything else. + mat <- matrix(seq(0, .Machine$integer.max, length.out=1000), 50, 20) + storage.mode(mat) <- "integer" + mat[1] <- NA + out <- optimize_integer_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, NA_integer_) +}) + +test_that("storage optimization works for integer-like doubles, no missing values", { + # < 8-bit. + mat <- round(matrix(seq(0, 255, length.out=1000), 50, 20)) + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT8") + expect_null(out$placeholder) + out <- optimize_number_storage(mat - 128L) + expect_equal(out$type, "H5T_NATIVE_INT8") + out <- optimize_number_storage(mat + 1L) + expect_equal(out$type, "H5T_NATIVE_UINT16") + + # < 16-bit. + mat <- round(matrix(seq(0, 65535, length.out=1000), 50, 20)) + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_null(out$placeholder) + out <- optimize_number_storage(mat - 32768L) + expect_equal(out$type, "H5T_NATIVE_INT16") + out <- optimize_number_storage(mat + 1L) + expect_equal(out$type, "H5T_NATIVE_UINT32") + + # < 32-bit. + mat <- round(matrix(seq(0, 2^32-1, length.out=1000), 50, 20)) + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT32") + expect_null(out$placeholder) + out <- optimize_number_storage(mat - 2^31) + expect_equal(out$type, "H5T_NATIVE_INT32") + out <- optimize_number_storage(mat + 1L) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") +}) + +test_that("storage optimization works for integer-like doubles, plus missing values", { + # < 8-bit, unsigned + mat <- round(matrix(seq(0, 255, length.out=1000), 50, 20)) + mat[mat == 255] <- NA + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT8") + expect_equal(out$placeholder, 255) + + mat[999] <- 255 + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 65535) + + # < 8-bit, signed + mat <- round(matrix(seq(-128, 127, length.out=1000), 50, 20)) + mat[mat==-128] <- NA + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT8") + expect_equal(out$placeholder, -128) + + mat[2] <- -128 + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT16") + expect_equal(out$placeholder, -32768) + + # < 16-bit, unsigned + mat <- round(matrix(seq(0, 65535, length.out=1000), 50, 20)) + mat[mat==65535] <- NA + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT16") + expect_equal(out$placeholder, 65535) + + mat[999] <- 65535 + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT32") + expect_equal(out$placeholder, 2^32-1) + + # < 16-bit, signed + mat <- round(matrix(seq(-2^15, 2^15-1, length.out=1000), 50, 20)) + mat[mat==-32768] <- NA + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT16") + expect_equal(out$placeholder, -32768) + + mat[2] <- -32768 + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, -2^31) + + # < 32-bit, unsigned + mat <- round(matrix(seq(0, 2^32-1, length.out=1000), 50, 20)) + mat[mat==2^32-1] <- NA + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_UINT32") + expect_equal(out$placeholder, 2^32-1) + + mat[999] <- 2^32-1 + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, NaN) + + # < 32-bit, signed + mat <- round(matrix(seq(-2^31, 2^31-1, length.out=1000), 50, 20)) + mat[mat == -2^31] <- NA + + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT32") + expect_equal(out$placeholder, -2^31) + + mat[999] <- -2^31 + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, NaN) +}) + +test_that("storage optimization works non-integer doubles", { + mat <- matrix(rnorm(1000), 50, 20) + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_null(out$placeholder) + + # Running through the gamut of missing value placeholders. + mat <- matrix(rnorm(1000), 50, 20) + mat[1] <- NA + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, NaN) + + mat[2] <- NaN + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, Inf) + + mat[3] <- Inf + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, -Inf) + + mat[4] <- -Inf + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, alabaster.base:::lowest_double()) + + mat[5] <- alabaster.base:::lowest_double() + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_equal(out$placeholder, alabaster.base:::highest_double()) + + mat[6] <- alabaster.base:::highest_double() + out <- optimize_number_storage(mat) + expect_equal(out$type, "H5T_NATIVE_DOUBLE") + expect_true(!any(mat==out$placeholder, na.rm=TRUE)) +}) + +test_that("storage optimization works for strings", { + mat <- matrix(sample(LETTERS, 1000, replace=TRUE), 50, 20) + Encoding(mat) <- "UTF-8" + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 1) + expect_equal(H5Tget_cset(out$type), 1) # aka UTF-8 + expect_equal(out$placeholder, NULL) + + mat[1] <- NA + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 2) + expect_equal(out$placeholder, "NA") + + mat[2] <- "NA" + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 3) + expect_equal(out$placeholder, "_NA") + + mat[3] <- "_NA" + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 4) + expect_equal(out$placeholder, "__NA") + + # Correct size determination. + mat <- matrix(sample(LETTERS, 1000, replace=TRUE), 50, 20) + mat[1] <- "Aaron" + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 5) + + mat <- matrix(sample(LETTERS, 1000, replace=TRUE), 50, 20) + mat[1000] <- "Aaron" + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 5) + + mat <- matrix("", 50, 20) + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 1) + + # Checking for correct behavior in all-NA cases. + mat <- matrix(NA_character_, 50, 20) + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 2) + expect_identical(out$placeholder, "NA") + + # Handles UTF-8. Assumes that this file was sourced as UTF-8. + mat <- matrix("α", 50, 20) + Encoding(mat) <- "UTF-8" + out <- optimize_string_storage(mat) + expect_equal(H5Tget_size(out$type), 2) + expect_equal(H5Tget_cset(out$type), 1) # aka UTF-8 + + stuff <- "Ä" + Encoding(stuff) <- "latin1" + expect_error(optimize_string_storage(stuff), "non-UTF-8-encoded") +}) + +test_that("storage optimization works for booleans", { + mat <- matrix(c(TRUE, FALSE), 50, 20) + out <- optimize_boolean_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT8") + expect_null(out$placeholder) + + mat[1] <- NA + out <- optimize_boolean_storage(mat) + expect_equal(out$type, "H5T_NATIVE_INT8") + expect_equal(out$placeholder, -1L) +})