Skip to content

Commit

Permalink
Updated HDF5 data frame readers/writers for the new specification. (#17)
Browse files Browse the repository at this point in the history
This uses the more self-contained specification described in takane v0.5.0, where
many of the properties in the schema are also duplicated inside HDF5 attributes.
The aim is to simplify interpretation of the standalone HDF5 file without the need
to constantly refer to information in the schema.
  • Loading branch information
LTLA authored Nov 6, 2023
1 parent 1c515f3 commit 1b0adbc
Show file tree
Hide file tree
Showing 15 changed files with 739 additions and 452 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ importFrom(S4Vectors,mcols)
importFrom(S4Vectors,metadata)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(rhdf5,H5Aclose)
importFrom(rhdf5,H5Acreate)
importFrom(rhdf5,H5Awrite)
importFrom(rhdf5,H5Dclose)
importFrom(rhdf5,H5Dcreate)
importFrom(rhdf5,H5Dopen)
Expand Down
20 changes: 16 additions & 4 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ check_csv <- function(path, is_compressed, parallel) {
.Call(`_alabaster_base_check_csv`, path, is_compressed, parallel)
}

check_csv_df <- function(path, nrows, has_row_names, column_names, column_types, string_formats, factor_levels, df_version, is_compressed, parallel) {
.Call(`_alabaster_base_check_csv_df`, path, nrows, has_row_names, column_names, column_types, string_formats, factor_levels, df_version, is_compressed, parallel)
check_csv_df <- function(path, nrows, has_row_names, column_names, column_types, string_formats, factor_ordered, factor_levels, df_version, is_compressed, parallel) {
.Call(`_alabaster_base_check_csv_df`, path, nrows, has_row_names, column_names, column_types, string_formats, factor_ordered, factor_levels, df_version, is_compressed, parallel)
}

check_hdf5_df <- function(path, name, nrows, has_row_names, column_names, column_types, string_formats, factor_levels, df_version, hdf5_version) {
.Call(`_alabaster_base_check_hdf5_df`, path, name, nrows, has_row_names, column_names, column_types, string_formats, factor_levels, df_version, hdf5_version)
check_hdf5_df <- function(path, name, nrows, has_row_names, column_names, column_types, string_formats, factor_ordered, factor_levels, df_version, hdf5_version) {
.Call(`_alabaster_base_check_hdf5_df`, path, name, nrows, has_row_names, column_names, column_types, string_formats, factor_ordered, factor_levels, df_version, hdf5_version)
}

check_list_hdf5 <- function(file, name, num_external) {
Expand All @@ -21,6 +21,18 @@ check_list_json <- function(file, num_external, parallel) {
.Call(`_alabaster_base_check_list_json`, file, num_external, parallel)
}

any_actually_numeric_na <- function(x) {
.Call(`_alabaster_base_any_actually_numeric_na`, x)
}

is_actually_numeric_na <- function(x) {
.Call(`_alabaster_base_is_actually_numeric_na`, x)
}

choose_numeric_missing_placeholder <- function(x) {
.Call(`_alabaster_base_choose_numeric_missing_placeholder`, x)
}

load_csv <- function(path, is_compressed, nrecords, parallel) {
.Call(`_alabaster_base_load_csv`, path, is_compressed, nrecords, parallel)
}
Expand Down
128 changes: 72 additions & 56 deletions R/loadDataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,49 +56,31 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
df <- make_zero_col_DFrame(nrow=nrows)
} else {
raw <- h5read(path, prefix("data"))
version_above_1 <- isTRUE(info$data_frame$version > 1)

# Replacing placeholders with NAs.
for (i in names(raw)) {
current <- raw[[i]]
if (version_above_1 || is.character(current)) {
attr <- h5readAttributes(path, prefix(paste0("data/", i)))
replace.na <- attr[["missing-value-placeholder"]]

restore_min_integer <- function(y) {
if (is.integer(y) && anyNA(y)) { # promote integer NAs back to the actual number.
y <- as.double(y)
y[is.na(y)] <- -2^31
}
y
}
df <- vector("list", length(col.info))

if (is.null(replace.na)) {
raw[[i]] <- restore_min_integer(current)
} else if (is.na(replace.na) && !is.nan(replace.na)) {
# No-op as the placeholder is already R's NA.
} else if (is.nan(replace.na)) {
raw[[i]][is.nan(current)] <- NA # avoid equality checks to an NaN.
} else {
current <- restore_min_integer(current)
current[which(current == replace.na)] <- NA # Using which() to avoid problems with existing NAs.
raw[[i]] <- current
for (i in seq_along(col.info)) {
curinfo <- col.info[[i]]
d <- as.character(i - 1L) # -1 to get back to 0-based indices.
current <- raw[[d]]

if (is.list(current)) { # Handling factors stored as lists in the new version.
if (curinfo$type != "factor") {
stop("HDF5 groups as columns are only supported for factor columns")
}
}
}
codes <- .repopulate_missing_hdf5(current$codes, path, prefix(paste0("data/", d, "/codes")))
df[[i]] <- factor(current$levels[codes + 1L], current$levels, ordered=isTRUE(curinfo$ordered))

# Adding placeholders for type:"other".
indices <- as.integer(names(raw)) + 1L # get back to 1-based.
df <- vector("list", length(col.info))
df[indices] <- lapply(raw, as.vector) # remove 1d arrays.
for (i in seq_along(df)) {
if (is.null(df[[i]])) {
df[[i]] <- logical(nrows)
} else if (!is.null(current)) {
current <- .repopulate_missing_hdf5(current, path, prefix(paste0("data/", d)))
df[[i]] <- as.vector(current) # remove 1d arrays.

} else {
df[[i]] <- logical(nrows) # placeholders
}
}

df <- DataFrame(df)
colnames(df) <- as.vector(h5read(path, prefix("column_names")))
names(df) <- as.vector(h5read(path, prefix("column_names")))
df <- DataFrame(df, check.names=FALSE)
}
if (has.rownames) {
rownames(df) <- as.vector(h5read(path, prefix("row_names")))
Expand All @@ -114,27 +96,62 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
}
}

# Make sure everyone is of the right type.
new.names <- character(ncol(df))
df <- .coerce_df_column_type(df, col.info, project, include.nested=include.nested)
.restoreMetadata(df, mcol.data=info$data_frame$column_data, meta.data=info$data_frame$other_data, project=project)
}

#' @importFrom rhdf5 h5readAttributes
.repopulate_missing_hdf5 <- function(current, path, name) {
attr <- h5readAttributes(path, name)
replace.na <- attr[["missing-value-placeholder"]]

restore_min_integer <- function(y) {
if (is.integer(y) && anyNA(y)) { # promote integer NAs back to the actual number.
y <- as.double(y)
y[is.na(y)] <- -2^31
}
y
}

if (is.null(replace.na)) {
current <- restore_min_integer(current)
} else if (is.na(replace.na)) {
if (!is.nan(replace.na)) {
# No-op as the placeholder is already R's NA of the relevant type.
} else {
current[is.nan(current)] <- NA # avoid equality checks to an NaN.
}
} else {
current <- restore_min_integer(current)
current[which(current == replace.na)] <- NA # Using which() to avoid problems with existing NAs.
}

current
}

.coerce_df_column_type <- function(df, col.info, project, include.nested) {
stopifnot(length(df) == length(col.info))
true.names <- character(length(col.info))

for (i in seq_along(col.info)) {
current.info <- col.info[[i]]
new.names[i] <- current.info$name

true.names[i] <- current.info$name
col.type <- current.info$type
col <- df[[i]]

if (col.type=="factor" || col.type=="ordered") {
level.info <- acquireMetadata(project, current.info$levels$resource$path)
levels <- altLoadObject(level.info, project=project)
if (is(levels, "DataFrame")) { # account for old objects that store levels as a DF.
levels <- levels[,1]
}
if (is.numeric(col)) {
col <- levels[col + 1L]
if (!is.factor(col)) { # we may have already transformed the column to a factor, in which case we can skip this.
level.info <- acquireMetadata(project, current.info$levels$resource$path)
levels <- altLoadObject(level.info, project=project)
if (is(levels, "DataFrame")) { # account for old objects that store levels as a DF.
levels <- levels[,1]
}
if (is.numeric(col)) {
col <- levels[col + 1L]
}
ordered <- col.type == "ordered" || isTRUE(current.info$ordered)
col <- factor(col, levels=levels, ordered=ordered)
}
ordered <- col.type == "ordered" || isTRUE(current.info$ordered)
col <- factor(col, levels=levels, ordered=ordered)

} else if (col.type=="date") {
col <- as.Date(col)
Expand Down Expand Up @@ -165,7 +182,7 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
if (include.nested || !("data_frame" %in% names(current))) {
col <- altLoadObject(current, project=project)
} else {
new.names[i] <- NA_character_
true.names[i] <- NA_character_
}
} else {
stop("unsupported column type '", col.type, "'")
Expand All @@ -175,13 +192,12 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
}

# Removing nested DFs.
if (!all(keep <- !is.na(new.names))) {
if (!all(keep <- !is.na(true.names))) {
df <- df[,keep,drop=FALSE]
new.names <- new.names[keep]
true.names <- true.names[keep]
}

# Replacing the names with the values at input.
colnames(df) <- new.names
colnames(df) <- true.names

.restoreMetadata(df, mcol.data=info$data_frame$column_data, meta.data=info$data_frame$other_data, project=project)
df
}
25 changes: 18 additions & 7 deletions R/stageBaseList.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
#' @rdname stageBaseList
#' @importFrom rhdf5 h5createFile
#' @importFrom jsonlite toJSON
setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list", .version=2) {
setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list", .version=NULL) {
dir.create(file.path(dir, path), showWarnings=FALSE)

env <- new.env()
Expand All @@ -56,21 +56,32 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
meta[["$schema"]] <- "hdf5_simple_list/v1.json"
meta$hdf5_simple_list <- list(group=dname)

if (is.null(.version)) {
.version <- 3
}

.transform_list_hdf5(x, dir=dir, path=path, fpath=fpath, name=dname, env=env, .version=.version)

if (.version > 1) {
.label_hdf5_group(fpath, dname, uzuki_version="1.2")
.label_hdf5_group(fpath, dname, uzuki_version=paste0("1.", .version))
}

check_list_hdf5(fpath, dname, length(env$collected)) # Check that we did it correctly.

} else {
target <- paste0(path, "/", fname, ".json.gz")

if (is.null(.version)) {
.version <- 2
}

formatted <- .transform_list_json(x, dir=dir, path=path, env=env, .version=.version)

if (.version > 1) {
formatted$version <- "1.2"
formatted$version <- paste0("1.", .version)
}
str <- toJSON(formatted, auto_unbox=TRUE, ident=4, null="null", na="null")

str <- toJSON(formatted, auto_unbox=TRUE, ident=4, null="null", na="null")
fpath <- file.path(dir, target)
con <- gzfile(fpath, open="wb")
write(file=con, str)
Expand Down Expand Up @@ -160,12 +171,12 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list

missing.placeholder <- NULL
if (.version > 1) {
transformed <- transformVectorForHdf5(y)
transformed <- transformVectorForHdf5(y, .version=.version)
y <- transformed$transformed
missing.placeholder <- transformed$placeholder
} else if (is.character(y)) {
if (anyNA(y)) {
missing.placeholder <- chooseMissingPlaceholderForHdf5(y)
missing.placeholder <- chooseMissingPlaceholderForHdf5(y, .version=.version)
y[is.na(y)] <- missing.placeholder
}
}
Expand All @@ -190,7 +201,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list

missing.placeholder <- NULL
if (.version > 1) {
transformed <- transformVectorForHdf5(y)
transformed <- transformVectorForHdf5(y, .version=.version)
y <- transformed$transformed
missing.placeholder <- transformed$placeholder
} else {
Expand Down
Loading

0 comments on commit 1b0adbc

Please sign in to comment.