Updated HDF5 data frame readers/writers for the new specification. (#17)

This uses the more self-contained specification described in takane v0.5.0, where many of the properties in the schema are also duplicated inside HDF5 attributes. The aim is to simplify interpretation of the standalone HDF5 file without the need to constantly refer to information in the schema.
ArtifactDB · Nov 6, 2023 · 1b0adbc · 1b0adbc
1 parent 1c515f3
commit 1b0adbc
Show file tree

Hide file tree

Showing 15 changed files with 739 additions and 452 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -71,6 +71,9 @@ importFrom(S4Vectors,mcols)
 importFrom(S4Vectors,metadata)
 importFrom(jsonlite,fromJSON)
 importFrom(jsonlite,toJSON)
+importFrom(rhdf5,H5Aclose)
+importFrom(rhdf5,H5Acreate)
+importFrom(rhdf5,H5Awrite)
 importFrom(rhdf5,H5Dclose)
 importFrom(rhdf5,H5Dcreate)
 importFrom(rhdf5,H5Dopen)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -5,12 +5,12 @@ check_csv <- function(path, is_compressed, parallel) {
     .Call(`_alabaster_base_check_csv`, path, is_compressed, parallel)
 }
 
-check_csv_df <- function(path, nrows, has_row_names, column_names, column_types, string_formats, factor_levels, df_version, is_compressed, parallel) {
-    .Call(`_alabaster_base_check_csv_df`, path, nrows, has_row_names, column_names, column_types, string_formats, factor_levels, df_version, is_compressed, parallel)
+check_csv_df <- function(path, nrows, has_row_names, column_names, column_types, string_formats, factor_ordered, factor_levels, df_version, is_compressed, parallel) {
+    .Call(`_alabaster_base_check_csv_df`, path, nrows, has_row_names, column_names, column_types, string_formats, factor_ordered, factor_levels, df_version, is_compressed, parallel)
 }
 
-check_hdf5_df <- function(path, name, nrows, has_row_names, column_names, column_types, string_formats, factor_levels, df_version, hdf5_version) {
-    .Call(`_alabaster_base_check_hdf5_df`, path, name, nrows, has_row_names, column_names, column_types, string_formats, factor_levels, df_version, hdf5_version)
+check_hdf5_df <- function(path, name, nrows, has_row_names, column_names, column_types, string_formats, factor_ordered, factor_levels, df_version, hdf5_version) {
+    .Call(`_alabaster_base_check_hdf5_df`, path, name, nrows, has_row_names, column_names, column_types, string_formats, factor_ordered, factor_levels, df_version, hdf5_version)
 }
 
 check_list_hdf5 <- function(file, name, num_external) {
@@ -21,6 +21,18 @@ check_list_json <- function(file, num_external, parallel) {
     .Call(`_alabaster_base_check_list_json`, file, num_external, parallel)
 }
 
+any_actually_numeric_na <- function(x) {
+    .Call(`_alabaster_base_any_actually_numeric_na`, x)
+}
+
+is_actually_numeric_na <- function(x) {
+    .Call(`_alabaster_base_is_actually_numeric_na`, x)
+}
+
+choose_numeric_missing_placeholder <- function(x) {
+    .Call(`_alabaster_base_choose_numeric_missing_placeholder`, x)
+}
+
 load_csv <- function(path, is_compressed, nrecords, parallel) {
     .Call(`_alabaster_base_load_csv`, path, is_compressed, nrecords, parallel)
 }

diff --git a/R/loadDataFrame.R b/R/loadDataFrame.R
@@ -56,49 +56,31 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
             df <- make_zero_col_DFrame(nrow=nrows)
         } else {
             raw <- h5read(path, prefix("data"))
-            version_above_1 <- isTRUE(info$data_frame$version > 1)
-
-            # Replacing placeholders with NAs.
-            for (i in names(raw)) {
-                current <- raw[[i]]
-                if (version_above_1 || is.character(current)) {
-                    attr <- h5readAttributes(path, prefix(paste0("data/", i)))
-                    replace.na <- attr[["missing-value-placeholder"]]
-
-                    restore_min_integer <- function(y) {
-                        if (is.integer(y) && anyNA(y)) { # promote integer NAs back to the actual number.
-                            y <- as.double(y)
-                            y[is.na(y)] <- -2^31
-                        }
-                        y
-                    }
+            df <- vector("list", length(col.info))
 
-                    if (is.null(replace.na)) {
-                        raw[[i]] <- restore_min_integer(current)
-                    } else if (is.na(replace.na) && !is.nan(replace.na)) {
-                        # No-op as the placeholder is already R's NA.
-                    } else if (is.nan(replace.na)) {
-                        raw[[i]][is.nan(current)] <- NA # avoid equality checks to an NaN.
-                    } else {
-                        current <- restore_min_integer(current)
-                        current[which(current == replace.na)] <- NA # Using which() to avoid problems with existing NAs.
-                        raw[[i]] <- current
+            for (i in seq_along(col.info)) {
+                curinfo <- col.info[[i]]
+                d <- as.character(i - 1L) # -1 to get back to 0-based indices.
+                current <- raw[[d]]
+
+                if (is.list(current)) { # Handling factors stored as lists in the new version.
+                    if (curinfo$type != "factor") {
+                        stop("HDF5 groups as columns are only supported for factor columns")
                     }
-                }
-            }
+                    codes <- .repopulate_missing_hdf5(current$codes, path, prefix(paste0("data/", d, "/codes")))
+                    df[[i]] <- factor(current$levels[codes + 1L], current$levels, ordered=isTRUE(curinfo$ordered))
 
-            # Adding placeholders for type:"other".
-            indices <- as.integer(names(raw)) + 1L # get back to 1-based.
-            df <- vector("list", length(col.info))
-            df[indices] <- lapply(raw, as.vector) # remove 1d arrays.
-            for (i in seq_along(df)) {
-                if (is.null(df[[i]])) {
-                    df[[i]] <- logical(nrows) 
+                } else if (!is.null(current)) {
+                    current <- .repopulate_missing_hdf5(current, path, prefix(paste0("data/", d)))
+                    df[[i]] <- as.vector(current) # remove 1d arrays.
+
+                } else {
+                    df[[i]] <- logical(nrows) # placeholders
                 }
             }
 
-            df <- DataFrame(df)
-            colnames(df) <- as.vector(h5read(path, prefix("column_names")))
+            names(df) <- as.vector(h5read(path, prefix("column_names")))
+            df <- DataFrame(df, check.names=FALSE)
         }
         if (has.rownames) {
             rownames(df) <- as.vector(h5read(path, prefix("row_names")))
@@ -114,27 +96,62 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
         } 
     }
 
-    # Make sure everyone is of the right type.
-    new.names <- character(ncol(df))
+    df <- .coerce_df_column_type(df, col.info, project, include.nested=include.nested)
+    .restoreMetadata(df, mcol.data=info$data_frame$column_data, meta.data=info$data_frame$other_data, project=project)
+}
+
+#' @importFrom rhdf5 h5readAttributes
+.repopulate_missing_hdf5 <- function(current, path, name) {
+    attr <- h5readAttributes(path, name)
+    replace.na <- attr[["missing-value-placeholder"]]
+
+    restore_min_integer <- function(y) {
+        if (is.integer(y) && anyNA(y)) { # promote integer NAs back to the actual number.
+            y <- as.double(y)
+            y[is.na(y)] <- -2^31
+        }
+        y
+    }
+
+    if (is.null(replace.na)) {
+        current <- restore_min_integer(current)
+    } else if (is.na(replace.na)) {
+        if (!is.nan(replace.na)) {
+            # No-op as the placeholder is already R's NA of the relevant type.
+        } else { 
+            current[is.nan(current)] <- NA # avoid equality checks to an NaN.
+        }
+    } else {
+        current <- restore_min_integer(current)
+        current[which(current == replace.na)] <- NA # Using which() to avoid problems with existing NAs.
+    }
+
+    current
+}
+
+.coerce_df_column_type <- function(df, col.info, project, include.nested) {
+    stopifnot(length(df) == length(col.info))
+    true.names <- character(length(col.info))
 
     for (i in seq_along(col.info)) {
         current.info <- col.info[[i]]
-        new.names[i] <- current.info$name
-
+        true.names[i] <- current.info$name
         col.type <- current.info$type 
         col <- df[[i]]
 
         if (col.type=="factor" || col.type=="ordered") {
-            level.info <- acquireMetadata(project, current.info$levels$resource$path)
-            levels <- altLoadObject(level.info, project=project)
-            if (is(levels, "DataFrame")) { # account for old objects that store levels as a DF.
-                levels <- levels[,1]
-            }
-            if (is.numeric(col)) {
-                col <- levels[col + 1L]
+            if (!is.factor(col)) { # we may have already transformed the column to a factor, in which case we can skip this.
+                level.info <- acquireMetadata(project, current.info$levels$resource$path)
+                levels <- altLoadObject(level.info, project=project)
+                if (is(levels, "DataFrame")) { # account for old objects that store levels as a DF.
+                    levels <- levels[,1]
+                }
+                if (is.numeric(col)) {
+                    col <- levels[col + 1L]
+                }
+                ordered <- col.type == "ordered" || isTRUE(current.info$ordered)
+                col <- factor(col, levels=levels, ordered=ordered)
             }
-            ordered <- col.type == "ordered" || isTRUE(current.info$ordered)
-            col <- factor(col, levels=levels, ordered=ordered)
 
         } else if (col.type=="date") {
             col <- as.Date(col)
@@ -165,7 +182,7 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
             if (include.nested || !("data_frame" %in% names(current))) {
                 col <- altLoadObject(current, project=project)
             } else {
-                new.names[i] <- NA_character_
+                true.names[i] <- NA_character_
             }
         } else {
             stop("unsupported column type '", col.type, "'")
@@ -175,13 +192,12 @@ loadDataFrame <- function(info, project, include.nested=TRUE, parallel=TRUE) {
     }
 
     # Removing nested DFs.
-    if (!all(keep <- !is.na(new.names))) {
+    if (!all(keep <- !is.na(true.names))) {
         df <- df[,keep,drop=FALSE]
-        new.names <- new.names[keep]
+        true.names <- true.names[keep]
     }
 
-    # Replacing the names with the values at input.
-    colnames(df) <- new.names
+    colnames(df) <- true.names
 
-    .restoreMetadata(df, mcol.data=info$data_frame$column_data, meta.data=info$data_frame$other_data, project=project)
+    df
 }
diff --git a/R/stageBaseList.R b/R/stageBaseList.R
@@ -39,7 +39,7 @@
 #' @rdname stageBaseList
 #' @importFrom rhdf5 h5createFile
 #' @importFrom jsonlite toJSON
-setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list", .version=2) {
+setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list", .version=NULL) {
     dir.create(file.path(dir, path), showWarnings=FALSE)
 
     env <- new.env()
@@ -56,21 +56,32 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
         meta[["$schema"]] <- "hdf5_simple_list/v1.json"
         meta$hdf5_simple_list <- list(group=dname)
 
+        if (is.null(.version)) {
+            .version <- 3
+        }
+
         .transform_list_hdf5(x, dir=dir, path=path, fpath=fpath, name=dname, env=env, .version=.version)
+
         if (.version > 1) {
-            .label_hdf5_group(fpath, dname, uzuki_version="1.2")
+            .label_hdf5_group(fpath, dname, uzuki_version=paste0("1.", .version))
         }
 
         check_list_hdf5(fpath, dname, length(env$collected)) # Check that we did it correctly.
 
     } else {
         target <- paste0(path, "/", fname, ".json.gz")
+
+        if (is.null(.version)) {
+            .version <- 2
+        }
+
         formatted <- .transform_list_json(x, dir=dir, path=path, env=env, .version=.version)
+
         if (.version > 1) {
-            formatted$version <- "1.2"
+            formatted$version <- paste0("1.", .version)
         }
-        str <- toJSON(formatted, auto_unbox=TRUE, ident=4, null="null", na="null")
 
+        str <- toJSON(formatted, auto_unbox=TRUE, ident=4, null="null", na="null")
         fpath <- file.path(dir, target)
         con <- gzfile(fpath, open="wb")
         write(file=con, str)
@@ -160,12 +171,12 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
 
             missing.placeholder <- NULL
             if (.version > 1) {
-                transformed <- transformVectorForHdf5(y)
+                transformed <- transformVectorForHdf5(y, .version=.version)
                 y <- transformed$transformed
                 missing.placeholder <- transformed$placeholder
             } else if (is.character(y)) {
                 if (anyNA(y)) {
-                    missing.placeholder <- chooseMissingPlaceholderForHdf5(y)
+                    missing.placeholder <- chooseMissingPlaceholderForHdf5(y, .version=.version)
                     y[is.na(y)] <- missing.placeholder
                 }
             }
@@ -190,7 +201,7 @@ setMethod("stageObject", "list", function(x, dir, path, child=FALSE, fname="list
 
             missing.placeholder <- NULL
             if (.version > 1) {
-                transformed <- transformVectorForHdf5(y)
+                transformed <- transformVectorForHdf5(y, .version=.version)
                 y <- transformed$transformed
                 missing.placeholder <- transformed$placeholder
             } else {