Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate Python based anndata testfiles #170

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
b46dcbb
Start on generate vector in python
LouiseDck Jun 29, 2024
b9b9685
Eliminate most randomness, check what happens when using np.nan, pd.N…
LouiseDck Jul 1, 2024
40decbf
Generate all manner of matrices
LouiseDck Jul 3, 2024
311f896
bugfix
LouiseDck Jul 4, 2024
b4d8dfd
Generate dataframe
LouiseDck Jul 4, 2024
11850b2
documentation
LouiseDck Jul 4, 2024
11a0a67
generate dict and start of dataset
LouiseDck Jul 4, 2024
185ec3f
Generate dataset
LouiseDck Jul 18, 2024
a13874d
Black formatting
LouiseDck Jul 18, 2024
bd50473
Remove randomness
LouiseDck Jul 18, 2024
9359252
Fix writing pd.NA in uns and then failing to write
LouiseDck Jul 24, 2024
de9767a
Remove dummy-anndata files
LouiseDck Sep 20, 2024
c8c77b1
empty commit to trigger ci
rcannood Jun 27, 2024
f7ce8d0
re-enable matrices with NAs tests in X and layers (#142)
rcannood Jun 27, 2024
a227310
Fixes to `write_h5ad_categorical()` (#155)
lazappi Jul 4, 2024
0a76134
clean up funding
rcannood Jul 5, 2024
71d5a4a
add list as param
rcannood Jul 5, 2024
961c105
Make rownames part of obs and var (#171)
rcannood Jul 8, 2024
42761f6
switch from rhdf5 to hdf5r (#169)
rcannood Jul 18, 2024
9856e72
add dependabot to the repo (#176)
rcannood Aug 15, 2024
fff14f5
bump version requirements for reticulate and rhdf5 (#174)
rcannood Aug 15, 2024
99f77fb
update actions (#181)
rcannood Aug 15, 2024
d46edb6
Tidy user interface #2 (#180)
rcannood Aug 15, 2024
e6f8a33
Verbose h5diff testing
LouiseDck Oct 3, 2024
03eb07e
Add processx to description
LouiseDck Oct 4, 2024
534ee90
Require processx
LouiseDck Oct 4, 2024
3c60c3e
lintr
LouiseDck Oct 4, 2024
0d67580
Merge remote-tracking branch 'origin/main' into dataset-generator
rcannood Oct 17, 2024
2ef2d2b
Start on diffing h5ad files
LouiseDck Nov 6, 2024
b9eb120
Basic matrix tests
LouiseDck Nov 8, 2024
99fd648
Systematise testing
LouiseDck Nov 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ Suggests:
SingleCellExperiment,
SummarizedExperiment,
testthat (>= 3.0.0),
withr
withr,
processx,
VignetteBuilder:
knitr
Config/Needs/website: pkgdown, tibble, knitr, rprojroot, stringr, readr,
Expand Down
59 changes: 34 additions & 25 deletions R/generate_matrix.R
Original file line number Diff line number Diff line change
@@ -1,73 +1,82 @@
# Build a deterministic n_obs x n_vars double matrix holding 0.5, 1.5, 2.5, ...
#
# @param n_obs Number of rows.
# @param n_vars Number of columns.
# @param NAs When TRUE, the top-left cell is replaced by NA_real_.
# @return A base R numeric matrix.
generate_numeric_matrix <- function(n_obs, n_vars, NAs = FALSE) {
  values <- seq(0.5, n_obs * n_vars)
  # Fill row by row to mimic the way a matrix gets filled in Python.
  filled <- matrix(values, nrow = n_obs, ncol = n_vars, byrow = TRUE)
  if (NAs) {
    filled[1, 1] <- NA_real_
  }
  filled
}

# Build a deterministic n_obs x n_vars integer matrix holding 0, 1, 2, ...
#
# @param n_obs Number of rows.
# @param n_vars Number of columns.
# @param NAs When TRUE, the top-left cell is replaced by NA_integer_.
# @return A base R integer matrix.
generate_integer_matrix <- function(n_obs, n_vars, NAs = FALSE) {
  # Exactly n_obs * n_vars values (0 .. N - 1). Counting up to N inclusive
  # would hand matrix() one value too many and trigger a data-length warning;
  # seq_len() also yields an empty vector for a zero-sized matrix.
  values <- seq_len(n_obs * n_vars) - 1L
  # byrow = TRUE to mimic the way a matrix gets filled in Python
  m <- matrix(values, nrow = n_obs, ncol = n_vars, byrow = TRUE)
  if (NAs) {
    m[1, 1] <- NA_integer_
  }
  m
}

# nolint start
# Named list of matrix generators. Every entry is a function(n_obs, n_vars)
# that builds its values with generate_numeric_matrix() or
# generate_integer_matrix() and, where the name says so, converts the result
# with as() to the corresponding Matrix class.
#
# The previous random-value statements (runif / sample.int /
# Matrix::rsparsematrix and the NA striping) are removed: their results were
# discarded or overwritten, so they only cost time and consumed RNG state.
matrix_generators <- list(
  numeric_matrix = function(n_obs, n_vars) {
    generate_numeric_matrix(n_obs, n_vars)
  },
  numeric_dense = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars)
    as(m, "denseMatrix")
  },
  numeric_csparse = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars)
    as(m, "CsparseMatrix")
  },
  numeric_rsparse = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars)
    as(m, "RsparseMatrix")
  },
  numeric_matrix_with_nas = function(n_obs, n_vars) {
    generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
  },
  numeric_dense_with_nas = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "denseMatrix")
  },
  numeric_csparse_with_nas = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "CsparseMatrix")
  },
  numeric_rsparse_with_nas = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "RsparseMatrix")
  },
  integer_matrix = function(n_obs, n_vars) {
    generate_integer_matrix(n_obs, n_vars)
  },
  integer_dense = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars)
    as(m, "denseMatrix")
  },
  integer_csparse = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars)
    as(m, "CsparseMatrix")
  },
  integer_rsparse = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars)
    as(m, "RsparseMatrix")
  },
  integer_matrix_with_nas = function(n_obs, n_vars) {
    generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
  },
  integer_dense_with_nas = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "denseMatrix")
  },
  integer_csparse_with_nas = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "CsparseMatrix")
  },
  integer_rsparse_with_nas = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "RsparseMatrix")
  }
)
Expand Down
3 changes: 3 additions & 0 deletions R/write_h5ad_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,10 @@ write_h5ad_dense_array <- function(value, file, name, compression, version = "0.
}

if (!is.vector(value)) {
# value <- t(value)
value <- t(value)
# value <- matrix(value, nrow = nrow(value), byrow = TRUE)
# value <- t(matrix(value, nrow = nrow(value), byrow = TRUE))
}

# Guess data type
Expand Down
2 changes: 1 addition & 1 deletion R/write_hdf5_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ hdf5_create_dataset <- function(
compression <- match.arg(compression)

if (!is.null(dim(value))) {
dims <- dim(value)
dims <- rev(dim(value))
} else {
dims <- length(value)
}
Expand Down
240 changes: 240 additions & 0 deletions tests/testthat/test-h5diff.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
# skip_if_not_installed("hdf5r")

# requireNamespace("reticulate")
# testthat::skip_if_not(
# reticulate::py_module_available("dummy_anndata"),
# message = "Python dummy_anndata module not available for testing"
# )

# Maps each Python-side matrix type name to the list of R-side matrix type
# names it is compared against.
# Not (yet) listed: "numeric_dense" for float_matrix, "numeric_dense_with_nas"
# for float_matrix_nas (original note: numeric dense does dgematrix) and
# "integer_dense" for integer_matrix.
matrix_equivalences <- list(
  float_matrix = list("numeric_matrix"),
  float_matrix_nas = list("numeric_matrix_with_nas"),
  integer_matrix = list("integer_matrix"),
  float_csparse = list("numeric_csparse"),
  float_csparse_nas = list("numeric_csparse_with_nas"),
  float_rsparse = list("numeric_rsparse"),
  float_rsparse_nas = list("numeric_rsparse_with_nas")
)

# Maps each Python-side vector type name to the list of R-side vector type
# names it is compared against.
vector_equivalence <- list(
  categorical = list("factor"),
  categorical_ordered = list("factor_ordered"),
  categorical_missing_values = list("factor_with_nas"),
  categorical_ordered_missing_values = list("factor_ordered_with_nas"),
  string_array = list("character"),
  dense_array = list("numeric"),
  integer_array = list("integer"),
  boolean_array = list("logical"),
  nullable_integer_array = list("integer_with_nas"),
  nullable_boolean_array = list("logical_with_nas")
)

# The code below needs reticulate with the Python dummy_anndata module, the
# processx package and the `h5diff` command line tool (hdf5-tools). Skip this
# file instead of erroring out when any of them is missing.
skip_if_not_installed("reticulate")
skip_if_not(
  reticulate::py_module_available("dummy_anndata"),
  message = "Python dummy_anndata module not available for testing"
)
skip_if_not_installed("processx")
skip_if(
  !nzchar(Sys.which("h5diff")[[1]]),
  message = "h5diff (hdf5-tools) not available for testing"
)

# Handle to the Python module that generates the reference datasets.
da <- reticulate::import("dummy_anndata")

# Look up an optional argument by name.
#
# @param args Named list of arguments (typically `list(...)`).
# @param name Name of the entry to look up.
# @param falseval Value returned when `name` is not among `names(args)`.
# @return `args[[name]]` when present, otherwise `falseval`.
check_arg <- function(args, name, falseval) {
  if (!(name %in% names(args))) {
    return(falseval)
  }
  args[[name]]
}

# Generate a dataset on the Python side through `da$generate_dataset()`.
#
# @param n_obs,n_vars Passed on positionally to `da$generate_dataset()`.
# @param write When TRUE, the dataset is also written with
#   py_write_dataset() to a temporary file (the path is not returned).
# @param ... Optional overrides: x_type, layer_types, obs_types, var_types,
#   obsm_types, varm_types, obsp_types, varp_types, uns_types,
#   nested_uns_types. Anything not supplied falls back to the default below.
# @return Whatever `da$generate_dataset()` returns.
py_generate_dataset <- function(n_obs, n_vars, write = FALSE, ...) {
  args <- list(...)

  # check_arg() hands the supplied value through untouched. A scalar ifelse()
  # would keep only the first element of a multi-element obs_types/var_types.
  data <- da$generate_dataset(
    n_obs, n_vars,
    x_type = check_arg(args, "x_type", NULL),
    layer_types = check_arg(args, "layer_types", character()),
    obs_types = check_arg(args, "obs_types", list("integer_array")),
    var_types = check_arg(args, "var_types", list("integer_array")),
    obsm_types = check_arg(args, "obsm_types", character()),
    varm_types = check_arg(args, "varm_types", character()),
    obsp_types = check_arg(args, "obsp_types", character()),
    varp_types = check_arg(args, "varp_types", character()),
    uns_types = check_arg(args, "uns_types", character()),
    nested_uns_types = check_arg(args, "nested_uns_types", character())
  )

  if (write) {
    py_write_dataset(data)
  }
  data
}

# Write a Python-side dataset to an .h5ad file.
#
# @param dataset Object exposing a `write_h5ad(path)` method.
# @param file Target path; when NULL a fresh temporary ".h5ad" path is used.
# @return The path that was written to, the same contract as
#   r_write_dataset(), so callers can hand the location to other tools.
py_write_dataset <- function(dataset, file = NULL) {
  if (is.null(file)) {
    file <- tempfile(pattern = "hdf5_write_py_", fileext = ".h5ad")
  }
  dataset$write_h5ad(file)
  # Return the path explicitly: the result of write_h5ad() is not the path.
  file
}

# Generate a dataset on the R side through `generate_dataset()`.
#
# @param n_obs,n_vars Passed on positionally to `generate_dataset()`.
# @param write When TRUE, the dataset is also written with
#   r_write_dataset() to a temporary file (the path is not returned).
# @param ... Optional overrides: x_type, layer_types, obs_types, var_types,
#   obsm_types, varm_types, obsp_types, varp_types, uns_types. Anything not
#   supplied falls back to the default below.
# @return Whatever `generate_dataset(..., format = "AnnData")` returns.
r_generate_dataset <- function(n_obs, n_vars, write = FALSE, ...) {
  args <- list(...)

  # check_arg() hands the supplied value through untouched. A scalar ifelse()
  # would keep only the first element of a multi-element obs_types/var_types.
  data <- generate_dataset(
    n_obs, n_vars,
    x_type = check_arg(args, "x_type", "numeric_matrix"),
    layer_types = check_arg(args, "layer_types", character()),
    obs_types = check_arg(args, "obs_types", "integer"),
    var_types = check_arg(args, "var_types", "integer"),
    obsm_types = check_arg(args, "obsm_types", character()),
    varm_types = check_arg(args, "varm_types", character()),
    obsp_types = check_arg(args, "obsp_types", character()),
    varp_types = check_arg(args, "varp_types", character()),
    uns_types = check_arg(args, "uns_types", character()),
    format = "AnnData"
  )

  if (write) {
    r_write_dataset(data)
  }

  data
}

# Write an R-side dataset to an .h5ad file with write_h5ad().
#
# @param dataset Object accepted by `write_h5ad()`.
# @param file Target path; when NULL a fresh temporary ".h5ad" path is used.
# @return The path that was written to.
r_write_dataset <- function(dataset, file = NULL) {
  target <- file
  if (is.null(target)) {
    target <- tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
  }
  write_h5ad(dataset, target)
  target
}

# Test obs & var
# - there can be vectors in there

# For every Python vector type, write a Python-generated dataset once and
# compare its /obs group, using the external `h5diff` tool, with the file R
# writes for each equivalent R vector type.
for (py_vector in names(vector_equivalence)) {
  data_python <- py_generate_dataset(10L, 20L, obs_types = list(py_vector))
  # Pick the path here so the location handed to h5diff is always known,
  # independent of what the write helper returns.
  py_location <- tempfile(pattern = "hdf5_write_py_", fileext = ".h5ad")
  py_write_dataset(data_python, py_location)

  for (r_vector in vector_equivalence[[py_vector]]) {
    test_that(paste0("h5diff_obs_", py_vector, "_", r_vector), {
      data_r <- r_generate_dataset(10L, 20L, obs_types = list(r_vector))
      r_location <- tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
      r_write_dataset(data_r, r_location)

      # No tryCatch(): errors and failed expectations must reach testthat
      # instead of being downgraded to messages.
      res <- processx::run(
        "h5diff",
        c("-v", py_location, r_location, "/obs"),
        error_on_status = FALSE
      )

      # h5diff exits with status 0 only when it finds no differences; its
      # verbose output is attached to make a failure diagnosable.
      expect_equal(res$status, 0, info = res$stdout)
    })
  }
}



# for (py_matrix in names(matrix_equivalences)) {
# data_python <- py_generate_dataset_only_x(10L, 20L, x_type = py_matrix)
# h5ad_file_py <- tempfile(pattern = "hdf5_write_py_", fileext = ".h5ad")
# data_python$write_h5ad(h5ad_file_py)

# for (r_matrix in matrix_equivalences[[py_matrix]]) {
# test_that(paste0("h5diff_X_", py_matrix, "_", r_matrix), {

# tryCatch({
# data_r <- r_generate_dataset_only_x(10L, 20L, x_type = r_matrix)
# h5ad_file_r <- tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
# write_h5ad(data_r, h5ad_file_r)

# res <- processx::run("h5diff", c("-v", h5ad_file_py, h5ad_file_r, "/X"), error_on_status = FALSE)

# expect_equal(res$status, 0, info = res$stdout)
# }, error = function(e) {
# message("Error: ", e$message)
# message("Python matrix: ", py_matrix)
# message("R matrix: ", r_matrix)

# }, warning = function(w) {
# message("Warning: ", w$message)
# message("Python matrix: ", py_matrix)
# message("R matrix: ", r_matrix)

# })
# })
# }
# }

# Mismatch <- R6::R6Class("Mismatch",
# public = list(
# Rgenerated = NULL,
# Pygenerated = NULL,
# errormsg = NULL,
# initialize = function(message) {
# self$message <- message
# },
# message = NULL
# )
# )

# # test_that("h5diff_X_float", {
# data_r <- r_generate_dataset_only_x(3L, 5L, x_type = "numeric_matrix")
# data_python <- py_generate_dataset_only_x(3L, 5L, x_type = "generate_float_matrix")

# h5ad_file1 <- "hdf5_write_R_testdims_byrow.h5ad" #tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
# h5ad_file2 <- "hdf5_write_py_testdims_byrow.h5ad" #tempfile(pattern = "hdf5_write_py_", fileext = ".h5ad")

# write_h5ad(data_r, h5ad_file1)
# data_python$write_h5ad(h5ad_file2)

# res <- processx::run("h5diff", c("-v", h5ad_file1, h5ad_file2, "/X"), error_on_status = FALSE)

# expect_equal(res$status, 0, info = res$stdout)

# # })


# # # test different matrices in X
# # test_that("h5diff_X_float", {
# # data_r <- r_generate_dataset_only_x(10L, 20L, x_type = "numeric_matrix")
# # data_python <- py_generate_dataset_only_x(10L, 20L, x_type = "generate_float_matrix")

# # h5ad_file1 <- tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
# # h5ad_file2 <- tempfile(pattern = "hdf5_write_py_", fileext = ".h5ad")

# # write_h5ad(data_r, h5ad_file1)
# # data_python$write_h5ad(h5ad_file2)

# # res <- processx::run("h5diff", c("-v", h5ad_file1, h5ad_file2, "/X"), error_on_status = FALSE)

# # expect_equal(res$status, 0, info = res$stdout)

# # })

# data_r <- generate_dataset(10L, 20L, format = "AnnData")

# da <- reticulate::import("dummy_anndata")
# data_python <- da$generate_dataset(10L, 20L)

# h5ad_file1 <- "hdf5_write_R_test3.h5ad" #tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
# h5ad_file2 <- "hdf5_write_py_test3.h5ad" #tempfile(pattern = "hdf5_write_py_", fileext = ".h5ad")

# write_h5ad(data_r, h5ad_file1)
# data_python$write_h5ad(h5ad_file2)
# res <- processx::run("h5diff", c("-v", h5ad_file1, h5ad_file2, "/X"), error_on_status = FALSE)

# expect_equal(res$status, 0, info = res$stdout)

# # test_that("h5diff", {
# # requireNamespace("processx")

# # data_r <- generate_dataset(10L, 20L, format = "AnnData")

# # da <- reticulate::import("dummy_anndata")
# # data_python <- da$generate_dataset(10L, 20L)

# # h5ad_file1 <- "hdf5_write_R_test2.h5ad" #tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
# # h5ad_file2 <- "hdf5_write_py_test2.h5ad" #tempfile(pattern = "hdf5_write_py_", fileext = ".h5ad")

# # write_h5ad(data_r, h5ad_file1)
# # data_python$write_h5ad(h5ad_file2)
# # res <- processx::run("h5diff", c("-v", h5ad_file1, h5ad_file2, "/X"), error_on_status = FALSE)

# # expect_equal(res$status, 0, info = res$stdout)

# # })