diff --git a/R/join_key.R b/R/join_key.R index 9d134c366..b9b3f9b89 100644 --- a/R/join_key.R +++ b/R/join_key.R @@ -2,22 +2,28 @@ #' #' @description `r lifecycle::badge("stable")` #' -#' @details `join_key()` will create a relationship for the variables on a pair -#' of datasets. +#' @description Create a relationship between two datasets, `dataset_1` and `dataset_2`. +#' By default, this function establishes a directed relationship with `dataset_1` as the parent. +#' If `dataset_2` is not specified, the function creates a primary key for `dataset_1`. #' -#' @param dataset_1,dataset_2 (`character(1)`) dataset names. If `dataset_2` is omitted, -#' a primary key for `dataset_1` is created. -#' @param keys (optionally named `character`) where `names(keys)` are columns in `dataset_1` -#' corresponding to columns of `dataset_2` given by the elements of `keys`. +#' @param dataset_1,dataset_2 (`character(1)`) Dataset names. When `dataset_2` is omitted, +#' a primary key for `dataset_1` is created. +#' @param keys (optionally named `character`) Column mapping between the datasets, +#' where `names(keys)` maps columns in `dataset_1` corresponding to columns of +#' `dataset_2` given by the elements of `keys`. #' #' If unnamed, the same column names are used for both datasets. #' #' If any element of the `keys` vector is empty with a non-empty name, then the name is #' used for both datasets. +#' @param directed (`logical(1)`) Flag that indicates whether it should create +#' a parent-child relationship between the datasets.\cr +#' - `TRUE` (default) `dataset_1` is the parent of `dataset_2`; +#' - `FALSE` when the relationship is undirected. #' #' @return object of class `join_key_set` to be passed into `join_keys` function. #' -#' @seealso [join_keys()] +#' @seealso [join_keys()], [parents()] #' #' @export #' @@ -25,10 +31,11 @@ #' join_key("d1", "d2", c("A")) #' join_key("d1", "d2", c("A" = "B")) #' join_key("d1", "d2", c("A" = "B", "C")) -join_key <- function(dataset_1, dataset_2 = dataset_1, keys) { +join_key <- function(dataset_1, dataset_2 = dataset_1, keys, directed = TRUE) { checkmate::assert_string(dataset_1) checkmate::assert_string(dataset_2) checkmate::assert_character(keys, any.missing = FALSE) + checkmate::assert_flag(directed) if (length(keys) > 0) { if (is.null(names(keys))) { @@ -65,6 +72,12 @@ join_key <- function(dataset_1, dataset_2 = dataset_1, keys) { keys <- NULL } + parents <- if (directed && dataset_1 != dataset_2) { + stats::setNames(list(dataset_1), dataset_2) + } else { + list() + } + structure( list( structure( @@ -73,6 +86,7 @@ join_key <- function(dataset_1, dataset_2 = dataset_1, keys) { ) ), names = dataset_1, - class = "join_key_set" + class = "join_key_set", + parents = parents ) } diff --git a/R/join_keys-c.R b/R/join_keys-c.R index 576d028b9..70167b1fb 100644 --- a/R/join_keys-c.R +++ b/R/join_keys-c.R @@ -10,32 +10,22 @@ #' jk, #' join_keys( #' join_key("ds4", keys = c("pk4", "pk4_2")), -#' join_key("ds4", "ds3", c(pk4_2 = "pk3")) +#' join_key("ds3", "ds4", c(pk3 = "pk4_2")) #' ) #' ) c.join_keys <- function(...) { - join_keys_obj <- rlang::list2(...)[[1]] - x <- rlang::list2(...)[-1] - checkmate::assert_multi_class(join_keys_obj, classes = c("join_keys", "join_key_set")) + x <- rlang::list2(...) checkmate::assert_list(x, types = c("join_keys", "join_key_set")) - # Ensure base object has correct class when called from c.join_key_set - join_keys_obj <- join_keys(join_keys_obj) - - x_merged <- Reduce( + Reduce( init = join_keys(), x = x, f = function(.x, .y) { - assert_compatible_keys2(.x, .y) out <- utils::modifyList(.x, .y, keep.null = FALSE) - attr(out, "parents") <- .merge_parents(.x, .y) + parents(out) <- utils::modifyList(attr(.x, "parents"), attr(.y, "parents"), keep.null = FALSE) out } ) - - out <- utils::modifyList(join_keys_obj, x_merged, keep.null = FALSE) - attr(out, "parents") <- .merge_parents(join_keys_obj, x_merged) - out } #' @rdname join_keys @@ -50,26 +40,8 @@ c.join_keys <- function(...) { #' jk_merged <- c( #' jk_merged, #' join_key("ds5", keys = "pk5"), -#' join_key("ds5", "ds1", c(pk5 = "pk1")) +#' join_key("ds1", "ds5", c(pk1 = "pk5")) #' ) c.join_key_set <- function(...) { c.join_keys(...) } - -#' Merge parents for 2 `join_keys` object -#' -#' @param x,y (`join_keys`) objects to merge their parents -#' -#' @return a list with parents merged from 2 `join_keys`. Not the object itself. -#' @keywords internal -.merge_parents <- function(x, y) { - x_parent <- list() - y_parent <- list() - if (length(attr(x, "parents"))) { - x_parent <- attr(x, "parents") - } - if (length(attr(y, "parents"))) { - y_parent <- attr(y, "parents") - } - utils::modifyList(x_parent, y_parent, keep.null = FALSE) -} diff --git a/R/join_keys-extract.R b/R/join_keys-extract.R index 9b0d1706f..987b8ef9d 100644 --- a/R/join_keys-extract.R +++ b/R/join_keys-extract.R @@ -117,6 +117,10 @@ #' @rdname join_keys #' @order 2 #' +#' @param directed (`logical(1)`) Flag that indicates whether it should create +#' a parent-child relationship between the datasets.\cr +#' - `TRUE` (default) `dataset_1` is the parent of `dataset_2`; +#' - `FALSE` when the relationship is undirected. #' @section Functions: #' - `x[i, j] <- value`: Assignment of a key to pair `(i, j)`. #' - `x[i] <- value`: This (without `j` parameter) **is not** a supported @@ -134,12 +138,13 @@ #' #' # Setting a single relationship pair --- #' -#' jk["ds4", "ds1"] <- c("pk4" = "pk1") +#' jk["ds1", "ds4"] <- c("pk1" = "pk4") #' #' # Removing a key --- #' #' jk["ds5", "ds5"] <- NULL -`[<-.join_keys` <- function(x, i, j, value) { +`[<-.join_keys` <- function(x, i, j, directed = TRUE, value) { + checkmate::assert_flag(directed) if (missing(i) || missing(j)) { stop("join_keys[i, j] specify both indices to set a key pair.") } else if (!missing(i) && is.null(i) || !missing(j) && is.null(j)) { @@ -163,8 +168,13 @@ ) } - x[[i]][[j]] <- value - x + # Handle join key removal separately + if (is.null(value)) { + x[[i]][[j]] <- NULL + return(x) + } + + c(x, join_key(i, j, value, directed)) } #' @noRd @@ -234,9 +244,14 @@ # Remove classes to use list-based get/assign operations new_x <- unclass(x) - # In case a pair is removed, also remove the symmetric pair + # In case a pair is removed, also remove the symmetric pair and update parents removed_names <- setdiff(names(new_x[[i]]), names(norm_value)) - for (.x in removed_names) new_x[[.x]][[i]] <- NULL + for (.x in removed_names) { + if (identical(parent(x, .x), i)) attr(new_x, "parents")[[.x]] <- NULL + if (identical(parent(x, i), .x)) attr(new_x, "parents")[[i]] <- NULL + + new_x[[.x]][[i]] <- NULL + } new_x[[i]] <- norm_value diff --git a/R/join_keys-parents.R b/R/join_keys-parents.R index c48376e32..7ae70b68b 100644 --- a/R/join_keys-parents.R +++ b/R/join_keys-parents.R @@ -78,6 +78,9 @@ parents.teal_data <- function(x) { new_parents <- list() for (dataset in names(value)) { + # Custom .var.name so it is verbose and helpful for users + checkmate::assert_string(value[[dataset]], .var.name = sprintf("value[[\"%s\"]]", dataset)) + parent <- new_parents[[dataset]] checkmate::assert( checkmate::check_null(parent), @@ -86,7 +89,8 @@ parents.teal_data <- function(x) { length(value[[dataset]]) == 0 ), checkmate::check_true(parent == value[[dataset]]), - "Please check the difference between provided datasets parents and provided join_keys parents." + "Please check the difference between provided datasets parents and provided join_keys parents.", + .var.name = "value" ) if (is.null(parent)) { new_parents[[dataset]] <- value[[dataset]] diff --git a/R/join_keys-print.R b/R/join_keys-print.R index f39fef86d..ad231fd78 100644 --- a/R/join_keys-print.R +++ b/R/join_keys-print.R @@ -8,7 +8,6 @@ format.join_keys <- function(x, ...) { names <- union(names_sorted, names(x)) x_implicit <- update_keys_given_parents(x) out <- lapply(names, function(i) { - this_parent <- my_parents[[i]] out_i <- lapply(union(i, names(x[[i]])), function(j) { direction <- if (identical(my_parents[[j]], i)) { " <-- " diff --git a/R/join_keys-utils.R b/R/join_keys-utils.R index c5289864e..0696610bd 100644 --- a/R/join_keys-utils.R +++ b/R/join_keys-utils.R @@ -110,47 +110,30 @@ update_keys_given_parents <- function(x) { checkmate::assert_class(jk, "join_keys", .var.name = checkmate::vname(x)) datanames <- names(jk) - duplicate_pairs <- list() - for (d1 in datanames) { - d1_pk <- jk[[d1]][[d1]] + for (d1_ix in seq_along(datanames)) { + d1 <- datanames[[d1_ix]] d1_parent <- parent(jk, d1) - for (d2 in datanames) { - if (paste(d2, d1) %in% duplicate_pairs) { - next - } + for (d2 in datanames[-1 * seq.int(d1_ix)]) { if (length(jk[[d1]][[d2]]) == 0) { d2_parent <- parent(jk, d2) - d2_pk <- jk[[d2]][[d2]] - - fk <- if (identical(d1, d2_parent)) { - # first is parent of second -> parent keys -> first keys - d1_pk - } else if (identical(d1_parent, d2)) { - # second is parent of first -> parent keys -> second keys - d2_pk - } else if (identical(d1_parent, d2_parent) && length(d1_parent) > 0) { - # both has the same parent -> common keys to parent - keys_d1_parent <- sort(jk[[d1]][[d1_parent]]) - keys_d2_parent <- sort(jk[[d2]][[d2_parent]]) - - common_ix_1 <- unname(keys_d1_parent) %in% unname(keys_d2_parent) - common_ix_2 <- unname(keys_d2_parent) %in% unname(keys_d1_parent) - - if (all(!common_ix_1)) { - # No common keys between datasets - leave empty - next - } - - structure( - names(keys_d2_parent)[common_ix_2], - names = names(keys_d1_parent)[common_ix_1] - ) - } else { - # cant find connection - leave empty - next - } + + if (!identical(d1_parent, d2_parent) || length(d1_parent) == 0) next + + # both has the same parent -> common keys to parent + keys_d1_parent <- sort(jk[[d1]][[d1_parent]]) + keys_d2_parent <- sort(jk[[d2]][[d2_parent]]) + + common_ix_1 <- unname(keys_d1_parent) %in% unname(keys_d2_parent) + common_ix_2 <- unname(keys_d2_parent) %in% unname(keys_d1_parent) + + # No common keys between datasets - leave empty + if (all(!common_ix_1)) next + + fk <- structure( + names(keys_d2_parent)[common_ix_2], + names = names(keys_d1_parent)[common_ix_1] + ) jk[[d1]][[d2]] <- fk # mutate join key - duplicate_pairs <- append(duplicate_pairs, paste(d1, d2)) } } } diff --git a/R/join_keys.R b/R/join_keys.R index 860b97345..a48c8452e 100644 --- a/R/join_keys.R +++ b/R/join_keys.R @@ -6,13 +6,15 @@ #' join_keys(...) #' #' @description -#' `join_keys()` facilitates the creation and retrieval of relationships between datasets. -#' `join_keys` class extends a list and contains keys connecting pairs of datasets. Each element -#' of the list contains keys for specific dataset. Each dataset can have a relationship with -#' itself (primary key) and with other datasets. +#' Facilitates the creation and retrieval of relationships between datasets. +#' `join_keys` class extends `list` and contains keys connecting pairs of datasets. +#' Each element of the list contains keys for specific dataset. +#' Each dataset can have a relationship with itself (primary key) and with other datasets. #' -#' Note that `join_keys` list is symmetrical, that is, when keys are set between `dat1` and `dat2` it -#' is automatically mirrored between `dat2` and `dat1`. +#' Note that `join_keys` list is symmetrical and assumes a default direction, that is: +#' when keys are set between `ds1` and `ds2`, it defines `ds1` as the parent +#' in a parent-child relationship and the mapping is automatically mirrored between +#' `ds2` and `ds1`. #' #' @section Methods (by class): #' - `join_keys()`: Returns an empty `join_keys` object when called without arguments. @@ -24,6 +26,12 @@ #' either `teal_data` or `join_keys` to extract `join_keys`, \cr #' or any number of `join_key_set` objects to create `join_keys`, \cr #' or nothing to create an empty `join_keys` +#' @param value For `x[i, j, directed = TRUE)] <- value` (named/unnamed `character`) +#' Column mapping between datasets. +#' +#' For `join_keys(x) <- value`: (`join_key_set` or list of `join_key_set`) relationship +#' pairs to add to `join_keys` list. +#' #' #' @return `join_keys` object. #' @@ -41,8 +49,8 @@ #' join_key("ds1", "ds1", "pk1"), #' join_key("ds2", "ds2", "pk2"), #' join_key("ds3", "ds3", "pk3"), -#' join_key("ds2", "ds1", c(pk2 = "pk1")), -#' join_key("ds3", "ds1", c(pk3 = "pk1")) +#' join_key("ds1", "ds2", c(pk1 = "pk2")), +#' join_key("ds1", "ds3", c(pk1 = "pk3")) #' ) #' #' jk @@ -91,8 +99,6 @@ join_keys.teal_data <- function(...) { #' @param x (`join_keys`) empty object to set the new relationship pairs. #' `x` is typically an object of `join_keys` class. When called with the `join_keys(x)` #' or `join_keys(x) <- value` then it can also take a supported class (`teal_data`, `join_keys`) -#' @param value (`join_key_set` or list of `join_key_set`) relationship pairs to add -#' to `join_keys` list. #' #' @export `join_keys<-` <- function(x, value) { @@ -114,8 +120,8 @@ join_keys.teal_data <- function(...) { #' join_keys(obj)["ds1", "ds1"] <- "pk1" #' join_keys(obj)["ds2", "ds2"] <- "pk2" #' join_keys(obj)["ds3", "ds3"] <- "pk3" -#' join_keys(obj)["ds2", "ds1"] <- c(pk2 = "pk1") -#' join_keys(obj)["ds3", "ds1"] <- c(pk3 = "pk1") +#' join_keys(obj)["ds1", "ds2"] <- c(pk1 = "pk2") +#' join_keys(obj)["ds1", "ds3"] <- c(pk1 = "pk3") #' #' identical(jk, join_keys(obj)) `join_keys<-.join_keys` <- function(x, value) { diff --git a/inst/WORDLIST b/inst/WORDLIST index 59eaa668b..a30fc4043 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,10 +1,11 @@ Forkers -Getter -Hoffmann -Reproducibility formatters funder +Getter getter +Hoffmann pre repo +Reproducibility reproducibility +undirected diff --git a/man/dot-merge_parents.Rd b/man/dot-merge_parents.Rd deleted file mode 100644 index f558f28e2..000000000 --- a/man/dot-merge_parents.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/join_keys-c.R -\name{.merge_parents} -\alias{.merge_parents} -\title{Merge parents for 2 \code{join_keys} object} -\usage{ -.merge_parents(x, y) -} -\arguments{ -\item{x, y}{(\code{join_keys}) objects to merge their parents} -} -\value{ -a list with parents merged from 2 \code{join_keys}. Not the object itself. -} -\description{ -Merge parents for 2 \code{join_keys} object -} -\keyword{internal} diff --git a/man/get_join_keys.Rd b/man/get_join_keys.Rd index 6f8c78594..70449feb1 100644 --- a/man/get_join_keys.Rd +++ b/man/get_join_keys.Rd @@ -12,7 +12,7 @@ get_join_keys(data, dataset_1, dataset_2 = NULL) <- value \arguments{ \item{data}{`` - object to extract the join keys} -\item{dataset_1, dataset_2}{(\code{character(1)}) dataset names. If \code{dataset_2} is omitted, +\item{dataset_1, dataset_2}{(\code{character(1)}) Dataset names. When \code{dataset_2} is omitted, a primary key for \code{dataset_1} is created.} \item{value}{value to assign} diff --git a/man/join_key.Rd b/man/join_key.Rd index f31fe1f0f..012cca15f 100644 --- a/man/join_key.Rd +++ b/man/join_key.Rd @@ -4,29 +4,37 @@ \alias{join_key} \title{Create a relationship between a pair of datasets} \usage{ -join_key(dataset_1, dataset_2 = dataset_1, keys) +join_key(dataset_1, dataset_2 = dataset_1, keys, directed = TRUE) } \arguments{ -\item{dataset_1, dataset_2}{(\code{character(1)}) dataset names. If \code{dataset_2} is omitted, +\item{dataset_1, dataset_2}{(\code{character(1)}) Dataset names. When \code{dataset_2} is omitted, a primary key for \code{dataset_1} is created.} -\item{keys}{(optionally named \code{character}) where \code{names(keys)} are columns in \code{dataset_1} -corresponding to columns of \code{dataset_2} given by the elements of \code{keys}. +\item{keys}{(optionally named \code{character}) Column mapping between the datasets, +where \code{names(keys)} maps columns in \code{dataset_1} corresponding to columns of +\code{dataset_2} given by the elements of \code{keys}. If unnamed, the same column names are used for both datasets. If any element of the \code{keys} vector is empty with a non-empty name, then the name is used for both datasets.} + +\item{directed}{(\code{logical(1)}) Flag that indicates whether it should create +a parent-child relationship between the datasets.\cr +\itemize{ +\item \code{TRUE} (default) \code{dataset_1} is the parent of \code{dataset_2}; +\item \code{FALSE} when the relationship is undirected. +}} } \value{ object of class \code{join_key_set} to be passed into \code{join_keys} function. } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#stable}{\figure{lifecycle-stable.svg}{options: alt='[Stable]'}}}{\strong{[Stable]}} -} -\details{ -\code{join_key()} will create a relationship for the variables on a pair -of datasets. + +Create a relationship between two datasets, \code{dataset_1} and \code{dataset_2}. +By default, this function establishes a directed relationship with \code{dataset_1} as the parent. +If \code{dataset_2} is not specified, the function creates a primary key for \code{dataset_1}. } \examples{ join_key("d1", "d2", c("A")) @@ -34,5 +42,5 @@ join_key("d1", "d2", c("A" = "B")) join_key("d1", "d2", c("A" = "B", "C")) } \seealso{ -\code{\link[=join_keys]{join_keys()}} +\code{\link[=join_keys]{join_keys()}}, \code{\link[=parents]{parents()}} } diff --git a/man/join_keys.Rd b/man/join_keys.Rd index 2fcc03a5f..c3c2bde37 100644 --- a/man/join_keys.Rd +++ b/man/join_keys.Rd @@ -28,7 +28,7 @@ join_keys(...) \method{[}{join_keys}(x, i, j) -\method{[}{join_keys}(x, i, j) <- value +\method{[}{join_keys}(x, i, j, directed = TRUE) <- value \method{c}{join_keys}(...) @@ -57,20 +57,34 @@ or \code{join_keys(x) <- value} then it can also take a supported class (\code{t \item{i, j}{indices specifying elements to extract or replace. Index should be a a character vector, but it can also take numeric, logical, \code{NULL} or missing.} -\item{value}{(\code{join_key_set} or list of \code{join_key_set}) relationship pairs to add -to \code{join_keys} list.} +\item{directed}{(\code{logical(1)}) Flag that indicates whether it should create +a parent-child relationship between the datasets.\cr +\itemize{ +\item \code{TRUE} (default) \code{dataset_1} is the parent of \code{dataset_2}; +\item \code{FALSE} when the relationship is undirected. +}} + +\item{value}{For \verb{x[i, j, directed = TRUE)] <- value} (named/unnamed \code{character}) +Column mapping between datasets. + +For \code{join_keys(x) <- value}: (\code{join_key_set} or list of \code{join_key_set}) relationship +pairs to add to \code{join_keys} list. + +[i, j, directed = TRUE)]: R:i,\%20j,\%20directed\%20=\%20TRUE)} } \value{ \code{join_keys} object. } \description{ -\code{join_keys()} facilitates the creation and retrieval of relationships between datasets. -\code{join_keys} class extends a list and contains keys connecting pairs of datasets. Each element -of the list contains keys for specific dataset. Each dataset can have a relationship with -itself (primary key) and with other datasets. - -Note that \code{join_keys} list is symmetrical, that is, when keys are set between \code{dat1} and \code{dat2} it -is automatically mirrored between \code{dat2} and \code{dat1}. +Facilitates the creation and retrieval of relationships between datasets. +\code{join_keys} class extends \code{list} and contains keys connecting pairs of datasets. +Each element of the list contains keys for specific dataset. +Each dataset can have a relationship with itself (primary key) and with other datasets. + +Note that \code{join_keys} list is symmetrical and assumes a default direction, that is: +when keys are set between \code{ds1} and \code{ds2}, it defines \code{ds1} as the parent +in a parent-child relationship and the mapping is automatically mirrored between +\code{ds2} and \code{ds1}. } \section{Methods (by class)}{ @@ -115,8 +129,8 @@ jk <- join_keys( join_key("ds1", "ds1", "pk1"), join_key("ds2", "ds2", "pk2"), join_key("ds3", "ds3", "pk3"), - join_key("ds2", "ds1", c(pk2 = "pk1")), - join_key("ds3", "ds1", c(pk3 = "pk1")) + join_key("ds1", "ds2", c(pk1 = "pk2")), + join_key("ds1", "ds3", c(pk1 = "pk3")) ) jk @@ -138,7 +152,7 @@ jk["ds5", "ds5"] <- "pk5" # Setting a single relationship pair --- -jk["ds4", "ds1"] <- c("pk4" = "pk1") +jk["ds1", "ds4"] <- c("pk1" = "pk4") # Removing a key --- @@ -150,7 +164,7 @@ jk_merged <- c( jk, join_keys( join_key("ds4", keys = c("pk4", "pk4_2")), - join_key("ds4", "ds3", c(pk4_2 = "pk3")) + join_key("ds3", "ds4", c(pk3 = "pk4_2")) ) ) @@ -159,7 +173,7 @@ jk_merged <- c( jk_merged <- c( jk_merged, join_key("ds5", keys = "pk5"), - join_key("ds5", "ds1", c(pk5 = "pk1")) + join_key("ds1", "ds5", c(pk1 = "pk5")) ) # Assigning keys via join_keys(x)[i, j] <- value ---- @@ -171,8 +185,8 @@ obj <- teal_data() join_keys(obj)["ds1", "ds1"] <- "pk1" join_keys(obj)["ds2", "ds2"] <- "pk2" join_keys(obj)["ds3", "ds3"] <- "pk3" -join_keys(obj)["ds2", "ds1"] <- c(pk2 = "pk1") -join_keys(obj)["ds3", "ds1"] <- c(pk3 = "pk1") +join_keys(obj)["ds1", "ds2"] <- c(pk1 = "pk2") +join_keys(obj)["ds1", "ds3"] <- c(pk1 = "pk3") identical(jk, join_keys(obj)) diff --git a/tests/testthat/test-join_keys-c.R b/tests/testthat/test-join_keys-c.R index 234a8fe69..bbb503e0e 100644 --- a/tests/testthat/test-join_keys-c.R +++ b/tests/testthat/test-join_keys-c.R @@ -78,38 +78,6 @@ testthat::test_that("c.join_keys doesn't throw when second object is empty join_ testthat::expect_no_error(c(x, y)) }) -testthat::test_that("c.join_keys throws on conflicting join_keys_set objects", { - obj <- join_keys() - testthat::expect_error( - c( - obj, - join_keys(join_key("a", "b", "aa")), - join_keys(join_key("b", "a", "bb")) - ), - "cannot specify multiple different join keys between datasets" - ) - - testthat::expect_error( - c( - obj, - join_key("a", "b", "aa"), - join_key("b", "a", "bb") - ), - "cannot specify multiple different join keys between datasets" - ) -}) - -testthat::test_that("c.join_key_set throws on conflicting join_keys_set objects", { - testthat::expect_error( - c( - join_key("a", "b", "aa"), - join_key("a", "b", "ca"), - join_key("a", "b", "cc") - ), - "cannot specify multiple different join keys between datasets" - ) -}) - testthat::test_that("c.join_key_set merges with empty and non-empty parents", { jk1 <- join_keys( join_key("d1", "d1", "a") @@ -118,25 +86,19 @@ testthat::test_that("c.join_key_set merges with empty and non-empty parents", { jk2 <- join_keys( join_key("d3", "d3", "c"), join_key("d4", "d4", "d"), - join_key("d4", "d3", "cd") + join_key("d3", "d4", "cd") ) - parents(jk2) <- list(d3 = "d4") expected <- join_keys( join_key("d1", "d1", "a"), join_key("d3", "d3", "c"), join_key("d4", "d4", "d"), - join_key("d3", "d4", "cd") - ) - parents(expected) <- list(d3 = "d4") - - testthat::expect_identical( - c(jk1, jk2), - expected + join_key("d3", "d4", "cd", directed = FALSE) ) + parents(expected) <- list(d4 = "d3") testthat::expect_equal( - c(jk2, jk1), + c(jk1, jk2), expected ) }) @@ -147,19 +109,18 @@ testthat::test_that("c.join_key_set merges parents also", { join_key("d2", "d2", "b"), join_key("d1", "d2", "ab") ) - parents(jk1) <- list(d1 = "d2") jk2 <- join_key("d3", "d3", "c") expected <- join_keys( join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), - join_key("d1", "d2", "ab"), + join_key("d1", "d2", "ab", directed = FALSE), join_key("d3", "d3", "c") ) - parents(expected) <- list(d1 = "d2") + parents(expected) <- list(d2 = "d1") testthat::expect_equal( - c(jk2, jk1), + c(jk1, jk2), expected ) }) @@ -170,23 +131,22 @@ testthat::test_that("c.join_keys merges parents also", { join_key("d2", "d2", "b"), join_key("d1", "d2", "ab") ) - parents(jk1) <- list(d1 = "d2") + jk2 <- join_keys( join_key("d3", "d3", "c"), join_key("d4", "d4", "d"), - join_key("d4", "d3", "cd") + join_key("d3", "d4", "cd") ) - parents(jk2) <- list(d3 = "d4") expected <- join_keys( join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), - join_key("d1", "d2", "ab"), + join_key("d1", "d2", "ab", directed = FALSE), join_key("d3", "d3", "c"), join_key("d4", "d4", "d"), - join_key("d3", "d4", "cd") + join_key("d3", "d4", "cd", directed = FALSE) ) - parents(expected) <- list(d1 = "d2", d3 = "d4") + parents(expected) <- list(d2 = "d1", d4 = "d3") testthat::expect_identical( c(jk1, jk2), @@ -201,26 +161,30 @@ testthat::test_that("c.join_keys merges existing parents are overwritten", { join_key("d3", "d3", "c"), join_key("d4", "d4", "d"), join_key("d1", "d2", "ab"), - join_key("d4", "d3", "cd") + join_key("d3", "d4", "cd") ) - parents(jk1) <- list(d1 = "d2", d3 = "d4") jk2 <- join_keys( join_key("d2", "d2", "b"), - join_key("d3", "d2", "cb") + join_key("d2", "d3", "cb") ) - parents(jk2) <- list(d3 = "d2") expected <- join_keys( join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), join_key("d3", "d3", "c"), join_key("d4", "d4", "d"), - join_key("d1", "d2", "ab"), - join_key("d4", "d3", "cd"), - join_key("d3", "d2", "cb") + join_key("d1", "d2", "ab", directed = FALSE), + join_key("d3", "d4", "cd", directed = FALSE), + join_key("d2", "d3", "cb", directed = FALSE) ) - parents(expected) <- list(d1 = "d2", d3 = "d2") + parents(expected) <- list(d2 = "d1", d3 = "d2", d4 = "d3") + + testthat::expect_equal(c(jk1, jk2), expected) +}) - testthat::expect_identical(c(jk1, jk2), expected) +testthat::test_that("c.join_keys throws error when merge produces acyclical graph", { + jk1 <- join_keys(join_key("d1", "d2", "a")) + jk2 <- join_keys(join_key("d2", "d1", "a")) + expect_error(c(jk1, jk2), "Cycle detected in a parent and child dataset graph") }) diff --git a/tests/testthat/test-join_keys-extract.R b/tests/testthat/test-join_keys-extract.R index 10d3ce850..d7e21c40e 100644 --- a/tests/testthat/test-join_keys-extract.R +++ b/tests/testthat/test-join_keys-extract.R @@ -55,17 +55,15 @@ testthat::test_that("join_keys[i] returns join_keys object for given dataset inc join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), join_key("d3", "d3", "c"), - join_key("d2", "d1", "ab"), - join_key("d3", "d1", "ac") + join_key("d1", "d2", "ab"), + join_key("d1", "d3", "ac") ) - parents(my_keys) <- list("d2" = "d1", "d3" = "d1") expected <- join_keys( join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), - join_key("d2", "d1", "ab") + join_key("d1", "d2", "ab") ) - parents(expected) <- list("d2" = "d1") testthat::expect_equal(my_keys["d2"], expected) }) @@ -75,17 +73,15 @@ testthat::test_that("join_keys[i] returns join_keys object for given dataset and join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), join_key("d3", "d3", "c"), - join_key("d2", "d1", "ab"), - join_key("d3", "d1", "ac") + join_key("d1", "d2", "ab"), + join_key("d1", "d3", "ac") ) - parents(my_keys) <- list("d2" = "d1", "d3" = "d1") expected <- join_keys( join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), - join_key("d2", "d1", "ab") + join_key("d1", "d2", "ab") ) - parents(expected) <- list("d2" = "d1") testthat::expect_equal(my_keys["d2"], expected) }) @@ -99,11 +95,14 @@ testthat::test_that("join_keys[i] ignores duplicate indexes - return only first jk <- join_keys( join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), - join_key("d3", "d2", "b") + join_key("d2", "d3", "b") ) testthat::expect_equal( jk[c("d1", "d2", "d1")], - join_keys(join_key("d1", "d1", "a"), join_key("d2", "d2", "b")) + join_keys( + join_key("d1", "d1", "a"), + join_key("d2", "d2", "b") + ) ) }) @@ -160,10 +159,9 @@ testthat::test_that( join_key("a", "a", "aa"), join_key("b", "b", "bb"), join_key("c", "c", "cc"), - join_key("b", "a", c(child = "a1")), - join_key("c", "a", c(child = "a2")) + join_key("a", "b", c("a1" = "aa")), + join_key("a", "c", c("a2" = "aa")) ) - parents(my_keys) <- list("b" = "a", "c" = "a") testthat::expect_null(my_keys["b", "c"]) } ) @@ -175,12 +173,11 @@ testthat::test_that( join_key("a", "a", "aa"), join_key("b", "b", "bb"), join_key("c", "c", "cc"), - join_key("b", "a", "child-parent"), - join_key("c", "a", "child-parent"), - join_key("d", "b", "grandchild-child"), - join_key("e", "c", "grandchild-child") + join_key("a", "b", "child-parent"), + join_key("a", "c", "child-parent"), + join_key("b", "d", "grandchild-child"), + join_key("c", "e", "grandchild-child") ) - parents(my_keys) <- list("b" = "a", "c" = "a", "d" = "b", "e" = "c") testthat::expect_null(my_keys["d", "e"]) } ) @@ -192,10 +189,9 @@ testthat::test_that( join_key("a", "a", "aa"), join_key("b", "b", "bb"), join_key("c", "c", "cc"), - join_key("b", "a", c(bb = "aa")), - join_key("c", "a", c(cc = "aa")) + join_key("a", "b", c("aa" = "bb")), + join_key("a", "c", c("aa" = "cc")) ) - parents(my_keys) <- list("b" = "a", "c" = "a") # "bb" and "cc" are the names in child datasets, "aa" is the name in parent dataset testthat::expect_identical(my_keys["b", "c"], c(bb = "cc")) } @@ -264,6 +260,23 @@ testthat::test_that("join_keys[i,j]<- throws when i or j are longer than 1", { testthat::expect_error(my_keys["a", c("a", "b")] <- "new key") }) +testthat::test_that("join_keys[i,j]<- removes keys with NULL", { + my_keys <- join_keys( + join_key("d1", "d1", "A"), + join_key("d2", "d2", "B"), + join_key("d1", "d2", c("A" = "B")) + ) + my_keys["d2", "d1"] <- NULL + + testthat::expect_equal( + my_keys, + join_keys( + join_key("d1", "d1", "A"), + join_key("d2", "d2", "B") + ) + ) +}) + # [[<-.join_keys ------------------------------------------------ testthat::test_that("[[<-.join_keys accepts named list where each containing character", { jk <- join_keys() @@ -313,15 +326,22 @@ testthat::test_that("[[<-.join_keys assigning NULL drops a key", { testthat::expect_null(jk[["d1"]]) }) -testthat::test_that("[[<-.join_keys adds symmetrical change to the foreign dataset", { +testthat::test_that("[[<-.join_keys adds symmetrical change without parents to the foreign dataset", { jk <- join_keys() jk[["d1"]][["d2"]] <- c("A" = "B", "C" = "C") testthat::expect_equal( jk, - join_keys( - join_key("d1", "d2", c("A" = "B", "C" = "C")), - join_key("d2", "d1", c("B" = "A", "C" = "C")) + structure( + list( + d1 = list( + d2 = c(c("A" = "B", "C" = "C")) + ), + d2 = list( + d1 = c("B" = "A", "C" = "C") + ) + ), + class = c("join_keys", "list") ) ) }) @@ -339,7 +359,7 @@ testthat::test_that("[[<- mutating non-existing keys adds them", { my_keys, join_keys( join_key("d1", "d2", "A"), - join_key("d2", "d3", "B") + join_key("d2", "d3", "B", directed = FALSE) # [[<- doesn't set parent ) ) }) @@ -373,21 +393,17 @@ testthat::test_that("[[<-.join_keys removes keys with NULL", { ) }) -testthat::test_that("[[<-.join_keys removes keys with NULL and applies summetrical changes", { +testthat::test_that("[[<-.join_keys removes keys with NULL and applies symmetrical changes", { my_keys <- join_keys( join_key("d1", "d2", "A"), - join_key("d2", "d1", "A"), - join_key("d2", "d3", "B"), - join_key("d3", "d2", "B") + join_key("d2", "d3", "B") ) my_keys[["d1"]][["d2"]] <- NULL - testthat::expect_identical( + + expect_equal( my_keys, - join_keys( - join_key("d2", "d3", "B"), - join_key("d3", "d2", "B") - ) + join_keys(join_key("d2", "d3", "B")) ) }) @@ -422,5 +438,5 @@ testthat::test_that("[[<-.join_keys fails when provided foreign key pairs for sa testthat::test_that("[[<-.join_keys allows when provided foreign key pairs for same datasets and same keys", { jk <- join_keys() testthat::expect_silent(jk[["ds1"]] <- list(ds2 = "new", ds2 = c("new" = "new"))) - testthat::expect_equal(jk, join_keys(join_key("ds1", "ds2", "new"))) + testthat::expect_equal(jk, join_keys(join_key("ds1", "ds2", "new", directed = FALSE))) }) diff --git a/tests/testthat/test-join_keys-names.R b/tests/testthat/test-join_keys-names.R index 53ca3cf07..82813ca99 100644 --- a/tests/testthat/test-join_keys-names.R +++ b/tests/testthat/test-join_keys-names.R @@ -22,20 +22,19 @@ testthat::test_that("names<-.join_keys will replace names at all levels of the j testthat::test_that("names<-.join_keys will replace names at all levels of the join_keys list when parents set", { jk <- join_keys( join_key("a", "a", "a"), - join_key("b", "a", "ba"), - join_key("c", "a", "ca"), - join_key("d", "b", "db") + join_key("a", "b", "ba"), + join_key("a", "c", "ca"), + join_key("b", "d", "db") ) - parents(jk) <- list(b = "a", c = "a", d = "b") expected <- join_keys( join_key("a", "a", "a"), - join_key("B", "a", "ba"), - join_key("c", "a", "ca"), - join_key("d", "B", "db") + join_key("B", "a", "ba", directed = FALSE), + join_key("c", "a", "ca", directed = FALSE), + join_key("d", "B", "db", directed = FALSE) ) parents(expected) <- list(B = "a", c = "a", d = "B") names(jk)[2] <- "B" - testthat::expect_identical(jk, expected) + testthat::expect_equal(jk, expected) }) diff --git a/tests/testthat/test-join_keys-parents.R b/tests/testthat/test-join_keys-parents.R index 8c8a44502..d2c8e9f86 100644 --- a/tests/testthat/test-join_keys-parents.R +++ b/tests/testthat/test-join_keys-parents.R @@ -4,6 +4,12 @@ testthat::test_that("parents will return empty list when empty/not set", { testthat::expect_identical(parents(jk), list()) }) +testthat::test_that("parents will return empty list when attribute does not exist", { + jk <- join_keys() + attr(jk, "parents") <- NULL + testthat::expect_identical(parents(jk), list()) +}) + testthat::test_that("parents returns the same list as used in parents<-", { jk <- join_keys(join_key("a", "b", "ab")) parents <- list(b = "a") @@ -19,8 +25,8 @@ testthat::test_that("parents<- accepts a named list containing (non-empty, non-m testthat::test_that("parents<- single parent can be changed utilizing list functionality with [[<-", { jk <- join_keys( - join_key("a", "b", "ab"), - join_key("c", "d", "cd") + join_key("a", "b", "ab", directed = FALSE), + join_key("c", "d", "cd", directed = FALSE) ) parents(jk)[["a"]] <- "b" parents(jk)[["c"]] <- "d" @@ -29,25 +35,25 @@ testthat::test_that("parents<- single parent can be changed utilizing list funct testthat::test_that("parents<- dataset can't be own parent", { jk <- join_keys( - join_key("a", "b", "ab"), - join_key("c", "d", "cd") + join_key("a", "b", "ab", directed = FALSE), + join_key("c", "d", "cd", directed = FALSE) ) testthat::expect_error(parents(jk) <- list(a = "a")) }) testthat::test_that("parents<- setting parent-child relationship fails when no foreign keys between datasets", { jk <- join_keys( - join_key("a", "1", "aa"), - join_key("b", "b", "bb") + join_key("a", "1", "aa", directed = FALSE), + join_key("b", "b", "bb", directed = FALSE) ) testthat::expect_error(parents(jk) <- list(a = "b")) }) testthat::test_that("parents<- ensures it is a directed acyclical graph (DAG)", { cyclic_jk <- join_keys( - join_key("a", "b", "id"), - join_key("b", "c", "id"), - join_key("c", "a", "id") + join_key("a", "b", "id", directed = FALSE), + join_key("b", "c", "id", directed = FALSE), + join_key("c", "a", "id", directed = FALSE) ) testthat::expect_error( parents(cyclic_jk) <- list(a = "b", b = "c", c = "a"), @@ -57,8 +63,8 @@ testthat::test_that("parents<- ensures it is a directed acyclical graph (DAG)", testthat::test_that("parents<- single parent can be changed utilizing list functionality with [[<-", { jk <- join_keys( - join_key("a", "b", "ab"), - join_key("c", "d", "cd") + join_key("a", "b", "ab", directed = FALSE), + join_key("c", "d", "cd", directed = FALSE) ) parents(jk)[["a"]] <- "b" parents(jk)[["c"]] <- "d" @@ -67,11 +73,11 @@ testthat::test_that("parents<- single parent can be changed utilizing list funct }) testthat::test_that("parents<- fails when value isn't a list (non-empty, non-missing) character", { - jk <- join_keys(join_key("a", "b", "test")) - testthat::expect_error(parents(jk) <- list(b = 1)) - testthat::expect_error(parents(jk) <- list(b = NA_character_)) - testthat::expect_error(parents(jk) <- list(b = NULL)) - testthat::expect_error(parents(jk) <- NULL) + jk <- join_keys(join_key("a", "b", "test", directed = FALSE)) + testthat::expect_error(parents(jk) <- list(b = 1), "May only contain the following types") + testthat::expect_error(parents(jk) <- list(b = NA_character_), "May not be NA") + testthat::expect_error(parents(jk) <- list(b = NULL), "May only contain the following types") + testthat::expect_error(parents(jk) <- NULL, "Must be of type 'list'") }) testthat::test_that("parents<- setting parents again overwrites previous state", { @@ -94,7 +100,7 @@ testthat::test_that("parents<- sets parent datasets to join_keys kept in teal_da testthat::test_that("parents<- setting parents changes join_keys object", { jk <- join_keys(join_key("a", "b", "ab")) jk2 <- jk - parents <- list(b = "a") + parents <- list(a = "b") parents(jk) <- parents testthat::expect_failure(testthat::expect_identical(jk, jk2)) diff --git a/tests/testthat/test-join_keys-print.R b/tests/testthat/test-join_keys-print.R index dd3c51e95..f9aef2d6d 100644 --- a/tests/testthat/test-join_keys-print.R +++ b/tests/testthat/test-join_keys-print.R @@ -5,18 +5,18 @@ testthat::test_that("format.join_keys for empty set", { testthat::test_that("format.join_keys with empty parents", { my_keys <- join_keys( - join_key("d1", "d1", "a"), - join_key("d2", "d2", "b"), - join_key("d3", "d3", "c"), - join_key("d2", "d1", "ba"), - join_key("d3", "d2", "ca") + join_key("d1", "d1", "a", directed = FALSE), + join_key("d2", "d2", "b", directed = FALSE), + join_key("d3", "d3", "c", directed = FALSE), + join_key("d1", "d2", "ab", directed = FALSE), + join_key("d2", "d3", "ac", directed = FALSE) ) testthat::expect_identical( format(my_keys), paste( "A join_keys object containing foreign keys between 3 datasets:", - "d1: [a]", " <-> d2: [ba]", "d2: [b]", " <-> d1: [ba]", " <-> d3: [ca]", - "d3: [c]", " <-> d2: [ca]", + "d1: [a]", " <-> d2: [ab]", "d2: [b]", " <-> d1: [ab]", " <-> d3: [ac]", + "d3: [c]", " <-> d2: [ac]", sep = "\n" ) ) @@ -27,16 +27,16 @@ testthat::test_that("format.join_keys for parents", { join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), join_key("d3", "d3", "c"), - join_key("d2", "d1", "ba"), - join_key("d3", "d2", "ca") + join_key("d1", "d2", "ab"), + join_key("d2", "d3", "ac") ) - parents(my_keys) <- list("d2" = "d1", "d3" = "d2") + testthat::expect_identical( format(my_keys), paste( "A join_keys object containing foreign keys between 3 datasets:", - "d1: [a]", " <-- d2: [ba]", "d2: [b]", " --> d1: [ba]", " <-- d3: [ca]", - "d3: [c]", " --> d2: [ca]", + "d1: [a]", " <-- d2: [ab]", "d2: [b]", " --> d1: [ab]", " <-- d3: [ac]", + "d3: [c]", " --> d2: [ac]", sep = "\n" ) ) @@ -47,10 +47,10 @@ testthat::test_that("format.join_keys print inferred keys for children sharing p join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), join_key("d3", "d3", "c"), - join_key("d2", "d1", "child-a"), - join_key("d3", "d1", "child-a") + join_key("d1", "d2", "child-a"), + join_key("d1", "d3", "child-a") ) - parents(my_keys) <- list("d2" = "d1", "d3" = "d1") + testthat::expect_identical( format(my_keys), paste( @@ -68,8 +68,8 @@ testthat::test_that("print.join_keys produces output same as format", { join_key("d1", "d1", "a"), join_key("d2", "d2", "b"), join_key("d3", "d3", "c"), - join_key("d2", "d1", "ba"), - join_key("d3", "d2", "ca") + join_key("d1", "d2", "ab"), + join_key("d2", "d3", "ac") ) testthat::expect_output(print(my_keys), format(my_keys), fixed = TRUE) }) diff --git a/tests/testthat/test-join_keys.R b/tests/testthat/test-join_keys.R index 6b6b280f7..cb305947f 100644 --- a/tests/testthat/test-join_keys.R +++ b/tests/testthat/test-join_keys.R @@ -48,6 +48,16 @@ testthat::test_that("join_keys is a collection of join_key, ie named list with n ) }) +testthat::test_that("join_keys cannot create acyclical graph", { + expect_error( + join_keys( + join_key("d1", "d2", "A"), + join_key("d2", "d1", "A") + ), + "Cycle detected in a parent and child dataset graph" + ) +}) + testthat::test_that("join_keys.teal_data returns join_keys object from teal_data", { obj <- teal_data(join_keys = join_keys(join_key("d1", "d1", "a"))) testthat::expect_identical(obj@join_keys, join_keys(obj)) @@ -60,7 +70,13 @@ testthat::test_that("join_keys.join_keys returns itself", { testthat::test_that("join_keys accepts duplicated join_key", { testthat::expect_no_error( - join_keys(join_key("d1", "d2", "a"), join_key("d2", "d1", "a")) + join_keys(join_key("d1", "d2", "a"), join_key("d1", "d2", "a")) + ) +}) + +testthat::test_that("join_keys accepts duplicated join_key (undirected)", { + testthat::expect_no_error( + join_keys(join_key("d1", "d2", "a", directed = FALSE), join_key("d1", "d2", "a", directed = FALSE)) ) }) @@ -73,51 +89,26 @@ testthat::test_that("join_keys doesn't accept a list which is identical to outpu testthat::expect_error(join_keys(unclass(key))) }) -testthat::test_that("join_keys fails when provided foreign key pairs have incompatible values", { - testthat::expect_error( - join_keys(join_key("d1", "d2", "a"), join_key("d2", "d1", "b")), - "cannot specify multiple different join keys between datasets" - ) - testthat::expect_error( - join_keys(join_key("d1", "d2", c(a = "b")), join_key("d2", "d1", c(a = "b"))), - "cannot specify multiple different join keys between datasets" - ) - - testthat::expect_error( - join_keys( - join_keys( - join_key("q", "b", "d"), - join_key("a", "b", "c") - ), - join_key("a", "q", "e"), - join_key("a", "b", "f") - ), - "cannot specify multiple different join keys between datasets" - ) -}) - testthat::test_that("join_keys constructor adds symmetric keys on given (unnamed) foreign key", { my_keys <- join_keys(join_key("d1", "d2", "a")) - testthat::expect_identical( - my_keys, - join_keys(join_key("d1", "d2", "a"), join_key("d2", "d1", "a")) - ) + expected_keys <- join_keys(join_key("d2", "d1", "a", directed = FALSE)) + parents(expected_keys) <- list(d2 = "d1") + + testthat::expect_equal(my_keys, expected_keys) }) testthat::test_that("join_keys constructor adds symmetric keys on given (named) foreign key", { - testthat::expect_identical( + expected_keys <- join_keys(join_key("d2", "d1", c(b = "a"), directed = FALSE)) + parents(expected_keys) <- list(d2 = "d1") + + testthat::expect_equal( join_keys( join_key("d1", "d2", c(a = "b")) ), - join_keys( - join_key("d1", "d2", c(a = "b")), - join_key("d2", "d1", c(b = "a")) - ) + expected_keys ) }) - - # join_keys.<- ---------------------------------------------------------------- testthat::test_that("join_keys<-.join_keys overwrites existing join_keys", { my_keys <- join_keys(join_key("d1", "d1", "a"), join_key("d2", "d2", "b")) diff --git a/vignettes/join-keys.Rmd b/vignettes/join-keys.Rmd index b7f66f4f4..662b13104 100644 --- a/vignettes/join-keys.Rmd +++ b/vignettes/join-keys.Rmd @@ -14,61 +14,77 @@ vignette: > The `teal.data` package provides a way to define primary keys for a dataset and establish relationships with other datasets. -Each dataset can be characterized by: +Each dataset _joining keys_ can be characterized by: -- Columns constituting the primary key -- Merge keys, analogous to `SQL` foreign keys +- Columns constituting the primary key; +- Foreign/merge keys, analogous to `SQL` foreign keys. -Typically, an application developer specifies these keys manually. However, for datasets following the `ADaM` standard, `teal.data` can automatically assign keys using the `default_cdisc_join_keys` object. +Typically, an application developer specifies these keys manually. +However, for datasets following the `ADaM` standard, `teal.data` can automatically assign keys using the `default_cdisc_join_keys` object. Refer to the section ["Joining Keys with `ADaM` Datasets"](#join-keys-in-adam-datasets) for details on using this object to select specific datasets. -##### Uses of _join_keys_ class in _teal_ applications +##### Uses of `join_keys` class in _teal_ applications -The primary function of `join_keys` in `teal` applications is to facilitate the seamless [merging of datasets](https://insightsengineering.github.io/teal.transform/latest-tag/articles/data-merge.html) using `teal.transform`. +The primary function of the `join_keys` class in `teal` applications is to facilitate the seamless [merging of datasets](https://insightsengineering.github.io/teal.transform/latest-tag/articles/data-merge.html) using `teal.transform`. Additionally, it plays a role on the data filtering using the _[Filter Panel](https://insightsengineering.github.io/teal/main/articles/filter-panel.html)_ in a `teal` application. -The filters applied to a _(parent)_ dataset are also applied to their children. +The filters applied to a (parent) dataset are also applied to their children. ## Anatomy of Join Keys The `join_keys` object contains information about the foreign/primary keys of multiple datasets. -Each key is represented by a pair of datasets _(by name reference)_ and a named character vector that encodes the match column name between the two datasets. -In addition, a foreign key also contains a _parent-child_ attribute that is used in the "Filter Panel" as we mentioned above. +Each key is represented by a pair of datasets (by name reference) and a named character vector that encodes the column name mapping between the two datasets. +In addition, a foreign key may also contain a _parent-child_ attribute that is used in the "Filter Panel" as we mentioned above. + +A new join keys can be created as an empty object, or by defining an initial set of primary and foreign keys. -A new `join_keys` can be created as empty or by defining an initial set of primary and foreign keys. That initial object can be extended by adding/modifying/removing keys and by establishing parent-child relationships between datasets. -The `join_keys` function is used both as a constructor and as a getter. -As the getter it is used to retrieve the `join_keys` that is contained in other objects, such as `teal_data`. +##### `join_keys(...)`: Join Keys Constructor / Getter / Setter + +Convenient function that is used both as the constructor and as the getter for _join_keys_ objects. + +As the _Getter_ it is used to retrieve the _joining keys_ that are contained in other objects, such as a `teal_data` object. -As the constructor it is used to specify keys `join_keys(...)` as a collection of multiple `join_key` entries: +As the _Constructor_ it is used to specify a collection of multiple individual keys (via `join_key` function described below). -- `join_key(dataset_1, dataset_2, key)`: specifies the relationship between two datasets: - - `dataset_1`, `dataset_2`: names of the datasets (if `dataset_2` is the same as `dataset_1`, it creates a primary key) - - `key` _(optional)_: named vector of column names +##### `join_key(dataset_1, dataset_2, key, parent)`: Single Join Key Constructor + +Specifies a primary key or a relationship between two datasets. + +- `dataset_1`, `dataset_2`: names of the datasets (if `dataset_2` is the same as `dataset_1` or is omitted, it creates a primary key); +- `key` (optional): named vector of column names; +- `parent` (optional): indicates which dataset (`"dataset_1"` or `"dataset_2"`) is the parent in a _parent-child_ relationship, or `"none"` for an undirected relationship. Note that join keys are assumed to be symmetric, i.e., `join_key("ds1", "ds2", c("ds1_col" = "ds2_col"))` establishes a relationship from "x" to "y" and vice versa. +By default the new joining key will set the `dataset_1` as the parent. + +##### Example & Output + +```{r, include=FALSE} +# nolint start: commented_code_linter. +``` + ```{r, results="hide", message=FALSE, tidy=FALSE} library(teal.data) jk <- join_keys( join_key("ds1", keys = "col_1"), # ds1: [col_1] join_key("ds2", keys = c("col_1", "col_2")), # ds2: [col_1, col_2] join_key("ds3", keys = c("col_1", "col_3")), # ds3: [col_1, col_3] - join_key("ds1", "ds2", keys = "col_1"), # ds1 <--> ds2 - join_key("ds1", "ds3", keys = "col_1"), # ds1 <--> ds3 - join_key("ds4", "ds5", keys = c("col_4" = "col_5")) # ds4 <--> ds5 + join_key("ds1", "ds2", keys = "col_1"), # ds1 <-- ds2 + join_key("ds1", "ds3", keys = "col_1"), # ds1 <-- ds3 + join_key("ds4", "ds5", keys = c("col_4" = "col_5"), directed = FALSE) # ds4 <--> ds5 ) -# The parent of ds2 and ds3 is ds1 -# converts relationship to child-parent -# ds1 <--> ds2 becomes ds1 <-- ds2 -# ds1 <--> ds3 becomes ds1 <-- ds3 -parents(jk) <- list(ds2 = "ds1", ds3 = "ds1") - +# The parent-child relationships are created automatically (unless 'parent' parameter is "none") jk ``` +```{r, include=FALSE} +# nolint end: commented_code_linter. +``` + | Output of `print(jk)` | Output annotation | | ---------------------------------- |:----------------------------------------:| | `## A join_keys object containing foreign keys between 3 datasets:` | **Title** | @@ -88,8 +104,8 @@ jk ## Accessing and Modifying keys -The _subset_ operator with 2 indices is used to retrieve the primary/foreign keys in a `join_keys`. -Both indices must be a string. +The _subset_ operator with 2 indices (`x[i, j]`) is used to retrieve the primary/foreign keys. +Both indices must be a string denoting the dataset name. ```{r} # Using the jk object defined on "Anatomy of Join Keys" @@ -102,14 +118,13 @@ jk["ds1", "ds1"] jk["ds4", "ds5"] ``` -Note that there is a symmetry between `ds4` and `ds5` relationship: +Note that there is a symmetry in the keys between `ds4` and `ds5` relationship: ```{r} jk["ds5", "ds4"] jk["ds5", "ds4"] ``` - When only 1 argument is used this operator will return a `join_keys` object that is filtered accordingly. ```{r} @@ -118,16 +133,18 @@ jk # Getting primary key of "ds1" jk["ds1"] -``` +# Getting keys of "ds1" and "ds2" +jk[c("ds1", "ds2")] +``` Modifying or adding a key uses the same notation with the assignment operator `<-`. -A symmetric relationship will be created automatically. +A symmetric relationship will be created automatically, where the parent (by default) will be the dataset defined on the first index. Assigning `NULL` value will delete the relationship. ```{r} -# Adding a new ds5 <-> ds1 key -jk["ds5", "ds1"] <- "a_column" +# Adding a new ds5 <-- ds1 key +jk["ds1", "ds5"] <- "a_column" # Removing an existing key jk["ds4", "ds5"] <- NULL @@ -135,14 +152,14 @@ jk["ds4", "ds5"] <- NULL ## Join Keys Relationships -There are 2 types of relationships encoded with `join_keys` described in the following sections. The _primary_ and _foreign_ keys are created explicitly using the `join_key` function. +There are 2 types of relationships encoded with _joining keys_ that are described in the following sections. The _primary_ and _foreign_ keys are created explicitly using the constructor for individual keys (`join_key`). -Additionally, the `join_keys` object detects implicit relationships when two datasets share foreign keys to a parent dataset, but not between themselves. These implicit relationships are available just like another foreign key and can be used to merge datasets, despite not being defined by the user. +Additionally, the `join_keys` object infers implicit relationships when two datasets share foreign keys to a parent dataset, but not between themselves. These implicit relationships are available just like another foreign key and can be used to merge datasets, despite not being defined by the user. ### Primary Key with `teal_data` When using the `teal_data` function, the simplest method to define the join keys is to use the `join_keys` argument. -We can specify the column(s) of the dataset that _(together)_ uniquely identify rows in the dataset. +We can specify the column(s) of the dataset that (together) uniquely identify rows in the dataset. ```{r, include=FALSE} # nolint start: commented_code_linter. @@ -165,8 +182,6 @@ join_keys(td_pk) We can extend the previous example and define primary keys for multiple datasets: ```{r, message=FALSE} -library(teal.data) - td_pk <- within( td_pk, { @@ -188,8 +203,8 @@ join_keys(td_pk) ### Foreign Keys with `teal_data` -When passing multiple datasets to the `teal_data` function, dataset relationships are set using `join_keys` and `join_key` and these are used to merge datasets together within `teal` apps. -For users familiar with `SQL` database schema, these relationships are symmetric and not as strict as `SQL` foreign key relationships as `teal` does not validate whether the values inserted into foreign key columns are present in the parent table. +When passing multiple datasets to the `teal_data` function, dataset relationships are set using `join_keys` and `join_key` functions, which then can be used to merge datasets together within `teal` apps. +For users familiar with `SQL` database schema, these relationships are symmetric and not as strict as `SQL` foreign key relationships as `teal` does not validate whether the values defined as foreign key columns are present in the table. For example: @@ -212,22 +227,19 @@ join_keys(td_fk) <- join_keys( join_key("ds2", keys = c("V", "W")), join_key("ds3", keys = c("V")), # Foreign keys - join_key("ds2", "ds1", c("W" = "X")), - join_key("ds3", "ds2", c("V" = "V")) + join_key("ds1", "ds2", c("X" = "W")), + join_key("ds2", "ds3", c("V" = "V")) ) -# The parent of ds2 and ds3 is ds1 -parents(td_fk) <- list(ds2 = "ds1", ds3 = "ds2") - join_keys(td_fk) ``` ### Implicit Relationships -Two datasets that share common _foreign_ keys to the same _parent_ dataset have an implicit relationship between them that is modeled and accessible in `join_keys`. +Two datasets that share common _foreign_ keys to the same _parent_ dataset have an implicit relationship between them that is modeled and accessible in _joining keys_. -This is a special inferred relationship from existing `join_keys` that does not need to be explicitly defined and can be seamlessly accessible just as any other foreign key. -As any other foreign key it can be overwritten. +This is a special relationship that is inferred from existing foreign keys. +It does not need to be explicitly defined but it can be accessed and overwritten just as any other foreign key. These implicit relationships can be used to merge 2 datasets together, just as if they were defined manually. @@ -252,14 +264,11 @@ join_keys(td) <- join_keys( join_key("ds3", keys = c("V")), join_key("ds4", keys = c("V")), # Foreign keys - join_key("ds2", "ds1", c("W" = "X")), - join_key("ds3", "ds2", c("V" = "V")), - join_key("ds4", "ds1", c("V" = "X")) + join_key("ds1", "ds2", c("X" = "W")), + join_key("ds2", "ds3", c("V" = "V")), + join_key("ds1", "ds4", c("X" = "B")) ) -# The parent of ds2 and ds3 is ds1 -parents(td) <- list(ds2 = "ds1", ds3 = "ds2", ds4 = "ds1") - join_keys(td) join_keys(td)["ds2", "ds4"]