From 8f9865e807223e8ed77466cfefc296bff86b622c Mon Sep 17 00:00:00 2001 From: edward-burn <9583964+edward-burn@users.noreply.github.com> Date: Sat, 7 Dec 2024 15:07:07 +0000 Subject: [PATCH 1/4] merge or extend overlapping records --- R/conceptCohort.R | 86 +++++++++++++-- man/conceptCohort.Rd | 17 ++- man/measurementCohort.Rd | 7 +- tests/testthat/test-conceptCohort.R | 156 +++++++++++++++++++++++++++- 4 files changed, 250 insertions(+), 16 deletions(-) diff --git a/R/conceptCohort.R b/R/conceptCohort.R index f68f68d6..0065f773 100644 --- a/R/conceptCohort.R +++ b/R/conceptCohort.R @@ -18,17 +18,23 @@ #' condition_start_date and condition_end_date for records coming #' from the condition_occurrence tables). So that the resulting table #' satisfies the requirements of an OMOP CDM cohort table: -#' * Overlapping records are collapsed into a single cohort entry. -#' * If a record starts outside of an observation period it will be -#' silently ignored. -#' * If a record ends outside of an observation period it will be -#' trimmed so as to end at the preceding observation period end date. +#' * Cohort entries will not overlap. Overlapping records will be +#' combined based on the overlap argument. +#' * Cohort entries will not go out of observation. If a record starts +#' outside of an observation period it will be silently ignored. If a +#' record ends outside of an observation period it will be trimmed so +#' as to end at the preceding observation period end date. #' #' @inheritParams cdmDoc #' @inheritParams conceptSetDoc #' @inheritParams nameDoc #' @param exit How the cohort end date is defined. Can be either #' "event_end_date" or "event_start_date". +#' @param overlap How to deal with overlapping records. In all +#' cases cohort start will be set as the earliest start date. If +#' "merge", cohort end will be the latest end date. If "extend", +#' cohort end date will be set by adding together the total days +#' from each of the overlapping records. #' @param useSourceFields If TRUE, the source concept_id fields will also be #' used when identifying relevant clinical records. If FALSE, only the standard #' concept_id fields will be used. @@ -57,6 +63,7 @@ conceptCohort <- function(cdm, conceptSet, name, exit = "event_end_date", + overlap = "merge", useSourceFields = FALSE, subsetCohort = NULL, subsetCohortId = NULL) { @@ -65,6 +72,7 @@ conceptCohort <- function(cdm, cdm <- omopgenerics::validateCdmArgument(cdm) conceptSet <- omopgenerics::validateConceptSetArgument(conceptSet, cdm) omopgenerics::assertChoice(exit, c("event_start_date", "event_end_date")) + omopgenerics::assertChoice(overlap, c("merge", "extend"), length = 1) omopgenerics::assertLogical(useSourceFields, length = 1) useIndexes <- getOption("CohortConstructor.use_indexes") @@ -190,10 +198,20 @@ conceptCohort <- function(cdm, cli::cli_inform(c("i" = "Applying cohort requirements.")) cdm[[name]] <- fulfillCohortReqs(cdm = cdm, name = name) - cli::cli_inform(c("i" = "Collapsing records.")) + if(overlap == "merge"){ + cli::cli_inform(c("i" = "Merging overlapping records.")) cdm[[name]] <- cdm[[name]] |> joinOverlap(name = name, gap = 0) |> - omopgenerics::recordCohortAttrition(reason = "Collapse overlapping records") + omopgenerics::recordCohortAttrition(reason = "Merge overlapping records") + } + + if(overlap == "extend"){ + cli::cli_inform(c("i" = "Merging overlapping records.")) + cdm[[name]] <- cdm[[name]] |> + extendOverlap(name = name) |> + omopgenerics::recordCohortAttrition(reason = "Add overlapping records") + } + cdm[[name]] <- omopgenerics::newCohortTable(table = cdm[[name]]) @@ -462,3 +480,57 @@ getDomainCohort <- function(cdm, ) |> dplyr::compute(temporary = FALSE, name = name) } + +extendOverlap <- function(cohort, + name){ + cohort <- cohort %>% + dplyr::mutate(record_id = dplyr::row_number()) |> + dplyr::compute() + + # keep overlapping records + cohort_overlap <- cohort %>% + dplyr::inner_join(cohort, + by = c("cohort_definition_id", "subject_id"), + suffix = c("", "_overlap")) |> + dplyr::filter( + record_id != record_id_overlap, + cohort_start_date <= cohort_end_date_overlap & + cohort_end_date >= cohort_start_date_overlap + ) |> + dplyr::select("cohort_definition_id", "subject_id", + "cohort_start_date", "cohort_end_date", + "record_id") |> + dplyr::distinct() |> + dplyr::compute() + + cohort_no_overlap <- cohort |> + dplyr::anti_join(cohort_overlap |> + dplyr::select("record_id"), + by = "record_id") |> + dplyr::select(!"record_id") |> + dplyr::compute() + + cohort_overlap <- cohort_overlap %>% + dplyr::mutate(days = !!CDMConnector::datediff("cohort_start_date", + "cohort_end_date")) |> + dplyr::group_by(dplyr::pick("cohort_definition_id", + "subject_id")) |> + dplyr::summarise(cohort_start_date = min(.data$cohort_start_date, na.rm = TRUE), + days = as.integer(sum(.data$days))) %>% + dplyr:: ungroup() %>% + dplyr::mutate(cohort_end_date = as.Date( + !!CDMConnector::dateadd( + date = "cohort_start_date", + number = "days", + interval = "day" + ))) |> + dplyr::select(!"days") |> + dplyr::compute() + + cohort_updated <- dplyr::union_all(cohort_overlap, + cohort_no_overlap) |> + dplyr::compute(name = name, temporary = FALSE) + + cohort_updated + +} diff --git a/man/conceptCohort.Rd b/man/conceptCohort.Rd index bf965237..8444a75e 100644 --- a/man/conceptCohort.Rd +++ b/man/conceptCohort.Rd @@ -36,6 +36,12 @@ generated cohort.} \item{subsetCohortId}{Optional. Specifies cohort IDs from the \code{subsetCohort} table to include. If none are provided, all cohorts from the \code{subsetCohort} are included.} + +\item{overlap}{How to deal with overlapping records. In all +cases cohort start will be set as the earliest start date. If +"merge", cohort end will be the latest end date. If "extend", +cohort end date will be set by adding together the total days +from each of the overlapping records.} } \value{ A cohort table @@ -61,11 +67,12 @@ condition_start_date and condition_end_date for records coming from the condition_occurrence tables). So that the resulting table satisfies the requirements of an OMOP CDM cohort table: \itemize{ -\item Overlapping records are collapsed into a single cohort entry. -\item If a record starts outside of an observation period it will be -silently ignored. -\item If a record ends outside of an observation period it will be -trimmed so as to end at the preceding observation period end date. +\item Cohort entries will not overlap. Overlapping records will be +combined based on the overlap argument. +\item Cohort entries will not go out of observation. If a record starts +outside of an observation period it will be silently ignored. If a +record ends outside of an observation period it will be trimmed so +as to end at the preceding observation period end date. } } \examples{ diff --git a/man/measurementCohort.Rd b/man/measurementCohort.Rd index 99ed9bb7..8a8775a8 100644 --- a/man/measurementCohort.Rd +++ b/man/measurementCohort.Rd @@ -22,13 +22,14 @@ or a conceptSetExpression.} \item{valueAsConcept}{A vector of cohort IDs used to filter measurements. Only measurements with these values in the \code{value_as_concept_id} column of -the measurement table will be included. If NULL all entries independently of +the measurement table will be included. If NULL all entries independent of their value as concept will be considered.} \item{valueAsNumber}{A named list indicating the range of values and the unit they correspond to, as follows: -list("unit_concept_id" = c(rangeValue1, rangeValue2)). If NULL, all entries -independently of their value as number will be included.} +list("unit_concept_id" = c(rangeValue1, rangeValue2)). If no name is supplied +in the list, no requirement on unit concept id will be applied. If NULL, all +entries independent of their value as number will be included.} } \value{ A cohort table diff --git a/tests/testthat/test-conceptCohort.R b/tests/testthat/test-conceptCohort.R index bdbfd210..fd618db7 100644 --- a/tests/testthat/test-conceptCohort.R +++ b/tests/testthat/test-conceptCohort.R @@ -155,7 +155,7 @@ test_that("simple example", { "reason_id" = 1:4L, "reason" = c( "Initial qualifying events", "Record start <= record end", - "Record in observation", "Collapse overlapping records" + "Record in observation", "Merge overlapping records" ), "excluded_records" = c(0L, 0L, 0L, 5L), "excluded_subjects" = 0L @@ -747,6 +747,160 @@ test_that("missing event end dates", { PatientProfiles::mockDisconnect(cdm) }) +test_that("overlap option", { + skip_on_cran() + cdm <- omock::mockPerson(nPerson = 1) + cdm <- omopgenerics::insertTable( + cdm = cdm, name = "observation_period", table = dplyr::tibble( + "observation_period_id" = c(1L, 2L), + "person_id" = c(1L, 2L), + "observation_period_start_date" = as.Date(c("2020-01-01")), + "observation_period_end_date" = as.Date(c("2020-01-30")), + "period_type_concept_id" = NA_integer_ + )) + cdm <- omopgenerics::insertTable( + cdm = cdm, name = "drug_exposure", table = dplyr::tibble( + "drug_exposure_id" = c(1L, 2L, 3L, 4L, 5L, 6L), + "person_id" = c(1L, 1L, 1L, 1L, 2L, 1L), + "drug_concept_id" = c(1L, 1L, 1L, 1L, 1L, 2L), + # 3 overlapping records + "drug_exposure_start_date" = as.Date(c("2020-01-01", + "2020-01-04", + "2020-01-05", + "2020-01-20", + "2020-01-01", + "2020-01-02")), + "drug_exposure_end_date" = as.Date(c("2020-01-06", + "2020-01-08", + "2020-01-06", + "2020-01-21", + "2020-01-21", + "2020-01-03")), + "drug_type_concept_id" = 1L + ) ) + cdm <- omopgenerics::insertTable( + cdm = cdm, name = "concept", table = dplyr::tibble( + "concept_id" = c(1L, 2L), + "concept_name" = c("concept 1", "concept 2"), + "domain_id" = "drug", + "vocabulary_id" = NA, + "concept_class_id" = NA, + "concept_code" = NA, + "valid_start_date" = NA, + "valid_end_date" = NA + ) + ) + + cdm <- cdm |> copyCdm() + + expect_no_error(cdm$cohort_1 <- conceptCohort(cdm = cdm, + conceptSet = list(a = 1L, + b = 2L), + name = "cohort_1", + exit = "event_end_date", + overlap = "merge")) + expect_true(nrow(cdm$cohort_1 |> + dplyr::collect()) == 4) + expect_true(all(sort(cdm$cohort_1 |> + dplyr::pull("cohort_start_date")) == + as.Date(c("2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-01-20")))) + expect_true(all(sort(cdm$cohort_1 |> + dplyr::pull("cohort_end_date")) == + as.Date(c("2020-01-03", + "2020-01-08", + "2020-01-21", + "2020-01-21")))) + + expect_no_error(cdm$cohort_2 <- conceptCohort(cdm = cdm, + conceptSet = list(a = 1L, + b = 2L), + name = "cohort_2", + exit = "event_end_date", + overlap = "extend")) + expect_true(all(sort(cdm$cohort_2 |> + dplyr::pull("cohort_start_date")) == + as.Date(c("2020-01-01", + "2020-01-01", + "2020-01-02", + "2020-01-20")))) + expect_true(all(sort(cdm$cohort_2 |> + dplyr::pull("cohort_end_date")) == + as.Date(c("2020-01-03", + "2020-01-11", # now 11 instead of 8 (3 days overlap) + "2020-01-21", + "2020-01-21")))) + + +# only overlapping records + cdm <- omopgenerics::insertTable( + cdm = cdm, name = "drug_exposure", table = dplyr::tibble( + "drug_exposure_id" = c(1L, 2L), + "person_id" = c(1L, 1L), + "drug_concept_id" = c(1L, 1L), + "drug_exposure_start_date" = as.Date(c("2020-01-01", + "2020-01-04")), + "drug_exposure_end_date" = as.Date(c("2020-01-06", + "2020-01-08")), + "drug_type_concept_id" = 1L + )) + + expect_no_error(cdm$cohort_3 <- conceptCohort(cdm = cdm, + conceptSet = list(a = 1L, + b = 2L), + name = "cohort_3", + exit = "event_end_date", + overlap = "extend")) + expect_true(nrow(cdm$cohort_3 |> + dplyr::collect()) == 1) + expect_true(all(sort(cdm$cohort_3 |> + dplyr::pull("cohort_start_date")) == + as.Date(c("2020-01-01")))) + expect_true(all(sort(cdm$cohort_3 |> + dplyr::pull("cohort_end_date")) == + as.Date(c("2020-01-10")))) + +# no overlapping records + cdm <- omopgenerics::insertTable( + cdm = cdm, name = "drug_exposure", table = dplyr::tibble( + "drug_exposure_id" = c(1L, 2L), + "person_id" = c(1L, 1L), + "drug_concept_id" = c(1L, 1L), + "drug_exposure_start_date" = as.Date(c("2020-01-01", + "2020-01-05")), + "drug_exposure_end_date" = as.Date(c("2020-01-03", + "2020-01-10")), + "drug_type_concept_id" = 1L + )) + + expect_no_error(cdm$cohort_4 <- conceptCohort(cdm = cdm, + conceptSet = list(a = 1L, + b = 2L), + name = "cohort_4", + exit = "event_end_date", + overlap = "extend")) + expect_true(nrow(cdm$cohort_4 |> + dplyr::collect()) == 2) + expect_true(all(sort(cdm$cohort_4 |> + dplyr::pull("cohort_start_date")) == + as.Date(c("2020-01-01", "2020-01-05")))) + expect_true(all(sort(cdm$cohort_4 |> + dplyr::pull("cohort_end_date")) == + as.Date(c("2020-01-03", "2020-01-10")))) + +# wrong input + expect_error(cdm$cohort_5 <- conceptCohort(cdm = cdm, + conceptSet = list(a = 1L), + name = "cohort_5", + exit = "event_end_date", + overlap = "another")) + + + PatientProfiles::mockDisconnect(cdm) +}) + test_that("test indexes - postgres", { skip_on_cran() skip_if(Sys.getenv("CDM5_POSTGRESQL_DBNAME") == "") From e0698e7750437f64fb9f0cddbb3ed3f8e68f02f5 Mon Sep 17 00:00:00 2001 From: edward-burn <9583964+edward-burn@users.noreply.github.com> Date: Sat, 7 Dec 2024 15:44:35 +0000 Subject: [PATCH 2/4] compute premanent --- R/conceptCohort.R | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/R/conceptCohort.R b/R/conceptCohort.R index 0065f773..24101fc3 100644 --- a/R/conceptCohort.R +++ b/R/conceptCohort.R @@ -483,9 +483,13 @@ getDomainCohort <- function(cdm, extendOverlap <- function(cohort, name){ + + workingTblNames <- paste0(omopgenerics::uniqueTableName(), "_", c(1:4)) + cohort <- cohort %>% dplyr::mutate(record_id = dplyr::row_number()) |> - dplyr::compute() + dplyr::compute(temporary = FALSE, + name = workingTblNames[1]) # keep overlapping records cohort_overlap <- cohort %>% @@ -501,14 +505,16 @@ extendOverlap <- function(cohort, "cohort_start_date", "cohort_end_date", "record_id") |> dplyr::distinct() |> - dplyr::compute() + dplyr::compute(temporary = FALSE, + name = workingTblNames[2]) cohort_no_overlap <- cohort |> dplyr::anti_join(cohort_overlap |> dplyr::select("record_id"), by = "record_id") |> - dplyr::select(!"record_id") |> - dplyr::compute() + dplyr::select(!"record_id") |> + dplyr::compute(temporary = FALSE, + name = workingTblNames[3]) cohort_overlap <- cohort_overlap %>% dplyr::mutate(days = !!CDMConnector::datediff("cohort_start_date", @@ -524,13 +530,17 @@ extendOverlap <- function(cohort, number = "days", interval = "day" ))) |> - dplyr::select(!"days") |> - dplyr::compute() + dplyr::select(!"days") |> + dplyr::compute(temporary = FALSE, + name = workingTblNames[4]) cohort_updated <- dplyr::union_all(cohort_overlap, cohort_no_overlap) |> dplyr::compute(name = name, temporary = FALSE) + CDMConnector::dropTable(cdm = cdm, + name = workingTblNames) + cohort_updated } From b015c879237e0c45feb0186e8be43ba63b667b4b Mon Sep 17 00:00:00 2001 From: edward-burn <9583964+edward-burn@users.noreply.github.com> Date: Sat, 7 Dec 2024 20:14:19 +0000 Subject: [PATCH 3/4] fix --- R/conceptCohort.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/conceptCohort.R b/R/conceptCohort.R index 24101fc3..a3037e0d 100644 --- a/R/conceptCohort.R +++ b/R/conceptCohort.R @@ -486,6 +486,8 @@ extendOverlap <- function(cohort, workingTblNames <- paste0(omopgenerics::uniqueTableName(), "_", c(1:4)) + cdm <- omopgenerics::cdmReference(cohort) + cohort <- cohort %>% dplyr::mutate(record_id = dplyr::row_number()) |> dplyr::compute(temporary = FALSE, From 291cd55bddb258c21976886b7d6b12796ea9be8b Mon Sep 17 00:00:00 2001 From: edward-burn <9583964+edward-burn@users.noreply.github.com> Date: Sun, 8 Dec 2024 09:44:14 +0000 Subject: [PATCH 4/4] document --- man/conceptCohort.Rd | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/man/conceptCohort.Rd b/man/conceptCohort.Rd index 8444a75e..6deac6b5 100644 --- a/man/conceptCohort.Rd +++ b/man/conceptCohort.Rd @@ -9,6 +9,7 @@ conceptCohort( conceptSet, name, exit = "event_end_date", + overlap = "merge", useSourceFields = FALSE, subsetCohort = NULL, subsetCohortId = NULL @@ -25,6 +26,12 @@ or a conceptSetExpression.} \item{exit}{How the cohort end date is defined. Can be either "event_end_date" or "event_start_date".} +\item{overlap}{How to deal with overlapping records. In all +cases cohort start will be set as the earliest start date. If +"merge", cohort end will be the latest end date. If "extend", +cohort end date will be set by adding together the total days +from each of the overlapping records.} + \item{useSourceFields}{If TRUE, the source concept_id fields will also be used when identifying relevant clinical records. If FALSE, only the standard concept_id fields will be used.} @@ -36,12 +43,6 @@ generated cohort.} \item{subsetCohortId}{Optional. Specifies cohort IDs from the \code{subsetCohort} table to include. If none are provided, all cohorts from the \code{subsetCohort} are included.} - -\item{overlap}{How to deal with overlapping records. In all -cases cohort start will be set as the earliest start date. If -"merge", cohort end will be the latest end date. If "extend", -cohort end date will be set by adding together the total days -from each of the overlapping records.} } \value{ A cohort table