diff --git a/R/CensoredDataSuite.R b/R/CensoredDataSuite.R index 85bfe1f9..bd56ea43 100644 --- a/R/CensoredDataSuite.R +++ b/R/CensoredDataSuite.R @@ -38,24 +38,25 @@ TADA_IDCensoredData <- function(.data) { ) TADA_CheckColumns(.data, expected_cols) - # Move detection limit value and unit to TADA Result Measure Value and Unit columns - # this first row copies all over when result is blank (NA) but - # TADA.DetectionQuantitationLimitMeasure.MeasureValue is not and the - # TADA.ResultMeasureValueDataTypes.Flag is not Text - # Imp note: TADA result values are NA for text even though they are not NA in the original result value + # Copy detection limit value and unit to TADA Result Measure Value and Unit columns + # this first row copies all over when TADA.DetectionQuantitationLimitMeasure.MeasureValue is not NA and the + # TADA.ResultMeasureValueDataTypes.Flag is "NA - Not Available" + # Imp note: TADA result values are NA for text and other values (coerced) even though they are not + # NA in the original result value .data$TADA.ResultMeasureValue <- ifelse( - is.na(.data$TADA.ResultMeasureValue) - & !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue) - & .data$TADA.ResultMeasureValueDataTypes.Flag != "Text", + !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue) + & .data$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available", .data$TADA.DetectionQuantitationLimitMeasure.MeasureValue, .data$TADA.ResultMeasureValue) + # this does the same as above for the units .data$TADA.ResultMeasure.MeasureUnitCode <- ifelse( - is.na(.data$TADA.ResultMeasure.MeasureUnitCode) - & !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode) - & .data$TADA.ResultMeasureValueDataTypes.Flag != "Text", + !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode) + & .data$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available", .data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode, .data$TADA.ResultMeasure.MeasureUnitCode) + + # this updates the TADA.ResultMeasureValueDataTypes.Flag .data$TADA.ResultMeasureValueDataTypes.Flag <- ifelse( .data$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available" & !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue), @@ -66,30 +67,31 @@ TADA_IDCensoredData <- function(.data) { # when the result value is TEXT but there is a specific text value that indicates # the result is censored (BPQL, BDL, ND) # and the TADA.DetectionQuantitationLimitMeasure.MeasureValue provided - .data$TADA.ResultMeasureValueDataTypes.Flag <- ifelse( - .data$TADA.ResultMeasureValueDataTypes.Flag == "Text" & - .data$ResultMeasureValue == "BPQL" | - .data$ResultMeasureValue == "BDL" | - .data$ResultMeasureValue == "ND" & - !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue), - "Result Value/Unit Copied from Detection Limit", - .data$TADA.ResultMeasureValueDataTypes.Flag) - .data$TADA.ResultMeasureValue <- ifelse( - is.na(.data$TADA.ResultMeasureValue) - & !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue) - & .data$ResultMeasureValue == "BPQL" | - .data$ResultMeasureValue == "BDL" | - .data$ResultMeasureValue == "ND" , - .data$TADA.DetectionQuantitationLimitMeasure.MeasureValue, - .data$TADA.ResultMeasureValue) - # this does the same as above for the units - .data$TADA.ResultMeasure.MeasureUnitCode <- ifelse( - !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode) - & .data$ResultMeasureValue == "BPQL" | - .data$ResultMeasureValue == "BDL" | - .data$ResultMeasureValue == "ND" , - .data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode, - .data$TADA.ResultMeasure.MeasureUnitCode) + # if more are added, they need to be included below as well (line 194) + # .data$TADA.ResultMeasureValue <- ifelse( + # !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue) + # & (.data$ResultMeasureValue == "BPQL" | + # .data$ResultMeasureValue == "BDL" | + # .data$ResultMeasureValue == "ND") , + # .data$TADA.DetectionQuantitationLimitMeasure.MeasureValue, + # .data$TADA.ResultMeasureValue) + # + # # this does the same as above for the units + # .data$TADA.ResultMeasure.MeasureUnitCode <- ifelse( + # !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode) + # & (.data$ResultMeasureValue == "BPQL" | + # .data$ResultMeasureValue == "BDL" | + # .data$ResultMeasureValue == "ND") , + # .data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode, + # .data$TADA.ResultMeasure.MeasureUnitCode) + # + # # this updates the TADA.ResultMeasureValueDataTypes.Flag + # .data$TADA.ResultMeasureValueDataTypes.Flag <- ifelse( + # (.data$ResultMeasureValue == "BPQL" | + # .data$ResultMeasureValue == "BDL" | + # .data$ResultMeasureValue == "ND"), + # "Result Value/Unit Copied from Detection Limit", + # .data$TADA.ResultMeasureValueDataTypes.Flag) # If user has not previously run TADA_FlagMeasureQualifierCode, run it here # to add column TADA.MeasureQualifier.Flag to allow for using user-supplied @@ -140,7 +142,7 @@ TADA_IDCensoredData <- function(.data) { # NOTE that at this point, TADA.Detection_Type may be NA if there are detection conditions in dataset that are not present in domain table if (any(cens$TADA.Detection_Type[!is.na(cens$TADA.Detection_Type)] == "ResultDetectionConditionText missing")) { missing_detcond <- length(cens$TADA.Detection_Type[cens$TADA.Detection_Type == "ResultDetectionConditionText missing"]) - print(paste0("TADA_IDCensoredData: There are ", missing_detcond, " results in your dataset that are missing ResultDetectionConditionText. When TADA cannot clearly ID the result as a non-detect based in the metadata provided, TADA requires BOTH ResultDetectionConditionText and DetectionQuantitationLimitTypeName fields to be populated in order to categorize censored data. Please contact the TADA Admins to resolve.")) + print(paste0("TADA_IDCensoredData: There are ", missing_detcond, " results in your dataset that are missing ResultDetectionConditionText. TADA requires BOTH ResultDetectionConditionText and DetectionQuantitationLimitTypeName fields to be populated in order to categorize censored data. Please contact the TADA Admins to resolve.")) } ## Let user know when one or more result detection conditions are not in the ref table @@ -168,11 +170,32 @@ TADA_IDCensoredData <- function(.data) { ## Create flag for condition and limit type combinations cens$TADA.CensoredData.Flag <- "Detection condition or detection limit is not documented in TADA reference tables." - cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("ResultDetectionConditionText missing"), "Detection condition is missing and required for censored data ID.", cens$TADA.CensoredData.Flag) - cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Non-Detect") & cens$TADA.Limit_Type %in% c("Non-Detect"), "Non-Detect", cens$TADA.CensoredData.Flag) - cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Over-Detect") & cens$TADA.Limit_Type %in% c("Over-Detect"), "Over-Detect", cens$TADA.CensoredData.Flag) - cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Other") & cens$TADA.Limit_Type %in% c("Other"), "Other Condition/Limit Populated", cens$TADA.CensoredData.Flag) - cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Non-Detect", "Over-Detect", "Other") & cens$TADA.Limit_Type %in% c("Non-Detect", "Over-Detect", "Other") & !cens$TADA.Detection_Type == cens$TADA.Limit_Type, "Conflict between Condition and Limit", cens$TADA.CensoredData.Flag) + cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("ResultDetectionConditionText missing"), + "Detection condition is missing and required for censored data ID.", + cens$TADA.CensoredData.Flag) + cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Non-Detect") & + cens$TADA.Limit_Type %in% c("Non-Detect"), + "Non-Detect", + cens$TADA.CensoredData.Flag) + cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Over-Detect") & + cens$TADA.Limit_Type %in% c("Over-Detect"), + "Over-Detect", + cens$TADA.CensoredData.Flag) + cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Other") & + cens$TADA.Limit_Type %in% c("Other"), + "Other Condition/Limit Populated", + cens$TADA.CensoredData.Flag) + cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Non-Detect", "Over-Detect", "Other") & + cens$TADA.Limit_Type %in% c("Non-Detect", "Over-Detect", "Other") & + !cens$TADA.Detection_Type == cens$TADA.Limit_Type, + "Conflict between Condition and Limit", + cens$TADA.CensoredData.Flag) + # Because detection limits for ResultMeasureValue = "BDL", "BPQL", or "ND" are copied to the result value (above), the TADA.CensoredData.Flag should be + # "Non-Detect" + # cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.ResultMeasureValueDataTypes.Flag %in% c("Result Value/Unit Copied from Detection Limit") & + # cens$ResultMeasureValue %in% c("BDL", "BPQL", "ND"), + # "Non-Detect", + # cens$TADA.CensoredData.Flag) ## warn when some limit metadata may be problematic if ("Conflict between Condition and Limit" %in% cens$TADA.CensoredData.Flag) { @@ -251,7 +274,7 @@ TADA_SimpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multi } # If user has not previously run TADA_IDCensoredData function, run it here to get required columns and to copy - # detection limit to resut value + # detection limit to result value if (!"TADA.CensoredData.Flag" %in% names(.data)) { cens.data <- TADA_IDCensoredData(.data) } else { diff --git a/R/ResultFlagsDependent.R b/R/ResultFlagsDependent.R index 735424ab..ee8c2838 100644 --- a/R/ResultFlagsDependent.R +++ b/R/ResultFlagsDependent.R @@ -594,8 +594,7 @@ TADA_AutoFilter <- function(.data) { "ActivityTypeCode" )) - autofilter <- dplyr::filter(.data, TADA.ResultMeasureValueDataTypes.Flag != "NA - Not Available" & - TADA.ResultMeasureValueDataTypes.Flag != "Text" & + autofilter <- dplyr::filter(.data, TADA.ResultMeasureValueDataTypes.Flag != "Text" & TADA.ResultMeasureValueDataTypes.Flag != "NA - Not Available" & !is.na(TADA.ResultMeasureValue)) # & # TADA.ActivityMediaName == "WATER") diff --git a/R/Utilities.R b/R/Utilities.R index 4c00bc63..e5a4a5b0 100644 --- a/R/Utilities.R +++ b/R/Utilities.R @@ -318,15 +318,15 @@ TADA_ConvertSpecialChars <- function(.data, col) { is.na(masked) ~ as.character("NA - Not Available"), TRUE ~ as.character("Numeric") )) + } else { + chars.data$masked <- gsub(" ", "", chars.data$masked) # get rid of white space for subsequent sorting - - # Detect special characters in column and populate new flag column with descriptor + # Detect special characters in column and populate new flag column with descriptor # of the specific type of character/data type clean.data <- chars.data %>% dplyr::mutate(flag = dplyr::case_when( is.na(masked) ~ as.character("NA - Not Available"), - # (masked == "ND") ~ as.character("NA - Not Available"), (!is.na(suppressWarnings(as.numeric(masked)) == TRUE)) ~ as.character("Numeric"), (grepl("<", masked) == TRUE) ~ as.character("Less Than"), (grepl(">", masked) == TRUE) ~ as.character("Greater Than"), @@ -338,9 +338,9 @@ TADA_ConvertSpecialChars <- function(.data, col) { # because * is a special character you have to escape\\ it: (grepl("\\*", masked) == TRUE) ~ as.character("Approximate Value"), (!stringi::stri_enc_mark(masked) %in% c("ASCII")) ~ as.character("Non-ASCII Character(s)"), - TRUE ~ "NA - Not Available" + TRUE ~ "Coerced to NA" )) - + # Result Values that are numeric ranges with the format #-# are converted to an average of the two numbers expressed in the range. if (any(clean.data$flag == "Numeric Range - Averaged")) { numrange <- subset(clean.data, clean.data$flag %in% c("Numeric Range - Averaged")) @@ -353,18 +353,20 @@ TADA_ConvertSpecialChars <- function(.data, col) { clean.data <- plyr::rbind.fill(notnumrange, numrange) } + # In the new TADA column, convert to numeric and remove some specific special # characters. clean.data$masked <- suppressWarnings(as.numeric(stringr::str_replace_all( - clean.data$masked, c("<" = "", ">" = "", "~" = "", "," = "", "%" = "", "\\*" = "") + clean.data$masked, c("<" = "", ">" = "", "~" = "", "%" = "", "\\*" = "") ))) + } # Rename to original column name, TADA column name, and flag column name names(clean.data)[names(clean.data) == "orig"] <- col names(clean.data)[names(clean.data) == "masked"] <- numcol names(clean.data)[names(clean.data) == "flag"] <- flagcol - + clean.data <- TADA_OrderCols(clean.data) return(clean.data) diff --git a/tests/testthat/test-CensoredDataSuite.R b/tests/testthat/test-CensoredDataSuite.R index 0abe57b4..b77b303d 100644 --- a/tests/testthat/test-CensoredDataSuite.R +++ b/tests/testthat/test-CensoredDataSuite.R @@ -11,7 +11,128 @@ test_that("TADA_SimpleCensoredMethods doesn't drop data", { test = TADA_RandomNationalTestingSet() + +# Copy detection limit value and unit to TADA Result Measure Value and Unit columns +# this first row copies all over when TADA.DetectionQuantitationLimitMeasure.MeasureValue is not NA and the +# TADA.ResultMeasureValueDataTypes.Flag is "NA - Not Available" +# Imp note: TADA result values are NA for text and other values (coerced) even though they are not +# NA in the original result value +test$TADA.ResultMeasureValue <- ifelse( + !is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureValue) + & test$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available", + test$TADA.DetectionQuantitationLimitMeasure.MeasureValue, + test$TADA.ResultMeasureValue) + +# this does the same as above for the units +test$TADA.ResultMeasure.MeasureUnitCode <- ifelse( + !is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode) + & test$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available", + test$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode, + test$TADA.ResultMeasure.MeasureUnitCode) + +# this updates the TADA.ResultMeasureValueDataTypes.Flag +test$TADA.ResultMeasureValueDataTypes.Flag <- ifelse( + test$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available" + & !is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureValue), + "Result Value/Unit Copied from Detection Limit", + test$TADA.ResultMeasureValueDataTypes.Flag) + +# this copies det lim result value and unit over to TADA result value and unit +# when the result value is TEXT but there is a specific text value that indicates +# the result is censored (BPQL, BDL, ND) +# and the TADA.DetectionQuantitationLimitMeasure.MeasureValue provided +# if more are added, they need to be included below as well (line 194) +test$TADA.ResultMeasureValue <- ifelse( + is.na(test$TADA.ResultMeasureValue) + & !is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureValue) + & (test$ResultMeasureValue == "BPQL" | + test$ResultMeasureValue == "BDL" | + test$ResultMeasureValue == "ND") , + test$TADA.DetectionQuantitationLimitMeasure.MeasureValue, + test$TADA.ResultMeasureValue) + +# this does the same as above for the units +test$TADA.ResultMeasure.MeasureUnitCode <- ifelse( + !is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode) + & (test$ResultMeasureValue == "BPQL" | + test$ResultMeasureValue == "BDL" | + test$ResultMeasureValue == "ND") , + test$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode, + test$TADA.ResultMeasure.MeasureUnitCode) + +test$TADA.ResultMeasureValueDataTypes.Flag <- ifelse( + test$TADA.ResultMeasureValueDataTypes.Flag == "Text" & + (test$ResultMeasureValue == "BPQL" | + test$ResultMeasureValue == "BDL" | + test$ResultMeasureValue == "ND") & + !is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureValue), + "Result Value/Unit Copied from Detection Limit", + test$TADA.ResultMeasureValueDataTypes.Flag) + +# Result Measure Qualifier codes to identify censored samples. +if (!"TADA.MeasureQualifierCode.Flag" %in% names(test)) { + data_mq_flag <- TADA_FlagMeasureQualifierCode(test) +} else { + data_mq_flag <- test +} + +## Identify censored data using TADA.ResultMeasureValueDataTypes.Flag and TADA.MeasureQualifierCode.Flag +cens_rm_flag <- data_mq_flag %>% dplyr::filter(TADA.ResultMeasureValueDataTypes.Flag == "Result Value/Unit Copied from Detection Limit") +cens_mq_flag <- data_mq_flag %>% dplyr::filter(TADA.MeasureQualifierCode.Flag %in% c("Non-Detect", "Over-Detect")) %>% + dplyr::filter(!ResultIdentifier %in% cens_rm_flag$ResultIdentifier) +cens <- cens_rm_flag %>% + rbind(cens_mq_flag) +not_cens <- data_mq_flag %>% dplyr::filter(!ResultIdentifier %in% cens$ResultIdentifier) +not_cens$TADA.CensoredData.Flag <- "Uncensored" + +rm(cens_rm_flag, cens_mq_flag, data_mq_flag) + + +blah = dplyr::select(test, ActivityTypeCode, TADA.ActivityType.Flag,ResultDetectionConditionText, + CharacteristicName, + TADA.CharacteristicName, + ResultMeasureValue, + TADA.ResultMeasureValue, + ResultMeasure.MeasureUnitCode, + TADA.WQXResultUnitConversion, + TADA.ResultMeasureValueDataTypes.Flag, + TADA.ResultMeasure.MeasureUnitCode, + DetectionQuantitationLimitTypeName, + DetectionQuantitationLimitMeasure.MeasureValue, + DetectionQuantitationLimitMeasure.MeasureUnitCode, + TADA.DetectionQuantitationLimitMeasure.MeasureValue, + TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode, + #TADA.MeasureQualifierCode.Flag, + ProviderName) +write.csv(blah, "blah.csv") + test1 = TADA_IDCensoredData(test) +blah2 = dplyr::select(test1, + ActivityTypeCode, + TADA.ActivityType.Flag, + ResultDetectionConditionText, + CharacteristicName, + TADA.CharacteristicName, + ResultMeasureValue, + TADA.ResultMeasureValue, + ResultMeasure.MeasureUnitCode, + TADA.WQXResultUnitConversion, + TADA.ResultMeasureValueDataTypes.Flag, + TADA.ResultMeasure.MeasureUnitCode, + MeasureQualifierCode, + TADA.MeasureQualifierCode.Flag, + TADA.MeasureQualifierCode.Def, + DetectionQuantitationLimitTypeName, + DetectionQuantitationLimitMeasure.MeasureValue, + DetectionQuantitationLimitMeasure.MeasureUnitCode, + TADA.DetectionQuantitationLimitMeasure.MeasureValue, + TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode, + TADA.DetectionQuantitationLimitMeasure.MeasureValueDataTypes.Flag, + TADA.CensoredData.Flag, + ProviderName +) +write.csv(blah2, "blah2.csv") + test2 = TADA_SimpleCensoredMethods(test1) test3 = dplyr::select(test2, ActivityTypeCode,