Skip to content

Commit

Permalink
Update TADA_IDCensoredData
Browse files Browse the repository at this point in the history
lines 66-94 need more work (ran out of time). commenting out for now
  • Loading branch information
cristinamullin committed Dec 29, 2023
1 parent 05ac102 commit b4d6dc6
Show file tree
Hide file tree
Showing 4 changed files with 196 additions and 51 deletions.
107 changes: 65 additions & 42 deletions R/CensoredDataSuite.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,24 +38,25 @@ TADA_IDCensoredData <- function(.data) {
)
TADA_CheckColumns(.data, expected_cols)

# Move detection limit value and unit to TADA Result Measure Value and Unit columns
# this first row copies all over when result is blank (NA) but
# TADA.DetectionQuantitationLimitMeasure.MeasureValue is not and the
# TADA.ResultMeasureValueDataTypes.Flag is not Text
# Imp note: TADA result values are NA for text even though they are not NA in the original result value
# Copy detection limit value and unit to TADA Result Measure Value and Unit columns
# this first row copies all over when TADA.DetectionQuantitationLimitMeasure.MeasureValue is not NA and the
# TADA.ResultMeasureValueDataTypes.Flag is "NA - Not Available"
# Imp note: TADA result values are NA for text and other values (coerced) even though they are not
# NA in the original result value
.data$TADA.ResultMeasureValue <- ifelse(
is.na(.data$TADA.ResultMeasureValue)
& !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue)
& .data$TADA.ResultMeasureValueDataTypes.Flag != "Text",
!is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue)
& .data$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available",
.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue,
.data$TADA.ResultMeasureValue)

# this does the same as above for the units
.data$TADA.ResultMeasure.MeasureUnitCode <- ifelse(
is.na(.data$TADA.ResultMeasure.MeasureUnitCode)
& !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode)
& .data$TADA.ResultMeasureValueDataTypes.Flag != "Text",
!is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode)
& .data$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available",
.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
.data$TADA.ResultMeasure.MeasureUnitCode)

# this updates the TADA.ResultMeasureValueDataTypes.Flag
.data$TADA.ResultMeasureValueDataTypes.Flag <- ifelse(
.data$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available"
& !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue),
Expand All @@ -66,30 +67,31 @@ TADA_IDCensoredData <- function(.data) {
# when the result value is TEXT but there is a specific text value that indicates
# the result is censored (BPQL, BDL, ND)
# and the TADA.DetectionQuantitationLimitMeasure.MeasureValue provided
.data$TADA.ResultMeasureValueDataTypes.Flag <- ifelse(
.data$TADA.ResultMeasureValueDataTypes.Flag == "Text" &
.data$ResultMeasureValue == "BPQL" |
.data$ResultMeasureValue == "BDL" |
.data$ResultMeasureValue == "ND" &
!is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue),
"Result Value/Unit Copied from Detection Limit",
.data$TADA.ResultMeasureValueDataTypes.Flag)
.data$TADA.ResultMeasureValue <- ifelse(
is.na(.data$TADA.ResultMeasureValue)
& !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue)
& .data$ResultMeasureValue == "BPQL" |
.data$ResultMeasureValue == "BDL" |
.data$ResultMeasureValue == "ND" ,
.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue,
.data$TADA.ResultMeasureValue)
# this does the same as above for the units
.data$TADA.ResultMeasure.MeasureUnitCode <- ifelse(
!is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode)
& .data$ResultMeasureValue == "BPQL" |
.data$ResultMeasureValue == "BDL" |
.data$ResultMeasureValue == "ND" ,
.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
.data$TADA.ResultMeasure.MeasureUnitCode)
# if more are added, they need to be included below as well (line 194)
# .data$TADA.ResultMeasureValue <- ifelse(
# !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureValue)
# & (.data$ResultMeasureValue == "BPQL" |
# .data$ResultMeasureValue == "BDL" |
# .data$ResultMeasureValue == "ND") ,
# .data$TADA.DetectionQuantitationLimitMeasure.MeasureValue,
# .data$TADA.ResultMeasureValue)
#
# # this does the same as above for the units
# .data$TADA.ResultMeasure.MeasureUnitCode <- ifelse(
# !is.na(.data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode)
# & (.data$ResultMeasureValue == "BPQL" |
# .data$ResultMeasureValue == "BDL" |
# .data$ResultMeasureValue == "ND") ,
# .data$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
# .data$TADA.ResultMeasure.MeasureUnitCode)
#
# # this updates the TADA.ResultMeasureValueDataTypes.Flag
# .data$TADA.ResultMeasureValueDataTypes.Flag <- ifelse(
# (.data$ResultMeasureValue == "BPQL" |
# .data$ResultMeasureValue == "BDL" |
# .data$ResultMeasureValue == "ND"),
# "Result Value/Unit Copied from Detection Limit",
# .data$TADA.ResultMeasureValueDataTypes.Flag)

# If user has not previously run TADA_FlagMeasureQualifierCode, run it here
# to add column TADA.MeasureQualifier.Flag to allow for using user-supplied
Expand Down Expand Up @@ -140,7 +142,7 @@ TADA_IDCensoredData <- function(.data) {
# NOTE that at this point, TADA.Detection_Type may be NA if there are detection conditions in dataset that are not present in domain table
if (any(cens$TADA.Detection_Type[!is.na(cens$TADA.Detection_Type)] == "ResultDetectionConditionText missing")) {
missing_detcond <- length(cens$TADA.Detection_Type[cens$TADA.Detection_Type == "ResultDetectionConditionText missing"])
print(paste0("TADA_IDCensoredData: There are ", missing_detcond, " results in your dataset that are missing ResultDetectionConditionText. When TADA cannot clearly ID the result as a non-detect based in the metadata provided, TADA requires BOTH ResultDetectionConditionText and DetectionQuantitationLimitTypeName fields to be populated in order to categorize censored data. Please contact the TADA Admins to resolve."))
print(paste0("TADA_IDCensoredData: There are ", missing_detcond, " results in your dataset that are missing ResultDetectionConditionText. TADA requires BOTH ResultDetectionConditionText and DetectionQuantitationLimitTypeName fields to be populated in order to categorize censored data. Please contact the TADA Admins to resolve."))
}

## Let user know when one or more result detection conditions are not in the ref table
Expand Down Expand Up @@ -168,11 +170,32 @@ TADA_IDCensoredData <- function(.data) {

## Create flag for condition and limit type combinations
cens$TADA.CensoredData.Flag <- "Detection condition or detection limit is not documented in TADA reference tables."
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("ResultDetectionConditionText missing"), "Detection condition is missing and required for censored data ID.", cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Non-Detect") & cens$TADA.Limit_Type %in% c("Non-Detect"), "Non-Detect", cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Over-Detect") & cens$TADA.Limit_Type %in% c("Over-Detect"), "Over-Detect", cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Other") & cens$TADA.Limit_Type %in% c("Other"), "Other Condition/Limit Populated", cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Non-Detect", "Over-Detect", "Other") & cens$TADA.Limit_Type %in% c("Non-Detect", "Over-Detect", "Other") & !cens$TADA.Detection_Type == cens$TADA.Limit_Type, "Conflict between Condition and Limit", cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("ResultDetectionConditionText missing"),
"Detection condition is missing and required for censored data ID.",
cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Non-Detect") &
cens$TADA.Limit_Type %in% c("Non-Detect"),
"Non-Detect",
cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Over-Detect") &
cens$TADA.Limit_Type %in% c("Over-Detect"),
"Over-Detect",
cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Other") &
cens$TADA.Limit_Type %in% c("Other"),
"Other Condition/Limit Populated",
cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.Detection_Type %in% c("Non-Detect", "Over-Detect", "Other") &
cens$TADA.Limit_Type %in% c("Non-Detect", "Over-Detect", "Other") &
!cens$TADA.Detection_Type == cens$TADA.Limit_Type,
"Conflict between Condition and Limit",
cens$TADA.CensoredData.Flag)
# Because detection limits for ResultMeasureValue = "BDL", "BPQL", or "ND" are copied to the result value (above), the TADA.CensoredData.Flag should be
# "Non-Detect"
# cens$TADA.CensoredData.Flag <- ifelse(cens$TADA.ResultMeasureValueDataTypes.Flag %in% c("Result Value/Unit Copied from Detection Limit") &
# cens$ResultMeasureValue %in% c("BDL", "BPQL", "ND"),
# "Non-Detect",
# cens$TADA.CensoredData.Flag)

## warn when some limit metadata may be problematic
if ("Conflict between Condition and Limit" %in% cens$TADA.CensoredData.Flag) {
Expand Down Expand Up @@ -251,7 +274,7 @@ TADA_SimpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multi
}

# If user has not previously run TADA_IDCensoredData function, run it here to get required columns and to copy
# detection limit to resut value
# detection limit to result value
if (!"TADA.CensoredData.Flag" %in% names(.data)) {
cens.data <- TADA_IDCensoredData(.data)
} else {
Expand Down
3 changes: 1 addition & 2 deletions R/ResultFlagsDependent.R
Original file line number Diff line number Diff line change
Expand Up @@ -594,8 +594,7 @@ TADA_AutoFilter <- function(.data) {
"ActivityTypeCode"
))

autofilter <- dplyr::filter(.data, TADA.ResultMeasureValueDataTypes.Flag != "NA - Not Available" &
TADA.ResultMeasureValueDataTypes.Flag != "Text" &
autofilter <- dplyr::filter(.data, TADA.ResultMeasureValueDataTypes.Flag != "Text" &
TADA.ResultMeasureValueDataTypes.Flag != "NA - Not Available" &
!is.na(TADA.ResultMeasureValue)) # &
# TADA.ActivityMediaName == "WATER")
Expand Down
16 changes: 9 additions & 7 deletions R/Utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -318,15 +318,15 @@ TADA_ConvertSpecialChars <- function(.data, col) {
is.na(masked) ~ as.character("NA - Not Available"),
TRUE ~ as.character("Numeric")
))

} else {

chars.data$masked <- gsub(" ", "", chars.data$masked) # get rid of white space for subsequent sorting

# Detect special characters in column and populate new flag column with descriptor
# Detect special characters in column and populate new flag column with descriptor
# of the specific type of character/data type
clean.data <- chars.data %>%
dplyr::mutate(flag = dplyr::case_when(
is.na(masked) ~ as.character("NA - Not Available"),
# (masked == "ND") ~ as.character("NA - Not Available"),
(!is.na(suppressWarnings(as.numeric(masked)) == TRUE)) ~ as.character("Numeric"),
(grepl("<", masked) == TRUE) ~ as.character("Less Than"),
(grepl(">", masked) == TRUE) ~ as.character("Greater Than"),
Expand All @@ -338,9 +338,9 @@ TADA_ConvertSpecialChars <- function(.data, col) {
# because * is a special character you have to escape\\ it:
(grepl("\\*", masked) == TRUE) ~ as.character("Approximate Value"),
(!stringi::stri_enc_mark(masked) %in% c("ASCII")) ~ as.character("Non-ASCII Character(s)"),
TRUE ~ "NA - Not Available"
TRUE ~ "Coerced to NA"
))

# Result Values that are numeric ranges with the format #-# are converted to an average of the two numbers expressed in the range.
if (any(clean.data$flag == "Numeric Range - Averaged")) {
numrange <- subset(clean.data, clean.data$flag %in% c("Numeric Range - Averaged"))
Expand All @@ -353,18 +353,20 @@ TADA_ConvertSpecialChars <- function(.data, col) {

clean.data <- plyr::rbind.fill(notnumrange, numrange)
}

# In the new TADA column, convert to numeric and remove some specific special
# characters.
clean.data$masked <- suppressWarnings(as.numeric(stringr::str_replace_all(
clean.data$masked, c("<" = "", ">" = "", "~" = "", "," = "", "%" = "", "\\*" = "")
clean.data$masked, c("<" = "", ">" = "", "~" = "", "%" = "", "\\*" = "")
)))

}

# Rename to original column name, TADA column name, and flag column name
names(clean.data)[names(clean.data) == "orig"] <- col
names(clean.data)[names(clean.data) == "masked"] <- numcol
names(clean.data)[names(clean.data) == "flag"] <- flagcol

clean.data <- TADA_OrderCols(clean.data)

return(clean.data)
Expand Down
121 changes: 121 additions & 0 deletions tests/testthat/test-CensoredDataSuite.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,128 @@ test_that("TADA_SimpleCensoredMethods doesn't drop data", {


test = TADA_RandomNationalTestingSet()

# Copy detection limit value and unit to TADA Result Measure Value and Unit columns
# this first row copies all over when TADA.DetectionQuantitationLimitMeasure.MeasureValue is not NA and the
# TADA.ResultMeasureValueDataTypes.Flag is "NA - Not Available"
# Imp note: TADA result values are NA for text and other values (coerced) even though they are not
# NA in the original result value
test$TADA.ResultMeasureValue <- ifelse(
!is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureValue)
& test$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available",
test$TADA.DetectionQuantitationLimitMeasure.MeasureValue,
test$TADA.ResultMeasureValue)

# this does the same as above for the units
test$TADA.ResultMeasure.MeasureUnitCode <- ifelse(
!is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode)
& test$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available",
test$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
test$TADA.ResultMeasure.MeasureUnitCode)

# this updates the TADA.ResultMeasureValueDataTypes.Flag
test$TADA.ResultMeasureValueDataTypes.Flag <- ifelse(
test$TADA.ResultMeasureValueDataTypes.Flag == "NA - Not Available"
& !is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureValue),
"Result Value/Unit Copied from Detection Limit",
test$TADA.ResultMeasureValueDataTypes.Flag)

# this copies det lim result value and unit over to TADA result value and unit
# when the result value is TEXT but there is a specific text value that indicates
# the result is censored (BPQL, BDL, ND)
# and the TADA.DetectionQuantitationLimitMeasure.MeasureValue provided
# if more are added, they need to be included below as well (line 194)
test$TADA.ResultMeasureValue <- ifelse(
is.na(test$TADA.ResultMeasureValue)
& !is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureValue)
& (test$ResultMeasureValue == "BPQL" |
test$ResultMeasureValue == "BDL" |
test$ResultMeasureValue == "ND") ,
test$TADA.DetectionQuantitationLimitMeasure.MeasureValue,
test$TADA.ResultMeasureValue)

# this does the same as above for the units
test$TADA.ResultMeasure.MeasureUnitCode <- ifelse(
!is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode)
& (test$ResultMeasureValue == "BPQL" |
test$ResultMeasureValue == "BDL" |
test$ResultMeasureValue == "ND") ,
test$TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
test$TADA.ResultMeasure.MeasureUnitCode)

test$TADA.ResultMeasureValueDataTypes.Flag <- ifelse(
test$TADA.ResultMeasureValueDataTypes.Flag == "Text" &
(test$ResultMeasureValue == "BPQL" |
test$ResultMeasureValue == "BDL" |
test$ResultMeasureValue == "ND") &
!is.na(test$TADA.DetectionQuantitationLimitMeasure.MeasureValue),
"Result Value/Unit Copied from Detection Limit",
test$TADA.ResultMeasureValueDataTypes.Flag)

# Result Measure Qualifier codes to identify censored samples.
if (!"TADA.MeasureQualifierCode.Flag" %in% names(test)) {
data_mq_flag <- TADA_FlagMeasureQualifierCode(test)
} else {
data_mq_flag <- test
}

## Identify censored data using TADA.ResultMeasureValueDataTypes.Flag and TADA.MeasureQualifierCode.Flag
cens_rm_flag <- data_mq_flag %>% dplyr::filter(TADA.ResultMeasureValueDataTypes.Flag == "Result Value/Unit Copied from Detection Limit")
cens_mq_flag <- data_mq_flag %>% dplyr::filter(TADA.MeasureQualifierCode.Flag %in% c("Non-Detect", "Over-Detect")) %>%
dplyr::filter(!ResultIdentifier %in% cens_rm_flag$ResultIdentifier)
cens <- cens_rm_flag %>%
rbind(cens_mq_flag)
not_cens <- data_mq_flag %>% dplyr::filter(!ResultIdentifier %in% cens$ResultIdentifier)
not_cens$TADA.CensoredData.Flag <- "Uncensored"

rm(cens_rm_flag, cens_mq_flag, data_mq_flag)


blah = dplyr::select(test, ActivityTypeCode, TADA.ActivityType.Flag,ResultDetectionConditionText,
CharacteristicName,
TADA.CharacteristicName,
ResultMeasureValue,
TADA.ResultMeasureValue,
ResultMeasure.MeasureUnitCode,
TADA.WQXResultUnitConversion,
TADA.ResultMeasureValueDataTypes.Flag,
TADA.ResultMeasure.MeasureUnitCode,
DetectionQuantitationLimitTypeName,
DetectionQuantitationLimitMeasure.MeasureValue,
DetectionQuantitationLimitMeasure.MeasureUnitCode,
TADA.DetectionQuantitationLimitMeasure.MeasureValue,
TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
#TADA.MeasureQualifierCode.Flag,
ProviderName)
write.csv(blah, "blah.csv")

test1 = TADA_IDCensoredData(test)
blah2 = dplyr::select(test1,
ActivityTypeCode,
TADA.ActivityType.Flag,
ResultDetectionConditionText,
CharacteristicName,
TADA.CharacteristicName,
ResultMeasureValue,
TADA.ResultMeasureValue,
ResultMeasure.MeasureUnitCode,
TADA.WQXResultUnitConversion,
TADA.ResultMeasureValueDataTypes.Flag,
TADA.ResultMeasure.MeasureUnitCode,
MeasureQualifierCode,
TADA.MeasureQualifierCode.Flag,
TADA.MeasureQualifierCode.Def,
DetectionQuantitationLimitTypeName,
DetectionQuantitationLimitMeasure.MeasureValue,
DetectionQuantitationLimitMeasure.MeasureUnitCode,
TADA.DetectionQuantitationLimitMeasure.MeasureValue,
TADA.DetectionQuantitationLimitMeasure.MeasureUnitCode,
TADA.DetectionQuantitationLimitMeasure.MeasureValueDataTypes.Flag,
TADA.CensoredData.Flag,
ProviderName
)
write.csv(blah2, "blah2.csv")

test2 = TADA_SimpleCensoredMethods(test1)
test3 = dplyr::select(test2,
ActivityTypeCode,
Expand Down

0 comments on commit b4d6dc6

Please sign in to comment.