Patch summary: refreshes the md5/size entries of five input parquet files in
dvc.lock, and adds a temporary shim to pipeline/00-ingest.R that overwrites
sv_is_outlier / sv_outlier_type in training_data with values read from a
sales-flag parquet file, matched on meta_sale_document_num.
NOTE(review): the indentation of the dvc.lock context lines follows the
standard DVC lock-file layout -- confirm against the target file before applying.

diff --git a/dvc.lock b/dvc.lock
index 6ec804ba..d0b093f1 100755
--- a/dvc.lock
+++ b/dvc.lock
@@ -33,20 +33,20 @@ stages:
     outs:
     - path: input/assessment_data.parquet
       hash: md5
-      md5: 5187eb62e2a44b99247356211a5b02d7
-      size: 308810165
+      md5: 28c8299ff7945f0e1ccfe854339435ce
+      size: 309900279
     - path: input/char_data.parquet
       hash: md5
-      md5: 70a5d4c4d80559462adf09d8cd113c00
-      size: 612547374
+      md5: 00ee5862a32841db25bdd04ee101e708
+      size: 616501379
     - path: input/complex_id_data.parquet
       hash: md5
-      md5: f977b7d16738fa9033fde627fc3feaf5
-      size: 702763
+      md5: 33556a9ed684d14edb3adc17bf109dae
+      size: 702489
     - path: input/hie_data.parquet
       hash: md5
-      md5: da793d132f70a14f917635a434cf027e
-      size: 1924350
+      md5: 8d7ad3974f98db6beb6f93eebe4ea092
+      size: 1928855
     - path: input/land_nbhd_rate_data.parquet
       hash: md5
       md5: e508daf5790982c303d6503fe1cb8e2b
@@ -56,8 +56,8 @@ stages:
       size: 2109
     - path: input/training_data.parquet
       hash: md5
-      md5: f90e28c15c0a71edaa4d7ec0257ebc16
-      size: 156744321
+      md5: 00170e54ba27d99cfb343b9cbe2c0288
+      size: 156769394
   train:
     cmd: Rscript pipeline/01-train.R
     deps:
diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R
index bf35512a..351b2833 100644
--- a/pipeline/00-ingest.R
+++ b/pipeline/00-ingest.R
@@ -63,6 +63,25 @@ training_data <- dbGetQuery(
 )
 tictoc::toc()
 
+# NOTE: This is a temporary shim to insert updated sales validation flags
+# that use slightly different groupings/methods vs the production flags. This
+# will be replaced once the production flags are stable
+sales_flags <- read_parquet(paste0(
+  "s3://ccao-ci-test-township-partition-data-warehouse-us-east-1",
+  "/sale/flag/2024-01-22_11:55-charming-damon.parquet"
+))
+
+# Replace the old flags with the new flags by reference
+library(data.table)
+conflicts_prefer(dplyr::between)
+conflict_prefer_all("lubridate")
+setDT(training_data)
+setDT(sales_flags)
+
+training_data[sales_flags, c("sv_is_outlier", "sv_outlier_type") := {
+  .(i.sv_is_outlier, i.sv_outlier_type)
+}, on = .(meta_sale_document_num)]
+
 # Pull all ADDCHARS/HIE data. These are Home Improvement Exemptions (HIEs)
 # stored in the legacy (AS/400) data system
 tictoc::tic("HIE data pulled")