diff --git a/dvc.lock b/dvc.lock
index 27a1f244..671610c8 100755
--- a/dvc.lock
+++ b/dvc.lock
@@ -5,17 +5,17 @@ stages:
     deps:
     - path: pipeline/00-ingest.R
       hash: md5
-      md5: c453195da12dd0197e0bdd16f4ef3937
-      size: 23004
+      md5: f02ab180615e40480dbefd49489b8a70
+      size: 25715
     params:
       params.yaml:
         assessment:
           year: '2024'
           date: '2024-01-01'
-          triad: city
+          triad: north
           group: residential
           data_year: '2023'
-          working_year: '2024'
+          working_year: '2025'
         input:
           min_sale_year: '2015'
           max_sale_year: '2023'
@@ -38,28 +38,28 @@ stages:
     outs:
     - path: input/assessment_data.parquet
       hash: md5
-      md5: e4b429a0121c6898b972fa20b42544fd
-      size: 425747228
+      md5: 6abab4a0d303c18ea134043144086331
+      size: 425373072
     - path: input/char_data.parquet
       hash: md5
-      md5: 827c97f9d3bbd3426e8f6fd9136313f8
-      size: 847441146
+      md5: ccb30b53f04515f8eb5197af6da58a61
+      size: 848166656
     - path: input/complex_id_data.parquet
       hash: md5
-      md5: 0e2a42a935106a9b6f50d8250012d98c
-      size: 703255
+      md5: b2ef4dd99dc98260a2a5f102d8bc9457
+      size: 703671
     - path: input/hie_data.parquet
       hash: md5
-      md5: ca86d0e5f29fd252455dc67e2dd40ac1
-      size: 1927927
+      md5: 94e8843c300a4251ab7ba49cee8a38af
+      size: 1909891
     - path: input/land_nbhd_rate_data.parquet
      hash: md5
      md5: f3ec9627322bd271bf2957b7388aaa34
      size: 3873
    - path: input/training_data.parquet
      hash: md5
-      md5: 76d91858f84f57ad2dce9fd292fe1ae2
-      size: 208138341
+      md5: e5d9bdb4ae54e6073337fbc601a06440
+      size: 208846045
  train:
    cmd: Rscript pipeline/01-train.R
    deps:
diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R
index 76e96c05..bb6662a8 100644
--- a/pipeline/00-ingest.R
+++ b/pipeline/00-ingest.R
@@ -286,6 +286,11 @@ message("Adding time features and cleaning")
 
 ## 5.1. Training Data ----------------------------------------------------------
 
+
+#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+# Testing. Multi-card munging --------------------------------------------------
+#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
 # Clean up the training data. Goal is to get it into a publishable format.
 # Final featurization, missingness, etc. is handled via Tidymodels recipes
 training_data_clean <- training_data_w_hie %>%
@@ -399,10 +404,80 @@ training_data_clean <- training_data_w_hie %>%
     !(char_bldg_sf < 300 & !ind_pin_is_multicard),
     !(char_land_sf < 300 & !ind_pin_is_multicard)
   ) %>%
+  as_tibble() #%>%
+  # write_parquet(paths$input$training$local)
+
+# Test new features and multi-card
+#- - - - - - - - - - - - - -
+training_data_full <- training_data_clean %>%
+  group_by(meta_pin, meta_sale_document_num) %>%
+  mutate(meta_card_id = row_number()) %>%
+  ungroup()
+
+# 2. Identify multi-card sales rows
+multicard_sales <- training_data_full %>%
+  filter(ind_pin_is_multicard) %>%
+  select(
+    meta_pin,
+    meta_sale_document_num,
+    meta_card_id,
+    meta_sale_price,
+    char_bldg_sf
+  )
+
+# 3. Summarize total building SF per PIN + doc_no
+# (so you can compute each card's share)
+multicard_total_sf <- multicard_sales %>%
+  group_by(meta_pin, meta_sale_document_num) %>%
+  summarise(
+    total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE),
+    .groups = "drop"
+  )
+
+# 4. Join total SF back onto each multi-card record, and allocate sale price
+multicard_sales_alloc <- multicard_sales %>%
+  left_join(multicard_total_sf, by = c("meta_pin", "meta_sale_document_num")) %>%
+  mutate(
+    sf_share = char_bldg_sf / total_bldg_sf,
+    meta_sale_price_allocated = meta_sale_price * sf_share
+  )
+
+# 5. Reintegrate into the main training data, joining by meta_card_id
+training_data_with_alloc <- training_data_full %>%
+  left_join(
+    multicard_sales_alloc %>%
+      select(meta_pin, meta_sale_document_num, meta_card_id, meta_sale_price_allocated),
+    by = c("meta_pin", "meta_sale_document_num", "meta_card_id")
+  ) %>%
+  # If a multi-card sale, use the allocated sale price; otherwise original
+  mutate(
+    meta_sale_price = if_else(
+      ind_pin_is_multicard,
+      meta_sale_price_allocated,
+      meta_sale_price
+    )
+  )
+
+training_data <- training_data_with_alloc %>%
+  select(-meta_sale_price_allocated, -meta_card_id)
+
+# - - - - - - - -
+# Add two features
+# - - - - - - - -
+
+training_data_clean <- training_data %>%
+  group_by(meta_pin) %>%
+  mutate(
+    char_card_pct_bldg = char_bldg_sf / sum(char_bldg_sf, na.rm = TRUE),
+    char_key_card = char_bldg_sf == max(char_bldg_sf, na.rm = TRUE)
+  ) %>%
+  ungroup() %>%
   as_tibble() %>%
   write_parquet(paths$input$training$local)
 
 
+
+
 ## 5.2. Assessment Data --------------------------------------------------------
 
 # Clean the assessment data. This is the target data that the trained model is
@@ -503,7 +578,15 @@ assessment_data_clean <- assessment_data_w_hie %>%
   relocate(starts_with("sv_"), .after = everything()) %>%
   relocate("year", .after = everything()) %>%
   relocate(starts_with("meta_sale_"), .after = hie_num_expired) %>%
-  as_tibble() %>%
+  group_by(meta_pin) %>%
+  mutate(
+    total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE),
+    char_card_pct_bldg = char_bldg_sf / total_bldg_sf,
+    # Flag as key card if it's the maximum SF (ties will create multiple key cards)
+    char_key_card = if_else(char_bldg_sf == max(char_bldg_sf, na.rm = TRUE), 1, 0)
+  ) %>%
+  ungroup() %>%
+  select(-total_bldg_sf) %>%
   write_parquet(paths$input$assessment$local)
 
 
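For context on the ingest changes above: the new code splits a multi-card PIN's sale price across its cards in proportion to building square footage, so the per-card shares sum to one and the allocated card prices sum back to the PIN-level sale price. Below is a minimal standalone sketch of that behavior (illustrative only; the toy_sale tibble and its values are hypothetical, and the per-group total is computed in a single grouped mutate here rather than the summarise-plus-join used in the patch).

library(dplyr)

# Hypothetical two-card PIN sold on a single deed for $400,000
toy_sale <- tibble(
  meta_pin = "10-01-100-001-0000",
  meta_sale_document_num = "D123",
  meta_card_id = c(1L, 2L),
  char_bldg_sf = c(1500, 500),
  meta_sale_price = 400000
)

toy_alloc <- toy_sale %>%
  group_by(meta_pin, meta_sale_document_num) %>%
  mutate(
    # Each card's share of the PIN's total building SF
    sf_share = char_bldg_sf / sum(char_bldg_sf, na.rm = TRUE),
    meta_sale_price_allocated = meta_sale_price * sf_share
  ) %>%
  ungroup()

# Shares are 0.75 and 0.25, so the allocated prices ($300,000 and $100,000)
# sum back to the original PIN-level sale price
stopifnot(isTRUE(all.equal(sum(toy_alloc$meta_sale_price_allocated), 400000)))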
diff --git a/pipeline/01-train.R b/pipeline/01-train.R
index 1724d7a0..25231d7d 100644
--- a/pipeline/01-train.R
+++ b/pipeline/01-train.R
@@ -7,6 +7,7 @@
 
 # Start the stage timer and clear logs from prior stage
 tictoc::tic.clearlog()
+
 tictoc::tic("Train")
 
 # Load libraries, helpers, and recipes from files
@@ -24,15 +25,20 @@ message("Run type: ", run_type)
 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 message("Preparing model training data")
 
+training_data_full <- read_parquet(paths$input$training$local) %>%
+  filter(!sv_is_outlier) %>%
+  arrange(meta_sale_date)
+
+
 # Load the full set of training data, then arrange by sale date in order to
 # facilitate out-of-time sampling/validation
 # NOTE: It is critical to trim "multicard" sales when training. Multicard means
 # there is multiple buildings on a PIN. Since these sales include multiple
 # buildings, they are typically higher than a "normal" sale and must be removed
-training_data_full <- read_parquet(paths$input$training$local) %>%
-  filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
-  arrange(meta_sale_date)
+# training_data_full <- read_parquet(paths$input$training$local) %>%
+#   filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
+#   arrange(meta_sale_date)
 
 # Create train/test split by time, with most recent observations in the test set
 # We want our best model(s) to be predictive of the future, since properties are
@@ -46,6 +52,8 @@ train <- training(split_data)
 
 # Create a recipe for the training data which removes non-predictor columns and
 # preps categorical data, see R/recipes.R for details
+
+#TODO: Add two card like features here
 train_recipe <- model_main_recipe(
   data = training_data_full,
   pred_vars = params$model$predictor$all,
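The reordered read above feeds an out-of-time train/test split. Assuming the split uses rsample's initial_time_split(), which the surrounding split_data/training() calls suggest, the earliest rows go to the training set and the latest rows to the test set, which is why the data must be arranged by meta_sale_date before splitting. A minimal sketch with hypothetical dates and prices (toy_sales and toy_split are made-up names, not pipeline objects):

library(dplyr)
library(rsample)

# Ten hypothetical sales, already arranged by sale date
toy_sales <- tibble(
  meta_sale_date = as.Date("2023-01-01") + 0:9,
  meta_sale_price = seq(100000, 190000, by = 10000)
)

# With prop = 0.8, the first 8 (earliest) sales land in the training set and
# the last 2 (most recent) sales land in the test set
toy_split <- initial_time_split(toy_sales, prop = 0.8)
range(training(toy_split)$meta_sale_date)  # "2023-01-01" "2023-01-08"
range(testing(toy_split)$meta_sale_date)   # "2023-01-09" "2023-01-10"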
diff --git a/pipeline/02-assess.R b/pipeline/02-assess.R
index 38fa8f25..78edaa96 100644
--- a/pipeline/02-assess.R
+++ b/pipeline/02-assess.R
@@ -91,14 +91,7 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
   # blowing up the PIN-level AV
   group_by(meta_pin) %>%
   mutate(
-    pred_pin_card_sum = ifelse(
-      sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <=
-        params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) |
-        is.na(meta_1yr_pri_board_tot) |
-        n() != 2,
-      sum(pred_card_intermediate_fmv),
-      max(pred_card_intermediate_fmv)
-    )
+    pred_pin_card_sum = sum(pred_card_intermediate_fmv)
   ) %>%
   ungroup()
 
@@ -191,8 +184,6 @@ assessment_pin_data_w_land <- assessment_card_data_round %>%
   )
 
 
-
-
 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 # 5. Prorate and Reapportion ---------------------------------------------------
 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -324,7 +315,7 @@ assessment_card_data_merged %>%
   ) %>%
   ccao::vars_recode(
     cols = any_of(char_vars),
-    code_type = "long",
+    #code_type = "long",
     as_factor = FALSE
   ) %>%
   write_parquet(paths$output$assessment_card$local)
@@ -551,7 +542,7 @@ message("Saving final PIN-level data")
 assessment_pin_data_final %>%
   ccao::vars_recode(
     cols = starts_with("char_"),
-    code_type = "short",
+    #code_type = "short",
     as_factor = FALSE
   ) %>%
   # Coerce columns to their expected Athena output type
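The first 02-assess.R hunk drops the multi-card year-over-year cap. Previously, a two-card PIN whose summed, prorated card values exceeded params$pv$multicard_yoy_cap times the prior-year board total (scaled by 10, as in the removed code) fell back to the largest single card value, and the cap was skipped when the prior-year total was missing or the card count was not two; now pred_pin_card_sum is always the plain sum of card-level values. A toy comparison of the two rules (hypothetical values; multicard_yoy_cap and toy_cards are stand-ins, not pipeline objects):

library(dplyr)

multicard_yoy_cap <- 2  # stand-in for params$pv$multicard_yoy_cap

# Hypothetical two-card PIN; the removed cap compared against 10x the
# prior-year board total, i.e. 150,000 here
toy_cards <- tibble(
  meta_pin = "10-01-100-002-0000",
  pred_card_intermediate_fmv = c(180000, 150000),
  meta_tieback_proration_rate = 1,
  meta_1yr_pri_board_tot = 15000
)

toy_cards %>%
  group_by(meta_pin) %>%
  summarise(
    # Old rule: fall back to the largest card value when the sum jumps past the cap
    pred_pin_card_sum_old = ifelse(
      sum(pred_card_intermediate_fmv) * first(meta_tieback_proration_rate) <=
        multicard_yoy_cap * first(meta_1yr_pri_board_tot) * 10 |
        is.na(first(meta_1yr_pri_board_tot)) |
        n() != 2,
      sum(pred_card_intermediate_fmv),
      max(pred_card_intermediate_fmv)
    ),
    # New rule: always sum the cards
    pred_pin_card_sum_new = sum(pred_card_intermediate_fmv)
  )
# Old: 180,000 (330,000 > 2 * 150,000, so the cap kicks in); new: 330,000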