From 38549d9e757e4d4b247a7aa0a252eabc666e7e8e Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 7 Jan 2025 21:27:05 +0000 Subject: [PATCH 01/17] Add hacky multi-card logic --- dvc.lock | 96 ++++++++++++++++++++++---------------------- pipeline/00-ingest.R | 89 ++++++++++++++++++++++++++++++++++++++++ pipeline/01-train.R | 69 +++++++++++++++++-------------- pipeline/02-assess.R | 38 ++++++++++++------ 4 files changed, 201 insertions(+), 91 deletions(-) diff --git a/dvc.lock b/dvc.lock index 27a1f244..7f07f55d 100755 --- a/dvc.lock +++ b/dvc.lock @@ -5,17 +5,17 @@ stages: deps: - path: pipeline/00-ingest.R hash: md5 - md5: c453195da12dd0197e0bdd16f4ef3937 - size: 23004 + md5: 1ad4a6046ce52c0cd6d22838307e1aff + size: 25943 params: params.yaml: assessment: year: '2024' date: '2024-01-01' - triad: city + triad: north group: residential data_year: '2023' - working_year: '2024' + working_year: '2025' input: min_sale_year: '2015' max_sale_year: '2023' @@ -38,50 +38,50 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: e4b429a0121c6898b972fa20b42544fd - size: 425747228 + md5: 88d26b3cb5a56a3d09d7f5a4b54d2fa3 + size: 425743809 - path: input/char_data.parquet hash: md5 - md5: 827c97f9d3bbd3426e8f6fd9136313f8 - size: 847441146 + md5: 37b9bab4c12556b688546a016881cf6e + size: 847100652 - path: input/complex_id_data.parquet hash: md5 - md5: 0e2a42a935106a9b6f50d8250012d98c - size: 703255 + md5: 7efc4a5f530a2b23eedafc2ff06d8b66 + size: 702567 - path: input/hie_data.parquet hash: md5 - md5: ca86d0e5f29fd252455dc67e2dd40ac1 - size: 1927927 + md5: a1a540b7e55ec8d36f9cb222ee6f64d8 + size: 1918937 - path: input/land_nbhd_rate_data.parquet hash: md5 md5: f3ec9627322bd271bf2957b7388aaa34 size: 3873 - path: input/training_data.parquet hash: md5 - md5: 76d91858f84f57ad2dce9fd292fe1ae2 - size: 208138341 + md5: a1931a15c6968ad5be89b2da4218995f + size: 208911843 train: cmd: Rscript pipeline/01-train.R deps: - - path: pipeline/01-train.R - hash: md5 - md5: 46115d48cf066d35b0db14dc13a8d9b3 - size: 17448 - path: input/training_data.parquet hash: md5 - md5: 680e07bdb2a55166b7070155c4ff5a38 - size: 148069926 + md5: a1931a15c6968ad5be89b2da4218995f + size: 208911843 + - path: pipeline/01-train.R + hash: md5 + md5: 6a4409b7f1c44e8e03903be6e298920b + size: 17700 params: params.yaml: cv: split_prop: 0.9 - num_folds: 5 + num_folds: 10 + fold_overlap: 9 initial_set: 20 max_iterations: 50 no_improve: 15 uncertain: 8 best_metric: rmse - input.time_split: 15 model.engine: lightgbm model.hyperparameter: default: @@ -101,7 +101,7 @@ stages: lambda_l2: 0.152 range: num_iterations: - - 40 + - 100 - 2500 learning_rate: - -3.0 @@ -148,13 +148,12 @@ stages: validation_type: random validation_metric: rmse link_max_depth: true - stop_iter: 40 + stop_iter: 50 model.predictor: all: - meta_township_code - meta_nbhd_code - - meta_modeling_group - - meta_tieback_proration_rate + - meta_sale_count_past_n_years - char_yrblt - char_air - char_apts @@ -164,14 +163,16 @@ stages: - char_bldg_sf - char_bsmt - char_bsmt_fin + - char_card_pct_bldg + - char_class - char_ext_wall - char_fbath - char_frpl - - char_gar1_area - char_gar1_att - char_gar1_cnst - char_gar1_size - char_hbath + - char_key_card - char_land_sf - char_heat - char_ncu @@ -179,14 +180,12 @@ stages: - char_roof_cnst - char_rooms - char_tp_dsgn - - char_tp_plan - char_type_resd - char_recent_renovation - loc_longitude - loc_latitude - - loc_env_flood_fema_sfha + - loc_census_tract_geoid - loc_env_flood_fs_factor - - loc_env_flood_fs_risk_direction - loc_env_airport_noise_dnl - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid @@ -212,6 +211,8 @@ stages: - prox_nearest_park_dist_ft - prox_nearest_railroad_dist_ft - prox_nearest_secondary_road_dist_ft + - prox_nearest_university_dist_ft + - prox_nearest_vacant_land_dist_ft - prox_nearest_water_dist_ft - prox_nearest_golf_course_dist_ft - acs5_percent_age_children @@ -237,7 +238,9 @@ stages: - other_tax_bill_rate - other_school_district_elementary_avg_rating - other_school_district_secondary_avg_rating + - ccao_is_active_exe_homeowner - ccao_is_corner_lot + - ccao_n_years_exe_homeowner - time_sale_year - time_sale_day - time_sale_quarter_of_year @@ -249,15 +252,14 @@ stages: categorical: - meta_township_code - meta_nbhd_code - - meta_modeling_group - char_air - char_apts - char_attic_fnsh - char_attic_type - char_bsmt - char_bsmt_fin + - char_class - char_ext_wall - - char_gar1_area - char_gar1_att - char_gar1_cnst - char_gar1_size @@ -265,8 +267,8 @@ stages: - char_porch - char_roof_cnst - char_tp_dsgn - - char_tp_plan - char_type_resd + - loc_census_tract_geoid - loc_tax_municipality_name - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid @@ -301,40 +303,36 @@ stages: - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid - loc_school_unified_district_geoid - pins: - - '13253180150000' - - '17321110470000' - - '05174150240000' toggle.cv_enable: false outs: - path: output/intermediate/timing/model_timing_train.parquet hash: md5 - md5: 98fae7af31e3fee9e5ba6281c2201ed9 - size: 2872 + md5: afd8e7df175204e4e1a0589f768fc512 + size: 2494 - path: output/parameter_final/model_parameter_final.parquet hash: md5 - md5: fd6a04559a01ec21417ded57ce680f17 - size: 8516 + md5: 42d5e8030cef68b122116608d206756a + size: 6403 - path: output/parameter_range/model_parameter_range.parquet hash: md5 - md5: 3b2015c65992cfcc2a46b1c029d62212 + md5: 4e6296ab17f03b1b8c20b374fc8751a3 size: 501 - path: output/parameter_search/model_parameter_search.parquet hash: md5 - md5: 3b2015c65992cfcc2a46b1c029d62212 + md5: 4e6296ab17f03b1b8c20b374fc8751a3 size: 501 - path: output/test_card/model_test_card.parquet hash: md5 - md5: a115c5d7c95f31e8440f8881ca45a144 - size: 2075102 + md5: b1ee0d8ae58521a0cf16f9f407d56a6b + size: 483246 - path: output/workflow/fit/model_workflow_fit.zip hash: md5 - md5: 9c53eaf3b76c70369107fc259fd29dff - size: 11141014 + md5: 8269ac972c881cdf388cce77bf297e4e + size: 9892339 - path: output/workflow/recipe/model_workflow_recipe.rds hash: md5 - md5: d3596ea1b3054ac3cead76d8f4d1a8d7 - size: 1802847 + md5: ec6db9515fddbf97cb6d1acc4f9f55a4 + size: 541507 assess: cmd: Rscript pipeline/02-assess.R deps: diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 76e96c05..3e98996e 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -286,6 +286,25 @@ message("Adding time features and cleaning") ## 5.1. Training Data ---------------------------------------------------------- + +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +# Testing. Multi-card munging -------------------------------------------------- +#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +# Calculate total square footage per PIN and proportion per card +training_data_w_proportions <- training_data_w_hie %>% + # Group by PIN and sale to get total square footage + group_by(meta_pin, meta_sale_document_num) %>% + mutate( + # Total building square footage for the PIN + pin_total_bldg_sf = sum(char_bldg_sf), + # This card's proportion of total square footage + card_sqft_proportion = char_bldg_sf / pin_total_bldg_sf, + # Allocate sale price based on square footage proportion + meta_sale_price_allocated = meta_sale_price * card_sqft_proportion + ) %>% + ungroup() + # Clean up the training data. Goal is to get it into a publishable format. # Final featurization, missingness, etc. is handled via Tidymodels recipes training_data_clean <- training_data_w_hie %>% @@ -399,10 +418,80 @@ training_data_clean <- training_data_w_hie %>% !(char_bldg_sf < 300 & !ind_pin_is_multicard), !(char_land_sf < 300 & !ind_pin_is_multicard) ) %>% + as_tibble() #%>% + # write_parquet(paths$input$training$local) + +# Test new features and multi-card +#- - - - - - - - - - - - - - +training_data_full <- training_data_clean %>% + group_by(meta_pin, meta_sale_document_num) %>% + mutate(meta_card_id = row_number()) %>% + ungroup() + +# 2. Identify multi-card sales rows +multicard_sales <- training_data_full %>% + filter(ind_pin_is_multicard) %>% + select( + meta_pin, + meta_sale_document_num, + meta_card_id, + meta_sale_price, + char_bldg_sf + ) + +# 3. Summarize total building SF per PIN + doc_no +# (so you can compute each card's share) +multicard_total_sf <- multicard_sales %>% + group_by(meta_pin, meta_sale_document_num) %>% + summarise( + total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE), + .groups = "drop" + ) + +# 4. Join total SF back onto each multi-card record, and allocate sale price +multicard_sales_alloc <- multicard_sales %>% + left_join(multicard_total_sf, by = c("meta_pin", "meta_sale_document_num")) %>% + mutate( + sf_share = char_bldg_sf / total_bldg_sf, + meta_sale_price_allocated = meta_sale_price * sf_share + ) + +# 5. Reintegrate into the main training data, joining by meta_card_id +training_data_with_alloc <- training_data_full %>% + left_join( + multicard_sales_alloc %>% + select(meta_pin, meta_sale_document_num, meta_card_id, meta_sale_price_allocated), + by = c("meta_pin", "meta_sale_document_num", "meta_card_id") + ) %>% + # If a multi-card sale, use the allocated sale price; otherwise original + mutate( + meta_sale_price = if_else( + ind_pin_is_multicard, + meta_sale_price_allocated, + meta_sale_price + ) + ) + +training_data <- training_data_with_alloc %>% + select(-meta_sale_price_allocated, -meta_card_id) + +# - - - - - - - - +# Add two features +# - - - - - - - - + +training_data_clean <- training_data %>% + group_by(meta_pin) %>% + mutate( + char_card_pct_bldg = char_bldg_sf / sum(char_bldg_sf, na.rm = TRUE), + char_key_card = char_bldg_sf == max(char_bldg_sf, na.rm = TRUE) + ) %>% + ungroup() %>% as_tibble() %>% write_parquet(paths$input$training$local) + + ## 5.2. Assessment Data -------------------------------------------------------- # Clean the assessment data. This is the target data that the trained model is diff --git a/pipeline/01-train.R b/pipeline/01-train.R index 1724d7a0..ab0fb12c 100644 --- a/pipeline/01-train.R +++ b/pipeline/01-train.R @@ -7,6 +7,7 @@ # Start the stage timer and clear logs from prior stage tictoc::tic.clearlog() + tictoc::tic("Train") # Load libraries, helpers, and recipes from files @@ -24,15 +25,21 @@ message("Run type: ", run_type) #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - message("Preparing model training data") +training_data_full <- read_parquet(paths$input$training$local) %>% + filter(!sv_is_outlier) %>% + arrange(meta_sale_date) %>% + sample_frac(size = 0.20) + + # Load the full set of training data, then arrange by sale date in order to # facilitate out-of-time sampling/validation # NOTE: It is critical to trim "multicard" sales when training. Multicard means # there is multiple buildings on a PIN. Since these sales include multiple # buildings, they are typically higher than a "normal" sale and must be removed -training_data_full <- read_parquet(paths$input$training$local) %>% - filter(!ind_pin_is_multicard, !sv_is_outlier) %>% - arrange(meta_sale_date) +# training_data_full <- read_parquet(paths$input$training$local) %>% +# filter(!ind_pin_is_multicard, !sv_is_outlier) %>% +# arrange(meta_sale_date) # Create train/test split by time, with most recent observations in the test set # We want our best model(s) to be predictive of the future, since properties are @@ -46,6 +53,8 @@ train <- training(split_data) # Create a recipe for the training data which removes non-predictor columns and # preps categorical data, see R/recipes.R for details + +#TODO: Add two card like features here train_recipe <- model_main_recipe( data = training_data_full, pred_vars = params$model$predictor$all, @@ -63,28 +72,28 @@ message("Creating and fitting linear baseline model") # Create a linear model recipe with additional imputation, transformations, # and feature interactions -lin_recipe <- model_lin_recipe( - data = training_data_full %>% - mutate(meta_sale_price = log(meta_sale_price)), - pred_vars = params$model$predictor$all, - cat_vars = params$model$predictor$categorical, - id_vars = params$model$predictor$id -) - -# Create a linear model specification and workflow -lin_model <- parsnip::linear_reg() %>% - set_mode("regression") %>% - set_engine("lm") -lin_wflow <- workflow() %>% - add_model(lin_model) %>% - add_recipe( - recipe = lin_recipe, - blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE) - ) - -# Fit the linear model on the training data -lin_wflow_final_fit <- lin_wflow %>% - fit(data = train %>% mutate(meta_sale_price = log(meta_sale_price))) +# lin_recipe <- model_lin_recipe( +# data = training_data_full %>% +# mutate(meta_sale_price = log(meta_sale_price)), +# pred_vars = params$model$predictor$all, +# cat_vars = params$model$predictor$categorical, +# id_vars = params$model$predictor$id +# ) +# +# # Create a linear model specification and workflow +# lin_model <- parsnip::linear_reg() %>% +# set_mode("regression") %>% +# set_engine("lm") +# lin_wflow <- workflow() %>% +# add_model(lin_model) %>% +# add_recipe( +# recipe = lin_recipe, +# blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE) +# ) +# +# # Fit the linear model on the training data +# lin_wflow_final_fit <- lin_wflow %>% +# fit(data = train %>% mutate(meta_sale_price = log(meta_sale_price))) @@ -388,10 +397,10 @@ message("Finalizing and saving trained model") test %>% mutate( pred_card_initial_fmv = predict(lgbm_wflow_final_fit, test)$.pred, - pred_card_initial_fmv_lin = exp(predict( - lin_wflow_final_fit, - test %>% mutate(meta_sale_price = log(meta_sale_price)) - )$.pred) + # pred_card_initial_fmv_lin = exp(predict( + # lin_wflow_final_fit, + # test %>% mutate(meta_sale_price = log(meta_sale_price)) + # )$.pred) ) %>% select( meta_year, meta_pin, meta_class, meta_card_num, meta_triad_code, @@ -400,7 +409,7 @@ test %>% "prior_far_tot" = params$ratio_study$far_column, "prior_near_tot" = params$ratio_study$near_column )), - pred_card_initial_fmv, pred_card_initial_fmv_lin, + pred_card_initial_fmv, #pred_card_initial_fmv_lin, meta_sale_price, meta_sale_date, meta_sale_document_num ) %>% # Prior year values are AV, not FMV. Multiply by 10 to get FMV for residential diff --git a/pipeline/02-assess.R b/pipeline/02-assess.R index f7c45ca8..fe8da55b 100644 --- a/pipeline/02-assess.R +++ b/pipeline/02-assess.R @@ -41,7 +41,19 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local) # Load the data for assessment. This is the universe of CARDs (not # PINs) that needs values. Use the trained lightgbm model to estimate a single # fair-market value for each card + + + + assessment_card_data_pred <- read_parquet(paths$input$assessment$local) %>% + group_by(meta_pin) %>% + mutate( + total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE), + char_card_pct_bldg = char_bldg_sf / total_bldg_sf, + # Flag as key card if it’s the maximum SF (ties will create multiple key cards) + char_key_card = if_else(char_bldg_sf == max(char_bldg_sf, na.rm = TRUE), 1, 0) + ) %>% + ungroup() %>% as_tibble() %>% mutate( pred_card_initial_fmv = predict( @@ -91,14 +103,7 @@ assessment_card_data_mc <- assessment_card_data_pred %>% # blowing up the PIN-level AV group_by(meta_pin) %>% mutate( - pred_pin_card_sum = ifelse( - sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <= - params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) | - is.na(meta_1yr_pri_board_tot) | - n() != 2, - sum(pred_card_intermediate_fmv), - max(pred_card_intermediate_fmv) - ) + pred_pin_card_sum = sum(pred_card_intermediate_fmv) ) %>% ungroup() @@ -175,13 +180,22 @@ assessment_pin_data_w_land <- assessment_card_data_round %>% pred_pin_final_fmv_round_no_prorate * params$pv$land_pct_of_total_cap, TRUE ~ char_land_sf * land_rate_per_sqft )), + # If the land $/sqft is missing, just use the max capped land value as a + # default (usually 50% of the predicted value). Data doesn't usually get + # land $/sqft until the beginning of the year we're modeling for, but a + # predicted land value is required to calculate the final estimated FMV. As + # such, setting this default lets us start modeling before we receive the + # finalized land $/sqft rates + pred_pin_final_fmv_land = ifelse( + is.na(pred_pin_final_fmv_land), + pred_pin_final_fmv_round_no_prorate * params$pv$land_pct_of_total_cap, + pred_pin_final_fmv_land + ), # Keep the uncapped value for display in desk review pred_pin_uncapped_fmv_land = ceiling(char_land_sf * land_rate_per_sqft) ) - - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 5. Prorate and Reapportion --------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -313,7 +327,7 @@ assessment_card_data_merged %>% ) %>% ccao::vars_recode( cols = any_of(char_vars), - code_type = "long", + #code_type = "long", as_factor = FALSE ) %>% write_parquet(paths$output$assessment_card$local) @@ -540,7 +554,7 @@ message("Saving final PIN-level data") assessment_pin_data_final %>% ccao::vars_recode( cols = starts_with("char_"), - code_type = "short", + #code_type = "short", as_factor = FALSE ) %>% # Coerce columns to their expected Athena output type From 65740c7071a3b78989ad68e720fc5cdebe976655 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 7 Jan 2025 21:42:11 +0000 Subject: [PATCH 02/17] Bring features into assess data --- pipeline/00-ingest.R | 10 +++++++++- pipeline/02-assess.R | 12 ------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 3e98996e..6e906e64 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -592,7 +592,15 @@ assessment_data_clean <- assessment_data_w_hie %>% relocate(starts_with("sv_"), .after = everything()) %>% relocate("year", .after = everything()) %>% relocate(starts_with("meta_sale_"), .after = hie_num_expired) %>% - as_tibble() %>% + group_by(meta_pin) %>% + mutate( + total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE), + char_card_pct_bldg = char_bldg_sf / total_bldg_sf, + # Flag as key card if it’s the maximum SF (ties will create multiple key cards) + char_key_card = if_else(char_bldg_sf == max(char_bldg_sf, na.rm = TRUE), 1, 0) + ) %>% + ungroup() %>% + select(-total_bldg_sf) %>% write_parquet(paths$input$assessment$local) diff --git a/pipeline/02-assess.R b/pipeline/02-assess.R index fe8da55b..78edaa96 100644 --- a/pipeline/02-assess.R +++ b/pipeline/02-assess.R @@ -41,19 +41,7 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local) # Load the data for assessment. This is the universe of CARDs (not # PINs) that needs values. Use the trained lightgbm model to estimate a single # fair-market value for each card - - - - assessment_card_data_pred <- read_parquet(paths$input$assessment$local) %>% - group_by(meta_pin) %>% - mutate( - total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE), - char_card_pct_bldg = char_bldg_sf / total_bldg_sf, - # Flag as key card if it’s the maximum SF (ties will create multiple key cards) - char_key_card = if_else(char_bldg_sf == max(char_bldg_sf, na.rm = TRUE), 1, 0) - ) %>% - ungroup() %>% as_tibble() %>% mutate( pred_card_initial_fmv = predict( From 8d7df91e4de0d55efdd093d92af1929171148721 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 7 Jan 2025 21:46:18 +0000 Subject: [PATCH 03/17] Remove subset on train data --- pipeline/01-train.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipeline/01-train.R b/pipeline/01-train.R index ab0fb12c..bbcf5b59 100644 --- a/pipeline/01-train.R +++ b/pipeline/01-train.R @@ -27,8 +27,7 @@ message("Preparing model training data") training_data_full <- read_parquet(paths$input$training$local) %>% filter(!sv_is_outlier) %>% - arrange(meta_sale_date) %>% - sample_frac(size = 0.20) + arrange(meta_sale_date) # Load the full set of training data, then arrange by sale date in order to From 2f3db8723ad1d186498db1f4166ea83de7f43065 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 8 Jan 2025 16:24:19 +0000 Subject: [PATCH 04/17] Only change ingest lock file, restore the rest to main --- dvc.lock | 70 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/dvc.lock b/dvc.lock index 7f07f55d..1e396949 100755 --- a/dvc.lock +++ b/dvc.lock @@ -60,28 +60,28 @@ stages: hash: md5 md5: a1931a15c6968ad5be89b2da4218995f size: 208911843 - train: + train: cmd: Rscript pipeline/01-train.R deps: - - path: input/training_data.parquet - hash: md5 - md5: a1931a15c6968ad5be89b2da4218995f - size: 208911843 - path: pipeline/01-train.R hash: md5 - md5: 6a4409b7f1c44e8e03903be6e298920b - size: 17700 + md5: 46115d48cf066d35b0db14dc13a8d9b3 + size: 17448 + - path: input/training_data.parquet + hash: md5 + md5: 680e07bdb2a55166b7070155c4ff5a38 + size: 148069926 params: params.yaml: cv: split_prop: 0.9 - num_folds: 10 - fold_overlap: 9 + num_folds: 5 initial_set: 20 max_iterations: 50 no_improve: 15 uncertain: 8 best_metric: rmse + input.time_split: 15 model.engine: lightgbm model.hyperparameter: default: @@ -101,7 +101,7 @@ stages: lambda_l2: 0.152 range: num_iterations: - - 100 + - 40 - 2500 learning_rate: - -3.0 @@ -148,12 +148,13 @@ stages: validation_type: random validation_metric: rmse link_max_depth: true - stop_iter: 50 + stop_iter: 40 model.predictor: all: - meta_township_code - meta_nbhd_code - - meta_sale_count_past_n_years + - meta_modeling_group + - meta_tieback_proration_rate - char_yrblt - char_air - char_apts @@ -163,16 +164,14 @@ stages: - char_bldg_sf - char_bsmt - char_bsmt_fin - - char_card_pct_bldg - - char_class - char_ext_wall - char_fbath - char_frpl + - char_gar1_area - char_gar1_att - char_gar1_cnst - char_gar1_size - char_hbath - - char_key_card - char_land_sf - char_heat - char_ncu @@ -180,12 +179,14 @@ stages: - char_roof_cnst - char_rooms - char_tp_dsgn + - char_tp_plan - char_type_resd - char_recent_renovation - loc_longitude - loc_latitude - - loc_census_tract_geoid + - loc_env_flood_fema_sfha - loc_env_flood_fs_factor + - loc_env_flood_fs_risk_direction - loc_env_airport_noise_dnl - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid @@ -211,8 +212,6 @@ stages: - prox_nearest_park_dist_ft - prox_nearest_railroad_dist_ft - prox_nearest_secondary_road_dist_ft - - prox_nearest_university_dist_ft - - prox_nearest_vacant_land_dist_ft - prox_nearest_water_dist_ft - prox_nearest_golf_course_dist_ft - acs5_percent_age_children @@ -238,9 +237,7 @@ stages: - other_tax_bill_rate - other_school_district_elementary_avg_rating - other_school_district_secondary_avg_rating - - ccao_is_active_exe_homeowner - ccao_is_corner_lot - - ccao_n_years_exe_homeowner - time_sale_year - time_sale_day - time_sale_quarter_of_year @@ -252,14 +249,15 @@ stages: categorical: - meta_township_code - meta_nbhd_code + - meta_modeling_group - char_air - char_apts - char_attic_fnsh - char_attic_type - char_bsmt - char_bsmt_fin - - char_class - char_ext_wall + - char_gar1_area - char_gar1_att - char_gar1_cnst - char_gar1_size @@ -267,8 +265,8 @@ stages: - char_porch - char_roof_cnst - char_tp_dsgn + - char_tp_plan - char_type_resd - - loc_census_tract_geoid - loc_tax_municipality_name - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid @@ -303,36 +301,40 @@ stages: - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid - loc_school_unified_district_geoid + pins: + - '13253180150000' + - '17321110470000' + - '05174150240000' toggle.cv_enable: false outs: - path: output/intermediate/timing/model_timing_train.parquet hash: md5 - md5: afd8e7df175204e4e1a0589f768fc512 - size: 2494 + md5: 98fae7af31e3fee9e5ba6281c2201ed9 + size: 2872 - path: output/parameter_final/model_parameter_final.parquet hash: md5 - md5: 42d5e8030cef68b122116608d206756a - size: 6403 + md5: fd6a04559a01ec21417ded57ce680f17 + size: 8516 - path: output/parameter_range/model_parameter_range.parquet hash: md5 - md5: 4e6296ab17f03b1b8c20b374fc8751a3 + md5: 3b2015c65992cfcc2a46b1c029d62212 size: 501 - path: output/parameter_search/model_parameter_search.parquet hash: md5 - md5: 4e6296ab17f03b1b8c20b374fc8751a3 + md5: 3b2015c65992cfcc2a46b1c029d62212 size: 501 - path: output/test_card/model_test_card.parquet hash: md5 - md5: b1ee0d8ae58521a0cf16f9f407d56a6b - size: 483246 + md5: a115c5d7c95f31e8440f8881ca45a144 + size: 2075102 - path: output/workflow/fit/model_workflow_fit.zip hash: md5 - md5: 8269ac972c881cdf388cce77bf297e4e - size: 9892339 + md5: 9c53eaf3b76c70369107fc259fd29dff + size: 11141014 - path: output/workflow/recipe/model_workflow_recipe.rds hash: md5 - md5: ec6db9515fddbf97cb6d1acc4f9f55a4 - size: 541507 + md5: d3596ea1b3054ac3cead76d8f4d1a8d7 + size: 1802847 assess: cmd: Rscript pipeline/02-assess.R deps: From d079c9a5a27b179c50a43325318f1516816f4c3d Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 8 Jan 2025 16:29:37 +0000 Subject: [PATCH 05/17] Remove meaning block --- pipeline/00-ingest.R | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 6e906e64..bb6662a8 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -291,20 +291,6 @@ message("Adding time features and cleaning") # Testing. Multi-card munging -------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# Calculate total square footage per PIN and proportion per card -training_data_w_proportions <- training_data_w_hie %>% - # Group by PIN and sale to get total square footage - group_by(meta_pin, meta_sale_document_num) %>% - mutate( - # Total building square footage for the PIN - pin_total_bldg_sf = sum(char_bldg_sf), - # This card's proportion of total square footage - card_sqft_proportion = char_bldg_sf / pin_total_bldg_sf, - # Allocate sale price based on square footage proportion - meta_sale_price_allocated = meta_sale_price * card_sqft_proportion - ) %>% - ungroup() - # Clean up the training data. Goal is to get it into a publishable format. # Final featurization, missingness, etc. is handled via Tidymodels recipes training_data_clean <- training_data_w_hie %>% From d43727e07f86a4713265d239ef5a54f00f170a51 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 8 Jan 2025 17:07:17 +0000 Subject: [PATCH 06/17] Update training data --- dvc.lock | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dvc.lock b/dvc.lock index 1e396949..9753c032 100755 --- a/dvc.lock +++ b/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: pipeline/00-ingest.R hash: md5 - md5: 1ad4a6046ce52c0cd6d22838307e1aff - size: 25943 + md5: f02ab180615e40480dbefd49489b8a70 + size: 25715 params: params.yaml: assessment: @@ -38,29 +38,29 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: 88d26b3cb5a56a3d09d7f5a4b54d2fa3 - size: 425743809 + md5: f21e7e8ceb757f6ca50100ebf2585095 + size: 426074126 - path: input/char_data.parquet hash: md5 - md5: 37b9bab4c12556b688546a016881cf6e - size: 847100652 + md5: f4cd84f4a70657c77d93cf75e0596409 + size: 848642630 - path: input/complex_id_data.parquet hash: md5 - md5: 7efc4a5f530a2b23eedafc2ff06d8b66 - size: 702567 + md5: 76666aad9263ea3ebbee4d1ae195d66f + size: 704849 - path: input/hie_data.parquet hash: md5 - md5: a1a540b7e55ec8d36f9cb222ee6f64d8 - size: 1918937 + md5: 0849e7146d0c7fc3412134f5e9903652 + size: 1922244 - path: input/land_nbhd_rate_data.parquet hash: md5 md5: f3ec9627322bd271bf2957b7388aaa34 size: 3873 - path: input/training_data.parquet hash: md5 - md5: a1931a15c6968ad5be89b2da4218995f - size: 208911843 - train: + md5: 864af5260ebe80b78f7dccb2c83d4e55 + size: 208854739 + train: cmd: Rscript pipeline/01-train.R deps: - path: pipeline/01-train.R From 06447fe6e7cec437ad3e2138dfe14bc839f84fd8 Mon Sep 17 00:00:00 2001 From: Dan Snow Date: Wed, 8 Jan 2025 17:49:05 +0000 Subject: [PATCH 07/17] Update DVC lock file with new multi-card features --- dvc.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dvc.lock b/dvc.lock index 9753c032..671610c8 100755 --- a/dvc.lock +++ b/dvc.lock @@ -38,28 +38,28 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: f21e7e8ceb757f6ca50100ebf2585095 - size: 426074126 + md5: 6abab4a0d303c18ea134043144086331 + size: 425373072 - path: input/char_data.parquet hash: md5 - md5: f4cd84f4a70657c77d93cf75e0596409 - size: 848642630 + md5: ccb30b53f04515f8eb5197af6da58a61 + size: 848166656 - path: input/complex_id_data.parquet hash: md5 - md5: 76666aad9263ea3ebbee4d1ae195d66f - size: 704849 + md5: b2ef4dd99dc98260a2a5f102d8bc9457 + size: 703671 - path: input/hie_data.parquet hash: md5 - md5: 0849e7146d0c7fc3412134f5e9903652 - size: 1922244 + md5: 94e8843c300a4251ab7ba49cee8a38af + size: 1909891 - path: input/land_nbhd_rate_data.parquet hash: md5 md5: f3ec9627322bd271bf2957b7388aaa34 size: 3873 - path: input/training_data.parquet hash: md5 - md5: 864af5260ebe80b78f7dccb2c83d4e55 - size: 208854739 + md5: e5d9bdb4ae54e6073337fbc601a06440 + size: 208846045 train: cmd: Rscript pipeline/01-train.R deps: From 21af34302f17a0fad09df164f761120fcee57da5 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 8 Jan 2025 19:26:16 +0000 Subject: [PATCH 08/17] Add back lin model --- pipeline/01-train.R | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/pipeline/01-train.R b/pipeline/01-train.R index bbcf5b59..2166b7ee 100644 --- a/pipeline/01-train.R +++ b/pipeline/01-train.R @@ -71,28 +71,28 @@ message("Creating and fitting linear baseline model") # Create a linear model recipe with additional imputation, transformations, # and feature interactions -# lin_recipe <- model_lin_recipe( -# data = training_data_full %>% -# mutate(meta_sale_price = log(meta_sale_price)), -# pred_vars = params$model$predictor$all, -# cat_vars = params$model$predictor$categorical, -# id_vars = params$model$predictor$id -# ) -# -# # Create a linear model specification and workflow -# lin_model <- parsnip::linear_reg() %>% -# set_mode("regression") %>% -# set_engine("lm") -# lin_wflow <- workflow() %>% -# add_model(lin_model) %>% -# add_recipe( -# recipe = lin_recipe, -# blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE) -# ) -# -# # Fit the linear model on the training data -# lin_wflow_final_fit <- lin_wflow %>% -# fit(data = train %>% mutate(meta_sale_price = log(meta_sale_price))) +lin_recipe <- model_lin_recipe( + data = training_data_full %>% + mutate(meta_sale_price = log(meta_sale_price)), + pred_vars = params$model$predictor$all, + cat_vars = params$model$predictor$categorical, + id_vars = params$model$predictor$id +) + +# Create a linear model specification and workflow +lin_model <- parsnip::linear_reg() %>% + set_mode("regression") %>% + set_engine("lm") +lin_wflow <- workflow() %>% + add_model(lin_model) %>% + add_recipe( + recipe = lin_recipe, + blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE) + ) + +# Fit the linear model on the training data +lin_wflow_final_fit <- lin_wflow %>% + fit(data = train %>% mutate(meta_sale_price = log(meta_sale_price))) @@ -396,10 +396,10 @@ message("Finalizing and saving trained model") test %>% mutate( pred_card_initial_fmv = predict(lgbm_wflow_final_fit, test)$.pred, - # pred_card_initial_fmv_lin = exp(predict( - # lin_wflow_final_fit, - # test %>% mutate(meta_sale_price = log(meta_sale_price)) - # )$.pred) + pred_card_initial_fmv_lin = exp(predict( + lin_wflow_final_fit, + test %>% mutate(meta_sale_price = log(meta_sale_price)) + )$.pred) ) %>% select( meta_year, meta_pin, meta_class, meta_card_num, meta_triad_code, From 4a291690d622b4724e417fdf64e36966716d37c4 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 9 Jan 2025 15:32:25 +0000 Subject: [PATCH 09/17] Re-add lin pred fmv --- pipeline/01-train.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/01-train.R b/pipeline/01-train.R index 2166b7ee..25231d7d 100644 --- a/pipeline/01-train.R +++ b/pipeline/01-train.R @@ -408,7 +408,7 @@ test %>% "prior_far_tot" = params$ratio_study$far_column, "prior_near_tot" = params$ratio_study$near_column )), - pred_card_initial_fmv, #pred_card_initial_fmv_lin, + pred_card_initial_fmv, pred_card_initial_fmv_lin, meta_sale_price, meta_sale_date, meta_sale_document_num ) %>% # Prior year values are AV, not FMV. Multiply by 10 to get FMV for residential From 7be12d6357f0e522377c4e03a8226023c0b9a46f Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 14 Jan 2025 21:38:33 +0000 Subject: [PATCH 10/17] Fix 0 price by dropping values --- pipeline/00-ingest.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index bb6662a8..7384b9f3 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -456,7 +456,7 @@ training_data_with_alloc <- training_data_full %>% meta_sale_price_allocated, meta_sale_price ) - ) + ) %>% filter(meta_sale_price != 0) training_data <- training_data_with_alloc %>% select(-meta_sale_price_allocated, -meta_card_id) From dd62d5351957aa3070b912bbd20284400e660382 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 14 Jan 2025 22:06:37 +0000 Subject: [PATCH 11/17] Unfreeze --- dvc.yaml | 259 +++++++++++++++++++++++++++---------------------------- 1 file changed, 129 insertions(+), 130 deletions(-) diff --git a/dvc.yaml b/dvc.yaml index 5cfdb69f..09904d42 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -5,53 +5,52 @@ stages: Ingest training and assessment data from Athena + generate townhome complex identifiers deps: - - pipeline/00-ingest.R + - pipeline/00-ingest.R params: - - assessment - - input + - assessment + - input outs: - - input/assessment_data.parquet - - input/char_data.parquet - - input/complex_id_data.parquet - - input/hie_data.parquet - - input/land_nbhd_rate_data.parquet - - input/training_data.parquet + - input/assessment_data.parquet + - input/char_data.parquet + - input/complex_id_data.parquet + - input/hie_data.parquet + - input/land_nbhd_rate_data.parquet + - input/training_data.parquet - frozen: true train: cmd: Rscript pipeline/01-train.R desc: > Train a LightGBM model with cross-validation. Generate model objects, data recipes, and predictions on the test set (most recent 10% of sales) deps: - - pipeline/01-train.R - - input/training_data.parquet + - pipeline/01-train.R + - input/training_data.parquet params: - - cv - - model.engine - - model.hyperparameter - - model.objective - - model.parameter - - model.predictor - - model.seed - - model.verbose - - ratio_study - - toggle.cv_enable + - cv + - model.engine + - model.hyperparameter + - model.objective + - model.parameter + - model.predictor + - model.seed + - model.verbose + - ratio_study + - toggle.cv_enable outs: - - output/intermediate/timing/model_timing_train.parquet: - cache: false - - output/parameter_final/model_parameter_final.parquet: - cache: false - - output/parameter_range/model_parameter_range.parquet: - cache: false - - output/parameter_search/model_parameter_search.parquet: - cache: false - - output/test_card/model_test_card.parquet: - cache: false - - output/workflow/fit/model_workflow_fit.zip: - cache: false - - output/workflow/recipe/model_workflow_recipe.rds: - cache: false + - output/intermediate/timing/model_timing_train.parquet: + cache: false + - output/parameter_final/model_parameter_final.parquet: + cache: false + - output/parameter_range/model_parameter_range.parquet: + cache: false + - output/parameter_search/model_parameter_search.parquet: + cache: false + - output/test_card/model_test_card.parquet: + cache: false + - output/workflow/fit/model_workflow_fit.zip: + cache: false + - output/workflow/recipe/model_workflow_recipe.rds: + cache: false assess: cmd: Rscript pipeline/02-assess.R @@ -60,25 +59,25 @@ stages: County. Also generate flags, calculate land values, and make any post-modeling changes deps: - - pipeline/02-assess.R - - input/training_data.parquet - - input/assessment_data.parquet - - input/complex_id_data.parquet - - input/land_nbhd_rate_data.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/02-assess.R + - input/training_data.parquet + - input/assessment_data.parquet + - input/complex_id_data.parquet + - input/land_nbhd_rate_data.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - assessment - - pv - - ratio_study - - model.predictor.all + - assessment + - pv + - ratio_study + - model.predictor.all outs: - - output/assessment_card/model_assessment_card.parquet: - cache: false - - output/assessment_pin/model_assessment_pin.parquet: - cache: false - - output/intermediate/timing/model_timing_assess.parquet: - cache: false + - output/assessment_card/model_assessment_card.parquet: + cache: false + - output/assessment_pin/model_assessment_pin.parquet: + cache: false + - output/intermediate/timing/model_timing_assess.parquet: + cache: false evaluate: cmd: Rscript pipeline/03-evaluate.R @@ -88,23 +87,23 @@ stages: 2. An assessor-specific ratio study comparing estimated assessments to the previous year's sales deps: - - pipeline/03-evaluate.R - - output/test_card/model_test_card.parquet - - output/assessment_pin/model_assessment_pin.parquet + - pipeline/03-evaluate.R + - output/test_card/model_test_card.parquet + - output/assessment_pin/model_assessment_pin.parquet params: - - assessment - - ratio_study + - assessment + - ratio_study outs: - - output/performance/model_performance_test.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_test.parquet: - cache: false - - output/performance/model_performance_assessment.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_assessment.parquet: - cache: false - - output/intermediate/timing/model_timing_evaluate.parquet: - cache: false + - output/performance/model_performance_test.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_test.parquet: + cache: false + - output/performance/model_performance_assessment.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_assessment.parquet: + cache: false + - output/intermediate/timing/model_timing_evaluate.parquet: + cache: false interpret: cmd: Rscript pipeline/04-interpret.R @@ -112,25 +111,25 @@ stages: Generate SHAP values for each card and feature as well as feature importance metrics for each feature deps: - - pipeline/04-interpret.R - - input/assessment_data.parquet - - input/training_data.parquet - - output/assessment_card/model_assessment_card.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/04-interpret.R + - input/assessment_data.parquet + - input/training_data.parquet + - output/assessment_card/model_assessment_card.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - toggle.shap_enable - - toggle.comp_enable - - model.predictor.all + - toggle.shap_enable + - toggle.comp_enable + - model.predictor.all outs: - - output/shap/model_shap.parquet: - cache: false - - output/feature_importance/model_feature_importance.parquet: - cache: false - - output/intermediate/timing/model_timing_interpret.parquet: - cache: false - - output/comp/model_comp.parquet: - cache: false + - output/shap/model_shap.parquet: + cache: false + - output/feature_importance/model_feature_importance.parquet: + cache: false + - output/intermediate/timing/model_timing_interpret.parquet: + cache: false + - output/comp/model_comp.parquet: + cache: false finalize: cmd: Rscript pipeline/05-finalize.R @@ -138,28 +137,28 @@ stages: Save run timings and run metadata to disk and render a performance report using Quarto. deps: - - pipeline/05-finalize.R - - output/intermediate/timing/model_timing_train.parquet - - output/intermediate/timing/model_timing_assess.parquet - - output/intermediate/timing/model_timing_evaluate.parquet - - output/intermediate/timing/model_timing_interpret.parquet + - pipeline/05-finalize.R + - output/intermediate/timing/model_timing_train.parquet + - output/intermediate/timing/model_timing_assess.parquet + - output/intermediate/timing/model_timing_evaluate.parquet + - output/intermediate/timing/model_timing_interpret.parquet params: - - run_note - - toggle - - input - - cv - - model - - pv - - ratio_study + - run_note + - toggle + - input + - cv + - model + - pv + - ratio_study outs: - - output/intermediate/timing/model_timing_finalize.parquet: - cache: false - - output/timing/model_timing.parquet: - cache: false - - output/metadata/model_metadata.parquet: - cache: false - - reports/performance/performance.html: - cache: false + - output/intermediate/timing/model_timing_finalize.parquet: + cache: false + - output/timing/model_timing.parquet: + cache: false + - output/metadata/model_metadata.parquet: + cache: false + - reports/performance/performance.html: + cache: false upload: cmd: Rscript pipeline/06-upload.R @@ -169,25 +168,25 @@ stages: outputs prior to upload and attach a unique run ID. This step requires access to the CCAO Data AWS account, and so is assumed to be internal-only deps: - - pipeline/06-upload.R - - output/parameter_final/model_parameter_final.parquet - - output/parameter_range/model_parameter_range.parquet - - output/parameter_search/model_parameter_search.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds - - output/test_card/model_test_card.parquet - - output/assessment_card/model_assessment_card.parquet - - output/assessment_pin/model_assessment_pin.parquet - - output/performance/model_performance_test.parquet - - output/performance_quantile/model_performance_quantile_test.parquet - - output/performance/model_performance_assessment.parquet - - output/performance_quantile/model_performance_quantile_assessment.parquet - - output/shap/model_shap.parquet - - output/comp/model_comp.parquet - - output/feature_importance/model_feature_importance.parquet - - output/metadata/model_metadata.parquet - - output/timing/model_timing.parquet - - reports/performance/performance.html + - pipeline/06-upload.R + - output/parameter_final/model_parameter_final.parquet + - output/parameter_range/model_parameter_range.parquet + - output/parameter_search/model_parameter_search.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds + - output/test_card/model_test_card.parquet + - output/assessment_card/model_assessment_card.parquet + - output/assessment_pin/model_assessment_pin.parquet + - output/performance/model_performance_test.parquet + - output/performance_quantile/model_performance_quantile_test.parquet + - output/performance/model_performance_assessment.parquet + - output/performance_quantile/model_performance_quantile_assessment.parquet + - output/shap/model_shap.parquet + - output/comp/model_comp.parquet + - output/feature_importance/model_feature_importance.parquet + - output/metadata/model_metadata.parquet + - output/timing/model_timing.parquet + - reports/performance/performance.html export: cmd: Rscript pipeline/07-export.R @@ -196,11 +195,11 @@ stages: run. NOT automatically run since it is typically only run once. Manually run once a model is selected deps: - - pipeline/07-export.R + - pipeline/07-export.R params: - - assessment.year - - input.min_sale_year - - input.max_sale_year - - ratio_study - - export + - assessment.year + - input.min_sale_year + - input.max_sale_year + - ratio_study + - export frozen: true From 048d11ef7759e06ac14184d7e929d5441fa49c6c Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 14 Jan 2025 22:11:25 +0000 Subject: [PATCH 12/17] Revert dvc yaml and unfreeze ingest --- dvc.yaml | 259 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 130 insertions(+), 129 deletions(-) diff --git a/dvc.yaml b/dvc.yaml index 09904d42..c821d5ba 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -5,52 +5,53 @@ stages: Ingest training and assessment data from Athena + generate townhome complex identifiers deps: - - pipeline/00-ingest.R + - pipeline/00-ingest.R params: - - assessment - - input + - assessment + - input outs: - - input/assessment_data.parquet - - input/char_data.parquet - - input/complex_id_data.parquet - - input/hie_data.parquet - - input/land_nbhd_rate_data.parquet - - input/training_data.parquet + - input/assessment_data.parquet + - input/char_data.parquet + - input/complex_id_data.parquet + - input/hie_data.parquet + - input/land_nbhd_rate_data.parquet + - input/training_data.parquet + frozen: false train: cmd: Rscript pipeline/01-train.R desc: > Train a LightGBM model with cross-validation. Generate model objects, data recipes, and predictions on the test set (most recent 10% of sales) deps: - - pipeline/01-train.R - - input/training_data.parquet + - pipeline/01-train.R + - input/training_data.parquet params: - - cv - - model.engine - - model.hyperparameter - - model.objective - - model.parameter - - model.predictor - - model.seed - - model.verbose - - ratio_study - - toggle.cv_enable + - cv + - model.engine + - model.hyperparameter + - model.objective + - model.parameter + - model.predictor + - model.seed + - model.verbose + - ratio_study + - toggle.cv_enable outs: - - output/intermediate/timing/model_timing_train.parquet: - cache: false - - output/parameter_final/model_parameter_final.parquet: - cache: false - - output/parameter_range/model_parameter_range.parquet: - cache: false - - output/parameter_search/model_parameter_search.parquet: - cache: false - - output/test_card/model_test_card.parquet: - cache: false - - output/workflow/fit/model_workflow_fit.zip: - cache: false - - output/workflow/recipe/model_workflow_recipe.rds: - cache: false + - output/intermediate/timing/model_timing_train.parquet: + cache: false + - output/parameter_final/model_parameter_final.parquet: + cache: false + - output/parameter_range/model_parameter_range.parquet: + cache: false + - output/parameter_search/model_parameter_search.parquet: + cache: false + - output/test_card/model_test_card.parquet: + cache: false + - output/workflow/fit/model_workflow_fit.zip: + cache: false + - output/workflow/recipe/model_workflow_recipe.rds: + cache: false assess: cmd: Rscript pipeline/02-assess.R @@ -59,25 +60,25 @@ stages: County. Also generate flags, calculate land values, and make any post-modeling changes deps: - - pipeline/02-assess.R - - input/training_data.parquet - - input/assessment_data.parquet - - input/complex_id_data.parquet - - input/land_nbhd_rate_data.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/02-assess.R + - input/training_data.parquet + - input/assessment_data.parquet + - input/complex_id_data.parquet + - input/land_nbhd_rate_data.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - assessment - - pv - - ratio_study - - model.predictor.all + - assessment + - pv + - ratio_study + - model.predictor.all outs: - - output/assessment_card/model_assessment_card.parquet: - cache: false - - output/assessment_pin/model_assessment_pin.parquet: - cache: false - - output/intermediate/timing/model_timing_assess.parquet: - cache: false + - output/assessment_card/model_assessment_card.parquet: + cache: false + - output/assessment_pin/model_assessment_pin.parquet: + cache: false + - output/intermediate/timing/model_timing_assess.parquet: + cache: false evaluate: cmd: Rscript pipeline/03-evaluate.R @@ -87,23 +88,23 @@ stages: 2. An assessor-specific ratio study comparing estimated assessments to the previous year's sales deps: - - pipeline/03-evaluate.R - - output/test_card/model_test_card.parquet - - output/assessment_pin/model_assessment_pin.parquet + - pipeline/03-evaluate.R + - output/test_card/model_test_card.parquet + - output/assessment_pin/model_assessment_pin.parquet params: - - assessment - - ratio_study + - assessment + - ratio_study outs: - - output/performance/model_performance_test.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_test.parquet: - cache: false - - output/performance/model_performance_assessment.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_assessment.parquet: - cache: false - - output/intermediate/timing/model_timing_evaluate.parquet: - cache: false + - output/performance/model_performance_test.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_test.parquet: + cache: false + - output/performance/model_performance_assessment.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_assessment.parquet: + cache: false + - output/intermediate/timing/model_timing_evaluate.parquet: + cache: false interpret: cmd: Rscript pipeline/04-interpret.R @@ -111,25 +112,25 @@ stages: Generate SHAP values for each card and feature as well as feature importance metrics for each feature deps: - - pipeline/04-interpret.R - - input/assessment_data.parquet - - input/training_data.parquet - - output/assessment_card/model_assessment_card.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/04-interpret.R + - input/assessment_data.parquet + - input/training_data.parquet + - output/assessment_card/model_assessment_card.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - toggle.shap_enable - - toggle.comp_enable - - model.predictor.all + - toggle.shap_enable + - toggle.comp_enable + - model.predictor.all outs: - - output/shap/model_shap.parquet: - cache: false - - output/feature_importance/model_feature_importance.parquet: - cache: false - - output/intermediate/timing/model_timing_interpret.parquet: - cache: false - - output/comp/model_comp.parquet: - cache: false + - output/shap/model_shap.parquet: + cache: false + - output/feature_importance/model_feature_importance.parquet: + cache: false + - output/intermediate/timing/model_timing_interpret.parquet: + cache: false + - output/comp/model_comp.parquet: + cache: false finalize: cmd: Rscript pipeline/05-finalize.R @@ -137,28 +138,28 @@ stages: Save run timings and run metadata to disk and render a performance report using Quarto. deps: - - pipeline/05-finalize.R - - output/intermediate/timing/model_timing_train.parquet - - output/intermediate/timing/model_timing_assess.parquet - - output/intermediate/timing/model_timing_evaluate.parquet - - output/intermediate/timing/model_timing_interpret.parquet + - pipeline/05-finalize.R + - output/intermediate/timing/model_timing_train.parquet + - output/intermediate/timing/model_timing_assess.parquet + - output/intermediate/timing/model_timing_evaluate.parquet + - output/intermediate/timing/model_timing_interpret.parquet params: - - run_note - - toggle - - input - - cv - - model - - pv - - ratio_study + - run_note + - toggle + - input + - cv + - model + - pv + - ratio_study outs: - - output/intermediate/timing/model_timing_finalize.parquet: - cache: false - - output/timing/model_timing.parquet: - cache: false - - output/metadata/model_metadata.parquet: - cache: false - - reports/performance/performance.html: - cache: false + - output/intermediate/timing/model_timing_finalize.parquet: + cache: false + - output/timing/model_timing.parquet: + cache: false + - output/metadata/model_metadata.parquet: + cache: false + - reports/performance/performance.html: + cache: false upload: cmd: Rscript pipeline/06-upload.R @@ -168,25 +169,25 @@ stages: outputs prior to upload and attach a unique run ID. This step requires access to the CCAO Data AWS account, and so is assumed to be internal-only deps: - - pipeline/06-upload.R - - output/parameter_final/model_parameter_final.parquet - - output/parameter_range/model_parameter_range.parquet - - output/parameter_search/model_parameter_search.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds - - output/test_card/model_test_card.parquet - - output/assessment_card/model_assessment_card.parquet - - output/assessment_pin/model_assessment_pin.parquet - - output/performance/model_performance_test.parquet - - output/performance_quantile/model_performance_quantile_test.parquet - - output/performance/model_performance_assessment.parquet - - output/performance_quantile/model_performance_quantile_assessment.parquet - - output/shap/model_shap.parquet - - output/comp/model_comp.parquet - - output/feature_importance/model_feature_importance.parquet - - output/metadata/model_metadata.parquet - - output/timing/model_timing.parquet - - reports/performance/performance.html + - pipeline/06-upload.R + - output/parameter_final/model_parameter_final.parquet + - output/parameter_range/model_parameter_range.parquet + - output/parameter_search/model_parameter_search.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds + - output/test_card/model_test_card.parquet + - output/assessment_card/model_assessment_card.parquet + - output/assessment_pin/model_assessment_pin.parquet + - output/performance/model_performance_test.parquet + - output/performance_quantile/model_performance_quantile_test.parquet + - output/performance/model_performance_assessment.parquet + - output/performance_quantile/model_performance_quantile_assessment.parquet + - output/shap/model_shap.parquet + - output/comp/model_comp.parquet + - output/feature_importance/model_feature_importance.parquet + - output/metadata/model_metadata.parquet + - output/timing/model_timing.parquet + - reports/performance/performance.html export: cmd: Rscript pipeline/07-export.R @@ -195,11 +196,11 @@ stages: run. NOT automatically run since it is typically only run once. Manually run once a model is selected deps: - - pipeline/07-export.R + - pipeline/07-export.R params: - - assessment.year - - input.min_sale_year - - input.max_sale_year - - ratio_study - - export + - assessment.year + - input.min_sale_year + - input.max_sale_year + - ratio_study + - export frozen: true From 72e6622eb42a0d83ee45a275db509ae30b69052a Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 14 Jan 2025 22:41:45 +0000 Subject: [PATCH 13/17] Freeze ingest --- dvc.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc.yaml b/dvc.yaml index c821d5ba..5cfdb69f 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -17,7 +17,7 @@ stages: - input/land_nbhd_rate_data.parquet - input/training_data.parquet - frozen: false + frozen: true train: cmd: Rscript pipeline/01-train.R desc: > From c83a3416f78de29c27357bbcc391b247bf96e474 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 15 Jan 2025 15:01:45 +0000 Subject: [PATCH 14/17] Update lock file --- dvc.lock | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dvc.lock b/dvc.lock index 671610c8..1b56bc6d 100755 --- a/dvc.lock +++ b/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: pipeline/00-ingest.R hash: md5 - md5: f02ab180615e40480dbefd49489b8a70 - size: 25715 + md5: 3b2eb741041cb9eedd37e2f5bc9bf3f8 + size: 25748 params: params.yaml: assessment: @@ -38,28 +38,28 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: 6abab4a0d303c18ea134043144086331 - size: 425373072 + md5: 7c739ef53b63c6f9d43385f51d6897b7 + size: 425836158 - path: input/char_data.parquet hash: md5 - md5: ccb30b53f04515f8eb5197af6da58a61 - size: 848166656 + md5: bb3473cf1974403842f8bc0b4fa99dbd + size: 848172615 - path: input/complex_id_data.parquet hash: md5 - md5: b2ef4dd99dc98260a2a5f102d8bc9457 - size: 703671 + md5: 5c6defdbf1b956fa8977ddfb520793ef + size: 704147 - path: input/hie_data.parquet hash: md5 - md5: 94e8843c300a4251ab7ba49cee8a38af - size: 1909891 + md5: 4575fb4e70ce4c012a3ff0dad409501d + size: 1917981 - path: input/land_nbhd_rate_data.parquet hash: md5 - md5: f3ec9627322bd271bf2957b7388aaa34 + md5: 4b3c72021c15daf8309d0029987da9f2 size: 3873 - path: input/training_data.parquet hash: md5 - md5: e5d9bdb4ae54e6073337fbc601a06440 - size: 208846045 + md5: 8beda25db7e54cd639b292443c5c10a0 + size: 208864171 train: cmd: Rscript pipeline/01-train.R deps: From a1e9c6472d62bcebd725a4cdbf14e8be94084297 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 15 Jan 2025 15:12:33 +0000 Subject: [PATCH 15/17] Uncomment vars_recode params --- pipeline/02-assess.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline/02-assess.R b/pipeline/02-assess.R index 78edaa96..affb23c4 100644 --- a/pipeline/02-assess.R +++ b/pipeline/02-assess.R @@ -315,7 +315,7 @@ assessment_card_data_merged %>% ) %>% ccao::vars_recode( cols = any_of(char_vars), - #code_type = "long", + code_type = "long", as_factor = FALSE ) %>% write_parquet(paths$output$assessment_card$local) @@ -542,7 +542,7 @@ message("Saving final PIN-level data") assessment_pin_data_final %>% ccao::vars_recode( cols = starts_with("char_"), - #code_type = "short", + code_type = "short", as_factor = FALSE ) %>% # Coerce columns to their expected Athena output type From 91d9dbe5a4d1dc39c4a906cdd087c6da6b377f14 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 15 Jan 2025 15:15:54 +0000 Subject: [PATCH 16/17] Update weird spacing and freeze for dvc yaml --- dvc.yaml | 259 +++++++++++++++++++++++++++---------------------------- 1 file changed, 129 insertions(+), 130 deletions(-) diff --git a/dvc.yaml b/dvc.yaml index 5cfdb69f..09904d42 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -5,53 +5,52 @@ stages: Ingest training and assessment data from Athena + generate townhome complex identifiers deps: - - pipeline/00-ingest.R + - pipeline/00-ingest.R params: - - assessment - - input + - assessment + - input outs: - - input/assessment_data.parquet - - input/char_data.parquet - - input/complex_id_data.parquet - - input/hie_data.parquet - - input/land_nbhd_rate_data.parquet - - input/training_data.parquet + - input/assessment_data.parquet + - input/char_data.parquet + - input/complex_id_data.parquet + - input/hie_data.parquet + - input/land_nbhd_rate_data.parquet + - input/training_data.parquet - frozen: true train: cmd: Rscript pipeline/01-train.R desc: > Train a LightGBM model with cross-validation. Generate model objects, data recipes, and predictions on the test set (most recent 10% of sales) deps: - - pipeline/01-train.R - - input/training_data.parquet + - pipeline/01-train.R + - input/training_data.parquet params: - - cv - - model.engine - - model.hyperparameter - - model.objective - - model.parameter - - model.predictor - - model.seed - - model.verbose - - ratio_study - - toggle.cv_enable + - cv + - model.engine + - model.hyperparameter + - model.objective + - model.parameter + - model.predictor + - model.seed + - model.verbose + - ratio_study + - toggle.cv_enable outs: - - output/intermediate/timing/model_timing_train.parquet: - cache: false - - output/parameter_final/model_parameter_final.parquet: - cache: false - - output/parameter_range/model_parameter_range.parquet: - cache: false - - output/parameter_search/model_parameter_search.parquet: - cache: false - - output/test_card/model_test_card.parquet: - cache: false - - output/workflow/fit/model_workflow_fit.zip: - cache: false - - output/workflow/recipe/model_workflow_recipe.rds: - cache: false + - output/intermediate/timing/model_timing_train.parquet: + cache: false + - output/parameter_final/model_parameter_final.parquet: + cache: false + - output/parameter_range/model_parameter_range.parquet: + cache: false + - output/parameter_search/model_parameter_search.parquet: + cache: false + - output/test_card/model_test_card.parquet: + cache: false + - output/workflow/fit/model_workflow_fit.zip: + cache: false + - output/workflow/recipe/model_workflow_recipe.rds: + cache: false assess: cmd: Rscript pipeline/02-assess.R @@ -60,25 +59,25 @@ stages: County. Also generate flags, calculate land values, and make any post-modeling changes deps: - - pipeline/02-assess.R - - input/training_data.parquet - - input/assessment_data.parquet - - input/complex_id_data.parquet - - input/land_nbhd_rate_data.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/02-assess.R + - input/training_data.parquet + - input/assessment_data.parquet + - input/complex_id_data.parquet + - input/land_nbhd_rate_data.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - assessment - - pv - - ratio_study - - model.predictor.all + - assessment + - pv + - ratio_study + - model.predictor.all outs: - - output/assessment_card/model_assessment_card.parquet: - cache: false - - output/assessment_pin/model_assessment_pin.parquet: - cache: false - - output/intermediate/timing/model_timing_assess.parquet: - cache: false + - output/assessment_card/model_assessment_card.parquet: + cache: false + - output/assessment_pin/model_assessment_pin.parquet: + cache: false + - output/intermediate/timing/model_timing_assess.parquet: + cache: false evaluate: cmd: Rscript pipeline/03-evaluate.R @@ -88,23 +87,23 @@ stages: 2. An assessor-specific ratio study comparing estimated assessments to the previous year's sales deps: - - pipeline/03-evaluate.R - - output/test_card/model_test_card.parquet - - output/assessment_pin/model_assessment_pin.parquet + - pipeline/03-evaluate.R + - output/test_card/model_test_card.parquet + - output/assessment_pin/model_assessment_pin.parquet params: - - assessment - - ratio_study + - assessment + - ratio_study outs: - - output/performance/model_performance_test.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_test.parquet: - cache: false - - output/performance/model_performance_assessment.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_assessment.parquet: - cache: false - - output/intermediate/timing/model_timing_evaluate.parquet: - cache: false + - output/performance/model_performance_test.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_test.parquet: + cache: false + - output/performance/model_performance_assessment.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_assessment.parquet: + cache: false + - output/intermediate/timing/model_timing_evaluate.parquet: + cache: false interpret: cmd: Rscript pipeline/04-interpret.R @@ -112,25 +111,25 @@ stages: Generate SHAP values for each card and feature as well as feature importance metrics for each feature deps: - - pipeline/04-interpret.R - - input/assessment_data.parquet - - input/training_data.parquet - - output/assessment_card/model_assessment_card.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/04-interpret.R + - input/assessment_data.parquet + - input/training_data.parquet + - output/assessment_card/model_assessment_card.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - toggle.shap_enable - - toggle.comp_enable - - model.predictor.all + - toggle.shap_enable + - toggle.comp_enable + - model.predictor.all outs: - - output/shap/model_shap.parquet: - cache: false - - output/feature_importance/model_feature_importance.parquet: - cache: false - - output/intermediate/timing/model_timing_interpret.parquet: - cache: false - - output/comp/model_comp.parquet: - cache: false + - output/shap/model_shap.parquet: + cache: false + - output/feature_importance/model_feature_importance.parquet: + cache: false + - output/intermediate/timing/model_timing_interpret.parquet: + cache: false + - output/comp/model_comp.parquet: + cache: false finalize: cmd: Rscript pipeline/05-finalize.R @@ -138,28 +137,28 @@ stages: Save run timings and run metadata to disk and render a performance report using Quarto. deps: - - pipeline/05-finalize.R - - output/intermediate/timing/model_timing_train.parquet - - output/intermediate/timing/model_timing_assess.parquet - - output/intermediate/timing/model_timing_evaluate.parquet - - output/intermediate/timing/model_timing_interpret.parquet + - pipeline/05-finalize.R + - output/intermediate/timing/model_timing_train.parquet + - output/intermediate/timing/model_timing_assess.parquet + - output/intermediate/timing/model_timing_evaluate.parquet + - output/intermediate/timing/model_timing_interpret.parquet params: - - run_note - - toggle - - input - - cv - - model - - pv - - ratio_study + - run_note + - toggle + - input + - cv + - model + - pv + - ratio_study outs: - - output/intermediate/timing/model_timing_finalize.parquet: - cache: false - - output/timing/model_timing.parquet: - cache: false - - output/metadata/model_metadata.parquet: - cache: false - - reports/performance/performance.html: - cache: false + - output/intermediate/timing/model_timing_finalize.parquet: + cache: false + - output/timing/model_timing.parquet: + cache: false + - output/metadata/model_metadata.parquet: + cache: false + - reports/performance/performance.html: + cache: false upload: cmd: Rscript pipeline/06-upload.R @@ -169,25 +168,25 @@ stages: outputs prior to upload and attach a unique run ID. This step requires access to the CCAO Data AWS account, and so is assumed to be internal-only deps: - - pipeline/06-upload.R - - output/parameter_final/model_parameter_final.parquet - - output/parameter_range/model_parameter_range.parquet - - output/parameter_search/model_parameter_search.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds - - output/test_card/model_test_card.parquet - - output/assessment_card/model_assessment_card.parquet - - output/assessment_pin/model_assessment_pin.parquet - - output/performance/model_performance_test.parquet - - output/performance_quantile/model_performance_quantile_test.parquet - - output/performance/model_performance_assessment.parquet - - output/performance_quantile/model_performance_quantile_assessment.parquet - - output/shap/model_shap.parquet - - output/comp/model_comp.parquet - - output/feature_importance/model_feature_importance.parquet - - output/metadata/model_metadata.parquet - - output/timing/model_timing.parquet - - reports/performance/performance.html + - pipeline/06-upload.R + - output/parameter_final/model_parameter_final.parquet + - output/parameter_range/model_parameter_range.parquet + - output/parameter_search/model_parameter_search.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds + - output/test_card/model_test_card.parquet + - output/assessment_card/model_assessment_card.parquet + - output/assessment_pin/model_assessment_pin.parquet + - output/performance/model_performance_test.parquet + - output/performance_quantile/model_performance_quantile_test.parquet + - output/performance/model_performance_assessment.parquet + - output/performance_quantile/model_performance_quantile_assessment.parquet + - output/shap/model_shap.parquet + - output/comp/model_comp.parquet + - output/feature_importance/model_feature_importance.parquet + - output/metadata/model_metadata.parquet + - output/timing/model_timing.parquet + - reports/performance/performance.html export: cmd: Rscript pipeline/07-export.R @@ -196,11 +195,11 @@ stages: run. NOT automatically run since it is typically only run once. Manually run once a model is selected deps: - - pipeline/07-export.R + - pipeline/07-export.R params: - - assessment.year - - input.min_sale_year - - input.max_sale_year - - ratio_study - - export + - assessment.year + - input.min_sale_year + - input.max_sale_year + - ratio_study + - export frozen: true From 5ab348cb398a5985505a06ad4c26a8b5e59764c0 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 15 Jan 2025 15:16:43 +0000 Subject: [PATCH 17/17] Freeze ingest --- dvc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dvc.yaml b/dvc.yaml index 09904d42..9787ae3b 100755 --- a/dvc.yaml +++ b/dvc.yaml @@ -17,6 +17,7 @@ stages: - input/land_nbhd_rate_data.parquet - input/training_data.parquet + frozen: true train: cmd: Rscript pipeline/01-train.R desc: >