From 712e945565c77b3444fb910aacf943f83d4bdffc Mon Sep 17 00:00:00 2001 From: Dan Snow <31494343+dfsnow@users.noreply.github.com> Date: Sun, 4 Feb 2024 20:25:40 -0600 Subject: [PATCH] Fix residential data ingest issues (#207) * Increase the number of comp search bins * Drop modeling_group feature since it's redundant with class * Remove char_apts and char_ncu for non-MF PINs * Remove one-off weird deed type * Replace missing municipality name with UNINCORPORATED * Boot low sqft sales from training data * Update data with ingest fixes --- dvc.lock | 20 +++++++-------- params.yaml | 4 +-- pipeline/00-ingest.R | 58 ++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/dvc.lock b/dvc.lock index 6f9516cd..888504e5 100755 --- a/dvc.lock +++ b/dvc.lock @@ -33,20 +33,20 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: b9c706a72f32f9c42982e21d042e1c28 - size: 309062434 + md5: b32480d27f75180149afe1b2de908037 + size: 310404298 - path: input/char_data.parquet hash: md5 - md5: 981118cb1e6f7f531f6cd51e0ae9d84c - size: 616960318 + md5: 537c278255d5721744ad8f180df22526 + size: 618205566 - path: input/complex_id_data.parquet hash: md5 - md5: 649fc0edfbe3a0671bd9fb650f212d90 - size: 703251 + md5: 0492bec3b81f48d814bfedb9d9f893e0 + size: 703076 - path: input/hie_data.parquet hash: md5 - md5: 6d00a8031d3bb6ff912ecdcb5fa1ff67 - size: 1925883 + md5: 2e049a011cae0a45d2072406e95b78b7 + size: 1925258 - path: input/land_nbhd_rate_data.parquet hash: md5 md5: e508daf5790982c303d6503fe1cb8e2b @@ -56,8 +56,8 @@ stages: size: 2109 - path: input/training_data.parquet hash: md5 - md5: 3bdd7526dc26dd60c8126ab7a5857460 - size: 156819436 + md5: 292caca708c332ecb5acc7886ce46e80 + size: 156692811 train: cmd: Rscript pipeline/01-train.R deps: diff --git a/params.yaml b/params.yaml index ff498e19..08a4b15e 100644 --- a/params.yaml +++ b/params.yaml @@ -151,7 +151,6 @@ model: all: - "meta_township_code" - "meta_nbhd_code" - - "meta_modeling_group" - "meta_sale_count_past_n_years" - "char_yrblt" - "char_air" @@ -253,7 +252,6 @@ model: categorical: - "meta_township_code" - "meta_nbhd_code" - - "meta_modeling_group" - "char_air" - "char_apts" - "char_attic_fnsh" @@ -438,7 +436,7 @@ comp: # Number of price bins to use when binning properties for the purpose of # comparison. Corresponds to ntile of predicted FMV, e.g. 10 creates deciles # of predicted values. Larger deciles = larger comparables search range - num_price_bins: 10 + num_price_bins: 20 # Export ----------------------------------------------------------------------- diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 551dffe5..eec29bfa 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -55,6 +55,7 @@ training_data <- dbGetQuery( BETWEEN CAST({params$input$min_sale_year} AS int) - {params$input$n_years_prior} AND CAST({params$input$max_sale_year} AS int) + AND sale.deed_type IN ('01', '02', '05') AND NOT sale.is_multisale AND NOT sale.sale_filter_same_sale_within_365 AND NOT sale.sale_filter_less_than_10k @@ -270,6 +271,25 @@ training_data_clean <- training_data_w_hie %>% # This will remove any categories not stored in the dictionary and convert # them to NA (useful since there are a lot of misrecorded variables) ccao::vars_recode(cols = starts_with("char_"), type = "code") %>% + # Recode the number of apartments from its numeric code to its actual number + # of units. Additionally, ensure non-multi-family PINs always have NONE apts + ccao::vars_recode( + cols = all_of("char_apts"), + type = "short", + as_factor = FALSE + ) %>% + mutate( + char_apts = case_when( + char_class %in% c("211", "212") & !is.na(char_apts) ~ char_apts, + char_class %in% c("211", "212") & is.na(char_apts) ~ "UNKNOWN", + TRUE ~ "NONE" + ), + char_apts = factor( + char_apts, + levels = c("TWO", "THREE", "FOUR", "FIVE", "SIX", "UNKNOWN", "NONE") + ), + char_ncu = ifelse(char_class == "212" & !is.na(char_ncu), char_ncu, 0) + ) %>% # Coerce columns to the data types recorded in the dictionary. Necessary # because the SQL drivers will often coerce types on pull (boolean becomes # character) @@ -289,6 +309,8 @@ training_data_clean <- training_data_w_hie %>% across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")), across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))), across(starts_with("loc_tax_"), \(x) na_if(x, "")), + loc_tax_municipality_name = + replace_na(loc_tax_municipality_name, "UNINCORPORATED"), # Miscellaneous column-level cleanup ccao_is_corner_lot = replace_na(ccao_is_corner_lot, FALSE), ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L), @@ -341,15 +363,22 @@ training_data_clean <- training_data_w_hie %>% time_sale_day_of_week = as.integer(wday(meta_sale_date)), time_sale_post_covid = meta_sale_date >= make_date(2020, 3, 15) ) %>% + # Reorder resulting columns select(-any_of(c("time_interval"))) %>% relocate(starts_with("sv_"), .after = everything()) %>% relocate("year", .after = everything()) %>% relocate("meta_sale_count_past_n_years", .after = meta_sale_buyer_name) %>% - filter(between( - meta_sale_date, - make_date(params$input$min_sale_year, 1, 1), - make_date(params$input$max_sale_year, 12, 31) - )) %>% + # Drop invalid sales outside the sample date range or with obvious incorrect + # square footage values + filter( + between( + meta_sale_date, + make_date(params$input$min_sale_year, 1, 1), + make_date(params$input$max_sale_year, 12, 31) + ), + !(char_bldg_sf < 300 & !ind_pin_is_multicard), + !(char_land_sf < 300 & !ind_pin_is_multicard) + ) %>% as_tibble() %>% write_parquet(paths$input$training$local) @@ -361,6 +390,23 @@ training_data_clean <- training_data_w_hie %>% # time variables and identifying complexes assessment_data_clean <- assessment_data_w_hie %>% ccao::vars_recode(cols = starts_with("char_"), type = "code") %>% + ccao::vars_recode( + cols = all_of("char_apts"), + type = "short", + as_factor = FALSE + ) %>% + mutate( + char_apts = case_when( + char_class %in% c("211", "212") & !is.na(char_apts) ~ char_apts, + char_class %in% c("211", "212") & is.na(char_apts) ~ "UNKNOWN", + TRUE ~ "NONE" + ), + char_apts = factor( + char_apts, + levels = c("TWO", "THREE", "FOUR", "FIVE", "SIX", "UNKNOWN", "NONE") + ), + char_ncu = ifelse(char_class == "212" & !is.na(char_ncu), char_ncu, 0) + ) %>% mutate(across( any_of(col_type_dict$var_name), ~ recode_column_type(.x, cur_column()) @@ -370,6 +416,8 @@ assessment_data_clean <- assessment_data_w_hie %>% across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")), across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))), across(starts_with("loc_tax_"), \(x) na_if(x, "")), + loc_tax_municipality_name = + replace_na(loc_tax_municipality_name, "UNINCORPORATED"), ccao_is_corner_lot = replace_na(ccao_is_corner_lot, FALSE), ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L), ccao_n_years_exe_homeowner = replace_na(ccao_n_years_exe_homeowner, 0L),