Skip to content

Commit

Permalink
Fix residential data ingest issues (#207)
Browse files Browse the repository at this point in the history
* Increase the number of comp search bins

* Drop modeling_group feature since it's redundant with class

* Remove char_apts and char_ncu for non-MF PINs

* Remove one-off weird deed type

* Replace missing municipality name with UNINCORPORATED

* Boot low sqft sales from training data

* Update data with ingest fixes
  • Loading branch information
dfsnow authored Feb 5, 2024
1 parent a7644cd commit 712e945
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 18 deletions.
20 changes: 10 additions & 10 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,20 @@ stages:
outs:
- path: input/assessment_data.parquet
hash: md5
md5: b9c706a72f32f9c42982e21d042e1c28
size: 309062434
md5: b32480d27f75180149afe1b2de908037
size: 310404298
- path: input/char_data.parquet
hash: md5
md5: 981118cb1e6f7f531f6cd51e0ae9d84c
size: 616960318
md5: 537c278255d5721744ad8f180df22526
size: 618205566
- path: input/complex_id_data.parquet
hash: md5
md5: 649fc0edfbe3a0671bd9fb650f212d90
size: 703251
md5: 0492bec3b81f48d814bfedb9d9f893e0
size: 703076
- path: input/hie_data.parquet
hash: md5
md5: 6d00a8031d3bb6ff912ecdcb5fa1ff67
size: 1925883
md5: 2e049a011cae0a45d2072406e95b78b7
size: 1925258
- path: input/land_nbhd_rate_data.parquet
hash: md5
md5: e508daf5790982c303d6503fe1cb8e2b
Expand All @@ -56,8 +56,8 @@ stages:
size: 2109
- path: input/training_data.parquet
hash: md5
md5: 3bdd7526dc26dd60c8126ab7a5857460
size: 156819436
md5: 292caca708c332ecb5acc7886ce46e80
size: 156692811
train:
cmd: Rscript pipeline/01-train.R
deps:
Expand Down
4 changes: 1 addition & 3 deletions params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ model:
all:
- "meta_township_code"
- "meta_nbhd_code"
- "meta_modeling_group"
- "meta_sale_count_past_n_years"
- "char_yrblt"
- "char_air"
Expand Down Expand Up @@ -253,7 +252,6 @@ model:
categorical:
- "meta_township_code"
- "meta_nbhd_code"
- "meta_modeling_group"
- "char_air"
- "char_apts"
- "char_attic_fnsh"
Expand Down Expand Up @@ -438,7 +436,7 @@ comp:
# Number of price bins to use when binning properties for the purpose of
# comparison. Corresponds to ntile of predicted FMV, e.g. 10 creates deciles
# of predicted values. Larger bins (i.e. fewer bins) = larger comparables
# search range
num_price_bins: 10
num_price_bins: 20


# Export -----------------------------------------------------------------------
Expand Down
58 changes: 53 additions & 5 deletions pipeline/00-ingest.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ training_data <- dbGetQuery(
BETWEEN CAST({params$input$min_sale_year} AS int) -
{params$input$n_years_prior}
AND CAST({params$input$max_sale_year} AS int)
AND sale.deed_type IN ('01', '02', '05')
AND NOT sale.is_multisale
AND NOT sale.sale_filter_same_sale_within_365
AND NOT sale.sale_filter_less_than_10k
Expand Down Expand Up @@ -270,6 +271,25 @@ training_data_clean <- training_data_w_hie %>%
# This will remove any categories not stored in the dictionary and convert
# them to NA (useful since there are a lot of misrecorded variables)
ccao::vars_recode(cols = starts_with("char_"), type = "code") %>%
# Recode the number of apartments from its numeric code to its actual number
# of units. Additionally, ensure non-multi-family PINs always have NONE apts
ccao::vars_recode(
cols = all_of("char_apts"),
type = "short",
as_factor = FALSE
) %>%
mutate(
char_apts = case_when(
char_class %in% c("211", "212") & !is.na(char_apts) ~ char_apts,
char_class %in% c("211", "212") & is.na(char_apts) ~ "UNKNOWN",
TRUE ~ "NONE"
),
char_apts = factor(
char_apts,
levels = c("TWO", "THREE", "FOUR", "FIVE", "SIX", "UNKNOWN", "NONE")
),
char_ncu = ifelse(char_class == "212" & !is.na(char_ncu), char_ncu, 0)
) %>%
# Coerce columns to the data types recorded in the dictionary. Necessary
# because the SQL drivers will often coerce types on pull (boolean becomes
# character)
Expand All @@ -289,6 +309,8 @@ training_data_clean <- training_data_w_hie %>%
across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")),
across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))),
across(starts_with("loc_tax_"), \(x) na_if(x, "")),
loc_tax_municipality_name =
replace_na(loc_tax_municipality_name, "UNINCORPORATED"),
# Miscellaneous column-level cleanup
ccao_is_corner_lot = replace_na(ccao_is_corner_lot, FALSE),
ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L),
Expand Down Expand Up @@ -341,15 +363,22 @@ training_data_clean <- training_data_w_hie %>%
time_sale_day_of_week = as.integer(wday(meta_sale_date)),
time_sale_post_covid = meta_sale_date >= make_date(2020, 3, 15)
) %>%
# Reorder resulting columns
select(-any_of(c("time_interval"))) %>%
relocate(starts_with("sv_"), .after = everything()) %>%
relocate("year", .after = everything()) %>%
relocate("meta_sale_count_past_n_years", .after = meta_sale_buyer_name) %>%
filter(between(
meta_sale_date,
make_date(params$input$min_sale_year, 1, 1),
make_date(params$input$max_sale_year, 12, 31)
)) %>%
# Drop invalid sales outside the sample date range or with obviously incorrect
# square footage values
filter(
between(
meta_sale_date,
make_date(params$input$min_sale_year, 1, 1),
make_date(params$input$max_sale_year, 12, 31)
),
!(char_bldg_sf < 300 & !ind_pin_is_multicard),
!(char_land_sf < 300 & !ind_pin_is_multicard)
) %>%
as_tibble() %>%
write_parquet(paths$input$training$local)

Expand All @@ -361,6 +390,23 @@ training_data_clean <- training_data_w_hie %>%
# time variables and identifying complexes
assessment_data_clean <- assessment_data_w_hie %>%
ccao::vars_recode(cols = starts_with("char_"), type = "code") %>%
ccao::vars_recode(
cols = all_of("char_apts"),
type = "short",
as_factor = FALSE
) %>%
mutate(
char_apts = case_when(
char_class %in% c("211", "212") & !is.na(char_apts) ~ char_apts,
char_class %in% c("211", "212") & is.na(char_apts) ~ "UNKNOWN",
TRUE ~ "NONE"
),
char_apts = factor(
char_apts,
levels = c("TWO", "THREE", "FOUR", "FIVE", "SIX", "UNKNOWN", "NONE")
),
char_ncu = ifelse(char_class == "212" & !is.na(char_ncu), char_ncu, 0)
) %>%
mutate(across(
any_of(col_type_dict$var_name),
~ recode_column_type(.x, cur_column())
Expand All @@ -370,6 +416,8 @@ assessment_data_clean <- assessment_data_w_hie %>%
across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")),
across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))),
across(starts_with("loc_tax_"), \(x) na_if(x, "")),
loc_tax_municipality_name =
replace_na(loc_tax_municipality_name, "UNINCORPORATED"),
ccao_is_corner_lot = replace_na(ccao_is_corner_lot, FALSE),
ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L),
ccao_n_years_exe_homeowner = replace_na(ccao_n_years_exe_homeowner, 0L),
Expand Down

0 comments on commit 712e945

Please sign in to comment.