Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

228 improve modeling multi cards #312

Draft
wants to merge 10 commits into
base: 2025-assessment-year
Choose a base branch
from
28 changes: 14 additions & 14 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ stages:
deps:
- path: pipeline/00-ingest.R
hash: md5
md5: c453195da12dd0197e0bdd16f4ef3937
size: 23004
md5: f02ab180615e40480dbefd49489b8a70
size: 25715
params:
params.yaml:
assessment:
year: '2024'
date: '2024-01-01'
triad: city
triad: north
group: residential
data_year: '2023'
working_year: '2024'
working_year: '2025'
input:
min_sale_year: '2015'
max_sale_year: '2023'
Expand All @@ -38,28 +38,28 @@ stages:
outs:
- path: input/assessment_data.parquet
hash: md5
md5: e4b429a0121c6898b972fa20b42544fd
size: 425747228
md5: 6abab4a0d303c18ea134043144086331
size: 425373072
- path: input/char_data.parquet
hash: md5
md5: 827c97f9d3bbd3426e8f6fd9136313f8
size: 847441146
md5: ccb30b53f04515f8eb5197af6da58a61
size: 848166656
- path: input/complex_id_data.parquet
hash: md5
md5: 0e2a42a935106a9b6f50d8250012d98c
size: 703255
md5: b2ef4dd99dc98260a2a5f102d8bc9457
size: 703671
- path: input/hie_data.parquet
hash: md5
md5: ca86d0e5f29fd252455dc67e2dd40ac1
size: 1927927
md5: 94e8843c300a4251ab7ba49cee8a38af
size: 1909891
- path: input/land_nbhd_rate_data.parquet
hash: md5
md5: f3ec9627322bd271bf2957b7388aaa34
size: 3873
- path: input/training_data.parquet
hash: md5
md5: 76d91858f84f57ad2dce9fd292fe1ae2
size: 208138341
md5: e5d9bdb4ae54e6073337fbc601a06440
size: 208846045
train:
cmd: Rscript pipeline/01-train.R
deps:
Expand Down
85 changes: 84 additions & 1 deletion pipeline/00-ingest.R
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,11 @@ message("Adding time features and cleaning")

## 5.1. Training Data ----------------------------------------------------------


#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Testing. Multi-card munging --------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Clean up the training data. Goal is to get it into a publishable format.
# Final featurization, missingness, etc. is handled via Tidymodels recipes
training_data_clean <- training_data_w_hie %>%
Expand Down Expand Up @@ -399,10 +404,80 @@ training_data_clean <- training_data_w_hie %>%
!(char_bldg_sf < 300 & !ind_pin_is_multicard),
!(char_land_sf < 300 & !ind_pin_is_multicard)
) %>%
as_tibble() #%>%
# write_parquet(paths$input$training$local)

# Test new features and multi-card
#- - - - - - - - - - - - - -
# Assign a within-sale card index (1, 2, ...) to every row, numbered by row
# order within each PIN + sale document number. NOTE(review): meta_card_id is
# only reproducible if the upstream row order of training_data_clean is
# deterministic — confirm ordering is stable before relying on this id
training_data_full <- training_data_clean %>%
  group_by(meta_pin, meta_sale_document_num) %>%
  mutate(meta_card_id = row_number()) %>%
  ungroup()

# 2. Pull out the rows belonging to multi-card sales, keeping only the
# columns needed for the square-footage-based price allocation below
multicard_sales <- training_data_full %>%
  filter(ind_pin_is_multicard) %>%
  select(
    meta_pin, meta_sale_document_num, meta_card_id,
    meta_sale_price, char_bldg_sf
  )

# 3. Total building SF for each PIN + document number. This is the
# denominator used to compute each card's share of the sale price
multicard_total_sf <- multicard_sales %>%
  group_by(meta_pin, meta_sale_document_num) %>%
  summarise(total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE), .groups = "drop")

# 4. Join total SF back onto each multi-card record, then allocate the sale
# price to each card in proportion to its share of total building SF.
# Guard against a zero denominator: when every card's SF is NA,
# sum(..., na.rm = TRUE) upstream yields 0 and the division would produce
# NaN/Inf shares. An NA share leaves the allocated price NA instead, which
# downstream code can detect and fall back on
multicard_sales_alloc <- multicard_sales %>%
  left_join(multicard_total_sf, by = c("meta_pin", "meta_sale_document_num")) %>%
  mutate(
    sf_share = if_else(
      total_bldg_sf > 0,
      char_bldg_sf / total_bldg_sf,
      NA_real_
    ),
    meta_sale_price_allocated = meta_sale_price * sf_share
  )

# 5. Reintegrate the allocated prices into the main training data, joining
# by PIN + document number + card id
training_data_with_alloc <- training_data_full %>%
  left_join(
    multicard_sales_alloc %>%
      select(meta_pin, meta_sale_document_num, meta_card_id, meta_sale_price_allocated),
    by = c("meta_pin", "meta_sale_document_num", "meta_card_id")
  ) %>%
  # For multi-card sales, use the SF-allocated price, but fall back to the
  # original sale price when no allocation exists (e.g. NA building SF made
  # the share undefined). Without the coalesce() fallback, those rows' sale
  # prices would be overwritten with NA and silently lost
  mutate(
    meta_sale_price = if_else(
      ind_pin_is_multicard,
      coalesce(meta_sale_price_allocated, meta_sale_price),
      meta_sale_price
    )
  )

# Drop the helper columns now that the allocated price is in meta_sale_price
training_data <- training_data_with_alloc %>%
  select(-meta_sale_price_allocated, -meta_card_id)

# - - - - - - - -
# Add two features
# - - - - - - - -

# Per-PIN card-level features: each card's share of the PIN's total building
# SF, plus a flag for the "key" (largest) card. char_key_card is coded 0/1
# (not logical) so its type matches the identical feature constructed for
# the assessment data; ties in SF will flag multiple key cards per PIN
training_data_clean <- training_data %>%
  group_by(meta_pin) %>%
  mutate(
    char_card_pct_bldg = char_bldg_sf / sum(char_bldg_sf, na.rm = TRUE),
    char_key_card = if_else(
      char_bldg_sf == max(char_bldg_sf, na.rm = TRUE), 1, 0
    )
  ) %>%
  ungroup() %>%
  as_tibble() %>%
  write_parquet(paths$input$training$local)




## 5.2. Assessment Data --------------------------------------------------------

# Clean the assessment data. This is the target data that the trained model is
Expand Down Expand Up @@ -503,7 +578,15 @@ assessment_data_clean <- assessment_data_w_hie %>%
relocate(starts_with("sv_"), .after = everything()) %>%
relocate("year", .after = everything()) %>%
relocate(starts_with("meta_sale_"), .after = hie_num_expired) %>%
as_tibble() %>%
group_by(meta_pin) %>%
mutate(
total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE),
char_card_pct_bldg = char_bldg_sf / total_bldg_sf,
# Flag as key card if it’s the maximum SF (ties will create multiple key cards)
char_key_card = if_else(char_bldg_sf == max(char_bldg_sf, na.rm = TRUE), 1, 0)
) %>%
ungroup() %>%
select(-total_bldg_sf) %>%
write_parquet(paths$input$assessment$local)


Expand Down
14 changes: 11 additions & 3 deletions pipeline/01-train.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# Start the stage timer and clear logs from prior stage
tictoc::tic.clearlog()

tictoc::tic("Train")

# Load libraries, helpers, and recipes from files
Expand All @@ -24,15 +25,20 @@ message("Run type: ", run_type)
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Preparing model training data")

# Load the saved training data, drop sales flagged as outliers, and sort by
# sale date to support the out-of-time train/test split created below.
# NOTE(review): multicard sales are now retained (the old
# !ind_pin_is_multicard filter is commented out below), presumably because
# their prices are allocated per card upstream in 00-ingest.R — confirm this
# is intentional before merging
training_data_full <- read_parquet(paths$input$training$local) %>%
  filter(!sv_is_outlier) %>%
  arrange(meta_sale_date)


# Load the full set of training data, then arrange by sale date in order to
# facilitate out-of-time sampling/validation

# NOTE: It is critical to trim "multicard" sales when training. Multicard means
# there are multiple buildings on a PIN. Since these sales include multiple
# buildings, they are typically higher than a "normal" sale and must be removed
training_data_full <- read_parquet(paths$input$training$local) %>%
filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
arrange(meta_sale_date)
# training_data_full <- read_parquet(paths$input$training$local) %>%
# filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
# arrange(meta_sale_date)

# Create train/test split by time, with most recent observations in the test set
# We want our best model(s) to be predictive of the future, since properties are
Expand All @@ -46,6 +52,8 @@ train <- training(split_data)

# Create a recipe for the training data which removes non-predictor columns and
# preps categorical data, see R/recipes.R for details

# TODO: Add the two card-level features (char_card_pct_bldg, char_key_card) here
train_recipe <- model_main_recipe(
data = training_data_full,
pred_vars = params$model$predictor$all,
Expand Down
15 changes: 3 additions & 12 deletions pipeline/02-assess.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,7 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
# blowing up the PIN-level AV
group_by(meta_pin) %>%
mutate(
pred_pin_card_sum = ifelse(
sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <=
params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) |
is.na(meta_1yr_pri_board_tot) |
n() != 2,
sum(pred_card_intermediate_fmv),
max(pred_card_intermediate_fmv)
)
pred_pin_card_sum = sum(pred_card_intermediate_fmv)
) %>%
ungroup()

Expand Down Expand Up @@ -191,8 +184,6 @@ assessment_pin_data_w_land <- assessment_card_data_round %>%
)




#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 5. Prorate and Reapportion ---------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand Down Expand Up @@ -324,7 +315,7 @@ assessment_card_data_merged %>%
) %>%
ccao::vars_recode(
cols = any_of(char_vars),
code_type = "long",
#code_type = "long",
as_factor = FALSE
) %>%
write_parquet(paths$output$assessment_card$local)
Expand Down Expand Up @@ -551,7 +542,7 @@ message("Saving final PIN-level data")
assessment_pin_data_final %>%
ccao::vars_recode(
cols = starts_with("char_"),
code_type = "short",
#code_type = "short",
as_factor = FALSE
) %>%
# Coerce columns to their expected Athena output type
Expand Down
Loading