Test multi card training sqft lump 2 #332

Draft
wants to merge 18 commits into base: 2025-assessment-year
1 change: 1 addition & 0 deletions params.yaml
@@ -149,6 +149,7 @@ model:
 # Vector of predictors from the training data included in the model. Edit
 # this list to add or remove variables from the model
 all:
+  - "meta_pin_num_cards"
   - "meta_township_code"
   - "meta_nbhd_code"
   - "meta_sale_count_past_n_years"
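The only change to params.yaml is the new meta_pin_num_cards predictor. As a rough, hypothetical sketch of how a predictor list like this is typically consumed downstream (the exact YAML path model$predictor$all and the outcome column meta_sale_price are assumptions for illustration, not confirmed by this diff):

```r
# Hypothetical sketch: read the predictor vector from params.yaml and use it
# to select model inputs. The YAML path and column names are assumptions.
library(yaml)
library(dplyr)

params <- read_yaml("params.yaml")
predictors <- params$model$predictor$all # now includes "meta_pin_num_cards"

# Keep the outcome plus the configured predictors from the prepared training data
model_input <- training_data_full %>%
  select(meta_sale_price, all_of(predictors))
```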
26 changes: 25 additions & 1 deletion pipeline/01-train.R
@@ -31,9 +31,33 @@ message("Preparing model training data")
 # there is multiple buildings on a PIN. Since these sales include multiple
 # buildings, they are typically higher than a "normal" sale and must be removed
 training_data_full <- read_parquet(paths$input$training$local) %>%
-  filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
+  filter(
+    meta_pin_num_cards <= 3,
+    !sv_is_outlier
+  ) %>%
   arrange(meta_sale_date)

+# To calculate a value for multi-card properties, keep the largest card
+# and drop the others. The building square footage from the dropped card(s) is
+# added to the kept card to make a more robust prediction.
+df_multi_card_kept <- training_data_full %>%
+  filter(ind_pin_is_multicard) %>%
+  group_by(meta_pin, meta_sale_document_num) %>%
+  mutate(
+    total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE)
+  ) %>%
+  slice_max(char_bldg_sf, with_ties = FALSE) %>%
+  mutate(char_bldg_sf = total_bldg_sf) %>%
+  select(-total_bldg_sf) %>%
+  ungroup()
+
+df_single_card <- training_data_full %>%
+  filter(!ind_pin_is_multicard)
+
+training_data_full <- df_single_card %>%
+  bind_rows(df_multi_card_kept)
+
+
 # Create train/test split by time, with most recent observations in the test set
 # We want our best model(s) to be predictive of the future, since properties are
 # assessed on the basis of past sales
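To make the new multi-card handling in 01-train.R concrete, here is a small self-contained illustration (not part of the PR) of the keep-largest-card, lump-the-square-footage step on a toy tibble:

```r
# Toy illustration of the "lump" step: for a multi-card sale, keep only the
# largest card and assign it the summed building sqft of all cards on the sale.
library(dplyr)

toy_sales <- tibble(
  meta_pin = c("11-11-111-001", "11-11-111-001", "22-22-222-002"),
  meta_sale_document_num = c("D1", "D1", "D2"),
  char_bldg_sf = c(1200, 800, 1500),
  ind_pin_is_multicard = c(TRUE, TRUE, FALSE)
)

toy_sales %>%
  filter(ind_pin_is_multicard) %>%
  group_by(meta_pin, meta_sale_document_num) %>%
  mutate(total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE)) %>%
  slice_max(char_bldg_sf, with_ties = FALSE) %>%
  mutate(char_bldg_sf = total_bldg_sf) %>%
  select(-total_bldg_sf) %>%
  ungroup()
# Result: one row remains for the multi-card sale, with char_bldg_sf = 2000 (1200 + 800)
```

Single-card rows are then bound back on unchanged, so the training data ends up with at most one row per sale.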
85 changes: 71 additions & 14 deletions pipeline/02-assess.R
@@ -41,7 +41,29 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local)
 # Load the data for assessment. This is the universe of CARDs (not
 # PINs) that needs values. Use the trained lightgbm model to estimate a single
 # fair-market value for each card
-assessment_card_data_pred <- read_parquet(paths$input$assessment$local) %>%
+df_assessment_data <- read_parquet(paths$input$assessment$local)
+
+# To calculate a value for multi-card properties, keep the largest card
+# and drop the others. The building square footage from the dropped card(s) is
+# added to the kept card to make a more robust prediction.
+df_multi_card_kept <- df_assessment_data %>%
+  filter(ind_pin_is_multicard) %>%
+  group_by(meta_pin) %>%
+  mutate(
+    total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE)
+  ) %>%
+  slice_max(char_bldg_sf, with_ties = FALSE) %>%
+  mutate(char_bldg_sf = total_bldg_sf) %>%
+  select(-total_bldg_sf) %>%
+  ungroup()
+
+df_single_card <- df_assessment_data %>%
+  filter(!ind_pin_is_multicard)
+
+assessment_adjusted_multi_card <- df_single_card %>%
+  bind_rows(df_multi_card_kept)
+
+assessment_card_data_pred <- assessment_adjusted_multi_card %>%
   as_tibble() %>%
   mutate(
     pred_card_initial_fmv = predict(
@@ -65,9 +87,54 @@ message("Performing post-modeling adjustments")
 ## 3.1. Multicards -------------------------------------------------------------
 message("Fixing multicard PINs")

+# Re-add dropped multicard rows, distributing total predicted FMV
+# across each original card by share of bldg sqft
+
+# Identify the dropped multi-card rows
+df_multi_card_dropped <- df_assessment_data %>%
+  filter(ind_pin_is_multicard) %>%
+  anti_join(
+    df_multi_card_kept %>% select(meta_pin, meta_card_num),
+    by = c("meta_pin", "meta_card_num")
+  )
+
+# Combine kept predicted rows with 'dropped' rows
+df_multi_card_combined <- assessment_card_data_pred %>%
+  filter(ind_pin_is_multicard) %>%
+  bind_rows(
+    df_multi_card_dropped %>% mutate(pred_card_initial_fmv = NA_real_)
+  )
+
+# For each PIN, distribute total predicted FMV proportionally
+df_multi_card_final <- df_multi_card_combined %>%
+  group_by(meta_pin) %>%
+  mutate(
+    total_fmv = sum(pred_card_initial_fmv, na.rm = TRUE),
+    total_bldg_sf_pin = sum(char_bldg_sf, na.rm = TRUE),
+    share_bldg_sf = char_bldg_sf / total_bldg_sf_pin,
+    pred_card_initial_fmv = total_fmv * share_bldg_sf
+  ) %>%
+  ungroup() %>%
+  # Drop intermediate columns
+  select(
+    -total_fmv,
+    -share_bldg_sf,
+    -total_bldg_sf_pin
+  )
+
+# Pull single-card PINs
+df_single_card_final <- assessment_card_data_pred %>%
+  filter(!ind_pin_is_multicard)
+
+# Combine single + multi-card rows
+deaggregated_card_preds <- bind_rows(
+  df_single_card_final,
+  df_multi_card_final
+)
+
 # Cards represent buildings/improvements. A PIN can have multiple cards, and
 # the total taxable value of the PIN is (usually) the sum of all cards
-assessment_card_data_mc <- assessment_card_data_pred %>%
+assessment_card_data_mc <- deaggregated_card_preds %>%
   select(
     meta_year, meta_pin, meta_nbhd_code, meta_class, meta_card_num,
     char_bldg_sf, char_land_sf,
@@ -85,20 +152,10 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
       mean(pred_card_initial_fmv)
     )
   ) %>%
-  # Aggregate multi-cards to the PIN-level by summing the predictions
-  # of all cards. We use a heuristic here to limit the PIN-level total
-  # value, this is to prevent super-high-value back-buildings/ADUs from
-  # blowing up the PIN-level AV
+  # Re-aggregate the PIN-level prediction
   group_by(meta_pin) %>%
   mutate(
-    pred_pin_card_sum = ifelse(
-      sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <=
-        params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) |
-        is.na(meta_1yr_pri_board_tot) |
-        n() != 2,
-      sum(pred_card_intermediate_fmv),
-      max(pred_card_intermediate_fmv)
-    )
+    pred_pin_card_sum = sum(pred_card_intermediate_fmv)
   ) %>%
   ungroup()

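The assessment script mirrors the training-side lump step, predicts once per lumped card, and then fans each PIN's prediction back out across its original cards. A toy sketch (not part of the PR) of that proportional redistribution:

```r
# Toy illustration of the redistribution step: the kept (lumped) card carries
# the PIN's single prediction, which is then split across all original cards
# in proportion to their building sqft.
library(dplyr)

toy_cards <- tibble(
  meta_pin = c("11-11-111-001", "11-11-111-001"),
  meta_card_num = c(1, 2),
  char_bldg_sf = c(1200, 800),
  # Only card 1 (the kept card) has a prediction; card 2 was dropped pre-model
  pred_card_initial_fmv = c(300000, NA_real_)
)

toy_cards %>%
  group_by(meta_pin) %>%
  mutate(
    total_fmv = sum(pred_card_initial_fmv, na.rm = TRUE),
    share_bldg_sf = char_bldg_sf / sum(char_bldg_sf, na.rm = TRUE),
    pred_card_initial_fmv = total_fmv * share_bldg_sf
  ) %>%
  ungroup() %>%
  select(-total_fmv, -share_bldg_sf)
# Card 1: 0.6 * 300000 = 180000; card 2: 0.4 * 300000 = 120000
```

With the per-PIN YoY cap heuristic removed, the PIN-level total is now simply the sum of these card-level (intermediate) predictions.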
91 changes: 89 additions & 2 deletions reports/challenge_groups/challenge_groups.qmd
@@ -20,8 +20,8 @@ knitr:
     out.width: "100%"
 editor: source
 params:
-  run_id: "2024-03-17-stupefied-maya"
-  year: "2024"
+  run_id: "2025-01-10-serene-boni"
+  year: "2025"
 ---

 {{< include ../_setup.qmd >}}
@@ -303,3 +303,90 @@ iwalk(plots, ~ {
```

:::

## Assessed values for multi-card properties

The sales data used to measure accuracy here is the most recent sale for each multi-card
PIN, provided that sale occurred after 2020.

```{r _decile_ratio_graph}
# Grab multi-card indicator
assessment_pin <- assessment_pin %>%
left_join(
assessment_data %>%
select(meta_pin, ind_pin_is_multicard),
by = "meta_pin"
)

df_filtered <- assessment_pin %>%
filter(ind_pin_is_multicard) %>%
mutate(
ratio = pred_pin_final_fmv / sale_recent_1_price
) %>%
filter(sale_recent_1_date >= as.Date("2020-01-01"))


df_filtered <- df_filtered %>%
mutate(decile = ntile(pred_pin_final_fmv, 10))

df_deciles <- df_filtered %>%
group_by(decile) %>%
summarise(median_ratio = median(ratio, na.rm = TRUE))

p_deciles <- ggplot(df_deciles, aes(x = decile, y = median_ratio)) +
geom_line() +
geom_point() +
geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
scale_x_continuous(breaks = 1:10) +
labs(
title = "Median Ratio by Decile (Sales After 2020)",
x = "Decile",
y = "Median Ratio"
) +
theme_minimal()

p_deciles
```

```{r _scatterplot_pred_vs_sale}
max_val <- max(
c(df_filtered$sale_recent_1_price, df_filtered$pred_pin_final_fmv),
na.rm = TRUE
)

p_scatter_base <- plot_ly(
data = df_filtered,
x = ~sale_recent_1_price,
y = ~pred_pin_final_fmv,
type = "scatter",
mode = "markers",
hoverinfo = "text",
text = ~ paste(
"<b>PIN:</b>", meta_pin,
"<br><b>Sale Price:</b>", sale_recent_1_price,
"<br><b>Predicted (Base):</b>", round(pred_pin_final_fmv, 2),
"<br><b>Sale Date:</b>", sale_recent_1_date,
"<br><b>Ratio:</b>", round(ratio, 3)
)
) %>%
plotly::layout(
title = "FMV vs sale price",
shapes = list(
list(
type = "line",
x0 = 0,
y0 = 0,
x1 = max_val,
y1 = max_val,
xref = "x",
yref = "y",
line = list(color = "red", dash = "dash")
)
),
xaxis = list(title = "Sale Price"),
yaxis = list(title = "Predicted FMV (Base)")
)

p_scatter_base
```
