Test multi card training sqft lump 2 #332

Draft
wants to merge 18 commits into base: 2025-assessment-year
1 change: 1 addition & 0 deletions params.yaml
@@ -149,6 +149,7 @@ model:
 # Vector of predictors from the training data included in the model. Edit
 # this list to add or remove variables from the model
 all:
+  - "meta_pin_num_cards"
   - "meta_township_code"
   - "meta_nbhd_code"
   - "meta_sale_count_past_n_years"
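The only change to params.yaml is the new meta_pin_num_cards predictor. As a rough, hypothetical sketch of how a predictor list like this is typically consumed downstream (the exact YAML path model$predictor$all and the outcome column meta_sale_price are assumptions for illustration, not confirmed by this diff):

```r
# Hypothetical sketch: read the predictor vector from params.yaml and use it
# to select model inputs. The YAML path and column names are assumptions.
library(yaml)
library(dplyr)

params <- read_yaml("params.yaml")
predictors <- params$model$predictor$all # now includes "meta_pin_num_cards"

# Keep the outcome plus the configured predictors from the prepared training data
model_input <- training_data_full %>%
  select(meta_sale_price, all_of(predictors))
```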
26 changes: 25 additions & 1 deletion pipeline/01-train.R
@@ -31,9 +31,33 @@ message("Preparing model training data")
 # there is multiple buildings on a PIN. Since these sales include multiple
 # buildings, they are typically higher than a "normal" sale and must be removed
 training_data_full <- read_parquet(paths$input$training$local) %>%
-  filter(!ind_pin_is_multicard, !sv_is_outlier) %>%
+  filter(
+    meta_pin_num_cards <= 3,
+    !sv_is_outlier
+  ) %>%
   arrange(meta_sale_date)

+# To calculate a value for multi-card properties, keep the largest card
+# and drop the others. The building square footage from the dropped card(s) is
+# added to the kept card to make a more robust prediction.
+df_multi_card_kept <- training_data_full %>%
+  filter(ind_pin_is_multicard) %>%
+  group_by(meta_pin, meta_sale_document_num) %>%
+  mutate(
+    total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE)
+  ) %>%
+  slice_max(char_bldg_sf, with_ties = FALSE) %>%
+  mutate(char_bldg_sf = total_bldg_sf) %>%
+  select(-total_bldg_sf) %>%
+  ungroup()
+
+df_single_card <- training_data_full %>%
+  filter(!ind_pin_is_multicard)
+
+training_data_full <- df_single_card %>%
+  bind_rows(df_multi_card_kept)
+
+
 # Create train/test split by time, with most recent observations in the test set
 # We want our best model(s) to be predictive of the future, since properties are
 # assessed on the basis of past sales
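To make the new multi-card handling in 01-train.R concrete, here is a small self-contained illustration (not part of the PR) of the keep-largest-card, lump-the-square-footage step on a toy tibble:

```r
# Toy illustration of the "lump" step: for a multi-card sale, keep only the
# largest card and assign it the summed building sqft of all cards on the sale.
library(dplyr)

toy_sales <- tibble(
  meta_pin = c("11-11-111-001", "11-11-111-001", "22-22-222-002"),
  meta_sale_document_num = c("D1", "D1", "D2"),
  char_bldg_sf = c(1200, 800, 1500),
  ind_pin_is_multicard = c(TRUE, TRUE, FALSE)
)

toy_sales %>%
  filter(ind_pin_is_multicard) %>%
  group_by(meta_pin, meta_sale_document_num) %>%
  mutate(total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE)) %>%
  slice_max(char_bldg_sf, with_ties = FALSE) %>%
  mutate(char_bldg_sf = total_bldg_sf) %>%
  select(-total_bldg_sf) %>%
  ungroup()
# Result: one row remains for the multi-card sale, with char_bldg_sf = 2000 (1200 + 800)
```

Single-card rows are then bound back on unchanged, so the training data ends up with at most one row per sale.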
85 changes: 71 additions & 14 deletions pipeline/02-assess.R
@@ -41,7 +41,29 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local)
 # Load the data for assessment. This is the universe of CARDs (not
 # PINs) that needs values. Use the trained lightgbm model to estimate a single
 # fair-market value for each card
-assessment_card_data_pred <- read_parquet(paths$input$assessment$local) %>%
+df_assessment_data <- read_parquet(paths$input$assessment$local)
+
+# To calculate a value for multi-card properties, keep the largest card
+# and drop the others. The building square footage from the dropped card(s) is
+# added to the kept card to make a more robust prediction.
+df_multi_card_kept <- df_assessment_data %>%
+  filter(ind_pin_is_multicard) %>%
+  group_by(meta_pin) %>%
+  mutate(
+    total_bldg_sf = sum(char_bldg_sf, na.rm = TRUE)
+  ) %>%
+  slice_max(char_bldg_sf, with_ties = FALSE) %>%
+  mutate(char_bldg_sf = total_bldg_sf) %>%
+  select(-total_bldg_sf) %>%
+  ungroup()
+
+df_single_card <- df_assessment_data %>%
+  filter(!ind_pin_is_multicard)
+
+assessment_adjusted_multi_card <- df_single_card %>%
+  bind_rows(df_multi_card_kept)
+
+assessment_card_data_pred <- assessment_adjusted_multi_card %>%
   as_tibble() %>%
   mutate(
     pred_card_initial_fmv = predict(
@@ -65,9 +87,54 @@ message("Performing post-modeling adjustments")
 ## 3.1. Multicards -------------------------------------------------------------
 message("Fixing multicard PINs")

+# Re-add dropped multicard rows, distributing total predicted FMV
+# across each original card by share of bldg sqft
+
+# Identify the dropped multi-card rows
+df_multi_card_dropped <- df_assessment_data %>%
+  filter(ind_pin_is_multicard) %>%
+  anti_join(
+    df_multi_card_kept %>% select(meta_pin, meta_card_num),
+    by = c("meta_pin", "meta_card_num")
+  )
+
+# Combine kept predicted rows with 'dropped' rows
+df_multi_card_combined <- assessment_card_data_pred %>%
+  filter(ind_pin_is_multicard) %>%
+  bind_rows(
+    df_multi_card_dropped %>% mutate(pred_card_initial_fmv = NA_real_)
+  )
+
+# For each PIN, distribute total predicted FMV proportionally
+df_multi_card_final <- df_multi_card_combined %>%
+  group_by(meta_pin) %>%
+  mutate(
+    total_fmv = sum(pred_card_initial_fmv, na.rm = TRUE),
+    total_bldg_sf_pin = sum(char_bldg_sf, na.rm = TRUE),
+    share_bldg_sf = char_bldg_sf / total_bldg_sf_pin,
+    pred_card_initial_fmv = total_fmv * share_bldg_sf
+  ) %>%
+  ungroup() %>%
+  # Drop intermediate columns
+  select(
+    -total_fmv,
+    -share_bldg_sf,
+    -total_bldg_sf_pin
+  )
+
+# Pull single-card PINs
+df_single_card_final <- assessment_card_data_pred %>%
+  filter(!ind_pin_is_multicard)
+
+# Combine single + multi-card rows
+deaggregated_card_preds <- bind_rows(
+  df_single_card_final,
+  df_multi_card_final
+)
+
 # Cards represent buildings/improvements. A PIN can have multiple cards, and
 # the total taxable value of the PIN is (usually) the sum of all cards
-assessment_card_data_mc <- assessment_card_data_pred %>%
+assessment_card_data_mc <- deaggregated_card_preds %>%
   select(
     meta_year, meta_pin, meta_nbhd_code, meta_class, meta_card_num,
     char_bldg_sf, char_land_sf,
@@ -85,20 +152,10 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
       mean(pred_card_initial_fmv)
     )
   ) %>%
-  # Aggregate multi-cards to the PIN-level by summing the predictions
-  # of all cards. We use a heuristic here to limit the PIN-level total
-  # value, this is to prevent super-high-value back-buildings/ADUs from
-  # blowing up the PIN-level AV
+  # Re-aggregate the PIN-level prediction
   group_by(meta_pin) %>%
   mutate(
-    pred_pin_card_sum = ifelse(
-      sum(pred_card_intermediate_fmv) * meta_tieback_proration_rate <=
-        params$pv$multicard_yoy_cap * first(meta_1yr_pri_board_tot * 10) |
-        is.na(meta_1yr_pri_board_tot) |
-        n() != 2,
-      sum(pred_card_intermediate_fmv),
-      max(pred_card_intermediate_fmv)
-    )
+    pred_pin_card_sum = sum(pred_card_intermediate_fmv)
   ) %>%
   ungroup()

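The assessment script mirrors the training-side lump step, predicts once per lumped card, and then fans each PIN's prediction back out across its original cards. A toy sketch (not part of the PR) of that proportional redistribution:

```r
# Toy illustration of the redistribution step: the kept (lumped) card carries
# the PIN's single prediction, which is then split across all original cards
# in proportion to their building sqft.
library(dplyr)

toy_cards <- tibble(
  meta_pin = c("11-11-111-001", "11-11-111-001"),
  meta_card_num = c(1, 2),
  char_bldg_sf = c(1200, 800),
  # Only card 1 (the kept card) has a prediction; card 2 was dropped pre-model
  pred_card_initial_fmv = c(300000, NA_real_)
)

toy_cards %>%
  group_by(meta_pin) %>%
  mutate(
    total_fmv = sum(pred_card_initial_fmv, na.rm = TRUE),
    share_bldg_sf = char_bldg_sf / sum(char_bldg_sf, na.rm = TRUE),
    pred_card_initial_fmv = total_fmv * share_bldg_sf
  ) %>%
  ungroup() %>%
  select(-total_fmv, -share_bldg_sf)
# Card 1: 0.6 * 300000 = 180000; card 2: 0.4 * 300000 = 120000
```

With the per-PIN YoY cap heuristic removed, the PIN-level total is now simply the sum of these card-level (intermediate) predictions.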
91 changes: 89 additions & 2 deletions reports/challenge_groups/challenge_groups.qmd
@@ -20,8 +20,8 @@ knitr:
     out.width: "100%"
 editor: source
 params:
-  run_id: "2024-03-17-stupefied-maya"
-  year: "2024"
+  run_id: "2025-01-10-serene-boni"
+  year: "2025"
 ---

 {{< include ../_setup.qmd >}}
@@ -303,3 +303,90 @@ iwalk(plots, ~ {
```

:::

## Assessed values for multi-card properties

The sales data used to measure accuracy here is the most recent sale for each multi-card
PIN, provided that sale occurred after 2020.

```{r _decile_ratio_graph}
# Grab multi-card indicator
assessment_pin <- assessment_pin %>%
left_join(
assessment_data %>%
select(meta_pin, ind_pin_is_multicard),
by = "meta_pin"
)

df_filtered <- assessment_pin %>%
filter(ind_pin_is_multicard) %>%
mutate(
ratio = pred_pin_final_fmv / sale_recent_1_price
) %>%
filter(sale_recent_1_date >= as.Date("2020-01-01"))


df_filtered <- df_filtered %>%
mutate(decile = ntile(pred_pin_final_fmv, 10))

df_deciles <- df_filtered %>%
group_by(decile) %>%
summarise(median_ratio = median(ratio, na.rm = TRUE))

p_deciles <- ggplot(df_deciles, aes(x = decile, y = median_ratio)) +
geom_line() +
geom_point() +
geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
scale_x_continuous(breaks = 1:10) +
labs(
title = "Median Ratio by Decile (Sales After 2020)",
x = "Decile",
y = "Median Ratio"
) +
theme_minimal()

p_deciles
```

```{r _scatterplot_pred_vs_sale}
max_val <- max(
c(df_filtered$sale_recent_1_price, df_filtered$pred_pin_final_fmv),
na.rm = TRUE
)

p_scatter_base <- plot_ly(
data = df_filtered,
x = ~sale_recent_1_price,
y = ~pred_pin_final_fmv,
type = "scatter",
mode = "markers",
hoverinfo = "text",
text = ~ paste(
"<b>PIN:</b>", meta_pin,
"<br><b>Sale Price:</b>", sale_recent_1_price,
"<br><b>Predicted (Base):</b>", round(pred_pin_final_fmv, 2),
"<br><b>Sale Date:</b>", sale_recent_1_date,
"<br><b>Ratio:</b>", round(ratio, 3)
)
) %>%
plotly::layout(
title = "FMV vs sale price",
shapes = list(
list(
type = "line",
x0 = 0,
y0 = 0,
x1 = max_val,
y1 = max_val,
xref = "x",
yref = "y",
line = list(color = "red", dash = "dash")
)
),
xaxis = list(title = "Sale Price"),
yaxis = list(title = "Predicted FMV (Base)")
)

p_scatter_base
```
