
Commit

Remove price binning from comps and filter targets by tri instead (#301)
* Remove price binning from comps and filter targets by tri instead

* Remove unnecessary predicted_value computation from comps pipeline

* Update params.yaml for comps binning changes

* Remove stray trailing comma from comps function call in interpret stage

* Temporarily reduce num_iterations in params so that we can speed up comps feedback cycle

* Fix handling for missing comps in interpret stage

* Use actual chunk values when iterating chunks in comps.py

* Switch to city tri for the purpose of testing unbinned comps

* Revert "Temporarily reduce num_iterations in params so that we can speed up comps feedback cycle"

This reverts commit 5c7098a.

* Switch back to North tri

* Add minor comps speed improvements (#314)

* Add comps incremental speedups

* Format with ruff

---------

Co-authored-by: Dan Snow <[email protected]>
jeancochrane and dfsnow authored Jan 8, 2025
1 parent 14cfc52 commit aca92c2
Showing 4 changed files with 87 additions and 251 deletions.
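For orientation before the per-file diffs: the headline change is that comps targets are no longer grouped into predicted-price bins; instead, the set of target (assessment) properties is restricted to townships in the triad being reassessed. A minimal standalone sketch of that filter, with a hypothetical targets tibble standing in for the pipeline's assessment_data:

library(dplyr)

triad <- "north" # stands in for params$assessment$triad

# Township codes belonging to the active reassessment triad
triad_townships <- ccao::town_dict %>%
  filter(triad_name == tools::toTitleCase(triad)) %>%
  pull(township_code)

# Hypothetical stand-in for the pipeline's assessment_data
targets <- tibble(
  meta_pin = c("1001", "1002", "1003"),
  meta_township_code = c("23", "70", "77") # made-up township codes
)

# Keep only targets in the triad; comps are searched for these cards only
comp_targets <- targets %>%
  filter(meta_township_code %in% triad_townships)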
8 changes: 1 addition & 7 deletions params.yaml
@@ -422,13 +422,7 @@ ratio_study:
 # in the interpret pipeline stage. Only used if comp_enable = true
 comp:
   # Number of comps to generate for each PIN/card
-  num_comps: 20
-
-  # Number of price bins to use when binning properties for the purpose of
-  # comparison. Corresponds to ntile of predicted FMV, e.g. 10 creates deciles
-  # of predicted values. Larger deciles = larger comparables search range
-  num_price_bins: 20
+  num_comps: 5

 # Export -----------------------------------------------------------------------
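For reference, the only comps knob left in params.yaml is num_comps. A minimal sketch of how a downstream stage reads it, assuming a params.yaml with the comp block above sits in the working directory (the as.integer() coercion mirrors the call in the 04-interpret.R diff below):

params <- yaml::read_yaml("params.yaml")

# num_price_bins no longer exists, so only the comp count is forwarded to
# python/comps.py's get_comps()
num_comps <- as.integer(params$comp$num_comps) # 5 after this change
stopifnot(is.integer(num_comps), num_comps > 0)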
57 changes: 32 additions & 25 deletions pipeline/04-interpret.R
@@ -122,14 +122,30 @@ lightgbm::lgb.importance(lgbm_final_full_fit$fit) %>%
 if (comp_enable) {
   message("Finding comparables")

+  # Filter target properties for only the current triad, to speed up the comps
+  # algorithm
+  comp_assessment_data <- assessment_data %>%
+    filter(
+      meta_township_code %in% (
+        ccao::town_dict %>%
+          filter(triad_name == tools::toTitleCase(params$assessment$triad)) %>%
+          pull(township_code)
+      )
+    )
+  comp_assessment_data_prepped <- recipes::bake(
+    object = lgbm_final_full_recipe,
+    new_data = comp_assessment_data,
+    all_predictors()
+  )
+
   # Calculate the leaf node assignments for every predicted value.
   # Due to integer overflow problems with leaf node assignment, we need to
   # chunk our data such that they are strictly less than the limit of 1073742
   # rows. More detail here: https://github.com/microsoft/LightGBM/issues/1884
   chunk_size <- 500000
   chunks <- split(
-    assessment_data_prepped,
-    ceiling(seq_along(assessment_data_prepped[[1]]) / chunk_size)
+    comp_assessment_data_prepped,
+    ceiling(seq_along(comp_assessment_data_prepped[[1]]) / chunk_size)
   )
   chunked_leaf_nodes <- chunks %>%
     map(\(chunk) {
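The body of the map() call above is collapsed in this view. As a rough, self-contained sketch of the chunking workaround it implements (not the pipeline's exact code), using a toy LightGBM fit in place of lgbm_final_full_fit$fit; note that predict(type = "leaf") assumes lightgbm >= 4.0, while older releases spell it predleaf = TRUE:

library(dplyr)
library(purrr)
library(tibble)
library(lightgbm)

# Toy model standing in for the pipeline's trained full fit
X <- as.matrix(mtcars[, c("wt", "hp", "disp")])
fit <- lightgbm(
  data = X,
  label = mtcars$mpg,
  params = list(objective = "regression"),
  nrounds = 5L,
  verbose = -1L
)

# Keep each prediction chunk safely under LightGBM's leaf-assignment row limit
chunk_size <- 500000
new_data <- as.data.frame(X)
chunks <- split(
  new_data,
  ceiling(seq_along(new_data[[1]]) / chunk_size)
)

# Predict leaf indices chunk by chunk, then stack the chunks back together
chunked_leaf_nodes <- chunks %>%
  map(\(chunk) {
    predict(fit, as.matrix(chunk), type = "leaf") %>%
      as_tibble(.name_repair = "unique")
  }) %>%
  bind_rows()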
@@ -184,20 +200,6 @@ if (comp_enable) {
     type = "leaf"
   ) %>%
     as_tibble()
-  training_leaf_nodes$predicted_value <- predict(
-    object = lgbm_final_full_fit$fit,
-    newdata = as.matrix(training_data_prepped)
-  ) %>%
-    # Round predicted values down for binning
-    floor()
-
-  # Get predicted values for the assessment set, which we already have in
-  # the assessment card set
-  leaf_nodes$predicted_value <- assessment_data %>%
-    left_join(assessment_card, by = c("meta_pin", "meta_card_num")) %>%
-    # Round predicted values down for binning
-    mutate(pred_card_initial_fmv = floor(pred_card_initial_fmv)) %>%
-    dplyr::pull(pred_card_initial_fmv)

   # Make sure that the leaf node tibbles are all integers, which is what
   # the comps algorithm expects
@@ -213,8 +215,7 @@
     {
       comps <- comps_module$get_comps(
         leaf_nodes, training_leaf_nodes, tree_weights,
-        num_comps = as.integer(params$comp$num_comps),
-        num_price_bins = as.integer(params$comp$num_price_bins)
+        num_comps = as.integer(params$comp$num_comps)
       )
     },
     error = function(e) {
@@ -223,32 +224,38 @@
       stop("Encountered error in python/comps.py")
     }
   )
-  # Correct for the fact that Python is 0-indexed by incrementing the
-  # comp indexes by 1
-  comps[[1]] <- comps[[1]] + 1

   # Translate comp indexes to PINs and document numbers
   comps[[1]] <- comps[[1]] %>%
     mutate(
+      # Correct for the fact that Python is 0-indexed by incrementing the
+      # comp indexes by 1, and cast null indicators (-1) to null
+      across(everything(), ~ ifelse(. == -1, NA, . + 1)),
+      # Map comp index to PIN
       across(
         starts_with("comp_idx_"),
         \(idx_row) {
-          training_data[idx_row, ]$meta_pin
+          ifelse(is.na(idx_row), NA, training_data[idx_row, ]$meta_pin)
         },
         .names = "comp_pin_{str_remove(col, 'comp_idx_')}"
       ),
+      # Map comp index to sale doc number
       across(
         starts_with("comp_idx_"),
         \(idx_row) {
-          training_data[idx_row, ]$meta_sale_document_num
+          ifelse(
+            is.na(idx_row),
+            NA,
+            training_data[idx_row, ]$meta_sale_document_num
+          )
         },
         .names = "comp_document_num_{str_remove(col, 'comp_idx_')}"
       )
     ) %>%
     select(-starts_with("comp_idx_")) %>%
     cbind(
-      pin = assessment_data$meta_pin,
-      card = assessment_data$meta_card_num
+      pin = comp_assessment_data$meta_pin,
+      card = comp_assessment_data$meta_card_num
     ) %>%
     relocate(pin, card)

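The index bookkeeping above is the part this commit fixes for missing comps: python/comps.py returns 0-based row indexes into the training set and uses -1 when no comp is found, so the R side shifts indexes by 1 and turns the -1 sentinel into NA before mapping indexes to PINs. A toy illustration of that convention, with made-up comps output and training PINs:

library(dplyr)
library(stringr)

training_data <- tibble(
  meta_pin = c("16071230220000", "16071230230000", "16071230240000")
)

# Hypothetical raw get_comps() output: one row per target card, 0-based
# indexes into training_data, -1 meaning "no comp found"
comps_idx <- tibble(
  comp_idx_1 = c(0L, 2L),
  comp_idx_2 = c(1L, -1L)
)

comps_pins <- comps_idx %>%
  mutate(
    # Shift to 1-based indexing and convert the -1 sentinel to NA
    across(everything(), ~ ifelse(. == -1, NA, . + 1)),
    # Look up the PIN for each surviving index
    across(
      starts_with("comp_idx_"),
      \(idx) ifelse(is.na(idx), NA, training_data[idx, ]$meta_pin),
      .names = "comp_pin_{str_remove(col, 'comp_idx_')}"
    )
  ) %>%
  select(-starts_with("comp_idx_"))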
1 change: 0 additions & 1 deletion pipeline/05-finalize.R
@@ -91,7 +91,6 @@ metadata <- tibble::tibble(
   shap_enable = shap_enable,
   comp_enable = comp_enable,
   comp_num_comps = params$comp$num_comps,
-  comp_num_price_bins = params$comp$num_price_bins,
   cv_enable = cv_enable,
   cv_num_folds = params$cv$num_folds,
   cv_fold_overlap = params$cv$fold_overlap,