diff --git a/R/recipes.R b/R/recipes.R
index 746aa3c5..d1e7de0d 100644
--- a/R/recipes.R
+++ b/R/recipes.R
@@ -10,17 +10,18 @@
#' will be the right-hand side of the regression AKA predictors.
#' @param cat_vars Character vector of categorical column names. These will be
#' integer-encoded (base 0).
-#' @param knn_vars Character vector of column names. These columns will have
-#' missing values imputed via KNN.
-#' @param knn_imp_vars Character vector of column names. These columns will be
-#' used to impute the columns in knn_vars.
+#' @param imp Character vector of column names. These columns will have
+#' missing values imputed.
+#' @param imp_vars Character vector of column names. These columns will be
+#' used to impute the columns in imp.
#' @param id_vars Character vector of ID variables. These can be kept in "baked"
#' data without being treated as predictors.
+#' @param seed Integer seed value for reproducibility.
#'
#' @return A recipe object that can be used to clean model input data.
#'
model_main_recipe <- function(data, pred_vars, cat_vars,
- knn_vars, knn_imp_vars, id_vars) {
+ imp, imp_vars, id_vars, seed) {
recipe(data) %>%
# Set the role of each variable in the input data
update_role(meta_sale_price, new_role = "outcome") %>%
@@ -30,19 +31,12 @@ model_main_recipe <- function(data, pred_vars, cat_vars,
update_role_requirements("NA", bake = FALSE) %>%
# Remove any variables not an outcome var or in the pred_vars vector
step_rm(-all_outcomes(), -all_predictors(), -has_role("ID")) %>%
- # Impute missing values using KNN. Specific to condo model, usually used to
- # impute missing condo building strata. Within step_impute_knn, an estimated
- # node value is called with the sample(). This is not deterministic, meaning
- # different runs of the model will have different imputed values, and thus
- # different FMVs.
- step_impute_knn(
- all_of(knn_vars),
- neighbors = tune(),
- impute_with = imp_vars(all_of(knn_imp_vars)),
- options = list(
- nthread = parallel::detectCores(logical = FALSE),
- eps = 1e-08
- )
+  # Impute missing values using a separate bagged tree model
+ step_impute_bag(
+ all_of(imp),
+ trees = tune("imp_trees"),
+ impute_with = imp_vars(all_of(imp_vars)),
+ seed_val = seed
) %>%
# Replace novel levels with "new"
step_novel(all_of(cat_vars), -has_role("ID")) %>%
@@ -66,17 +60,18 @@ model_main_recipe <- function(data, pred_vars, cat_vars,
#' will be the right-hand side of the regression AKA predictors.
#' @param cat_vars Character vector of categorical column names. These will be
#' transformed/encoded using embeddings.
-#' @param knn_vars Character vector of column names. These columns will have
-#' missing values imputed via KNN.
-#' @param knn_imp_vars Character vector of column names. These columns will be
-#' used to impute the columns in knn_vars.
+#' @param imp Character vector of column names. These columns will have
+#' missing values imputed.
+#' @param imp_vars Character vector of column names. These columns will be
+#' used to impute the columns in imp.
#' @param id_vars Character vector of ID variables. These can be kept in "baked"
#' data without being treated as predictors.
+#' @param seed Integer seed value for reproducibility.
#'
#' @return A recipe object that can be used to clean model input data.
#'
model_lin_recipe <- function(data, pred_vars, cat_vars,
- knn_vars, knn_imp_vars, id_vars) {
+ imp, imp_vars, id_vars, seed) {
recipe(data) %>%
# Set the role of each variable in the input data
update_role(meta_sale_price, new_role = "outcome") %>%
@@ -89,16 +84,12 @@ model_lin_recipe <- function(data, pred_vars, cat_vars,
step_rm(-all_outcomes(), -all_predictors(), -has_role("ID")) %>%
# Drop extra location predictors that aren't nbhd or township
step_rm(starts_with("loc_"), -all_numeric_predictors()) %>%
- # Impute missing values using KNN. Specific to condo model, usually used to
- # impute missing condo building strata
- step_impute_knn(
- all_of(knn_vars),
- neighbors = tune(),
- impute_with = imp_vars(all_of(knn_imp_vars)),
- options = list(
- nthread = parallel::detectCores(logical = FALSE),
- eps = 1e-08
- )
+  # Impute missing values using a separate bagged tree model
+ step_impute_bag(
+ all_of(imp),
+ trees = tune("imp_trees"),
+ impute_with = imp_vars(all_of(imp_vars)),
+ seed_val = seed
) %>%
# Transforms and imputations
step_mutate(
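
A minimal, hedged sketch of the new imputation step in isolation (the toy data and column names are hypothetical; only the `impute_with`, `trees`, and `seed_val` usage mirrors the `step_impute_bag()` call above, with `trees` fixed instead of tuned):

```r
library(recipes)

# Toy data: `bldg_mean` stands in for a column like meta_pin10_bldg_roll_mean
toy <- data.frame(
  price     = c(100, 120, 90, 110, 130, 95) * 1000,
  bldg_mean = c(101, NA, 92, 108, NA, 97) * 1000, # column to impute
  lat       = c(41.70, 41.80, 41.90, 42.00, 41.75, 41.85),
  lon       = c(-87.60, -87.70, -87.80, -87.65, -87.72, -87.90)
)

rec <- recipe(price ~ ., data = toy) %>%
  step_impute_bag(
    bldg_mean,
    impute_with = imp_vars(lat, lon),
    trees = 25,    # fixed here; the pipeline tunes this via tune("imp_trees")
    seed_val = 123 # analogous to the new `seed` argument, for reproducibility
  )

bake(prep(rec), new_data = NULL)$bldg_mean # NAs filled by bagged-tree predictions
```
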
diff --git a/README.Rmd b/README.Rmd
index d491facd..a50fd902 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -54,13 +54,14 @@ Like most assessors nationwide, our office staff cannot enter buildings to obser
The only _complete_ information our office currently has about individual condominium units is their age, location, sale date/price, and percentage of ownership. This makes modeling condos particularly challenging, as the number of usable features is quite small. Fortunately, condos have two qualities which make modeling a bit easier:
1. Condos are more homogeneous than single/multi-family properties, i.e. the range of potential condo sale prices is much narrower.
-2. Condo are pre-grouped into clusters of like units (buildings), and units within the same building usually have similar sale prices.
+2. Condos are pre-grouped into clusters of like units (buildings), and units within the same building usually have similar sale prices.
-We leverage these qualities to produce what we call ***strata***, a feature unique to the condo model. See [Condo Strata](#condo-strata) for more information about how strata is used and calculated.
+We leverage these qualities to produce a time-weighted, rolling average sale price for
+each building, which is then used as a predictor in the unit-level model.
### Features Used
-Because our individual condo unit characteristics are sparse and incomplete, we primarily must rely on aggregate geospatial features, economic features, [strata](#condo-strata), and time of sale to determine condo assessed values. The features in the table below are the ones used in the most recent assessment model.
+Because our individual condo unit characteristics are sparse and incomplete, we primarily must rely on aggregate geospatial features, economic features, and time of sale to determine condo assessed values. The features in the table below are the ones used in the most recent assessment model.
```{r features_used, message=FALSE, echo=FALSE}
library(dplyr)
@@ -87,11 +88,7 @@ hardcoded_descriptions <- tribble(
"sale_day_of_year", "Numeric encoding of day of year (1 - 365)",
"sale_day_of_month", "Numeric encoding of day of month (1 - 31)",
"sale_day_of_week", "Numeric encoding of day of week (1 - 7)",
- "sale_post_covid", "Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020)",
- "strata_1",
- glue("Condominium Building Strata - {condo_params$input$strata$k_1} Levels"),
- "strata_2",
- glue("Condominium Building Strata - {condo_params$input$strata$k_2} Levels")
+ "sale_post_covid", "Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020)"
)
# nolint end
@@ -209,7 +206,7 @@ We maintain a few useful resources for working with these features:
- Once you've [pulled the input data](#getting-data), you can inner join the data to the CSV version of the data dictionary ([`docs/data-dict.csv`](./docs/data-dict.csv)) to filter for only the features that we use in the model.
- You can browse our [data catalog](https://ccao-data.github.io/data-architecture/#!/overview) to see more details about these features, in particular the [condo model input view](https://ccao-data.github.io/data-architecture/#!/model/model.ccao_data_athena.model.vw_pin_condo_input) which is the source of our training data.
-- You can use the [`ccao` R package](https://ccao-data.github.io/ccao/) or its [Python equivalent](https://ccao-data.github.io/ccao/python/) to programmatically convert variable names to their human-readable versions ([`ccao::vars_rename()`](https://ccao-data.github.io/ccao/reference/vars_rename.html)) or convert numerically-encoded variables to human-readable values ([`ccao::vars_recode()`](https://ccao-data.github.io/ccao/reference/vars_recode.html). The [`ccao::vars_dict` object](https://ccao-data.github.io/ccao/reference/vars_dict.html) is also useful for inspecting the raw crosswalk that powers the rename and recode functions.
+- You can use the [`ccao` R package](https://ccao-data.github.io/ccao/) or its [Python equivalent](https://ccao-data.github.io/ccao/python/) to programmatically convert variable names to their human-readable versions ([`ccao::vars_rename()`](https://ccao-data.github.io/ccao/reference/vars_rename.html)) or convert numerically-encoded variables to human-readable values ([`ccao::vars_recode()`](https://ccao-data.github.io/ccao/reference/vars_recode.html)). The [`ccao::vars_dict` object](https://ccao-data.github.io/ccao/reference/vars_dict.html) is also useful for inspecting the raw crosswalk that powers the rename and recode functions.
### Valuation
@@ -236,34 +233,6 @@ The condo model is trained on a select number of "multi-PIN sales" (or "multi-sa
$$\frac{0.04}{0.04 + 0.01} * \$100,000 = \$80,000$$
-## Condo Strata
-
-The condo model uses an engineered feature called *strata* to deliver much of its predictive power. Strata is the binned, time-weighted, 5-year average sale price of the building. There are two strata features used in the model, one with `r condo_params$input$strata$k_1` bins and one with `r condo_params$input$strata$k_2` bins. Buildings are binned across each triad using either quantiles or 1-dimensional k-means. A visual representation of quantile-based strata binning looks like:
-
-
-
-To put strata in more concrete terms, the table below shows a sample 5-level strata. Each condominium unit would be assigned a strata from this table (Strata 1, Strata 2, etc.) based on the 5-year weighted average sale price of its building. All units in a building will have the same strata.
-
-```{r strata, echo=FALSE}
-library(tibble)
-
-tribble(
- ~"Strata", ~"Range of 5-year Average Sale Price",
- "Strata 1", "$0 - $121K",
- "Strata 2", "$121K - $149K",
- "Strata 3", "$149K - $199K",
- "Strata 4", "$199K - $276K",
- "Strata 5", "$276K+"
-) %>%
- knitr::kable(format = "markdown")
-```
-
-Some additional notes on strata:
-
-- Strata is calculated in the [ingest stage](./pipeline/00-ingest.R) of this repository.
-- Calculating the 5-year average sale price of a building requires at least 1 sale. Buildings with no sales have their strata imputed via KNN (using year built, number of units, and location as features).
-- Number of bins (`r condo_params$input$strata$k_1` and `r condo_params$input$strata$k_2`) was chosen based on model performance. These numbers yielded the lowest root mean-squared error (RMSE).
-
# Ongoing Issues
The CCAO faces a number of ongoing issues specific to condominium modeling. We are currently working on processes to fix these issues. We list the issues here for the sake of transparency and to provide a sense of the challenges we face.
@@ -272,10 +241,10 @@ The CCAO faces a number of ongoing issues specific to condominium modeling. We a
The current modeling methodology for condominiums makes two assumptions:
-1. Condos units within the same building are similar and will sell for similar amounts.
+1. Condo units within the same building are similar and will sell for similar amounts.
2. If units are not similar, the percentage of ownership will accurately reflect and be proportional to any difference in value between units.
-The model process works even in heterogeneous buildings as long as assumption 2 is met. For example, imagine a building with 8 identical units and 1 penthouse unit. This building violates assumption 1 because the penthouse unit is likely larger and worth more than the other 10. However, if the percentage of ownership of each unit is roughly proportional to its value, then each unit will still receive a fair assessment.
+The model process works even in heterogeneous buildings as long as assumption 2 is met. For example, imagine a building with 8 identical units and 1 penthouse unit. This building violates assumption 1 because the penthouse unit is likely larger and worth more than the other 8. However, if the percentage of ownership of each unit is roughly proportional to its value, then each unit will still receive a fair assessment.
However, the model can produce poor results when both of these assumptions are violated. For example, if a building has an extreme mix of different units, each with the same percentage of ownership, then smaller, less expensive units will be overvalued and larger, more expensive units will be undervalued.
@@ -283,13 +252,13 @@ This problem is rare, but does occur in certain buildings with many heterogeneou
### Buildings With Few Sales
-The condo model relies on sales within the same building to calculate [strata](#condo-strata). This method works well for large buildings with many sales, but can break down when there are only 1 or 2 sales in a building. The primary danger here is _unrepresentative_ sales, i.e. sales that deviate significantly from the real average value of a building's units. When this happens, buildings can have their average unit sale value pegged too high or low.
+The condo model relies on sales within the same building to calculate a weighted, rolling average building sale price. This method works well for large buildings with many sales, but can break down when there are only 1 or 2 sales in a building. The primary danger here is _unrepresentative_ sales, i.e. sales that deviate significantly from the real average value of a building's units. When this happens, buildings can have their average unit sale value pegged too high or low.
Fortunately, buildings without any recent sales are relatively rare, as condos have a higher turnover rate than single and multi-family property. Smaller buildings with low turnover are the most likely to not have recent sales.
### Buildings Without Sales
-When no sales have occurred in a building in the 5 years prior to assessment, the building's strata features are imputed. The model will look at nearby buildings that have similar unit counts/age and then try to assign an appropriate strata to the target building.
+When no sales have occurred in a building in the 5 years prior to assessment, the building's mean sale price feature is imputed. The model will look at nearby buildings that have similar unit counts, age, and other features, then try to assign an appropriate average to the target building.
Most of the time, this technique produces reasonable results. However, buildings without sales still go through an additional round of review to ensure the accuracy of individual unit values.
@@ -303,11 +272,7 @@ As with the [residential model](https://github.com/ccao-data/model-res-avm), the
* Location, location, location. Location is the largest driver of county-wide variation in condo value. We account for location using [geospatial features like neighborhood](#features-used).
* Condo percentage of ownership, which determines the intra-building variation in unit price.
-* [Condo building strata](#condo-strata). Strata provides us with a good estimate of the average sale price of a building's units.
-
-**Q: How do I see my condo building's strata?**
-
-Individual building [strata](#condo-strata) are not included with assessment notices or shown on the CCAO's website. However, strata *are* stored in the sample data included in this repository. You can load the data ([`input/condo_strata_data.parquet`](./input/condo_strata_data.parquet)) using R and the `read_parquet()` function from the `arrow` library.
+* Other sales in the building. This is captured by a rolling average of sales in the building over the past 5 years, excluding any sales of the target condo unit.
**Q: How do I see the assessed value of other units in my building?**
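
Conceptually, the strata replacement described above is a leave-one-out, time-weighted rolling mean per building. A self-contained sketch of the idea (hypothetical toy data; illustrative linear weights rather than the pipeline's logistic curve):

```r
library(dplyr)

# Hypothetical sales; a building is the first 10 digits of the PIN
sales <- tibble::tribble(
  ~bldg,        ~sale_date,             ~price,
  "1234567890", as.Date("2019-05-01"), 180e3,
  "1234567890", as.Date("2021-08-15"), 200e3,
  "1234567890", as.Date("2023-02-10"), 210e3,
  "0987654321", as.Date("2022-11-30"), 350e3
)

sales %>%
  group_by(bldg) %>%
  arrange(sale_date, .by_group = TRUE) %>%
  mutate(
    # Linear weights on [0.1, 1]: older sales count less than recent ones
    wt = 0.1 + 0.9 * as.numeric(sale_date - min(sale_date)) /
      max(1, as.numeric(max(sale_date) - min(sale_date))),
    bldg_roll_mean = purrr::map_dbl(seq_along(sale_date), function(i) {
      # Keep sales in the 5 years before sale i, excluding sale i itself
      in_win <- sale_date > sale_date[i] - 5 * 365.25 & sale_date <= sale_date[i]
      in_win[i] <- FALSE # leave-one-out, so a sale never predicts itself
      if (!any(in_win)) NA_real_ else weighted.mean(price[in_win], wt[in_win])
    })
  )
```
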
diff --git a/README.md b/README.md
index b0c92d2c..ffe165f4 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,6 @@ Table of Contents
- [Features Used](#features-used)
- [Valuation](#valuation)
- [Multi-PIN Sales](#multi-pin-sales)
- - [Condo Strata](#condo-strata)
- [Ongoing Issues](#ongoing-issues)
- [Unit Heterogeneity](#unit-heterogeneity)
- [Buildings With Few Sales](#buildings-with-few-sales)
@@ -92,20 +91,20 @@ Fortunately, condos have two qualities which make modeling a bit easier:
1. Condos are more homogeneous than single/multi-family properties,
i.e. the range of potential condo sale prices is much narrower.
-2. Condo are pre-grouped into clusters of like units (buildings), and
+2. Condos are pre-grouped into clusters of like units (buildings), and
units within the same building usually have similar sale prices.
-We leverage these qualities to produce what we call ***strata***, a
-feature unique to the condo model. See [Condo Strata](#condo-strata) for
-more information about how strata is used and calculated.
+We leverage these qualities to produce a time-weighted, rolling average
+sale price for each building, which is then used as a predictor in the
+unit-level model.
### Features Used
Because our individual condo unit characteristics are sparse and
incomplete, we primarily must rely on aggregate geospatial features,
-economic features, [strata](#condo-strata), and time of sale to
-determine condo assessed values. The features in the table below are the
-ones used in the most recent assessment model.
+economic features, and time of sale to determine condo assessed values.
+The features in the table below are the ones used in the most recent
+assessment model.
| Feature Name | Variable Name | Description | Category | Type | Unique to Condo Model |
|:----------------------------------------------------------------------------|:------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------|:----------|:----------------------|
@@ -119,8 +118,8 @@ ones used in the most recent assessment model.
| Condominium Unit Half Baths | char_half_baths | Number of half baths | Characteristic | numeric | X |
| Condominium Unit Full Baths | char_full_baths | Number of full bathrooms | Characteristic | numeric | X |
| Condominium % Ownership | meta_tieback_proration_rate | Proration rate applied to the PIN | Meta | numeric | X |
-| Condominium Building Strata 1 | meta_strata_1 | Condominium Building Strata - 10 Levels | Meta | character | X |
-| Condominium Building Strata 2 | meta_strata_2 | Condominium Building Strata - 100 Levels | Meta | character | X |
+| Building Rolling Average Sale Price | meta_pin10_bldg_roll_mean | Time-weighted rolling average sale price of the building over the prior 5 years | Meta | numeric | X |
+| Building Rolling Percent Units Sold | meta_pin10_bldg_roll_pct_sold | Percentage of the building's units sold over the prior 5 years | Meta | numeric | X |
| Standard Deviation Distance From Parcel Centroid to Vertices (Feet) | shp_parcel_centroid_dist_ft_sd | Standard deviation of the distance from each major parcel vertex to the parcel centroid | Parcel Shape | numeric | X |
| Standard Deviation Parcel Edge Length (Feet) | shp_parcel_edge_len_ft_sd | Standard deviation of the edge length between parcel vertices | Parcel Shape | numeric | X |
| Standard Deviation Parcel Interior Angle (Degrees) | shp_parcel_interior_angle_sd | Standard deviation of the interior angles of the parcel polygon | Parcel Shape | numeric | X |
@@ -211,7 +210,7 @@ We maintain a few useful resources for working with these features:
versions
([`ccao::vars_rename()`](https://ccao-data.github.io/ccao/reference/vars_rename.html))
or convert numerically-encoded variables to human-readable values
- ([`ccao::vars_recode()`](https://ccao-data.github.io/ccao/reference/vars_recode.html).
+ ([`ccao::vars_recode()`](https://ccao-data.github.io/ccao/reference/vars_recode.html)).
The [`ccao::vars_dict`
object](https://ccao-data.github.io/ccao/reference/vars_dict.html) is
also useful for inspecting the raw crosswalk that powers the rename
@@ -271,42 +270,6 @@ a parking space (1% ownership), the sale would be adjusted to \$80,000:
$$\frac{0.04}{0.04 + 0.01} * \$100,000 = \$80,000$$
-## Condo Strata
-
-The condo model uses an engineered feature called *strata* to deliver
-much of its predictive power. Strata is the binned, time-weighted,
-5-year average sale price of the building. There are two strata features
-used in the model, one with 10 bins and one with 100 bins. Buildings are
-binned across each triad using either quantiles or 1-dimensional
-k-means. A visual representation of quantile-based strata binning looks
-like:
-
-
-
-To put strata in more concrete terms, the table below shows a sample
-5-level strata. Each condominium unit would be assigned a strata from
-this table (Strata 1, Strata 2, etc.) based on the 5-year weighted
-average sale price of its building. All units in a building will have
-the same strata.
-
-| Strata | Range of 5-year Average Sale Price |
-|:---------|:-----------------------------------|
-| Strata 1 | \$0 - \$121K |
-| Strata 2 | \$121K - \$149K |
-| Strata 3 | \$149K - \$199K |
-| Strata 4 | \$199K - \$276K |
-| Strata 5 | \$276K+ |
-
-Some additional notes on strata:
-
-- Strata is calculated in the [ingest stage](./pipeline/00-ingest.R) of
- this repository.
-- Calculating the 5-year average sale price of a building requires at
- least 1 sale. Buildings with no sales have their strata imputed via
- KNN (using year built, number of units, and location as features).
-- Number of bins (10 and 100) was chosen based on model performance.
- These numbers yielded the lowest root mean-squared error (RMSE).
-
# Ongoing Issues
The CCAO faces a number of ongoing issues specific to condominium
@@ -318,7 +281,7 @@ of the challenges we face.
The current modeling methodology for condominiums makes two assumptions:
-1. Condos units within the same building are similar and will sell for
+1. Condo units within the same building are similar and will sell for
similar amounts.
2. If units are not similar, the percentage of ownership will
accurately reflect and be proportional to any difference in value
@@ -327,7 +290,7 @@ The current modeling methodology for condominiums makes two assumptions:
The model process works even in heterogeneous buildings as long as
assumption 2 is met. For example, imagine a building with 8 identical
units and 1 penthouse unit. This building violates assumption 1 because
-the penthouse unit is likely larger and worth more than the other 10.
+the penthouse unit is likely larger and worth more than the other 8.
However, if the percentage of ownership of each unit is roughly
proportional to its value, then each unit will still receive a fair
assessment.
@@ -344,13 +307,13 @@ secondary review to ensure the accuracy of the individual unit values.
### Buildings With Few Sales
-The condo model relies on sales within the same building to calculate
-[strata](#condo-strata). This method works well for large buildings with
-many sales, but can break down when there are only 1 or 2 sales in a
-building. The primary danger here is *unrepresentative* sales,
-i.e. sales that deviate significantly from the real average value of a
-building’s units. When this happens, buildings can have their average
-unit sale value pegged too high or low.
+The condo model relies on sales within the same building to calculate a
+weighted, rolling average building sale price. This method works well
+for large buildings with many sales, but can break down when there are
+only 1 or 2 sales in a building. The primary danger here is
+*unrepresentative* sales, i.e. sales that deviate significantly from the
+real average value of a building’s units. When this happens, buildings
+can have their average unit sale value pegged too high or low.
Fortunately, buildings without any recent sales are relatively rare, as
condos have a higher turnover rate than single and multi-family
@@ -360,9 +323,10 @@ have recent sales.
### Buildings Without Sales
When no sales have occurred in a building in the 5 years prior to
-assessment, the building’s strata features are imputed. The model will
-look at nearby buildings that have similar unit counts/age and then try
-to assign an appropriate strata to the target building.
+assessment, the building’s mean sale price feature is imputed. The model
+will look at nearby buildings that have similar unit counts, age, and
+other features, then try to assign an appropriate average to the target
+building.
Most of the time, this technique produces reasonable results. However,
buildings without sales still go through an additional round of review
@@ -386,17 +350,9 @@ speaking, the most important features are:
[geospatial features like neighborhood](#features-used).
- Condo percentage of ownership, which determines the intra-building
variation in unit price.
-- [Condo building strata](#condo-strata). Strata provides us with a good
- estimate of the average sale price of a building’s units.
-
-**Q: How do I see my condo building’s strata?**
-
-Individual building [strata](#condo-strata) are not included with
-assessment notices or shown on the CCAO’s website. However, strata *are*
-stored in the sample data included in this repository. You can load the
-data
-([`input/condo_strata_data.parquet`](./input/condo_strata_data.parquet))
-using R and the `read_parquet()` function from the `arrow` library.
+- Other sales in the building. This is captured by a rolling average of
+ sales in the building over the past 5 years, excluding any sales of
+ the target condo unit.
**Q: How do I see the assessed value of other units in my building?**
diff --git a/docs/data-dict.csv b/docs/data-dict.csv
index fd8cbac0..f65e4110 100644
--- a/docs/data-dict.csv
+++ b/docs/data-dict.csv
@@ -9,8 +9,8 @@ Condominium Unit Bedrooms,char_bedrooms,Number of bedrooms in the building,Chara
Condominium Unit Half Baths,char_half_baths,Number of half baths,Characteristic,numeric,TRUE
Condominium Unit Full Baths,char_full_baths,Number of full bathrooms,Characteristic,numeric,TRUE
Condominium % Ownership,meta_tieback_proration_rate,Proration rate applied to the PIN,Meta,numeric,TRUE
-Condominium Building Strata 1,meta_strata_1,Condominium Building Strata - 10 Levels,Meta,character,TRUE
-Condominium Building Strata 2,meta_strata_2,Condominium Building Strata - 100 Levels,Meta,character,TRUE
+Building Rolling Average Sale Price,meta_pin10_bldg_roll_mean,Time-weighted rolling average sale price of the building over the prior 5 years,Meta,numeric,TRUE
+Building Rolling Percent Units Sold,meta_pin10_bldg_roll_pct_sold,Percentage of the building's units sold over the prior 5 years,Meta,numeric,TRUE
Standard Deviation Distance From Parcel Centroid to Vertices (Feet),shp_parcel_centroid_dist_ft_sd,Standard deviation of the distance from each major parcel vertex to the parcel centroid,Parcel Shape,numeric,TRUE
Standard Deviation Parcel Edge Length (Feet),shp_parcel_edge_len_ft_sd,Standard deviation of the edge length between parcel vertices,Parcel Shape,numeric,TRUE
Standard Deviation Parcel Interior Angle (Degrees),shp_parcel_interior_angle_sd,Standard deviation of the interior angles of the parcel polygon,Parcel Shape,numeric,TRUE
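
The README above suggests joining input data to this dictionary to keep only modeled features. A hedged sketch, assuming the CSV's header includes a `variable_name` column as the rows above imply:

```r
library(dplyr)

dict <- readr::read_csv("docs/data-dict.csv")

arrow::read_parquet("input/training_data.parquet") %>%
  select(any_of(dict$variable_name)) # keep only features in the dictionary
```
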
diff --git a/dvc.lock b/dvc.lock
index 71c5fdda..3d8d5742 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -5,8 +5,8 @@ stages:
deps:
- path: pipeline/00-ingest.R
hash: md5
- md5: f758cc2d2c8dbe928806ffb0a46ab821
- size: 24134
+ md5: 079802067e9c909b1081f4791631a878
+ size: 23152
params:
params.yaml:
assessment:
@@ -19,52 +19,43 @@ stages:
input:
min_sale_year: '2016'
max_sale_year: '2024'
- strata:
- seed: 123
- group_var:
- - meta_township_code
- type: ntile
- k_1: 10
- k_2: 100
- weight_min: 0.3
+ n_years_prior: 5
+ building:
+ weight_min: 0.1
weight_max: 1.0
outs:
- path: input/assessment_data.parquet
hash: md5
- md5: b1462cc55efa7d8beb5ec2af9a649a9b
- size: 76103136
+ md5: abc97105413c6bce634894142d19cf28
+ size: 83170498
- path: input/char_data.parquet
hash: md5
- md5: 09a842b0910fa84c9fa7834593ee488c
- size: 149301395
- - path: input/condo_strata_data.parquet
- hash: md5
- md5: ded3ecde590af57e6b98a8935fae0215
- size: 40493
+ md5: af64cede592de02ef60b6f31d5bf47c1
+ size: 155430472
- path: input/land_nbhd_rate_data.parquet
hash: md5
md5: 5fe80edfabdfac91efe888a25ee4051c
size: 6019
- path: input/training_data.parquet
hash: md5
- md5: ef87ceb9be93d8ae85118991ab5269f2
- size: 76713007
+ md5: 60918e8a0dbacadc1d375a168c5183d1
+ size: 78826111
train:
cmd: Rscript pipeline/01-train.R
deps:
- path: input/training_data.parquet
hash: md5
- md5: ef87ceb9be93d8ae85118991ab5269f2
- size: 76713007
+ md5: c65f1c3b32929dbea29e2def92206a47
+ size: 78194210
- path: pipeline/01-train.R
hash: md5
- md5: 3cdf7f4f1dc9eb8056b7a133685d7d74
- size: 17278
+ md5: efcbae591ffc1e50041de19410a99013
+ size: 17242
params:
params.yaml:
cv:
split_prop: 0.9
- num_folds: 10
+ num_folds: 7
fold_overlap: 9
initial_set: 20
max_iterations: 50
@@ -74,7 +65,7 @@ stages:
model.engine: lightgbm
model.hyperparameter:
default:
- num_iterations: 2275
+ num_iterations: 100
learning_rate: 0.011
max_bin: 225
num_leaves: 200
@@ -88,7 +79,7 @@ stages:
cat_l2: 0.017
lambda_l1: 0.697
lambda_l2: 0.002
- neighbors: 15
+ trees: 25
range:
num_iterations:
- 100
@@ -132,9 +123,9 @@ stages:
lambda_l2:
- -3
- 2
- neighbors:
+ trees:
- 5
- - 40
+ - 50
model.objective: rmse
model.parameter:
validation_prop: 0.1
@@ -225,8 +216,7 @@ stages:
- shp_parcel_mrr_area_ratio
- shp_parcel_mrr_side_ratio
- shp_parcel_num_vertices
- - meta_strata_1
- - meta_strata_2
+ - meta_pin10_bldg_roll_mean
categorical:
- meta_township_code
- meta_nbhd_code
@@ -235,16 +225,19 @@ stages:
- loc_school_elementary_district_geoid
- loc_school_secondary_district_geoid
- time_sale_quarter_of_year
- - meta_strata_1
- - meta_strata_2
- knn:
- - meta_strata_1
- - meta_strata_2
- knn_imp:
+ imp:
+ - meta_pin10_bldg_roll_mean
+ imp_vars:
- loc_latitude
- loc_longitude
- char_building_units
- char_yrblt
+ - meta_township_code
+ - char_land_sf
+ - char_building_non_units
+ - char_bldg_is_mixed_use
+ - char_building_sf
id:
- meta_year
- meta_pin
@@ -271,7 +264,6 @@ stages:
- meta_nbhd_code
- loc_tax_municipality_name
- loc_ward_num
- - loc_census_puma_geoid
- loc_census_tract_geoid
- loc_school_elementary_district_geoid
- loc_school_secondary_district_geoid
@@ -280,12 +272,12 @@ stages:
outs:
- path: output/intermediate/timing/model_timing_train.parquet
hash: md5
- md5: 2758f68917d737236484cd58de2f754d
- size: 2494
+ md5: 4edd3123a15e6436ac7e96e1fcde961e
+ size: 2489
- path: output/parameter_final/model_parameter_final.parquet
hash: md5
- md5: e597bcc058d0663c402dec775a849ef7
- size: 7224
+ md5: 67819be6c5cf19c9e1f87aca8717132e
+ size: 6638
- path: output/parameter_range/model_parameter_range.parquet
hash: md5
md5: a47965c8cbafb84368f2a21a047bc7f2
@@ -296,47 +288,43 @@ stages:
size: 501
- path: output/test_card/model_test_card.parquet
hash: md5
- md5: 23f29c2b932cf552d56e7079090417f1
- size: 1374409
+ md5: 48fff8bb08c9c3cba220fc29bafedc91
+ size: 1081005
- path: output/workflow/fit/model_workflow_fit.zip
hash: md5
- md5: 76bf5ea01d51bccdaa691159fa23fbf1
- size: 14550582
+ md5: 483c97374e121c302d60d970034b1d9c
+ size: 710110
- path: output/workflow/recipe/model_workflow_recipe.rds
hash: md5
- md5: 488787c224f3f95dd0e533e7a03faa7d
- size: 4259303
+ md5: 338e3eb58c5074c43d471814476c432d
+ size: 1128500306
assess:
cmd: Rscript pipeline/02-assess.R
deps:
- path: input/assessment_data.parquet
hash: md5
- md5: b1462cc55efa7d8beb5ec2af9a649a9b
- size: 76103136
- - path: input/condo_strata_data.parquet
- hash: md5
- md5: ded3ecde590af57e6b98a8935fae0215
- size: 40493
+ md5: 02f1e9a9466060f6c9b2ab388fca0de6
+ size: 82056354
- path: input/land_nbhd_rate_data.parquet
hash: md5
md5: 5fe80edfabdfac91efe888a25ee4051c
size: 6019
- path: input/training_data.parquet
hash: md5
- md5: ef87ceb9be93d8ae85118991ab5269f2
- size: 76713007
+ md5: c65f1c3b32929dbea29e2def92206a47
+ size: 78194210
- path: output/workflow/fit/model_workflow_fit.zip
hash: md5
- md5: 76bf5ea01d51bccdaa691159fa23fbf1
- size: 14550582
+ md5: 483c97374e121c302d60d970034b1d9c
+ size: 710110
- path: output/workflow/recipe/model_workflow_recipe.rds
hash: md5
- md5: 488787c224f3f95dd0e533e7a03faa7d
- size: 4259303
+ md5: 338e3eb58c5074c43d471814476c432d
+ size: 1128500306
- path: pipeline/02-assess.R
hash: md5
- md5: 82b43cd8084454f1712d6fc859a93e2e
- size: 18054
+ md5: 3d8f569a964928af134b032b8767dd05
+ size: 16233
params:
params.yaml:
assessment:
@@ -428,8 +416,7 @@ stages:
- shp_parcel_mrr_area_ratio
- shp_parcel_mrr_side_ratio
- shp_parcel_num_vertices
- - meta_strata_1
- - meta_strata_2
+ - meta_pin10_bldg_roll_mean
pv:
land_pct_of_total_cap: 0.5
round_break:
@@ -457,7 +444,6 @@ stages:
- meta_nbhd_code
- loc_tax_municipality_name
- loc_ward_num
- - loc_census_puma_geoid
- loc_census_tract_geoid
- loc_school_elementary_district_geoid
- loc_school_secondary_district_geoid
@@ -465,31 +451,31 @@ stages:
outs:
- path: output/assessment_card/model_assessment_card.parquet
hash: md5
- md5: 953e53a2ddebc8bc018f1f0b4b8cd7ef
- size: 45892575
+ md5: 09a3ef3e8b3bef60872b80d8d34dd251
+ size: 43729009
- path: output/assessment_pin/model_assessment_pin.parquet
hash: md5
- md5: 2b324a2be2937c0fb781991e904419f3
- size: 38206397
+ md5: c313bd4695c6b17c83a777d308132c3d
+ size: 39517679
- path: output/intermediate/timing/model_timing_assess.parquet
hash: md5
- md5: 19a8f9a219a1383d47d4d7c33a35da50
- size: 2499
+ md5: a676482115bebdf36a57ddc8251a6ce7
+ size: 2494
evaluate:
cmd: Rscript pipeline/03-evaluate.R
deps:
- path: output/assessment_pin/model_assessment_pin.parquet
hash: md5
- md5: 2b324a2be2937c0fb781991e904419f3
- size: 38206397
+ md5: c313bd4695c6b17c83a777d308132c3d
+ size: 39517679
- path: output/test_card/model_test_card.parquet
hash: md5
- md5: 23f29c2b932cf552d56e7079090417f1
- size: 1374409
+ md5: 48fff8bb08c9c3cba220fc29bafedc91
+ size: 1081005
- path: pipeline/03-evaluate.R
hash: md5
- md5: ff504eb22892ae0908bbaaf4e76da4f4
- size: 17443
+ md5: a1e765acbb7531bdfc17e0cc60508914
+ size: 17400
params:
params.yaml:
assessment:
@@ -516,7 +502,6 @@ stages:
- meta_nbhd_code
- loc_tax_municipality_name
- loc_ward_num
- - loc_census_puma_geoid
- loc_census_tract_geoid
- loc_school_elementary_district_geoid
- loc_school_secondary_district_geoid
@@ -524,39 +509,39 @@ stages:
outs:
- path: output/intermediate/timing/model_timing_evaluate.parquet
hash: md5
- md5: 058060bcb9e959e97bd68fcf4947e4b8
+ md5: 95402d164ddc790cd557ed128e41ffc8
size: 2509
- path: output/performance/model_performance_assessment.parquet
hash: md5
- md5: f4b0f6eeaf748c419023a87672367bc7
- size: 285172
+ md5: 82cc9036c85e56e7e0dd2ecc6b74d40d
+ size: 258446
- path: output/performance/model_performance_test.parquet
hash: md5
- md5: b0527619653a981a624ce9d23f32497b
- size: 1058246
+ md5: e1fe19b18ed38846fcb18409be5b10cf
+ size: 1000121
- path: output/performance_quantile/model_performance_quantile_assessment.parquet
hash: md5
- md5: a007019deda1689fdf6f878b43eb93f3
- size: 222160
+ md5: 77f31f73c26328a4b90f0064469f5a2b
+ size: 198599
- path: output/performance_quantile/model_performance_quantile_test.parquet
hash: md5
- md5: 9df38602a248579e42dceca499983320
- size: 1045767
+ md5: e643ecb20e5b340eb1449c2d1ae4024d
+ size: 954434
interpret:
cmd: Rscript pipeline/04-interpret.R
deps:
- path: input/assessment_data.parquet
hash: md5
- md5: b1462cc55efa7d8beb5ec2af9a649a9b
- size: 76103136
+ md5: 02f1e9a9466060f6c9b2ab388fca0de6
+ size: 82056354
- path: output/workflow/fit/model_workflow_fit.zip
hash: md5
- md5: 76bf5ea01d51bccdaa691159fa23fbf1
- size: 14550582
+ md5: 483c97374e121c302d60d970034b1d9c
+ size: 710110
- path: output/workflow/recipe/model_workflow_recipe.rds
hash: md5
- md5: 488787c224f3f95dd0e533e7a03faa7d
- size: 4259303
+ md5: 338e3eb58c5074c43d471814476c432d
+ size: 1128500306
- path: pipeline/04-interpret.R
hash: md5
md5: 51795fcf45dabc142f57c7b6e524b74b
@@ -645,18 +630,17 @@ stages:
- shp_parcel_mrr_area_ratio
- shp_parcel_mrr_side_ratio
- shp_parcel_num_vertices
- - meta_strata_1
- - meta_strata_2
+ - meta_pin10_bldg_roll_mean
toggle.shap_enable: false
outs:
- path: output/feature_importance/model_feature_importance.parquet
hash: md5
- md5: 69d08c67cc718ca07b38fa9f6f5359ca
- size: 8029
+ md5: 324ed5e9fdb575d00fba8172c93b1ad5
+ size: 7819
- path: output/intermediate/timing/model_timing_interpret.parquet
hash: md5
- md5: b8fbb79269b51ccb868732702d3d8468
- size: 2519
+ md5: c3fcd4cb704324621749fa73a3a7cad5
+ size: 2524
- path: output/shap/model_shap.parquet
hash: md5
md5: a47965c8cbafb84368f2a21a047bc7f2
@@ -666,29 +650,29 @@ stages:
deps:
- path: output/intermediate/timing/model_timing_assess.parquet
hash: md5
- md5: 19a8f9a219a1383d47d4d7c33a35da50
- size: 2499
+ md5: a676482115bebdf36a57ddc8251a6ce7
+ size: 2494
- path: output/intermediate/timing/model_timing_evaluate.parquet
hash: md5
- md5: 058060bcb9e959e97bd68fcf4947e4b8
+ md5: 95402d164ddc790cd557ed128e41ffc8
size: 2509
- path: output/intermediate/timing/model_timing_interpret.parquet
hash: md5
- md5: b8fbb79269b51ccb868732702d3d8468
- size: 2519
+ md5: c3fcd4cb704324621749fa73a3a7cad5
+ size: 2524
- path: output/intermediate/timing/model_timing_train.parquet
hash: md5
- md5: 2758f68917d737236484cd58de2f754d
- size: 2494
+ md5: 4edd3123a15e6436ac7e96e1fcde961e
+ size: 2489
- path: pipeline/05-finalize.R
hash: md5
- md5: df815760b41cedc8e41132262d2977c7
- size: 8074
+ md5: c25e5ec1a936d176a68f858033b9b136
+ size: 7610
params:
params.yaml:
cv:
split_prop: 0.9
- num_folds: 10
+ num_folds: 7
fold_overlap: 9
initial_set: 20
max_iterations: 50
@@ -698,14 +682,9 @@ stages:
input:
min_sale_year: '2016'
max_sale_year: '2024'
- strata:
- seed: 123
- group_var:
- - meta_township_code
- type: ntile
- k_1: 10
- k_2: 100
- weight_min: 0.3
+ n_years_prior: 5
+ building:
+ weight_min: 0.1
weight_max: 1.0
model:
engine: lightgbm
@@ -797,8 +776,7 @@ stages:
- shp_parcel_mrr_area_ratio
- shp_parcel_mrr_side_ratio
- shp_parcel_num_vertices
- - meta_strata_1
- - meta_strata_2
+ - meta_pin10_bldg_roll_mean
categorical:
- meta_township_code
- meta_nbhd_code
@@ -807,16 +785,19 @@ stages:
- loc_school_elementary_district_geoid
- loc_school_secondary_district_geoid
- time_sale_quarter_of_year
- - meta_strata_1
- - meta_strata_2
- knn:
- - meta_strata_1
- - meta_strata_2
- knn_imp:
+ imp:
+ - meta_pin10_bldg_roll_mean
+ imp_vars:
- loc_latitude
- loc_longitude
- char_building_units
- char_yrblt
+ - meta_township_code
+ - char_land_sf
+ - char_building_non_units
+ - char_bldg_is_mixed_use
+ - char_building_sf
id:
- meta_year
- meta_pin
@@ -832,7 +813,7 @@ stages:
stop_iter: 50
hyperparameter:
default:
- num_iterations: 2275
+ num_iterations: 100
learning_rate: 0.011
max_bin: 225
num_leaves: 200
@@ -846,7 +827,7 @@ stages:
cat_l2: 0.017
lambda_l1: 0.697
lambda_l2: 0.002
- neighbors: 15
+ trees: 25
range:
num_iterations:
- 100
@@ -890,9 +871,9 @@ stages:
lambda_l2:
- -3
- 2
- neighbors:
+ trees:
- 5
- - 40
+ - 50
pv:
land_pct_of_total_cap: 0.5
round_break:
@@ -920,7 +901,6 @@ stages:
- meta_nbhd_code
- loc_tax_municipality_name
- loc_ward_num
- - loc_census_puma_geoid
- loc_census_tract_geoid
- loc_school_elementary_district_geoid
- loc_school_secondary_district_geoid
@@ -933,20 +913,20 @@ stages:
outs:
- path: output/intermediate/timing/model_timing_finalize.parquet
hash: md5
- md5: 0a45237f2c6c482dfb330d7d25582904
- size: 2519
+ md5: a17c53c9916f005df891c41c030ff570
+ size: 2514
- path: output/metadata/model_metadata.parquet
hash: md5
- md5: 69f62b872b3565fc8a4465091cc44440
- size: 21850
+ md5: 1409ea6b029801c9aefd87dd5cbb909f
+ size: 19208
- path: output/timing/model_timing.parquet
hash: md5
- md5: 647f1051c7cb6558b3513b9f197486db
- size: 5143
+ md5: e4a9a4d8bae3247af74dc5f6cace68e0
+ size: 5148
- path: reports/performance/performance.html
hash: md5
- md5: 28f307ad4945be82b850bfdfc7c51ac2
- size: 45960173
+ md5: 1aad56f5188476b2d21f87847e805258
+ size: 20874024
export:
cmd: Rscript pipeline/07-export.R
params:
@@ -984,24 +964,24 @@ stages:
deps:
- path: output/assessment_card/model_assessment_card.parquet
hash: md5
- md5: 953e53a2ddebc8bc018f1f0b4b8cd7ef
- size: 45892575
+ md5: 09a3ef3e8b3bef60872b80d8d34dd251
+ size: 43729009
- path: output/assessment_pin/model_assessment_pin.parquet
hash: md5
- md5: 2b324a2be2937c0fb781991e904419f3
- size: 38206397
+ md5: c313bd4695c6b17c83a777d308132c3d
+ size: 39517679
- path: output/feature_importance/model_feature_importance.parquet
hash: md5
- md5: 69d08c67cc718ca07b38fa9f6f5359ca
- size: 8029
+ md5: 324ed5e9fdb575d00fba8172c93b1ad5
+ size: 7819
- path: output/metadata/model_metadata.parquet
hash: md5
- md5: 69f62b872b3565fc8a4465091cc44440
- size: 21850
+ md5: 1409ea6b029801c9aefd87dd5cbb909f
+ size: 19208
- path: output/parameter_final/model_parameter_final.parquet
hash: md5
- md5: e597bcc058d0663c402dec775a849ef7
- size: 7224
+ md5: 67819be6c5cf19c9e1f87aca8717132e
+ size: 6638
- path: output/parameter_range/model_parameter_range.parquet
hash: md5
md5: a47965c8cbafb84368f2a21a047bc7f2
@@ -1012,45 +992,45 @@ stages:
size: 501
- path: output/performance/model_performance_assessment.parquet
hash: md5
- md5: f4b0f6eeaf748c419023a87672367bc7
- size: 285172
+ md5: 82cc9036c85e56e7e0dd2ecc6b74d40d
+ size: 258446
- path: output/performance/model_performance_test.parquet
hash: md5
- md5: b0527619653a981a624ce9d23f32497b
- size: 1058246
+ md5: e1fe19b18ed38846fcb18409be5b10cf
+ size: 1000121
- path: output/performance_quantile/model_performance_quantile_assessment.parquet
hash: md5
- md5: a007019deda1689fdf6f878b43eb93f3
- size: 222160
+ md5: 77f31f73c26328a4b90f0064469f5a2b
+ size: 198599
- path: output/performance_quantile/model_performance_quantile_test.parquet
hash: md5
- md5: 9df38602a248579e42dceca499983320
- size: 1045767
+ md5: e643ecb20e5b340eb1449c2d1ae4024d
+ size: 954434
- path: output/shap/model_shap.parquet
hash: md5
md5: a47965c8cbafb84368f2a21a047bc7f2
size: 501
- path: output/test_card/model_test_card.parquet
hash: md5
- md5: 23f29c2b932cf552d56e7079090417f1
- size: 1374409
+ md5: 48fff8bb08c9c3cba220fc29bafedc91
+ size: 1081005
- path: output/timing/model_timing.parquet
hash: md5
- md5: 647f1051c7cb6558b3513b9f197486db
- size: 5143
+ md5: e4a9a4d8bae3247af74dc5f6cace68e0
+ size: 5148
- path: output/workflow/fit/model_workflow_fit.zip
hash: md5
- md5: 76bf5ea01d51bccdaa691159fa23fbf1
- size: 14550582
+ md5: 483c97374e121c302d60d970034b1d9c
+ size: 710110
- path: output/workflow/recipe/model_workflow_recipe.rds
hash: md5
- md5: 488787c224f3f95dd0e533e7a03faa7d
- size: 4259303
+ md5: 338e3eb58c5074c43d471814476c432d
+ size: 1128500306
- path: pipeline/06-upload.R
hash: md5
md5: 613632039c6744d3132a8760c1b51099
size: 10855
- path: reports/performance/performance.html
hash: md5
- md5: 28f307ad4945be82b850bfdfc7c51ac2
- size: 45960173
+ md5: 1aad56f5188476b2d21f87847e805258
+ size: 20874024
diff --git a/dvc.yaml b/dvc.yaml
index 8c6dfcf0..c8e6d9f5 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -2,7 +2,7 @@ stages:
ingest:
cmd: Rscript pipeline/00-ingest.R
desc: >
- Ingest training and assessment data from Athena + generate condo strata
+ Ingest training and assessment data from Athena + generate building means
deps:
- pipeline/00-ingest.R
params:
@@ -11,7 +11,6 @@ stages:
outs:
- input/assessment_data.parquet
- input/char_data.parquet
- - input/condo_strata_data.parquet
- input/land_nbhd_rate_data.parquet
- input/training_data.parquet
frozen: true
@@ -60,7 +59,6 @@ stages:
deps:
- pipeline/02-assess.R
- input/assessment_data.parquet
- - input/condo_strata_data.parquet
- input/land_nbhd_rate_data.parquet
- input/training_data.parquet
- output/workflow/fit/model_workflow_fit.zip
diff --git a/misc/desk_review_template.xlsx b/misc/desk_review_template.xlsx
index ec4d87f9..36fe048d 100644
Binary files a/misc/desk_review_template.xlsx and b/misc/desk_review_template.xlsx differ
diff --git a/params.yaml b/params.yaml
index be115708..deeafbd9 100644
--- a/params.yaml
+++ b/params.yaml
@@ -60,26 +60,14 @@ input:
min_sale_year: "2016"
max_sale_year: "2024"
- # Parameters used to construct condominium strata features
- strata:
- # Seed for k-means
- seed: 123
-
- # Grouping variables used in the construction of condo strata. Each group
- # will have its own independent strata of size K 1 and K 2 (see below)
- group_var:
- - "meta_township_code"
-
- # We can use either quantiles or k-means clustering to build condo strata
- type: "ntile"
-
- # Number of quantiles of centers to use when constructing strata
- k_1: 10
- k_2: 100
+ # Rolling time window size for building mean feature
+ n_years_prior: 5
+ # Parameters used to construct condominium building mean feature
+ building:
      # Max and min weights for the sale-date-weighted mean sale price, i.e.
      # the least recent sale counts one-tenth as much as the most recent one
- weight_min: 0.3
+ weight_min: 0.1
weight_max: 1.0
@@ -94,7 +82,7 @@ cv:
# Number of folds to use for cross-validation. For v-fold CV, the data will be
# randomly split. For rolling-origin, the data will be split into V chunks by
# time, with each chunk/period calculated automatically
- num_folds: 10
+ num_folds: 7
# The number of months time-based folds should overlap each other. Only
# applicable to rolling-origin CV. See https://www.tmwr.org/resampling#rolling
@@ -224,8 +212,8 @@ model:
- "shp_parcel_mrr_area_ratio"
- "shp_parcel_mrr_side_ratio"
- "shp_parcel_num_vertices"
- - "meta_strata_1"
- - "meta_strata_2"
+ - "meta_pin10_bldg_roll_mean"
+ - "meta_pin10_bldg_roll_pct_sold"
# List of predictors included in predictor.all which are categoricals. It is
# CRITICAL that any categorical variables are included in this list, else
@@ -238,21 +226,23 @@ model:
- "loc_school_elementary_district_geoid"
- "loc_school_secondary_district_geoid"
- "time_sale_quarter_of_year"
- - "meta_strata_1"
- - "meta_strata_2"
- # List of variables used with the recipe step step_impute_knn(). The knn
- # variables will have missing values imputed, while the knn_imp variables
+ # List of variables used with the recipe step step_impute_bag(). The imp
+ # variables will have missing values imputed, while the imp_vars variables
# are used to do the imputing
- knn:
- - "meta_strata_1"
- - "meta_strata_2"
+ imp:
+ - "meta_pin10_bldg_roll_mean"
- knn_imp:
+ imp_vars:
- "loc_latitude"
- "loc_longitude"
- "char_building_units"
- "char_yrblt"
+ - "meta_township_code"
+ - "char_land_sf"
+ - "char_building_non_units"
+ - "char_bldg_is_mixed_use"
+ - "char_building_sf"
# List of identifiers for each observation, can be ignored
id:
@@ -313,7 +303,7 @@ model:
cat_l2: 0.017
lambda_l1: 0.697
lambda_l2: 0.002
- neighbors: 15
+ imp_trees: 25
# Range of possible hyperparameter values for tuning to explore
range:
@@ -336,7 +326,7 @@ model:
cat_l2: [-3, 2] # 10 ^ X
lambda_l1: [-3, 2] # 10 ^ X
lambda_l2: [-3, 2] # 10 ^ X
- neighbors: [5, 40]
+ imp_trees: [5, 50]
# Post-Valuation ---------------------------------------------------------------
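
The `weight_min`/`weight_max` parameters above feed the logistic sale-date weighting added in `pipeline/00-ingest.R` later in this diff. A standalone sketch of the resulting curve (the monthly `dates` grid is illustrative):

```r
library(lubridate)

w_min <- 0.1 # params$input$building$weight_min
w_max <- 1.0 # params$input$building$weight_max

dates <- seq(as.Date("2016-01-01"), as.Date("2024-12-31"), by = "month")

# Same form as the ingest code: a logistic curve centered 3 years before the
# most recent date, rescaled to lie between w_min and 1
raw <- w_max / (w_max + exp(-0.002 * as.integer(dates - (max(dates) - years(3)))))
wt <- raw * (1 - w_min) + w_min
wt <- wt / max(wt) # scale so the most recent date has weight 1

plot(dates, wt, type = "l", xlab = "sale date", ylab = "sale weight")
```
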
diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R
index d271ab4c..a7cd3607 100644
--- a/pipeline/00-ingest.R
+++ b/pipeline/00-ingest.R
@@ -14,11 +14,16 @@ purrr::walk(list.files("R/", "\\.R$", full.names = TRUE), source)
# Load additional dev R libraries (see README#managing-r-dependencies)
suppressPackageStartupMessages({
+ library(data.table)
library(DBI)
library(igraph)
library(noctua)
})
+# Load data.table without breaking everything else
+conflict_prefer_all("dplyr", "data.table", quiet = TRUE)
+conflict_prefer_all("lubridate", "data.table", quiet = TRUE)
+
# Adds arrow support to speed up ingest process
noctua_options(unload = TRUE)
@@ -56,67 +61,6 @@ recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
)
}
-
-# Create quantiles with unbounded top and bottom bins. Used to bin
-# condo building sales prices into strata
-val_create_ntiles <- function(x, probs, na.rm = TRUE) {
- stopifnot(
- is.numeric(x),
- is.numeric(probs),
- is.logical(na.rm)
- )
-
- output <- list(c(
- -Inf,
- unique(stats::quantile(x, probs = probs, na.rm = na.rm, names = FALSE)),
- Inf
- ))
- output <- ifelse(all(is.na(x)), list(NA_real_), output)
-
- return(output)
-}
-
-
-# Given a sale price x, assign the sale price to a pre-made strata bin
-val_assign_ntile <- function(x, ntiles) {
- output <- as.character(ifelse(
- !is.na(x),
- purrr::pmap(
- list(x, ntiles),
- ~ cut(.x, breaks = .y, labels = FALSE)
- ),
- NA_character_
- ))
-
- return(output)
-}
-
-
-# Given a set of k-means centers and a sale price, find the nearest center
-val_assign_center <- function(x, centers) {
- output <- as.character(ifelse(
- !is.na(x) & !is.na(centers),
- purrr::pmap(
- list(x, centers),
- ~ which.min(mapply(function(z, y) sum(z - y)^2, .x, .y))
- ),
- NA_character_
- ))
-
- return(output)
-}
-
-
-# Rescaling function to normalize a continuous range to be between a min and max
-rescale <- function(x, min = 0, max = 1) {
- output <- (x - min(x, na.rm = TRUE)) /
- (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) *
- (max - min) + min
-
- return(output)
-}
-
-
# Mini function to deal with arrays
# Some Athena columns are stored as arrays but are converted to string on
# ingest. In such cases, we either keep the contents of the cell (if 1 unit),
@@ -136,6 +80,7 @@ process_array_column <- function(x) {
+
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 3. Pull Data -----------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -164,35 +109,19 @@ training_data <- dbGetQuery(
INNER JOIN default.vw_pin_sale sale
ON sale.pin = condo.meta_pin
AND sale.year = condo.year
- WHERE condo.year
- BETWEEN '{params$input$min_sale_year}'
- AND '{params$input$max_sale_year}'
+ WHERE CAST(condo.year AS int)
+ BETWEEN CAST({params$input$min_sale_year} AS int) -
+ {params$input$n_years_prior}
+ AND CAST({params$input$max_sale_year} AS int)
AND sale.deed_type IN ('01', '02', '05')
AND NOT sale.sale_filter_same_sale_within_365
AND NOT sale.sale_filter_less_than_10k
AND NOT sale.sale_filter_deed_type
- AND Year(sale.sale_date) >= {params$input$min_sale_year}
AND sale.num_parcels_sale <= 2
")
)
tictoc::toc()
-# Raw sales document number data used to identify some sales accidentally
-# excluded from the original training runs. See
-# https://github.com/ccao-data/data-architecture/pull/334 for more info
-tictoc::tic("Sales data pulled")
-sales_data <- dbGetQuery(
- conn = AWS_ATHENA_CONN_NOCTUA, glue("
- SELECT DISTINCT
- substr(saledt, 1, 4) AS year,
- instruno AS doc_no_old,
- NULLIF(REPLACE(instruno, 'D', ''), '') AS doc_no_new
- FROM iasworld.sales
- WHERE substr(saledt, 1, 4) >= '{params$input$min_sale_year}'
- ")
-)
-tictoc::toc()
-
# Pull all condo PIN input data for the assessment and prior year. We will only
# use the assessment year to run the model, but the prior year can be used for
# report generation
@@ -338,9 +267,7 @@ training_data_clean <- training_data_fil %>%
) %>%
# Only exclude explicit outliers from training. Sales with missing validation
# outcomes will be considered non-outliers
- mutate(
- sv_is_outlier = replace_na(sv_is_outlier, FALSE)
- ) %>%
+ mutate(sv_is_outlier = replace_na(sv_is_outlier, FALSE)) %>%
# Some Athena columns are stored as arrays but are converted to string on
# ingest. In such cases, take the first element and clean the string
# Apply the helper function to process array columns
@@ -382,15 +309,8 @@ training_data_clean <- training_data_fil %>%
.after = meta_2yr_pri_board_tot
) %>%
relocate(starts_with("ind_"), .after = starts_with("meta_")) %>%
- relocate(starts_with("char_"), .after = starts_with("ind_")) %>%
- filter(
- between(
- meta_sale_date,
- make_date(params$input$min_sale_year, 1, 1),
- make_date(params$input$max_sale_year, 12, 31)
- )
- ) %>%
- as_tibble()
+ relocate(starts_with("char_"), .after = starts_with("ind_"))
+
## 4.2. Assessment Data --------------------------------------------------------
@@ -459,162 +379,235 @@ land_nbhd_rate_data %>%
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-# 5. Condo Strata --------------------------------------------------------------
+# 5. Building Means ------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-message("Calculating condo strata")
-
-## 5.1. Calculate Strata -------------------------------------------------------
+message("Calculating building rolling means")
# Condominiums' unit characteristics (such as square footage, # of bedrooms,
-# etc.) are not tracked by the CCAO. As such, e need to rely on other
+# etc.) are not tracked well by the CCAO. As such, we need to rely on other
# information to determine the value of unsold condos. Fortunately, condos are
# more homogeneous than single-family homes and are pre-grouped into like units
# (buildings)
-# As such, we can use the historic sale price of other sold units in the same
-# building to determine an unsold condo's value. To do so, we construct condo
-# "strata", which are bins of the 5-year average sale price of the building.
-# Units and buildings in the same strata should ultimately have very similar
-# assessed values
-
-# The first step here is to get the average sale price of condos in each
-# building. The first 10 digits of a given PIN are the building (the last 4 are
-# the unit)
-
-# Get the the recency-weighted mean log10 sale price of each building
-bldg_5yr_sales_avg <- training_data_clean %>%
- filter(
- meta_sale_date > make_date(as.numeric(params$input$max_sale_year) - 4),
- !sv_is_outlier
- ) %>%
+# We can use the historic sale price of other sold units in the same
+# building to determine an unsold condo's value. To do so, we construct a
+# time-weighted, leave-one-out, rolling mean of sale prices for each building.
+# In other words, we get the average of sales in the building in the past
+# N years, excluding the sale we're trying to predict.
+
+## 5.1. Construct Rolling Means ------------------------------------------------
+
+# This offset is the size of the rolling window
+offset <- years(params$input$n_years_prior)
+
+# Mush together the training and assessment data and sort by PIN and sale date.
+# Note that "sales" from the assessment data (i.e. the constructed sale on the
+# lien date) will always be the last sale in the building, since they occur
+# after all the training data sales. We exploit this property to also calculate
+# the rolling means for the assessment data by simply taking the N year rolling
+# average of sales prior to the lien date
+bldg_rolling_means_dt <- training_data_clean %>%
+ mutate(data_source = "training") %>%
select(
- meta_pin10, meta_sale_price, meta_sale_date,
- all_of(params$input$strata$group_var)
- ) %>%
- mutate(
- meta_sale_date_norm = rescale(
- as.numeric(meta_sale_date),
- params$input$strata$weight_min,
- params$input$strata$weight_max
- )
+ meta_pin10, meta_pin, meta_tieback_proration_rate,
+ meta_sale_date, meta_sale_price, meta_sale_document_num, sv_is_outlier,
+ meta_modeling_group, data_source
) %>%
- group_by(meta_pin10, across(any_of(params$input$strata$group_var))) %>%
- summarise(
- mean_log10_sale_price = weighted.mean(
- log10(meta_sale_price),
- meta_sale_date_norm,
- na.rm = TRUE
- ),
- meta_pin10_5yr_num_sale = n()
+ bind_rows(
+ assessment_data_clean %>%
+ mutate(data_source = "assessment") %>%
+ select(
+ meta_pin10, meta_pin, meta_tieback_proration_rate,
+ meta_sale_date,
+ meta_modeling_group, data_source
+ )
) %>%
- ungroup()
-
-# Use either k-means clustering or simple quantiles to construct a condominium
-# building strata model. This model can be used to assign strata to buildings
-if (params$input$strata$type == "kmeans") {
- # Set seed for k-means reproducibility
- set.seed(params$input$strata$seed)
-
- # For k-means, construct strata as a 1-dimensional cluster of the average
- # sale price of the building
- bldg_strata_model <- bldg_5yr_sales_avg %>%
- group_by(across(all_of(params$input$strata$group_var))) %>%
- summarize(
- meta_strata_model_1 = list(kmeans(
- mean_log10_sale_price,
- centers = params$input$strata$k_1,
- iter.max = 200,
- nstart = 50,
- algorithm = "MacQueen"
- )$centers),
- meta_strata_model_2 = list(kmeans(
- mean_log10_sale_price,
- centers = params$input$strata$k_2,
- iter.max = 200,
- nstart = 25,
- algorithm = "MacQueen"
- )$centers)
- ) %>%
- ungroup()
-} else if (params$input$strata$type == "ntile") {
- # Construct strata as quantile bins of the average sale price of the building
- bldg_strata_model <- bldg_5yr_sales_avg %>%
- group_by(across(all_of(params$input$strata$group_var))) %>%
- summarize(
- meta_strata_model_1 = val_create_ntiles(
- x = mean_log10_sale_price,
- probs = seq(0, 1, 1 / params$input$strata$k_1)[
- c(-1, -(params$input$strata$k_1 + 1))
- ]
- ),
- meta_strata_model_2 = val_create_ntiles(
- x = mean_log10_sale_price,
- probs = seq(0, 1, 1 / params$input$strata$k_2)[
- c(-1, -(params$input$strata$k_2 + 1))
- ]
+ as.data.table() %>%
+ setkey(meta_pin10, meta_sale_date)
+
+# Construct the time-weighted, leave-one-out rolling mean of building sale
+# prices. We use data.table here since it's MUCH faster than dplyr for this
+# task. The code here is a bit dense. View the output dataframe for debugging
+# (it helps a lot).
+bldg_rolling_means_dt[
+ ,
+  # Create initial time weights for sales across the whole training date
+  # range. This is a logistic curve centered 3 years before the lien date
+  # and bounded between the min and max weights. The parameters here were
+  # discovered via a rough grid search
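+  # As a rough sketch of the curve's shape (assuming hypothetical bounds of
+  # weight_min = 0.1 and weight_max = 1): a sale on the lien date gets a raw
+  # weight of ~0.91, a sale 3 years prior ~0.55, and a sale 6 years prior
+  # ~0.19, before the rescaling step below bumps the max weight to 1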
+ sale_wt := params$input$building$weight_max / (
+ params$input$building$weight_max +
+ exp(
+ -(0.002 * as.integer(meta_sale_date - (max(meta_sale_date) - years(3))))
)
- ) %>%
- ungroup()
-}
-
-# Save strata model to file in case we need to use it later
-bldg_strata_model %>%
- write_parquet(paths$input$condo_strata$local)
-
-
-## 5.2. Assign Strata ----------------------------------------------------------
-
-# Use strata models to create strata of building-level, previous-5-year sale
-# prices. These strata are used as categorical variables in the model
-bldg_strata <- bldg_5yr_sales_avg %>%
- left_join(bldg_strata_model, by = params$input$strata$group_var) %>%
+ ) * (1 - params$input$building$weight_min) + params$input$building$weight_min
+][
+ ,
+ # Scale weights so the max weight is 1
+ sale_wt := sale_wt / max(sale_wt)
+][
+ # Calculate the adaptive rolling window size using some tricky interval logic.
+ # To demo what's actually going on here with `findInterval()`:
+ #
+ # Given the following sales Y:
+ # 2015-12-01 2018-01-01 2022-06-15 2025-01-01
+ #
+ # And their 5-year offset X:
+ # 2010-12-01 2013-01-01 2017-06-15 2020-01-01
+ #
+  # For each element of X, find the _index position_ of the interval between
+  # breaks in Y that contains that element, e.g. for the first element of X:
+ # 2015-12-01 2018-01-01 2022-06-15 2025-01-01
+ # └── 2010-12-01 is outside any of the cuts, so the index is 0
+ #
+ # Or for the fourth element of X:
+ # 2015-12-01 2018-01-01 2022-06-15 2025-01-01
+ # └── 2020-01-01 is between these two, so the index is 2
+ #
+  # Using this technique, findInterval() counts how many of the building's
+  # sales occurred on or before each sale's offset date (sale date - N years)
+  # i.e. how many sales are too old. We then subtract that count from the
+  # window size, effectively shrinking the front of the window and excluding
+  # sales outside the N year window.
+ #
+ # In the case of the 4th element of Y, we end up with a window size of
+ # 4 - 2 == 2. This means that our rolling mean will include the last two sales
+ # in Y, the sales at positions 3 and 4. Since position 4 is the target sale,
+ # we will avoid data leakage later on in the pipeline by subtracting the
+ # target sale price from the mean (or subtracting 0 in the case of the
+ # assessment set, which does not have a sale price).
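+  #
+  # The same demo in runnable form (toy dates, not pipeline data):
+  #   y <- as.Date(c("2015-12-01", "2018-01-01", "2022-06-15", "2025-01-01"))
+  #   findInterval(y %m-% years(5), y)                 # 0 0 1 2
+  #   seq_along(y) - findInterval(y %m-% years(5), y)  # window sizes 1 2 2 2
+  #
+  # The i expression below keeps non-outlier condo sales plus all assessment
+  # rows (& binds tighter than | in R)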
+ !sv_is_outlier & meta_modeling_group == "CONDO" | data_source == "assessment",
+ `:=`(
+ index_in_group = seq_len(.N),
+ shrink_win_by_n_positions = findInterval(
+ meta_sale_date %m-% offset, meta_sale_date
+ )
+ ),
+ by = .(meta_pin10)
+][
+ # This is the size of the rolling window relative to EACH sale i.e. for any
+ # given sale, how many index positions back do we need to go to get only sales
+ # from the past N years
+ !sv_is_outlier & meta_modeling_group == "CONDO" | data_source == "assessment",
+ window_size := index_in_group - shrink_win_by_n_positions,
+ by = .(meta_pin10)
+][
+ !sv_is_outlier & meta_modeling_group == "CONDO" | data_source == "assessment",
+ # Calculate the numerator and denominator of the weighted rolling mean, but
+ # EXCLUDE the current sale. This is the leave-one-out part. Note that we need
+ # to replace NA values with 0 in the denominator for cases where there's no
+ # current sale but we still want to create a mean of prior sales e.g. for
+ # the assessment data
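+  #
+  # Toy illustration (made-up numbers): for a building with sales of 100,
+  # 200, and 300, all with weight 1, the row for the 300 sale with
+  # window_size = 3 gets wtd_valsum = (100 + 200 + 300) - 300 = 300 and
+  # wtd_cnt = 3 - 1 = 2, a leave-one-out mean of 150. An assessment row has
+  # no sale price, so nothing is subtracted and it simply averages all prior
+  # sales in its window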
+ `:=`(
+ cnt = data.table::frollsum(
+ as.numeric(!is.na(meta_sale_price)),
+ n = window_size,
+ algo = "exact",
+ align = "right",
+ adaptive = TRUE,
+ na.rm = TRUE,
+ hasNA = TRUE
+ ) - as.numeric(!is.na(meta_sale_price)),
+ wtd_valsum = data.table::frollsum(
+ meta_sale_price * sale_wt,
+ n = window_size,
+ algo = "exact",
+ align = "right",
+ adaptive = TRUE,
+ na.rm = TRUE,
+ hasNA = TRUE
+ ) - replace(meta_sale_price, is.na(meta_sale_price), 0) * sale_wt,
+ wtd_cnt = data.table::frollsum(
+ as.numeric(!is.na(meta_sale_price)) * sale_wt,
+ n = window_size,
+ algo = "exact",
+ align = "right",
+ adaptive = TRUE,
+ na.rm = TRUE,
+ hasNA = TRUE
+ ) - as.numeric(!is.na(meta_sale_price)) * sale_wt
+ ),
+ by = .(meta_pin10)
+][, wtd_mean := wtd_valsum / wtd_cnt][
+ ,
+ `:=`(
+ wtd_mean =
+ fifelse(is.nan(wtd_mean) | is.infinite(wtd_mean), NA_real_, wtd_mean),
+ cnt = fifelse(is.nan(cnt) | is.infinite(cnt), NA_real_, cnt)
+ )
+]
+
+
+## 5.2. Re-attach to Original Data ---------------------------------------------
+
+# Extract the constructed building means from the dedicated dataframe and
+# re-attach them to their respective datasets. Note that some PINs will
+# not have a mean (no sales in the building or no sales in the window). These
+# missing values get imputed during the training stage
+training_data_clean <- training_data_clean %>%
+ left_join(
+ bldg_rolling_means_dt %>%
+ filter(data_source == "training") %>%
+ select(
+ meta_pin10, meta_sale_document_num,
+ meta_pin10_bldg_roll_mean = wtd_mean,
+ meta_pin10_bldg_roll_count = cnt
+ ),
+ by = c("meta_pin10", "meta_sale_document_num")
+ ) %>%
mutate(
- meta_strata_1 = switch(params$input$strata$type,
- kmeans = val_assign_center(mean_log10_sale_price, meta_strata_model_1),
- ntile = val_assign_ntile(mean_log10_sale_price, meta_strata_model_1)
- ),
- meta_strata_2 = switch(params$input$strata$type,
- kmeans = val_assign_center(mean_log10_sale_price, meta_strata_model_2),
- ntile = val_assign_ntile(mean_log10_sale_price, meta_strata_model_2)
+ # Also construct a "percentage of units sold in the building" feature
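+    # e.g. 12 in-window sales in a 100-unit building yields 0.12
+    # (illustrative numbers)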
+ meta_pin10_bldg_roll_pct_sold =
+ meta_pin10_bldg_roll_count / char_building_units,
+ meta_pin10_bldg_roll_pct_sold = ifelse(
+ is.na(meta_pin10_bldg_roll_pct_sold) |
+ is.nan(meta_pin10_bldg_roll_pct_sold) |
+ is.infinite(meta_pin10_bldg_roll_pct_sold),
+ NA_real_,
+ meta_pin10_bldg_roll_pct_sold
)
) %>%
- group_by(across(params$input$strata$group_var), meta_strata_1) %>%
- mutate(meta_strata_1_5yr_num_sale = sum(meta_pin10_5yr_num_sale)) %>%
- group_by(across(params$input$strata$group_var), meta_strata_2) %>%
- mutate(meta_strata_2_5yr_num_sale = sum(meta_pin10_5yr_num_sale)) %>%
- ungroup() %>%
- select(
- -c(mean_log10_sale_price, meta_strata_model_1, meta_strata_model_2),
- -all_of(params$input$strata$group_var)
- )
-
-# Attach the strata and sale counts for both assessment and training data
-training_data_w_strata <- training_data_clean %>%
- left_join(bldg_strata, by = "meta_pin10") %>%
- mutate(meta_pin10_5yr_num_sale = replace_na(meta_pin10_5yr_num_sale, 0)) %>%
- relocate(
- c(starts_with("meta_strata"), meta_pin10_5yr_num_sale),
- .before = starts_with("ind_")
+ filter(
+ between(
+ meta_sale_date,
+ make_date(params$input$min_sale_year, 1, 1),
+ make_date(params$input$max_sale_year, 12, 31)
+ )
) %>%
+ as_tibble() %>%
write_parquet(paths$input$training$local)
-assessment_data_w_strata <- assessment_data_clean %>%
- left_join(bldg_strata, by = "meta_pin10") %>%
- mutate(meta_pin10_5yr_num_sale = replace_na(meta_pin10_5yr_num_sale, 0)) %>%
- relocate(
- c(starts_with("meta_strata"), meta_pin10_5yr_num_sale),
- .before = starts_with("ind_")
+assessment_data_clean <- assessment_data_clean %>%
+ left_join(
+ bldg_rolling_means_dt %>%
+ filter(data_source == "assessment") %>%
+ select(
+ meta_pin,
+ meta_pin10_bldg_roll_mean = wtd_mean,
+ meta_pin10_bldg_roll_count = cnt
+ ),
+ by = c("meta_pin")
+ ) %>%
+ mutate(
+ meta_pin10_bldg_roll_pct_sold =
+ meta_pin10_bldg_roll_count / char_building_units,
+ meta_pin10_bldg_roll_pct_sold = ifelse(
+ is.na(meta_pin10_bldg_roll_pct_sold) |
+ is.nan(meta_pin10_bldg_roll_pct_sold) |
+ is.infinite(meta_pin10_bldg_roll_pct_sold),
+ NA_real_,
+ meta_pin10_bldg_roll_pct_sold
+ )
) %>%
+ as_tibble() %>%
write_parquet(paths$input$assessment$local)
-
-## 5.3. Missing Strata ---------------------------------------------------------
-
-# Condo buildings that don't have any recent sales will be missing strata.
-# We use KNN to assign strata for those buildings based on longitude, latitude,
-# year built, and number of livable building units.
-
-# This step is now performed via the Tidymodels recipes package. See R/recipes.R
+# Throw errors if any of the constructed mean features are negative
+if (any(training_data_clean$meta_pin10_bldg_roll_mean < 0, na.rm = TRUE)) {
+ stop("Negative building rolling mean detected in training data")
+} else if (any(assessment_data_clean$meta_pin10_bldg_roll_mean < 0, na.rm = TRUE)) { # nolint
+ stop("Negative building rolling mean detected in assessment data")
+}
# Reminder to upload to DVC store
message(
diff --git a/pipeline/01-train.R b/pipeline/01-train.R
index 214f24f6..c6ab5443 100644
--- a/pipeline/01-train.R
+++ b/pipeline/01-train.R
@@ -46,9 +46,10 @@ train_recipe <- model_main_recipe(
data = training_data_full,
pred_vars = params$model$predictor$all,
cat_vars = params$model$predictor$categorical,
- knn_vars = params$model$predictor$knn,
- knn_imp_vars = params$model$predictor$knn_imp,
- id_vars = params$model$predictor$id
+ imp = params$model$predictor$imp,
+ imp_vars = params$model$predictor$imp_vars,
+ id_vars = params$model$predictor$id,
+ seed = params$model$seed
)
@@ -66,9 +67,10 @@ lin_recipe <- model_lin_recipe(
mutate(meta_sale_price = log(meta_sale_price)),
pred_vars = params$model$predictor$all,
cat_vars = params$model$predictor$categorical,
- knn_vars = params$model$predictor$knn,
- knn_imp_vars = params$model$predictor$knn_imp,
- id_vars = params$model$predictor$id
+ imp = params$model$predictor$imp,
+ imp_vars = params$model$predictor$imp_vars,
+ id_vars = params$model$predictor$id,
+ seed = params$model$seed
)
# Create a linear model specification and workflow
@@ -85,7 +87,7 @@ lin_wflow <- workflow() %>%
# Fit the linear model on the training data
lin_wflow_final_fit <- lin_wflow %>%
finalize_workflow(
- list(neighbors = params$model$hyperparameter$default$neighbors)
+ list(imp_trees = params$model$hyperparameter$default$imp_trees)
) %>%
fit(data = train %>% mutate(meta_sale_price = log(meta_sale_price)))
@@ -228,7 +230,7 @@ if (cv_enable) {
date_col = meta_sale_date,
val_prop = params$model$parameter$validation_prop,
train_includes_val = params$model$parameter$validation_prop > 0,
- cumulative = FALSE
+ cumulative = TRUE
)
}
@@ -253,7 +255,7 @@ if (cv_enable) {
cat_l2 = lightsnip::cat_l2(lgbm_range$cat_l2),
lambda_l1 = lightsnip::lambda_l1(lgbm_range$lambda_l1),
lambda_l2 = lightsnip::lambda_l2(lgbm_range$lambda_l2),
- neighbors = dials::neighbors(lgbm_range$neighbors)
+ imp_trees = dials::trees(lgbm_range$imp_trees)
# nolint end
)
diff --git a/pipeline/02-assess.R b/pipeline/02-assess.R
index c47cb303..97fd8173 100644
--- a/pipeline/02-assess.R
+++ b/pipeline/02-assess.R
@@ -30,7 +30,7 @@ land_nbhd_rate <- read_parquet(
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-# 2. Predict Values and Recover Strata ----------------------------------------
+# 2. Predict Values ------------------------------------------------------------
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
message("Predicting off-market values with trained model")
@@ -44,57 +44,19 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local)
assessment_data_pred <- read_parquet(paths$input$assessment$local) %>%
as_tibble()
-assessment_data_baked <- assessment_data_pred %>%
- bake(lgbm_final_full_recipe, new_data = ., all_predictors())
-
assessment_data_pred <- assessment_data_pred %>%
mutate(
.,
pred_card_initial_fmv = as.numeric(predict(
lgbm_final_full_fit,
- new_data = assessment_data_baked
- )$.pred),
- # Strata variables are converted to 0-indexed integers during baking.
- # We save those converted values so we can unconvert them below
- temp_strata_1 = assessment_data_baked$meta_strata_1,
- temp_strata_2 = assessment_data_baked$meta_strata_2
+ new_data = bake(
+ lgbm_final_full_recipe,
+ new_data = assessment_data_pred,
+ all_predictors()
+ )
+ )$.pred)
)
-# The baked data encodes categorical values as base-0 integers.
-# However, here we want to recover the original (unencoded) values of our
-# strata variables wherever they've been imputed by the baking step. To do so,
-# we create a mapping of the encoded to unencoded values and use them to
-# recover both the original strata values and those imputed by
-# step_impute_knn (in R/recipes.R)
-strata_mapping_1 <- assessment_data_pred %>%
- filter(!is.na(meta_strata_1)) %>%
- distinct(temp_strata_1, meta_strata_1) %>%
- pull(meta_strata_1, name = temp_strata_1)
-strata_mapping_2 <- assessment_data_pred %>%
- filter(!is.na(meta_strata_2)) %>%
- distinct(temp_strata_2, meta_strata_2) %>%
- pull(meta_strata_2, name = temp_strata_2)
-
-# Recover the imputed strata values
-assessment_data_pred <- assessment_data_pred %>%
- mutate(
- # Binary variable to identify condos which have imputed strata
- flag_strata_is_imputed = is.na(meta_strata_1) | is.na(meta_strata_2),
- # Use mappings to replace meta_strata_1 and meta_strata_2 directly
- meta_strata_1 = ifelse(
- is.na(meta_strata_1),
- unname(strata_mapping_1[as.character(temp_strata_1)]),
- meta_strata_1
- ),
- meta_strata_2 = ifelse(
- is.na(meta_strata_2),
- unname(strata_mapping_2[as.character(temp_strata_2)]),
- meta_strata_2
- )
- ) %>%
- # Remove unnecessary columns
- select(-temp_strata_1, -temp_strata_2)
-
@@ -195,8 +157,7 @@ assessment_data_merged %>%
select(
meta_year, meta_pin, meta_class, meta_card_num, meta_lline_num,
meta_modeling_group, ends_with("_num_sale"), pred_card_initial_fmv,
- all_of(params$model$predictor$all),
- flag_strata_is_imputed, township_code
+ all_of(params$model$predictor$all), township_code
) %>%
mutate(
ccao_n_years_exe_homeowner = as.integer(ccao_n_years_exe_homeowner)
@@ -311,8 +272,7 @@ assessment_data_pin <- assessment_data_merged %>%
meta_year, meta_pin, meta_pin10, meta_triad_code, meta_township_code,
meta_nbhd_code, meta_tax_code, meta_class, meta_tieback_key_pin,
meta_tieback_proration_rate, meta_cdu, meta_modeling_group,
- meta_pin_num_landlines, meta_strata_1, meta_strata_2,
- flag_strata_is_imputed, char_yrblt,
+ meta_pin_num_landlines, char_yrblt,
# Keep overall building square footage
char_total_bldg_sf = char_building_sf,
@@ -324,7 +284,7 @@ assessment_data_pin <- assessment_data_merged %>%
"loc_property_", "loc_ward_", "loc_chicago_",
"loc_census", "loc_school_", "loc_tax_", "prior_", "ind_"
)),
- meta_pin10_5yr_num_sale,
+ meta_pin10_bldg_roll_mean, meta_pin10_bldg_roll_count,
# Keep PIN-level predicted values
pred_pin_final_fmv, pred_pin_final_fmv_round, township_code
@@ -417,10 +377,9 @@ assessment_data_pin_final <- assessment_data_pin_2 %>%
mutate(
meta_pin_num_landlines = tidyr::replace_na(meta_pin_num_landlines, 1),
flag_pin_is_multiland = tidyr::replace_na(flag_pin_is_multiland, FALSE),
- flag_nonlivable_space = meta_modeling_group == "NONLIVABLE",
- flag_pin10_5yr_num_sale = meta_pin10_5yr_num_sale
+ flag_nonlivable_space = meta_modeling_group == "NONLIVABLE"
) %>%
- select(-meta_modeling_group, -meta_pin10_5yr_num_sale) %>%
+ select(-meta_modeling_group) %>%
relocate(flag_prior_far_yoy_bldg_change_pct, .after = starts_with("flag_"))
diff --git a/pipeline/05-finalize.R b/pipeline/05-finalize.R
index 94ac78da..06676975 100644
--- a/pipeline/05-finalize.R
+++ b/pipeline/05-finalize.R
@@ -68,22 +68,9 @@ metadata <- tibble::tibble(
assessment_data_year = params$assessment$data_year,
input_min_sale_year = params$input$min_sale_year,
input_max_sale_year = params$input$max_sale_year,
- input_strata_seed = params$input$strata$seed,
- input_strata_group_var = list(params$input$strata$group_var),
- input_strata_type = params$input$strata$type,
- input_strata_k_1 = params$input$strata$k_1,
- input_strata_k_2 = params$input$strata$k_2,
- input_strata_weight_min = params$input$strata$weight_min,
- input_strata_weight_max = params$input$strata$weight_max,
- input_sale_validation_stat_groups = list(
- params$input$sale_validation$stat_groups
- ),
- input_sale_validation_iso_forest = list(
- params$input$sale_validation$iso_forest
- ),
- input_sale_validation_dev_bounds = list(
- params$input$sale_validation$dev_bounds
- ),
+ input_n_years_prior = params$input$n_years_prior,
+ input_building_weight_min = params$input$building$weight_min,
+ input_building_weight_max = params$input$building$weight_max,
ratio_study_far_year = params$ratio_study$far_year,
ratio_study_far_stage = params$ratio_study$far_stage,
ratio_study_far_column = params$ratio_study$far_column,
@@ -111,15 +98,15 @@ metadata <- tibble::tibble(
model_predictor_categorical_count =
length(params$model$predictor$categorical),
model_predictor_categorical_name = list(params$model$predictor$categorical),
- model_predictor_knn_count = length(params$model$predictor$knn),
- model_predictor_knn_name = list(params$model$predictor$knn),
- model_predictor_knn_imp_count = length(params$model$predictor$knn_imp),
- model_predictor_knn_imp_name = list(params$model$predictor$knn_imp)
+ model_predictor_imp_count = length(params$model$predictor$imp),
+ model_predictor_imp_name = list(params$model$predictor$imp),
+ model_predictor_imp_vars_count = length(params$model$predictor$imp_vars),
+ model_predictor_imp_vars_name = list(params$model$predictor$imp_vars)
) %>%
bind_cols(dvc_md5_df) %>%
relocate(
starts_with("dvc_id_"),
- .after = "input_strata_weight_max"
+ .after = "input_building_weight_max"
) %>%
arrow::write_parquet(paths$output$metadata$local)
diff --git a/pipeline/07-export.R b/pipeline/07-export.R
index ff48082d..d9dd908b 100644
--- a/pipeline/07-export.R
+++ b/pipeline/07-export.R
@@ -123,12 +123,12 @@ assessment_pin_prepped <- assessment_pin %>%
sale_recent_2_outlier_type, sale_recent_2_document_num,
sale_recent_2_num_parcels,
char_yrblt, char_total_bldg_sf, char_land_sf,
- char_unit_sf, meta_strata_1, meta_strata_2, flag_nonlivable_space,
+ char_unit_sf, meta_pin10_bldg_roll_mean, meta_pin10_bldg_roll_count,
+ flag_nonlivable_space,
-    flag_pin10_5yr_num_sale, flag_proration_sum_not_1,
+    flag_proration_sum_not_1,
flag_pin_is_multiland, flag_land_gte_95_percentile,
flag_land_value_capped, flag_prior_near_to_pred_unchanged,
- flag_prior_near_yoy_inc_gt_50_pct, flag_prior_near_yoy_dec_gt_5_pct,
- flag_strata_is_imputed
+ flag_prior_near_yoy_inc_gt_50_pct, flag_prior_near_yoy_dec_gt_5_pct
) %>%
mutate(
across(starts_with("flag_"), as.numeric),
diff --git a/renv.lock b/renv.lock
index 3ac4d7b9..dfdd8c60 100644
--- a/renv.lock
+++ b/renv.lock
@@ -279,20 +279,11 @@
"Version": "1.3.0",
"Source": "GitHub",
"RemoteType": "github",
+ "RemoteHost": "api.github.com",
"RemoteUsername": "ccao-data",
"RemoteRepo": "ccao",
"RemoteRef": "master",
- "RemoteSha": "a5449b9717323de3c51ee1948e4431175d6ccda0",
- "RemoteHost": "api.github.com",
- "Requirements": [
- "R",
- "assessr",
- "dplyr",
- "magrittr",
- "rlang",
- "tidyr"
- ],
- "Hash": "c29ff9f60bde4f122b8fe0bb75f02e8c"
+ "RemoteSha": "600f73a7161146762a9e9e156307c91a209ee637"
},
"class": {
"Package": "class",
diff --git a/reports/performance/_outcomes.qmd b/reports/performance/_outcomes.qmd
index b3649ced..5755eff7 100644
--- a/reports/performance/_outcomes.qmd
+++ b/reports/performance/_outcomes.qmd
@@ -388,214 +388,3 @@ stats_median_yoy_delta %>%
c("Comparison of YOY Change in AV for Sold and Unsold Houses" = 7)
)
```
-
-## Strata
-
-::: panel-tabset
-
-```{r _outcomes_imputed_cleaning}
-outcomes_strata <- assessment_pin %>%
- filter(!flag_nonlivable_space) %>%
- select(
- meta_pin,
- meta_strata_1,
- pred_pin_final_fmv,
- sale_recent_1_price,
- township = meta_township_code,
- imputed = flag_strata_is_imputed,
- flag_nonlivable_space
- ) %>%
- mutate(
- meta_strata_1 = as.numeric(meta_strata_1),
- township = ccao::town_convert(township)
- )
-```
-
-### Estimated Final FMV (PIN)
-
-```{r _outcomes_estimated_fmv_imputed, warning=FALSE}
-outcomes_strata %>%
- ggplot(aes(
- x = factor(meta_strata_1),
- y = pred_pin_final_fmv,
- fill = imputed
- )) +
- geom_boxplot() +
- scale_y_continuous(
- limits = c(0, 2000000),
- labels = scales::dollar_format(scale = 1 / 1000, suffix = "K")
- ) +
- labs(
- x = "Strata",
- y = "Estimated Final FMV (PIN)",
- fill = "Imputation Status"
- ) +
- theme_minimal() +
- theme(legend.position = "bottom")
-```
-
-### Sale Prices
-
-```{r _outcomes_sale_prices, warning=FALSE}
-outcomes_strata %>%
- ggplot(
- aes(x = factor(meta_strata_1), y = sale_recent_1_price, fill = imputed)
- ) +
- geom_boxplot() +
- scale_y_continuous(limits = c(0, 2000000), labels = scales::dollar_format()) +
- labs(
- x = "Strata",
- y = "FMV",
- fill = "Imputation Status"
- ) +
- theme_minimal() +
- theme(legend.position = "top")
-```
-
-### Descriptive Stats
-
-```{r _outcomes_descriptive_stats}
-outcomes_strata_descriptive <- outcomes_strata %>%
- group_by(meta_strata_1) %>%
- summarize(
- `Est. Min.` = min(pred_pin_final_fmv, na.rm = TRUE),
- `Est. Mean` = mean(pred_pin_final_fmv, na.rm = TRUE),
- `Est. Med.` = median(pred_pin_final_fmv, na.rm = TRUE),
- `Est. Max.` = max(pred_pin_final_fmv, na.rm = TRUE),
- `Sale Min.` = min(sale_recent_1_price, na.rm = TRUE),
- `Sale Mean` = mean(sale_recent_1_price, na.rm = TRUE),
- `Sale Med.` = median(sale_recent_1_price, na.rm = TRUE),
- `Sale Max.` = max(sale_recent_1_price, na.rm = TRUE)
- ) %>%
- rename(`Strata 1` = meta_strata_1) %>%
- mutate(across(`Est. Min.`:`Sale Max.`, ~ dollar(round(.x, 2))))
-
-datatable(
- outcomes_strata_descriptive,
- options = list(pageLength = 20, autoWidth = TRUE, scrollX = TRUE)
-)
-```
-
-### Descriptive Stats by Imputation Status
-
-```{r _outcomes_descriptive_stats_imputed}
-outcomes_strata_descriptive_imp <- outcomes_strata %>%
- group_by(meta_strata_1, imputed) %>%
- summarize(
- `Est. Min.` = min(pred_pin_final_fmv, na.rm = TRUE),
- `Est. Mean` = mean(pred_pin_final_fmv, na.rm = TRUE),
- `Est. Med.` = median(pred_pin_final_fmv, na.rm = TRUE),
- `Est. Max.` = max(pred_pin_final_fmv, na.rm = TRUE),
- `Sale Min.` = min(sale_recent_1_price, na.rm = TRUE),
- `Sale Mean` = mean(sale_recent_1_price, na.rm = TRUE),
- `Sale Med.` = median(sale_recent_1_price, na.rm = TRUE),
- `Sale Max.` = max(sale_recent_1_price, na.rm = TRUE)
- ) %>%
- rename(`Strata 1` = meta_strata_1, Imputed = imputed) %>%
- mutate(across(`Est. Min.`:`Sale Max.`, ~ dollar(round(.x, 2))))
-
-datatable(
- outcomes_strata_descriptive_imp,
- options = list(pageLength = 20, autoWidth = TRUE, scrollX = TRUE)
-)
-```
-
-:::
-
-## Strata Sale Prices vs Mailed Values
-
-::: panel-tabset
-
-```{r _outcomes_strata_plots}
-outcomes_strata_groups <- outcomes_strata %>%
- filter(!is.na(sale_recent_1_price)) %>%
- split(.$meta_strata_1)
-
-outcomes_strata_plots <- map(outcomes_strata_groups, ~ {
- p <- ggplot(.x, aes(
- x = pred_pin_final_fmv, y = sale_recent_1_price,
- color = township,
- text = paste(
- "PIN:", meta_pin,
- "
Sale Price:", dollar(sale_recent_1_price),
- "
Assessed Value:", dollar(pred_pin_final_fmv),
- "
Township:", township,
- "
Imputed:", imputed
- )
- )) +
- geom_point(aes(shape = imputed)) +
- labs(
- x = "FMV",
- y = "Sale Price",
- title = paste("Strata", unique(.x$meta_strata_1)),
- color = "Township"
- ) +
- scale_x_continuous(limits = c(0, 2000000), labels = dollar) +
- scale_y_continuous(limits = c(0, 2000000), labels = dollar) +
- theme_minimal()
-
- ggplotly(p, tooltip = "text")
-})
-```
-
-### Strata 1
-
-```{r results='asis'}
-outcomes_strata_plots$`1`
-```
-
-### Strata 2
-
-```{r results='asis'}
-outcomes_strata_plots$`2`
-```
-
-### Strata 3
-
-```{r results='asis'}
-outcomes_strata_plots$`3`
-```
-
-### Strata 4
-
-```{r results='asis'}
-outcomes_strata_plots$`4`
-```
-
-### Strata 5
-
-```{r results='asis'}
-outcomes_strata_plots$`5`
-```
-
-### Strata 6
-
-```{r results='asis'}
-outcomes_strata_plots$`6`
-```
-
-### Strata 7
-
-```{r results='asis'}
-outcomes_strata_plots$`7`
-```
-
-### Strata 8
-
-```{r results='asis'}
-outcomes_strata_plots$`8`
-```
-
-### Strata 9
-
-```{r results='asis'}
-outcomes_strata_plots$`9`
-```
-
-### Strata 10
-
-```{r results='asis'}
-outcomes_strata_plots$`10`
-```
-
-:::