Skip to content

Commit

Permalink
Add raw price threshold for sales val (#142)
Browse files Browse the repository at this point in the history
* Add first pass at hard threshold

* Update log transform

* Change cap amount

* Add to docstring

* Add arg for raw price thresh

* Add yaml entry

* Revert changes

* Revert changes

* Update glue/sales_val_flagging.py

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

* Update manual_flagging/yaml/inputs.yaml

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

* Persist raw price thresh

* Standardize threshold naming

* Update glue/sales_val_flagging.py

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

* Update manual_flagging/yaml/inputs.yaml

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

---------

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>
  • Loading branch information
wagnerlmichael and jeancochrane authored Jan 6, 2025
1 parent 8a288d8 commit 962ef1d
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 12 deletions.
23 changes: 19 additions & 4 deletions glue/flagging_script_glue/flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def go(
iso_forest_cols: list,
dev_bounds: tuple,
condos: bool,
raw_price_threshold: int,
):
"""
This function runs all of our other functions in the correct sequence.
Expand Down Expand Up @@ -50,7 +51,9 @@ def go(
print("string_processing() done")
df = iso_forest(df, groups, iso_forest_cols)
print("iso_forest() done")
df = outlier_taxonomy(df, dev_bounds, groups, condos=condos)
df = outlier_taxonomy(
df, dev_bounds, groups, condos=condos, raw_price_threshold=raw_price_threshold
)
print("outlier_taxonomy() done\nfinished")

return df
Expand All @@ -69,7 +72,13 @@ def create_group_string(groups: tuple, sep: str) -> str:
return sep.join(groups)


def outlier_taxonomy(df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool):
def outlier_taxonomy(
df: pd.DataFrame,
permut: tuple,
groups: tuple,
condos: bool,
raw_price_threshold: int,
):
"""
Creates columns having to do with our chosen outlier taxonomy.
Ex: Family sale, Home flip sale, Non-person sale, High price (raw and/or sqft), etc.
Expand All @@ -84,7 +93,7 @@ def outlier_taxonomy(df: pd.DataFrame, permut: tuple, groups: tuple, condos: boo

df = check_days(df, SHORT_TERM_OWNER_THRESHOLD)
df = pricing_info(df, permut, groups, condos=condos)
df = outlier_type(df, condos=condos)
df = outlier_type(df, condos=condos, raw_price_threshold=raw_price_threshold)

return df

Expand Down Expand Up @@ -740,7 +749,9 @@ def z_normalize_groupby(s: pd.Series):
return zscore(s, nan_policy="omit")


def outlier_type(df: pd.DataFrame, condos: bool) -> pd.DataFrame:
def outlier_type(
df: pd.DataFrame, condos: bool, raw_price_threshold: int
) -> pd.DataFrame:
"""
This function creates indicator columns for each distinct outlier type between price
and characteristic outliers. These columns are prefixed with 'sv_ind_'.
Expand Down Expand Up @@ -807,6 +818,10 @@ def outlier_type(df: pd.DataFrame, condos: bool) -> pd.DataFrame:
"sv_ind_price_low_price_sqft",
]

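# Apply the raw price threshold. meta_sale_price is raised to a power of 10
# here to undo the log transform before comparing against the raw dollar cap.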
price_conditions.append((10 ** df["meta_sale_price"]) > raw_price_threshold)
price_labels.append("sv_ind_raw_price_threshold")

combined_conditions = price_conditions + char_conditions
combined_labels = price_labels + char_labels

Expand Down
26 changes: 19 additions & 7 deletions glue/sales_val_flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,8 @@ def classify_outliers(df, stat_groups: list, min_threshold):
2. Implement our group threshold requirement. In the statistical flagging process, if
the group a sale belongs to is below N=30 then we want to manually set these flags to
non-outlier status, even if they were flagged in the mansueto script. This requirement
is bypasses for ptax outliers - we don't care about group threshold in this case.
is bypassed for ptax outliers and raw price threshold outliers - we don't care about
group threshold in this case.
Inputs:
df: The data right after we perform the flagging script (go()), when the exploded
Expand Down Expand Up @@ -178,6 +179,7 @@ def classify_outliers(df, stat_groups: list, min_threshold):
"sv_ind_ptax_flag_w_high_price_sqft": "High price per square foot",
"sv_ind_price_low_price_sqft": "Low price per square foot",
"sv_ind_ptax_flag_w_low_price_sqft": "Low price per square foot",
"sv_ind_raw_price_threshold": "Raw price threshold",
"sv_ind_ptax_flag": "PTAX-203 Exclusion",
"sv_ind_char_short_term_owner": "Short-term owner",
"sv_ind_char_family_sale": "Family Sale",
Expand All @@ -199,6 +201,11 @@ def classify_outliers(df, stat_groups: list, min_threshold):
Note: This doesn't apply for sales that also have a ptax outlier status.
In this case, we still assign the price outlier status.
We also don't apply this threshold to sv_ind_raw_price_threshold,
since this is designed to be a safeguard that catches very high price
sales that may have slipped through the cracks due to the group
threshold requirement.
"""
group_thresh_price_fix = [
"sv_ind_price_high_price",
Expand Down Expand Up @@ -237,12 +244,14 @@ def fill_outlier_reasons(row):
# Drop the _merge column
df = df.drop(columns=["_merge"])

# Assign outlier status
# Assign outlier status. These are the outlier reasons
# that mark a sale as an outlier.
values_to_check = {
"High price",
"Low price",
"High price per square foot",
"Low price per square foot",
"Raw price threshold",
}

df["sv_is_outlier"] = np.where(
Expand Down Expand Up @@ -471,8 +480,9 @@ def get_parameter_df(
ptax_sd,
rolling_window,
time_frame,
short_term_thresh,
min_group_thresh,
short_term_threshold,
min_group_threshold,
raw_price_threshold,
run_id,
):
"""
Expand All @@ -488,8 +498,9 @@ def get_parameter_df(
ptax_sd: list of standard deviations used for ptax flagging
rolling_window: how many months used in rolling window methodology
date_floor: parameter specification that limits earliest flagging write
short_term_thresh: short-term threshold for Mansueto's flagging model
short_term_threshold: short-term threshold for Mansueto's flagging model
min_group_threshold: minimum group size threshold needed to flag as outlier
raw_price_threshold: raw price threshold at which we unconditionally classify sales as outliers
run_id: unique run_id to flagging program run
Outputs:
df_parameters: parameters table associated with flagging run
Expand All @@ -512,8 +523,9 @@ def get_parameter_df(
"ptax_sd": [ptax_sd],
"rolling_window": [rolling_window],
"time_frame": [time_frame],
"short_term_owner_threshold": [short_term_thresh],
"min_group_thresh": [min_group_thresh],
"short_term_owner_threshold": [short_term_threshold],
"min_group_thresh": [min_group_threshold],
"raw_price_threshold": [raw_price_threshold],
}

df_parameters = pd.DataFrame(parameter_dict_to_df)
Expand Down
4 changes: 3 additions & 1 deletion manual_flagging/flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ def create_bins_and_labels(input_list):
iso_forest_cols=df_info["iso_forest_cols"],
dev_bounds=tuple(inputs["dev_bounds"]),
condos=df_info["condos_boolean"],
raw_price_threshold=inputs["raw_price_threshold"],
)

# Add the edited or unedited dataframe to the new dictionary
Expand Down Expand Up @@ -400,8 +401,9 @@ def create_bins_and_labels(input_list):
ptax_sd=inputs["ptax_sd"],
rolling_window=inputs["rolling_window_months"],
time_frame=inputs["time_frame"],
short_term_thresh=flg_model.SHORT_TERM_OWNER_THRESHOLD,
short_term_threshold=flg_model.SHORT_TERM_OWNER_THRESHOLD,
min_group_thresh=inputs["min_groups_threshold"],
raw_price_threshold=inputs["raw_price_threshold"],
run_id=run_id,
)

Expand Down
4 changes: 4 additions & 0 deletions manual_flagging/yaml/inputs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,7 @@ ptax_sd: [1, 1]

# Flags are only applied if there are at least this many sales in the group
min_groups_threshold: 30

# This is the raw price threshold that is used to set sales to outlier status
# regardless of group size
raw_price_threshold: 15_000_000

0 comments on commit 962ef1d

Please sign in to comment.