Skip to content

Commit

Permalink
Implement design doc changes (#128)
Browse files Browse the repository at this point in the history
* Rename reason codes from issue 125

* Add beginning logic for new flags

* Intitial 3 column outlier_reasons working

* Add functioning 3 column output

* Remove print statements

* Fix ptax classifiers

* Change comments

* Fix raw and sqft condition

* Remove comments

* Edit docstring and comments

* Remove condos bool

* Fix conditions based on design doc spec

* Update glue/flagging_script_glue/flagging.py

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

* Fix conditions

* Start re-work

* Start re-work

* Establish sorting logic

* Edit comment

* Correctly classify outliers

* Remove sv_is_outlier logic from mansueto script

* Add comments and un-comment things

* Remove unneeded code

* Edit comments

* Remove doc string component

* Fix condos conditional

* Add Low price to values to check

* Update docs

* Change comment

* Edit docstring

* Remove docs

* Remove comment

* Update glue/sales_val_flagging.py

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

* Remove na column creation

* Try refactor of sqft separation

* Improve ptax reference

* Replace string null with np nan

* Re-factor classify_outliers

* Remove dict

* Update glue/sales_val_flagging.py

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

* Update glue/sales_val_flagging.py

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

* Update glue/sales_val_flagging.py

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>

* Fix idx ref

* Handle condos sqft indictoar error and change group thresh handling

* Fix comma problem

* Fix dtypes

* Edit workflow to disentangle ptax and price

* Add documentation

* Add documentation

* Remove todo

* Simplify func

* Restore yaml

* Restore yaml

* Add docs

---------

Co-authored-by: Jean Cochrane <jeancochrane@users.noreply.github.com>
  • Loading branch information
wagnerlmichael and jeancochrane authored Jun 18, 2024
1 parent 1cb6f2b commit 7ea5146
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 209 deletions.
193 changes: 57 additions & 136 deletions glue/flagging_script_glue/flagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,8 @@ def outlier_taxonomy(df: pd.DataFrame, permut: tuple, groups: tuple, condos: boo
"""

df = check_days(df, SHORT_TERM_OWNER_THRESHOLD)

df = pricing_info(df, permut, groups, condos=condos)

df = outlier_type(df, condos=condos)
df = outlier_flag(df)
df = special_flag(df)

return df

Expand Down Expand Up @@ -270,26 +266,6 @@ def pricing_info(
return df


def special_flag(df: pd.DataFrame) -> pd.DataFrame:
"""
Creates column that checks whether there is a special flag for this record.
Inputs:
df (pd.DataFrame): dataframe to add flags onto
Outputs:
df (pd.DataFrame): dataframe with 'special_flags' column
"""
cond = [
(df["sv_name_match"] != "No match"),
(df["sv_short_owner"] == "Short-term owner"),
(df["sv_transaction_type"] == "legal_entity-legal_entity"),
]
labels = ["Family sale", "Home flip sale", "Non-person sale"]

df["sv_special_flags"] = np.select(cond, labels, default="Not special")

return df


def which_price(row: pd.Series, thresholds: dict, groups: tuple) -> str:
"""
Determines whether sale_price, price_per_sqft, or both are outliers,
Expand Down Expand Up @@ -598,11 +574,10 @@ def get_sale_counts(dups: pd.DataFrame) -> pd.DataFrame:
v_counts = (
dups.pin.value_counts()
.reset_index()
.rename(columns={"index": "pin", "pin": "sv_sale_dup_counts"})
.rename(columns={"count": "sv_sale_dup_counts"})
)

# Explicitly specify the merging columns
dups = pd.merge(dups, v_counts, on="pin")
dups = pd.merge(dups, v_counts)

return dups

Expand Down Expand Up @@ -767,131 +742,77 @@ def z_normalize_groupby(s: pd.Series):

def outlier_type(df: pd.DataFrame, condos: bool) -> pd.DataFrame:
"""
Runs np.select that creates an outlier taxonomy.
This function create indicator columns for each distinct outlier type between price
and characteristic outliers. These columns are prefixed with 'sv_ind_'.
Inputs:
df (pd.DataFrame): dataframe with necessary columns created from previous functions.
df (pd.DataFrame): Dataframe
Outputs:
df (pd.DataFrame): dataframe with 'sv_outlier_type' column.
df (pd.DataFrame): Dataframe with indicator columns for each flag type
"""
if condos == True:
conditions = [
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("High")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("High")),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("High")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("High")),
(df["sv_pricing"].str.contains("High price swing")),
(df["sv_pricing"].str.contains("High")),
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("Low")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("Low")),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("Low")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("Low")),
(df["sv_pricing"].str.contains("Low price swing")),
(df["sv_pricing"].str.contains("Low")),

char_conditions = [
df["sv_short_owner"] == "Short-term owner",
df["sv_name_match"] != "No match",
df[["sv_buyer_category", "sv_seller_category"]].eq("legal_entity").any(axis=1),
df["sv_anomaly"] == "Outlier",
df["sv_pricing"].str.contains("High price swing")
| df["sv_pricing"].str.contains("Low price swing"),
]

# Define labels for characteristic-based reasons
char_labels = [
"sv_ind_char_short_term_owner",
"sv_ind_char_family_sale",
"sv_ind_char_non_person_sale",
"sv_ind_char_statistical_anomaly",
"sv_ind_char_price_swing_homeflip",
]

if condos:
# Define conditions for price-based reasons
price_conditions = [
df["sv_pricing"].str.contains("High"),
df["sv_pricing"].str.contains("Low"),
]

labels = [
"Home flip sale (high)",
"Family sale (high)",
"Non-person sale (high)",
"Anomaly (high)",
"High price swing",
"High price (raw)",
"Home flip sale (low)",
"Family sale (low)",
"Non-person sale (low)",
"Anomaly (low)",
"Low price swing",
"Low price (raw)",
# Define labels for price-based reasons
price_labels = [
"sv_ind_price_high_price",
"sv_ind_price_low_price",
]

else:
conditions = [
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("High")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("High")),
# Define conditions for price-based reasons
price_conditions = [
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("High")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("High")),
(df["sv_pricing"].str.contains("High price swing")),
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"] == "(raw & sqft)"),
(df["sv_pricing"].str.contains("High")) & (df["sv_which_price"] == "(raw)"),
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"] == "(sqft)"),
(df["sv_short_owner"] == "Short-term owner")
& (df["sv_pricing"].str.contains("Low")),
(df["sv_name_match"] != "No match")
& (df["sv_pricing"].str.contains("Low")),
df["sv_pricing"].str.contains("High")
& (df["sv_which_price"].str.contains("raw"))
),
(
df[["sv_buyer_category", "sv_seller_category"]]
.eq("legal_entity")
.any(axis=1)
)
& (df["sv_pricing"].str.contains("Low")),
(df["sv_anomaly"] == "Outlier") & (df["sv_pricing"].str.contains("Low")),
(df["sv_pricing"].str.contains("Low price swing")),
df["sv_pricing"].str.contains("Low")
& (df["sv_which_price"].str.contains("raw"))
),
(df["sv_pricing"].str.contains("High"))
& (df["sv_which_price"].str.contains("sqft")),
(df["sv_pricing"].str.contains("Low"))
& (df["sv_which_price"] == "(raw & sqft)"),
(df["sv_pricing"].str.contains("Low")) & (df["sv_which_price"] == "(raw)"),
(df["sv_pricing"].str.contains("Low")) & (df["sv_which_price"] == "(sqft)"),
& (df["sv_which_price"].str.contains("sqft")),
]

labels = [
"Home flip sale (high)",
"Family sale (high)",
"Non-person sale (high)",
"Anomaly (high)",
"High price swing",
"High price (raw & sqft)",
"High price (raw)",
"High price (sqft)",
"Home flip sale (low)",
"Family sale (low)",
"Non-person sale (low)",
"Anomaly (low)",
"Low price swing",
"Low price (raw & sqft)",
"Low price (raw)",
"Low price (sqft)",
# Define labels for price-based reasons
price_labels = [
"sv_ind_price_high_price",
"sv_ind_price_low_price",
"sv_ind_price_high_price_sqft",
"sv_ind_price_low_price_sqft",
]

df["sv_outlier_type"] = np.select(conditions, labels, default="Not outlier")
combined_conditions = price_conditions + char_conditions
combined_labels = price_labels + char_labels

return df


def outlier_flag(df: pd.DataFrame) -> pd.DataFrame:
"""
Creates a flag that shows whether the record is an
outlier (a special flag) according to our outlier taxonomy.
Inputs:
df (pd.DataFrame): dataframe to create outlier flag
Outputs:
df (pd.DataFrame): dataframe with 'is_outlier' column
"""

df["sv_is_outlier"] = np.select(
[(df["sv_outlier_type"] == "Not outlier")], [0], default=1
)
# Create indicator columns for each flag type
for label, condition in zip(combined_labels, combined_conditions):
df[label] = condition.astype(int)

return df

Expand Down
Loading

0 comments on commit 7ea5146

Please sign in to comment.