
Commit

Add iso forest fix and tri res subset
wagnerlmichael committed Jan 24, 2024
1 parent 600b063 commit 51efd99
Showing 3 changed files with 185 additions and 82 deletions.
35 changes: 15 additions & 20 deletions glue/flagging_script_glue/flagging.py
@@ -9,6 +9,7 @@

from scipy.stats import zscore
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

SHORT_TERM_OWNER_THRESHOLD = 365  # days (1 year)
@@ -92,36 +93,28 @@ def outlier_taxonomy(df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool
return df


def iso_forest(
df: pd.DataFrame,
groups: tuple,
columns: list,
n_estimators: int = 1000,
max_samples: int or float = 0.2,
) -> pd.DataFrame:
"""
Runs an isolation forest model on our data for outlier detection.
First does PCA, then attaches township/class info, and then runs the
IsoForest model with the given parameters.
Inputs:
df (pd.DataFrame): dataframe with data for IsoForest
groups (tuple): grouping for the data to input into the IsoForest
columns (list): list with columns to run PCA/IsoForest on
n_estimators (int): number of estimators in IsoForest
max_samples (int or float): share of data to use as sample if float,
number to use if int
Outputs:
df (pd.DataFrame): with 'sv_anomaly' column from IsoForest.
Note: non-numeric group columns (e.g. 'geography_split') are label
encoded before being passed to the IsolationForest.
"""

# Set index
df.set_index("meta_sale_document_num", inplace=True)

# Perform PCA (assuming pca is a predefined function)
feed = pca(df, columns)

feed.index = df.index

# Label encode non-numeric groups
label_encoders = {}
for group in groups:
if df[group].dtype not in ["int64", "float64", "int32", "float32"]:
le = LabelEncoder()
df[group] = le.fit_transform(df[group])
label_encoders[group] = le # Store the encoder if needed later
feed[group] = df[group]

# Initialize and fit the Isolation Forest
isof = IsolationForest(
n_estimators=n_estimators,
max_samples=max_samples,

@@ -130,12 +123,14 @@ def iso_forest(
)
df["sv_anomaly"] = isof.fit_predict(feed)

# Assign labels for anomalies
df["sv_anomaly"] = np.select(
[(df["sv_anomaly"] == -1), (df["sv_anomaly"] == 1)],
["Outlier", "Not Outlier"],
default="Not Outlier",
)

# Reset index
df.reset_index(inplace=True)

return df
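As a quick check, here is a hedged usage sketch of the modified function on toy data. The values and the import path are hypothetical (the path is guessed from the file location), and iso_forest calls this module's pca helper internally, which is assumed to be available:

import pandas as pd
from glue.flagging_script_glue.flagging import iso_forest

toy = pd.DataFrame({
    "meta_sale_document_num": ["D1", "D2", "D3", "D4", "D5"],
    "meta_sale_price": [150_000, 210_000, 4_900_000, 180_000, 175_000],
    "sv_price_per_sqft": [95.0, 120.0, 2_400.0, 110.0, 105.0],
    "geography_split": ["Town A", "Town A", "Town B", "Town B", "Town A"],
})

# 'geography_split' is non-numeric, so the new LabelEncoder branch runs
flagged = iso_forest(
    toy,
    groups=("geography_split",),
    columns=["meta_sale_price", "sv_price_per_sqft"],
    n_estimators=100,
    max_samples=0.5,
)
print(flagged[["meta_sale_document_num", "sv_anomaly"]])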
211 changes: 157 additions & 54 deletions manual_flagging/manual_update.py
@@ -15,9 +15,6 @@
root = sp.getoutput("git rev-parse --show-toplevel")
os.chdir(os.path.join(root, "manual_flagging"))

# Set time for run_id
chicago_tz = pytz.timezone("America/Chicago")

# Use yaml as inputs
with open(os.path.join("yaml", "inputs_update.yaml"), "r") as stream:
inputs = yaml.safe_load(stream)
@@ -51,6 +48,7 @@
res.class AS class,
res.township_code AS township_code,
res.year AS year,
res.char_yrblt AS yrblt,
res.pin AS pin,
res.char_bldg_sf AS char_bldg_sf,
res.pin_is_multicard
@@ -69,6 +67,7 @@
condo.class AS class,
condo.township_code AS township_code,
condo.year AS year,
NULL AS yrblt,
condo.pin AS pin,
NULL AS char_bldg_sf,
FALSE AS pin_is_multicard
@@ -85,17 +84,23 @@
sale.doc_no AS meta_sale_document_num,
sale.seller_name AS meta_sale_seller_name,
sale.buyer_name AS meta_sale_buyer_name,
sale.nbhd AS nbhd,
sale.sale_filter_ptax_flag AS ptax_flag_original,
data.class,
data.township_code,
data.yrblt,
data.year,
data.pin,
data.char_bldg_sf,
data.indicator, -- Selecting the indicator column
universe.triad_code
FROM CombinedData data
INNER JOIN default.vw_pin_sale sale
ON sale.pin = data.pin
AND sale.year = data.year
INNER JOIN "default"."vw_pin_universe" universe
ON universe.pin = data.pin
AND universe.year = data.year
WHERE {sql_time_frame}
AND NOT sale.sale_filter_same_sale_within_365
AND NOT sale.sale_filter_less_than_10k
@@ -112,7 +117,7 @@
FROM sale.flag
"""

# Execute query and return as pandas data frame
cursor = conn.cursor()
cursor.execute(SQL_QUERY)
metadata = cursor.description
@@ -130,87 +135,185 @@
if flg.sql_type_to_pd_type(col[1]) is not None
}
df = df.astype(conversion_dict)
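Before the astype coercion above, a collapsed hunk materializes the cursor results into df. A sketch of that standard DB-API pattern (an assumed shape for the elided code; flg.sql_type_to_pd_type is the repo's own helper):

import pandas as pd

# Assumed shape of the elided lines: fetch rows, name columns from
# cursor.description, then coerce dtypes (the astype call shown above)
rows = cursor.fetchall()
column_names = [col[0] for col in metadata]  # metadata = cursor.description
df = pd.DataFrame(rows, columns=column_names)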
df["ptax_flag_original"].fillna(False, inplace=True)

# Separate res and condo sales based on the indicator column
df["ptax_flag_original"].fillna(False, inplace=True)
# - - -
# Testing new ingest
# - - -

# Ingest groups
df_new_groups = pd.read_excel(
os.path.join(root, "QC_salesval_nbhds_round2.xlsx"),
usecols=["Town Nbhd", "Town Grp 1"],
).rename(columns={"Town Nbhd": "nbhd", "Town Grp 1": "geography_split"})

# Subset to only City Tri data
df = df[df["triad_code"] == "1"]
# Grab only res data
df_res = df[df["indicator"] == "res"].reset_index(drop=True)
df_condo = df[df["indicator"] == "condo"].reset_index(drop=True)

df_res["nbhd"] = df_res["nbhd"].astype(int)

# - - -
# Create new geographies and building age features
# - - -


df_res = pd.merge(df_res, df_new_groups, on="nbhd", how="left")

# Calculate the building's age
current_year = datetime.datetime.now().year
df_res["bldg_age"] = current_year - df_res["yrblt"]

# - - -
# Single Family
# - - -
single_family_classes = [
"202",
"203",
"204",
"205",
"206",
"207",
"208",
"209",
"210",
"218",
"219",
"234",
"278",
"295",
]

# Use .copy() so the bin columns added below don't trigger SettingWithCopyWarning
df_res_single_fam = df_res[df_res["class"].isin(single_family_classes)].copy()

# Define bins for char_bldg_sf
char_bldg_sf_bins = [0, 1200, 2400, float("inf")]
char_bldg_sf_labels = ["below_1200", "1200_to_2400", "above_2400"]

# Bin the char_bldg_sf data
df_res_single_fam["char_bldg_sf_bin"] = pd.cut(
df_res_single_fam["char_bldg_sf"],
bins=char_bldg_sf_bins,
labels=char_bldg_sf_labels,
)

# Define bins for building age
bldg_age_bins = [0, 40, float("inf")]
bldg_age_labels = ["below_40_years", "above_40_years"]

# Bin the building age data
df_res_single_fam["bldg_age_bin"] = pd.cut(
df_res_single_fam["bldg_age"], bins=bldg_age_bins, labels=bldg_age_labels
)
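For intuition, a self-contained sketch (toy square footages, not pipeline data) of how pd.cut assigns these labels; intervals are right-closed by default, so a value of exactly 1200 lands in the lower bin:

import pandas as pd

toy_sf = pd.Series([950, 1200, 1800, 3100])
bins = [0, 1200, 2400, float("inf")]
labels = ["below_1200", "1200_to_2400", "above_2400"]
# Default right=True gives intervals (0, 1200], (1200, 2400], (2400, inf)
print(pd.cut(toy_sf, bins=bins, labels=labels).tolist())
# ['below_1200', 'below_1200', '1200_to_2400', 'above_2400']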

# - - - -
# Multi Family
# - - - -

multi_family_classes = ["211", "212"]
df_res_multi_fam = df_res[df_res["class"].isin(multi_family_classes)].copy()

# Define bins for building age
bldg_age_bins = [0, 20, float("inf")]
bldg_age_labels = ["below_20_years", "above_20_years"]

# Bin the building age data
df_res_multi_fam["bldg_age_bin"] = pd.cut(
df_res_multi_fam["bldg_age"], bins=bldg_age_bins, labels=bldg_age_labels
)

# - - -
# Create rolling windows
# - - -

#
# Scratch: quick LabelEncoder sanity check (left commented out)
#
"""
from sklearn.preprocessing import LabelEncoder
# Initialize LabelEncoder
le = LabelEncoder()
# Fit and transform the data
df_res_multi_fam['Encoded_Category'] = le.fit_transform(df_res_multi_fam['geography_split'])
df_res_multi_fam[["Encoded_Category", "geography_split"]]
#
#
#
"""

# Rolling window for single family
df_res_single_fam_to_flag = flg.add_rolling_window(
df_res_single_fam, num_months=inputs["rolling_window_months"]
)

# Rolling window for multi_family
df_res_multi_fam_to_flag = flg.add_rolling_window(
df_res_multi_fam, num_months=inputs["rolling_window_months"]
)
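flg.add_rolling_window's body is not part of this diff. Purely as a hedged illustration of the idea (an assumption about the helper, not the repo's code), a rolling month window can be built by exploding each sale into the next num_months monthly buckets; the 'meta_sale_date' column is also an assumption:

import pandas as pd

def add_rolling_window_sketch(df: pd.DataFrame, num_months: int) -> pd.DataFrame:
    # Hypothetical stand-in: copy each sale into the next num_months monthly
    # windows so per-window group stats are computed over a rolling sample
    df = df.copy()
    df["meta_sale_date"] = pd.to_datetime(df["meta_sale_date"])
    df["rolling_window"] = df["meta_sale_date"].apply(
        lambda d: [d + pd.DateOffset(months=m) for m in range(num_months)]
    )
    df = df.explode("rolling_window")
    # explode leaves an object column, so re-coerce before taking the period
    df["rolling_window"] = pd.to_datetime(df["rolling_window"]).dt.to_period("M")
    return df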

# - - -
# Separate flagging for both
# - - -
# Flag single-family outliers using the main flagging model
df_res_single_fam_flagged = flg_model.go(
df=df_res_single_fam_to_flag,
groups=tuple(inputs["stat_groups"]["single_family"]),
iso_forest_cols=inputs["iso_forest"],
dev_bounds=tuple(inputs["dev_bounds"]),
condos=False,
)

# Flag multi-family outliers using the main flagging model
df_res_multi_fam_flagged = flg_model.go(
df=df_res_multi_fam_to_flag,
groups=tuple(inputs["stat_groups"]["multi_family"]),
iso_forest_cols=inputs["iso_forest"],
dev_bounds=tuple(inputs["dev_bounds"]),
condos=False,
)
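Note that these calls index inputs["stat_groups"] by property type, implying the stat_groups key in yaml/inputs_update.yaml is now nested. Hypothetically, with group names assumed from the features built above (the real values live in the yaml file):

# Assumed shape only -- actual keys/values are defined in the yaml inputs
inputs["stat_groups"] = {
    "single_family": ["rolling_window", "geography_split",
                      "char_bldg_sf_bin", "bldg_age_bin"],
    "multi_family": ["rolling_window", "geography_split", "bldg_age_bin"],
}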
# - - -
# Separate group size for both
# - - -
# Discard any flags with a group size under the threshold
df_res_single_fam_flagged_updated = flg.group_size_adjustment(
df=df_res_single_fam_flagged,
stat_groups=inputs["stat_groups"]["single_family"],
min_threshold=inputs["min_groups_threshold"],
condos=False,
)

df_res_multi_fam_flagged_updated = flg.group_size_adjustment(
df=df_res_multi_fam_flagged,
stat_groups=inputs["stat_groups"]["multi_family"],
min_threshold=inputs["min_groups_threshold"],
condos=False,
)
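flg.group_size_adjustment is also outside this diff. A hedged sketch of the stated behavior, discarding flags for under-sized groups (the sv_is_outlier column name is a guess, not confirmed by this diff):

import pandas as pd

def group_size_adjustment_sketch(df, stat_groups, min_threshold):
    # Hypothetical stand-in: unflag sales whose stat group has fewer than
    # min_threshold observations
    df = df.copy()
    sizes = df.groupby(list(stat_groups))["meta_sale_document_num"].transform("count")
    df.loc[sizes < min_threshold, "sv_is_outlier"] = False
    return df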

# Iterate through both ptax groups
# Update the PTAX flag column with an additional std dev conditional
df_res_single_fam_flagged_ptax = flg.ptax_adjustment(
df=df_res_single_fam_flagged_updated,
groups=inputs["stat_groups"]["single_family"],
ptax_sd=inputs["ptax_sd"],
condos=False,
)

df_res_multi_fam_flagged_ptax = flg.ptax_adjustment(
df=df_res_multi_fam_flagged_updated,
groups=inputs["stat_groups"]["multi_family"],
ptax_sd=inputs["ptax_sd"],
condos=False,
)
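flg.ptax_adjustment likewise isn't shown in this diff. As a loosely hedged sketch of the "additional std dev conditional" (assumes ptax_sd is a [lower, upper] pair and invents the output column name for illustration):

import pandas as pd

def ptax_adjustment_sketch(df, groups, ptax_sd):
    # Hypothetical stand-in: keep a PTAX flag only when the sale price sits
    # more than ptax_sd[0] / ptax_sd[1] standard deviations below / above
    # its group mean
    df = df.copy()
    grp = df.groupby(list(groups))["meta_sale_price"]
    z = (df["meta_sale_price"] - grp.transform("mean")) / grp.transform("std")
    df["sv_ptax_flag_w_deviation"] = df["ptax_flag_original"] & (
        (z < -ptax_sd[0]) | (z > ptax_sd[1])
    )
    return df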

df_flagged_merged = pd.concat(
[df_res_single_fam_flagged_ptax, df_res_multi_fam_flagged_ptax]
).reset_index(drop=True)

# Finish flagging and subset to write to flag table
df_flagged_final, run_id, timestamp = flg.finish_flags(
df=df_flagged_merged,
start_date=inputs["time_frame"]["start"],
manual_update=False,
)

# -----------------------------------------------------------------------------