
Commit

Add iso forest fix and tri res subset
wagnerlmichael committed Jan 24, 2024
1 parent 600b063 commit 51efd99
Showing 3 changed files with 185 additions and 82 deletions.
35 changes: 15 additions & 20 deletions glue/flagging_script_glue/flagging.py
@@ -9,6 +9,7 @@

from scipy.stats import zscore
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

SHORT_TERM_OWNER_THRESHOLD = 365  # days (1 year)
@@ -92,36 +93,28 @@ def outlier_taxonomy(df: pd.DataFrame, permut: tuple, groups: tuple, condos: bool
return df


def iso_forest(
df: pd.DataFrame,
groups: tuple,
columns: list,
n_estimators: int = 1000,
max_samples: int or float = 0.2,
) -> pd.DataFrame:
"""
Runs an isolation forest model on our data for outlier detection.
First does PCA, then attaches township/class info, and then runs the
IsoForest model with the given parameters.
Inputs:
df (pd.DataFrame): dataframe with data for IsoForest
groups (tuple): grouping for the data to input into the IsoForest
columns (list): list with columns to run PCA/IsoForest on
n_estimators (int): number of estimators in IsoForest
max_samples (int or float): share of data to use as sample if float,
number to use if int
Outputs:
df (pd.DataFrame): with 'sv_anomaly' column from IsoForest.
Note: non-numeric group columns (e.g. 'geography_split') are label
encoded before being passed to the IsolationForest.
"""

# Set index
df.set_index("meta_sale_document_num", inplace=True)

# Perform PCA (assuming pca is a predefined function)
feed = pca(df, columns)

feed.index = df.index

# Label encode non-numeric groups
label_encoders = {}
for group in groups:
if df[group].dtype not in ["int64", "float64", "int32", "float32"]:
le = LabelEncoder()
df[group] = le.fit_transform(df[group])
label_encoders[group] = le # Store the encoder if needed later
feed[group] = df[group]

# Initialize and fit the Isolation Forest
isof = IsolationForest(
n_estimators=n_estimators,
max_samples=max_samples,

@@ -130,12 +123,14 @@ def iso_forest(
)
df["sv_anomaly"] = isof.fit_predict(feed)

# Assign labels for anomalies
df["sv_anomaly"] = np.select(
[(df["sv_anomaly"] == -1), (df["sv_anomaly"] == 1)],
["Outlier", "Not Outlier"],
default="Not Outlier",
)

# Reset index
df.reset_index(inplace=True)

return df
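As a quick check, here is a hedged usage sketch of the modified function on toy data. The values and the import path are hypothetical (the path is guessed from the file location), and iso_forest calls this module's pca helper internally, which is assumed to be available:

import pandas as pd
from glue.flagging_script_glue.flagging import iso_forest

toy = pd.DataFrame({
    "meta_sale_document_num": ["D1", "D2", "D3", "D4", "D5"],
    "meta_sale_price": [150_000, 210_000, 4_900_000, 180_000, 175_000],
    "sv_price_per_sqft": [95.0, 120.0, 2_400.0, 110.0, 105.0],
    "geography_split": ["Town A", "Town A", "Town B", "Town B", "Town A"],
})

# 'geography_split' is non-numeric, so the new LabelEncoder branch runs
flagged = iso_forest(
    toy,
    groups=("geography_split",),
    columns=["meta_sale_price", "sv_price_per_sqft"],
    n_estimators=100,
    max_samples=0.5,
)
print(flagged[["meta_sale_document_num", "sv_anomaly"]])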
211 changes: 157 additions & 54 deletions manual_flagging/manual_update.py
@@ -15,9 +15,6 @@
root = sp.getoutput("git rev-parse --show-toplevel")
os.chdir(os.path.join(root, "manual_flagging"))

# Set time for run_id
chicago_tz = pytz.timezone("America/Chicago")

# Use yaml as inputs
with open(os.path.join("yaml", "inputs_update.yaml"), "r") as stream:
inputs = yaml.safe_load(stream)
@@ -51,6 +48,7 @@
res.class AS class,
res.township_code AS township_code,
res.year AS year,
res.char_yrblt AS yrblt,
res.pin AS pin,
res.char_bldg_sf AS char_bldg_sf,
res.pin_is_multicard
@@ -69,6 +67,7 @@
condo.class AS class,
condo.township_code AS township_code,
condo.year AS year,
NULL AS yrblt,
condo.pin AS pin,
NULL AS char_bldg_sf,
FALSE AS pin_is_multicard
@@ -85,17 +84,23 @@
sale.doc_no AS meta_sale_document_num,
sale.seller_name AS meta_sale_seller_name,
sale.buyer_name AS meta_sale_buyer_name,
sale.nbhd AS nbhd,
sale.sale_filter_ptax_flag AS ptax_flag_original,
data.class,
data.township_code,
data.yrblt,
data.year,
data.pin,
data.char_bldg_sf,
data.indicator, -- Selecting the indicator column
universe.triad_code
FROM CombinedData data
INNER JOIN default.vw_pin_sale sale
ON sale.pin = data.pin
AND sale.year = data.year
INNER JOIN "default"."vw_pin_universe" universe
ON universe.pin = data.pin
AND universe.year = data.year
WHERE {sql_time_frame}
AND NOT sale.sale_filter_same_sale_within_365
AND NOT sale.sale_filter_less_than_10k
@@ -112,7 +117,7 @@
FROM sale.flag
"""

# Execute query and return as pandas data frame
cursor = conn.cursor()
cursor.execute(SQL_QUERY)
metadata = cursor.description
@@ -130,87 +135,185 @@
if flg.sql_type_to_pd_type(col[1]) is not None
}
df = df.astype(conversion_dict)
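Before the astype coercion above, a collapsed hunk materializes the cursor results into df. A sketch of that standard DB-API pattern (an assumed shape for the elided code; flg.sql_type_to_pd_type is the repo's own helper):

import pandas as pd

# Assumed shape of the elided lines: fetch rows, name columns from
# cursor.description, then coerce dtypes (the astype call shown above)
rows = cursor.fetchall()
column_names = [col[0] for col in metadata]  # metadata = cursor.description
df = pd.DataFrame(rows, columns=column_names)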
df["ptax_flag_original"].fillna(False, inplace=True)

# Separate res and condo sales based on the indicator column
df["ptax_flag_original"].fillna(False, inplace=True)
# - - -
# Testing new ingest
# - - -

# Ingest groups
df_new_groups = pd.read_excel(
os.path.join(root, "QC_salesval_nbhds_round2.xlsx"),
usecols=["Town Nbhd", "Town Grp 1"],
).rename(columns={"Town Nbhd": "nbhd", "Town Grp 1": "geography_split"})

# Subset to only City Tri data
df = df[df["triad_code"] == "1"]
# Grab only res data
df_res = df[df["indicator"] == "res"].reset_index(drop=True)
df_condo = df[df["indicator"] == "condo"].reset_index(drop=True)

df_res["nbhd"] = df_res["nbhd"].astype(int)

# - - -
# Create new geographies and building age features
# - - -


df_res = pd.merge(df_res, df_new_groups, on="nbhd", how="left")

# Calculate the building's age
current_year = datetime.datetime.now().year
df_res["bldg_age"] = current_year - df_res["yrblt"]

# - - -
# Single Family
# - - -
single_family_classes = [
"202",
"203",
"204",
"205",
"206",
"207",
"208",
"209",
"210",
"218",
"219",
"234",
"278",
"295",
]

# Use .copy() so the bin columns added below don't trigger SettingWithCopyWarning
df_res_single_fam = df_res[df_res["class"].isin(single_family_classes)].copy()

# Define bins for char_bldg_sf
char_bldg_sf_bins = [0, 1200, 2400, float("inf")]
char_bldg_sf_labels = ["below_1200", "1200_to_2400", "above_2400"]

# Bin the char_bldg_sf data
df_res_single_fam["char_bldg_sf_bin"] = pd.cut(
df_res_single_fam["char_bldg_sf"],
bins=char_bldg_sf_bins,
labels=char_bldg_sf_labels,
)

# Define bins for building age
bldg_age_bins = [0, 40, float("inf")]
bldg_age_labels = ["below_40_years", "above_40_years"]

# Bin the building age data
df_res_single_fam["bldg_age_bin"] = pd.cut(
df_res_single_fam["bldg_age"], bins=bldg_age_bins, labels=bldg_age_labels
)
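For intuition, a self-contained sketch (toy square footages, not pipeline data) of how pd.cut assigns these labels; intervals are right-closed by default, so a value of exactly 1200 lands in the lower bin:

import pandas as pd

toy_sf = pd.Series([950, 1200, 1800, 3100])
bins = [0, 1200, 2400, float("inf")]
labels = ["below_1200", "1200_to_2400", "above_2400"]
# Default right=True gives intervals (0, 1200], (1200, 2400], (2400, inf)
print(pd.cut(toy_sf, bins=bins, labels=labels).tolist())
# ['below_1200', 'below_1200', '1200_to_2400', 'above_2400']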

# - - - -
# Multi Family
# - - - -

multi_family_classes = ["211", "212"]
df_res_multi_fam = df_res[df_res["class"].isin(multi_family_classes)].copy()

# Define bins for building age
bldg_age_bins = [0, 20, float("inf")]
bldg_age_labels = ["below_20_years", "above_20_years"]

# Bin the building age data
df_res_multi_fam["bldg_age_bin"] = pd.cut(
df_res_multi_fam["bldg_age"], bins=bldg_age_bins, labels=bldg_age_labels
)

# - - -
# Create rolling windows
# - - -

#
# Scratch: quick LabelEncoder sanity check (left commented out)
#
"""
from sklearn.preprocessing import LabelEncoder
# Initialize LabelEncoder
le = LabelEncoder()
# Fit and transform the data
df_res_multi_fam['Encoded_Category'] = le.fit_transform(df_res_multi_fam['geography_split'])
df_res_multi_fam[["Encoded_Category", "geography_split"]]
#
#
#
"""

# Rolling window for single family
df_res_single_fam_to_flag = flg.add_rolling_window(
df_res_single_fam, num_months=inputs["rolling_window_months"]
)

# Rolling window for multi_family
df_res_multi_fam_to_flag = flg.add_rolling_window(
df_res_multi_fam, num_months=inputs["rolling_window_months"]
)
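flg.add_rolling_window's body is not part of this diff. Purely as a hedged illustration of the idea (an assumption about the helper, not the repo's code), a rolling month window can be built by exploding each sale into the next num_months monthly buckets; the 'meta_sale_date' column is also an assumption:

import pandas as pd

def add_rolling_window_sketch(df: pd.DataFrame, num_months: int) -> pd.DataFrame:
    # Hypothetical stand-in: copy each sale into the next num_months monthly
    # windows so per-window group stats are computed over a rolling sample
    df = df.copy()
    df["meta_sale_date"] = pd.to_datetime(df["meta_sale_date"])
    df["rolling_window"] = df["meta_sale_date"].apply(
        lambda d: [d + pd.DateOffset(months=m) for m in range(num_months)]
    )
    df = df.explode("rolling_window")
    # explode leaves an object column, so re-coerce before taking the period
    df["rolling_window"] = pd.to_datetime(df["rolling_window"]).dt.to_period("M")
    return df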

# - - -
# Separate flagging for both
# - - -
# Flag single-family outliers using the main flagging model
df_res_single_fam_flagged = flg_model.go(
df=df_res_single_fam_to_flag,
groups=tuple(inputs["stat_groups"]["single_family"]),
iso_forest_cols=inputs["iso_forest"],
dev_bounds=tuple(inputs["dev_bounds"]),
condos=False,
)

# Flag multi-family outliers using the main flagging model
df_res_multi_fam_flagged = flg_model.go(
df=df_res_multi_fam_to_flag,
groups=tuple(inputs["stat_groups"]["multi_family"]),
iso_forest_cols=inputs["iso_forest"],
dev_bounds=tuple(inputs["dev_bounds"]),
condos=False,
)
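Note that these calls index inputs["stat_groups"] by property type, implying the stat_groups key in yaml/inputs_update.yaml is now nested. Hypothetically, with group names assumed from the features built above (the real values live in the yaml file):

# Assumed shape only -- actual keys/values are defined in the yaml inputs
inputs["stat_groups"] = {
    "single_family": ["rolling_window", "geography_split",
                      "char_bldg_sf_bin", "bldg_age_bin"],
    "multi_family": ["rolling_window", "geography_split", "bldg_age_bin"],
}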
# - - -
# Separate group size for both
# - - -
# Discard any flags with a group size under the threshold
df_res_single_fam_flagged_updated = flg.group_size_adjustment(
df=df_res_single_fam_flagged,
stat_groups=inputs["stat_groups"]["single_family"],
min_threshold=inputs["min_groups_threshold"],
condos=False,
)

df_res_multi_fam_flagged_updated = flg.group_size_adjustment(
df=df_res_multi_fam_flagged,
stat_groups=inputs["stat_groups"]["multi_family"],
min_threshold=inputs["min_groups_threshold"],
condos=False,
)
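flg.group_size_adjustment is also outside this diff. A hedged sketch of the stated behavior, discarding flags for under-sized groups (the sv_is_outlier column name is a guess, not confirmed by this diff):

import pandas as pd

def group_size_adjustment_sketch(df, stat_groups, min_threshold):
    # Hypothetical stand-in: unflag sales whose stat group has fewer than
    # min_threshold observations
    df = df.copy()
    sizes = df.groupby(list(stat_groups))["meta_sale_document_num"].transform("count")
    df.loc[sizes < min_threshold, "sv_is_outlier"] = False
    return df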

# Iterate through both ptax groups
# Update the PTAX flag column with an additional std dev conditional
df_res_single_fam_flagged_ptax = flg.ptax_adjustment(
df=df_res_single_fam_flagged_updated,
groups=inputs["stat_groups"]["single_family"],
ptax_sd=inputs["ptax_sd"],
condos=False,
)

df_res_multi_fam_flagged_ptax = flg.ptax_adjustment(
df=df_res_multi_fam_flagged_updated,
groups=inputs["stat_groups"]["multi_family"],
ptax_sd=inputs["ptax_sd"],
condos=False,
)
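flg.ptax_adjustment likewise isn't shown in this diff. As a loosely hedged sketch of the "additional std dev conditional" (assumes ptax_sd is a [lower, upper] pair and invents the output column name for illustration):

import pandas as pd

def ptax_adjustment_sketch(df, groups, ptax_sd):
    # Hypothetical stand-in: keep a PTAX flag only when the sale price sits
    # more than ptax_sd[0] / ptax_sd[1] standard deviations below / above
    # its group mean
    df = df.copy()
    grp = df.groupby(list(groups))["meta_sale_price"]
    z = (df["meta_sale_price"] - grp.transform("mean")) / grp.transform("std")
    df["sv_ptax_flag_w_deviation"] = df["ptax_flag_original"] & (
        (z < -ptax_sd[0]) | (z > ptax_sd[1])
    )
    return df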

df_flagged_merged = pd.concat(
[df_res_single_fam_flagged_ptax, df_res_multi_fam_flagged_ptax]
).reset_index(drop=True)

# Finish flagging and subset to write to flag table
df_flagged_final, run_id, timestamp = flg.finish_flags(
df=df_flagged_merged,
start_date=inputs["time_frame"]["start"],
manual_update=False,
)

# -----------------------------------------------------------------------------