diff --git a/README.md b/README.md
index 4c70db3b..7fd6a8a8 100644
--- a/README.md
+++ b/README.md
@@ -54,11 +54,11 @@ Commercial, industrial, and land-only property sales are _not_ flagged by this m
 Outlier flags are broken out into 2 types: statistical outliers and heuristic outliers.
 
 - Statistical outliers are sales that are a set number of standard deviations (usually 2) away from the mean of a group of similar properties (e.g. within the same township, class, timeframe, etc.).
-- Heuristic outliers use some sort of existing flag or rule to identify potentially non-arms-length sales. Heuristic outliers are ***always combined with a statistical threshold***, i.e. a sale with a matching last name must _also_ be N standard deviations from a group mean in order to be flagged. Examples of heuristic outlier flags include:
-  - **PTAX flag**: The [PTAX-203](https://tax.illinois.gov/content/dam/soi/en/web/tax/localgovernments/property/documents/ptax-203.pdf) form is required by the Illinois Department of Revenue for most property transfers. Certain fields on this form are highly indicative of a non-arms-length transaction, i.e. Question 10 indicating a short sale.
-  - **Non-person sale**: Flagged keyword suggests the sale involves a non-person legal entity (industrial buyer, bank, real estate firm, construction, etc.).
-  - **Flip Sale**: Flagged when the owner of the home owned the property for less than 1 year
-  - **Anomaly**: Flagged via an unsupervised machine learning model (isolation forest).
+- Heuristic outliers use some sort of existing flag or rule to identify potentially non-arms-length sales. Heuristic outliers are _**always combined with a statistical threshold**_, i.e. a sale with a matching last name must _also_ be N standard deviations from a group mean in order to be flagged. Examples of heuristic outlier flags include:
+  - **PTAX flag**: The [PTAX-203](https://tax.illinois.gov/content/dam/soi/en/web/tax/localgovernments/property/documents/ptax-203.pdf) form is required by the Illinois Department of Revenue for most property transfers. Certain fields on this form are highly indicative of a non-arms-length transaction, i.e. Question 10 indicating a short sale.
+  - **Non-person sale**: Flagged keyword suggests the sale involves a non-person legal entity (industrial buyer, bank, real estate firm, construction, etc.).
+  - **Flip Sale**: Flagged when the owner of the home owned the property for less than 1 year
+  - **Anomaly**: Flagged via an unsupervised machine learning model (isolation forest).
 
 The following is a list of all current flag types:
 
@@ -97,24 +97,6 @@ The following is a list of all current flag types:
 This query is used to generate the total sales that have some sort of outlier classification
 /*
-WITH TotalRecords AS (
-    SELECT COUNT(*) as total_count
-    FROM default.vw_pin_sale
-    WHERE sv_is_outlier IS NOT null
-), OutlierCount AS (
-    SELECT COUNT(*) as outlier_count
-    FROM default.vw_pin_sale
-    WHERE sv_is_outlier IS NOT NULL
-    AND sv_outlier_type <> 'Not outlier'
-)
-
-SELECT
-    ROUND(
-        (outlier_count * 100.0) / total_count,
-        3
-    ) AS outlier_percentage
-FROM
-    TotalRecords, OutlierCount;
 -->
 As of 2024-03-15, around **6.9%** of the total sales have some sort of outlier classification.
 
 Within that 6.9%, the proportion of each outlier type is:
@@ -123,54 +105,8 @@ As of 2024-03-15, around **6.9%** of the total sales have some sort of outlier c
 /*
 This query is used to generate the proportion of different outlier types
 /*
-
-WITH TotalRecords AS (
-    SELECT COUNT(*) as total_count
-    FROM default.vw_pin_sale
-    WHERE sv_is_outlier IS NOT null
-    AND sv_outlier_type <> 'Not outlier'
-)
-
-SELECT
-    sv_outlier_type,
-    ROUND((COUNT(*) * 1.0 / total_count) * 100, 2) as proportion
-FROM
-    default.vw_pin_sale
-CROSS JOIN
-    TotalRecords
-WHERE
-    sv_is_outlier IS NOT null
-    AND sv_outlier_type <> 'Not outlier'
-GROUP BY
-    sv_outlier_type, total_count
-ORDER BY
-    proportion DESC;
 -->
 
-|Outlier Type           |Percentage|
-|-----------------------|---------:|
-|PTAX-203 flag (Low)    |35.50%    |
-|Non-person sale (low)  |18.17%    |
-|Non-person sale (high) |10.29%    |
-|Anomaly (high)         |7.08%     |
-|High price (raw)       |6.20%     |
-|Low price (raw)        |4.46%     |
-|Low price (raw & sqft) |4.02%     |
-|PTAX-203 flag (High)   |2.98%     |
-|Home flip sale (high)  |2.31%     |
-|Low price (sqft)       |1.95%     |
-|High price (sqft)      |1.88%     |
-|Anomaly (low)          |1.76%     |
-|High price (raw & sqft)|1.44%     |
-|Home flip sale (low)   |1.26%     |
-|Family sale (low)      |0.64%     |
-|Family sale (high)     |0.06%     |
-|High price swing       |0.01%     |
-|Low price swing        |0.00%     |
-
-
-*These outliers are flagged if the relevant price columns (log10 transformed and normalized) are 2 standard deviations below or above the mean within a given group*
-
 ## Flagging Details
 
 ### Model run modes
@@ -264,7 +200,9 @@ erDiagram
         boolean sv_is_ptax_outlier
         boolean ptax_flag_original
         boolean sv_is_heuristic_outlier
-        string sv_outlier_type
+        string sv_outlier_reason1
+        string sv_outlier_reason2
+        string sv_outlier_reason3
         string group
         string run_id FK
        bigint version PK
diff --git a/migrations/0002_update_outlier_column_structure_w_iasworld_2024_update.py b/migrations/0002_update_outlier_column_structure_w_iasworld_2024_update.py
new file mode 100644
index 00000000..6cd3ea43
--- /dev/null
+++ b/migrations/0002_update_outlier_column_structure_w_iasworld_2024_update.py
@@ -0,0 +1,171 @@
+import awswrangler as wr
+import boto3
+import os
+import subprocess as sp
+import numpy as np
+from pyathena import connect
+from pyathena.pandas.util import as_pandas
+
+# Set working dir to manual_update, standardize yaml and src locations
+root = sp.getoutput("git rev-parse --show-toplevel")
+os.chdir(os.path.join(root))
+
+
+def read_parquet_files_from_s3(input_path):
+    """
+    Reads all Parquet files from a specified S3 path into a dictionary of DataFrames.
+
+    Parameters:
+    input_path (str): The S3 bucket path where Parquet files are stored, e.g., 's3://my-bucket/my-folder/'
+
+    Returns:
+    dict: A dictionary where each key is the filename and the value is the corresponding DataFrame.
+    """
+    # Initialize the S3 session
+    session = boto3.Session()
+
+    # List all objects in the given S3 path that are Parquet files
+    s3_objects = wr.s3.list_objects(path=input_path, boto3_session=session)
+
+    # Filter objects to get only parquet files
+    parquet_files = [obj for obj in s3_objects if obj.endswith(".parquet")]
+
+    # Dictionary to store DataFrames
+    dataframes = {}
+
+    # Read each Parquet file into a DataFrame and store it in the dictionary
+    for file_path in parquet_files:
+        # Read the Parquet file into a DataFrame
+        df = wr.s3.read_parquet(path=file_path, boto3_session=session)
+
+        # Extract the filename without the path for use as the dictionary key
+        filename = file_path.split("/")[-1].replace(".parquet", "")
+
+        # Add the DataFrame to the dictionary
+        dataframes[filename] = df
+
+    return dataframes
+
+
+def process_dataframe(df, recode_dict):
+    """
+    Transforms old structure with sv_outlier_type
+    to new structure with 3 separate outlier reason columns
+    """
+    # Insert new columns filled with NaN
+    pos = df.columns.get_loc("sv_outlier_type") + 1
+    for i in range(1, 4):
+        df.insert(pos, f"sv_outlier_reason{i}", np.nan)
+        pos += 1
+
+    # Use the dictionary to populate the new columns
+    for key, value in recode_dict.items():
+        mask = df["sv_outlier_type"] == key
+        for col, val in value.items():
+            df.loc[mask, col] = val
+
+    df = df.drop(columns=["sv_outlier_type"])
+
+    return df
+
+
+def write_dfs_to_s3(dfs, bucket, table):
+    """
+    Writes dictionary of dfs to bucket
+    """
+
+    for df_name, df in dfs.items():
+        file_path = f"{bucket}/0002_update_outlier_column_structure_w_iasworld_2024_update/new_prod_data/{table}/{df_name}.parquet"
+        wr.s3.to_parquet(
+            df=df, path=file_path, index=False, dtype={"sv_outlier_reason3": "string"}
+        )
+
+
+dfs_flag = read_parquet_files_from_s3(
+    os.path.join(
+        os.getenv("AWS_S3_WAREHOUSE_BUCKET"),
+        "sale",
+        "flag",
+    )
+)
+
+recode_dict = {
+    "PTAX-203 flag (Low)": {
+        "sv_outlier_reason1": "Low price",
+        "sv_outlier_reason2": "PTAX-203 Exclusion",
+    },
+    "PTAX-203 flag (High)": {
+        "sv_outlier_reason1": "High price",
+        "sv_outlier_reason2": "PTAX-203 Exclusion",
+    },
+    "Non-person sale (low)": {
+        "sv_outlier_reason1": "Low price",
+        "sv_outlier_reason2": "Non-person sale",
+    },
+    "Non-person sale (high)": {
+        "sv_outlier_reason1": "High price",
+        "sv_outlier_reason2": "Non-person sale",
+    },
+    "High price (raw)": {
+        "sv_outlier_reason1": "High price",
+        "sv_outlier_reason2": np.nan,
+    },
+    "Low price (raw)": {
+        "sv_outlier_reason1": "Low price",
+        "sv_outlier_reason2": np.nan,
+    },
+    "Anomaly (high)": {
+        "sv_outlier_reason1": "High price",
+        "sv_outlier_reason2": "Statistical Anomaly",
+    },
+    "Anomaly (low)": {
+        "sv_outlier_reason1": "Low price",
+        "sv_outlier_reason2": "Statistical Anomaly",
+    },
+    "Low price (raw & sqft)": {
+        "sv_outlier_reason1": "Low price",
+        "sv_outlier_reason2": "Low price per square foot",
+    },
+    "High price (raw & sqft)": {
+        "sv_outlier_reason1": "High price",
+        "sv_outlier_reason2": "High price per square foot",
+    },
+    "High price (sqft)": {
+        "sv_outlier_reason1": "High price per square foot",
+        "sv_outlier_reason2": np.nan,
+    },
+    "Low price (sqft)": {
+        "sv_outlier_reason1": "Low price per square foot",
+        "sv_outlier_reason2": np.nan,
+    },
+    "Home flip sale (high)": {
+        "sv_outlier_reason1": "High price",
+        "sv_outlier_reason2": "Price swing / Home flip",
+    },
+    "Home flip sale (low)": {
+        "sv_outlier_reason1": "Low price",
+        "sv_outlier_reason2": "Price swing / Home flip",
+    },
+    "Family sale (high)": {
"sv_outlier_reason1": "High price", + "sv_outlier_reason2": "Family sale", + }, + "Family sale (low)": { + "sv_outlier_reason1": "Low price", + "sv_outlier_reason2": "Family sale", + }, + "High price swing": { + "sv_outlier_reason1": "High price", + "sv_outlier_reason2": "Price swing / Home flip", + }, + "Low price swing": { + "sv_outlier_reason1": "Low price", + "sv_outlier_reason2": "Price swing / Home flip", + }, +} + +for key in dfs_flag: + dfs_flag[key] = process_dataframe(dfs_flag[key], recode_dict) + + +write_dfs_to_s3(dfs_flag, os.getenv("AWS_BUCKET_SV"), "flag")