-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathzhvi.py
86 lines (68 loc) · 3.24 KB
/
zhvi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import polars as pl
import pyarrow.parquet as pq
import requests
from io import StringIO
def download_csv_to_dataframe(url):
response = requests.get(url)
if response.status_code == 200:
data = StringIO(response.text)
return pl.read_csv(data)
else:
print(f"Failed to download data from {url}. Status code: {response.status_code}")
return None
url_zhvi = "https://files.zillowstatic.com/research/public_csvs/zhvi/County_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv"
url_fips = "https://raw.githubusercontent.com/kjhealy/fips-codes/master/state_and_county_fips_master.csv"
# url_fips_other = "https://raw.githubusercontent.com/kjhealy/fips-codes/master/county_fips_master.csv"
df_zhvi = download_csv_to_dataframe(url_zhvi)
df_fips = download_csv_to_dataframe(url_fips)
# df_fips_other = download_csv_to_dataframe(url_fips_other)
print(f"Zillow Data: {df_zhvi.shape}")
print(f"FIPS: {df_fips.shape}")
# create a FIPS column in the Zillow Data
# join on the FIPS df to create the necessary amount of rows
df_zhvi = (
df_zhvi
.with_columns([
df_zhvi['StateCodeFIPS'].cast(pl.datatypes.Utf8).alias('StateCodeFIPS_str'),
df_zhvi['MunicipalCodeFIPS'].cast(pl.datatypes.Utf8).str.zfill(3).alias('MunicipalCodeFIPS_str')
])
.with_columns((pl.col('StateCodeFIPS_str') + pl.col('MunicipalCodeFIPS_str')).alias('CombinedFIPS'))
)
df_zhvi.select(['StateCodeFIPS', 'MunicipalCodeFIPS', 'CombinedFIPS']).head()
# drop the StateCodeFIPS and MunicipalCodeFIPS - now that they're combined we don't need them
df_zhvi = df_zhvi.drop(['StateCodeFIPS', 'MunicipalCodeFIPS', 'StateCodeFIPS_str', 'MunicipalCodeFIPS_str', 'RegionName', 'RegionType', 'StateName', 'State'])
# convert the combined FIPS to int
df_zhvi = df_zhvi.with_columns(df_zhvi['CombinedFIPS'].cast(pl.datatypes.Int64).alias('CombinedFIPS'))
# move the combined column to the front
df_zhvi = df_zhvi.select(['CombinedFIPS'] + [col for col in df_zhvi.columns if col != 'CombinedFIPS'])
df_zhvi.head()
zhvi = df_fips.join(df_zhvi, left_on="fips", right_on="CombinedFIPS", how="left")
mask = zhvi["name"].str.to_uppercase() != zhvi["name"]
zhvi = zhvi.filter(mask)
zhvi = zhvi.drop(["state", "RegionID", "name"])
zhvi.head()
# zhvi.write_csv("output_zhvi.csv")
# print(zhvi.shape)
# this is going to calculate the % growth for each county - from 2001 onward (assuming that they have iinfo from 2000)
months = [col for col in zhvi.columns if '-' in col]
for month in months:
year, m, _ = month.split('-')
last_year = str(int(year) - 1)
prev_month_col = f"{last_year}-{m}-31"
if prev_month_col in months:
growth_col_name = f"Growth_{month}"
growth_rate = 100 * (zhvi[month] - zhvi[prev_month_col]) / zhvi[prev_month_col]
zhvi = zhvi.with_columns(growth_rate.alias(growth_col_name))
lslice = 500
values = list(range(0, zhvi.shape[0], lslice))
import os
if not os.path.exists("zhvi_slices"):
os.makedirs("zhvi_slices")
previous = 0
for i in values:
print(str(i))
zhvi.slice(i, lslice).write_parquet("zhvi_slices/zhvi_" + str(i) +".parquet")
# To read them back in:
# df_zhvi = pl.read_parquet("zhvi_slices/*")
print(zhvi.shape)
print(zhvi.head())