-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis_pandemic.py
86 lines (60 loc) · 2.67 KB
/
analysis_pandemic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
# --- Input data -------------------------------------------------------------
DATASET_FILE = "combined.csv"

# --- Period boundaries ------------------------------------------------------
# Rows with TIME before PANDEMIC_DATE form the training set; rows with TIME at
# or after POST_PANDEMIC_DATE are discarded before any processing.
PANDEMIC_DATE = pd.Timestamp("2020-03-01")
POST_PANDEMIC_DATE = pd.Timestamp("2022-04-03")

# --- Random forest hyper-parameters -----------------------------------------
# Passed straight through to RandomForestRegressor.
MAX_DEPTH = 50
MIN_SAMPLES_SPLIT = 10
N_ESTIMATORS = 20
def min_max_scale(features, lower, upper):
    """Rescale *features* column-wise using externally supplied bounds.

    The bounds are passed in (rather than computed from *features*) so that
    the exact same transformation can be applied to the training data and to
    any data the model later predicts on.

    Args:
        features: DataFrame of numeric feature columns.
        lower: Per-column lower bound (Series indexed by column name).
        upper: Per-column upper bound (Series indexed by column name).

    Returns:
        DataFrame holding ``(features - lower) / (upper - lower)``. Columns
        whose range is zero are only shifted by ``lower`` instead of being
        divided by zero, so they do not turn into NaN/inf.
    """
    span = upper - lower
    # A constant column has span 0; divide by 1 instead to keep it finite.
    span = span.where(span != 0, 1)
    return (features - lower) / span


def main():
    """Fit a random forest on pre-pandemic rows and compare its predictions
    with the actual usage, aggregated per week.

    Reads DATASET_FILE, prints the feature importances and the weekly
    MSE/MAE for the weeks after PANDEMIC_DATE, and shows a plot of actual
    versus predicted weekly usage.
    """
    df = pd.read_csv(DATASET_FILE)
    df["TIME"] = pd.to_datetime(df["TIME"])

    # Keep only samples before POST_PANDEMIC_DATE. Copy explicitly because a
    # prediction column is added to this frame further down.
    df = df[df["TIME"] < POST_PANDEMIC_DATE].copy()

    # Pre-pandemic rows are the training set; TIME is not a model input.
    training_df = df[df["TIME"] < PANDEMIC_DATE].drop(columns=["TIME"])
    x_train = training_df.drop(columns=["USAGE"])
    y_train = training_df["USAGE"]

    # Scaling bounds come from the training data only and are reused for
    # prediction, so a given raw value always maps to the same model input.
    lower, upper = x_train.min(), x_train.max()
    x_train = min_max_scale(x_train, lower, upper)

    rf = RandomForestRegressor(
        max_depth=MAX_DEPTH,
        min_samples_split=MIN_SAMPLES_SPLIT,
        n_estimators=N_ESTIMATORS,
    )
    rf.fit(x_train, y_train)

    print("FEATURE IMPORTANCE")
    print(pd.DataFrame({
        "Parameter": list(x_train.columns),
        "Importance": rf.feature_importances_,
    }))

    # Predict for every retained row (pre- and post-pandemic start).
    x_all = min_max_scale(df.drop(columns=["USAGE", "TIME"]), lower, upper)
    df["predicted_USAGE"] = rf.predict(x_all)

    # Aggregate to weekly totals; only the two usage columns are needed.
    weekly = (
        df.set_index("TIME")[["USAGE", "predicted_USAGE"]]
        .resample("W")
        .sum()
        .reset_index()
    )

    # Evaluate only on weeks after the pandemic start (unseen in training).
    pandemic_weeks = weekly[weekly["TIME"] > PANDEMIC_DATE]
    mse_rf = mean_squared_error(pandemic_weeks["USAGE"], pandemic_weeks["predicted_USAGE"])
    mae_rf = mean_absolute_error(pandemic_weeks["USAGE"], pandemic_weeks["predicted_USAGE"])
    print("MSE MAE RF:", mse_rf, mae_rf)

    plt.figure(figsize=(10, 6))
    plt.plot(weekly["TIME"], weekly["USAGE"],
             linestyle='-', marker='', color='blue', label='Actual', linewidth=2)
    plt.plot(pandemic_weeks["TIME"], pandemic_weeks["predicted_USAGE"],
             linestyle='-', marker='', color='red', label='Prediction', linewidth=2)

    # Vertical separator marking the pandemic start.
    plt.axvline(x=PANDEMIC_DATE, color='k', linestyle='--')
    plt.text(PANDEMIC_DATE, plt.ylim()[1] * 0.95, "Pandemic", color='k')

    plt.xlabel('Week')
    plt.ylabel('Bike Usage')
    plt.title('Bike Usage Analysis - Pandemic')
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()