generated from KSUDS/p4_machinelearning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
84 lines (76 loc) · 2.48 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# %%
# Install the `dalex` explainability package into the active kernel.
# NOTE: the `!` prefix is IPython/Jupyter shell magic — this cell only runs
# inside a notebook / interactive session, not as a plain .py script.
import sys
!{sys.executable} -m pip install dalex
# %%
import pandas as pd
import numpy as np
import joblib # to save ML models
from plotnine import *
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
# %%
# Load the pre-processed housing data produced by an earlier pipeline step.
# Assumes dat_ml.pkl sits in the current working directory — TODO confirm.
dat_ml = pd.read_pickle('dat_ml.pkl')
# %%
# Separate features from the target. `yrbuilt` is dropped alongside the
# target because `before1980` is derived from it (it would leak the answer).
X_pred = dat_ml.drop(columns=['yrbuilt', 'before1980'])
y_pred = dat_ml['before1980']
# Hold out 34% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_pred,
    y_pred,
    test_size=0.34,
    random_state=76,
)
# %%
# Fit two baseline classifiers on the training split, then score the
# held-out rows. `.fit` returns the estimator, so chaining is equivalent
# to the separate construct-then-fit calls.
clfNB = GaussianNB().fit(X_train, y_train)                   # naive Bayes baseline
clfGB = GradientBoostingClassifier().fit(X_train, y_train)   # boosted trees
ypred_clfNB = clfNB.predict(X_test)
ypred_clfGB = clfGB.predict(X_test)
# %%
# Per-class probabilities from the boosted model (kept for later inspection).
ypred_clfGB_prop = clfGB.predict_proba(X_test)
# %%
# ROC curves for both models on the held-out set.
# FIX: `metrics.plot_roc_curve` was deprecated in scikit-learn 1.0 and
# removed in 1.2; `RocCurveDisplay.from_estimator` is the supported
# replacement and produces the same plot.
metrics.RocCurveDisplay.from_estimator(clfGB, X_test, y_test)
metrics.RocCurveDisplay.from_estimator(clfNB, X_test, y_test)
# %%
# Confusion matrix for the naive Bayes model (rows = true class,
# columns = predicted class). Bare expression: displays only in a notebook.
metrics.confusion_matrix(y_test, ypred_clfNB)
# %%
# Confusion matrix for the gradient-boosting model, for side-by-side
# comparison with the naive Bayes matrix above.
metrics.confusion_matrix(y_test, ypred_clfGB)
# %%
# Rank features by gradient-boosting importance and keep the top 12.
importance = pd.DataFrame({
    'f_names': X_train.columns,
    'f_values': clfGB.feature_importances_,
})
df_features = importance.sort_values('f_values', ascending=False).head(12)
# %%
# Reversing the name order ([::-1] — Python's reverse-slice idiom) makes the
# most important feature land at the top once coord_flip() rotates the bars.
reversed_names = df_features['f_names'][::-1]
df_features = df_features.assign(
    f_cat=pd.Categorical(df_features['f_names'], categories=reversed_names)
)
# Horizontal bar chart of the top-12 feature importances.
(
    ggplot(df_features, aes(x='f_cat', y='f_values'))
    + geom_col()
    + coord_flip()
    + theme_bw()
)
# %%
# Retrain the boosted model using only the top-importance columns, so the
# deployed model needs far fewer inputs.
compVars = df_features.f_names[::-1].tolist()
X_pred_reduced = dat_ml.filter(items=compVars, axis=1)
y_pred = dat_ml['before1980']
# Same split fractions and seed as the full model, for a fair comparison.
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
    X_pred_reduced,
    y_pred,
    test_size=0.34,
    random_state=76,
)
clfGB_reduced = GradientBoostingClassifier().fit(X_train_reduced, y_train)
ypred_clfGB_red = clfGB_reduced.predict(X_test_reduced)
# %%
# Evaluate the reduced model on its held-out set.
# BUG FIX: classification_report takes (y_true, y_pred) in that order; the
# original passed the predictions first, which silently swaps precision and
# recall (and the support counts) in the printed report.
print(metrics.classification_report(y_test, ypred_clfGB_red))
metrics.confusion_matrix(y_test, ypred_clfGB_red)
# %%
# Persist the fitted models for later serving.
# NOTE(review): assumes a models/ directory already exists — joblib.dump
# raises FileNotFoundError otherwise; confirm or create it first.
joblib.dump(clfNB, 'models/clfNB.pkl')
joblib.dump(clfGB, 'models/clfGB.pkl')
joblib.dump(clfGB_reduced, 'models/clfGB_final.pkl')
# Save the reduced feature list in the same (reversed) order used to build
# X_train_reduced, so serving code can reconstruct the exact column order.
df_features.f_names[::-1].to_pickle('models/compVars.pkl')
# %%