-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathViewsEstimators.py
123 lines (101 loc) · 5.11 KB
/
ViewsEstimators.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from typing import Optional, Union
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import XGBRFRegressor, XGBRFClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
#from lightgbm import LGBMClassifier, LGBMRegressor
class HurdleRegression(BaseEstimator):
    """Two-part (hurdle) regression model for targets with excessive zeros.

    Two sub-models are fitted and their predictions are multiplied together:
      1) a binary classifier trained on the indicator ``y > 0``
      2) a regressor trained only on the rows where ``y > 0``

    Implemented as a scikit-learn style estimator (constructor arguments are
    stored verbatim and sub-models are created inside ``fit``).

    Args:
        clf_name: key of the classifier to use. Supported: 'logistic',
            'LGBMClassifier', 'RFClassifier', 'GBMClassifier',
            'XGBClassifier', 'HGBClassifier'.
        reg_name: key of the regressor to use. Supported: 'linear',
            'LGBMRegressor', 'RFRegressor', 'GBMRegressor',
            'XGBRegressor', 'HGBRegressor'.
        clf_params: dict of parameters applied to the classifier sub-model
            via ``set_params`` before fitting.
        reg_params: dict of parameters applied to the regression sub-model
            via ``set_params`` before fitting.

    Attributes:
        clf_fi: ``feature_importances_`` of the fitted classifier, or an
            empty list when the classifier does not expose that attribute.
        reg_fi: same as ``clf_fi`` but for the regressor.
    """

    def __init__(self,
                 clf_name: str = 'logistic',
                 reg_name: str = 'linear',
                 clf_params: Optional[dict] = None,
                 reg_params: Optional[dict] = None):
        self.clf_name = clf_name
        self.reg_name = reg_name
        self.clf_params = clf_params
        self.reg_params = reg_params
        # Placeholders so the attributes can be read before fit() is called.
        self.clf_fi = []
        self.reg_fi = []

    @staticmethod
    def _resolve_estimator(func_name: str):
        """Return a fresh, unfitted estimator for the given lookup key.

        Estimators are referred to by name in the constructor because sklearn
        estimator default arguments must pass an equality test, and
        instantiated sub-estimators are not equal.

        Raises:
            KeyError: if ``func_name`` is not one of the supported keys.
        """
        # (class, constructor kwargs) pairs: only the requested estimator is
        # instantiated instead of building all of them on every call.
        table = {
            'linear': (LinearRegression, {}),
            'logistic': (LogisticRegression, {'solver': 'liblinear'}),
            'LGBMRegressor': (LGBMRegressor, {'n_estimators': 250}),
            'LGBMClassifier': (LGBMClassifier, {'n_estimators': 250}),
            'RFRegressor': (XGBRFRegressor,
                            {'n_estimators': 250, 'n_jobs': -2}),
            'RFClassifier': (XGBRFClassifier,
                             {'n_estimators': 250, 'n_jobs': -2}),
            'GBMRegressor': (GradientBoostingRegressor,
                             {'n_estimators': 200}),
            'GBMClassifier': (GradientBoostingClassifier,
                              {'n_estimators': 200}),
            'XGBRegressor': (XGBRegressor,
                             {'n_estimators': 100, 'learning_rate': 0.05,
                              'n_jobs': -2}),
            'XGBClassifier': (XGBClassifier,
                              {'n_estimators': 100, 'learning_rate': 0.05,
                               'n_jobs': -2}),
            'HGBRegressor': (HistGradientBoostingRegressor,
                             {'max_iter': 200}),
            'HGBClassifier': (HistGradientBoostingClassifier,
                              {'max_iter': 200}),
        }
        try:
            estimator_cls, kwargs = table[func_name]
        except KeyError:
            raise KeyError(
                f'Unknown estimator name {func_name!r}; '
                f'supported names are {sorted(table)}'
            ) from None
        return estimator_cls(**kwargs)

    def fit(self,
            X: Union[np.ndarray, pd.DataFrame],
            y: Union[np.ndarray, pd.Series]):
        """Fit the classifier on ``y > 0`` and the regressor on positive rows.

        Args:
            X: feature matrix with at least two columns; NaN is allowed.
            y: target values.

        Returns:
            self

        Raises:
            ValueError: if ``X`` has fewer than two features.
        """
        X, y = check_X_y(X, y, dtype=None,
                         accept_sparse=False,
                         accept_large_sparse=False,
                         force_all_finite='allow-nan')
        if X.shape[1] < 2:
            raise ValueError('Cannot fit model when n_features = 1')

        positive = y > 0

        self.clf_ = self._resolve_estimator(self.clf_name)
        if self.clf_params:
            self.clf_.set_params(**self.clf_params)
        self.clf_.fit(X, positive)
        # Not every supported model exposes feature_importances_ (e.g. the
        # default LogisticRegression), so fall back to an empty list instead
        # of raising AttributeError.
        self.clf_fi = getattr(self.clf_, 'feature_importances_', [])

        self.reg_ = self._resolve_estimator(self.reg_name)
        if self.reg_params:
            self.reg_.set_params(**self.reg_params)
        self.reg_.fit(X[positive], y[positive])
        self.reg_fi = getattr(self.reg_, 'feature_importances_', [])

        self.is_fitted_ = True
        return self

    def predict_bck(self, X: Union[np.ndarray, pd.DataFrame]):
        """ Predict combined response using binary classification outcome """
        # Same NaN policy as fit(), so data that could be fitted can also be
        # predicted on.
        X = check_array(X, accept_sparse=False, accept_large_sparse=False,
                        force_all_finite='allow-nan')
        check_is_fitted(self, 'is_fitted_')
        return self.clf_.predict(X) * self.reg_.predict(X)

    def predict(self, X: Union[np.ndarray, pd.DataFrame]):
        """ Predict combined response using probabilistic classification outcome """
        # Same NaN policy as fit(), so data that could be fitted can also be
        # predicted on.
        X = check_array(X, accept_sparse=False, accept_large_sparse=False,
                        force_all_finite='allow-nan')
        check_is_fitted(self, 'is_fitted_')
        # Column 1 of predict_proba is the probability of the positive class.
        return self.clf_.predict_proba(X)[:, 1] * self.reg_.predict(X)
def manual_test():
    """Run sklearn's estimator checks, then fit and predict on a fake dataset.

    NOTE(review): HurdleRegression.fit rejects inputs with fewer than two
    features, which may conflict with some of sklearn's checks -- confirm.
    """
    from sklearn.datasets import make_regression

    # check_estimator expects an estimator instance, not the class object.
    check_estimator(HurdleRegression())

    X, y = make_regression()
    reg = HurdleRegression()
    reg.fit(X, y)
    reg.predict(X)
#if __name__ == '__main__':
# manual_test()