import deepchem as dc
import numpy as np
import pandas as pd
import itertools
from sklearn.ensemble import RandomForestRegressor


def print_progress(i, n):
    # render a 20-character progress bar, e.g. [#####               ]
    filled = int(20 * ((i + 1) / n))
    print('\r[%s%s]' % ('#' * filled, ' ' * (20 - filled)), end='')
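
# Example (an illustration, not part of the original file):
# print_progress(49, 200) redraws the bar at 25% completion: [#####               ]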


def grid_search_graph_conv(train_set, hyper_params, folds=5):
    # hyper_params keys must be ordered: batch_size, conv_layers, layer_sizes, dropout_rate
    params = list(hyper_params.values())
    n_of_tries = len(list(itertools.product(*params)))
    search_results = []
    # split dataset into folds
    splitter = dc.splits.RandomSplitter()
    fold_sets = splitter.k_fold_split(train_set, folds)
    # track the best hyperparameters found so far
    best_score = 1e10
    best_params = None
    # try all possible combinations of hyperparameters
    for i, (batch_size, conv_layers, layer_sizes, dropout_rate) in enumerate(itertools.product(*params)):
        rmse_scores = []
        for train, valid in fold_sets:
            # preprocess data: normalize targets using statistics of the training fold
            transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train, move_mean=True)]
            for transformer in transformers:
                train = transformer.transform(train)
                valid = transformer.transform(valid)
            # instantiate and fit model
            model = dc.models.GraphConvModel(1, mode='regression', batch_size=batch_size,
                                             graph_conv_layers=conv_layers,
                                             dense_layer_size=layer_sizes, dropout=dropout_rate)
            model.fit(train, nb_epoch=50)
            # evaluate model
            metric = dc.metrics.Metric(dc.metrics.rms_score, np.mean)
            rmse = model.evaluate(valid, [metric], transformers)['mean-rms_score']
            rmse_scores.append(rmse)
        average_rmse = np.mean(rmse_scores)
        # save best hyperparameters
        if average_rmse < best_score:
            best_score = average_rmse
            best_params = (batch_size, conv_layers, layer_sizes, dropout_rate)
        search_results.append([average_rmse, batch_size, conv_layers, layer_sizes, dropout_rate])
        print_progress(i, n_of_tries)
    search_results = pd.DataFrame(
        search_results,
        columns=['rmse', 'batch_size', 'conv_layers', 'layer_sizes', 'dropout_rate']).sort_values(by='rmse')
    return search_results, best_params
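
# Example usage (a minimal sketch, not from the original file): `train_set` is assumed
# to be a DeepChem dataset featurized with dc.feat.ConvMolFeaturizer, as required by
# GraphConvModel. Keys must appear in the order unpacked above.
#
#   hyper_params = {
#       'batch_size': [32, 64],
#       'conv_layers': [[64, 64], [128, 128]],
#       'layer_sizes': [128, 256],
#       'dropout_rate': [0.0, 0.2],
#   }
#   results, best_params = grid_search_graph_conv(train_set, hyper_params, folds=5)
#   print(results.head())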


def grid_search_mpnn(train_set, hyper_params, folds=5):
    # hyper_params keys must be ordered: batch_size, n_atom_feat, n_pair_feat, n_hidden
    params = list(hyper_params.values())
    n_of_tries = len(list(itertools.product(*params)))
    search_results = []
    # split dataset into folds
    splitter = dc.splits.RandomSplitter()
    fold_sets = splitter.k_fold_split(train_set, folds)
    # track the best hyperparameters found so far
    best_score = 1e10
    best_params = None
    # try all possible combinations of hyperparameters
    for i, (batch_size, n_atom_feat, n_pair_feat, n_hidden) in enumerate(itertools.product(*params)):
        rmse_scores = []
        for train, valid in fold_sets:
            # preprocess data: normalize targets using statistics of the training fold
            transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train, move_mean=True)]
            for transformer in transformers:
                train = transformer.transform(train)
                valid = transformer.transform(valid)
            # instantiate and fit model
            model = dc.models.MPNNModel(1, mode='regression', batch_size=batch_size, use_queue=False,
                                        n_atom_feat=n_atom_feat, n_pair_feat=n_pair_feat, n_hidden=n_hidden,
                                        learning_rate=0.0001, T=3, M=5)
            model.fit(train, nb_epoch=50, checkpoint_interval=100)
            # evaluate model
            metric = dc.metrics.Metric(dc.metrics.rms_score, np.mean)
            rmse = model.evaluate(valid, [metric], transformers)['mean-rms_score']
            rmse_scores.append(rmse)
        average_rmse = np.mean(rmse_scores)
        # save best hyperparameters
        if average_rmse < best_score:
            best_score = average_rmse
            best_params = (batch_size, n_atom_feat, n_pair_feat, n_hidden)
        search_results.append([average_rmse, batch_size, n_atom_feat, n_pair_feat, n_hidden])
        print_progress(i, n_of_tries)
    search_results = pd.DataFrame(
        search_results,
        columns=['rmse', 'batch_size', 'n_atom_feat', 'n_pair_feat', 'n_hidden']).sort_values(by='rmse')
    return search_results, best_params
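
# Example usage (a minimal sketch, not from the original file): MPNNModel expects a
# dataset featurized with dc.feat.WeaveFeaturizer, which by default produces 75 atom
# features and 14 pair features; the values below are illustrative assumptions.
#
#   hyper_params = {
#       'batch_size': [16, 32],
#       'n_atom_feat': [75],
#       'n_pair_feat': [14],
#       'n_hidden': [50, 100],
#   }
#   results, best_params = grid_search_mpnn(train_set, hyper_params, folds=5)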


def grid_search_random_forest(train_set, hyper_params, folds=5):
    # hyper_params keys must be ordered: n_estimators, criterion, max_features
    params = list(hyper_params.values())
    n_of_tries = len(list(itertools.product(*params)))
    search_results = []
    # split dataset into folds
    splitter = dc.splits.RandomSplitter()
    fold_sets = splitter.k_fold_split(train_set, folds)
    # track the best hyperparameters found so far
    best_score = 1e10
    best_params = None
    # try all possible combinations of hyperparameters
    for i, (n_estimators, criterion, max_features) in enumerate(itertools.product(*params)):
        rmse_scores = []
        for train, valid in fold_sets:
            # instantiate and fit model
            sklearn_model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion,
                                                  max_features=max_features, random_state=0)
            model = dc.models.SklearnModel(sklearn_model)
            model.fit(train)
            # evaluate model
            metric = dc.metrics.Metric(dc.metrics.rms_score, np.mean)
            rmse = model.evaluate(valid, [metric], [])['mean-rms_score']
            rmse_scores.append(rmse)
        average_rmse = np.mean(rmse_scores)
        # save best hyperparameters
        if average_rmse < best_score:
            best_score = average_rmse
            best_params = (n_estimators, criterion, max_features)
        search_results.append([average_rmse, n_estimators, criterion, max_features])
        print_progress(i, n_of_tries)
    search_results = pd.DataFrame(
        search_results,
        columns=['rmse', 'n_estimators', 'criterion', 'max_features']).sort_values(by='rmse')
    return search_results, best_params
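
# Example usage (a minimal sketch, not from the original file): the random-forest search
# expects fixed-length descriptors such as ECFP fingerprints (dc.feat.CircularFingerprint).
# `criterion` values follow scikit-learn's RandomForestRegressor ('mse' in older releases,
# 'squared_error' in newer ones).
#
#   hyper_params = {
#       'n_estimators': [100, 300],
#       'criterion': ['mse'],
#       'max_features': ['sqrt', 'log2', None],
#   }
#   results, best_params = grid_search_random_forest(train_set, hyper_params, folds=5)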