-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preprocessing.py
117 lines (98 loc) · 4.21 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy
# Constants

# Values that mark a feature entry as broken; the helpers in this module
# compare entries against this list.
BROKEN_FEATURE_VALUES = [-512]
RAND = 1234  # fixed random state
# Due to the resolution of the CPU timer, runtimes below 0.01 seconds are
# measured as 0 seconds. To make yi = log(ri) well defined in these cases,
# we count them as 0.005 (which, in log space, has the same distance from
# 0.01 as the next bigger value measurable with our CPU timer, 0.02).
LOG10_TRESHOLD = 5e-3
STD_TRESHOLD = 1e-6  # for constant columns removal
# functions for calculating mean and st. dev. vectors for feature matrix X;
# ignore broken features by default
def mean(X, ignore_broken_features=True):
    """Return the per-column mean of feature matrix X.

    Args:
        X: 2-D array; rows are samples, columns are features.
        ignore_broken_features: when true, entries equal to a value in
            BROKEN_FEATURE_VALUES are left out of their column's mean.

    Returns:
        1-D float array holding one mean per column of X.
    """
    n_cols = X.shape[1]
    col_means = numpy.zeros(n_cols)
    for col in range(n_cols):
        values = X[:, col]
        if ignore_broken_features:
            # Keep only the entries that are not flagged as broken.
            values = values[~numpy.isin(values, BROKEN_FEATURE_VALUES)]
        col_means[col] = numpy.mean(values)
    return col_means
def std(X, ignore_broken_features=True):
    """Return the per-column standard deviation of feature matrix X.

    Args:
        X: 2-D array; rows are samples, columns are features.
        ignore_broken_features: when true, entries equal to a value in
            BROKEN_FEATURE_VALUES are left out of their column's statistic.

    Returns:
        1-D float array holding one standard deviation (numpy.std defaults)
        per column of X.
    """
    n_cols = X.shape[1]
    col_stds = numpy.zeros(n_cols)
    for col in range(n_cols):
        values = X[:, col]
        if ignore_broken_features:
            # Keep only the entries that are not flagged as broken.
            values = values[~numpy.isin(values, BROKEN_FEATURE_VALUES)]
        col_stds[col] = numpy.std(values)
    return col_stds
# function for removing constant columns (features) from feature matrix X
def remove_const_cols(X):
    """Drop (near-)constant columns from feature matrix X.

    A column counts as constant when its standard deviation, as returned by
    std(X) with its default arguments, is below STD_TRESHOLD.

    Args:
        X: 2-D feature matrix; rows are samples, columns are features.

    Returns:
        A tuple (X_reduced, remove_indices): X without the constant columns
        (as produced by numpy.delete) and the integer array of the removed
        column indices, which may be empty.
    """
    std_vec = std(X)
    # The indices must be integers: numpy.delete rejects float index arrays
    # (even empty ones) in recent NumPy versions, and growing a bare
    # numpy.array([]) with numpy.append yields a float64 array.
    remove_indices = numpy.array(
        [i for i, col_std in enumerate(std_vec) if col_std < STD_TRESHOLD],
        dtype=int,
    )
    return numpy.delete(X, remove_indices, axis=1), remove_indices
# function for subtracting mean from feature matrix X; ignore broken features by default
# (in which case we're setting broken feature values to 0)
def center(X, mean_vec, ignore_broken_features=True):
    """Subtract the per-column mean from feature matrix X.

    Args:
        X: 2-D feature matrix; rows are samples, columns are features.
        mean_vec: one mean per column of X.
        ignore_broken_features: when true, entries of X equal to a value in
            BROKEN_FEATURE_VALUES are set to 0 in the result instead of
            being centered; when false, every entry is centered.

    Returns:
        A new float array with the shape of X; X itself is not modified.
    """
    X = numpy.asarray(X)
    X_centered = numpy.asarray(X - numpy.asarray(mean_vec), dtype=float)
    if ignore_broken_features:
        # Broken entries carry no information, so they are zeroed rather
        # than centered. (A bitwise `~` on the Python bool flag is always
        # truthy and would disable this branch, hence the plain `if`.)
        X_centered[numpy.isin(X, BROKEN_FEATURE_VALUES)] = 0
    return X_centered
# function for standardizing feature matrix X (mean 0, std 1); ignore broken features by default
# (in which case we're setting broken feature values to 0)
def standardize(X, mean_vec, std_vec, ignore_broken_features=True):
    """Scale feature matrix X column-wise to (X - mean) / std.

    Args:
        X: 2-D feature matrix; rows are samples, columns are features.
        mean_vec: one mean per column of X.
        std_vec: one standard deviation per column of X. Zero entries are
            not guarded against here.
        ignore_broken_features: when true, entries of X equal to a value in
            BROKEN_FEATURE_VALUES are set to 0 in the result instead of
            being scaled; when false, every entry is scaled.

    Returns:
        A new float array with the shape of X; X itself is not modified.
    """
    X = numpy.asarray(X)
    X_scaled = numpy.asarray(
        (X - numpy.asarray(mean_vec)) / numpy.asarray(std_vec), dtype=float
    )
    if ignore_broken_features:
        # Broken entries carry no information, so they are zeroed rather
        # than scaled. (A bitwise `~` on the Python bool flag is always
        # truthy and would disable this branch, hence the plain `if`.)
        X_scaled[numpy.isin(X, BROKEN_FEATURE_VALUES)] = 0
    return X_scaled
# function for quadratic expansion of feature matrix
def calculate_interactions(X, ignore_broken_features=True):
    """Append all pairwise feature products (including squares) to X.

    For every column pair (j, k) with j <= k, ordered by j and then by k,
    one new column holding X[:, j] * X[:, k] is produced.

    Args:
        X: 2-D feature matrix; rows are samples, columns are features.
        ignore_broken_features: when true, a product whose first factor is
            in BROKEN_FEATURE_VALUES is replaced by that factor's value;
            otherwise, if the second factor is broken, by the second
            factor's value. When false, plain products are used everywhere.

    Returns:
        Array of shape (rows, n + n * (n + 1) // 2), where n is the number
        of columns of X: the original columns followed by the interaction
        columns.
    """
    num_features = X.shape[1]
    # Row-major upper-triangle indices enumerate the (j, k >= j) pairs in
    # exactly the column order described above.
    first_idx, second_idx = numpy.triu_indices(num_features)
    first = X[:, first_idx]
    second = X[:, second_idx]
    X_int = numpy.asarray(first * second, dtype=float)
    if ignore_broken_features:
        second_broken = numpy.isin(second, BROKEN_FEATURE_VALUES)
        first_broken = numpy.isin(first, BROKEN_FEATURE_VALUES)
        # Apply the second factor first so that a broken first factor,
        # written afterwards, takes precedence when both are broken.
        X_int[second_broken] = second[second_broken]
        X_int[first_broken] = first[first_broken]
    return numpy.hstack((X, X_int))
# function for log10 transformation of response variable
def log10_transform(Y, use_treshold=True):
    """Return the base-10 logarithm of response values Y.

    Args:
        Y: array-like of response values.
        use_treshold: when true, values below LOG10_TRESHOLD are raised to
            LOG10_TRESHOLD before the logarithm is taken, so that zero
            measurements stay finite.

    Returns:
        Array of log10 values. The input Y is left unmodified.
    """
    if use_treshold:
        # numpy.maximum builds a new array, so the caller's data is not
        # clipped in place, and integer input is promoted to float instead
        # of having the threshold truncated to 0.
        Y = numpy.maximum(Y, LOG10_TRESHOLD)
    return numpy.log10(Y)
# function for setting broken feature values to mean value for that feature
def handle_broken_features(X, mean_vec):
    """Replace broken entries of X by the given per-column value.

    Args:
        X: 2-D feature matrix; rows are samples, columns are features.
        mean_vec: replacement value for each column, indexed by column.

    Returns:
        A new float array with the shape of X in which every entry equal to
        a value in BROKEN_FEATURE_VALUES is replaced by mean_vec[column];
        all other entries are copied from X.
    """
    repaired = numpy.zeros(X.shape)
    for col in range(X.shape[1]):
        column = X[:, col]
        is_broken = numpy.isin(column, BROKEN_FEATURE_VALUES)
        repaired[:, col] = numpy.where(is_broken, mean_vec[col], column)
    return repaired