forked from Ljwccc/ByteDanceSecurityAI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
83 lines (70 loc) · 3.08 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
# Per-column summary statistics for a DataFrame.
def statics(data):
    """Summarize every column of `data`.

    Returns a DataFrame with one row per column: unique-value count,
    percentage of missing values, percentage held by the single largest
    category (NaN counted as a category), and dtype — sorted by missing
    percentage, descending.
    """
    n_rows = data.shape[0]
    rows = [
        (
            col,
            data[col].nunique(),
            data[col].isnull().sum() * 100 / n_rows,
            data[col].value_counts(normalize=True, dropna=False).values[0] * 100,
            data[col].dtype,
        )
        for col in data.columns
    ]
    summary = pd.DataFrame(rows, columns=['Feature', 'Unique_values', 'Percentage_of_missing_values',
                                          'Percentage_of_values_in_the_biggest category', 'type'])
    summary.sort_values('Percentage_of_missing_values', ascending=False, inplace=True)
    return summary
# Category frequency feature.
def freq_enc(df, col):
    """Add a '<col>_freq' column holding each value's normalized frequency
    within `col` (NaNs excluded from the counts, so they map to NaN).
    Mutates and returns `df`.
    """
    # Series.map accepts a Series and uses its index as the lookup table,
    # which is equivalent to mapping through value_counts().to_dict().
    df[f'{col}_freq'] = df[col].map(df[col].value_counts(dropna=True, normalize=True))
    return df
# label_ebcoder
from sklearn.preprocessing import LabelEncoder
def label_enc(df, cat_cols):
for col in cat_cols:
df[col] = df[col].astype(str)
lbl = LabelEncoder().fit(df[col])
df[col] = lbl.transform(df[col])
return df
# Word2Vec aggregation: embed the sequence of f2 values grouped under each f1 key.
from gensim.models import Word2Vec
def emb(df, f1, f2):
    """For each value of `f1`, collect its sequence of `f2` values, train a
    16-dim Word2Vec model over all sequences, and return a DataFrame with
    `f1` plus columns '<f1>_<f2>_emb_0' .. '<f1>_<f2>_emb_15' holding the
    mean vector of each sequence's in-vocabulary tokens.

    Fixes two API breakages in the original:
    - SeriesGroupBy.agg({name: list}) (nested renaming) raises
      SpecificationError on pandas >= 1.0;
    - Word2Vec(size=..., iter=...) and model.wv.vocab were removed in
      gensim 4 (replaced by vector_size=, epochs=, wv.key_to_index).
    """
    emb_size = 16
    tmp = df.groupby(f1, as_index=False)[f2].agg(list)
    sentences = [[str(x) for x in seq] for seq in tmp[f2]]
    del tmp[f2]
    try:
        # gensim >= 4 keyword names
        model = Word2Vec(sentences, vector_size=emb_size, window=6,
                         min_count=5, sg=0, hs=0, seed=1, epochs=5)
        vocab = model.wv.key_to_index
    except TypeError:
        # gensim 3.x fallback: old keyword names and vocab attribute
        model = Word2Vec(sentences, size=emb_size, window=6,
                         min_count=5, sg=0, hs=0, seed=1, iter=5)
        vocab = model.wv.vocab
    emb_matrix = []
    for seq in sentences:
        vecs = [model.wv[w] for w in seq if w in vocab]
        if vecs:
            emb_matrix.append(np.mean(vecs, axis=0))
        else:
            # No token met min_count: fall back to the zero vector.
            emb_matrix.append([0] * emb_size)
    emb_matrix = np.array(emb_matrix)
    for i in range(emb_size):
        tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i]
    return tmp
# Downcast numeric columns to the smallest dtype that safely holds their values.
def reduce_mem_usage(df, verbose=True):
    """Shrink each numeric column of `df` in place to the narrowest dtype
    covering its observed min/max; optionally print the memory savings.

    Returns the same DataFrame.

    Fixes from the original:
    - `tqdm` was used without being imported anywhere in the file
      (NameError on first call); the progress bar was cosmetic, so plain
      iteration replaces it.
    - Integers were unconditionally cast to int32, silently corrupting
      int64 values outside int32 range; now the computed min/max pick the
      narrowest safe integer type.

    NOTE(review): floats may still be downcast to float16, which loses
    precision beyond ~3 decimal digits — kept from the original behavior.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df