bert_model.py
from functools import partial
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from sklearn.model_selection import train_test_split
import fastai
import fastai.text
import pandas as pd
import torch


class FastAiBertTokenizer(fastai.text.BaseTokenizer):
    """Wrapper around BertTokenizer to be compatible with fast.ai"""

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int = 128, **kwargs):
        # fastai's BaseTokenizer expects a language code, so default to English
        super(FastAiBertTokenizer, self).__init__(kwargs.pop('lang', 'en'))
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        # fastai's Tokenizer calls tok_func to build a tokenizer; returning self
        # lets an already constructed instance be passed in directly
        return self

    def tokenizer(self, t: str):
        """Adds BERT's special tokens and limits the maximum sequence length"""
        return ["[CLS]"] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + ["[SEP]"]


def train_bert(path):
    """
    Trains BERT using the already created csv files and scores the AAPL sentences.
    This implementation requires fastai.
    Requires:
        - Sentences_AllAgree_preprocessed.csv
        - pre_processed_aapl_sentences.csv
    Creates:
        - aapl_bert.csv
    """
    # Labelled training sentences and the unlabelled AAPL sentences to score
    train = pd.read_csv(path + 'Sentences_AllAgree_preprocessed.csv')
    test = pd.read_csv(path + 'pre_processed_aapl_sentences.csv', index_col=None, engine='python')
    test.dropna(inplace=True)
    # Reset the index so the predictions can be joined back positionally later on
    test.reset_index(drop=True, inplace=True)
    test2 = pd.DataFrame(test['text'])
    test2.columns = ['sentence']

    # Hold out 20% of the labelled data for validation
    train_1, val = train_test_split(train, shuffle=True, test_size=0.2, random_state=42)

    # Wrap the pretrained BERT tokenizer and vocabulary so fastai can use them
    bert_tok = BertTokenizer.from_pretrained("bert-base-uncased")
    fastai_bert_vocab = fastai.text.Vocab(list(bert_tok.vocab.keys()))
    fastai_tokenizer = fastai.text.Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=256),
                                             pre_rules=[],
                                             post_rules=[])

    label_cols = ["label_negative", "label_neutral", "label_positive"]
    databunch_1 = fastai.text.TextDataBunch.from_df(".", train_1, val,
                                                    test_df=test2,
                                                    tokenizer=fastai_tokenizer,
                                                    vocab=fastai_bert_vocab,
                                                    include_bos=False,
                                                    include_eos=False,
                                                    text_cols="sentence",
                                                    label_cols=label_cols,
                                                    bs=32,
                                                    collate_fn=partial(fastai.text.pad_collate,
                                                                       pad_first=False,
                                                                       pad_idx=0),
                                                    )

    # Pretrained BERT with a 3-class classification head (negative/neutral/positive)
    bert_model_class = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
    learner = fastai.text.Learner(databunch_1,
                                  bert_model_class,
                                  loss_func=torch.nn.BCEWithLogitsLoss(),
                                  model_dir=path + 'temp/model',
                                  metrics=partial(fastai.text.accuracy_thresh, thresh=.25)
                                  )

    def bert_class_split(model):
        """Splits BERT into layer groups for discriminative learning rates and weight decay"""
        embedder = model.bert.embeddings
        pooler = model.bert.pooler
        encoder = model.bert.encoder
        classifier = [model.dropout, model.classifier]
        # Three contiguous groups of encoder layers, so every layer belongs to a group
        n = len(encoder.layer) // 3
        return [[embedder], list(encoder.layer[:n]), list(encoder.layer[n:2 * n]),
                list(encoder.layer[2 * n:]), [pooler], classifier]

    x = bert_class_split(bert_model_class)
    # Split into five parameter groups; the pooler group (x[4]) is left out
    learner.split([x[0], x[1], x[2], x[3], x[5]])
    learner.fit_one_cycle(2,
                          slice(5e-6, 5e-5),
                          moms=(0.8, 0.7),
                          pct_start=0.2,
                          wd=(1e-7, 1e-5, 1e-4, 1e-3, 1e-1))  # parameters resulted from previous parameter tuning

    # Score the AAPL sentences and attach per-class probabilities and one-hot labels
    preds = learner.get_preds(ds_type=fastai.basic_data.DatasetType.Test)
    test_res = pd.DataFrame(preds[0].tolist())
    test = pd.concat([test, test_res], axis=1)
    test = test[['article_time', 'text', 0, 1, 2]]
    tmp = pd.get_dummies(test[[0, 1, 2]].idxmax(axis=1))
    tmp.columns = ['neg', 'neut', 'pos']
    test.columns = ['article_time', 'text', 'neg_prob', 'neut_prob', 'pos_prob']
    test = pd.concat([test, tmp], axis=1)
    test.to_csv(path + 'aapl_bert.csv')
    return test
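

if __name__ == '__main__':
    # Minimal usage sketch. The directory below is an assumption: point it at the folder
    # containing Sentences_AllAgree_preprocessed.csv and pre_processed_aapl_sentences.csv
    # (a trailing slash is needed because the paths above are built by string concatenation).
    example_path = './data/'  # hypothetical location of the preprocessed csv files
    results = train_bert(example_path)
    print(results.head())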