-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprep.py
57 lines (48 loc) · 1.8 KB
/
prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os, sys, json
def gen_vocab(path):
    """Build word and tag index tables from a whitespace-tokenised corpus.

    Each line is split on whitespace. The first token of a line is ignored;
    after it, tokens at odd positions are treated as words (lower-cased) and
    tokens at even positions as tags.

    Args:
        path: Path of the corpus file to read.

    Returns:
        A ``(vocab, tags)`` tuple of dicts mapping string -> integer id.
        ``vocab`` always starts with the reserved entries ``'#pad#'`` (0),
        ``'<s>'`` (1), ``'</s>'`` (2) and ``'<unk>'`` (3); further ids are
        handed out in order of first appearance. ``tags`` ids start at 0.
    """
    vocab = {'#pad#': 0, '<s>': 1, '</s>': 2, '<unk>': 3}
    tags = {}
    # Plain text mode: the old 'U' flag is rejected by Python 3.11+, and the
    # context manager guarantees the handle is closed.
    with open(path) as corpus:
        for line in corpus:
            for pos, token in enumerate(line.split()):
                if pos == 0:
                    # Leading field of the line is not a word or a tag.
                    continue
                if pos % 2 == 1:
                    vocab.setdefault(token.lower(), len(vocab))
                else:
                    tags.setdefault(token, len(tags))
    return vocab, tags
def make_indices(path, vocab, tags):
    """Convert a tagged corpus file into 5-token context windows and labels.

    Each line is split on whitespace; the first token is ignored and the rest
    is read as alternating ``word tag`` pairs. Words are lower-cased and
    mapped through ``vocab`` (unknown words map to ``vocab['<unk>']``).

    Args:
        path: Path of the corpus file to read.
        vocab: Dict mapping word -> id. Must contain ``'<s>'`` and ``'</s>'``,
            and ``'<unk>'`` if any word in the file is missing from it.
        tags: Dict mapping tag -> id. It is updated in place: a tag not yet
            present is assigned the next free id.

    Returns:
        A ``(data_x, data_y)`` tuple with one entry per word in the file.
        ``data_x[k]`` is ``[w-2, w-1, w, w+1, w+2]`` as word ids, where
        positions before the sentence start are filled with the ``'<s>'`` id
        and positions past the end with the ``'</s>'`` id. ``data_y[k]`` is a
        one-element list holding the tag id of the centre word.

    Raises:
        IndexError: If a line ends with a word that has no following tag.
    """
    sentences = []
    # Plain text mode: the old 'U' flag is rejected by Python 3.11+, and the
    # context manager guarantees the handle is closed.
    with open(path) as corpus:
        for line in corpus:
            tokens = line.split()
            pairs = []
            for i in range(1, len(tokens), 2):
                word = tokens[i].lower()
                word_id = vocab[word] if word in vocab else vocab['<unk>']
                tag = tokens[i + 1]
                if tag not in tags:
                    # Tags unseen so far get a fresh id (mutates caller's dict).
                    tags[tag] = len(tags)
                pairs.append((word_id, tags[tag]))
            sentences.append(pairs)

    data_x = []
    data_y = []
    st, ed = vocab['<s>'], vocab['</s>']
    for sent in sentences:
        n = len(sent)
        for i, (word_id, tag_id) in enumerate(sent):
            window = [
                sent[i - 2][0] if i >= 2 else st,
                sent[i - 1][0] if i >= 1 else st,
                word_id,
                sent[i + 1][0] if i + 1 < n else ed,
                sent[i + 2][0] if i + 2 < n else ed,
            ]
            data_x.append(window)
            data_y.append([tag_id])
    return data_x, data_y
# Vocabulary and tag set are built from the training split only.
vocab, tags = gen_vocab('../data/pos/train')

# make_indices adds any tag it has not seen before to `tags`, so the dict is
# only final once all three splits have been processed.
train_x, train_y = make_indices('../data/pos/train', vocab, tags)
test_x, test_y = make_indices('../data/pos/test', vocab, tags)
dev_x, dev_y = make_indices('../data/pos/dev', vocab, tags)

data = {
    'train_x': train_x,
    'train_y': train_y,
    'test_x': test_x,
    'test_y': test_y,
    'dev_x': dev_x,
    'dev_y': dev_y,
    'vocab': vocab,
    'tags': tags,
}

# json.dump emits text, so the file must be opened in text mode ('wb' fails
# on Python 3); the context manager flushes and closes the output.
with open('data.json', 'w') as out:
    json.dump(data, out)