reader.py
"""
Utilities for reading Switchboard files
"""
import numpy as np
from tensorflow.contrib import learn
import os


def _read_data(filename):
    """Reads a text file and returns a list of its lines."""
    with open(filename) as fp:
        lines = fp.readlines()
    return lines


def _length(sentences):
    """Returns the token count of each sentence."""
    sn_length = [len(sn.split()) for sn in sentences]
    return sn_length


def _mask(sentences, max_length):
    """
    - Builds a mask array used to ignore padded positions when computing precision, recall and F-score
    Args:
        sentences: a list of input sentences
        max_length: maximum length used for padding sentences
    Returns:
        mask_array: array of shape (len(sentences) * max_length, 1) with 1.0 at
            positions covered by real tokens and 0.0 at padded positions
    """
    sn_length = _length(sentences)
    mask_array = np.zeros((len(sn_length) * max_length, 1), dtype=np.float64)
    row_num = 0
    for length in sn_length:
        # Mark the real tokens of this sentence, then skip over its padding.
        mask_array[row_num:row_num + length] = 1
        row_num += max_length
    return mask_array
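
# Illustrative sketch (not part of the original file): for two sentences of
# lengths 2 and 3 with max_length=4, _mask marks only the real token positions.
#
#     _mask(["a b", "c d e"], 4).ravel()
#     # -> array([1., 1., 0., 0., 1., 1., 1., 0.])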


def swbd_data(data_path=None):
    """
    - Loads the Switchboard input and output files from the directory `data_path`,
    - Then reads the files and converts strings to integer ids,
    - Finally builds mask arrays for the input files.
    Args:
        data_path: string path to the dir where the train, dev and test input and output files are stored
            (check out ./sample_data for the input format)
    Returns:
        tuple (
            train_input_ids,
            dev_input_ids,
            test_input_ids,
            train_output_ids,
            dev_output_ids,
            test_output_ids,
            train_mask,
            dev_mask,
            test_mask,
            max_length,
            input_vocab_processor
        ): where each of the data objects can be passed to swbd_minibatches
    """
    train_input_data = _read_data(os.path.join(data_path, "swbd.train.txt"))
    dev_input_data = _read_data(os.path.join(data_path, "swbd.dev.txt"))
    test_input_data = _read_data(os.path.join(data_path, "swbd.test.txt"))
    max_length = max(_length(train_input_data))
    input_vocab_processor = learn.preprocessing.VocabularyProcessor(max_length, min_frequency=1)
    train_input_ids = np.array(list(input_vocab_processor.fit_transform(train_input_data)))
    dev_input_ids = np.array(list(input_vocab_processor.transform(dev_input_data)))
    test_input_ids = np.array(list(input_vocab_processor.transform(test_input_data)))
    train_output_data = _read_data(os.path.join(data_path, "swbd.train.label.txt"))
    dev_output_data = _read_data(os.path.join(data_path, "swbd.dev.label.txt"))
    test_output_data = _read_data(os.path.join(data_path, "swbd.test.label.txt"))
    label_vocab = {'F': 0, 'E': 1}
    output_vocab_processor = learn.preprocessing.VocabularyProcessor(max_length, vocabulary=label_vocab)
    train_output_ids = np.array(list(output_vocab_processor.transform(train_output_data)))
    dev_output_ids = np.array(list(output_vocab_processor.transform(dev_output_data)))
    test_output_ids = np.array(list(output_vocab_processor.transform(test_output_data)))
    train_mask = _mask(train_input_data, max_length)
    dev_mask = _mask(dev_input_data, max_length)
    test_mask = _mask(test_input_data, max_length)
    return (train_input_ids,
            dev_input_ids,
            test_input_ids,
            train_output_ids,
            dev_output_ids,
            test_output_ids,
            train_mask,
            dev_mask,
            test_mask,
            max_length,
            input_vocab_processor)
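
# Illustrative sketch (not part of the original file): with N sentences per
# split and padded length L, the id arrays are (N, L) and the masks (N * L, 1).
#
#     (train_ids, _, _, train_labels, _, _, train_mask, _, _, L, vocab) = swbd_data("./sample_data")
#     # train_ids.shape    -> (N, L)
#     # train_labels.shape -> (N, L)
#     # train_mask.shape   -> (N * L, 1)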


def swbd_minibatches(input_ids, output_ids, mask_data, batch_size, num_epochs, max_length, shuffle=True):
    """
    - Iterates over the Switchboard input and output data in minibatches
    Args:
        input_ids: one of the input id arrays from swbd_data
        output_ids: one of the output id arrays from swbd_data
        mask_data: one of the mask arrays from swbd_data
        batch_size: int, the batch size
        num_epochs: int, the number of training epochs
        max_length: int, the maximum length used for padding
        shuffle: Boolean, whether to shuffle the training data or not
    Yields:
        tuple (x, y, z): minibatches of (input, output, mask)
    """
    output_ids = np.reshape(np.array(output_ids), (-1, max_length))
    mask_data = np.reshape(np.array(mask_data), (-1, max_length))
    if shuffle:
        # Shuffle sentences (rows) consistently across inputs, outputs and masks.
        shuffle_indices = np.random.permutation(np.arange(len(input_ids)))
        input_ids = input_ids[shuffle_indices]
        output_ids = output_ids[shuffle_indices]
        mask_data = mask_data[shuffle_indices]
    # Flatten everything to one token per row.
    input_ids = np.array([np.concatenate(input_ids, 0)]).T
    output_ids = np.array([np.concatenate(output_ids, 0)]).T
    mask_data = mask_data.reshape(-1, 1)
    data_size = len(input_ids) // max_length
    num_batches_per_epoch = data_size // batch_size
    for epoch in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = (batch_num * batch_size) * max_length
            end_index = min((batch_num + 1) * batch_size, data_size) * max_length
            x = np.reshape(input_ids[start_index:end_index], (batch_size, max_length))
            y = output_ids[start_index:end_index]
            z = mask_data[start_index:end_index]
            yield (x, y, z)
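
# Illustrative note (not part of the original file): each yielded x has shape
# (batch_size, max_length), while y and z are flat per-token columns of shape
# (batch_size * max_length, 1), matching the mask layout produced by _mask.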


def batch_iter(input_id, max_length, mask):
    """
    - Iterates over input data one sentence at a time (used for prediction)
    Args:
        input_id: array (or list) of input sentences mapped to integer ids
        max_length: maximum length of sentences
        mask: mask array marking the actual length of each sentence
    Yields:
        tuple (x_input, z_mask): single-sentence batches of (input, mask)
    """
    x = np.array(input_id)
    for sn in range(len(input_id)):
        start = sn * max_length
        end = (1 + sn) * max_length
        x_input = x[sn:sn + 1]
        z_mask = mask[start:end]
        yield (x_input, z_mask)
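

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original file); it assumes the
    # swbd.*.txt files exist under ./sample_data, as described in swbd_data.
    (train_ids, dev_ids, test_ids,
     train_labels, dev_labels, test_labels,
     train_mask, dev_mask, test_mask,
     max_len, vocab_processor) = swbd_data(data_path="./sample_data")

    # Training-style iteration: shuffled minibatches of (input, output, mask).
    for x, y, z in swbd_minibatches(train_ids, train_labels, train_mask,
                                    batch_size=2, num_epochs=1,
                                    max_length=max_len):
        print(x.shape, y.shape, z.shape)
        break

    # Prediction-style iteration: one padded sentence and its mask at a time.
    for x_input, z_mask in batch_iter(test_ids, max_len, test_mask):
        print(x_input.shape, z_mask.shape)
        break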