# Data_loader.py
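# Preprocessing and batching utilities for VQA training data:
# tokenizes questions, builds a top-N answer vocabulary, filters questions
# to that vocabulary, and streams (image, question, answer, id) batches
# from per-question HDF5 files.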
import json
import os.path
import re
import sys
from random import shuffle, seed

import cv2
import h5py
import numpy as np
import gensim
from nltk.tokenize import word_tokenize
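# NOTE (assumption): `model_w2v` is referenced below but never defined in this
# file. It is presumably a pre-trained gensim word-vector model loaded
# elsewhere, e.g.:
#   model_w2v = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
# where `path` points to 300-dimensional vectors (feat_dim = 300 below).
# `word_tokenize` additionally requires the NLTK 'punkt' tokenizer data
# (install once with nltk.download('punkt')).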
def extract_feat(doc):
    """Return the list of word vectors for the tokens in `doc`,
    silently skipping words missing from the embedding vocabulary."""
    feat = []
    for word in doc:
        try:
            feat.append(model_w2v[word])
        except KeyError:  # out-of-vocabulary word: skip it
            pass
    return feat
def tokenize(sentence):
    """Split `sentence` on punctuation and whitespace, dropping empty pieces."""
    pieces = re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", sentence)
    return [p for p in pieces if p != '' and p != ' ' and p != '\n']
def prepro_question(imgs, method):
    """Tokenize every question in `imgs`, storing the result
    under 'processed_tokens'."""
    print('example processed tokens:')
    for i, img in enumerate(imgs):
        s = img['question']
        if method == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10:
            print(txt)
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" %
                             (i, len(imgs), i * 100.0 / len(imgs)))
            sys.stdout.flush()
    return imgs
def get_top_answers(imgs, num_ans):
    """Return the `num_ans` most frequent answers across `imgs`."""
    counts = {}
    for img in imgs:
        ans = img['ans']
        counts[ans] = counts.get(ans, 0) + 1
    cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
    return [w for _, w in cw[:num_ans]]
def filter_question(imgs, atoi):
    """Keep only questions whose answer is in the vocabulary `atoi`."""
    new_imgs = [img for img in imgs if img['ans'] in atoi]
    print('number of questions reduced from %d to %d' % (len(imgs), len(new_imgs)))
    return new_imgs
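# ---- Driver script: build the answer vocabulary and preprocessing params ----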
imgs_train = json.load(open('Raw_Data/vqa_raw_train.json', 'r'))
seed(125)
shuffle(imgs_train)

num_ans = 1000
top_ans = get_top_answers(imgs_train, num_ans)
atoi = {w: i for i, w in enumerate(top_ans)}  # answer string -> class index
itoa = {i: w for i, w in enumerate(top_ans)}  # class index -> answer string

feat_dim = 300      # word-embedding dimensionality
max_length = 26     # maximum question length in tokens
method = 'nltk'     # question tokenizer
dir_path = "Final_Data/QA/"

imgs_data_train = json.load(open('vqa_final_train.json', 'r'))
seed(125)  # re-seed so this shuffle is reproducible as well
shuffle(imgs_data_train)
N = len(imgs_data_train)
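# Infinite batch generator: each step yields a tuple
# (images, questions, answers, ids). Questions and answers are read from
# per-question HDF5 files under dir_path; images are loaded with OpenCV,
# resized to 224x224, and scaled to [-1, 1]. When fewer than `batch`
# samples remain at the end of an epoch, the leftover tail is skipped
# and iteration restarts from the beginning, so every yielded batch is full.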
def load_data(batch):
    start = 0
    end = batch
    while True:
        images = []
        questions = []
        answers = []
        ids = []
        data = imgs_data_train[start:end]
        start = end
        end = end + batch
        if end > len(imgs_data_train):
            # not enough samples left for a full batch next time: restart
            start = 0
            end = batch
        for img in data:
            img_path = img['img_path']
            question_id = img['ques_id']
            # `.value` was removed in h5py 3.0; `[()]` reads the full dataset
            with h5py.File(os.path.join(dir_path, str(question_id) + '.h5'), 'r') as hf:
                question = hf['ques_train'][()]
                answer = hf['answers'][()]
            image = cv2.imread(os.path.join('Final_Data/', img_path), cv2.IMREAD_COLOR)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, (224, 224))
            image = (image - 127.5) / 127.5  # scale pixels to [-1, 1]
            images.append(image)
            questions.append(np.array(question))
            answers.append(np.array(answer))
            ids.append(question_id)
        questions = np.reshape(np.array(questions), [batch, max_length, feat_dim])
        yield (np.array(images), questions, np.array(answers), np.array(ids))
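# Example usage (a minimal sketch; the batch size of 32 is an arbitrary
# illustrative choice, not something fixed by this file):
#   gen = load_data(32)
#   images, questions, answers, ids = next(gen)
#   # images: (32, 224, 224, 3), questions: (32, 26, 300)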