save_QA.py
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import json
import copy
from random import shuffle, seed
import sys
import os
import argparse
import glob
import re
import string
import pdb

import numpy as np
import scipy.io
import h5py
import cv2
import matplotlib.pyplot as plt
import gensim
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
# In[2]:

def extract_feat(doc):
    # Look up the word2vec embedding for each token; out-of-vocabulary
    # tokens are silently skipped.
    feat = []
    for word in doc:
        try:
            feat.append(model_w2v[word])
        except KeyError:
            pass
    return feat


def tokenize(sentence):
    # Simple punctuation-based tokenizer used when method != 'nltk'.
    return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", sentence)
            if i != '' and i != ' ' and i != '\n']
def prepro_question(imgs, method):
    # Preprocess all the questions: lower-case and tokenize each one.
    print('example processed tokens:')
    for i, img in enumerate(imgs):
        s = img['question']
        if method == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        if i < 10:
            print(txt)
        if i % 1000 == 0:
            sys.stdout.write("processing %d/%d (%.2f%% done)   \r" % (i, len(imgs), i * 100.0 / len(imgs)))
            sys.stdout.flush()
    return imgs
def get_top_answers(imgs, num_ans):
    # Count answer frequencies and keep the num_ans most common answers.
    counts = {}
    for img in imgs:
        ans = img['ans']
        counts[ans] = counts.get(ans, 0) + 1

    cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
    print('top answers and their counts:')
    print('\n'.join(map(str, cw[:20])))

    vocab = [w for _, w in cw[:num_ans]]
    return vocab
def filter_question(imgs, atoi):
    # Keep only the questions whose answer is in the top-answer vocabulary.
    new_imgs = [img for img in imgs if img['ans'] in atoi]
    print('question number reduced from %d to %d' % (len(imgs), len(new_imgs)))
    return new_imgs
# In[3]:

# Load the pretrained GoogleNews word2vec vectors (300-d).
model_path = 'Raw_Data/WordVecs/GoogleNews-vectors-negative300.bin'
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

# Build the answer vocabulary from the raw training questions.
imgs_train = json.load(open('Raw_Data/vqa_raw_train.json', 'r'))
num_ans = 1000
top_ans = get_top_answers(imgs_train, num_ans)
atoi = {w: i for i, w in enumerate(top_ans)}  # answer -> index
itoa = {i: w for i, w in enumerate(top_ans)}  # index -> answer
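
# Illustrative usage only (these helpers are defined above but not called in
# this script; a minimal sketch of how they would typically be applied):
#
# imgs_train = prepro_question(imgs_train, 'nltk')
# imgs_train = filter_question(imgs_train, atoi)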
feat_dim = 300   # word2vec embedding dimension
imgs_data_train = json.load(open('vqa_final_train.json', 'r'))
method = 'nltk'
max_length = 26  # maximum number of tokens kept per question
dir_path = "Final_Data/QA/"
os.makedirs(dir_path, exist_ok=True)  # make sure the output directory exists
N = len(imgs_data_train)
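
# One .h5 file is written per question; each file contains:
#   "ques_train": (1, max_length, feat_dim) float32 word2vec embeddings,
#                 zero-padded past the end of the question
#   "answers":    index of the answer in the top-num_ans vocabulary (atoi)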
def save_data():
    for i, img in enumerate(imgs_data_train):
        img_path = img['img_path']
        s = img['question']
        print(i, s)
        if method == 'nltk':
            txt = word_tokenize(str(s).lower())
        else:
            txt = tokenize(s)
        img['processed_tokens'] = txt
        question_id = img['ques_id']

        feat = np.array(extract_feat(img['processed_tokens']))
        label_arrays = np.zeros((1, max_length, feat_dim), dtype='float32')
        label_length = min(max_length, len(feat))  # record the length of this sequence
        if label_length > 0:
            # truncate questions longer than max_length to avoid a shape mismatch
            label_arrays[0, :label_length, :] = feat[:label_length]
        ans_arrays = atoi[img['ans']]  # assumes the answer is in the top-num_ans vocabulary

        with h5py.File(os.path.join(dir_path, str(question_id) + '.h5'), "w") as f:
            f.create_dataset("ques_train", dtype='float32', data=label_arrays)
            f.create_dataset("answers", dtype='uint32', data=ans_arrays)


save_data()
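
# In[4]:

# Illustrative check only (not part of the original pipeline; a minimal sketch
# assuming at least one file was written above): read the first saved .h5 file
# back and print the stored shapes.
saved = sorted(glob.glob(os.path.join(dir_path, '*.h5')))
if saved:
    with h5py.File(saved[0], 'r') as f:
        ques = f['ques_train'][:]    # (1, max_length, feat_dim) embeddings
        ans = int(f['answers'][()])  # index into the top-num_ans answer vocabulary
        print(saved[0], ques.shape, ans, itoa.get(ans, '<unknown>'))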