generating_training_data_1.02.py
__author__ = 'Oswaldo Ludwig'
__version__ = '1.01'
#This code automatically derives (from the COCO annotations) the VIEW-style training data: training_sentences.csv and training_target.
#In case of publication, cite:
##@ARTICLE{2016arXiv160308474L,
## author = {Oswaldo Ludwig and Xiao Liu and Parisa Kordjamshidi and Marie-Francine Moens
## },
## title = "{Deep Embedding for Spatial Role Labeling}",
## journal = {ArXiv e-prints},
##archivePrefix = "arXiv",
## eprint = {1603.08474},
## primaryClass = "cs.CL",
## keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Learning, Computer Science - Neural and Evolutionary Computing},
## year = 2016,
## month = mar,
## adsurl = {http://adsabs.harvard.edu/abs/2016arXiv160308474L},
##}
from coco import COCO
import csv
import sys
import numpy as np
import pylab
import mask
import itertools
import pickle
import time
dataDir='..'
dataType='val2014'
annFile='instances_%s.json'%(dataType)
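#NOTE: the annotation file is expected in the current working directory
#(dataDir is not used when building annFile above); adjust the paths if needed.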
def assemble_output_vector2(anotacao):
    #Assembles the 201-dimensional target vector of one image: position 0 or 1
    #encodes the spatial relation between the two most salient objects (below or
    #besides), index 2+category_id marks the object with the smaller y-center
    #and index 100+category_id marks the other one.
    flag = 0
    outvec = [0] * 201
    coord = np.zeros((2, 2))
    sa = len(anotacao)
    if sa > 1:
        salience = [0] * sa
        indexes = [0] * 2
        #getting the coordinates of the two most salient objects:
        for j in range(sa):
            anota = anotacao[j]
            bb = np.array(anota['bbox'])
            #an empirical measure of the salience of the objects given their area and position:
            salience[j] = bb[2]*bb[3] - 0.3*(320 - (2*bb[0] + bb[2])/2)**2
        for i in range(2):
            max_salience = -float('inf')
            for j in range(sa):
                if salience[j] > max_salience:
                    max_salience = salience[j]
                    indexes[i] = j
            salience[indexes[i]] = -float('inf')  #excluding the selected object from the next pass
            anota = anotacao[indexes[i]]
            bb = np.array(anota['bbox'])
            coord[0][i] = (2*bb[0] + bb[2])/2  #coordinates of the center of the bounding box
            coord[1][i] = (2*bb[1] + bb[3])/2
        if coord[1][1] < coord[1][0]:  #flipping the objects if the second one has the smaller y-center
            aux = coord[1][0]
            coord[1][0] = coord[1][1]
            coord[1][1] = aux
            aux = coord[0][0]
            coord[0][0] = coord[0][1]
            coord[0][1] = aux
            flag = 1
        deltax = coord[0][1] - coord[0][0]
        deltay = coord[1][1] - coord[1][0]
        if deltax == 0:
            grad = 0  #vertically aligned objects fall into besides(A,B)
        else:
            grad = deltay/deltax
        #defining the relative position between the objects:
        if (grad > 1) or (grad < -1):
            outvec[0] = 1  #below(A,B)
        else:
            outvec[1] = 1  #besides(A,B)
        anota = anotacao[indexes[0]]
        an0 = anota['category_id']
        anota = anotacao[indexes[1]]
        an1 = anota['category_id']
        if flag == 0:
            outvec[2 + an0] = 1
            outvec[100 + an1] = 1
        else:
            outvec[2 + an1] = 1
            outvec[100 + an0] = 1
    else:
        if sa > 0:
            #a single object: only its category is encoded
            anota = anotacao[0]
            outvec[2 + anota['category_id']] = 1
    return outvec
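#A minimal, commented-out sanity check of assemble_output_vector2 on hypothetical
#annotations (the bounding boxes and category ids below are made up, not from COCO):
#toy = [{'bbox': [10.0, 10.0, 100.0, 200.0], 'category_id': 1},
#       {'bbox': [200.0, 50.0, 80.0, 120.0], 'category_id': 18}]
#vec = assemble_output_vector2(toy)
#print sum(vec)  #expected: 3 active positions (one relation bit and two category bits)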
# initialize COCO api for instance annotations
coco=COCO(annFile)
# get the image Ids:
imgIds = coco.getImgIds()
s=len(imgIds)
out = np.zeros(shape=(5*s,201))
count=0
for imgId in imgIds:
    #get the Ids of the annotations corresponding to the image Id:
    annIds = coco.getAnnIds(imgIds=imgId, iscrowd=None)
    #get the annotations:
    anotacao = coco.loadAnns(ids=annIds)
    #the same target vector is replicated 5 times, one row per caption of the image:
    target = assemble_output_vector2(anotacao)
    for k in range(5):
        out[count, :] = target
        count += 1
# Saving the objects:
with open('training_target', 'w') as f:
pickle.dump(out, f)
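#To reload the target matrix later (assuming the same Python 2 pickle defaults):
#with open('training_target') as f:
#    out = pickle.load(f)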
# initialize COCO api for caption annotations
annFile = 'captions_%s.json'%(dataType)
caps=COCO(annFile)
with open('training_sentences.csv', 'wb') as f:
    wri = csv.writer(f, skipinitialspace=True)
    for imgId in imgIds:
        annId = caps.getAnnIds(imgIds=imgId)
        #extracting the caption strings directly from the annotations
        #(showAnns only prints the captions, it does not return them):
        captions = [ann['caption'] for ann in caps.loadAnns(annId)]
        for n in range(5):
            if len(captions[n]) < 3:
                captions[n] = 'none'
            cap = captions[n].lower()
            wri.writerow([cap])
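#The sentences can be read back for training with the standard csv module, e.g.:
#with open('training_sentences.csv', 'rb') as f:
#    sentences = [row[0] for row in csv.reader(f)]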