import numpy as np
import scipy.spatial.distance
from sklearn.cluster import SpectralClustering

import heapq

from common import baseline_retrieval
from utils import tqdm


## CLUE ##

def clue(features, queries, select_clusters, k = 200, max_clusters = 10, T = 0.9, min_cluster_size = 2, show_progress = False):
    """ CLUE method for cluster-based relevance feedback in image retrieval.
    
    Reference:
    Chen, Yixin; Wang, James Z.; Krovetz, Robert.
    "CLUE: Cluster-Based Retrieval of Images by Unsupervised Learning."
    IEEE Transactions on Image Processing 14.8, 2005, pp. 1187-1201.
    
    features - n-by-d matrix containing d-dimensional features of n samples.
    
    queries - Dictionary mapping query IDs to dictionaries with keys 'relevant' and 'img_id'. 'img_id' gives the ID of the query
              image and 'relevant' points to a list of IDs of images relevant for this query.
    
    select_clusters - Callback function taking a query dictionary with keys 'relevant' and 'img_id' and a list of lists of images
                      for each cluster as arguments and returning a list of indices of selected clusters.
    
    k - The number of baseline retrieval results to be used for the initial clustering step.
    
    max_clusters - Maximum number of clusters.
    
    T - Threshold for the n-cut value. Nodes with an n-cut value larger than this threshold won't be subdivided any further.
    
    min_cluster_size - Minimum number of items per cluster.
    
    show_progress - If True, a progress bar will be shown (requires tqdm).
    
    Returns: re-ranked retrieval results as dictionary mapping query IDs to tuples consisting of an ordered list of retrieved image IDs
             and a corresponding list of adjusted distances to the query.
    """
    
    # Baseline retrieval
    retrievals = baseline_retrieval(features, queries, select_clusters)
    ret_it = tqdm(retrievals.items(), desc = 'CLUE', total = len(retrievals), leave = False) if show_progress else retrievals.items()
    
    for qid, (ret, distances) in ret_it:
        query = queries[qid]
        query_feat = features[query['img_id']]
        
        # Spectral clustering of top results
        tree = RecursiveNormalizedCuts(max_clusters, T, min_cluster_size)
        tree.fit([(id, features[id]) for id in ret[:k]])
        clusters = tree.clusters()
        
        # Ask user to select relevant clusters
        selected_clusters = select_clusters(query, tree.sort_items_by_centroid_distance())
        
        # Put images from the selected clusters first
        offset = 0
        selected_clusters.sort() # disable cheating through fine-grained relevance ranking
        for c in selected_clusters:
            ret[offset:offset+len(clusters[c])] = [id for id, _ in clusters[c]]
            offset += len(clusters[c])
        
        # Add remaining clusters in tree order
        for i, c in enumerate(clusters):
            if i not in selected_clusters:
                ret[offset:offset+len(c)] = [id for id, _ in c]
                offset += len(c)
    
    return retrievals
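

# The helper below is not part of the original implementation; it is a minimal
# usage sketch showing how clue() is intended to be called. It assumes that
# 'features' is an n-by-d numpy array indexed by image ID and that relevance
# feedback is simulated from the ground-truth 'relevant' lists instead of
# asking a real user.

def _example_clue_usage(features, queries):
    
    def select_clusters(query, clusters):
        # 'clusters' is a list of clusters, each given as a list of image IDs
        # sorted by distance to the cluster centroid. We select every cluster
        # that contains at least one image marked as relevant for the query.
        relevant = set(query['relevant'])
        return [i for i, c in enumerate(clusters) if len(relevant & set(c)) > 0]
    
    return clue(features, queries, select_clusters, k = 50, max_clusters = 5, show_progress = False)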
class RecursiveNormalizedCuts(object):
    
    def __init__(self, max_clusters, T, min_cluster_size = 2):
        object.__init__(self)
        self.max_clusters = max_clusters
        self.T = T
        self.min_cluster_size = min_cluster_size
        self.tree = { 'depth' : 0, 'height' : 0, 'size' : 0, 'leafs' : 1, 'children' : [], 'parent' : None, 'items' : [], 'affinity' : [] }
    
    def fit(self, feat):
        # Compute affinity matrix using RBF kernel on pair-wise distances
        affinity = scipy.spatial.distance.pdist(np.array([f for id, f in feat]))
        sigma = -2 * np.var(affinity)
        affinity = np.exp(scipy.spatial.distance.squareform(affinity) / sigma)
        
        # Recursive clustering
        self.tree = { 'depth' : 0, 'height' : 0, 'size' : 0, 'leafs' : 1, 'children' : [], 'parent' : None, 'items' : feat, 'affinity' : affinity }
        queue = []
        heapq.heappush(queue, (-1 * len(self.tree['items']), np.random.rand(), self.tree))
        while (self.tree['leafs'] < self.max_clusters) and (len(queue) > 0):
            if len(queue[0][2]['items']) <= self.min_cluster_size:
                break
            left, right, ncut_value = self.split(heapq.heappop(queue)[2])
            if ncut_value > self.T:
                break
            if (left is not None) and (right is not None):
                heapq.heappush(queue, (-1 * len(left['items']), np.random.rand(), left))
                heapq.heappush(queue, (-1 * len(right['items']), np.random.rand(), right))
    
    def split(self, node):
        # Perform normalized cut
        try:
            ind = SpectralClustering(2, affinity = 'precomputed', assign_labels = 'discretize').fit_predict(node['affinity'])
        except KeyboardInterrupt:
            raise
        except:
            return None, None, 0
        
        # Create left and right node
        mask1, mask2 = (ind == 0), (ind == 1)
        if not (np.any(mask1) and np.any(mask2)):
            return None, None, 0
        left = { 'depth' : node['depth'] + 1, 'height' : 0, 'size' : 0, 'leafs' : 1, 'children' : [], 'parent' : node, 'items' : [f for i, f in enumerate(node['items']) if ind[i] == 0], 'affinity' : node['affinity'][np.ix_(mask1, mask1)] }
        right = { 'depth' : node['depth'] + 1, 'height' : 0, 'size' : 0, 'leafs' : 1, 'children' : [], 'parent' : node, 'items' : [f for i, f in enumerate(node['items']) if ind[i] == 1], 'affinity' : node['affinity'][np.ix_(mask2, mask2)] }
        
        # Force the node with the lower minimum distance to the query to be the left node
        if ind[0] == 1: # items are already sorted when passed to fit(), so we just need to look at the first item instead of re-computing all distances
            left, right = right, left
        
        # Modify parent
        node['children'] = [left, right]
        
        # Modify parent chain
        parent = node
        while parent is not None:
            parent['height'] += 1
            parent['size'] += 2
            parent['leafs'] += 1
            parent = parent['parent']
        
        return left, right, self.ncut_value(node['affinity'], ind)
    
    def clusters(self):
        def _clusters(node):
            return sum([_clusters(child) for child in node['children']], []) if len(node['children']) > 0 else [node['items']]
        return _clusters(self.tree)
    
    def sort_items_by_centroid_distance(self):
        clusters = self.clusters()
        sorted_clusters = []
        for c in clusters:
            feat = np.array([f for id, f in c])
            dist = np.linalg.norm(feat - feat.mean(axis = 0), axis = -1)
            ind = np.argsort(dist)
            sorted_clusters.append([c[i][0] for i in ind])
        return sorted_clusters
    
    def ncut_value(self, affinity, lbl):
        mask_a, mask_b = (lbl == 0), (lbl == 1)
        cut_a_b = affinity[mask_a,:][:,mask_b].sum()
        cut_a_v = affinity[mask_a,:].sum()
        cut_b_v = affinity[mask_b,:].sum()
        if (cut_a_v == 0) or (cut_b_v == 0):
            print(affinity)
            print(lbl)
        return cut_a_b / cut_a_v + cut_a_b / cut_b_v
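

# Minimal self-test sketch (added for illustration, not part of the original
# file): clusters two well-separated 2-D point clouds with the recursive
# normalized-cut procedure above and prints the resulting clusters. The n-cut
# value computed in ncut_value() is cut(A,B)/assoc(A,V) + cut(A,B)/assoc(B,V),
# so a clean split between the two clouds yields a value close to 0.
if __name__ == '__main__':
    np.random.seed(0)
    toy_feat = np.concatenate([
        np.random.randn(10, 2),          # first blob around the origin
        np.random.randn(10, 2) + 10.0    # second blob far away from the first
    ])
    rnc = RecursiveNormalizedCuts(max_clusters = 4, T = 0.9, min_cluster_size = 2)
    rnc.fit([(i, f) for i, f in enumerate(toy_feat)])
    for c, ids in enumerate(rnc.sort_items_by_centroid_distance()):
        print('Cluster {}: {}'.format(c, ids))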