-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathGradientClustering.py
435 lines (289 loc) · 15.6 KB
/
GradientClustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
# MIT License
# Copyright (c) 2016 Denis Vida
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from __future__ import print_function, absolute_import, division
import copy
import numpy as np
import matplotlib.pyplot as plt
### Define constants which tell what is the UNDEFINED value in the reachability plot
# -1 Value was used in cython as an undefined value, which is not satisfactory anymore, as the undefined value
# should be infinite
UNDEFINED = -1
# The new undefined value is the largest number possible by a signed 32-bit number
# NOTE: CHANGE THIS TO A HIGER VALUE IF YOU ARE USING EXTREMELY LARGE REACHABILITY DISTANCES (E.G. IF YOUR
# NORMAL REACHABILITY DISTANCES ARE ABOVE 10**8 - and don't forget to use more bits for numpy arrays!)
NEW_UNDEFINED = 2**31 - 1
###
class ReachabilityVector(object):
"""Init the vector between two points in the reachability diagram. """
def __init__(self, rx, ry, w):
""" Initilization function for the vector.
Arguments:
rx: [float] reachability of the first point
ry: [float] reachability of the second point
w: [float] distance between data points in the reachability plot
Return:
ReachabilityVector [object]
"""
self.value = np.array((w, ry - rx))
def abs(self):
""" Returns the magnitude of a vector. """
return np.sqrt(self.value.dot(self.value))
def inflectionIndex(reach_list, y, w):
""" Calculates the inflection index of a certain point in the reachability diagram.
Arguments:
reach_list: [ndarray] 1D numpy array containing reachability distance values of individual data points
y: [int] index of the point for which the inflection index is calculated
w: [float] distance between data points in the reachability plot
Returns:
[float] inflection index of the given point
"""
# Get reachabilities of the previous, current and the next point
x_r = reach_list[y - 1]
y_r = reach_list[y]
z_r = reach_list[y + 1]
# Create vectors between with adjecent points
prev_vect = ReachabilityVector(x_r, y_r, w)
next_vect = ReachabilityVector(y_r, z_r, w)
return (-w**2 + (x_r - y_r)*(z_r - y_r))/(prev_vect.abs()*next_vect.abs())
def gradientDeterminant(reach_list, y, w):
""" Calculate the gradient determinant between two vectors.
Arguments:
reach_list: [ndarray] 1D numpy array containing reachability distance values of individual data points
y: [int] index of the point for which the gradient is calculated
w: [float] distance between data points in the reachability plot
Returns:
[float] gradient determinant of the given point (see source paper for more information)
"""
# Get reachabilities of the previous, current and the next point
x_r = reach_list[y - 1]
y_r = reach_list[y]
z_r = reach_list[y + 1]
return w*(y_r-x_r) - w*(z_r-y_r)
def gradientClustering(reach_list, min_pts, t, w):
""" Extracts clusters from the reachability diagram by using the gradient method.
Source paper: Brecheisen, S., Kriegel, H.P., Kroger, P. and Pfeifle, M., 2004, April.
Visually Mining through Cluster Hierarchies. In SDM (pp. 400-411).
Arguments:
reach_list: [ndarray] 1D numpy array containing reachability distance values of individual data points
min_pts: [int] minimum number of points used for clustering
t: [float] angle of minimum inflection index in the inflection point, values in the range 120-160 deg
should work fine (see source paper for more information)
w: [float] distance between data points in the reachability plot, this value influences the
sensitivity of the gradient clustering procedure, a value of 0.025 should work fine
Return:
set_of_clusters: [list] a list of found clusters
- note: individual cluster is just a list of point indices belonging to a specific cluster
"""
# Replace all UNDEFINED values in the reach_list with infinites
reach_list[reach_list == UNDEFINED] = NEW_UNDEFINED
# Covert t from degrees to cosinus value
t = np.cos(np.radians(t))
# Input data size
reach_list_size = len(reach_list)
# Init the last endpoint
last_endpoint = reach_list_size - 1
# Init the empty working lists
start_pts = []
set_of_clusters = []
curr_cluster = []
# Add the first point to the start_pts
start_pts.append(0)
# Go through all the remaining points
for i in range(1, reach_list_size - 1):
# Check if the point is an inflection point
if inflectionIndex(reach_list, i, w) > t:
# Check if the next vector deviates to the right
if gradientDeterminant(reach_list, i, w) > 0:
# Check if the current cluster size is larger than the minimum number of points required
if len(curr_cluster) >= min_pts:
# Add the cluster to a list of clusters
set_of_clusters.append(curr_cluster)
# Reset the current cluster
curr_cluster = []
# Check if the list of start points is not empty
if start_pts:
# Check if the last point in the start_pts has a smaller reachability distance than the
# current point, if so, remove the last point
if reach_list[start_pts[-1]] <= reach_list[i]:
start_pts.pop()
# Check if the list of start points is not empty
if start_pts:
# While the last point in the start_pts has a smaller reachability distance than the current
# point, keep removing points from the end and add them to a list of clusters
while reach_list[start_pts[-1]] < reach_list[i]:
temp_cluster = range(start_pts[-1], last_endpoint)
# Check if the temporary cluster size is larger than the minimum number of points required
if len(temp_cluster) >= min_pts:
set_of_clusters.append(temp_cluster)
# Remove the last start point from the list of start points
start_pts.pop()
# Lastly, add another cluster at the end, if it is not empty
temp_cluster = range(start_pts[-1], last_endpoint)
if len(temp_cluster) >= min_pts:
# Add a new cluster
set_of_clusters.append(range(start_pts[-1], last_endpoint))
# If the current point is a starting point, add it to the list
if reach_list[i+1] < reach_list[i]:
start_pts.append(i)
# The next vector deviates to the left, marking an endpoint
else:
# If the current point is the endpoint and goes up, then take all points from the last start
# point to this one as a current cluster
if reach_list[i+1] > reach_list[i]:
# Keep track of the lastest endpoint (add 1 for the python peculiarities, as otherwise the
# last point would not be included in the list)
last_endpoint = i + 1
curr_cluster = range(start_pts[-1], last_endpoint)
# Add clusters at the end of plot, while start points has any members
while start_pts:
# Take points from the last point in the start pooints to the last point
curr_cluster = range(start_pts[-1], reach_list_size)
if (reach_list[start_pts[-1]] > reach_list[-1]) and (len(curr_cluster) >= min_pts):
set_of_clusters.append(curr_cluster)
start_pts.pop()
return set_of_clusters
def filterLargeClusters(clusters, input_size, cluster_fraction_thresh):
""" Remove big clusters which have more points than a given fraction of total points in the input data.
Arguments:
clusters: [list] a python list containing found clusters in the reachability diagram plot (optional)
- note: individual cluster is just a list of point indices belonging to a specific cluster
input_size: [int] total number of all data points used for clustering
cluster_fraction_thresheld: [float] a number that defines the maximum ratio of the largest acceptable
cluster size and the total number of all points used for clustering
Return:
filtered_clusters: [list] a list of clusters where the largle clusters were removed
"""
filtered_clusters = []
# Go through every cluster and check if it is too big
for cluster in clusters:
# If the cluster is small enough, add it to the list
if len(cluster) < cluster_fraction_thresh*input_size and len(cluster) > 0:
filtered_clusters.append(cluster)
return filtered_clusters
def mergeSimilarClusters(clusters, similarity_threshold):
""" Merge clusters which have a minimum ratio of shared points compared to the other clusters.
Arguments:
clusters: [list] a python list containing found clusters in the reachability diagram plot (optional)
- note: individual cluster is just a list of point indices belonging to a specific cluster
similarity_threshold: [float] a minimum ratio between the intersection of two clusters and the
members of the larger cluster
Return:
clusters: [list] a list of merged clusters
"""
# Sort clusters by ascending size
clusters = list(reversed(sorted(clusters, key=len)))
# Init the filtering lists
previous_iteration_clusters = clusters
merged_clusters = []
# Run the filtering until the final list of merged clusters does not change
while len(previous_iteration_clusters) != len(merged_clusters):
# Sort clusters by ascending size
clusters = list(reversed(sorted(clusters, key=len)))
# Init the list of clusters to skip
skip_list = []
# Deep copy the results of previous interations into the variable that holds info about clusters from
# the previous iteration
previous_iteration_clusters = copy.deepcopy(merged_clusters)
merged_clusters = []
# Merge similar clusters by looking at the ratio of their intersection and their total number
for i, cluster1 in enumerate(clusters):
# Skip if the cluster was already processed
if i in skip_list:
continue
found_similar = False
for j, cluster2 in enumerate(clusters):
# Skip if the same cluster and all previous up until cluster1
if j <= i:
continue
# Skip if already processed
if j in skip_list:
continue
# Find the number of common points between clusters
intersect_points = len(set(cluster1).intersection(cluster2))
# Find the bigger cluster
if len(cluster1) > len(cluster2):
bigger_size = len(cluster1)
else:
bigger_size = len(cluster2)
# Check for the intersection ratio, i.e. the ratio of common points and the point number of
# the bigger cluster
if (intersect_points >= similarity_threshold*bigger_size):
# Merge two clusters into one
new_merged_cluster = sorted(list(set(set(cluster1) | set(cluster2))))
# Put the merged cluster to the list of merged clusters
merged_clusters.append(new_merged_cluster)
# Put the index of the removed cluster to the list of those clusters to be skipped
skip_list.append(j)
# Mark that a similar cluster has been found
found_similar = True
break
# If the current cluster was not merged to any other, out it back to the list of all clusters
if not found_similar:
merged_clusters.append(cluster1)
# Copy the resulting list to the list of final clusters
clusters = copy.deepcopy(merged_clusters)
# Sort clusters by descending size
clusters = list(reversed(sorted(clusters, key=len)))
return clusters
def plotClusteringReachability(reach_list, clusters=[]):
""" Plot the reachability diagram and the detected clusters (clusters are optional).
Arguments:
reach_list: [ndarray] 1D numpy array containing reachability distance values of individual data points
clusters: [list] a python list containing found clusters in the reachability diagram plot (optional)
- note: individual cluster is just a list of point indices belonging to a specific cluster
Return:
None
"""
# Replace all UNDEFINED values in the reach_list with infinites
reach_list[reach_list == UNDEFINED] = NEW_UNDEFINED
NUM_COLORS = len(clusters) + 1
# Renerate a range of colors
cm = plt.get_cmap('gist_rainbow')
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_prop_cycle(color=[cm(1.*i/NUM_COLORS) for i in range(NUM_COLORS)])
# Plot the reachability plot
ax.plot(range(len(reach_list)), reach_list)
# Sort the clusters by size
clusters = sorted(clusters, key=len, reverse=True)
# Plot the clusters by their reachability
for i, cluster in enumerate(clusters):
# Get the vertical value of the plot as the first non-undefined reachability distance
for k in reversed(range(len(cluster))):
vertical_reach = reach_list[cluster[k]]
if vertical_reach < NEW_UNDEFINED:
break
vertical_values = np.zeros(len(cluster)) + vertical_reach
# Plot cluster lines on top of one another, smaller ones on the top, with line thicknesses
# correspoding to the size of the cluster
ax.plot(cluster, vertical_values, linewidth=np.log10(len(cluster))/2, zorder=i)
# Set the axis limits
plt.xlim((0, len(reach_list)-1))
plt.ylim((0, np.max(reach_list[reach_list < NEW_UNDEFINED])*1.1))
# Set the titles
if clusters:
plt.title('OPTICS results (Reachability diagram) - ' + str(len(clusters)) + ' clusters found')
else:
plt.title('OPTICS results (Reachability diagram)')
# Set axis labels
plt.xlabel('Point indices')
plt.ylabel('Reachability distance')
plt.tight_layout()
plt.show()
plt.clf()
plt.close()