-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsaregamapa_cluster.py
160 lines (125 loc) · 5.38 KB
/
saregamapa_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""
Created on Tue Nov 21 17:38:54 2017
@author: Valerio Guarrasi
"""
import math
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt
class Saregamapa_Cluster:
    """Cluster the songs matching a query with k-means and draw word clouds.

    The instance is driven by ``smeta``, a dict from which this class reads:

    * ``"sQuery"``         -- whitespace-separated query string,
    * ``"sindexes"``       -- index mapping ``word -> list of postings``, where
      ``posting[0]`` is a document id and ``posting[1]`` its tf-idf weight,
    * ``"clusters_count"`` -- requested number of clusters,
    * ``"songs_dict"``     -- mapping ``str(doc_id) -> song record``.
    """

    # Class-level defaults kept for backward compatibility; __init__ rebinds
    # every one of them per instance so mutable state is never shared.
    isServer = True
    smeta = {}
    songs_dict = {}
    cluster_results = []

    def __init__(self, smeta, isServer):
        """Store the request metadata and the verbosity/display flag."""
        self.isServer = isServer
        self.smeta = smeta
        self.songs_dict = smeta["songs_dict"]
        # Fresh list per instance instead of the shared class-level one.
        self.cluster_results = []

    def search_complete(self, diz):
        """Return the ids of the documents containing *all* query terms.

        Args:
            diz: index mapping ``word -> list of postings``; ``posting[0]``
                is the document id.

        Returns:
            A list of document ids without duplicates, ordered as they appear
            in the postings of the first query term.  The list is empty when
            the query is empty or when any term has no postings.
        """
        terms = self.smeta["sQuery"].split()

        intersect = []
        if terms:
            # Direct dict lookup per term; a missing term yields no postings.
            docs_per_term = [
                [posting[0] for posting in diz.get(term, ())]
                for term in terms
            ]
            # dict.fromkeys de-duplicates while preserving posting order.
            intersect = list(dict.fromkeys(docs_per_term[0]))
            for docs in docs_per_term[1:]:
                present = set(docs)
                intersect = [doc for doc in intersect if doc in present]

        if self.isServer:
            print("Documents Intersection: ", intersect)
        return intersect

    def normalize_results(self, intersect, diz_tf_idf):
        """Build one L2-normalised tf-idf vector per document.

        Args:
            intersect: iterable of document ids.
            diz_tf_idf: index mapping ``word -> list of postings`` with
                ``posting[0]`` the document id and ``posting[1]`` the weight.

        Returns:
            A list with one row per document of ``intersect`` (same order).
            Each row has one entry per word of ``diz_tf_idf`` (in the dict's
            iteration order): the weight divided by the document norm, or
            ``0`` when the word does not occur in the document.  A document
            whose norm is zero gets an all-zero row.
        """
        docs = list(intersect)
        weights = {doc: {} for doc in docs}
        norm_sq = dict.fromkeys(docs, 0)

        # Single pass over the index instead of one full scan per document.
        for word, postings in diz_tf_idf.items():
            for posting in postings:
                doc_id = posting[0]
                if doc_id in weights:
                    weights[doc_id][word] = posting[1]
                    norm_sq[doc_id] += posting[1] ** 2

        vocabulary = list(diz_tf_idf)
        data = []
        for doc in docs:
            norm = math.sqrt(norm_sq[doc])
            doc_weights = weights[doc]
            # norm == 0 means every weight is 0; avoid dividing by zero.
            data.append([
                doc_weights[word] / norm if word in doc_weights and norm else 0
                for word in vocabulary
            ])
        return data

    def cluster_documents(self, data, intersect):
        """Run k-means on the document vectors and return the labels.

        Args:
            data: list of equally sized numeric rows, one per document.
            intersect: document ids, in the same order as ``data``.

        Returns:
            The array of cluster labels returned by ``KMeans.predict``.
        """
        # KMeans rejects n_clusters > n_samples, so cap the requested count.
        # int() is a no-op for ints; NOTE(review): it also accepts a numeric
        # string in case the count arrives unparsed -- confirm with caller.
        k = min(int(self.smeta["clusters_count"]), len(data))

        kmeans = KMeans(n_clusters=k, init='random')
        kmeans.fit(data)
        c = kmeans.predict(data)

        if self.isServer:
            for doc_id, label in zip(intersect, c):
                print("song " + str(doc_id) + " is in cluster " + str(label))
        # k-means with random initialisation is not guaranteed to be optimal;
        # several runs could be compared to pick the best solution.
        return c

    def insert_doc(self, doc_id):
        """Return the result row ``[f0, f0, f1, f3]`` of the song ``doc_id``.

        NOTE(review): field 0 is emitted twice, exactly as before; the layout
        is kept because consumers of ``cluster_results`` may depend on it.
        """
        curDoc = self.songs_dict[str(doc_id)]
        return [curDoc[0], curDoc[0], curDoc[1], curDoc[3]]

    @staticmethod
    def _group_by_cluster(intersect, c):
        """Map each cluster label (as ``int``) to its document ids, in order."""
        groups = {}
        for doc_id, label in zip(intersect, c):
            groups.setdefault(int(label), []).append(doc_id)
        return groups

    def create_wordcloud(self, intersect, c):
        """Fill ``cluster_results`` and save one word-cloud image per cluster.

        For every cluster, field 4 of each member's song record is
        concatenated into the text given to ``WordCloud``, and the image is
        saved as ``static/wordcloud/cluster_<label>``.

        Args:
            intersect: document ids, in the same order as ``c``.
            c: cluster label of each document.
        """
        cluster_diz = self._group_by_cluster(intersect, c)
        if not cluster_diz:
            return

        # Results are indexed by label, so size the list by the highest label
        # rather than by the number of distinct labels (labels may have gaps).
        highest = max(cluster_diz)
        while len(self.cluster_results) <= highest:
            self.cluster_results.append([])

        for cluster, doc_ids in cluster_diz.items():
            texts = []
            for doc in doc_ids:
                texts.append(self.songs_dict[str(doc)][4])
                self.cluster_results[cluster].append(self.insert_doc(doc))
            strg_cloud = " " + " ".join(texts) + " "

            wordcloud = WordCloud(
                width=300, height=300, margin=0, collocations=False
            ).generate(strg_cloud)

            # One figure per cluster, closed afterwards, so images do not
            # pile up on a shared figure and memory is released.
            fig = plt.figure()
            try:
                plt.title("Cluster number: " + str(cluster))
                plt.imshow(wordcloud, interpolation="bilinear")
                plt.axis("off")
                plt.margins(x=0, y=0)
                plt.savefig("static/wordcloud/cluster_" + str(cluster))
                if self.isServer:
                    plt.show()
            finally:
                plt.close(fig)

    def cluster(self):
        """Search, vectorise, cluster and render; return ``cluster_results``.

        Returns:
            A list indexed by cluster label; each entry is the list of rows
            produced by ``insert_doc`` for the songs of that cluster.  The
            list is empty when no document matches the query.
        """
        self.cluster_results = []
        diz_tf_idf = self.smeta["sindexes"]
        intersect = self.search_complete(diz_tf_idf)
        if not intersect:
            # Nothing to cluster; k-means cannot be fitted on zero samples.
            return self.cluster_results
        n_doc = self.normalize_results(intersect, diz_tf_idf)
        cluster = self.cluster_documents(n_doc, intersect)
        self.create_wordcloud(intersect, cluster)
        return self.cluster_results