-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsaregamapa_common.py
72 lines (43 loc) · 1.86 KB
/
saregamapa_common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
Created on Tue Nov 21 17:38:54 2017
@author: Valerio Guarrasi
"""
from nltk.tokenize import RegexpTokenizer
from itertools import islice
import nltk
from nltk.corpus import stopwords
# Import-time side effect: asks NLTK for the 'stopwords' corpus, which
# Saregamapa_Common.remove_stopwords reads via stopwords.words('english').
# NOTE(review): this runs on every import of the module — confirm that is
# acceptable in offline / restricted-network environments.
nltk.download('stopwords')
class Saregamapa_Common:
    """Shared text-cleaning and dictionary helpers for song/lyric data."""

    # Class-level default; each instance overrides it in __init__.
    common_message = ''

    def __init__(self, s):
        """Store ``s`` as this instance's ``common_message``."""
        self.common_message = s

    def format_text(self, lyric):
        """Return ``lyric`` with punctuation and English stopwords removed.

        The text is tokenized on runs of word characters (``\\w+``), the
        tokens are re-joined with single spaces, and the result is passed
        through :meth:`remove_stopwords`.
        """
        # Keeping only \w+ tokens drops punctuation and collapses whitespace.
        tokenizer = RegexpTokenizer(r'\w+')
        without_punctuation = ' '.join(tokenizer.tokenize(lyric))
        return self.remove_stopwords(without_punctuation)

    def remove_stopwords(self, s):
        """Return ``s`` without the words found in NLTK's English stopword list.

        ``s`` is split on whitespace and the surviving words are joined with
        single spaces. Matching is exact (case-sensitive) string comparison
        against the entries of ``stopwords.words('english')``.
        """
        # Load the stopword list once per call and use a set, instead of
        # re-reading the corpus and scanning a list for every single word.
        english_stopwords = set(stopwords.words('english'))
        return " ".join(
            word for word in s.split() if word not in english_stopwords
        )

    # @TODO: Need to remove this function if not used
    def get_documents_meta(self, songs_list):
        """Build one ``[index, lyrics, _id, title, url]`` row per song.

        Each element of ``songs_list`` must support lookup of the keys
        ``'index'``, ``'lyrics'``, ``'_id'``, ``'title'`` and ``'url'``;
        a missing key raises ``KeyError``. Only the lyrics are used as the
        document text (title and artist are not concatenated in).
        """
        return [
            [song['index'], song['lyrics'], song['_id'], song['title'], song['url']]
            for song in songs_list
        ]

    def generate_dict_fromlist(self, dict_list):
        """Merge a list of dicts into one dict, dropping the ``'_id'`` key.

        Later dicts win when keys collide. The input dicts are left
        untouched (``'_id'`` is removed from the merged result only).
        """
        dict_consolidated = {}
        for current in dict_list:
            dict_consolidated.update(current)
        # Strip '_id' from the merged copy rather than from the caller's dicts.
        dict_consolidated.pop('_id', None)
        return dict_consolidated

    # Create chunks of the big dictionary based on the size passed
    def chunks(self, data, SIZE=10000):
        """Yield successive sub-dicts of ``data`` holding up to ``SIZE`` items.

        Keys are taken in ``data``'s iteration order; the last chunk may be
        smaller. Nothing is yielded for an empty ``data``.
        """
        key_iter = iter(data)
        # One pass of the range per chunk; islice advances the shared
        # iterator, so every chunk picks up where the previous one stopped.
        for _ in range(0, len(data), SIZE):
            yield {key: data[key] for key in islice(key_iter, SIZE)}