# Homework 2, 4.2
# imports -------------------------------------------------------------------#
import os
import re
import csv
from collections import Counter
# get desired document text for preprocessing & analysis --------------------#
# read all documents in the "train" directory into one corpus string
directory_path = 'movies/train'
corpus_train = ""
for filename in os.listdir(directory_path):
    filepath = os.path.join(directory_path, filename)
    if os.path.isfile(filepath):
        with open(filepath, 'r') as f:  # context manager closes each file
            corpus_train += f.read() + ' '
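# corpus_train now holds every training document concatenated into one
# space-separated string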
# preprocessing: tokenize ---------------------------------------------------#
def heavy_norm_tokenizer(text):
    text = re.sub('\n', ' ', text)           # replace newline characters with spaces
    text = re.sub('\'', ' \'', text)         # split off clitics (e.g. "haven't" -> "haven 't")
    words = re.sub(r'[^\w\s\']', ' ', text)  # replace all other punctuation with spaces
    normalized = words.lower()               # case-fold
    tokens = normalized.split(' ')           # tokenize on spaces
    tokens = [tok for tok in tokens if tok != '']  # remove empty strings
    return tokens
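# A quick sanity check of the tokenizer on a made-up sentence (hypothetical
# input, not drawn from the corpus) -- the apostrophe rule keeps clitics as
# separate tokens rather than deleting them:
#   heavy_norm_tokenizer("Don't stop!")  ->  ['don', "'t", 'stop']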
tokens = heavy_norm_tokenizer(corpus_train)
# analysis: build and count n-grams -----------------------------------------#
def create_ngrams(n, tokens):
    ngram_list = []
    for tok_location, tok in enumerate(tokens):
        if tok_location <= len(tokens) - n:  # stop at the last valid start index for an n-gram
            ngram = ''
            for i in range(n):  # e.g. n=3 -> offsets 0, 1, 2
                ngram += tokens[tok_location + i]
                ngram += ' '
            ngram_list.append(ngram)
    return ngram_list
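# Example with toy tokens (hypothetical, for illustration only):
#   create_ngrams(2, ['a', 'b', 'c'])  ->  ['a b ', 'b c ']
# note that each n-gram string keeps a trailing space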
def count_ngrams(all_tokens, unique_tokens):
    # Counter tallies all frequencies in a single pass; calling
    # all_tokens.count(token) once per unique token would be quadratic
    counts = Counter(all_tokens)
    vocab_counts_over25 = []
    occurs_once = []
    occurs_twice = []
    occurs_thrice = []
    occurs_fourtimes = []
    for token in unique_tokens:
        tokcount = counts[token]
        if tokcount >= 25:
            vocab_counts_over25.append((tokcount, token))
        elif tokcount == 1:
            occurs_once.append(token)
        elif tokcount == 2:
            occurs_twice.append(token)
        elif tokcount == 3:
            occurs_thrice.append(token)
        elif tokcount == 4:
            occurs_fourtimes.append(token)
    return [vocab_counts_over25, occurs_once, occurs_twice, occurs_thrice, occurs_fourtimes]
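# The low-frequency buckets could also be derived from the Counter alone;
# a minimal sketch (variable names here are illustrative, not part of the
# assignment):
#   counts = Counter(all_unigrams)
#   occurs_once = [tok for tok, c in counts.items() if c == 1]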
# count unique n-grams ----------------------
print("Unique n-gram counts for n = 1, 2, 3:")
all_unigrams = create_ngrams(1,tokens)
unique_unigrams = list(set(all_unigrams))
print(len(unique_unigrams))
all_bigrams = create_ngrams(2,tokens)
unique_bigrams = list(set(all_bigrams))
print(len(unique_bigrams))
all_trigrams = create_ngrams(3,tokens)
unique_trigrams = list(set(all_trigrams))
print(len(unique_trigrams))
# vocabulary frequency buckets for unigrams--------------------
count_lists = count_ngrams(all_unigrams,unique_unigrams)
print("Occurs once: "+str(len(count_lists[1])))
print("Occurs twice: "+str(len(count_lists[2])))
print("Occurs thrice: "+str(len(count_lists[3])))
print("Occurs four times: "+str(len(count_lists[4])))
# get the 10 most frequent --------------------------
vocab_counts_over25 = count_lists[0]  # list of (tokcount, token) tuples
def getkey(item):  # helper: sort key is the count (first tuple element)
    return item[0]
# sort in descending order of count
sorted_vocab = sorted(vocab_counts_over25, key=getkey, reverse=True)
print("Top 10 occurring unigrams:")
print(sorted_vocab[:10])
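# An equivalent, arguably more idiomatic sort key (optional alternative):
#   from operator import itemgetter
#   sorted_vocab = sorted(vocab_counts_over25, key=itemgetter(0), reverse=True)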
# print vocabulary (count >= 25) with frequencies to a csv file
with open("vocab25.csv", "w", newline="") as the_file:  # newline="" avoids blank rows on Windows
    writer = csv.writer(the_file)
    for tup in vocab_counts_over25:
        writer.writerow(tup)
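# To read the file back for a quick check (hypothetical snippet, not part of
# the assignment; csv.reader returns every field as a string):
#   with open("vocab25.csv") as f:
#       rows = list(csv.reader(f))  # each row is ['<count>', '<token>']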