wordSegment.py
# -*- coding: utf-8 -*-
"""
Chinese word segmentation: discover new words in a raw corpus by combining
word frequency, internal aggregation, and left/right neighbor entropy.
Author: Aining Wang
Date: 07/2017
"""
import re

from MathTool import entropy, Counter
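
# MathTool is the author's helper module and is not included here. Judging
# from the call sites below, entropy(counts) takes a list of raw counts and
# returns their Shannon entropy, and Counter(key, d) increments d[key]. The
# sketches below mirror those assumed semantics for reference only; the
# script itself keeps using the MathTool versions.
import math

def _entropy_sketch(counts):
    # Normalize raw counts to probabilities, then return -sum(p * log p).
    total = float(sum(counts))
    if total <= 0:
        return 0.0
    return -sum((c / total) * math.log(c / total) for c in counts if c > 0)

def _counter_sketch(key, freq_dict):
    # Increment the count stored under key in freq_dict.
    freq_dict[key] = freq_dict.get(key, 0) + 1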

WORD_LEN = [1, 2, 3, 4, 5, 6, 7]  # n-gram lengths to count
CANDI_MIN = 2                     # minimum candidate word length
CANDI_MAX = 6                     # maximum candidate word length
ENT_VALUE = 10000                 # entropy threshold (currently unused)
AGG_VALUE = 1000                  # aggregation threshold (currently unused)
SCORE_LIMIT = 10 ** 9             # minimum aggregation * entropy score
FREQ_MIN = 5                      # minimum frequency for a new word
FREQ_MAX = 10                     # maximum frequency for a new word
INPUT_FILE = "AI_news.txt"
STOPWORD_FILE = "stopwords.txt"

class WordInfo:

    def __init__(self, word, freq):
        self.word = word
        self.length = len(word)
        self.freq = freq
        self.aggregation = 0    # internal aggregation degree
        self.left_dict = {}     # {left neighbor char: freq}
        self.right_dict = {}    # {right neighbor char: freq}
        self.entropy = 0        # freedom of use in external contexts
        self.score = 0

    def turnDictToList(self):
        # Collect the neighbor frequencies into plain lists for entropy().
        left_prob_list = list(self.left_dict.values())
        right_prob_list = list(self.right_dict.values())
        return left_prob_list, right_prob_list

    def calculateEntropy(self):
        # A genuine word appears in varied contexts on both sides, so take
        # the smaller of the left- and right-neighbor entropies.
        left_prob_list, right_prob_list = self.turnDictToList()
        self.entropy = min(entropy(left_prob_list), entropy(right_prob_list))

    def calculateAggregation(self, word_freq_dict):
        # For every split of the word into prefix + suffix, compare the
        # word's own count against the product of the parts' counts, and
        # keep the weakest split as the aggregation degree.
        ent_record = []
        for i in range(self.length - 1):
            prefix = self.word[: i + 1]
            suffix = self.word[i + 1 :]
            ent = entropy([word_freq_dict[self.word],
                           word_freq_dict[prefix] * word_freq_dict[suffix]])
            ent_record.append(ent)
        self.aggregation = min(ent_record)
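
# Illustration with hypothetical counts: for a two-character candidate "AB"
# with freq("AB") = 10, freq("A") = 20, and freq("B") = 30,
# calculateAggregation computes entropy([10, 600]). That pair is heavily
# skewed, so its entropy is low; a candidate whose own count is closer in
# magnitude to the product of its parts' counts yields a more balanced pair
# and therefore a higher aggregation value.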

class WordSegment:

    def __init__(self):
        self.candidate_dict = {}    # {word: WordInfo}, lengths CANDI_MIN..CANDI_MAX
        self.text = ""
        self.word_freq = {}         # {n-gram: freq} for every length in WORD_LEN
        self.stopwords_list = []
        self.new_word_list = []
    def getInputText(self):
        # Read the corpus and keep only runs of Chinese characters.
        pattern = re.compile(u"[\u4e00-\u9fff]+")
        with open(INPUT_FILE, "r", encoding="utf-8") as file:
            origin_text = file.read()
        for sentence in pattern.findall(origin_text):
            self.text += sentence
    def countWordFreq(self):
        self.getInputText()
        # Count every n-gram of each configured length.
        for word_length in WORD_LEN:
            print(word_length)
            for i in range(len(self.text) - word_length + 1):
                Counter(self.text[i : i + word_length], self.word_freq)
        # Write the counts out.
        with open("word_freq.txt", "w", encoding="utf-8") as file2:
            for key in self.word_freq:
                file2.write(key + " " + str(self.word_freq[key]) + "\t")
    def setCandidateWord(self):
        # Keep n-grams with freq >= FREQ_MIN and CANDI_MIN <= len <= CANDI_MAX.
        for word in self.word_freq:
            if CANDI_MIN <= len(word) <= CANDI_MAX and self.word_freq[word] >= FREQ_MIN:
                self.candidate_dict[word] = WordInfo(word, self.word_freq[word])
        # Write the candidates out.
        with open("candidate.txt", "w", encoding="utf-8") as file:
            for key in self.candidate_dict:
                file.write(key + " " + str(self.candidate_dict[key].freq) + "\t")
    def deleteStopWord(self):
        # Load the stopword list.
        with open(STOPWORD_FILE, "r", encoding="utf-8") as file:
            for line in file:
                self.stopwords_list.append(line.strip("\n"))
        # Remove every candidate that contains a stopword.
        del_list = []
        for word in self.candidate_dict:
            for stopword in self.stopwords_list:
                if stopword in word:
                    del_list.append(word)
                    break
        for del_word in del_list:
            if del_word in self.candidate_dict:
                del self.candidate_dict[del_word]
    def collectLeftRightDict(self):
        # Every (n+1)-gram that extends a candidate by one character tells us
        # which characters appear next to that candidate and how often: the
        # (n+1)-gram's own frequency is the co-occurrence count.
        for word in self.word_freq:
            if len(word) > 2:
                # word = candidate + right neighbor character
                if word[: -1] in self.candidate_dict:
                    self.candidate_dict[word[: -1]].right_dict[word[-1]] = self.word_freq[word]
                # word = left neighbor character + candidate
                if word[1 :] in self.candidate_dict:
                    self.candidate_dict[word[1 :]].left_dict[word[0]] = self.word_freq[word]
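
    # Example with hypothetical strings: if the corpus contains the 3-grams
    # "xAB" and "ABy", the candidate "AB" ends up with
    # left_dict = {"x": freq("xAB")} and right_dict = {"y": freq("ABy")},
    # which is exactly the input calculateEntropy() consumes.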
    def calculateCandidateScore(self):
        # Score each candidate as aggregation * entropy.
        for word in self.candidate_dict:
            info = self.candidate_dict[word]
            info.calculateEntropy()
            info.calculateAggregation(self.word_freq)
            info.score = info.aggregation * info.entropy
        # Keep candidates in the target frequency band with a high score.
        self.new_word_list = []
        for word in self.candidate_dict:
            info = self.candidate_dict[word]
            if FREQ_MIN <= info.freq <= FREQ_MAX and info.score >= SCORE_LIMIT:
                self.new_word_list.append(word)
    def execute(self):
        self.countWordFreq()
        self.setCandidateWord()
        self.deleteStopWord()
        self.collectLeftRightDict()
        self.calculateCandidateScore()
        # Write the discovered new words out.
        with open("AI_new_word.txt", "w", encoding="utf-8") as file:
            for word in self.new_word_list:
                info = self.candidate_dict[word]
                file.write(word + "\t")
                print(str(info.freq) + "\t" + str(info.aggregation) + "\t" + str(info.entropy))

# test
if __name__ == "__main__":
    c = WordSegment()
    c.execute()
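
# Expected usage (inferred from the constants above): place AI_news.txt and
# stopwords.txt next to this script and run it with Python 3; it writes
# word_freq.txt, candidate.txt, and AI_new_word.txt to the working directory.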