gene.py
#!/usr/bin/python -OOS
# coding=utf-8
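
# Summary (added for orientation; inferred from the argparse description and
# the code below): this script reads a breakdown table (characters decomposed
# into roots) and a root grouping, and writes a code book for a Chinese input
# method in which a character's code combines its compressed Pin-Yin with one
# key per root group. See the command-line options below for the files used.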
from sys import exit
from sys import stderr
import argparse
parser = argparse.ArgumentParser(
    description='Generate code book for Chinese input method')
parser.add_argument('-b', '--breakdown_file',
                    help='input file breaking characters down to roots',
                    default='breakdown.txt')
parser.add_argument('-c', '--code_file',
                    help='output codes to this file',
                    default='code.txt')
parser.add_argument('-d', '--ding_gong',
                    help='starting code position to push the first-choice character on screen without pressing space bar',
                    type=int, default=0)
parser.add_argument('-f', '--freq_file',
                    help='output root frequencies to this file',
                    default='freq.txt')
parser.add_argument('-g', '--group_file',
                    help='input file grouping roots',
                    default='group.txt')
parser.add_argument('-m', '--max_code_length',
                    help='maximum code length',
                    type=int, default=2)
parser.add_argument('-p', '--pad_with_pin_yin',
                    help='add Pin-Yin for code length below the maximum',
                    type=int, default=6)
parser.add_argument('-s', '--simple_code_length',
                    help='generate simple codes up to this length',
                    type=int, default=0)
parser.add_argument('-o', '--optimize',
                    help='optimize for root groups, '
                         'taking "all" or a comma-separated list of roots')
parser.add_argument('-n', '--new_group_file',
                    help='output optimized groups to this file',
                    default='group')
args = parser.parse_args()

with open(args.breakdown_file) as f:
    lines = f.readlines()
for i in range(len(lines)):
    if lines[i].strip() == '#START':
        break
else:
    print 'The character table must start with a "#START" line'
    exit(1)

#an B0B6 岸 山(厂干ZS)SX
#ai DEDF 捱 扌[涯右]
#biao ECAE 飚 风(火(火火)SX)ZX
#sui CBE8 髓 骨(辶(左月SX)ZX)
def parse_breakdown(line, i):
    breakdown = []
    while i < len(line):
        if ord(line[i]) >= 240:  # 4-byte character
            breakdown += [line[i:i+4]]
            i += 4
        elif ord(line[i]) >= 224:  # 3-byte character
            breakdown += [line[i:i+3]]
            i += 3
        elif line[i] == '[':  # non-character
            j = i + 1
            while line[j] != ']':
                j += 1
                if j >= len(line):
                    return '', i, 'mismatched ['
            breakdown += [line[i:j+1]]
            i = j + 1
        elif line[i] == '(':  # structure
            part, i, err = parse_breakdown(line, i+1)
            if err != '':
                return part, i, err
            assert line[i] == ')', 'mismatched ('
            i += 1
            breakdown += [part]
        elif line[i] == ')':  # structure
            return breakdown, i, ''
        elif line[i] == '?':  # unknown
            return '', i, 'unknown breakdown'
        elif line[i].isalpha() or line[i] in ['{', '}']:
            i += 1
        else:
            return '', i, 'invalid character %s' % line[i]
    return breakdown, i, ''
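
# Illustrative example (not from the original source): for the 岸 entry shown
# above, parse_breakdown('山(厂干ZS)SX', 0) on the Python 2 byte string would
# return (['山', ['厂', '干']], len('山(厂干ZS)SX'), ''); the ASCII structure
# hints (ZS, SX) are skipped and parenthesized groups become nested lists.
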
def reverse_order(breakdown, special_roots):
    if breakdown[0] in special_roots:
        breakdown = breakdown[1:] + breakdown[:1]
    return [reverse_order(part, special_roots) if isinstance(part, list)
            else part for part in breakdown]

def traverse(breakdown):
    flat = []
    for part in breakdown:
        flat += traverse(part) if isinstance(part, list) else [part]
    return flat
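
# Illustrative (hypothetical breakdown): reverse_order(['辶', '米'], ['辶', '廴'])
# gives ['米', '辶'], i.e. an enclosing 辶/廴 radical is moved after the part it
# wraps, and traverse() then flattens any nested structure lists into a single
# sequence of roots.
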
sp_map = {
    'iu': 'q',
    'en': 'w',
    'e': 'e',
    'zh': 'e',
    'ou': 'r',
    'ui': 't',
    'ue': 't',
    'ue:': 't',
    'uan': 'y',
    'u': 'u',
    'sh': 'u',
    'i': 'i',
    'ch': 'i',
    'o': 'o',
    'uo': 'o',
    'un': 'p',
    'a': 'a',
    'ao': 's',
    'ang': 'd',
    'eng': 'f',
    'ng': 'f',
    'ing': 'g',
    'er': 'f',
    'ong': 'h',
    'iong': 'h',
    'ai': 'j',
    'an': 'k',
    'ian': 'l',
    'uai': 'l',
    'iang': ';',
    'uang': ';',
    'ua': 'x',
    'ia': 'x',
    'ie': 'c',
    'u:': 'v',
    'in': 'b',
    'iao': 'n',
    'ei': 'm',
    '': '',
}

def convert_py_to_sp(py):
    if py[0] in 'aoe' or py == 'ng':
        return 'j' + sp_map[py]
    if py[0] in 'zcs' and py[1] == 'h':
        if args.ding_gong >= 2:
            return py[0] + sp_map[py[2:]]
        else:
            return sp_map[py[0:2]] + sp_map[py[2:]]
    return py[0:1] + sp_map[py[1:]]
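
# Illustrative examples, assuming the default --ding_gong=0 (sp_map and
# convert_py_to_sp implement a Shuangpin-style compression of a Pin-Yin
# syllable into at most two keys):
#   convert_py_to_sp('an')    -> 'jk'   (zero-initial syllables are prefixed with 'j')
#   convert_py_to_sp('zhang') -> 'ed'   ('zh' -> 'e', 'ang' -> 'd')
#   convert_py_to_sp('ma')    -> 'ma'   (initial kept, final 'a' -> 'a')
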
lines = lines[i+1:]
dict = {}
pinyin = {}
incomplete = []
for line in lines:
    line = line.strip()
    # Skip empty line or comment.
    if line == '' or line[0] == '#':
        continue
    # Skip a line that doesn't define a character.
    item = line.split()
    if len(item) < 3:
        continue
    # Extract Pin-Yin if it's a character.
    p = item[0]
    c = item[2]
    if p[0].isalpha():
        p = p[:args.pad_with_pin_yin]
        if not pinyin.has_key(c):
            pinyin[c] = [p]
        elif not p in pinyin[c]:
            pinyin[c] += [p]
    # Handle a character whose breakdown is not defined. Can be a root.
    if len(item) < 4:
        if not dict.has_key(c):
            dict[c] = ''
        continue
    # Merge the rest of the items into one string that defines the breakdown.
    d = item[3]
    for i in range(4, len(item)):
        d += item[i]
    breakdown, i, err = parse_breakdown(d, 0)
    if err != '':
        stderr.write('%s at: %s\n' % (err, line))
        exit(1)
    if i != len(d):
        stderr.write('Incomplete parse at: %s\n' % line)
        exit(1)
    breakdown = reverse_order(breakdown, ['辶', '廴'])
    breakdown = traverse(breakdown)
    # Mark a reference to the part if it's not yet defined.
    for part in breakdown:
        if not dict.has_key(part):
            dict[part] = ''
    # The character has been seen before.
    if dict.has_key(c) and dict[c] != '':
        if breakdown != dict[c]:
            print 'Different breakdown for %s: %s, %s.' % (c,
                ''.join(dict[c]), ''.join(breakdown))
        continue
    dict[c] = breakdown
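
# At this point dict maps every character (and every root referenced in a
# breakdown) to its flattened list of parts, or to '' when no breakdown was
# given, and pinyin maps each character to the Pin-Yin readings seen for it.
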
new_group = True
num_groups = 0
root_group = {} # map from roots to their group indexes
group_rep = [] # roots representing each group
dup_roots = [] # roots defined multiple times
# Read groups from group file.
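# Each non-empty, non-comment line appears to hold one group: the first token
# is the group's key, the remaining tokens are roots, numeric tokens
# (frequencies) are skipped, and a blank line starts a new group.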
with open(args.group_file) as f:
    lines = f.readlines()
for line in lines:
    if line[0] == '#':
        continue
    roots = line.split()
    if roots == []:
        new_group = True
        continue
    if new_group:
        num_groups += 1
    for root in roots[1:]:
        # Skip frequency at the end if any.
        if root[0] in '1234567890':
            continue
        if root_group.has_key(root):
            dup_roots += [root]
            continue
        root_group[root] = num_groups - 1
        if new_group:
            group_rep += [root]
            new_group = False

# Fail when there are undefined roots.
undefined_roots = [c for c, roots in dict.items()
                   if roots == '' and not root_group.has_key(c)]
if undefined_roots != []:
    stderr.write('Undefined roots: ' + ' '.join(undefined_roots) + '\n')
    exit(1)

# Warn about duplicate roots.
if dup_roots != []:
    stderr.write('Duplicate roots: ' + ' '.join(dup_roots) + '\n')

# Recursively break a character down to roots.
def break_character(c):
    if dict[c] == '':
        return [c]
    roots = []
    for part in dict[c]:
        if dict.has_key(part):
            roots += break_character(part)
    return roots
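
# Illustrative (hypothetical entries): if dict['好'] were ['女', '子'] and both
# of those roots had no further breakdown, break_character('好') would return
# ['女', '子'].
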
# Remove minor (single-stroke) roots since they carry less info.
minor_roots = ['丶']
def remove_minor_roots(roots):
    new_roots = roots[:0]
    for root in roots[0:]:
        if root not in minor_roots:
            new_roots += [root]
    return new_roots

# Generate flat dictionary where characters are broken down to roots.
flat_dict = {}
root_freq = {}
for c in dict.keys():
    if c in incomplete or c[0] == '[':
        continue
    roots = break_character(c)
    if len(roots) > args.max_code_length:
        roots = remove_minor_roots(roots)
    if len(roots) > args.max_code_length:
        if args.max_code_length == 1:
            roots = roots[:1]
        else:
            roots = roots[:args.max_code_length-1] + roots[-1:]
    flat_dict[c] = roots
    for root in roots:
        if root_freq.has_key(root):
            root_freq[root] += 1
        else:
            root_freq[root] = 1

# Output root frequencies to a file.
freq_list = [[root, freq] for root, freq in root_freq.items()]
freq_list.sort(key=lambda x: x[1], reverse=True)
with open(args.freq_file, 'w') as f:
    for root, freq in freq_list:
        f.write('%s\t%d\n' % (root, freq))

# Warn about unused roots.
unused_roots = [root for root in root_group if root not in root_freq.keys()]
if unused_roots != []:
    stderr.write('Unused roots: ' + ' '.join(unused_roots) + '\n')

# Read table of character frequencies.
with open('char_freq.txt') as f:
    lines = f.readlines()
char_freq = {}
char_freq_py = {}
for line in lines:
    item = line.split()
    char_freq[item[0]] = int(item[1])
    if len(item) >= 3:
        char_freq_py[item[0]] = convert_py_to_sp(item[2])
for c in flat_dict:
    if not char_freq.has_key(c):
        char_freq[c] = 0

# Keys used for encoding roots.
code_keys = 'hjkl;trewqgfdsxazvcyuiopbnm'
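# Group i is typed with code_keys[i]; the string begins with the right-hand
# home row, so (presumably) the lowest-numbered groups get the easiest keys.
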
def convert_to_ding_gong(code):
    reduced_code = code[:args.ding_gong - 2]
    for ch in code[args.ding_gong - 2:]:
        if ch in 'qwert':
            reduced_code += 'e'
        elif ch in 'uiop':
            reduced_code += 'i'
        elif ch in 'y':
            reduced_code += 'o'
        elif ch in 'asdfg':
            reduced_code += 'a'
        elif ch in 'hjkl;':
            reduced_code += ';'
        elif ch in 'zxcv':
            reduced_code += 'v'
        else:
            reduced_code += 'u'
    return reduced_code
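
# Illustrative, with --ding_gong=2: the whole shape code is folded onto one
# representative key per keyboard region, e.g. convert_to_ding_gong('kd')
# -> ';a' ('k' lies in 'hjkl;', 'd' in 'asdfg'); with larger --ding_gong
# values the first ding_gong-2 keys are kept unchanged.
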
# Generate the code book.
def generate_code_book():
    for c, roots in flat_dict.items():
        if not pinyin.has_key(c):
            continue
        code = ''
        for root in roots:
            code += code_keys[root_group[root]]
        plus = pinyin[c]
        for p in plus:
            if args.ding_gong >= 2:
                code_plus = convert_py_to_sp(p) + convert_to_ding_gong(code)
                while len(code_plus) < args.max_code_length + 2:
                    code_plus += 'o'
            else:
                code_plus = convert_py_to_sp(p) + code
            if not code_book.has_key(code_plus):
                code_book[code_plus] = [c]
            elif not c in code_book[code_plus]:
                code_book[code_plus] += [c]
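
# A full code is thus the compressed Pin-Yin of one reading followed by one
# key per root group; with --ding_gong >= 2 the shape part is first reduced by
# convert_to_ding_gong and then padded with 'o' up to max_code_length + 2 keys.
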
def get_char_py_freq(char, py):
    freq = char_freq[char]
    if not char_freq_py.has_key(char) or char_freq_py[char][:2] == py[:2]:
        return freq
    else:
        return freq / 20

# Generate simple codes based on character frequencies.
def generate_simple_codes():
    for length in range(1, args.simple_code_length+1):
        simple_code_candidates = {}
        for code, characters in code_book.items():
            if len(code) <= length:
                continue
            simple_code = code[0:length]
            if code_book.has_key(simple_code):
                continue
            if not simple_code_candidates.has_key(simple_code):
                simple_code_candidates[simple_code] = []
            for character in characters:
                simple_code_candidates[simple_code] += [(code, character)]
        for simple_code, candidates in simple_code_candidates.items():
            # Reduce the frequency if the Pin-Yin doesn't match what's
            # specified in the frequency table.
            candidates.sort(key=lambda c: get_char_py_freq(c[1], c[0]), reverse=True)
            code = candidates[0][0]
            character = candidates[0][1]
            simple_code_book[simple_code] = character
            code_book[code].remove(character)
            if len(code_book[code]) == 0:
                del code_book[code]
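
# Illustrative (hypothetical codes): with --simple_code_length=1, if no full
# code is exactly 'j' and the best candidate whose code starts with 'j'
# (highest frequency after the Pin-Yin penalty above) has the full code
# 'jkab', that character gets the simple code 'j' and is removed from the
# 'jkab' entry.
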
# Output code book to a file.
def output_code_book():
    with open(args.code_file, 'w') as f:
        for code in sorted(simple_code_book):
            f.write('%-4s\t%s\t1\n' % (code, simple_code_book[code]))
        for code in sorted(code_book.keys()):
            characters = code_book[code]
            characters.sort(key=lambda c: char_freq[c], reverse=True)
            for c in characters:
                f.write('%-4s\t%s\t%d\n' % (code, c, len(characters)))

def count_dups():
    num_dups = 0
    for code in sorted(code_book.keys()):
        characters = code_book[code]
        # Don't count dups for code length 1 because many of them are
        # non-characters.
        if len(code) <= 1:
            continue
        if len(characters) >= 2:
            num_dups += len(characters)
    return num_dups
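
# code_book maps each full code to its candidate characters and
# simple_code_book maps each abbreviated code to a single character; both are
# rebuilt from scratch whenever the root grouping changes during optimization.
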
code_book = {}
generate_code_book()
simple_code_book = {}
generate_simple_codes()
output_code_book()
num_dups = count_dups()
# "Number of coding keys: %d; characters with duplicate codes: %d"
print '编码键数: %d 重码字数: %d' % (num_groups, num_dups)

# See if optimization for root grouping is requested.
if args.optimize is None:
    exit(0)
elif args.optimize == 'all':
    roots_to_optimize = root_freq.keys()
else:
    roots_to_optimize = list(set(args.optimize.split(',')) - set(unused_roots))
non_roots = list(set(roots_to_optimize) - set(root_group.keys()))
if non_roots != []:
    stderr.write('Cannot optimize for non-root: %s\n' % ' '.join(non_roots))
    exit(1)

# Rebuild the code book and count duplicate codes for the current grouping.
def evaluate_dups():
    global code_book
    code_book = {}
    generate_code_book()
    global simple_code_book
    global char_simple_code
    simple_code_book = {}
    char_simple_code = {}
    generate_simple_codes()
    return count_dups()

# Sort roots by frequency. Optimize for frequently used roots first.
roots_to_optimize.sort(key=lambda r: root_freq[r], reverse=True)
for root in roots_to_optimize:
    min_dups = num_dups
    min_group = root_group[root]
    for target_group in range(num_groups):
        root_group[root] = target_group
        new_dups = evaluate_dups()
        if len(roots_to_optimize) == 1:
            print code_keys[target_group], group_rep[target_group], new_dups
        if new_dups < min_dups:
            min_dups = new_dups
            min_group = target_group
    root_group[root] = min_group
    num_dups = min_dups
    print root, '->', code_keys[min_group], group_rep[min_group], min_dups

# Output optimized groups to a file.
with open(args.new_group_file, 'w') as f:
    for g in range(num_groups):
        roots = [root for root, freq in freq_list if root_group[root] == g]
        freqs = [freq for root, freq in freq_list if root_group[root] == g]
        f.write('%c %s %d\n\n' % (code_keys[g], ' '.join(roots), sum(freqs)))