forked from lijiancheng0614/poem_generator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_start_words.py
86 lines (72 loc) · 2.41 KB
/
get_start_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# -*- coding: utf-8 -*-
import os
import re
import time
import jieba
import codecs
import pickle
import argparse
# strftime pattern used for the START/STOP progress messages.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# Default input and output files live in a "data" folder next to this script.
BASE_FOLDER = os.path.dirname(os.path.abspath(__file__))
DATA_FOLDER = os.path.join(BASE_FOLDER, 'data')
DEFAULT_FIN = os.path.join(DATA_FOLDER, 'poem.txt')
DEFAULT_FOUT = os.path.join(DATA_FOLDER, 'start_words.txt')
def read_data(fin):
    """Count the first segmented word of every other line in a UTF-8 file.

    Lines are processed in alternation: the 1st, 3rd, 5th, ... physical lines
    are skipped (the code treats them as title lines), and for each remaining
    non-blank line the first token produced by ``jieba.cut`` is counted.

    Args:
        fin: Path to the input text file, decoded as UTF-8.

    Returns:
        A dict mapping each first word to the number of lines it started.
    """
    start_words = dict()
    title_flag = False
    # The context manager closes the file even if decoding or segmentation
    # raises part-way through.
    with codecs.open(fin, 'r', 'utf-8') as fd:
        for line in fd:
            line = line.strip()
            # The flag flips on every physical line, blank ones included, so
            # a blank line still consumes its slot in the alternation.
            title_flag = not title_flag
            if title_flag or not line:
                continue
            # Only the first token is needed, so avoid materialising the
            # whole segmentation; skip the line if no token is produced.
            word = next(iter(jieba.cut(line)), None)
            if word is None:
                continue
            start_words[word] = start_words.get(word, 0) + 1
    print('Read data done.')
    return start_words
def write_start_words(fout, start_words, min_count=10, min_length=1):
    """Write the frequent multi-character start words to a UTF-8 file.

    A word is written, one per line, only when its count is strictly greater
    than ``min_count`` and its length is strictly greater than ``min_length``.
    Words are written in the iteration order of ``start_words``.

    Args:
        fout: Path of the output file; it is created or overwritten.
        start_words: Mapping of word to occurrence count.
        min_count: Exclusive lower bound on the count (default 10).
        min_length: Exclusive lower bound on the word length (default 1).
    """
    # The context manager flushes and closes the file even if a write fails.
    with codecs.open(fout, 'w', 'utf-8') as fw:
        for word, count in start_words.items():
            if count > min_count and len(word) > min_length:
                fw.write(word + '\n')
    print('Write start_words done.')
def set_arguments():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` with two string options, ``--fin``
        and ``--fout``, whose defaults are DEFAULT_FIN and DEFAULT_FOUT.
    """
    # (flag, default value, help template) for each supported option.
    option_specs = (
        ('--fin', DEFAULT_FIN, 'Input file path, default is {}'),
        ('--fout', DEFAULT_FOUT,
         'Output start_words file path, default is {}'),
    )
    parser = argparse.ArgumentParser(description='Get topics')
    for flag, default_path, help_template in option_specs:
        parser.add_argument(flag, type=str, default=default_path,
                            help=help_template.format(default_path))
    return parser
if __name__ == '__main__':
    # Script entry point: parse the CLI options, count the start words of the
    # input file and export the filtered list, printing a timestamp before
    # and after the run.
    options = set_arguments().parse_args()
    print('{} START'.format(time.strftime(TIME_FORMAT)))
    write_start_words(options.fout, read_data(options.fin))
    print('{} STOP'.format(time.strftime(TIME_FORMAT)))
# count = [0, 14992, 3091, 1161, 614, 401, 254, 179, 103, 88, 79, 52, 49, 50, 41, 20, 23, 19, 15, 16, 13, 6, 12, 12, 6, 6, 7, 2, 7, 2, 3, 2, 3, 0, 4, 2, 4, 3, 2, 0, 0, 1, 3, 3, 1, 2, 1, 2, 2, 2, 1, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 1, 0, 1, 0, 2, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]
# 闻道 52
# 忆 69
# 十年 55
# 去年 52
# 为 63
# 妾 54
# 何处 74
# 曾 74
# 故人 90
# 欲 73
# 少年 69
# 年 63
# 长安 83
# 君不见 65
# 月 108
# 谁 67
# 上 54
# 吾 75
# 万里 102
# 江南 61
# 我 112
# 洛阳 55
# 去 61