-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathminimum_example.py
100 lines (81 loc) · 2.78 KB
/
minimum_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
an example for using markovify without Spark
"""
import markovify
# Passed as the `state_size` argument to markovify.Text in text_to_model().
STATE_SIZE = 6
# When true, model_to_json() writes the model it is given to disk;
# when false, model_to_json() does nothing.
SAVE_MODELS = True
def split_line(line):
    """Return the abstract portion of one ``file_name,abstract`` CSV line.

    The line is split on the first comma only, so commas inside the
    abstract itself are preserved.

    Args:
        line: one raw line of text from the abstracts CSV.

    Returns:
        Everything after the first comma (trailing newline included, if
        present), or ``None`` when the line has no comma or is not a string.
    """
    try:
        return line.split(',', 1)[1]
    except (AttributeError, IndexError):
        # AttributeError: `line` is not a string (e.g. None).
        # IndexError: no comma, so there is no second field.
        # Best-effort by design: callers treat None as "skip this line".
        return None
def at_least_20_words(text):
    """Tell whether ``text`` holds 20 or more whitespace-separated words.

    ``None`` is accepted and simply yields ``False`` (some inputs arrive
    as ``None``), so no exception handling is required here.
    """
    return text is not None and len(text.split()) >= 20
def text_to_model(text):
    """Train a Markov model on one abstract and return it in serialized form.

    Args:
        text: the abstract to train on.

    Returns:
        A ``("_", model_json)`` tuple where ``model_json`` is the model's
        JSON serialization as a ``str``, or ``None`` when building or
        serializing the model failed.
    """
    try:
        text_model = markovify.Text(text, state_size=STATE_SIZE,
                                    retain_original=False)
        # The model class is not serializable, so extract JSON first;
        # to_json() may not hand back a plain str, hence the coercion.
        model_json = str(text_model.to_json())
    except Exception:
        # TODO FIXME: many articles being lost due to illegal characters.
        # see issue tracker.
        # Deliberately best-effort: a failing abstract is skipped rather
        # than aborting the whole run. `Exception` (not a bare except) so
        # that KeyboardInterrupt / SystemExit still propagate.
        return None
    # TODO: change key for category
    return "_", model_json
def combine_models(models_list):
    """Merge serialized per-abstract models into one serialized model.

    Args:
        models_list: iterable of ``(name, model_json)`` pairs as produced by
            ``text_to_model``. ``None`` entries (abstracts whose model could
            not be built) are skipped; entries that are not 2-item pairs are
            printed and skipped.

    Returns:
        A ``("_", combined_json)`` tuple where ``combined_json`` is the JSON
        serialization (as ``str``) of the combined model.

    Raises:
        ValueError: if ``models_list`` contains no usable model.
    """
    jsons = []
    for entry in models_list:
        if entry is None:
            # text_to_model() returns None for abstracts it failed on.
            continue
        try:
            _name, model_json = entry
        except ValueError:
            # Not a 2-item pair: report it and move on.
            print(entry)
            continue
        jsons.append(model_json)

    if not jsons:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError("combine_models() needs at least one usable model")

    # reconstitute classes from json
    reconstituted_models = [markovify.Text.from_json(json_i) for json_i in jsons]

    if len(reconstituted_models) == 1:
        combined_model = reconstituted_models[0]
    else:
        # Per the markovify documentation, combine() takes a list of models
        # and adds up their (weighted) transition counts. Combining all of
        # them in one call with the default weights gives each abstract
        # equal influence. Folding pairwise while raising the accumulated
        # model's weight each round would re-multiply counts that were
        # already summed, skewing the result toward the first models folded.
        combined_model = markovify.combine(reconstituted_models)

    combined_json = str(combined_model.to_json())
    # TODO: change key for category
    return "_", combined_json
def model_to_json(model):
    """Write a serialized model to ``./energy_physics_models/``.

    Does nothing unless the module-level ``SAVE_MODELS`` flag is true.

    Args:
        model: a ``(model_name, model_json)`` pair; ``model_json`` is written
            verbatim to ``./energy_physics_models/<model_name>_model.json``.
    """
    if not SAVE_MODELS:
        return
    model_name, model_json = model
    path = './energy_physics_models/{}_model.json'.format(model_name)
    # The context manager guarantees the file is flushed and closed, even
    # if the write raises.
    with open(path, 'w') as out_file:
        out_file.write(model_json)
# Build one model per usable abstract, then combine them and save the result.
models = []
with open("./results/Energy_Physics_abstracts.csv") as abstracts:
    # A single lazy pass over the file: re-reading an exhausted file handle
    # yields nothing, so retrying in an outer loop could never find more
    # abstracts and would spin forever on a short file.
    for abstract in abstracts:
        print(len(models))
        abstract = split_line(abstract)
        # Test for None first: len(None) would raise TypeError.
        if abstract is None or len(abstract) <= 20:
            continue
        # text_to_model() may return None; combine_models() skips those.
        models.append(text_to_model(abstract))
        if len(models) > 5:
            break
combined = combine_models(models)
model_to_json(combined)