-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathlangid.py
executable file
·83 lines (68 loc) · 2.16 KB
/
langid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python
import os, sys, web, math, random, io, kenlm
#!/usr/bin/env python
from collections import defaultdict
import train
# NOTE(review): pending cleanup carried over from the original author:
# "remove qu, br, ht or (be, tl)". Presumably these are language codes to
# drop from the training data -- confirm against train.py.

# Corpus codes; one model file and one test corpus file exist per entry.
corp = train.corp
# Source data for the `langs` lookup table below.
valid = train.valid
# Lookup table keyed by the first element of a train.language() result.
langs = dict(valid)
def test():
    """Score the loaded models against each test corpus.

    For every code ``c`` in ``corp`` the file ``testcorpus/<c>`` is read as
    UTF-8 and split on whitespace. Up to 1000 distinct start positions are
    drawn at random, and from each one a window of 1 to 24 consecutive
    tokens is joined with spaces and classified with ``train.language``
    using the module-level ``models``.

    Returns:
        dict: maps each corpus code to a ``(right, wrong)`` tuple, where
        ``right`` counts windows whose first result element equals the
        corpus code and ``wrong`` counts the rest.
    """
    counts = {}
    for c in corp:
        # The context manager closes the corpus file once it has been read.
        with io.open('testcorpus/' + c, encoding='utf-8') as handle:
            # NOTE: Chinese is logographic, so whitespace splitting is a
            # poor tokenization for it (limitation noted by the original
            # author).
            tokens = handle.read().split()

        right = 0
        wrong = 0
        # Start positions that leave room for the longest (24-token) window.
        starts = range(1, len(tokens) - 23)
        # Cap the sample size so a short corpus yields fewer samples
        # instead of making random.sample raise ValueError.
        for i in random.sample(starts, min(1000, len(starts))):
            length = random.randint(1, 24)
            randogram = tokens[i:i + length]
            ans = train.language(models, ' '.join(randogram))[0]
            if ans != c:
                wrong += 1
            else:
                right += 1
        counts[c] = (right, wrong)
    return counts
'''
def GET(self, name):
results = train.language(models,name)
return (langs[results[0]],results[1])
'''
# Load one KenLM model per corpus code from lm/<code>.binary, keeping the
# same order as `corp`.
models = [kenlm.LanguageModel('lm/' + code + ".binary") for code in corp]
# Report how many models were loaded.
print(len(models))
#!flask/bin/python
from flask import Flask
# Flask application object that the route below is registered on.
app = Flask(__name__)
@app.route('/language/<path:text>')
def index(text):
    """Classify the text captured from the URL and return the result.

    The text is lower-cased and passed to ``train.language`` together with
    the module-level ``models``. The models and the ``langs`` lookup table
    are loaded once at import time and reused here, rather than being
    reloaded from disk on every request.

    Args:
        text: the portion of the URL matched by ``<path:text>``.

    Returns:
        str: ``"(<name>,<value>)"`` where ``<name>`` is
        ``langs[results[0]]`` and ``<value>`` is ``str(results[1])``
        (presumably a score -- confirm against train.language).
    """
    results = train.language(models, text.lower())
    # Plain concatenation is kept so the response text stays exactly as before.
    return "(" + langs[results[0]] + "," + str(results[1]) + ")"
# Script entry point: start the Flask server only when this file is run
# directly, not when it is imported.
# NOTE(review): debug=True turns on Flask's debug mode; confirm this is
# never used for a publicly reachable deployment.
if __name__ == '__main__':
    app.run(debug=True)
'''
import web
urls = (
'/(.*)', 'language'
)
app = web.application(urls, globals())
class language:
def GET(self, name):
valid = train.valid
corp = train.corp
models = map(lambda code: kenlm.LanguageModel('lm/' + code + ".binary"), corp)
langs = dict(valid)
name = name.lower()
results = train.language(models,name)
persistent = results
return (langs[results[0]],results[1])
if __name__ == "__main__":
app.run()
'''