ocrstream.py
import logging
from time import sleep
from multiprocessing.pool import Pool

from sources.transform import find_failed_conversions, flat_convert, run_ocr, terminate_finereader
from sources.extract import extract_source_files
from sources.load import load_to_corpus, parse_order_data

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


class MyPipeline(object):
    """Custom class for chaining processing steps.

    Contains the implementation for pooling workers. The purpose of this class is to provide a graph of the
    different operations performed on unstructured data.
    """

    def __init__(self):
        self.TIMEOUT = 60
        self.SLEEP = 10
        self.BATCH_SIZE = 50
        self.cache = dict()
        self.PROJECT_DATABASE_EXPORT = str()

    def get_order_data(self):
        """Pre-process nested order data from the ERP export."""
        self.PROJECT_DATABASE_EXPORT = parse_order_data()

    def extract_files(self):
        """Copy the project source files listed in the TW export to the target directory (TOP DIR).

        Reads self.PROJECT_DATABASE_EXPORT, the path to the TW CSV export with the following columns:
        addressID, auftragsDatum, projektNR, D160_prt_D161_AUFTRAG_POS__::_kc_spp_ID
        """
        self.cache = extract_source_files(self.PROJECT_DATABASE_EXPORT)

    @staticmethod
    def convert():
        """Fetch the extracted files and convert them to TXT files.

        The method uses the PowerGrep converter out of the box. It is a "flat" conversion because it normalizes
        the path to <CLIENT>/<PROJECT-ID>-<CLIENT-ID>/<FILENAME>.
        """
        flat_convert()

    def ocr(self):
        """Run OCR on every file whose conversion produced no text, in batches of BATCH_SIZE."""
        fails = find_failed_conversions()

        def batch(my_list, n=1):
            """Slice a list of strings into n-sized batches."""
            length = len(my_list)
            for ndx in range(0, length, n):
                yield my_list[ndx:min(ndx + n, length)]
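        # For example, list(batch(['a', 'b', 'c', 'd', 'e'], 2)) returns [['a', 'b'], ['c', 'd'], ['e']].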
        for x in batch(fails, self.BATCH_SIZE):
            self.batch_workers(x)
        logging.info("No. of empty files: {}.".format(len(find_failed_conversions())))

    def batch_workers(self, fails):
        """Spawn one OCR worker per failed document and wait for FineReader to finish or time out."""
        pool = Pool(processes=len(fails))
        # Spawn a new worker for each doc that requires OCR processing;
        # the pool is sized to the number of failed documents in this batch
        for i in range(len(fails)):
            pool.apply_async(run_ocr, args=(fails[i], self.cache))
            # Add sleep time between launches to prevent licensing error messages
            sleep(self.SLEEP)
        pool.close()
        print("Waiting for FineReader to finish…", end='')
        cnt = 0
        while len(find_failed_conversions()) > 0 and cnt < self.TIMEOUT:
            print(".", end='')
            sleep(self.SLEEP)
            cnt += self.SLEEP
        terminate_finereader()
        sleep(1)  # Testing
        pool.join()
        logging.info("All processes joined")

    def load(self):
        """Load the converted TXT files into the corpus."""
        load_to_corpus(self.cache)


if __name__ == '__main__':
    p = MyPipeline()
    p.get_order_data()
    p.extract_files()
    p.convert()
    p.ocr()
    p.load()