-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_import.py
126 lines (114 loc) · 5.02 KB
/
data_import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import re
import math
import pandas as pd
import sys
from bs4 import BeautifulSoup
import os
import pdfminer.high_level
import pdfminer.layout
def importCSV(csvfile):
    """Load a whitespace-delimited text table into a DataFrame.

    The file is opened as UTF-8 text. Its first line is used as the column
    header, and any run of whitespace (spaces or tabs) separates fields.

    Parameters
    ----------
    csvfile : str or path-like
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    with open(csvfile, mode='r', encoding='utf-8') as handle:
        # Raw string: '\s' in a plain literal is an invalid escape sequence
        # and triggers a SyntaxWarning on recent Python versions.
        return pd.read_csv(handle, sep=r'\s+', header=0)
def importPDF(pdffile, outputXML='out.xml', outputCSV='out.csv'):
    """Extract a blood-pressure table from a PDF into a DataFrame and a CSV.

    The PDF is first dumped to an XML layout file with pdfminer. The XML is
    then scanned character by character: consecutive digit characters are
    accumulated into a number, and a number with at least two digits and a
    value above 17 is stored when it is terminated by a non-digit character.
    A number followed by ``t`` (as in "th") or ``%`` is discarded. Stored
    numbers are grouped per page and per column, where a column is identified
    by the floored right x-coordinate of the terminating character's bbox.

    For every page the columns are sorted by x-position and named with the
    entries of the internal column list starting at ``BPSys_5Hp``. Rows are
    labelled in groups of four with the percentiles 50, 90, 95 and 99 and a
    running age that starts at 1 on odd page ids and at 11 on even page ids.
    Pages 1 and 2 are tagged with gender ``'M'``, all other pages with
    ``'F'``.

    Parameters
    ----------
    pdffile : str
        Path of the PDF file to read.
    outputXML : str, optional
        Path of the intermediate XML file; an existing file is replaced.
    outputCSV : str, optional
        Path of the CSV file to write; an existing file is replaced.

    Returns
    -------
    pandas.DataFrame
        The combined table of all pages, which is also written to outputCSV.
    """

    def PDFtoXML(pdffile):
        # Dump the PDF layout as XML into outputXML, replacing any old file.
        if os.path.exists(outputXML):
            os.remove(outputXML)
        laparams = pdfminer.layout.LAParams()
        # Both handles are context-managed so the XML is flushed and closed
        # before it is read back below.
        with open(outputXML, mode='wb') as out, open(pdffile, mode='rb') as fp:
            pdfminer.high_level.extract_text_to_fp(
                inf=fp, outfp=out, output_type='xml', laparams=laparams)

    def updateColumns(columns, page, position, value):
        # Append int(value) to columns[page][position], creating the page
        # and column entries on first use.
        value = int(value)
        columns.setdefault(page, {}).setdefault(position, []).append(value)
        return columns

    PDFtoXML(pdffile)
    colsOnPages = {}
    probVal = False
    columns = [
        'Gender', 'Age', 'BP_percentile',
        'BPSys_5Hp', 'BPSys_10Hp', 'BPSys_25Hp', 'BPSys_50Hp',
        'BPSys_75Hp', 'BPSys_90Hp', 'BPSys_95Hp',
        'BPDia_5Hp', 'BPDia_10Hp', 'BPDia_25Hp', 'BPDia_50Hp',
        'BPDia_75Hp', 'BPDia_90Hp', 'BPDia_95Hp',
    ]
    BPdb = pd.DataFrame()

    with open(outputXML, 'r', encoding='utf-8') as XMLinput:
        xml = BeautifulSoup(XMLinput, 'html.parser')
        pages = xml.find_all('page')
        # Digit accumulator; it is deliberately shared across text lines,
        # text boxes and pages, exactly as the characters are streamed.
        val = ''
        for page in pages:
            for textbox in page.find_all('textbox', recursive=False):
                for textline in textbox.find_all('textline'):
                    for text in textline.find_all('text'):
                        chunk = text.string
                        if re.match(r'\d', chunk):
                            # Another digit: extend the current number.
                            val += chunk
                            # Only numbers with two or more digits and a value
                            # above 17 can be a BP reading (BP can not be
                            # lower than 17 mmHg).
                            probVal = len(val) > 1 and int(val) > 17
                        elif chunk in ('t', '%') and val != '':
                            # The number belongs to an ordinal ("th") or a
                            # percentage: drop it and skip the rest of the
                            # text line.
                            val = ''
                            probVal = False
                            break
                        elif re.match(r'\D', chunk):
                            if probVal:
                                # bbox is "x0,y0,x1,y1"; parse it explicitly
                                # instead of eval() because the attribute
                                # originates from the PDF content.
                                # NOTE(review): a terminating element without
                                # a bbox attribute raises KeyError here -
                                # confirm the XML always provides one.
                                right_edge = float(text['bbox'].split(',')[2])
                                colsOnPages = updateColumns(
                                    colsOnPages, int(page['id']),
                                    math.floor(right_edge), val)
                                probVal = False
                            # In every case the accumulator starts afresh.
                            val = ''

    percentiles = [50, 90, 95, 99]
    for key in colsOnPages:
        positions = sorted(colsOnPages[key])
        Pct = pd.DataFrame()
        Age = pd.DataFrame()
        BP = pd.DataFrame()
        # Even page ids continue at age 11, odd page ids start at age 1.
        age = [11] if key % 2 == 0 else [1]
        ageBin = 0
        # Columns are named left to right, starting after the three
        # label columns (Gender, Age, BP_percentile).
        for idx, position in enumerate(positions):
            df = pd.DataFrame({columns[idx + 3]: colsOnPages[key][position]})
            BP = pd.concat([BP, df], axis=1)
        # Each age occupies four rows, one per percentile.
        while ageBin < len(BP) / 4:
            df = pd.DataFrame({'Age': age * 4})
            age[0] = age[0] + 1
            ageBin += 1
            Age = pd.concat([Age, df], axis=0)
            df = pd.DataFrame({'BP_percentile': percentiles})
            Pct = pd.concat([Pct, df], axis=0)
        Age = Age.reset_index(drop=True)
        Pct = Pct.reset_index(drop=True)
        gen = ['M'] if key in (1, 2) else ['F']
        df = pd.DataFrame({'Gender': gen * len(BP)})
        BP = pd.concat([BP, df], axis=1)
        BP = pd.concat([BP, Age], axis=1)
        BP = pd.concat([BP, Pct], axis=1)
        BPdb = pd.concat([BPdb, BP], axis=0)

    BPdb = BPdb.reset_index(drop=True)
    if os.path.exists(outputCSV):
        os.remove(outputCSV)
    BPdb.to_csv(outputCSV)
    return BPdb