# RESEARCH AND DEVELOPMENT extraction.py
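# Pipeline overview: download a set of annual-report PDFs, find the table of
# contents in each report, use it to locate the management's discussion and
# analysis section, and save that section's text as
# discussion/<COMPANY>_<YEAR>_DISCUSSION.txt.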
import requests
import pdfplumber
import re
import os
import concurrent.futures
from functions.download_reports import download_annual_reports_parallel
# Directory where reports will be saved
reports_dir = 'reports'
os.makedirs(reports_dir, exist_ok=True)

# List of company PDFs to check
companies = ['NASDAQ_TSLA', 'NASDAQ_AAPL', 'NASDAQ_MSFT', 'NASDAQ_AMZN',
             'NYSE_BRK-A', 'NYSE_PFE', 'NASDAQ_CCBG']
years = ['2022', '2021', '2020', '2019']

download_annual_reports_parallel(companies, years, reports_dir)
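# Note: download_annual_reports_parallel is a project-local helper
# (functions/download_reports.py); the code below assumes it saves each report
# as reports/<COMPANY>_<YEAR>.pdf (e.g. reports/NASDAQ_TSLA_2022.pdf).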
def extract_discussion(text):
    # Unused placeholder; the discussion section is extracted below by
    # working from each report's table of contents instead.
    pass
def is_toc_page(text):
    # Heuristics for spotting a table-of-contents page: ToC headings or
    # dotted leader lines that end in a page number.
    toc_patterns = [
        r'\bcontents\b',
        r'\bindex\b',
        r'\btable of contents\b',
        r'\.\.\.\s+\d+',
        r'[A-Za-z].*\.\.\.\s+\d+'
    ]
    for pattern in toc_patterns:
        if re.search(pattern, text, re.IGNORECASE):
            return True
    return False
def extract_toc(pdf):
    toc_text = ''
    toc_started = False
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            if is_toc_page(text):
                toc_text += text + "\n"
                toc_started = True
            elif toc_started:
                break
    return toc_text
def extract_section_pages(toc_text, section_keywords):
    section_pages = []
    lines = toc_text.split('\n')
    for i, line in enumerate(lines):
        for keyword in section_keywords:
            if keyword.lower() in line.lower():
                page_number_pattern = r'(\d+)$'
                page_match = re.search(page_number_pattern, line)
                if page_match:
                    start_page = int(page_match.group())  # Convert to integer
                    end_page = None
                    # Find the start of the next section to determine the end page
                    for j in range(i + 1, len(lines)):
                        next_section_match = re.search(page_number_pattern, lines[j])
                        if next_section_match:
                            next_page = int(next_section_match.group())  # Convert to integer
                            end_page = next_page - 1 if next_page > start_page else start_page
                            break
                    if not end_page:
                        end_page = start_page  # Fallback if no end page is found
                    section_pages.append((start_page, end_page))
                break
    return section_pages
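# Note: extract_section_pages returns printed (ToC) page numbers, not physical
# PDF page indices; extract_pages_from_pdf below maps between the two using
# find_pdf_page_number.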
def find_pdf_page_number(page):
    # Heuristic: treat the last number that appears on the page as its
    # printed page number.
    text = page.extract_text()
    if text:
        numbers = re.findall(r'\d+', text)
        if numbers:
            return int(numbers[-1])
    return None
def extract_pages_from_pdf(filepath, page_ranges):
    extracted_text = {}
    with pdfplumber.open(filepath) as pdf:
        # Map printed page numbers onto physical page indices in the PDF.
        pdf_page_map = {find_pdf_page_number(page): i for i, page in enumerate(pdf.pages)}
        for section, (toc_start, toc_end) in page_ranges.items():
            section_text = ""
            for toc_page in range(toc_start, toc_end + 1):
                pdf_page_index = pdf_page_map.get(toc_page)
                if pdf_page_index is not None:
                    page_text = pdf.pages[pdf_page_index].extract_text()
                    if page_text:
                        section_text += page_text + "\n"
            extracted_text[section] = section_text
    return extracted_text
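# Usage sketch (hypothetical file and page range, for illustration only):
#   sections = extract_pages_from_pdf('reports/NASDAQ_TSLA_2022.pdf',
#                                     {'Discussion and Analysis': (30, 45)})
#   print(sections['Discussion and Analysis'][:500])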
# Dictionary to store results
toc_extraction_results = {}

# Directory for saving discussions
discussion_dir = 'discussion'
os.makedirs(discussion_dir, exist_ok=True)

for company in companies:
    for year in years:
        filename = f"{company}_{year}.pdf"
        filepath = os.path.join(reports_dir, filename)
        if os.path.exists(filepath):
            with pdfplumber.open(filepath) as pdf:
                toc_text = extract_toc(pdf)
                if toc_text:
                    toc_extraction_results[f"{company}_{year}"] = {'Status': 'Success', 'ToC': toc_text}
                else:
                    toc_extraction_results[f"{company}_{year}"] = {'Status': 'Failed', 'ToC': ''}
        else:
            toc_extraction_results[f"{company}_{year}"] = {'Status': 'Download Failed', 'ToC': ''}
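# For each report whose table of contents was found, look up the discussion
# section's page range and save its text to the discussion directory.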
for key, result in toc_extraction_results.items():
    if result['Status'] == 'Success':
        filepath = os.path.join(reports_dir, f"{key}.pdf")
        toc_text = result['ToC']
        discussion_analysis_pages = extract_section_pages(toc_text, ['management’s discussion', 'management’s report', 'discussion'])
        if discussion_analysis_pages:
            extracted_sections = extract_pages_from_pdf(filepath, {"Discussion and Analysis": discussion_analysis_pages[0]})
            discussion_text = extracted_sections.get("Discussion and Analysis", "Section not found")
            output_filename = os.path.join(discussion_dir, f"{key}_DISCUSSION.txt")
            with open(output_filename, 'w', encoding='utf-8') as file:
                file.write(discussion_text)
            print(f"Discussion for {key} saved as {output_filename}")
        else:
            print(f"Discussion section not found for {key}")

print('end')