main.py
from multiprocessing.pool import ThreadPool as Pool

from blc import BrokenLinkChecker
from file_operations import FileOperations
from link import Link
from request import Request
from scanner import Scanner

# Maximum number of worker threads used to process the links found on a page.
MAX_POOL_SIZE = 10


class Main:
    """Processes a single page: fetches it, extracts its links, and either crawls them or checks them for breakage."""

    def __init__(self, url, is_crawl_request=False):
        self.should_crawl = is_crawl_request
        self.link = Link(url)
        self.blc = BrokenLinkChecker()
        self.file_operations = FileOperations(self.link)
        self.request = Request()
        self.scanner = Scanner()
        self.pool_size = MAX_POOL_SIZE
        self.print_line = f'Processing {url}'
        print(self.print_line)

    def run(self):
        # Resolve the URL before the try block so it is always defined in the error message below.
        url = self.link.get_url()
        try:
            # Skip media resources; only text responses are fetched and parsed.
            if self.link.is_media_url(url):
                return None
            response = self.request.get_text_response(url)
            if not response:
                return None
            self.parse_response(response)
        except Exception as e:
            print(f'Error processing url {url}')
            print(e)

    def parse_response(self, response):
        # Extract every link from the page body and hand each one to the thread pool.
        pool = Pool(self.pool_size)
        matches = self.scanner.find_all_links(response)
        for url in matches:
            pool.apply_async(self.process, (url,))
        # ThreadPool workers are daemon threads already, so no extra flag is needed.
        pool.close()
        pool.join()

    def process(self, url):
        # Dispatch a link either to crawl mode or to the broken-link check.
        if self.should_crawl:
            self.process_crawl(url)
        else:
            self.process_blc(url)

    def process_blc(self, url):
        self.file_operations.write_in_output(f'{self.print_line}\n{url}\n\n')
        # Links on the same domain are not checked here.
        if self.link.is_same_domain(url):
            return None
        if not self.blc.is_broken(url):
            print(f'|---OK---| {url}')
            return None
        print(f'|-BROKEN-| {url}')
        self.file_operations.write_in_broken(url)

    def process_crawl(self, url):
        # Record same-domain, non-media links as candidates for further crawling.
        is_media_url = self.link.is_media_url(url)
        is_same_domain_url = self.link.is_same_domain(url)
        if is_same_domain_url and not is_media_url:
            print(f'Found link: {url}')
            self.file_operations.write_in_links(url)
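
The excerpt ends here; the file's entrypoint is not among the lines shown. As a minimal sketch only, assuming Main is driven from the command line with a seed URL and an optional crawl switch (the argument handling is illustrative, not the project's actual CLI):

    # Illustrative driver, not taken from the repository.
    import sys

    from main import Main

    if __name__ == '__main__':
        seed_url = sys.argv[1]             # URL to start from
        crawl = '--crawl' in sys.argv[2:]  # hypothetical flag enabling crawl mode
        Main(seed_url, is_crawl_request=crawl).run()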