-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurl_to_html.py
43 lines (40 loc) · 1.97 KB
/
url_to_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from urllib.request import urlopen # To open URL provided
import urllib.request # To make the request
from urllib.error import HTTPError
import sys # Keeping for now
class bcolors:
    """ANSI escape sequences used to color messages printed to the terminal.

    Taken from
    https://stackoverflow.com/questions/287871/how-to-print-colored-text-to-the-terminal
    Wrap a message as ``f"{bcolors.WARNING}text{bcolors.ENDC}"`` so the color
    is switched back to normal after the message.
    """

    WARNING = '\033[93m'  # Yellow
    FAIL = '\033[91m'  # Red
    ENDC = '\033[0m'  # Back to the normal color
class settings:
    """Shared configuration, read and written as plain class attributes."""

    # Any user agent string can be used here; this one mimics a desktop Firefox.
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0"

    # Headers passed along with the request; the User-Agent entry reuses the
    # string defined above.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': user_agent,
    }

    # 0 = quiet, 1 = print extra debug output.
    debug = 0
def get_html(url, *args):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with ``settings.headers``.

    Args:
        url: Address of the page to fetch.
        *args: Optional flags; only the first one is inspected. Passing
            ``"-d"`` or ``"debug"`` sets ``settings.debug`` to 1. The flag is
            stored on the ``settings`` class, so it stays on for later calls.

    Returns:
        The decoded page text on success, the integer HTTP status code when
        the server answers 404 or 403, or ``None`` when the page could not be
        fetched or decoded for any other reason.

    Raises:
        HTTPError: For any HTTP error status other than 404 and 403.
    """
    if args and args[0] in ("-d", "debug"):
        print("Debug is on")
        settings.debug = 1

    req = urllib.request.Request(url, None, settings.headers)
    try:
        # The context manager closes the response even if reading or decoding
        # fails part-way through.
        with urlopen(req) as page:
            # Decode as UTF-8 so characters such as Ä, Ö and Å come out right.
            return page.read().decode("utf-8")
    except HTTPError as err:
        if err.code == 404:
            print(f"{bcolors.WARNING}HTTP Error 404{bcolors.ENDC}")
            if settings.debug == 1:
                print("404 url:", url)
        elif err.code == 403:
            print(f"{bcolors.WARNING}HTTP Error 403 forbidden, most likely getting rate limited{bcolors.ENDC}")
        else:
            # Any other HTTP status is left for the caller to handle.
            raise
        return err.code
    except Exception:
        # Timeouts, connection problems, undecodable bodies, etc. This is a
        # deliberate best-effort path: report the failure and hand back None
        # instead of falling through to an unbound result variable.
        print(f"{bcolors.FAIL}Cannot access webpage{bcolors.ENDC}")
        return None