-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathWattpadConnect.py
107 lines (96 loc) · 4.21 KB
/
WattpadConnect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Use extractInformationFromWattpad(url) to get the JSON.
# IMPORTANT NOTE: It's not possible to get a full list of a user's stories.
# You can only get THREE of them.
# With metadataJSON['metadata']['data'][0]['stories']['total'] you can get the user's total story count.
# User lists and conversations/comments are not available yet.
def extractInformationFromWattpad(url, timeout=30):
    """Fetch a Wattpad page and return the JSON data embedded in it.

    The page source is searched for the ``window.prefetched`` assignment and
    the JSON object that follows it is decoded.

    Args:
        url: Link to a Wattpad user profile, story or story part.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        For ``wattpad.com/user/`` links a dict with the keys ``"metadata"``,
        ``"works"`` and ``"latestactivity"``; for every other link a dict
        with the single key ``"metadata"``.  ``None`` is returned (after a
        message has been printed, except when the page simply carries no
        ``window.prefetched`` data) whenever nothing usable could be read.
    """
    if not isinstance(url, str) or not url.strip():
        print("An Error occurred: ", "no URL given")
        return None

    # wattpad.com/story/ pages don't include the metadata, but the page of
    # the story's first part does, so that page is read instead.
    if "wattpad.com/story/" in url:
        url = extractFirstPartLink(url)
        if url == "error!":
            # "error!" is the failure sentinel of extractFirstPartLink; it
            # must not be handed to requests as if it were a URL.
            print("An Error occurred: ", "could not find the first part of the story")
            return None

    try:
        # A browser-like User-Agent is required, otherwise Wattpad answers
        # with 403 Forbidden.
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
    except Exception as error:
        # Best-effort API: any failure to fetch is reported, never raised.
        print("An Error occurred: ", error)
        return None

    if not response.ok:
        # Only reported; the body is still inspected below.
        print("Network response was not OK: ", response.status_code)

    try:
        parsedData = _extractPrefetchedJson(response.text)
    except ValueError as error:
        print("Error while parsing the JSON: ", error)
        return None
    if parsedData is None:
        # The page carries no 'window.prefetched' data.
        return None

    keys = list(parsedData)

    if "wattpad.com/user/" in url:
        # The entries are identified by position, so exactly three are
        # expected: metadata, works and latest activity (in that order).
        if len(keys) != 3:
            print(
                "Error while parsing the JSON: ",
                "expected 3 entries for a user profile, found %d" % len(keys),
            )
            return None
        metadatakey, workskey, latestactivitykey = keys
        return {
            "metadata": parsedData[metadatakey],
            "works": parsedData[workskey],
            "latestactivity": parsedData[latestactivitykey],
        }

    if not keys:
        print("An Error occurred - maybe a broken link?\nYou can only get wattpad link to user profiles, stories or story-parts.\nPlease read the README.\nError: ", "the embedded data is empty")
        return None
    return {"metadata": parsedData[keys[0]]}


def _extractPrefetchedJson(pagesourcecode):
    """Decode the JSON object that follows 'window.prefetched' in a page.

    Returns the decoded object, or ``None`` when the marker (or an opening
    brace after it) is not present.  Raises ``ValueError``
    (``json.JSONDecodeError``) when the text after the marker is not valid
    JSON.
    """
    marker_index = pagesourcecode.find("window.prefetched")
    if marker_index == -1:
        return None
    open_brace_index = pagesourcecode.find("{", marker_index)
    if open_brace_index == -1:
        return None
    # raw_decode stops at the end of the first complete JSON value, so braces
    # inside string values cannot throw off where the object ends.
    parsedData, _ = json.JSONDecoder().raw_decode(pagesourcecode, open_brace_index)
    return parsedData
# Code to extract the link to the first part of a book of the wattpad.com/story/xyz URL:
def extractFirstPartLink(url, timeout=30):
    """Return the link to the first part of a ``wattpad.com/story/...`` book.

    The story page is downloaded and its "start reading" button (an ``<a>``
    element with the class ``read-btn``) is looked up; the button's ``href``
    is resolved against ``https://www.wattpad.com``.

    Args:
        url: Link to a Wattpad story overview page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The absolute URL of the first story part, or the string ``"error!"``
        when the page could not be fetched or contains no usable button.
    """
    if not isinstance(url, str) or not url.strip():
        print("error :(")
        return "error!"
    try:
        # A browser-like User-Agent is sent with the request, as for every
        # other request to Wattpad in this file.
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=timeout,
        )
        if not response.ok:
            # Only reported; the body is still parsed below.
            print("Network response was not OK: ", response.status_code)
        soup = BeautifulSoup(response.text, "html.parser")
        # The "start reading" button carries the link to the first part.
        readBtnElement = soup.find("a", class_="read-btn")
        link = readBtnElement.get("href") if readBtnElement else None
        if not link:
            print("error :(")
            return "error!"
        # urljoin copes with relative paths (with or without a leading
        # slash) as well as with links that are already absolute.
        return urljoin("https://www.wattpad.com", link)
    except Exception as error:
        # Best-effort API: failures are reported through the sentinel.
        print("paring error:", error)
        return "error!"