parser.py
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

from utilities.helpers import correct_url
from driver import Driver

class Parser(object):

    def __init__(self, url):
        self.url = correct_url(url)
        self.driver = Driver()
    # generator: yields one record per crawled user
    def parse_user(self, user_id, depth):
        if depth == 0:
            return
        # print(user_id, depth)
        time.sleep(1)

        # render the whole followers page and make it into a beautiful soup :)
        this_url = urljoin(self.url, user_id) + "/followers"
        self.driver.get(this_url)
        self.driver.scroll_down(50)
        html = self.driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        # print(this_url, self.driver.current_url, len(html))

        # find the information of this user
        user_soup = soup.find("span", class_="user")
        name = user_soup.get_text() if user_soup else ""

        credential_soup = soup.find("span", class_="UserCredential IdentityCredential")
        credential = credential_soup.get_text() if credential_soup else ""

        description_soup = soup.find("p", class_="qtext_para")
        description = description_soup.get_text() if description_soup else ""
        # collect this user's followers and recursively crawl each of them
        followers = []
        for follower in soup.find_all("a", class_="user"):
            follower_name = follower.get_text()
            partial_href = follower.get("href")
            # print(follower_name, partial_href)
            follower_id = partial_href.split('/')[-1]
            followers.append([follower_id, follower_name])
            yield from self.parse_user(follower_id, depth - 1)

        yield {
            "_id": user_id,
            "name": name,
            "credential": credential,
            "description": description,
            "followers": followers,
        }
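

# Minimal usage sketch: assumes Driver wraps a Selenium-style browser exposing
# get/scroll_down/page_source, and that "Albert-Einstein-34" is a hypothetical
# Quora profile slug used purely for illustration. parse_user is a generator,
# so records stream in as the depth-limited follower crawl proceeds.
if __name__ == "__main__":
    parser = Parser("https://www.quora.com")
    for record in parser.parse_user("Albert-Einstein-34", depth=2):
        print(record["_id"], record["name"], "followers:", len(record["followers"]))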