b.py
from bs4 import BeautifulSoup
import requests

# Wookieepedia page URLs used for testing
luke_url = "http://starwars.wikia.com/wiki/Luke_Skywalker"
nien_url = "http://starwars.wikia.com/wiki/Nien_Nunb"
endor_url = "http://starwars.wikia.com/wiki/Battle_of_Endor"
shuttle_url = "http://starwars.wikia.com/wiki/Lambda-class_T-4a_shuttle"


def request_soup(url):
    """Given a url, requests the page content and returns the soup."""
    soup = None
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
    except requests.exceptions.RequestException:
        print("-- FAILED TO GET SOUP:", url)
    return soup


def html_export(filename, soup):
    """Given a filename and a soup, prettifies the soup and writes it to the
    given file.
    """
    with open(filename, "w", encoding="utf-8") as outfile:
        outfile.write(soup.prettify())
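
# A minimal usage sketch for html_export (hypothetical filename "luke.html",
# not part of the original script): fetch a page and save its prettified HTML
# for offline inspection.
#
#     soup = request_soup(luke_url)
#     if soup is not None:
#         html_export("luke.html", soup)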


def get_links(soup):
    """Given a soup, extracts all the relevant links from the page content and
    places each one in a dictionary with the page's url and title; each
    link-dictionary is appended to a list, which is returned with duplicates
    removed.
    """
    all_links = []
    if soup is None:
        return all_links
    for tag in soup.select('p a[href]'):
        # skip anchors without a title attribute, which would otherwise
        # raise a KeyError below
        if "wiki" in tag['href'] and tag.has_attr('title'):
            link = {
                "url": "http://starwars.wikia.com" + tag["href"],
                "title": tag['title']
            }
            all_links.append(link)
    # clean_links is the original list of links with duplicates removed
    clean_links = [dict(t) for t in set(tuple(d.items()) for d in all_links)]
    return clean_links
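
# Sketch of the dedup idiom above: each dict becomes a tuple of its items,
# which is hashable, so a set drops exact duplicates and dict() rebuilds the
# dictionaries. A toy example (illustrative values only):
#
#     >>> links = [{"url": "u", "title": "t"}, {"url": "u", "title": "t"}]
#     >>> [dict(t) for t in set(tuple(d.items()) for d in links)]
#     [{'url': 'u', 'title': 't'}]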


def check_character(link):
    """Given a link, requests the page content and checks if the page is a
    character page.
    Currently the only way I've found to tell if a page is a character page is
    to check the javascript for the variable 'wgArticleType' with a value of
    "character". Sod's law, some non-characters are still slipping through.
    """
    soup = request_soup(link['url'])
    if soup is not None and soup.script is not None:
        return 'wgArticleType="character",' in soup.script.prettify()
    return False


def get_connections(url):
    """Given a url, collects the links from the page content, checks whether
    each one points to a character page, and prints the title if it does.
    """
    soup = request_soup(url)
    links = get_links(soup)
    for link in links:
        if check_character(link):
            print(link['title'])


if __name__ == "__main__":
    get_connections(nien_url)
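
# Hypothetical alternative entry points, using the other URLs defined at the
# top of the file:
#
#     get_connections(luke_url)
#     get_connections(endor_url)
#     get_connections(shuttle_url)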