-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathtalks2019.py
84 lines (67 loc) · 2.4 KB
/
talks2019.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
"""PyConSchedule.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1olRI3ZjZUt5UOC6o2REugCEhhqDDUncC
"""
import yaml
from yaml.representer import SafeRepresenter
import requests
from bs4 import BeautifulSoup
class folded_str(str):
    """Marker ``str`` subclass for values dumped in YAML folded (``>``) style."""
class literal_str(str):
    """Marker ``str`` subclass for values dumped in YAML literal (``|``) style."""
def change_style(style, representer):
    """Wrap ``representer`` so the node it produces uses the given style.

    Args:
        style: Value assigned to the ``style`` attribute of the produced node
            (the callers in this file pass ``'>'`` and ``'|'``).
        representer: Callable invoked as ``representer(dumper, data)`` that
            returns an object with a writable ``style`` attribute.

    Returns:
        A callable with the same ``(dumper, data)`` signature that delegates
        to ``representer`` and then overrides the style on its result.
    """
    def new_representer(dumper, data):
        node = representer(dumper, data)
        # Override whatever style the wrapped representer picked.
        node.style = style
        return node

    return new_representer
# Both representers are built on SafeRepresenter.represent_str instead of
# calling represent_scalar directly, because represent_str already handles
# some corner cases.

# folded_str values are emitted with the '>' style.
represent_folded_str = change_style('>', SafeRepresenter.represent_str)
yaml.add_representer(folded_str, represent_folded_str)

# literal_str values are emitted with the '|' style.
represent_literal_str = change_style('|', SafeRepresenter.represent_str)
yaml.add_representer(literal_str, represent_literal_str)
# Download the talks page. The timeout keeps the script from hanging forever
# on an unresponsive server, and raise_for_status() aborts on an HTTP error
# instead of parsing an error page as if it were the talk listing.
r = requests.get("https://th.pycon.org/en/talks/", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

# Every <h2> is treated as one talk. The headings are looked up once and all
# three lists are derived from that same sequence, so index i refers to the
# same talk in titles, speakers and descriptions.
talk_headings = soup.find_all("h2")
titles = [heading.text for heading in talk_headings]
# The first three characters of the paragraph after the heading are dropped
# (presumably a "By "-style prefix -- confirm against the live page).
speakers = [heading.find_next("p").text[3:] for heading in talk_headings]
# NOTE: find_next()/find() return None when nothing matches, so a heading
# without the expected markup raises AttributeError here rather than silently
# producing misaligned lists.
descriptions = [
    heading.find_next("div", {"class": "section"}).find("p").text
    for heading in talk_headings
]
print(titles)
print(speakers)
print(descriptions)
# Maps of lower-cased speaker heading text -> biography HTML / image URL.
bio = {}
speakerimg = {}
# Download the speakers page. The timeout bounds how long the request may
# block, and raise_for_status() aborts on an HTTP error instead of parsing an
# error page as if it were the speaker listing.
r = requests.get("https://th.pycon.org/en/speakers/", timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
for h2tag in soup.find_all("h2"):
    # Lower-cased so later lookups by speaker name ignore case.
    name = h2tag.text.lower()

    biotag = h2tag.find_next("div", {"class": "section"})
    # find_next() returns None when no matching element follows; skip the bio
    # for that heading instead of crashing the whole run.
    if biotag is not None:
        currbio = (
            biotag.decode_contents()
            .replace("\u2019", "'")  # right single quote -> ASCII apostrophe
            .replace("\u2013", "")  # en dashes are removed outright
            .replace("<h3>Biography</h3>", "")
            .strip()
        )
        bio[name] = currbio

    # NOTE(review): find_next() searches the rest of the document, so a
    # speaker without an image of their own would be paired with the next
    # <img> on the page -- confirm against the page markup.
    imgtag = h2tag.find_next("img")
    if imgtag is not None and imgtag.has_attr("src"):
        speakerimg[name] = imgtag["src"]
print(speakerimg)
print(bio)
# Assemble one record per talk and write them all to talks.yaml.
data = []
for title, speaker, description in zip(titles, speakers, descriptions):
    talk = {
        "title": title,
        "speaker": speaker,
        "description": folded_str(description),
    }
    speaker_key = speaker.lower()
    try:
        speaker_bio = bio[speaker_key]
        speaker_image = speakerimg[speaker_key]
    except KeyError:
        # No bio or no image was scraped for this speaker: keep the basic
        # record and print the name so the gap can be checked by hand. Only
        # KeyError is caught, so unrelated failures (and Ctrl-C) still surface.
        print(speaker)
    else:
        # Bio and image are added together, only when both are available.
        talk["bio"] = literal_str(speaker_bio)
        talk["speakerimg"] = speaker_image
    data.append(talk)
print(data)
# Explicit encoding so the output does not depend on the platform default.
with open('talks.yaml', 'w', encoding='utf-8') as outfile:
    yaml.dump({"talks": data}, outfile, default_flow_style=False)