# talks2019.py

# -*- coding: utf-8 -*-
"""PyConSchedule.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1olRI3ZjZUt5UOC6o2REugCEhhqDDUncC
"""

import yaml
from yaml.representer import SafeRepresenter

import requests
from bs4 import BeautifulSoup

class folded_str(str):
    """Marker subclass of ``str`` with no behaviour of its own.

    It exists only so a YAML representer can be registered for this type
    and emit such values in folded (``>``) scalar style.
    """

class literal_str(str):
    """Marker subclass of ``str`` with no behaviour of its own.

    It exists only so a YAML representer can be registered for this type
    and emit such values in literal (``|``) scalar style.
    """

def change_style(style, representer):
    """Build a representer that forces a scalar style on its result.

    Args:
        style: Value assigned to the ``style`` attribute of every node the
            returned representer produces (e.g. ``'>'`` or ``'|'``).
        representer: Callable taking ``(dumper, data)`` and returning a node
            object; it does the actual conversion.

    Returns:
        A callable with the same ``(dumper, data)`` signature that delegates
        to ``representer`` and then overrides the node's ``style``.
    """
    def styled_representer(dumper, data):
        # Let the wrapped representer build the node, then restyle it.
        node = representer(dumper, data)
        node.style = style
        return node

    return styled_representer

# Both custom string types reuse SafeRepresenter.represent_str, which already
# copes with some corner cases, rather than calling represent_scalar
# directly; change_style only overrides the scalar style of the node it
# returns.
represent_literal_str = change_style(style='|', representer=SafeRepresenter.represent_str)
represent_folded_str = change_style(style='>', representer=SafeRepresenter.represent_str)

# Hook the styled representers up to their marker types.
yaml.add_representer(literal_str, represent_literal_str)
yaml.add_representer(folded_str, represent_folded_str)

# Scrape the talk listing page. Every <h2> heading is treated as one talk:
# its text is the title, the next <p> after it supplies the speaker line, and
# the first <p> inside the next <div class="section"> supplies the description.
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get("https://th.pycon.org/en/talks/", timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

# The three lists are index-aligned: entry i of each describes the same talk.
titles = []
speakers = []
descriptions = []
for heading in soup.find_all("h2"):
    byline = heading.find_next("p")
    section = heading.find_next("div", {"class": "section"})
    summary = section.find("p") if section is not None else None

    titles.append(heading.text)
    # The first three characters of the speaker line are dropped --
    # presumably a "by " prefix; confirm against the page markup.
    # A missing element yields "" rather than an AttributeError on None, so
    # one malformed heading does not abort the whole scrape.
    speakers.append(byline.text[3:] if byline is not None else "")
    descriptions.append(summary.text if summary is not None else "")

print(titles)
print(speakers)
print(descriptions)

# Scrape the speakers page into two lookup tables, both keyed by the
# lower-cased text of each <h2> heading (the speaker name):
#   bio        -> inner HTML of the next <div class="section">, cleaned up
#   speakerimg -> "src" attribute of the next <img>
bio = {}
speakerimg = {}
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get("https://th.pycon.org/en/speakers/", timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
for h2tag in soup.find_all("h2"):
    name = h2tag.text.lower()

    biotag = h2tag.find_next("div", {"class": "section"})
    # Skip the bio (instead of crashing on None) when no section follows.
    if biotag is not None:
        bio[name] = (
            biotag.decode_contents()
            .replace("\u2019", "'")   # right single quote -> ASCII apostrophe
            .replace("\u2013", "")    # en dash is removed outright
            .replace("<h3>Biography</h3>", "")
            .strip()
        )

    # NOTE(review): find_next looks forward through the rest of the document,
    # so a speaker without an image of their own may pick up a later <img>
    # -- confirm against the page markup.
    imgtag = h2tag.find_next("img")
    src = imgtag.get("src") if imgtag is not None else None
    # An <img> without a "src" attribute is ignored rather than raising.
    if src is not None:
        speakerimg[name] = src

print(speakerimg)
print(bio)

# Merge the talk lists with the speaker lookups into one record per talk.
data = []

for title, speaker, description in zip(titles, speakers, descriptions):
    subdata = {
        "title": title,
        "speaker": speaker,
        "description": folded_str(description),
    }
    key = speaker.lower()
    try:
        # Bio and image are added together or not at all: if either lookup
        # misses, the talk is emitted with only its basic fields.
        extras = {
            "bio": literal_str(bio[key]),
            "speakerimg": speakerimg[key],
        }
    except KeyError:
        # Only a missing speaker entry is expected here; anything else
        # (including KeyboardInterrupt) should propagate, not be swallowed.
        # Print the name so unmatched speakers can be checked by hand.
        print(speaker)
    else:
        subdata.update(extras)
    data.append(subdata)

print(data)

# Explicit encoding keeps the output independent of the platform default.
with open('talks.yaml', 'w', encoding='utf-8') as outfile:
    yaml.dump({"talks": data}, outfile, default_flow_style=False)