Wanted to scrape some data about the PyCon talks I was interested in.
import requests
import csv
from dateutil import parser
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 (this is a Python 2 script)

# In-memory cache of fetched pages, so the same URL is only
# requested once per run.
CACHE = {}

""" | |
https://us.pycon.org/2013/schedule/presentation/35/ | |
https://us.pycon.org/2013/schedule/presentation/68/ | |
https://us.pycon.org/2013/schedule/presentation/135/ | |
https://us.pycon.org/2013/schedule/presentation/63/ | |
https://us.pycon.org/2013/schedule/presentation/39/ | |
https://us.pycon.org/2013/schedule/presentation/117/ | |
https://us.pycon.org/2013/schedule/presentation/45/ | |
https://us.pycon.org/2013/schedule/presentation/88/ | |
https://us.pycon.org/2013/schedule/presentation/140/ | |
https://us.pycon.org/2013/schedule/presentation/118/ | |
https://us.pycon.org/2013/schedule/presentation/100/ | |
https://us.pycon.org/2013/schedule/presentation/124/ | |
https://us.pycon.org/2013/schedule/presentation/113/ | |
https://us.pycon.org/2013/schedule/presentation/59/ | |
https://us.pycon.org/2013/schedule/presentation/60/ | |
https://us.pycon.org/2013/schedule/presentation/34/ | |
https://us.pycon.org/2013/schedule/presentation/146/ | |
https://us.pycon.org/2013/schedule/presentation/92/ | |
https://us.pycon.org/2013/schedule/presentation/125/ | |
https://us.pycon.org/2013/schedule/presentation/74/ | |
https://us.pycon.org/2013/schedule/presentation/97/ | |
https://us.pycon.org/2013/schedule/presentation/216/ | |
https://us.pycon.org/2013/schedule/presentation/72/ | |
https://us.pycon.org/2013/schedule/presentation/215/ | |
https://us.pycon.org/2013/schedule/presentation/78/ | |
https://us.pycon.org/2013/schedule/presentation/129/ | |
""" | |
def get_talk_urls(filename):
    # Read the talk URLs, one per line, skipping blank lines.
    with open(filename, 'rb') as f:
        urls = filter(bool, [l.strip('\n') for l in f])
    return urls

def get_talk_soup(url):
    # Fetch a presentation page and parse it, caching the raw HTML.
    if url in CACHE:
        return BeautifulSoup(CACHE[url])
    resp = requests.get(url)
    resp.raise_for_status()
    CACHE[url] = resp.content
    return BeautifulSoup(resp.content)

class ParseError(AttributeError):
    """Raised when a presentation page doesn't have the expected markup."""

def get_talk_info(soup):
    # Extract (title, speaker, start, end, description) from a page.
    try:
        content = soup.findAll('div', attrs={'class': 'box-content'})[0]
    except IndexError:
        raise ParseError('No content found in soup')
    date_el, speaker_el = content.findAll('h4')[:2]
    # The time range is rendered with an "&ndash;" entity between the
    # start and end times; split on the full entity, semicolon included,
    # so no stray ";" is left on the end time.
    start, dash, end = date_el.text.partition('&ndash;')
    start = parse_date(start)
    end = parse_date(end)
    speaker = speaker_el.text
    title = content.findAll('h2')[0].text
    description = content.findAll('div', attrs={'class': 'description'})[0].text
    return title, speaker, start, end, description

def parse_date(text):
    # The schedule says "noon" where dateutil expects a clock time.
    text = text.replace('noon', '12:00 p.m.')
    return parser.parse(text)
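
# For example (my illustration of the format, not taken from the gist):
# dateutil's parser raises ValueError on a string like "Friday noon",
# but after the replace it sees "Friday 12:00 p.m." and returns a
# datetime on a Friday at 12:00, filling in the rest of the date itself.
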
def scrape_talk(url):
    soup = get_talk_soup(url)
    return get_talk_info(soup)

def scrape_talks(filename='talks.txt'):
    urls = get_talk_urls(filename)
    data = []
    for url in urls:
        print "Scraping", url
        data.append(scrape_talk(url))
    return data

DAYS = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday',
}

def output_talks(output='output.csv'):
    data = scrape_talks()
    with open(output, 'wb') as f:
        writer = csv.writer(f)
        # Order the talks by day of the week, then by start time.
        data.sort(key=lambda d: (d[2].weekday(), d[2].time()))
        for title, speaker, start, end, description in data:
            writer.writerow([
                title.encode('utf8'),
                speaker.encode('utf8'),
                DAYS[start.weekday()],
                start.strftime('%I:%M %p'),
                end.strftime('%I:%M %p'),
                description.encode('utf8'),
            ])
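
To run the whole pipeline, call output_talks(). A minimal driver might look like this (the __main__ guard is my addition, not part of the original gist):

if __name__ == '__main__':
    # Scrape every URL in talks.txt and write the sorted schedule
    # to output.csv.
    output_talks('output.csv')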