@pipermerriam
Created March 13, 2013 02:55
I wanted to scrape some data about the PyCon talks I was interested in.

import csv

import requests
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3; this is Python 2 code
from dateutil import parser

# Raw page HTML keyed by URL, so re-running a scrape in the same session
# doesn't re-fetch pages we've already seen.
CACHE = {}

"""
https://us.pycon.org/2013/schedule/presentation/35/
https://us.pycon.org/2013/schedule/presentation/68/
https://us.pycon.org/2013/schedule/presentation/135/
https://us.pycon.org/2013/schedule/presentation/63/
https://us.pycon.org/2013/schedule/presentation/39/
https://us.pycon.org/2013/schedule/presentation/117/
https://us.pycon.org/2013/schedule/presentation/45/
https://us.pycon.org/2013/schedule/presentation/88/
https://us.pycon.org/2013/schedule/presentation/140/
https://us.pycon.org/2013/schedule/presentation/118/
https://us.pycon.org/2013/schedule/presentation/100/
https://us.pycon.org/2013/schedule/presentation/124/
https://us.pycon.org/2013/schedule/presentation/113/
https://us.pycon.org/2013/schedule/presentation/59/
https://us.pycon.org/2013/schedule/presentation/60/
https://us.pycon.org/2013/schedule/presentation/34/
https://us.pycon.org/2013/schedule/presentation/146/
https://us.pycon.org/2013/schedule/presentation/92/
https://us.pycon.org/2013/schedule/presentation/125/
https://us.pycon.org/2013/schedule/presentation/74/
https://us.pycon.org/2013/schedule/presentation/97/
https://us.pycon.org/2013/schedule/presentation/216/
https://us.pycon.org/2013/schedule/presentation/72/
https://us.pycon.org/2013/schedule/presentation/215/
https://us.pycon.org/2013/schedule/presentation/78/
https://us.pycon.org/2013/schedule/presentation/129/
"""
def get_talk_urls(filename):
    """Read one URL per line, skipping blank lines."""
    with open(filename, 'rb') as f:
        urls = filter(bool, [l.strip('\n') for l in f])
    return urls

def get_talk_soup(url):
    """Fetch a talk page (or reuse the cached copy) and return parsed soup."""
    if url in CACHE:
        return BeautifulSoup(CACHE[url])
    resp = requests.get(url)
    resp.raise_for_status()
    CACHE[url] = resp.content
    return BeautifulSoup(resp.content)

class ParseError(AttributeError):
    pass

def get_talk_info(soup):
    """Pull (title, speaker, start, end, description) out of a talk page."""
    try:
        content = soup.findAll('div', attrs={'class': 'box-content'})[0]
    except IndexError:
        raise ParseError('No content found in soup')
    date_el, speaker_el = content.findAll('h4')[:2]
    # The first <h4> holds the time range as "start &ndash; end"; the entity
    # comes through literally in BS3's .text, so split on it (semicolon
    # included, to keep it out of the end time).
    start, _dash, end = date_el.text.partition('&ndash;')
    start = parse_date(start)
    end = parse_date(end)
    speaker = speaker_el.text
    title = content.findAll('h2')[0].text
    description = content.findAll('div', attrs={'class': 'description'})[0].text
    return title, speaker, start, end, description

def parse_date(text):
    # The schedule writes "noon", which dateutil can't parse on its own.
    text = text.replace('noon', '12:00 p.m.')
    return parser.parse(text)

def scrape_talk(url):
    soup = get_talk_soup(url)
    return get_talk_info(soup)

def scrape_talks(filename='talks.txt'):
    urls = get_talk_urls(filename)
    data = []
    for url in urls:
        print "Scraping", url
        data.append(scrape_talk(url))
    return data

# datetime.weekday() numbering: Monday is 0.
DAYS = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday',
}

def output_talks(output='output.csv'):
    data = scrape_talks()
    with open(output, 'wb') as f:
        writer = csv.writer(f)
        # Order the rows by day of week, then start time, so the CSV reads
        # like a schedule.
        data.sort(key=lambda d: (d[2].weekday(), d[2].time()))
        for title, speaker, start, end, description in data:
            writer.writerow([
                title.encode('utf8'),
                speaker.encode('utf8'),
                DAYS[start.weekday()],
                start.strftime('%I:%M %p'),
                end.strftime('%I:%M %p'),
                description.encode('utf8'),
            ])
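
# Minimal entry point, assuming talks.txt sits next to this script: scrapes
# every listed talk and writes output.csv, one row per talk, sorted by
# weekday and start time.
if __name__ == '__main__':
    output_talks()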