Skip to content

Instantly share code, notes, and snippets.

@imankulov
Created December 11, 2012 10:54
Show Gist options
  • Save imankulov/4257729 to your computer and use it in GitHub Desktop.
Save imankulov/4257729 to your computer and use it in GitHub Desktop.
PyCon US Parser
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# requirements: opterator, requests, lxml
import csv
from urllib.parse import urljoin

import requests
from lxml import html
from opterator import opterate
def get_tree(url='https://us.pycon.org/2012/schedule/', timeout=30):
    """Download a schedule page and return it as a parsed lxml HTML tree.

    Args:
        url: Page to fetch. Defaults to the PyCon US 2012 schedule.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        requests.exceptions.Timeout: The server did not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly here instead of parsing an error page as if it were the
    # schedule and silently producing an empty CSV.
    response.raise_for_status()
    return html.fromstring(response.text)
def parse_tree(tree, base_url='https://us.pycon.org'):
    """Yield one ``(title, url, speaker)`` tuple per presentation cell.

    Args:
        tree: Parsed document exposing an ``xpath`` method (the value
            returned by ``get_tree``).
        base_url: Prefix used to resolve the hrefs found in the page into
            absolute URLs.

    Yields:
        ``(title, url, speaker)`` for every ``<td>`` whose class contains
        ``presentation`` and which has a linked title. ``speaker`` is an
        empty string when the cell has no speaker text.
    """
    for cell in tree.xpath('//td[contains(@class, "presentation")]'):
        titles = cell.xpath('div[@class="title"]/a/text()')
        hrefs = cell.xpath('div[@class="title"]/a/@href')
        if not titles or not hrefs:
            # A cell without a linked title carries nothing worth recording;
            # skip it rather than abort the whole scrape with an IndexError.
            continue
        speakers = cell.xpath('div[@class="speaker"]/text()')
        speaker = speakers[0] if speakers else ''
        # urljoin gives the same result as plain concatenation for rooted
        # hrefs ("/2012/...") and also copes with absolute or relative ones.
        yield (titles[0], urljoin(base_url, hrefs[0]), speaker)
def write_csv(filename, records):
    """Write *records* to *filename* as a UTF-8 encoded CSV file.

    Args:
        filename: Path of the file to create or overwrite.
        records: Iterable of rows, each row an iterable of cell values.

    Cells are handed to ``csv.writer`` as text. Encoding them to ``bytes``
    first (as Python 2 required) makes Python 3 write ``b'...'`` reprs
    into the file, so the encoding is applied by the file object instead.
    """
    # newline='' leaves line endings to the csv module, as its documentation
    # requires; otherwise extra blank lines can appear on some platforms.
    with open(filename, 'w', encoding='utf-8', newline='') as fd:
        csv.writer(fd).writerows(records)
@opterate
def main(filename):
    """Scrape the conference schedule and save it to FILENAME as CSV."""
    # NOTE(review): main() is invoked below without arguments, so opterate
    # presumably fills `filename` from the command line — confirm against
    # the installed opterator version.
    tree = get_tree()
    records = parse_tree(tree)
    write_csv(filename, records)


# Only run the scraper when executed as a script, so importing this module
# (for reuse or testing) does not trigger a network request.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment