Created
January 15, 2014 17:02
-
-
Save neilhawkins/8440073 to your computer and use it in GitHub Desktop.
Scraper that collects every piece performed at the Proms from the BBC Proms archive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scraperwiki, urllib2, re, time | |
import lxml.html | |
from pprint import pprint | |
# Walks the BBC Proms archive one season (year) at a time, decade by decade,
# and saves every "composer - piece" line found in each concert's set list.
# NOTE(review): the nesting below was reconstructed from a paste that had lost
# its indentation; confirm that saving and sleeping once per season is intended.

ARCHIVE_URL = 'http://www.bbc.co.uk/proms/archive/search/'

# Column(s) passed to scraperwiki.sql.save as the unique key of each row.
unique_keys = ['prom_id']

# First year of each decade to visit: 1890, 1900, ..., 2010.
decades = [x * 10 for x in range(189, 202)]


def _split_setlist_item(item):
    """Split one set-list line into ``(composer, piece_title)``.

    Returns ``None`` when the line contains no ' - ' separator. Only the
    first separator splits the line, so a piece title that itself contains
    ' - ' is kept whole instead of being cut at its second separator.
    """
    composer, separator, piece_title = item.partition(' - ')
    if not separator:
        return None
    return composer.strip(' '), piece_title.strip(' ')


for decade in decades:
    for year in range(decade, decade + 10):
        url = ARCHIVE_URL + str(decade) + 's/' + str(year)
        try:
            html = scraperwiki.scrape(url)
        except urllib2.HTTPError:
            # Skip any season whose page cannot be fetched.
            continue

        root = lxml.html.fromstring(html)

        # The season label used in prom_id comes from the page's first <h1>,
        # not from the URL. A page without a usable heading is skipped
        # rather than aborting the whole run with an IndexError.
        headings = root.cssselect('h1')
        heading_years = re.findall(r"\d+", headings[0].text_content()) if headings else []
        if not heading_years:
            print("skipping " + url + ": no year found in page heading")
            continue
        season = heading_years[0]
        print("fetching proms from " + season)

        pieces = []
        concerts = root.cssselect('div.season_listing ul li')
        for index, concert in enumerate(concerts, start=1):
            prom_title = concert.cssselect('div a')[0].text_content()
            prom_date = concert.cssselect('div strong')[0].text_content()
            setlist = concert.xpath('div/p/text()')
            for index2, item in enumerate(setlist, start=1):
                parsed = _split_setlist_item(item)
                if parsed is None:
                    # Not a "composer - piece" line; its position still
                    # counts towards index2, as it always has.
                    continue
                composer, piece_title = parsed
                pieces.append({
                    # <season>-<concert number>-<line number>, zero-padded to 2 digits.
                    'prom_id': season + "-" + str(index).zfill(2) + "-" + str(index2).zfill(2),
                    'composer': composer,
                    'piece': piece_title,
                    'prom_title': prom_title,
                    'prom_date': prom_date,
                })

        scraperwiki.sql.save(unique_keys, pieces)
        print("done")
        # Pause between season pages.
        time.sleep(2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment