Last active
August 29, 2015 14:28
-
-
Save explodecomputer/812903b2696c52f5610b to your computer and use it in GitHub Desktop.
Parse Bristol's gig listings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
import urllib2
from contextlib import closing

from bs4 import BeautifulSoup, NavigableString, Tag
def get_headfirstbristol():
    """Scrape the Headfirst Bristol gig listings page.

    For each date heading (``div.date_container.date_font``) the siblings
    that follow it are walked until check_nextelem() reports the next date
    heading.  Every sibling tag carrying both the ``event`` and ``z``
    classes contributes one entry: its link text is split on ``@`` into
    band and place, its first ``<p>`` child supplies the description, and
    the date heading's first child supplies the date.

    Returns:
        A list ``[band, place, description, dates]`` of four parallel
        lists with one entry per event.  ``place`` is an empty string for
        an event whose link text contains no ``@``.
    """
    url = "http://www.headfirstbristol.co.uk/gig-listings"
    # urllib2 responses are not context managers; closing() releases the
    # connection as soon as the body has been read.
    with closing(urllib2.urlopen(url)) as response:
        web_page = response.read()
    soup = BeautifulSoup(web_page)

    band = []
    place = []
    description = []
    dates = []

    event_classes = set(['event', 'z'])
    for tag in soup.select("div.date_container.date_font"):
        nextelem = tag.nextSibling
        # check_nextelem() is also true for None, so the end of the sibling
        # chain has to be tested explicitly.
        while nextelem is not None and check_nextelem(nextelem):
            # .get() instead of ['class']: a tag without a class attribute
            # would otherwise raise KeyError.
            if (isinstance(nextelem, Tag)
                    and event_classes <= set(nextelem.get('class', []))):
                title = nextelem.find("a").contents[0]
                parts = [s.strip() for s in title.split('@')]
                band.append(parts[0])
                # Keep the lists aligned even when the title has no '@'.
                place.append(parts[1] if len(parts) > 1 else '')
                description.append(nextelem.find("p").contents[0])
                dates.append(tag.contents[0])
            nextelem = nextelem.nextSibling

    return [band, place, description, dates]
def check_nextelem(elem):
    """Tell whether *elem* is anything other than a date heading.

    Args:
        elem: A sibling node from the parsed page; may be a Tag, another
            kind of node, or None.

    Returns:
        False only when *elem* is a Tag whose class list is exactly
        ``['date_container', 'date_font']``; True for every other Tag and
        for anything that is not a Tag (including None).
    """
    if not isinstance(elem, Tag):
        return True
    # .get() instead of elem['class']: indexing raises KeyError for a tag
    # that has no class attribute, while .get() yields None, which simply
    # compares unequal to the date-heading class list.
    return elem.get('class') != ['date_container', 'date_font']
def get_thekla():
    """Scrape the Thekla live listings page.

    Reads every ``div.details_wrapper_inner`` element and takes the first
    child of its first ``h4.gig_artist`` as the band and the first child
    of its first ``h3.gig_date`` as the date.

    Returns:
        A list ``[band, date]`` of two parallel lists, one entry per gig
        (the same shape get_ents() returns).
    """
    # urllib2 responses are not context managers; closing() releases the
    # connection as soon as the body has been read.
    with closing(urllib2.urlopen("http://www.theklabristol.co.uk/live")) as response:
        web_page = response.read()
    soup = BeautifulSoup(web_page)

    gigs = soup.find_all("div", attrs={"class": "details_wrapper_inner"})
    band = [gig.select("h4.gig_artist")[0].contents[0] for gig in gigs]
    date = [gig.select("h3.gig_date")[0].contents[0] for gig in gigs]
    return [band, date]
def get_colstonhall():
    """Scrape the Colston Hall "what's on" page.

    Returns:
        The list of ``<h2 itemprop="summary">`` tags found on the page, as
        returned by ``find_all``.
    """
    # urllib2 responses are not context managers; closing() releases the
    # connection as soon as the body has been read.
    with closing(urllib2.urlopen("http://www.colstonhall.org/whats-on/")) as response:
        web_page = response.read()
    soup = BeautifulSoup(web_page)

    summaries = soup.find_all("h2", attrs={"itemprop": "summary"})
    # NOTE(review): only the tag lookup was implemented; the raw tags are
    # returned so the result is no longer discarded -- confirm whether
    # callers want the tags or their text.
    return summaries
def get_ents(url):
    """Scrape one event listing page for band names and start dates.

    Every ``div.event-list-item-container`` on the page is treated as one
    gig: the last child of its first ``<h3>`` is the band and the ``title``
    attribute of its ``<abbr class="dtstart">`` is the date.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list ``[band, date]`` of two parallel lists, one entry per gig.
    """
    # urllib2 responses are not context managers; closing() releases the
    # connection as soon as the body has been read.
    with closing(urllib2.urlopen(url)) as response:
        web_page = response.read()
    soup = BeautifulSoup(web_page)

    gigs = soup.find_all("div", attrs={"class": "event-list-item-container"})
    band = [gig.find("h3").contents[-1] for gig in gigs]
    date = [gig.find("abbr", attrs={"class": "dtstart"})['title'] for gig in gigs]
    return [band, date]
# Fetch the ents24 listings for both venues; each result is the
# [band, date] pair of lists that get_ents() returns.
thekla, colston = (
    get_ents("https://www.ents24.com/bristol-events/thekla"),
    get_ents("https://www.ents24.com/bristol-events/colston-hall"),
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment