Last active
May 21, 2020 00:16
-
-
Save jaredculp/8241687 to your computer and use it in GitHub Desktop.
Web scraper to grab all setlists from http://dmbalmanac.com and output them to a tab-separated file (`data/OUTPUT_FILENAME.tsv`). Invoked via `python dmb-scraper.py START_YEAR END_YEAR OUTPUT_FILENAME`. To get song counts, invoke `python dmb-analysis.py DATA_FILENAME`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import sys | |
from collections import Counter | |
def count_songs(path):
    """Count how often each song appears in a setlist data file.

    The file is read as tab-delimited text. Its first row is treated as a
    header and skipped; the song name is taken from the fifth column of
    every remaining row.

    Args:
        path: Location of the tab-separated data file.

    Returns:
        A ``collections.Counter`` mapping each song name to the number of
        rows it appears in. An empty file yields an empty counter.
    """
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(path, 'rb')
    else:
        csvfile = open(path, 'r', newline='')
    with csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        # next(reader, None) rather than reader.next(): it exists on both
        # Python lines and does not raise StopIteration on an empty file.
        next(reader, None)
        # Blank or truncated rows have no song column; skip them instead of
        # dying with IndexError part-way through the file.
        return Counter(row[4] for row in reader if len(row) > 4)


def print_song_counts(path):
    """Print a Markdown-style list of play counts, most played first.

    Args:
        path: Location of the tab-separated data file to summarise.
    """
    print("DMB Song Counts\n===")
    # most_common() gives a stable, useful ordering; plain dict iteration
    # order is arbitrary.
    for name, plays in count_songs(path).most_common():
        print("-\t**%s** has been played **%d** times" % (name, plays))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Provide the path to a data file")
    print_song_counts(sys.argv[1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unicodecsv as csv | |
import re | |
import sys | |
from bs4 import BeautifulSoup | |
from multiprocessing.dummy import Pool as ThreadPool | |
from urllib2 import urlopen | |
from urlparse import urlparse, parse_qs | |
# Site root; tour and show paths are appended to this to build request URLs.
DMB_ALMANAC_URL = "http://dmbalmanac.com"

# Values of the "bgcolor" attribute that determine_song_type() compares
# against to classify the slot a song was played in.

# Reported as "(opener)".
OPENER_COLOR = "#006666"
SECOND_OPENER_COLOR = "#004040"

# Reported as "(closer)".
CLOSER_COLOR = "#336699"
SECOND_CLOSER_COLOR = "#214263"

# Reported as "(encore)" and "(second encore)" respectively.
ENCORE_COLOR = "#660000"
SECOND_ENCORE_COLOR = "#CC0000"
class DMBShow(object):
    """A single show, identified by its id, page URL and date.

    Inherits from ``object`` so that it is a new-style class on Python 2.
    Instances use the default identity-based hashing, which lets them be
    used as dictionary keys.

    Keyword Args:
        id: Identifier of the show.
        url: Path of the show's page.
        date: Text describing when the show took place.

    Any keyword that is not supplied leaves the matching attribute set to
    ``None``; unrecognised keywords are ignored.
    """

    def __init__(self, **kwargs):
        # dict.get already returns None for a missing key.
        self.id = kwargs.get('id')
        self.url = kwargs.get('url')
        self.date = kwargs.get('date')

    def __repr__(self):
        return "<Show %s>" % self.date
class DMBSong(object):
    """One song entry from a show's setlist.

    Inherits from ``object`` so that it is a new-style class on Python 2.

    Keyword Args:
        name: Title of the song.
        number: Position text of the song within the setlist.
        slot: Slot label such as "(opener)", or ``None``.
        time: Duration text for the song.
        show_id: Identifier of the show the song belongs to.

    Any keyword that is not supplied leaves the matching attribute set to
    ``None``; unrecognised keywords are ignored.
    """

    def __init__(self, **kwargs):
        # dict.get already returns None for a missing key.
        self.name = kwargs.get('name')
        self.number = kwargs.get('number')
        self.slot = kwargs.get('slot')
        self.time = kwargs.get('time')
        self.show_id = kwargs.get('show_id')

    def __repr__(self):
        return "<Song #%s %s %s>" % (self.number, self.name, self.slot)
def make_soup(url):
    """Fetch *url* and parse the response with BeautifulSoup's "lxml" parser.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The ``BeautifulSoup`` document built from the response.
    """
    response = urlopen(url)
    try:
        return BeautifulSoup(response, "lxml")
    finally:
        # Always release the connection; the original never closed it,
        # leaking one socket per page fetched.
        response.close()
def get_shows_for_tour(tour_url):
    """Build DMBShow objects for all shows on a given tour year.

    Args:
        tour_url: Value for the ``where`` query parameter of the tour page
            (the caller passes the year as a string).

    Returns:
        A list with one DMBShow per link whose href matches "TourShowSet",
        leaving out links for which get_show_info() returns ``None``.
    """
    print("Getting shows for tour {0}".format(tour_url))
    soup = make_soup(DMB_ALMANAC_URL + "/TourShow.aspx?where=" + tour_url)
    shows_for_tour = []
    for link in soup.find_all(href=re.compile("TourShowSet")):
        # Parse each link once; the original parsed every link twice, once
        # for the filter and once for the value.
        show = get_show_info(link)
        if show is not None:
            shows_for_tour.append(show)
    return shows_for_tour
def get_show_info(show):
    """Turn a show link into a DMBShow, or ``None`` if it is unusable.

    Args:
        show: Link element exposing ``get("href")`` and ``text``.

    Returns:
        A DMBShow carrying the link's ``id`` query parameter, its URL and
        its ASCII-encoded text as the date. ``None`` is returned when the
        link text is empty after encoding or the href has no ``id``
        parameter.
    """
    # Drop the first character of the href; presumably the leading "." of a
    # relative link, so the rest can be appended to the site root. TODO confirm
    show_url = show.get("href")[1:]

    date = show.text.encode('ascii', 'ignore')
    if not date:
        return None

    # parse_qs maps each parameter to a list of values; a link without an
    # "id" used to blow up with TypeError on the [0] lookup.
    ids = parse_qs(urlparse(show_url).query).get("id")
    if not ids:
        return None

    return DMBShow(id=ids[0], url=show_url, date=date)
def get_setlist_for_show(show):
    """Fetch a show's page and pair the show with the songs found on it.

    Args:
        show: Object providing ``url`` (path appended to the site root) and
            ``id`` (passed on to every song).

    Returns:
        A one-entry dict mapping *show* to a list of DMBSong objects, one
        per ``<a>`` element with the CSS class "lightorange".
    """
    print("\t-- Getting setlist for show {0}".format(show))
    page = make_soup(DMB_ALMANAC_URL + show.url)
    setlist = []
    for link in page.find_all("a", "lightorange"):
        setlist.append(get_song_info(link, show.id))
    return {show: setlist}
def get_song_info(song, show_id):
    """Build a DMBSong from a song link and the elements around it.

    Args:
        song: Link element for the song; its text is the song name.
        show_id: Identifier of the show the song was played at.

    Returns:
        A DMBSong whose number comes from the element just before the
        link's parent, whose time comes from the element just after it, and
        whose slot is decided by determine_song_type(). All text values are
        ASCII-encoded with unencodable characters dropped.
    """
    cell = song.parent
    # next_sibling is the first item next_siblings would yield, so there is
    # no need to build a list of every following sibling to read it.
    time_cell = cell.next_sibling
    return DMBSong(
        name=song.text.encode('ascii', 'ignore'),
        number=cell.previous_sibling.text.encode('ascii', 'ignore'),
        slot=determine_song_type(song),
        time=time_cell.text.encode('ascii', 'ignore'),
        show_id=show_id,
    )
def determine_song_type(song):
    """Classify the slot a song was played in from a background colour.

    The colour is the "bgcolor" attribute of the element immediately before
    the song link's parent.

    Args:
        song: Link element for the song.

    Returns:
        "(opener)", "(closer)", "(encore)" or "(second encore)" when the
        colour matches one of the known slot colours, otherwise ``None``.
    """
    marker = song.parent.previous_sibling.get("bgcolor")
    if marker in (OPENER_COLOR, SECOND_OPENER_COLOR):
        return "(opener)"
    if marker in (CLOSER_COLOR, SECOND_CLOSER_COLOR):
        return "(closer)"
    if marker == ENCORE_COLOR:
        return "(encore)"
    if marker == SECOND_ENCORE_COLOR:
        return "(second encore)"
    return None
def write_data_to_tsv(data, filename):
    """Write every scraped song as one row of ``data/<filename>.tsv``.

    Args:
        data: Iterable of tours, each an iterable of dicts that map a show
            to the list of songs played at it.
        filename: Base name of the output file, without directory or
            extension.

    The file starts with a header row (show_id, show_date, slot, number,
    song) followed by one tab-delimited row per song.
    """
    # Imported locally because the module does not import os at the top.
    import os

    path = "data/%s.tsv" % filename
    # Create the output directory on demand so that a missing "data" folder
    # does not throw away the scraped results at the very last step.
    directory = os.path.dirname(path)
    if not os.path.isdir(directory):
        os.makedirs(directory)

    with open(path, "w+b") as f:
        fieldnames = ("show_id", "show_date", "slot", "number", "song")
        output = csv.writer(f, delimiter="\t")
        output.writerow(fieldnames)
        for tour in data:
            for show_set in tour:
                for show, setlist in show_set.items():
                    for song in setlist:
                        output.writerow([show.id, show.date,
                                         song.slot, song.number, song.name])
    print("Done writing file")
if __name__ == '__main__':
    # Usage: python dmb-scraper.py START_YEAR END_YEAR OUTPUT_FILENAME
    try:
        START_DATE = int(sys.argv[1])
        END_DATE = int(sys.argv[2])
        FILENAME = str(sys.argv[3])
    except (IndexError, ValueError):
        # IndexError: too few arguments. ValueError: a year that is not an
        # integer, which previously escaped as a traceback.
        print("Provide a start and end year YYYY and output name")
        # Non-zero status so calling scripts can tell the run did not happen.
        sys.exit(1)

    results = []
    # One pool of 8 worker threads is shared by all tours instead of being
    # rebuilt for each year.
    pool = ThreadPool(8)
    try:
        # NOTE(review): END_DATE is exclusive here, exactly as in the
        # original `while tour < END_DATE` loop; confirm whether the end
        # year was meant to be included.
        for tour in range(START_DATE, END_DATE):
            shows = get_shows_for_tour(str(tour))
            results.append(pool.map(get_setlist_for_show, shows))
    finally:
        # Shut the workers down even if a fetch raised.
        pool.close()
        pool.join()

    write_data_to_tsv(results, FILENAME)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment