Skip to content

Instantly share code, notes, and snippets.

@jaredculp
Last active May 21, 2020 00:16
Show Gist options
  • Save jaredculp/8241687 to your computer and use it in GitHub Desktop.
Save jaredculp/8241687 to your computer and use it in GitHub Desktop.
Web scraper to grab all setlists from http://dmbalmanac.com and write them to a tab-separated file (`data/OUTPUT_FILENAME.tsv`). Invoke it via `python dmb-scraper.py START_YEAR END_YEAR OUTPUT_FILENAME`; END_YEAR itself is not scraped. To get song counts, invoke `python dmb-analysis.py DATA_FILENAME`.
import csv
import sys
from collections import Counter
# dmb-analysis.py: report how many times each song appears in a
# tab-separated setlist file.
try:
    data_filename = sys.argv[1]
except IndexError:
    # Print a usage hint instead of a traceback when the argument is missing.
    print("Provide the name of a setlist data file to analyse")
    sys.exit(1)

# 'rb' is the mode the Python 2 csv module expects for its input file.
with open(data_filename, 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    # Discard the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)
    # The song name is read from the fifth column. Rows too short to have
    # one (e.g. blank lines) are skipped rather than raising IndexError.
    songs = [row[4] for row in reader if len(row) > 4]

print("DMB Song Counts\n===")
for (name, count) in Counter(songs).items():
    print("-\t**%s** has been played **%d** times" % (name, count))
import unicodecsv as csv
import re
import sys
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool
from urllib2 import urlopen
from urlparse import urlparse, parse_qs
# Base URL of the scraped site; page paths are appended to it before fetching.
DMB_ALMANAC_URL = "http://dmbalmanac.com"
# The colour values below are compared (exact string match) against the
# `bgcolor` attribute read in determine_song_type to decide a song's slot.
# Both opener colours map to "(opener)" and both closer colours to "(closer)".
OPENER_COLOR = "#006666"
CLOSER_COLOR = "#336699"
SECOND_OPENER_COLOR = "#004040"
SECOND_CLOSER_COLOR = "#214263"
# Mapped to "(encore)" and "(second encore)" respectively.
ENCORE_COLOR = "#660000"
SECOND_ENCORE_COLOR = "#CC0000"
class DMBShow():
    """Plain record describing one show.

    Every attribute is taken from the keyword argument of the same name and
    is None when that argument is not supplied; other keyword arguments are
    ignored.

    Attributes:
        id, url, date
    """

    def __init__(self, **kwargs):
        for field in ('id', 'url', 'date'):
            setattr(self, field, kwargs.get(field))

    def __repr__(self):
        template = "<Show %s>"
        return template % self.date
class DMBSong():
    """Plain record describing one song played at a show.

    Every attribute is taken from the keyword argument of the same name and
    is None when that argument is not supplied; other keyword arguments are
    ignored.

    Attributes:
        name, number, slot, time, show_id
    """

    def __init__(self, **kwargs):
        for field in ('name', 'number', 'slot', 'time', 'show_id'):
            setattr(self, field, kwargs.get(field))

    def __repr__(self):
        parts = (self.number, self.name, self.slot)
        return "<Song #%s %s %s>" % parts
def make_soup(url):
    """Fetch *url* and return its content parsed by BeautifulSoup with lxml.

    The HTTP response is closed explicitly once the document has been
    parsed, instead of being left open until it is garbage collected.
    """
    # Local import: the file's import block does not provide contextlib.
    from contextlib import closing

    # closing() guarantees response.close() is called whether or not the
    # response object supports the `with` statement on its own.
    with closing(urlopen(url)) as response:
        return BeautifulSoup(response, "lxml")
def get_shows_for_tour(tour_url):
    """Build DMBShow objects for all shows on a given tour year.

    tour_url: string appended to the ``/TourShow.aspx?where=`` query (the
        script passes the tour year as a string).

    Returns a list with one entry per link whose href matches
    "TourShowSet", omitting links for which get_show_info returns None.
    """
    print("Getting shows for tour {0}".format(tour_url))
    soup = make_soup(DMB_ALMANAC_URL +
                     "/TourShow.aspx?where=" + tour_url)
    shows_for_tour = []
    for link in soup.find_all(href=re.compile("TourShowSet")):
        # Parse each link once; the result serves as both the filter and
        # the collected value.
        show = get_show_info(link)
        if show is not None:
            shows_for_tour.append(show)
    return shows_for_tour
def get_show_info(show):
    """Build a DMBShow from a show link element.

    show: element exposing ``get("href")`` and ``text``.

    Returns a DMBShow whose id is the first ``id`` query parameter of the
    link, whose url is the href minus its first character, and whose date
    is the link text with non-ASCII characters dropped. Returns None when
    the link has no ``id`` query parameter or its date text is empty.
    """
    # The first character of the href is dropped; presumably a leading "."
    # so the rest can be appended to the site's base URL -- TODO confirm.
    show_url = show.get("href")[1:]
    show_ids = parse_qs(urlparse(show_url).query).get("id")
    if not show_ids:
        # No usable id: report "no show" instead of failing on None[0].
        return None
    date = show.text.encode('ascii', 'ignore')
    if not date:
        return None
    return DMBShow(id=show_ids[0], url=show_url, date=date)
def get_setlist_for_show(show):
    """Return a one-entry dict mapping *show* to the list of songs played.

    The show page (DMB_ALMANAC_URL + show.url) is fetched and every ``a``
    element matched by the "lightorange" filter is converted with
    get_song_info.
    """
    print("\t-- Getting setlist for show {0}".format(show))
    page = make_soup(DMB_ALMANAC_URL + show.url)
    setlist = []
    for link in page.find_all("a", "lightorange"):
        setlist.append(get_song_info(link, show.id))
    return {show: setlist}
def get_song_info(song, show_id):
    """Build a DMBSong from a song link element.

    song: link element for the song; its parent's neighbouring siblings
        supply the remaining fields.
    show_id: identifier stored on the resulting DMBSong.

    Text values are encoded to ASCII with unencodable characters dropped.
    """
    following = list(song.parent.next_siblings)
    name = song.text.encode('ascii', 'ignore')
    # The element just before the link's parent carries the song number.
    number = song.parent.previous_sibling.text.encode('ascii', 'ignore')
    slot = determine_song_type(song)
    # The first sibling after the link's parent carries the time.
    duration = following[0].text.encode('ascii', 'ignore')
    return DMBSong(name=name, number=number, slot=slot,
                   time=duration, show_id=show_id)
def determine_song_type(song):
    """Determine whether a song was played in an opener, closer or encore slot.

    The decision is based on the ``bgcolor`` attribute of the element that
    precedes the song link's parent. Returns "(opener)", "(closer)",
    "(encore)", "(second encore)", or None when the colour matches none of
    the known slot colours.
    """
    color = song.parent.previous_sibling.get("bgcolor")
    if color in (OPENER_COLOR, SECOND_OPENER_COLOR):
        return "(opener)"
    if color in (CLOSER_COLOR, SECOND_CLOSER_COLOR):
        return "(closer)"
    if color == ENCORE_COLOR:
        return "(encore)"
    if color == SECOND_ENCORE_COLOR:
        return "(second encore)"
    return None
def write_data_to_tsv(data, filename):
    """Write every scraped song to ``data/<filename>.tsv``, one row per song.

    data: iterable of tours; each tour is an iterable of dicts mapping a
        show to its list of songs.
    filename: base name of the output file, without directory or extension.

    The output directory is created when missing, so a finished scrape is
    not lost to an IOError at the final step. The file starts with a header
    row followed by tab-delimited rows of show id, show date, slot, number
    and song name.
    """
    # Local import: the file's import block does not provide os.
    import os

    path = "data/%s.tsv" % filename
    out_dir = os.path.dirname(path)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(path, "w+b") as f:
        fieldnames = ("show_id", "show_date", "slot", "number", "song")
        output = csv.writer(f, delimiter="\t")
        output.writerow(fieldnames)
        for tour in data:
            for show_set in tour:
                for show, setlist in show_set.items():
                    for song in setlist:
                        output.writerow([show.id, show.date,
                                         song.slot, song.number, song.name])
    print("Done writing file")
if __name__ == '__main__':
    # Usage: python dmb-scraper.py START_YEAR END_YEAR OUTPUT_FILENAME
    try:
        START_DATE = int(sys.argv[1])
        END_DATE = int(sys.argv[2])
        FILENAME = str(sys.argv[3])
    except (IndexError, ValueError):
        # IndexError: too few arguments; ValueError: a year that is not an
        # integer. Exit non-zero so callers can detect the usage error.
        print("Provide a start and end year YYYY and output name")
        sys.exit(1)

    results = []
    # One pass per tour year from START_DATE up to, but not including,
    # END_DATE.
    for tour in range(START_DATE, END_DATE):
        shows = get_shows_for_tour(str(tour))
        # Fetch the setlists of one tour concurrently on 8 worker threads.
        pool = ThreadPool(8)
        try:
            results.append(pool.map(get_setlist_for_show, shows))
        finally:
            # Release the workers even when a fetch raised.
            pool.close()
            pool.join()
    write_data_to_tsv(results, FILENAME)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment