Skip to content

Instantly share code, notes, and snippets.

@sampsyo
Created August 10, 2011 23:57
Show Gist options
  • Save sampsyo/1138600 to your computer and use it in GitHub Desktop.
Save sampsyo/1138600 to your computer and use it in GitHub Desktop.
Simple dependency-free multi-source lyrics scraper
"""Simple scraper for LyricsWiki and Lyrics.com. The code is hacky and
ugly because I wanted to implement this without a full-blown HTML parser
like BeautifulSoup or lxml.
"""
import re
import sys
import urllib
import urllib.parse
import urllib.request
# HTML comments. Non-greedy so that multiple comments in a page are removed
# individually; the original greedy '.*' (with re.S) deleted everything
# between the FIRST '<!--' and the LAST '-->', including real lyrics text.
COMMENT_RE = re.compile(r'<!--.*?-->', re.S)
# An opening or closing DIV tag; group 1 is '/' for a closing tag. The final
# '>' is optional so '<div class=...>' still registers as an opening tag.
DIV_RE = re.compile(r'<(/?)div>?')
# Any remaining HTML tag (stripped last, after structure has been used).
TAG_RE = re.compile(r'<[^>]*>')
# <br>, <br/>, <br /> line breaks.
BREAK_RE = re.compile(r'<br\s*/?>')
def unescape(text):
    """Resolve decimal numeric HTML entities (``&#NNN;``) in *text*,
    replacing each with the character it denotes.

    Uses ``chr`` (Python 3); the original ``unichr`` is Python-2-only.
    Named entities like ``&amp;`` are not handled — only numeric ones.
    """
    def replchar(m):
        num = m.group(1)
        return chr(int(num))
    return re.sub(r"&#(\d+);", replchar, text)
def extract_text(html, starttag):
    """Extract the text from a <DIV> tag in the HTML starting with
    ``starttag``. Returns None if parsing fails (tag absent or never
    closed).
    """
    # Strip off the leading text before opening tag.
    try:
        _, html = html.split(starttag, 1)
    except ValueError:
        # starttag does not occur in the page.
        return

    # Walk through balanced DIV tags, keeping only text at nesting level
    # zero (directly inside the target DIV, not inside nested DIVs).
    level = 0
    parts = []
    pos = 0
    for match in DIV_RE.finditer(html):
        if match.group(1):  # Closing tag.
            level -= 1
            if level == 0:
                # A nested DIV just closed; resume collecting after it.
                pos = match.end()
        else:  # Opening tag.
            if level == 0:
                # A nested DIV is starting; bank the text collected so far.
                parts.append(html[pos:match.start()])
            level += 1

        if level == -1:
            # This closing tag closes the target DIV itself.
            parts.append(html[pos:match.start()])
            break
    else:
        # Loop exhausted without finding the matching close tag.
        print('no closing tag found!')
        return
    lyrics = ''.join(parts)

    # Strip cruft.
    lyrics = COMMENT_RE.sub('', lyrics)
    lyrics = unescape(lyrics)
    lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
    lyrics = BREAK_RE.sub('\n', lyrics)  # <BR> newlines.
    lyrics = re.sub(r'\n +', '\n', lyrics)
    lyrics = re.sub(r' +\n', '\n', lyrics)
    lyrics = TAG_RE.sub('', lyrics)  # Strip remaining HTML tags.

    lyrics = lyrics.strip()
    return lyrics
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
s = re.sub(r'\s+', '_', s)
s = s.replace("<", "Less_Than")
s = s.replace(">", "Greater_Than")
s = s.replace("#", "Number_")
s = re.sub(r'[\[\{]', '(', s)
s = re.sub(r'[\]\}]', ')', s)
return urllib.quote(s)
def fetch_lyricswiki(artist, title):
    """Fetch lyrics from LyricsWiki. Returns the lyrics text or None."""
    url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
    # urlopen returns bytes in Python 3; decode before string processing.
    # Assumes UTF-8 — TODO confirm against the site's actual charset.
    html = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
    lyrics = extract_text(html, "<div class='lyricbox'>")
    # The wiki serves a boilerplate page for songs it cannot display.
    if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
        return lyrics
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
def _lc_encode(s):
s = re.sub(r'\s+', '-', s)
return urllib.quote(s)
def fetch_lyricscom(artist, title):
    """Fetch lyrics from Lyrics.com. Returns the lyrics text or None."""
    url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
    # urlopen returns bytes in Python 3; decode before string processing.
    # Assumes UTF-8 — TODO confirm against the site's actual charset.
    html = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
    lyrics = extract_text(html, '<div id="lyric_space">')
    if not lyrics:
        return
    # Drop the "Lyrics powered by ..." footer. Taking [0] instead of
    # two-name unpacking avoids a ValueError crash when the footer
    # marker happens to be absent from the page.
    lyrics = lyrics.split('\n---\nLyrics powered by', 1)[0]
    return lyrics
BACKENDS = [fetch_lyricswiki, fetch_lyricscom]
def get_lyrics(artist, title, backends=None):
    """Fetch lyrics, trying each source in turn.

    ``backends`` is an optional list of callables taking ``(artist,
    title)`` and returning lyrics text or None; it defaults to the
    module-level ``BACKENDS``. Returns the first truthy result, or None
    if every source fails.
    """
    if backends is None:
        backends = BACKENDS
    for backend in backends:
        lyrics = backend(artist, title)
        if lyrics:
            return lyrics
if __name__ == '__main__':
    # Exit with a usage message instead of an unpacking traceback when
    # the wrong number of arguments is given.
    if len(sys.argv) != 3:
        sys.exit('usage: %s ARTIST TITLE' % sys.argv[0])
    artist, title = sys.argv[1:]
    print(get_lyrics(artist, title))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment