Simple dependency-free multi-source lyrics scraper
"""Simple scraper for LyricsWiki and Lyrics.com. The code is hacky and | |
ugly because I wanted to implement this without a full-blown HTML parser | |
like BeautifulSoup or lxml. | |
""" | |
import urllib | |
import sys | |
import re | |
COMMENT_RE = re.compile(r'<!--.*-->', re.S) | |
DIV_RE = re.compile(r'<(/?)div>?') | |
TAG_RE = re.compile(r'<[^>]*>') | |
BREAK_RE = re.compile(r'<br\s*/?>') | |
def unescape(text):
    """Resolves &#xxx; HTML entities."""
    def replchar(m):
        num = m.group(1)
        return unichr(int(num))
    return re.sub(r"&#(\d+);", replchar, text)
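# For illustration only (this call is not made by the script):
# unescape('Don&#8217;t') returns u'Don\u2019t', since each decimal &#NNN;
# entity is replaced with unichr(NNN).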
def extract_text(html, starttag):
    """Extract the text from a <DIV> tag in the HTML starting with
    ``starttag``. Returns None if parsing fails.
    """
    # Strip off the leading text before opening tag.
    try:
        _, html = html.split(starttag, 1)
    except ValueError:
        return

    # Walk through balanced DIV tags.
    level = 0
    parts = []
    pos = 0
    for match in DIV_RE.finditer(html):
        if match.group(1):  # Closing tag.
            level -= 1
            if level == 0:
                pos = match.end()
        else:  # Opening tag.
            if level == 0:
                parts.append(html[pos:match.start()])
            level += 1

        if level == -1:
            parts.append(html[pos:match.start()])
            break
    else:
        print 'no closing tag found!'
        return
    lyrics = ''.join(parts)

    # Strip cruft.
    lyrics = COMMENT_RE.sub('', lyrics)
    lyrics = unescape(lyrics)
    lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
    lyrics = BREAK_RE.sub('\n', lyrics)  # <BR> newlines.
    lyrics = re.sub(r'\n +', '\n', lyrics)
    lyrics = re.sub(r' +\n', '\n', lyrics)
    lyrics = TAG_RE.sub('', lyrics)  # Strip remaining HTML tags.
    lyrics = lyrics.strip()
    return lyrics
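# For illustration only (this snippet is not part of the scraper): given
# balanced markup, only the text at nesting level zero inside the matched
# <div> survives; nested <div>s and their contents are dropped. For example,
#   extract_text("<div id='x'>hello <div>skip</div> world</div>", "<div id='x'>")
# returns 'hello world' after whitespace collapsing.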
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'

def _lw_encode(s):
    s = re.sub(r'\s+', '_', s)
    s = s.replace("<", "Less_Than")
    s = s.replace(">", "Greater_Than")
    s = s.replace("#", "Number_")
    s = re.sub(r'[\[\{]', '(', s)
    s = re.sub(r'[\]\}]', ')', s)
    return urllib.quote(s)
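# For illustration only: _lw_encode('The Beatles') gives 'The_Beatles' and
# _lw_encode('Song #2') gives 'Song_Number_2', so fetch_lyricswiki below ends
# up requesting e.g. http://lyrics.wikia.com/The_Beatles:Let_It_Be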
def fetch_lyricswiki(artist, title):
    """Fetch lyrics from LyricsWiki."""
    url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
    html = urllib.urlopen(url).read()
    lyrics = extract_text(html, "<div class='lyricbox'>")
    if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
        return lyrics
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'

def _lc_encode(s):
    s = re.sub(r'\s+', '-', s)
    return urllib.quote(s)
def fetch_lyricscom(artist, title):
    """Fetch lyrics from Lyrics.com."""
    url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
    html = urllib.urlopen(url).read()
    lyrics = extract_text(html, '<div id="lyric_space">')
    if lyrics:
        # Drop the "Lyrics powered by ..." footer when present.
        lyrics = lyrics.split('\n---\nLyrics powered by', 1)[0]
        return lyrics
BACKENDS = [fetch_lyricswiki, fetch_lyricscom]

def get_lyrics(artist, title):
    """Fetch lyrics, trying each source in turn."""
    for backend in BACKENDS:
        lyrics = backend(artist, title)
        if lyrics:
            return lyrics

if __name__ == '__main__':
    artist, title = sys.argv[1:]
    print get_lyrics(artist, title)
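# Example usage (illustrative; the file name is assumed, and what gets printed
# depends on what the sites return at request time):
#
#   $ python lyrics.py "The Beatles" "Let It Be"
#
# From other code, call get_lyrics(artist, title) directly; it returns None
# when no backend finds anything.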