Skip to content

Instantly share code, notes, and snippets.

@sampsyo
Created August 10, 2011 23:57
Show Gist options
  • Save sampsyo/1138600 to your computer and use it in GitHub Desktop.
Save sampsyo/1138600 to your computer and use it in GitHub Desktop.
Simple dependency-free multi-source lyrics scraper
"""Simple scraper for LyricsWiki and Lyrics.com. The code is hacky and
ugly because I wanted to implement this without a full-blown HTML parser
like BeautifulSoup or lxml.
"""
import re
import sys
import urllib
import urllib.parse
import urllib.request
# HTML comments. Non-greedy so that multiple comments in a page are removed
# individually; the original greedy '.*' (with re.S) deleted everything
# between the FIRST '<!--' and the LAST '-->', including real lyrics text.
COMMENT_RE = re.compile(r'<!--.*?-->', re.S)
# An opening or closing DIV tag; group 1 is '/' for a closing tag. The final
# '>' is optional so '<div class=...>' still registers as an opening tag.
DIV_RE = re.compile(r'<(/?)div>?')
# Any remaining HTML tag (stripped last, after structure has been used).
TAG_RE = re.compile(r'<[^>]*>')
# <br>, <br/>, <br /> line breaks.
BREAK_RE = re.compile(r'<br\s*/?>')
def unescape(text):
    """Resolve decimal numeric HTML entities (``&#NNN;``) in *text*,
    replacing each with the character it denotes.

    Uses ``chr`` (Python 3); the original ``unichr`` is Python-2-only.
    Named entities like ``&amp;`` are not handled — only numeric ones.
    """
    def replchar(m):
        num = m.group(1)
        return chr(int(num))
    return re.sub(r"&#(\d+);", replchar, text)
def extract_text(html, starttag):
    """Extract the text from a <DIV> tag in the HTML starting with
    ``starttag``. Returns None if parsing fails (tag absent or never
    closed).
    """
    # Strip off the leading text before opening tag.
    try:
        _, html = html.split(starttag, 1)
    except ValueError:
        # starttag does not occur in the page.
        return

    # Walk through balanced DIV tags, keeping only text at nesting level
    # zero (directly inside the target DIV, not inside nested DIVs).
    level = 0
    parts = []
    pos = 0
    for match in DIV_RE.finditer(html):
        if match.group(1):  # Closing tag.
            level -= 1
            if level == 0:
                # A nested DIV just closed; resume collecting after it.
                pos = match.end()
        else:  # Opening tag.
            if level == 0:
                # A nested DIV is starting; bank the text collected so far.
                parts.append(html[pos:match.start()])
            level += 1

        if level == -1:
            # This closing tag closes the target DIV itself.
            parts.append(html[pos:match.start()])
            break
    else:
        # Loop exhausted without finding the matching close tag.
        print('no closing tag found!')
        return
    lyrics = ''.join(parts)

    # Strip cruft.
    lyrics = COMMENT_RE.sub('', lyrics)
    lyrics = unescape(lyrics)
    lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
    lyrics = BREAK_RE.sub('\n', lyrics)  # <BR> newlines.
    lyrics = re.sub(r'\n +', '\n', lyrics)
    lyrics = re.sub(r' +\n', '\n', lyrics)
    lyrics = TAG_RE.sub('', lyrics)  # Strip remaining HTML tags.

    lyrics = lyrics.strip()
    return lyrics
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
s = re.sub(r'\s+', '_', s)
s = s.replace("<", "Less_Than")
s = s.replace(">", "Greater_Than")
s = s.replace("#", "Number_")
s = re.sub(r'[\[\{]', '(', s)
s = re.sub(r'[\]\}]', ')', s)
return urllib.quote(s)
def fetch_lyricswiki(artist, title):
    """Fetch lyrics from LyricsWiki. Returns the lyrics text or None."""
    url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
    # urlopen returns bytes in Python 3; decode before string processing.
    # Assumes UTF-8 — TODO confirm against the site's actual charset.
    html = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
    lyrics = extract_text(html, "<div class='lyricbox'>")
    # The wiki serves a boilerplate page for songs it cannot display.
    if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
        return lyrics
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
def _lc_encode(s):
s = re.sub(r'\s+', '-', s)
return urllib.quote(s)
def fetch_lyricscom(artist, title):
    """Fetch lyrics from Lyrics.com. Returns the lyrics text or None."""
    url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
    # urlopen returns bytes in Python 3; decode before string processing.
    # Assumes UTF-8 — TODO confirm against the site's actual charset.
    html = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
    lyrics = extract_text(html, '<div id="lyric_space">')
    if not lyrics:
        return
    # Drop the "Lyrics powered by ..." footer. Taking [0] instead of
    # two-name unpacking avoids a ValueError crash when the footer
    # marker happens to be absent from the page.
    lyrics = lyrics.split('\n---\nLyrics powered by', 1)[0]
    return lyrics
BACKENDS = [fetch_lyricswiki, fetch_lyricscom]
def get_lyrics(artist, title, backends=None):
    """Fetch lyrics, trying each source in turn.

    ``backends`` is an optional list of callables taking ``(artist,
    title)`` and returning lyrics text or None; it defaults to the
    module-level ``BACKENDS``. Returns the first truthy result, or None
    if every source fails.
    """
    if backends is None:
        backends = BACKENDS
    for backend in backends:
        lyrics = backend(artist, title)
        if lyrics:
            return lyrics
if __name__ == '__main__':
    # Exit with a usage message instead of an unpacking traceback when
    # the wrong number of arguments is given.
    if len(sys.argv) != 3:
        sys.exit('usage: %s ARTIST TITLE' % sys.argv[0])
    artist, title = sys.argv[1:]
    print(get_lyrics(artist, title))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment