andlima · April 7, 2012 03:59
diff --git a/scrape_letras_terra.py b/scrape_letras_terra.py
 """

 A short script that, given an artist, scrapes all his/her/their songs
 from 'http://letras.terra.com.br'

 Dependency: BeautifulSoup, used to parse the lyrics from each song page.

 Author: Andre Lima - http://github.com/andlima

 Licensed under MIT License

 Disclaimer: Sorry for the ugly code and lack of documentation. I'll try
 to give it some love later.

 """

 import sys
 import os
 import re
 import urllib

 from BeautifulSoup import BeautifulSoup


 # Base of every page URL; get_url() appends the path segments to it.
 URL_BASE = ['http://letras.terra.com.br']
 # Regex for the song links on an artist page; group 'song' is the href.
 ELEMSUG = '<a class="elemsug" href="(?P<song>[^"]+)">'
 # Regex for the lyrics container. Not referenced by the code visible here
 # (the lyrics div is located with BeautifulSoup instead).
 DIV_LETRA = '<div id="div_letra">(?P<lyrics>.+)</div>'
 # Leading segments of every cache file path; see get_file_name().
 CACHE = ['.', 'cache']


 def make_sure_dir_exists(d):
     """Create directory `d` unless it is already there.

     Only the last path component is created (``os.mkdir``); the parent
     directory must already exist.

     Raises:
         OSError: if `d` could not be created and is not an existing
             directory (missing parent, permission denied, a regular
             file in the way, ...).
     """
     try:
         os.mkdir(d)
     except OSError:
         # An already existing directory is the expected case. Any other
         # failure would only resurface later as a confusing I/O error,
         # so let it propagate from here.
         if not os.path.isdir(d):
             raise

 def get_url(path):
     """Build the page URL for `path`, a list of URL segments.

     The segments are appended to URL_BASE and everything is joined
     with '/'.
     """
     segments = URL_BASE + path
     return '/'.join(segments)

 def get_file_name(path):
     """Build the cache file name for `path`, a list of path segments.

     The segments are appended to CACHE and everything is joined
     with '/'.
     """
     segments = CACHE + path
     return '/'.join(segments)

 def content_for_path(path):
     """
     Return the page for `path` as a list of stripped lines, using the
     on-disk cache when possible.

     Receives a `path` represented as a list, like:
      - ['chico-buarque']
      - ['legiao-urbana', '1234']

     A single-segment path is cached in a file named 'root' inside its
     own directory; longer paths are cached under their own segments.
     On a cache miss the page is downloaded from get_url(path) and the
     raw lines are written to the cache file before being returned.
     """
     if len(path) == 1:
         cache_path = path + ['root']
     else:
         cache_path = path

     # In both cases the directory to create is the parent of the cache file.
     make_sure_dir_exists(get_file_name(cache_path[:-1]))
     cache_file = get_file_name(cache_path)

     try:
         with open(cache_file, 'r') as f:
             lines = [line.strip() for line in f]
     except IOError:
         # Cache miss (or unreadable cache file): fetch the page.
         response = urllib.urlopen(get_url(path))
         try:
             raw_lines = response.readlines()
         finally:
             # Release the connection even if reading fails.
             response.close()
         with open(cache_file, 'w') as f:
             f.writelines(raw_lines)
         lines = [line.strip() for line in raw_lines]

     return lines

 def fix_lyrics(lyrics):
     """Convert a lyrics HTML fragment into <song>/<strophe>/<verse> markup.

     Every '<p>' opens a strophe and its first verse, every '</p>' closes
     them, every '<br />' ends one verse and starts the next, and the
     whole text is wrapped in '<song> ... </song>'. The inserted tags are
     padded with spaces, and runs of spaces are then collapsed to a
     single space.

     Returns the converted string.
     """
     substitutions = [
         ('<p>', '<strophe> <verse> '),
         ('</p>', ' </verse> </strophe> '),
         ('<br />', ' </verse> <verse> '),
     ]

     lyrics = '<song> ' + lyrics + ' </song>'

     for old, new in substitutions:
         lyrics = lyrics.replace(old, new)

     # One replace('  ', ' ') pass only halves each run of spaces, so runs
     # of three or more would still leave doubles; repeat until none remain.
     while '  ' in lyrics:
         lyrics = lyrics.replace('  ', ' ')

     return lyrics

 def main(argv):
     """Scrape every song of one artist into the cache.

     `argv` is the command line: argv[1], when present, is the artist's
     path segment on the site (e.g. 'chico-buarque'); otherwise
     'the-beatles' is used. The artist page is scanned for song links,
     each song page is fetched (through the cache), and one converted
     song per line is written to the 'processed' file in the artist's
     cache directory.
     """
     make_sure_dir_exists('/'.join(CACHE))

     try:
         path = [argv[1]]
     except IndexError:
         # Examples:
         # path = ['engenheiros-do-hawaii']
         # path = ['legiao-urbana']
         # path = ['chico-buarque']
         path = ['the-beatles']

     song_paths = []
     for line in content_for_path(path):
         song_paths.extend([m.group('song').strip('/').split('/')
                            for m in re.finditer(ELEMSUG, line)])

     with open(get_file_name(path + ['processed']), 'w') as output:
         for song_path in song_paths:
             # Single-argument form: same output as the print statement,
             # and it also parses on Python 3.
             print('processing %s' % (song_path,))
             content = ' '.join(content_for_path(song_path))
             soup = BeautifulSoup(content)
             lhtml = soup.find('div', id='div_letra')
             if lhtml is None:
                 # No lyrics container on this page: skip it instead of
                 # aborting the whole run with an AttributeError.
                 sys.stderr.write('no lyrics found for %s\n' % (song_path,))
                 continue
             # Only the odd-indexed children are kept -- presumably the
             # even ones are text nodes between the <p> tags; verify
             # against a real page.
             lyrics = ' '.join([str(p) for (i, p) in enumerate(lhtml.contents)
                                if i % 2 == 1])
             output.write(fix_lyrics(lyrics) + '\n')
             # Flush per song so partial results survive an interruption.
             output.flush()


 if __name__ == '__main__':
     main(sys.argv)
	"""

	A short script that, given an artist, scrapes all his/her/their songs
	from 'http://letras.terra.com.br'

	Dependency: BeautifulSoup, used to parse the lyrics from each song page.

	Author: Andre Lima - http://github.com/andlima

	Licensed under MIT License

	Disclaimer: Sorry for the ugly code and lack of documentation. I'll try
	to give it some love later.

	"""

	import sys
	import os
	import re
	import urllib

	from BeautifulSoup import BeautifulSoup


	# Base of every page URL; get_url() appends the path segments to it.
	URL_BASE = ['http://letras.terra.com.br']
	# Regex for the song links on an artist page; group 'song' is the href.
	ELEMSUG = '<a class="elemsug" href="(?P<song>[^"]+)">'
	# Regex for the lyrics container. Not referenced by the code visible here
	# (the lyrics div is located with BeautifulSoup instead).
	DIV_LETRA = '<div id="div_letra">(?P<lyrics>.+)</div>'
	# Leading segments of every cache file path; see get_file_name().
	CACHE = ['.', 'cache']


	def make_sure_dir_exists(d):
	    """Create directory `d` unless it is already there.

	    Only the last path component is created (``os.mkdir``); the parent
	    directory must already exist.

	    Raises:
	        OSError: if `d` could not be created and is not an existing
	            directory (missing parent, permission denied, a regular
	            file in the way, ...).
	    """
	    try:
	        os.mkdir(d)
	    except OSError:
	        # An already existing directory is the expected case. Any other
	        # failure would only resurface later as a confusing I/O error,
	        # so let it propagate from here.
	        if not os.path.isdir(d):
	            raise

	def get_url(path):
	    """Build the page URL for `path`, a list of URL segments.

	    The segments are appended to URL_BASE and everything is joined
	    with '/'.
	    """
	    return '/'.join(URL_BASE + path)

	def get_file_name(path):
	    """Build the cache file name for `path`, a list of path segments.

	    The segments are appended to CACHE and everything is joined
	    with '/'.
	    """
	    return '/'.join(CACHE + path)

	def content_for_path(path):
	    """
	    Return the page for `path` as a list of stripped lines, using the
	    on-disk cache when possible.

	    Receives a `path` represented as a list, like:
	     - ['chico-buarque']
	     - ['legiao-urbana', '1234']

	    A single-segment path is cached in a file named 'root' inside its
	    own directory; longer paths are cached under their own segments.
	    On a cache miss the page is downloaded from get_url(path) and the
	    raw lines are written to the cache file before being returned.
	    """
	    if len(path) == 1:
	        cache_path = path + ['root']
	    else:
	        cache_path = path

	    # In both cases the directory to create is the parent of the cache file.
	    make_sure_dir_exists(get_file_name(cache_path[:-1]))
	    cache_file = get_file_name(cache_path)

	    try:
	        with open(cache_file, 'r') as f:
	            lines = [line.strip() for line in f]
	    except IOError:
	        # Cache miss (or unreadable cache file): fetch the page.
	        response = urllib.urlopen(get_url(path))
	        try:
	            raw_lines = response.readlines()
	        finally:
	            # Release the connection even if reading fails.
	            response.close()
	        with open(cache_file, 'w') as f:
	            f.writelines(raw_lines)
	        lines = [line.strip() for line in raw_lines]

	    return lines

	def fix_lyrics(lyrics):
	    """Convert a lyrics HTML fragment into <song>/<strophe>/<verse> markup.

	    Every '<p>' opens a strophe and its first verse, every '</p>' closes
	    them, every '<br />' ends one verse and starts the next, and the
	    whole text is wrapped in '<song> ... </song>'. The inserted tags are
	    padded with spaces, and runs of spaces are then collapsed to a
	    single space.

	    Returns the converted string.
	    """
	    substitutions = [
	        ('<p>', '<strophe> <verse> '),
	        ('</p>', ' </verse> </strophe> '),
	        ('<br />', ' </verse> <verse> '),
	    ]

	    lyrics = '<song> ' + lyrics + ' </song>'

	    for old, new in substitutions:
	        lyrics = lyrics.replace(old, new)

	    # Replacing a single space by a single space does nothing; what is
	    # needed is to collapse the doubled spaces created by the padding
	    # above. Repeat so that longer runs are fully collapsed as well.
	    while '  ' in lyrics:
	        lyrics = lyrics.replace('  ', ' ')

	    return lyrics

	def main(argv):
	    """Scrape every song of one artist into the cache.

	    `argv` is the command line: argv[1], when present, is the artist's
	    path segment on the site (e.g. 'chico-buarque'); otherwise
	    'the-beatles' is used. The artist page is scanned for song links,
	    each song page is fetched (through the cache), and one converted
	    song per line is written to the 'processed' file in the artist's
	    cache directory.
	    """
	    make_sure_dir_exists('/'.join(CACHE))

	    try:
	        path = [argv[1]]
	    except IndexError:
	        # Examples:
	        # path = ['engenheiros-do-hawaii']
	        # path = ['legiao-urbana']
	        # path = ['chico-buarque']
	        path = ['the-beatles']

	    song_paths = []
	    for line in content_for_path(path):
	        song_paths.extend([m.group('song').strip('/').split('/')
	                           for m in re.finditer(ELEMSUG, line)])

	    with open(get_file_name(path + ['processed']), 'w') as output:
	        for song_path in song_paths:
	            # Single-argument form: same output as the print statement,
	            # and it also parses on Python 3.
	            print('processing %s' % (song_path,))
	            content = ' '.join(content_for_path(song_path))
	            soup = BeautifulSoup(content)
	            lhtml = soup.find('div', id='div_letra')
	            if lhtml is None:
	                # No lyrics container on this page: skip it instead of
	                # aborting the whole run with an AttributeError.
	                sys.stderr.write('no lyrics found for %s\n' % (song_path,))
	                continue
	            # Only the odd-indexed children are kept -- presumably the
	            # even ones are text nodes between the <p> tags; verify
	            # against a real page.
	            lyrics = ' '.join([str(p) for (i, p) in enumerate(lhtml.contents)
	                               if i % 2 == 1])
	            output.write(fix_lyrics(lyrics) + '\n')
	            # Flush per song so partial results survive an interruption.
	            output.flush()


	if __name__ == '__main__':
	    main(sys.argv)