Created
April 7, 2012 03:59
-
-
Save andlima/2324940 to your computer and use it in GitHub Desktop.
Scrape letras.terra.com.br
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
A short script that, given an artist, scrapes all his/her/their songs
from 'http://letras.terra.com.br'
Dependency: BeautifulSoup, to parse the lyrics from each song page.
Author: Andre Lima - http://github.com/andlima
Licensed under MIT License
Disclaimer: Sorry for the ugly code and lack of documentation. I'll try
to give it some love later.
"""
import sys | |
import os | |
import re | |
import urllib | |
from BeautifulSoup import BeautifulSoup | |
# Base of every page URL, kept as a one-element list so that a list of path
# segments can be concatenated to it before joining everything with '/'.
URL_BASE = ['http://letras.terra.com.br']
# Regex matching a song link on an artist page; the `song` group captures
# the value of the href attribute.
ELEMSUG = '<a class="elemsug" href="(?P<song>[^"]+)">'
# Regex capturing the markup inside the lyrics container.
# NOTE(review): not referenced anywhere in the visible code -- the lyrics
# are extracted with BeautifulSoup instead; confirm before removing.
DIV_LETRA = '<div id="div_letra">(?P<lyrics>.+)</div>'
# Path segments of the local cache directory ('./cache' once joined).
CACHE = ['.', 'cache']
def make_sure_dir_exists(d):
    """
    Create the directory `d` unless it is already there.

    Only the "directory already exists" outcome is ignored. Any other
    failure (missing parent directory, insufficient permissions, `d`
    existing as a regular file, ...) is re-raised as the original OSError
    instead of being silently swallowed.
    """
    import errno  # local import: errno is not among the module-level imports

    try:
        os.mkdir(d)
    except OSError as e:
        # EEXIST is also reported when `d` exists but is not a directory,
        # hence the additional isdir() check.
        if e.errno != errno.EEXIST or not os.path.isdir(d):
            raise
def get_url(path):
    """Return the absolute URL for `path`, given as a list of URL segments."""
    segments = URL_BASE + path
    return '/'.join(segments)
def get_file_name(path):
    """Return the cache file name for `path`, given as a list of segments."""
    segments = CACHE + path
    return '/'.join(segments)
def content_for_path(path):
    """
    Return the page at `path` as a list of stripped lines, using a disk cache.

    Receives a `path` represented as a list, like:
    - ['chico-buarque']
    - ['legiao-urbana', '1234']

    A one-element path is cached in the file 'root' inside a directory named
    after that element; a longer path is cached under its own segments.
    When the cache file cannot be opened, the page is downloaded from the
    URL built by get_url(), written to the cache and then returned.

    Returns its content as a list of strings (lines).
    """
    if len(path) == 1:
        # The single segment names a directory in the cache; the page itself
        # is stored inside it under the fixed name 'root'.
        make_sure_dir_exists(get_file_name(path))
        path2 = path + ['root']
    else:
        make_sure_dir_exists(get_file_name(path[:-1]))
        path2 = path

    try:
        with open(get_file_name(path2), 'r') as f:
            lines = [line.strip() for line in f]
    except IOError:
        # Cache miss: fetch the page, making sure the connection is released
        # even if reading the response fails.
        x = urllib.urlopen(get_url(path))
        try:
            lines = x.readlines()
        finally:
            x.close()
        # Store the raw lines (line endings included) before stripping them.
        with open(get_file_name(path2), 'w') as f:
            f.writelines(lines)
        lines = [line.strip() for line in lines]
    return lines
def fix_lyrics(lyrics):
    """
    Convert the HTML markup of a song's lyrics into song/strophe/verse tags.

    The text is wrapped in <song> ... </song>, every <p> ... </p> becomes a
    <strophe> holding <verse> elements, and every '<br />' ends one verse
    and starts the next. Runs of spaces left behind by those substitutions
    are collapsed into a single space.

    Returns the converted lyrics as a single string.
    """
    substitutions = [
        ('<p>', '<strophe> <verse> '),
        ('</p>', ' </verse> </strophe> '),
        ('<br />', ' </verse> <verse> '),
    ]
    lyrics = '<song> ' + lyrics + ' </song>'
    for a, b in substitutions:
        lyrics = lyrics.replace(a, b)
    # The padded replacements above produce doubled spaces; squeeze any run
    # of spaces down to one (a single str.replace pass cannot do that for
    # runs longer than two).
    return re.sub(' {2,}', ' ', lyrics)
if __name__ == '__main__':
    # Script entry point: collect the song links from one artist's page,
    # then write the converted lyrics of each song, one song per line, to
    # the file 'processed' inside that artist's cache directory.
    make_sure_dir_exists('/'.join(CACHE))

    try:
        path = [sys.argv[1]]
    except IndexError:
        # No artist given on the command line; fall back to a default.
        # Examples:
        # path = ['engenheiros-do-hawaii']
        # path = ['legiao-urbana']
        # path = ['chico-buarque']
        path = ['the-beatles']

    # Each ELEMSUG match holds the href of one song; split it into the list
    # of path segments that content_for_path() expects.
    song_paths = []
    for line in content_for_path(path):
        song_paths.extend(m.group('song').strip('/').split('/')
                          for m in re.finditer(ELEMSUG, line))

    with open(get_file_name(path + ['processed']), 'w') as output:
        for song_path in song_paths:
            # Single-argument form prints the same text as the former
            # `print 'processing', song_path` statement.
            print('processing %s' % (song_path,))
            content = ' '.join(content_for_path(song_path))
            soup = BeautifulSoup(content)
            lhtml = soup.find('div', id='div_letra')
            if lhtml is None:
                # A page without the lyrics container must not abort the
                # whole run; report it and move on to the next song.
                sys.stderr.write('skipping %s: no div_letra element\n'
                                 % (song_path,))
                continue
            # Only the odd-indexed children are kept.
            # NOTE(review): presumably the even-indexed ones are whitespace
            # text nodes between the <p> elements -- confirm against a page.
            lyrics = ' '.join(str(node) for node in lhtml.contents[1::2])
            output.write(fix_lyrics(lyrics) + '\n')
            # Flush after every song so partial results survive a crash.
            output.flush()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment