Created
May 23, 2017 17:33
-
-
Save ErikGartner/318beb4da7597bb3aace9b5d9350ba89 to your computer and use it in GitHub Desktop.
Preprocess lyrics for RNN
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import os
import sys

import requests
# API key sent with every artist lookup. It is read from the LASTFM_API_KEY
# environment variable so the key does not have to be written into the source;
# when the variable is unset the value is the empty string, as before.
LASTM_FM_KEY = os.environ.get('LASTFM_API_KEY', '')

# Maps a normalised (stripped, lower-cased) artist name to the list of genre
# tag names found for that artist.
genre_cache = {}
def get_genres(artist_name):
    """Return the list of genre tag names for an artist.

    The name is stripped and lower-cased before use. Results are memoised in
    the module-level ``genre_cache``; only names missing from the cache trigger
    an ``artist.getinfo`` request to the audioscrobbler web service.

    The lookup is best effort: any error while downloading or parsing the
    response is reported on stdout and the artist is cached with an empty
    list, so the same failing lookup is not repeated for every row.

    Args:
        artist_name: Artist name as it appears in the input data.

    Returns:
        A list of tag name strings (empty when the lookup failed).
    """
    artist_name = artist_name.strip().lower()
    if artist_name not in genre_cache:
        # Default to "no genres" so the cache entry is well defined on every
        # failure path.
        genres = []
        try:
            print('Downloading genres for: %s' % artist_name)
            # Passing the query as ``params`` lets requests URL-encode the
            # artist name; names containing '&', '#' or '+' would otherwise
            # corrupt the query string.
            response = requests.get(
                'http://ws.audioscrobbler.com/2.0/',
                params={
                    'method': 'artist.getinfo',
                    'artist': artist_name,
                    'format': 'json',
                    'api_key': LASTM_FM_KEY,
                },
                timeout=30,  # never hang forever on a stalled connection
            )
            genres = [tag['name'] for tag in response.json()['artist']['tags']['tag']]
        except Exception as e:
            # Deliberately broad: one bad artist must not abort the whole run.
            print('Error while downloading genres for %s: %s' % (artist_name, e))
        genre_cache[artist_name] = genres
    return genre_cache[artist_name]
def filter_row(row, target_genre):
    """Tell whether a CSV row belongs to an artist tagged with a genre.

    Args:
        row: Sequence of column values laid out as artist,song,link,text.
        target_genre: Genre tag name to look for (exact match).

    Returns:
        True when ``target_genre`` is among the genres of the artist in the
        first column, False otherwise. Empty rows are never a match.
    """
    # csv.reader yields an empty list for blank lines; there is no artist to
    # look up, so reject the row instead of failing on row[0].
    if not row:
        return False
    return target_genre in get_genres(row[0])
def main(argv):
    """Write the lyrics of every song whose artist matches a genre.

    Args:
        argv: Command-line vector laid out as
            [program, input_csv, output_file, target_genre, genres_json].
            Extra trailing arguments are ignored.

    The genres JSON file is used as a persistent cache: it is loaded into
    ``genre_cache`` before processing (when it exists) and rewritten
    afterwards, even if processing fails, so downloaded genres are not lost.

    Raises:
        SystemExit: With a usage message when too few arguments are given.
    """
    if len(argv) < 5:
        sys.exit('Usage: %s INPUT_CSV OUTPUT_FILE TARGET_GENRE GENRES_JSON'
                 % (argv[0] if argv else 'script'))
    input_file, output_file, target_genre, genres_file = argv[1:5]

    try:
        with open(genres_file) as f:
            # Update in place so the dict read by get_genres is the one filled.
            genre_cache.update(json.load(f))
    except FileNotFoundError:
        # First run: no cache on disk yet, it is created when we save below.
        pass

    try:
        # newline='' is required by the csv module so that newlines embedded
        # in quoted fields (multi-line lyrics) are parsed correctly.
        with open(input_file, newline='') as f:
            reader = csv.reader(f, delimiter=',', quotechar='"')
            # Columns are artist,song,link,text; rows too short to carry the
            # text column (including blank lines) are skipped.
            rows = [r for r in reader if len(r) > 3]

        filtered_rows = [r for r in rows if filter_row(r, target_genre)]

        with open(output_file, 'w') as f:
            for filtered_row in filtered_rows:
                f.write(filtered_row[3])
                f.write('\n')
    finally:
        # Persist whatever was downloaded, even after a failure or Ctrl-C.
        with open(genres_file, 'w') as f:
            json.dump(genre_cache, f)


if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment