Skip to content

Instantly share code, notes, and snippets.

@pezon
Created May 15, 2019 16:45
Show Gist options
  • Save pezon/95dd0bb3fc09fe54a77227bcfbeca757 to your computer and use it in GitHub Desktop.
Save pezon/95dd0bb3fc09fe54a77227bcfbeca757 to your computer and use it in GitHub Desktop.
Download song lyrics (en-masse)
"""
Fetch song lyrics
Fetches song lyrics from Genius API based on a CSV with a list of songs.
Outputs song lyrics in a separate CSV.
CSVs are set through command line parameters.
Uses asynchronous routines to fetch many song lyrics at a time.
Requires Python 3.6+
Install:
Download pre-reqs:
```
pip install lyricsgenius
```
Usage:
export GENIUS_CLIENT_ACCESS_TOKEN="<access_token>";
python main.py\
-i <SONG_LIST_CSV>\
-o <SONG_LYRICS_CSV>\
-s <STARTING_OFFSET>\
-l <LIMIT>
Song info (input) CSV schema must include:
`artist_name`,`song_name`
"""
import asyncio
import concurrent.futures
import csv
import re
from itertools import islice
import lyricsgenius
class Song(object):
    """A song to look up, plus the lyrics or error recorded for it.

    Attributes:
        artist_name: Artist name with surrounding whitespace removed.
        song_name: Song title with surrounding whitespace removed.
        artist_key: Lower-cased ``artist_name`` with every run of non-word
            characters replaced by a single ``-``.
        song_key: The same normalisation applied to ``song_name``.
        key: ``'<artist_key>--<song_key>'``, identifying the song.
        lyrics: Lyrics text; ``''`` until lyrics are assigned.
        errors: Error description; ``''`` until an error is assigned.
    """

    # Attributes exported by as_dict(), in output column order.
    ATTR_KEYS = [
        'artist_key',
        'artist_name',
        'song_key',
        'song_name',
        'lyrics',
    ]

    # Raw string: '\w' in a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    _NON_WORD_RUN = re.compile(r'[^\w]+')

    def __init__(self, artist_name, song_name):
        self.artist_name = artist_name.strip()
        self.song_name = song_name.strip()
        # Keys are derived from the *stripped* names so that rows differing
        # only in surrounding whitespace map to the same key, and both keys
        # collapse runs of separators the same way.
        self.artist_key = self._NON_WORD_RUN.sub('-', self.artist_name.lower())
        self.song_key = self._NON_WORD_RUN.sub('-', self.song_name.lower())
        self.key = f'{self.artist_key}--{self.song_key}'
        self.lyrics = ''
        self.errors = ''

    @property
    def has_lyrics(self):
        """Return True when non-empty lyrics have been stored."""
        # bool() rather than len() so a None value does not raise.
        return bool(self.lyrics)

    @property
    def has_errors(self):
        """Return True when an error description has been stored."""
        return bool(self.errors)

    def as_dict(self):
        """Return a dict of the attributes named in ATTR_KEYS."""
        return {k: getattr(self, k) for k in self.ATTR_KEYS}
def get_songs(songs_csv):
    """Yield each distinct song listed in a CSV file, in file order.

    Args:
        songs_csv: Path to a CSV file with a header row that includes the
            ``artist_name`` and ``song_name`` columns.

    Yields:
        A ``Song`` for every row whose ``key`` has not been seen earlier in
        the file; later rows with an already-seen key are skipped.

    Raises:
        KeyError: If a row lacks the ``artist_name`` or ``song_name`` column.
    """
    # A set gives O(1) membership tests; a list here made de-duplication
    # quadratic in the number of distinct songs.
    seen_keys = set()
    with open(songs_csv, newline='') as csv_file:
        for row in csv.DictReader(csv_file, quotechar='"'):
            song = Song(row['artist_name'], row['song_name'])
            if song.key in seen_keys:
                continue
            seen_keys.add(song.key)
            yield song
def fetch_lyrics(genius_client, song):
    """Look up lyrics for ``song`` and record the result on it.

    On success ``song.lyrics`` is set; on failure ``song.errors`` is set to
    ``'artist/song not found'`` or ``'lyrics not found'``. Lookup failures
    never propagate: this is a best-effort fetch.

    Args:
        genius_client: Object providing ``search_artist(name, max_songs=...)``
            and ``search_song(title, artist_name)``.
        song: Object with ``artist_name``, ``song_name``, ``lyrics`` and
            ``errors`` attributes; it is modified in place.

    Returns:
        The same ``song`` object that was passed in.
    """
    print(f'Fetching lyrics for {song.artist_name} - {song.song_name}')
    try:
        artist_genius = genius_client.search_artist(song.artist_name, max_songs=1)
        song_genius = genius_client.search_song(song.song_name, artist_genius.name)
    except Exception:
        # Deliberately broad: any client/network failure counts as a miss.
        # Return here; falling through would hit an unbound `song_genius`
        # and overwrite this message with 'lyrics not found'.
        song.errors = 'artist/song not found'
        return song

    # NOTE(review): the lyricsgenius docs say search_song returns None when
    # nothing matches; treat that as a missing song, not missing lyrics.
    if song_genius is None:
        song.errors = 'artist/song not found'
        return song

    try:
        lyrics = song_genius.lyrics
    except Exception:
        lyrics = None

    if not lyrics:
        song.errors = 'lyrics not found'
        return song

    song.lyrics = lyrics
    print(f'Fetched lyrics for {song.artist_name} - {song.song_name}')
    return song
async def main(genius_client_access_token, song_info_csv, song_lyrics_csv,
               start=0, limit=1000):
    """Fetch lyrics for a slice of the input songs and write them to a CSV.

    Songs come from ``get_songs(song_info_csv)``; the slice
    ``[start, start + limit)`` of that stream is processed. Lookups run via
    ``fetch_lyrics`` on a thread pool, 100 songs at a time, and each finished
    batch is written to ``song_lyrics_csv`` before the next one starts.

    Args:
        genius_client_access_token: Token passed to ``lyricsgenius.Genius``.
        song_info_csv: Path of the input CSV.
        song_lyrics_csv: Path of the output CSV (opened for writing).
        start: Number of distinct songs to skip before processing.
        limit: Maximum number of distinct songs to process.
    """
    genius_client = lyricsgenius.Genius(genius_client_access_token,
                                        remove_section_headers=True)
    pending = islice(get_songs(song_info_csv), start, start + limit)
    total_count = found_count = error_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as pool:
        loop = asyncio.get_event_loop()

        def submit_next_batch():
            # Schedule up to 100 lookups; an empty list means input is exhausted.
            return [
                loop.run_in_executor(pool, fetch_lyrics, genius_client, song)
                for song in islice(pending, 100)
            ]

        with open(song_lyrics_csv, 'w', newline='') as out_file:
            writer = csv.DictWriter(out_file, delimiter=',', quotechar='"',
                                    fieldnames=Song.ATTR_KEYS)
            writer.writeheader()
            # Work in 100-item batches so results reach disk regularly and
            # only one batch is held in memory at a time.
            for batch in iter(submit_next_batch, []):
                results = await asyncio.gather(*batch)
                for fetched in results:
                    total_count += 1
                    if fetched.has_lyrics:
                        found_count += 1
                    if fetched.has_errors:
                        error_count += 1
                    writer.writerow(fetched.as_dict())
                out_file.flush()
                print(f'Written {total_count} rows ({found_count} found)')

    print(f'{total_count} songs (distinct)')
    print(f'{found_count} found')
    print(f'{error_count} errors')
if __name__ == '__main__':
    # Command-line entry point: parse options, then run main() to completion.
    import argparse
    import os

    # can be set through env variable
    GENIUS_CLIENT_ACCESS_TOKEN = os.getenv('GENIUS_CLIENT_ACCESS_TOKEN') or ''

    parser = argparse.ArgumentParser()
    # Both paths are required: without them main() has nothing to open.
    parser.add_argument('-i', '--song-info', required=True,
                        help='Location of input song info CSV')
    parser.add_argument('-o', '--song-lyrics', required=True,
                        help='Location of output song lyrics CSV')
    parser.add_argument('-s', '--start', default=0, type=int,
                        help='Start position of song info CSV')
    parser.add_argument('-l', '--limit', default=200000, type=int,
                        help='Number of items in song info CSV to process')
    args = parser.parse_args()

    loop = asyncio.get_event_loop()
    # argparse derives the attribute names from the long options:
    # '--song-info' -> args.song_info, '--song-lyrics' -> args.song_lyrics.
    loop.run_until_complete(main(GENIUS_CLIENT_ACCESS_TOKEN, args.song_info,
                                 args.song_lyrics, start=args.start,
                                 limit=args.limit))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment