pezon · May 15, 2019 16:45
diff --git a/__main__.py b/__main__.py
 """
 Fetch song lyrics
 Fetches song lyrics from Genius API based on a CSV with a list of songs.
 Outputs song lyrics in a separate CSV.
 CSVs are set through command line parameters.
 Uses asynchronous routines to fetch many song lyrics at a time.

 Requires Python 3.6+

 Install:

    Download pre-reqs:
    ```
    pip install lyricsgenius
    ```

 Usage:
    export GENIUS_CLIENT_ACCESS_TOKEN="<access_token>";
    python main.py\
        -i <SONG_LIST_CSV>\
        -o <SONG_LYRICS_CSV>\
        -s <STARTING_OFFSET>\
        -l <LIMIT>
        
 Song info (input) CSV schema must include:
  `artist_name`,`song_name`

 """
 import asyncio
 import concurrent.futures
 import csv
 import re
 from itertools import islice

 import lyricsgenius


 class Song(object):
    """A song to look up, plus the lyrics or error recorded for it.

    ``artist_key`` and ``song_key`` are lower-cased copies of the stripped
    names in which every run of non-word characters is replaced by a single
    ``-``; ``key`` joins the two with ``--`` and identifies the song.
    """

    # Attributes exported by ``as_dict``, in output order.
    ATTR_KEYS = [
        'artist_key',
        'artist_name',
        'song_key',
        'song_name',
        'lyrics',
    ]

    def __init__(self, artist_name, song_name):
        """Store the stripped names and derive the normalised keys."""
        self.artist_name = artist_name.strip()
        self.song_name = song_name.strip()
        # Build the keys from the stripped names so that stray whitespace
        # around an input field cannot yield a different key for the same
        # song. Both keys use the same pattern (runs collapse to one '-'),
        # and raw strings avoid the invalid-escape warning for \w.
        self.artist_key = re.sub(r'[^\w]+', '-', self.artist_name.lower())
        self.song_key = re.sub(r'[^\w]+', '-', self.song_name.lower())
        self.key = f'{self.artist_key}--{self.song_key}'
        self.lyrics = ''
        self.errors = ''

    @property
    def has_lyrics(self):
        """True when non-empty lyrics are stored (None counts as empty)."""
        return bool(self.lyrics)

    @property
    def has_errors(self):
        """True when a non-empty error message is stored."""
        return bool(self.errors)

    def as_dict(self):
        """Return a dict of the attributes listed in ``ATTR_KEYS``."""
        return {k: getattr(self, k) for k in self.ATTR_KEYS}


 def get_songs(songs_csv):
    """Yield one ``Song`` per distinct song key found in a CSV file.

    Args:
        songs_csv: Path of a CSV file whose header includes the
            ``artist_name`` and ``song_name`` columns.

    Yields:
        ``Song`` objects in file order; rows whose ``Song.key`` was
        already seen are skipped.

    Raises:
        KeyError: If a row lacks ``artist_name`` or ``song_name``.
    """
    # A set keeps the duplicate check O(1) per row; the previous list made
    # the scan quadratic in the number of distinct songs.
    seen_keys = set()
    with open(songs_csv, newline='') as csv_file:
        for row in csv.DictReader(csv_file, quotechar='"'):
            song = Song(row['artist_name'], row['song_name'])
            if song.key in seen_keys:
                continue
            seen_keys.add(song.key)
            yield song


 def fetch_lyrics(genius_client, song):
    """Look up ``song`` through ``genius_client`` and record the outcome.

    Args:
        genius_client: Object providing ``search_artist(name, max_songs=...)``
            and ``search_song(title, artist_name)``.
        song: Object with ``artist_name`` and ``song_name``; its ``lyrics``
            or ``errors`` attribute is set in place.

    Returns:
        The same ``song`` object. On success ``song.lyrics`` holds the
        lyrics; otherwise ``song.errors`` is ``'artist/song not found'``
        (lookup failed) or ``'lyrics not found'`` (no usable lyrics).
    """
    print(f'Fetching lyrics for {song.artist_name} - {song.song_name}')
    try:
        artist_genius = genius_client.search_artist(song.artist_name, max_songs=1)
        song_genius = genius_client.search_song(song.song_name, artist_genius.name)
    except Exception:
        # Return immediately: falling through would touch the unbound
        # song_genius and overwrite this message with 'lyrics not found'.
        song.errors = 'artist/song not found'
        return song
    if song_genius is None:
        song.errors = 'artist/song not found'
        return song

    try:
        lyrics = song_genius.lyrics
    except Exception:
        lyrics = None
    if not lyrics:
        # Covers a missing attribute as well as None / empty lyrics, so
        # song.lyrics always stays a string.
        song.errors = 'lyrics not found'
        return song

    song.lyrics = lyrics
    print(f'Fetched lyrics for {song.artist_name} - {song.song_name}')
    return song


 async def main(genius_client_access_token, song_info_csv, song_lyrics_csv,
               start=0, limit=1000):
    """Fetch lyrics for a slice of the input songs and write them to a CSV.

    Args:
        genius_client_access_token: Token passed to ``lyricsgenius.Genius``.
        song_info_csv: Path of the input CSV read by ``get_songs``.
        song_lyrics_csv: Path of the output CSV (opened for writing).
        start: Number of distinct songs to skip before processing.
        limit: Maximum number of distinct songs to process.
    """
    client = lyricsgenius.Genius(genius_client_access_token,
                                 remove_section_headers=True)
    pending_songs = islice(get_songs(song_info_csv), start, start + limit)

    total_count = found_count = error_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as pool:
        event_loop = asyncio.get_event_loop()

        with open(song_lyrics_csv, 'w', newline='') as out_file:
            writer = csv.DictWriter(out_file, delimiter=',', quotechar='"',
                                    fieldnames=Song.ATTR_KEYS)
            writer.writeheader()

            # Work through the songs 100 at a time, writing and flushing
            # each batch before the next one is scheduled.
            while True:
                batch = [
                    event_loop.run_in_executor(pool, fetch_lyrics, client, item)
                    for item in islice(pending_songs, 100)
                ]
                if not batch:
                    break
                results = await asyncio.gather(*batch)
                for fetched in results:
                    total_count += 1
                    if fetched.has_lyrics:
                        found_count += 1
                    if fetched.has_errors:
                        error_count += 1
                    writer.writerow(fetched.as_dict())

                out_file.flush()
                print(f'Written {total_count} rows ({found_count} found)')

    print(f'{total_count} songs (distinct)')
    print(f'{found_count} found')
    print(f'{error_count} errors')


 if __name__ == '__main__':
    import argparse
    import os

    # The access token comes from the environment; unset becomes ''.
    GENIUS_CLIENT_ACCESS_TOKEN = os.getenv('GENIUS_CLIENT_ACCESS_TOKEN') or ''

    parser = argparse.ArgumentParser()
    # Both paths are required: they are opened unconditionally downstream,
    # so a missing one should fail here with a usage message.
    parser.add_argument('-i', '--song-info', required=True,
                        help='Location of input song info CSV')
    parser.add_argument('-o', '--song-lyrics', required=True,
                        help='Location of output song lyrics CSV')
    parser.add_argument('-s', '--start', default=0, type=int, help='Start position of song info CSV')
    parser.add_argument('-l', '--limit', default=200000, type=int, help='Number of items in song info CSV to process')
    args = parser.parse_args()

    loop = asyncio.get_event_loop()
    # argparse derives attribute names from the long option, so
    # --song-info / --song-lyrics are args.song_info / args.song_lyrics
    # (there is no *_csv attribute).
    loop.run_until_complete(main(GENIUS_CLIENT_ACCESS_TOKEN, args.song_info,
                                 args.song_lyrics, start=args.start, limit=args.limit))
	"""
	Fetch song lyrics
	Fetches song lyrics from Genius API based on a CSV with a list of songs.
	Outputs song lyrics in a separate CSV.
	CSVs are set through command line parameters.
	Uses asynchronous routines to fetch many song lyrics at a time.

	Requires Python 3.6+

	Install:

	Download pre-reqs:
	```
	pip install lyricsgenius
	```

	Usage:
	export GENIUS_CLIENT_ACCESS_TOKEN="<access_token>";
	python main.py\
	-i <SONG_LIST_CSV>\
	-o <SONG_LYRICS_CSV>\
	-s <STARTING_OFFSET>\
	-l <LIMIT>

	Song info (input) CSV schema must include:
	`artist_name`,`song_name`

	"""
	import asyncio
	import concurrent.futures
	import csv
	import re
	from itertools import islice

	import lyricsgenius


	class Song(object):
	    """A song to look up, plus the lyrics or error recorded for it.

	    ``artist_key`` and ``song_key`` are lower-cased copies of the stripped
	    names in which every run of non-word characters is replaced by a single
	    ``-``; ``key`` joins the two with ``--`` and identifies the song.
	    """

	    # Attributes exported by ``as_dict``, in output order.
	    ATTR_KEYS = [
	        'artist_key',
	        'artist_name',
	        'song_key',
	        'song_name',
	        'lyrics',
	    ]

	    def __init__(self, artist_name, song_name):
	        """Store the stripped names and derive the normalised keys."""
	        self.artist_name = artist_name.strip()
	        self.song_name = song_name.strip()
	        # Build the keys from the stripped names so that stray whitespace
	        # around an input field cannot yield a different key for the same
	        # song. Both keys use the same pattern (runs collapse to one '-'),
	        # and raw strings avoid the invalid-escape warning for \w.
	        self.artist_key = re.sub(r'[^\w]+', '-', self.artist_name.lower())
	        self.song_key = re.sub(r'[^\w]+', '-', self.song_name.lower())
	        self.key = f'{self.artist_key}--{self.song_key}'
	        self.lyrics = ''
	        self.errors = ''

	    @property
	    def has_lyrics(self):
	        """True when non-empty lyrics are stored (None counts as empty)."""
	        return bool(self.lyrics)

	    @property
	    def has_errors(self):
	        """True when a non-empty error message is stored."""
	        return bool(self.errors)

	    def as_dict(self):
	        """Return a dict of the attributes listed in ``ATTR_KEYS``."""
	        return {k: getattr(self, k) for k in self.ATTR_KEYS}


	def get_songs(songs_csv):
	    """Yield one ``Song`` per distinct song key found in a CSV file.

	    Args:
	        songs_csv: Path of a CSV file whose header includes the
	            ``artist_name`` and ``song_name`` columns.

	    Yields:
	        ``Song`` objects in file order; rows whose ``Song.key`` was
	        already seen are skipped.

	    Raises:
	        KeyError: If a row lacks ``artist_name`` or ``song_name``.
	    """
	    # A set keeps the duplicate check O(1) per row; the previous list made
	    # the scan quadratic in the number of distinct songs.
	    seen_keys = set()
	    with open(songs_csv, newline='') as csv_file:
	        for row in csv.DictReader(csv_file, quotechar='"'):
	            song = Song(row['artist_name'], row['song_name'])
	            if song.key in seen_keys:
	                continue
	            seen_keys.add(song.key)
	            yield song


	def fetch_lyrics(genius_client, song):
	    """Look up ``song`` through ``genius_client`` and record the outcome.

	    Args:
	        genius_client: Object providing ``search_artist(name, max_songs=...)``
	            and ``search_song(title, artist_name)``.
	        song: Object with ``artist_name`` and ``song_name``; its ``lyrics``
	            or ``errors`` attribute is set in place.

	    Returns:
	        The same ``song`` object. On success ``song.lyrics`` holds the
	        lyrics; otherwise ``song.errors`` is ``'artist/song not found'``
	        (lookup failed) or ``'lyrics not found'`` (no usable lyrics).
	    """
	    print(f'Fetching lyrics for {song.artist_name} - {song.song_name}')
	    try:
	        artist_genius = genius_client.search_artist(song.artist_name, max_songs=1)
	        song_genius = genius_client.search_song(song.song_name, artist_genius.name)
	    except Exception:
	        # Return immediately: falling through would touch the unbound
	        # song_genius and overwrite this message with 'lyrics not found'.
	        song.errors = 'artist/song not found'
	        return song
	    if song_genius is None:
	        song.errors = 'artist/song not found'
	        return song

	    try:
	        lyrics = song_genius.lyrics
	    except Exception:
	        lyrics = None
	    if not lyrics:
	        # Covers a missing attribute as well as None / empty lyrics, so
	        # song.lyrics always stays a string.
	        song.errors = 'lyrics not found'
	        return song

	    song.lyrics = lyrics
	    print(f'Fetched lyrics for {song.artist_name} - {song.song_name}')
	    return song


	async def main(genius_client_access_token, song_info_csv, song_lyrics_csv,
	               start=0, limit=1000):
	    """Fetch lyrics for a slice of the input songs and write them to a CSV.

	    Args:
	        genius_client_access_token: Token passed to ``lyricsgenius.Genius``.
	        song_info_csv: Path of the input CSV read by ``get_songs``.
	        song_lyrics_csv: Path of the output CSV (opened for writing).
	        start: Number of distinct songs to skip before processing.
	        limit: Maximum number of distinct songs to process.
	    """
	    client = lyricsgenius.Genius(genius_client_access_token,
	                                 remove_section_headers=True)
	    pending_songs = islice(get_songs(song_info_csv), start, start + limit)

	    total_count = found_count = error_count = 0

	    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as pool:
	        event_loop = asyncio.get_event_loop()

	        with open(song_lyrics_csv, 'w', newline='') as out_file:
	            writer = csv.DictWriter(out_file, delimiter=',', quotechar='"',
	                                    fieldnames=Song.ATTR_KEYS)
	            writer.writeheader()

	            # Work through the songs 100 at a time, writing and flushing
	            # each batch before the next one is scheduled.
	            while True:
	                batch = [
	                    event_loop.run_in_executor(pool, fetch_lyrics, client, item)
	                    for item in islice(pending_songs, 100)
	                ]
	                if not batch:
	                    break
	                results = await asyncio.gather(*batch)
	                for fetched in results:
	                    total_count += 1
	                    if fetched.has_lyrics:
	                        found_count += 1
	                    if fetched.has_errors:
	                        error_count += 1
	                    writer.writerow(fetched.as_dict())

	                out_file.flush()
	                print(f'Written {total_count} rows ({found_count} found)')

	    print(f'{total_count} songs (distinct)')
	    print(f'{found_count} found')
	    print(f'{error_count} errors')


	if __name__ == '__main__':
	    import argparse
	    import os

	    # The access token comes from the environment; unset becomes ''.
	    GENIUS_CLIENT_ACCESS_TOKEN = os.getenv('GENIUS_CLIENT_ACCESS_TOKEN') or ''

	    parser = argparse.ArgumentParser()
	    # Both paths are required: they are opened unconditionally downstream,
	    # so a missing one should fail here with a usage message.
	    parser.add_argument('-i', '--song-info', required=True,
	                        help='Location of input song info CSV')
	    parser.add_argument('-o', '--song-lyrics', required=True,
	                        help='Location of output song lyrics CSV')
	    parser.add_argument('-s', '--start', default=0, type=int, help='Start position of song info CSV')
	    parser.add_argument('-l', '--limit', default=200000, type=int, help='Number of items in song info CSV to process')
	    args = parser.parse_args()

	    loop = asyncio.get_event_loop()
	    # argparse derives attribute names from the long option, so
	    # --song-info / --song-lyrics are args.song_info / args.song_lyrics
	    # (there is no *_csv attribute).
	    loop.run_until_complete(main(GENIUS_CLIENT_ACCESS_TOKEN, args.song_info,
	                                 args.song_lyrics, start=args.start, limit=args.limit))