Last active
May 15, 2020 07:00
-
-
Save 0187773933/ce87fcd05b07e8f3f04d8db04dcef1a5 to your computer and use it in GitHub Desktop.
Searches Genius.com Lyrics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import re | |
import requests | |
from bs4 import BeautifulSoup | |
from pprint import pprint | |
from tqdm import tqdm | |
from concurrent.futures import ThreadPoolExecutor | |
from dotenv import load_dotenv, find_dotenv | |
# http://genius.com/api-clients | |
# https://docs.genius.com/#/getting-started-h1 | |
# https://github.com/johnwmillr/LyricsGenius | |
class GeniusLyricSearch:
    """Minimal Genius.com API client that can also scrape lyrics from song pages.

    The API calls go to ``https://api.genius.com`` and authenticate by sending
    the client access token as the ``access_token`` query parameter.  Lyrics
    are not fetched through the API; they are scraped from the HTML page that
    each song's ``url`` field points to.
    """

    _API_ROOT = "https://api.genius.com"
    _JSON_HEADERS = {"accept": "application/json, text/plain, */*"}

    def __init__(self, options=None):
        """Store the client access token.

        Args:
            options: dict that must contain ``"client_access_token"``.

        Raises:
            SystemExit: (exit status 1) when no token is supplied, after
                printing where to obtain one.
        """
        if options is None:
            options = {}
        if "client_access_token" not in options:
            print("You Need To Get a Client Access Token from:")
            print("http://genius.com/api-clients")
            sys.exit(1)
        self.client_access_token = options["client_access_token"]
        # Shared "stop paging" flag.  It is True whenever no enumeration is
        # running, which makes get_songs_from_artist_id() a no-op by default.
        self.batch_search_finished = True

    def _api_get(self, url, extra_params=()):
        """GET ``url`` with the access token appended and return the decoded JSON."""
        params = tuple(extra_params) + (("access_token", self.client_access_token),)
        response = requests.get(url, headers=self._JSON_HEADERS, params=params)
        return response.json()

    # options = { max_workers: 10 , batch_list , function_reference: func_ref }
    def batch_process(self, options):
        """Run ``function_reference`` over every item of ``batch_list`` in a thread pool.

        Args:
            options: dict with
                ``"batch_list"``: sized sequence of work items,
                ``"function_reference"``: callable taking one work item,
                ``"max_workers"`` (optional): thread pool size; when absent the
                ``ThreadPoolExecutor`` default is used.

        Returns:
            list of results, in the same order as ``batch_list``.
        """
        batch_list = options["batch_list"]
        with ThreadPoolExecutor(max_workers=options.get("max_workers")) as executor:
            results = executor.map(options["function_reference"], batch_list)
            # tqdm only wraps the result iterator to draw a progress bar.
            return list(tqdm(results, total=len(batch_list)))

    def search(self, search_term):
        """Query the ``/search`` endpoint and return the decoded JSON response."""
        return self._api_get(f"{self._API_ROOT}/search", (("q", search_term),))

    def get_artist_info_from_id(self, artist_id):
        """Fetch ``/artists/<artist_id>`` and return the decoded JSON response.

        The request URL is printed before the request is made.
        """
        url = f"{self._API_ROOT}/artists/{artist_id}"
        print(url)
        return self._api_get(url)

    def get_songs_from_artist_id(self, options=None):
        """Fetch one page of an artist's songs, sorted by popularity.

        This is the worker used by ``enumerate_artist_songs``: it does nothing
        (returns ``[]``) while ``self.batch_search_finished`` is true, and it
        sets that flag once the API reports there is no next page.

        Args:
            options: dict with ``"artist_id"`` (required), ``"per_page"``
                (default 20) and ``"page"`` (default 1).  The dict is not
                modified.

        Returns:
            list of song dicts for the requested page, or ``[]`` when paging
            has already finished or no artist id was given.
        """
        if self.batch_search_finished:
            return []
        if options is None:
            options = {}
        if "artist_id" not in options:
            print("no artist id given")
            # Return a list (not None) so callers can always flatten results.
            return []
        params = (
            ("sort", "popularity"),
            ("per_page", options.get("per_page", 20)),
            ("page", options.get("page", 1)),
        )
        url = f"{self._API_ROOT}/artists/{options['artist_id']}/songs"
        result = self._api_get(url, params)
        if result["response"]["next_page"] is None:
            # Last page reached: tell the not-yet-started workers to skip.
            self.batch_search_finished = True
        return result["response"]["songs"]

    def enumerate_artist_songs(self, artist_id, max_pages=99):
        """Fetch up to ``max_pages`` pages (20 songs each) of an artist's songs.

        Args:
            artist_id: Genius artist id.
            max_pages: highest page number to request (default 99, i.e. pages
                1 through 99).

        Returns:
            list with one entry per requested page; each entry is that page's
            list of songs (``[]`` for pages skipped after the last page was
            seen).
        """
        batch_options_list = [
            {"artist_id": artist_id, "per_page": 20, "page": page}
            for page in range(1, max_pages + 1)
        ]
        self.batch_search_finished = False
        try:
            return self.batch_process({
                "max_workers": 5,
                "function_reference": self.get_songs_from_artist_id,
                "batch_list": batch_options_list,
            })
        finally:
            # Always restore the idle state, even if a request raised.
            self.batch_search_finished = True

    @staticmethod
    def _clean_lyrics(raw_lyrics, remove_section_headers=False):
        """Strip surrounding newlines and optionally drop ``[Verse]``-style headers."""
        if remove_section_headers:
            # Remove [Verse], [Bridge], etc.
            raw_lyrics = re.sub(r"\[.*?\]", "", raw_lyrics)
            # Collapse the gaps between verses
            raw_lyrics = re.sub(r"\n{2}", "\n", raw_lyrics)
        return raw_lyrics.strip("\n")

    # https://github.com/johnwmillr/LyricsGenius/blob/a65c0fb7b2a2c7f35fe004d390c2ea2253265c0f/lyricsgenius/api.py#L159
    def scrape_song_lyrics_from_url(self, song_instance, remove_section_headers=False):
        """Download a song page and attach its lyrics to ``song_instance``.

        Args:
            song_instance: song dict containing a ``"url"`` key; it is updated
                in place with a ``"lyrics"`` key.
            remove_section_headers: when true, bracketed section headers such
                as ``[Verse]`` are removed from the lyrics.

        Returns:
            The same ``song_instance`` dict, or ``None`` when the page returns
            404 or has no ``<div class="lyrics">`` element.
        """
        page = requests.get(song_instance["url"])
        if page.status_code == 404:
            return None
        html = BeautifulSoup(page.text, "html.parser")
        div = html.find("div", class_="lyrics")
        if not div:
            return None  # Sometimes the lyrics section isn't found
        # Scrape lyrics if proper section was found on page
        song_instance["lyrics"] = self._clean_lyrics(
            div.get_text(), remove_section_headers
        )
        return song_instance

    def get_all_songs_from_artist(self, artist_name):
        """Search for an artist, list their songs and scrape each song's lyrics.

        The artist is taken to be the primary artist of the first search hit.

        Args:
            artist_name: free-text search term.

        Returns:
            list with one entry per song whose primary artist matches; each
            entry is the song dict with ``"lyrics"`` added, or ``None`` when
            that song's lyrics could not be scraped.  ``[]`` when the search
            has no hits.
        """
        search_results = self.search(artist_name)
        hits = search_results["response"]["hits"]
        if not hits:
            # Nothing to resolve an artist id from.
            print(f"No Search Results for: {artist_name}")
            return []
        artist_id = hits[0]["result"]["primary_artist"]["id"]
        print(f"Gathering Songs from Artist ID: {artist_id}")
        song_pages = self.enumerate_artist_songs(artist_id)
        # Flatten the list of pages and drop songs that, for whatever reason,
        # don't match the artist id
        artist_songs = [
            song
            for page in song_pages
            for song in page
            if song["primary_artist"]["id"] == artist_id
        ]
        # Now batch process the lyric scraping of each song
        print(f"Scrapping Lyrics from {len(artist_songs)} Songs")
        return self.batch_process({
            "max_workers": 5,
            "function_reference": self.scrape_song_lyrics_from_url,
            "batch_list": artist_songs,
        })
if __name__ == "__main__":
    # Load environment variables from a .env file located by find_dotenv().
    load_dotenv(find_dotenv())
    # Forward the token only when it is actually set, so that a missing value
    # is reported by GeniusLyricSearch's own "get a token" message and exit
    # status instead of surfacing as a bare KeyError traceback here.
    searcher_options = {}
    if "client_access_token" in os.environ:
        searcher_options["client_access_token"] = os.environ["client_access_token"]
    lyric_searcher = GeniusLyricSearch(searcher_options)
    # Demo run: collect every song (with lyrics) for one artist and dump it.
    artist_songs = lyric_searcher.get_all_songs_from_artist("Led Zeppelin")
    pprint(artist_songs)
    print(str(len(artist_songs)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment