0187773933 · May 15, 2020 07:00
diff --git a/LyricsSearch.py b/LyricsSearch.py
 import os
 import sys
 import re
 import requests
 from bs4 import BeautifulSoup
 from pprint import pprint
 from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor

 from dotenv import load_dotenv, find_dotenv

 # http://genius.com/api-clients
 # https://docs.genius.com/#/getting-started-h1
 # https://github.com/johnwmillr/LyricsGenius

 class GeniusLyricSearch:
 	"""Small client for the Genius API (api.genius.com) that can also scrape lyrics.

 	API calls authenticate by sending the client access token as the
 	``access_token`` query parameter. Lyrics are not fetched through the API;
 	they are scraped from the HTML page found at each song's ``url``.
 	"""

 	# Accept header sent with every API request.
 	_API_HEADERS = {
 		'accept': 'application/json, text/plain, */*',
 	}

 	def __init__( self , options=None ):
 		"""Store the access token found in ``options["client_access_token"]``.

 		If the token is missing, a hint is printed and the process exits with
 		status 1 (``SystemExit``).
 		"""
 		# None instead of a shared mutable {} default
 		options = options or {}
 		if "client_access_token" not in options:
 			print( "You Need To Get a Client Access Token from:" )
 			print( "http://genius.com/api-clients" )
 			sys.exit( 1 )
 		self.client_access_token = options["client_access_token"]
 		# Shared flag read by get_songs_from_artist_id(); True means "stop asking for pages"
 		self.batch_search_finished = True

 	# options = { max_workers: 10 , batch_list , function_reference: func_ref }
 	def batch_process( self , options ):
 		"""Apply ``options["function_reference"]`` to every item of ``options["batch_list"]``.

 		The calls run in a thread pool and are wrapped in a tqdm progress bar.
 		``options["max_workers"]`` is optional; when absent the executor's own
 		default pool size is used. Returns the list of results in the same
 		order as ``batch_list``.
 		"""
 		batch_list = options[ "batch_list" ]
 		# Previously the documented max_workers option was silently ignored
 		with ThreadPoolExecutor( max_workers=options.get( "max_workers" ) ) as executor:
 			results = executor.map( options[ "function_reference" ] , iter( batch_list ) )
 			return list( tqdm( results , total=len( batch_list ) ) )

 	def search( self , search_term ):
 		"""Query the /search endpoint for ``search_term`` and return the decoded JSON."""
 		params = (
 			( 'access_token' , self.client_access_token ) ,
 			( 'q' , search_term ) ,
 		)
 		response = requests.get( 'https://api.genius.com/search' , headers=self._API_HEADERS , params=params )
 		return response.json()

 	def get_artist_info_from_id( self , artist_id ):
 		"""Fetch /artists/<artist_id> and return the decoded JSON. Prints the URL it requests."""
 		params = (
 			( 'access_token' , self.client_access_token ) ,
 		)
 		url = f"https://api.genius.com/artists/{ str( artist_id ) }"
 		print( url )
 		response = requests.get( url , headers=self._API_HEADERS , params=params )
 		return response.json()

 	def get_songs_from_artist_id( self , options=None ):
 		"""Fetch one page of an artist's songs, sorted by popularity.

 		``options`` keys: ``artist_id`` (required), ``per_page`` (default 20),
 		``page`` (default 1). The dict passed in is not modified.

 		Returns the page's list of songs. Returns ``[]`` without making a
 		request when ``self.batch_search_finished`` is set or when no
 		``artist_id`` was given. Sets ``self.batch_search_finished`` once the
 		API reports that there is no next page.
 		"""
 		if self.batch_search_finished:
 			return []
 		options = options or {}
 		if "artist_id" not in options:
 			print( "no artist id given" )
 			# Always hand back a list so callers can flatten the results safely
 			return []

 		params = (
 			( 'sort' , 'popularity' ) ,
 			( 'per_page' , options.get( "per_page" , 20 ) ) ,
 			( 'page' , options.get( "page" , 1 ) ) ,
 			( 'access_token' , self.client_access_token ) ,
 		)
 		url = f"https://api.genius.com/artists/{ str( options['artist_id'] ) }/songs"
 		response = requests.get( url , headers=self._API_HEADERS , params=params )
 		result = response.json()
 		if result["response"]["next_page"] is None:
 			self.batch_search_finished = True
 		return result["response"]["songs"]

 	def enumerate_artist_songs( self , artist_id ):
 		"""Request song pages 1..99 (20 songs each) for ``artist_id`` in parallel.

 		Returns a list with one entry per requested page; each entry is that
 		page's list of songs. Pages whose worker starts after the last page
 		has been seen return ``[]`` without hitting the API.
 		"""
 		self.batch_search_finished = False
 		batch_options_list = [ { "artist_id": artist_id , "per_page": 20 , "page": x } for x in range( 1 , 100 ) ]
 		try:
 			songs = self.batch_process({
 					"max_workers": 5 ,
 					"function_reference": self.get_songs_from_artist_id ,
 					"batch_list": batch_options_list
 				})
 		finally:
 			# Put the flag back even if a request raised
 			self.batch_search_finished = True
 		return songs

 	# https://github.com/johnwmillr/LyricsGenius/blob/a65c0fb7b2a2c7f35fe004d390c2ea2253265c0f/lyricsgenius/api.py#L159
 	def scrape_song_lyrics_from_url( self , song_instance , remove_section_headers=False ):
 		"""Download ``song_instance["url"]`` and store the page's lyrics on the song.

 		The text of the ``<div class="lyrics">`` element is saved under
 		``song_instance["lyrics"]`` and the same dict is returned. Returns
 		``None`` when the page answers 404 or has no such div.

 		When ``remove_section_headers`` is true, bracketed headers such as
 		[Verse] or [Bridge] are removed and double newlines are collapsed.
 		"""
 		page = requests.get( song_instance["url"] )
 		if page.status_code == 404:
 			return None

 		html = BeautifulSoup( page.text , "html.parser" )
 		div = html.find( "div" , class_="lyrics" )
 		if not div:
 			return None # Sometimes the lyrics section isn't found

 		# Scrape lyrics if proper section was found on page
 		lyrics = div.get_text()
 		if remove_section_headers:  # Remove [Verse], [Bridge], etc.
 			# Raw strings: "\[" is an invalid escape sequence in a normal string literal
 			lyrics = re.sub( r'(\[.*?\])*' , '' , lyrics )
 			lyrics = re.sub( r'\n{2}' , '\n' , lyrics )  # Gaps between verses
 		song_instance["lyrics"] = lyrics.strip( "\n" )
 		return song_instance

 	def get_all_songs_from_artist( self , artist_name ):
 		"""Collect the songs of the artist behind the first search hit, with lyrics.

 		Searches for ``artist_name``, takes the primary artist id of the first
 		hit, gathers that artist's songs and scrapes lyrics for each one.

 		Returns a list with one entry per song: the song dict with a
 		``"lyrics"`` key added, or ``None`` where scraping failed. Returns
 		``[]`` when the search has no hits.
 		"""
 		search_results = self.search( artist_name )
 		hits = search_results[ "response" ][ "hits" ]
 		possible_artist_ids = [ hit[ "result" ][ "primary_artist" ][ "id" ] for hit in hits ]
 		if not possible_artist_ids:
 			# Nothing to index into; avoid an IndexError below
 			print( f"No Artist Found for: {artist_name}" )
 			return []
 		artist_id = possible_artist_ids[ 0 ]
 		print( f"Gathering Songs from Artist ID: {str( artist_id )}" )
 		artist_songs = self.enumerate_artist_songs( artist_id )

 		# Flatten List of Lists
 		artist_songs = [ item for sublist in artist_songs for item in sublist ]

 		# Filter Songs that For Whatever Reason Don't Match Artist Id
 		artist_songs = [ i for i in artist_songs if i[ "primary_artist" ][ "id" ] == artist_id ]

 		# Now Batch Process the Lyric Scraping of Each Song
 		print( f"Scrapping Lyrics from {str(len(artist_songs))} Songs" )
 		artist_songs = self.batch_process({
 				"max_workers": 5 ,
 				"function_reference": self.scrape_song_lyrics_from_url ,
 				"batch_list": artist_songs
 			})
 		return artist_songs


 if __name__ == "__main__":
 	# Demo run: print every Led Zeppelin song (with scraped lyrics) and the count.

 	# Load variables from a .env file, if one is found (python-dotenv)
 	load_dotenv( find_dotenv() )

 	# Read after load_dotenv() so a token defined in .env is visible; KeyError if unset
 	access_token = os.environ[ "client_access_token" ]
 	searcher = GeniusLyricSearch( { "client_access_token": access_token } )

 	led_zeppelin_songs = searcher.get_all_songs_from_artist( "Led Zeppelin" )
 	pprint( led_zeppelin_songs )
 	print( len( led_zeppelin_songs ) )
	import os
	import sys
	import re
	import requests
	from bs4 import BeautifulSoup
	from pprint import pprint
	from tqdm import tqdm
	from concurrent.futures import ThreadPoolExecutor

	from dotenv import load_dotenv, find_dotenv

	# http://genius.com/api-clients
	# https://docs.genius.com/#/getting-started-h1
	# https://github.com/johnwmillr/LyricsGenius

	class GeniusLyricSearch:
		"""Small client for the Genius API (api.genius.com) that can also scrape lyrics.

		API calls authenticate by sending the client access token as the
		``access_token`` query parameter. Lyrics are not fetched through the API;
		they are scraped from the HTML page found at each song's ``url``.
		"""

		# Accept header sent with every API request.
		# The trailing "*/*" wildcard had been reduced to "/" in this copy.
		_API_HEADERS = {
			'accept': 'application/json, text/plain, */*',
		}

		def __init__( self , options=None ):
			"""Store the access token found in ``options["client_access_token"]``.

			If the token is missing, a hint is printed and the process exits with
			status 1 (``SystemExit``).
			"""
			# None instead of a shared mutable {} default
			options = options or {}
			if "client_access_token" not in options:
				print( "You Need To Get a Client Access Token from:" )
				print( "http://genius.com/api-clients" )
				sys.exit( 1 )
			self.client_access_token = options["client_access_token"]
			# Shared flag read by get_songs_from_artist_id(); True means "stop asking for pages"
			self.batch_search_finished = True

		# options = { max_workers: 10 , batch_list , function_reference: func_ref }
		def batch_process( self , options ):
			"""Apply ``options["function_reference"]`` to every item of ``options["batch_list"]``.

			The calls run in a thread pool and are wrapped in a tqdm progress bar.
			``options["max_workers"]`` is optional; when absent the executor's own
			default pool size is used. Returns the list of results in the same
			order as ``batch_list``.
			"""
			batch_list = options[ "batch_list" ]
			# Previously the documented max_workers option was silently ignored
			with ThreadPoolExecutor( max_workers=options.get( "max_workers" ) ) as executor:
				results = executor.map( options[ "function_reference" ] , iter( batch_list ) )
				return list( tqdm( results , total=len( batch_list ) ) )

		def search( self , search_term ):
			"""Query the /search endpoint for ``search_term`` and return the decoded JSON."""
			params = (
				( 'access_token' , self.client_access_token ) ,
				( 'q' , search_term ) ,
			)
			response = requests.get( 'https://api.genius.com/search' , headers=self._API_HEADERS , params=params )
			return response.json()

		def get_artist_info_from_id( self , artist_id ):
			"""Fetch /artists/<artist_id> and return the decoded JSON. Prints the URL it requests."""
			params = (
				( 'access_token' , self.client_access_token ) ,
			)
			url = f"https://api.genius.com/artists/{ str( artist_id ) }"
			print( url )
			response = requests.get( url , headers=self._API_HEADERS , params=params )
			return response.json()

		def get_songs_from_artist_id( self , options=None ):
			"""Fetch one page of an artist's songs, sorted by popularity.

			``options`` keys: ``artist_id`` (required), ``per_page`` (default 20),
			``page`` (default 1). The dict passed in is not modified.

			Returns the page's list of songs. Returns ``[]`` without making a
			request when ``self.batch_search_finished`` is set or when no
			``artist_id`` was given. Sets ``self.batch_search_finished`` once the
			API reports that there is no next page.
			"""
			if self.batch_search_finished:
				return []
			options = options or {}
			if "artist_id" not in options:
				print( "no artist id given" )
				# Always hand back a list so callers can flatten the results safely
				return []

			params = (
				( 'sort' , 'popularity' ) ,
				( 'per_page' , options.get( "per_page" , 20 ) ) ,
				( 'page' , options.get( "page" , 1 ) ) ,
				( 'access_token' , self.client_access_token ) ,
			)
			url = f"https://api.genius.com/artists/{ str( options['artist_id'] ) }/songs"
			response = requests.get( url , headers=self._API_HEADERS , params=params )
			result = response.json()
			if result["response"]["next_page"] is None:
				self.batch_search_finished = True
			return result["response"]["songs"]

		def enumerate_artist_songs( self , artist_id ):
			"""Request song pages 1..99 (20 songs each) for ``artist_id`` in parallel.

			Returns a list with one entry per requested page; each entry is that
			page's list of songs. Pages whose worker starts after the last page
			has been seen return ``[]`` without hitting the API.
			"""
			self.batch_search_finished = False
			batch_options_list = [ { "artist_id": artist_id , "per_page": 20 , "page": x } for x in range( 1 , 100 ) ]
			try:
				songs = self.batch_process({
						"max_workers": 5 ,
						"function_reference": self.get_songs_from_artist_id ,
						"batch_list": batch_options_list
					})
			finally:
				# Put the flag back even if a request raised
				self.batch_search_finished = True
			return songs

		# https://github.com/johnwmillr/LyricsGenius/blob/a65c0fb7b2a2c7f35fe004d390c2ea2253265c0f/lyricsgenius/api.py#L159
		def scrape_song_lyrics_from_url( self , song_instance , remove_section_headers=False ):
			"""Download ``song_instance["url"]`` and store the page's lyrics on the song.

			The text of the ``<div class="lyrics">`` element is saved under
			``song_instance["lyrics"]`` and the same dict is returned. Returns
			``None`` when the page answers 404 or has no such div.

			When ``remove_section_headers`` is true, bracketed headers such as
			[Verse] or [Bridge] are removed and double newlines are collapsed.
			"""
			page = requests.get( song_instance["url"] )
			if page.status_code == 404:
				return None

			html = BeautifulSoup( page.text , "html.parser" )
			div = html.find( "div" , class_="lyrics" )
			if not div:
				return None # Sometimes the lyrics section isn't found

			# Scrape lyrics if proper section was found on page
			lyrics = div.get_text()
			if remove_section_headers: # Remove [Verse], [Bridge], etc.
				# Pattern restored to match a whole bracketed header (it had lost its "*"
				# quantifiers); raw strings because "\[" is an invalid string escape
				lyrics = re.sub( r'(\[.*?\])*' , '' , lyrics )
				lyrics = re.sub( r'\n{2}' , '\n' , lyrics ) # Gaps between verses
			song_instance["lyrics"] = lyrics.strip( "\n" )
			return song_instance

		def get_all_songs_from_artist( self , artist_name ):
			"""Collect the songs of the artist behind the first search hit, with lyrics.

			Searches for ``artist_name``, takes the primary artist id of the first
			hit, gathers that artist's songs and scrapes lyrics for each one.

			Returns a list with one entry per song: the song dict with a
			``"lyrics"`` key added, or ``None`` where scraping failed. Returns
			``[]`` when the search has no hits.
			"""
			search_results = self.search( artist_name )
			hits = search_results[ "response" ][ "hits" ]
			possible_artist_ids = [ hit[ "result" ][ "primary_artist" ][ "id" ] for hit in hits ]
			if not possible_artist_ids:
				# Nothing to index into; avoid an IndexError below
				print( f"No Artist Found for: {artist_name}" )
				return []
			artist_id = possible_artist_ids[ 0 ]
			print( f"Gathering Songs from Artist ID: {str( artist_id )}" )
			artist_songs = self.enumerate_artist_songs( artist_id )

			# Flatten List of Lists
			artist_songs = [ item for sublist in artist_songs for item in sublist ]

			# Filter Songs that For Whatever Reason Don't Match Artist Id
			artist_songs = [ i for i in artist_songs if i[ "primary_artist" ][ "id" ] == artist_id ]

			# Now Batch Process the Lyric Scraping of Each Song
			print( f"Scrapping Lyrics from {str(len(artist_songs))} Songs" )
			artist_songs = self.batch_process({
					"max_workers": 5 ,
					"function_reference": self.scrape_song_lyrics_from_url ,
					"batch_list": artist_songs
				})
			return artist_songs


	if __name__ == "__main__":
		# Demo run: print every Led Zeppelin song (with scraped lyrics) and the count.

		# Load variables from a .env file, if one is found (python-dotenv)
		load_dotenv( find_dotenv() )

		# Read after load_dotenv() so a token defined in .env is visible; KeyError if unset
		access_token = os.environ[ "client_access_token" ]
		searcher = GeniusLyricSearch( { "client_access_token": access_token } )

		led_zeppelin_songs = searcher.get_all_songs_from_artist( "Led Zeppelin" )
		pprint( led_zeppelin_songs )
		print( len( led_zeppelin_songs ) )