Skip to content

Instantly share code, notes, and snippets.

@ashioyajotham
Forked from iwouldnot/scrapper.py
Created December 5, 2022 09:50
Show Gist options
  • Save ashioyajotham/71b52aa10fc3b175a20568d90c5b965e to your computer and use it in GitHub Desktop.
Save ashioyajotham/71b52aa10fc3b175a20568d90c5b965e to your computer and use it in GitHub Desktop.
Scrape lyrics from genius.com
import asyncio
import json
from time import time, sleep
import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
# Secret, really!!!!! But the code is kept in a private repository, so whatever
CLIENT_TOKEN = 'nH_1wY8HnEaSxV0FSJ5oL23XtDNxZ8-mZgkBISwps_B4rFrI8WrQ8TWHrViDGTGs'
# max amount of connection at once for session
MAX_CONNECTIONS = 50
# event loop shared by every coroutine in this script
loop = asyncio.get_event_loop()
# dataset of songs still missing lyrics; row index is used as the lookup key
df = pd.read_csv('../data/no_lyrics.csv', index_col=0)
async def get_url(search_query: str) -> str:
    """
    Look a song up through the Genius search API (api.genius.com).

    @param search_query: query text, containing artist+title information
    @type search_query: str
    @return: url of the genius.com page with this song's lyrics, or ''
        if the song is not found
    @rtype: str
    """
    params = {'q': search_query}
    search_url = 'https://api.genius.com/search'  # don't shadow the result url
    async with session.get(search_url, params=params) as response:
        payload = json.loads(await response.text())
        try:
            # most probable result is at index 0; dig out its page url
            return payload['response']['hits'][0]['result']['url']
        except (IndexError, KeyError):
            # IndexError: empty hit list (song not found);
            # KeyError: unexpected payload shape (e.g. an API error body)
            return ''
async def parse_url(url: str) -> str:
    """
    Fetch a genius.com page and return the song lyrics parsed from it.

    @param url: actual link to the song ('' means the song was not found)
    @type url: str
    @return: full lyrics of the song, or '' if url is empty or the page
        carries no lyrics container
    @rtype: str
    """
    if url == '':
        return ''
    # context manager releases the connection back to the pool when done;
    # the original leaked the response object
    async with session.get(url) as page:
        soup = BeautifulSoup(await page.text(), 'html.parser')
    # strip all <script> tags so they do not leak into get_text()
    for scr in soup('script'):
        scr.extract()
    container = soup.find('div', class_='lyrics')
    if container is None:
        # page layout changed or lyrics block is absent — fail soft,
        # mirroring the '' convention used for "not found"
        return ''
    return container.get_text()
async def get_lyrics(search_query):
    """
    Return lyrics for a combined artist+title request.

    @param search_query: query text for searching in genius
    @type search_query: str
    @return: full lyrics of song
    @rtype: str
    """
    # resolve the query to a song page, then scrape that page
    song_url = await get_url(search_query)
    return await parse_url(song_url)
async def put_lyrics(row: int) -> None:
    """
    Insert lyrics for one song into the globally defined dataset, using
    the artist name and song title stored at the given row index.

    @param row: index of row in dataset
    @type row: int
    @return: None
    """
    global df
    artist = df.loc[row, 'artist']
    title = df.loc[row, 'title']
    df.loc[row, 'lyrics'] = await get_lyrics('{} {}'.format(artist, title))
async def wait_with_progress(coroutines: list) -> None:
    """
    tqdm wrapper for the async waiter: awaits each coroutine as it
    completes while rendering a progress bar.

    @param coroutines: list of coroutines to complete
    @type coroutines: list of coroutines
    @return: None
    """
    progress = tqdm(asyncio.as_completed(coroutines),
                    total=len(coroutines),
                    ncols=90)  # fixed bar width
    for finished in progress:
        await finished
def main():
    """Fill the 'lyrics' column for every row, time the run and save a CSV."""
    started = time()
    # pre-create an empty lyrics column so each coroutine only assigns a cell
    df['lyrics'] = pd.Series(data='', index=df.index)
    tasks = [put_lyrics(row_id) for row_id in df.index]
    loop.run_until_complete(wait_with_progress(tasks))
    elapsed = time() - started
    sleep(0.5)  # to separate tqdm bar from further print
    print('Done in {:6.2f} sec'.format(elapsed))
    df.to_csv('../data/with_lyrics.csv')
    print('Saved dataset with lyrics to ../data/with_lyrics.csv')
if __name__ == '__main__':
    # attach the API token to every request made through the shared session
    headers = {'Authorization': 'Bearer {}'.format(CLIENT_TOKEN)}

    async def _open_session() -> None:
        """Create the shared aiohttp session inside a running event loop."""
        # aiohttp.ClientSession is an *async* context manager: the original
        # plain `with` block fails (TypeError) on modern aiohttp, and the
        # session must be bound to a running loop anyway.
        global session
        session = aiohttp.ClientSession(
            connector=aiohttp.TCPConnector(limit=MAX_CONNECTIONS),
            headers=headers,
            # no overall/read timeout, matching the original
            # conn_timeout=None / read_timeout=None (both deprecated)
            timeout=aiohttp.ClientTimeout(total=None))

    loop.run_until_complete(_open_session())
    try:
        main()
    finally:
        # always release the connector, even if scraping fails midway
        loop.run_until_complete(session.close())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment