Scrape lyrics from genius.com
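The script below drives the Genius search endpoint (https://api.genius.com/search) asynchronously and then scrapes each result page for lyrics. Before running it, the access token can be sanity-checked against that same endpoint; here is a minimal synchronous sketch using the requests library (the query string and the token placeholder are illustrative assumptions, not part of the original gist):

import requests

CLIENT_TOKEN = '<your Genius client access token>'  # placeholder token
resp = requests.get('https://api.genius.com/search',
                    params={'q': 'Radiohead Karma Police'},  # arbitrary example query
                    headers={'Authorization': 'Bearer {}'.format(CLIENT_TOKEN)})
hits = resp.json()['response']['hits']  # same structure the script parses below
print(hits[0]['result']['url'] if hits else 'nothing found')

The full asynchronous scraper follows.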
import asyncio
import json
from time import time, sleep

import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
# Actually this is SECRET!!!!! But the code lives in a private repo, so whatever
CLIENT_TOKEN = 'nH_1wY8HnEaSxV0FSJ5oL23XtDNxZ8-mZgkBISwps_B4rFrI8WrQ8TWHrViDGTGs'
# max amount of connections at once for the session
MAX_CONNECTIONS = 50

loop = asyncio.get_event_loop()
df = pd.read_csv('../data/no_lyrics.csv', index_col=0)


async def get_url(search_query: str) -> str:
    """
    Gets search query, containing artist+title information, returns url of this song on genius.com,
    obtained from Genius API (api.genius.com). Returns empty string ('') if song is not found.
    @param search_query: query text for searching in genius
    @type search_query: str
    @return: url of genius.com page with lyrics of this song
    @rtype: str
    """
    params = {'q': search_query}
    url = 'https://api.genius.com/search'
    async with session.get(url, params=params) as response:
        try:
            # take the most probable hit (index 0) from the response and keep its url
            url = json.loads(await response.text())['response']['hits'][0]['result']['url']
        except IndexError:
            # if nothing is found
            return ''
    return url


async def parse_url(url: str) -> str:
    """
    Gets url as string, returns parsed lyrics of song from this page. Returns empty string ('')
    if url is empty ('').
    @param url: actual link to the song
    @type url: str
    @return: full lyrics of song
    @rtype: str
    """
    if url != '':
        page = await session.get(url)
        soup = BeautifulSoup(await page.text(), 'html.parser')
        # drop all <script> tags before extracting text
        for scr in soup('script'):
            scr.extract()
        lyrics = soup.find('div', class_='lyrics').get_text()
        return lyrics
    else:
        return ''


async def get_lyrics(search_query: str) -> str:
    """
    Returns lyrics for combined artist+title request
    @param search_query: query text for searching in genius
    @type search_query: str
    @return: full lyrics of song
    @rtype: str
    """
    url = await get_url(search_query)
    lyrics = await parse_url(url)
    return lyrics


async def put_lyrics(row: int) -> None:
    """
    Insert lyrics for song into globally defined dataset, based on artist name and song title
    from this dataset, accessed by row index
    @param row: index of row in dataset
    @type row: int
    @return: None
    """
    global df
    query = '{} {}'.format(df.loc[row, 'artist'], df.loc[row, 'title'])
    lyrics = await get_lyrics(query)
    df.loc[row, 'lyrics'] = lyrics


async def wait_with_progress(coroutines: list) -> None:
    """
    tqdm wrapper for async waiter. Prints progress bar for coroutines completing.
    @param coroutines: list of coroutines to complete
    @type coroutines: list of coroutines
    @return: None
    """
    bar_width = 90
    for f in tqdm(asyncio.as_completed(coroutines), total=len(coroutines), ncols=bar_width):
        await f


def main():
    start = time()
    df['lyrics'] = pd.Series(data='', index=df.index)
    futures = [put_lyrics(row_id) for row_id in df.index]
    loop.run_until_complete(wait_with_progress(futures))
    finish = time()
    sleep(0.5)  # to separate tqdm bar from further print
    print('Done in {:6.2f} sec'.format(finish - start))
    df.to_csv('../data/with_lyrics.csv')
    print('Saved dataset with lyrics to ../data/with_lyrics.csv')
if __name__ == '__main__':
    headers = {'Authorization': 'Bearer {}'.format(CLIENT_TOKEN)}  # include auth token
    # module-level session shared by all coroutines; disable client timeouts entirely
    session = aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=MAX_CONNECTIONS),
                                    headers=headers,
                                    timeout=aiohttp.ClientTimeout(total=None))
    try:
        main()
    finally:
        loop.run_until_complete(session.close())  # close the session inside the event loop
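The input file ../data/no_lyrics.csv is expected to have at least 'artist' and 'title' columns plus an index column (the script reads it with index_col=0 and writes the result back with an added 'lyrics' column). A minimal sketch of preparing such a file; the example rows are made up:

import pandas as pd

songs = pd.DataFrame({
    'artist': ['Radiohead', 'Nina Simone'],   # illustrative rows only
    'title': ['Karma Police', 'Feeling Good'],
})
songs.to_csv('../data/no_lyrics.csv')  # the default integer index becomes index_col=0 on read

Running the script then produces ../data/with_lyrics.csv with the same rows plus the scraped lyrics.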