Scrape lyrics from genius.com
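The script below drives the Genius search endpoint (https://api.genius.com/search) asynchronously and then scrapes each result page for lyrics. Before running it, the access token can be sanity-checked against that same endpoint; here is a minimal synchronous sketch using the requests library (the query string and the token placeholder are illustrative assumptions, not part of the original gist):

import requests

CLIENT_TOKEN = '<your Genius client access token>'  # placeholder token
resp = requests.get('https://api.genius.com/search',
                    params={'q': 'Radiohead Karma Police'},  # arbitrary example query
                    headers={'Authorization': 'Bearer {}'.format(CLIENT_TOKEN)})
hits = resp.json()['response']['hits']  # same structure the script parses below
print(hits[0]['result']['url'] if hits else 'nothing found')

The full asynchronous scraper follows.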
import asyncio
import json
from time import time, sleep

import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
# Actually this is SECRET!!!!! But the code lives in a private repo, so whatever
CLIENT_TOKEN = 'nH_1wY8HnEaSxV0FSJ5oL23XtDNxZ8-mZgkBISwps_B4rFrI8WrQ8TWHrViDGTGs'
# max amount of connections at once for the session
MAX_CONNECTIONS = 50

loop = asyncio.get_event_loop()
df = pd.read_csv('../data/no_lyrics.csv', index_col=0)


async def get_url(search_query: str) -> str:
    """
    Gets search query, containing artist+title information, returns url of this song on genius.com,
    obtained from Genius API (api.genius.com). Returns empty string ('') if song is not found.
    @param search_query: query text for searching in genius
    @type search_query: str
    @return: url of genius.com page with lyrics of this song
    @rtype: str
    """
    params = {'q': search_query}
    url = 'https://api.genius.com/search'
    async with session.get(url, params=params) as response:
        try:
            # take the most probable hit (index 0) from the response and keep its url
            url = json.loads(await response.text())['response']['hits'][0]['result']['url']
        except IndexError:
            # if nothing is found
            return ''
    return url


async def parse_url(url: str) -> str:
    """
    Gets url as string, returns parsed lyrics of song from this page. Returns empty string ('')
    if url is empty ('').
    @param url: actual link to the song
    @type url: str
    @return: full lyrics of song
    @rtype: str
    """
    if url != '':
        page = await session.get(url)
        soup = BeautifulSoup(await page.text(), 'html.parser')
        # drop all <script> tags before extracting text
        for scr in soup('script'):
            scr.extract()
        lyrics = soup.find('div', class_='lyrics').get_text()
        return lyrics
    else:
        return ''


async def get_lyrics(search_query: str) -> str:
    """
    Returns lyrics for combined artist+title request
    @param search_query: query text for searching in genius
    @type search_query: str
    @return: full lyrics of song
    @rtype: str
    """
    url = await get_url(search_query)
    lyrics = await parse_url(url)
    return lyrics


async def put_lyrics(row: int) -> None:
    """
    Insert lyrics for song into globally defined dataset, based on artist name and song title
    from this dataset, accessed by row index
    @param row: index of row in dataset
    @type row: int
    @return: None
    """
    global df
    query = '{} {}'.format(df.loc[row, 'artist'], df.loc[row, 'title'])
    lyrics = await get_lyrics(query)
    df.loc[row, 'lyrics'] = lyrics


async def wait_with_progress(coroutines: list) -> None:
    """
    tqdm wrapper for async waiter. Prints progress bar for coroutines completing.
    @param coroutines: list of coroutines to complete
    @type coroutines: list of coroutines
    @return: None
    """
    bar_width = 90
    for f in tqdm(asyncio.as_completed(coroutines), total=len(coroutines), ncols=bar_width):
        await f


def main():
    start = time()
    df['lyrics'] = pd.Series(data='', index=df.index)
    futures = [put_lyrics(row_id) for row_id in df.index]
    loop.run_until_complete(wait_with_progress(futures))
    finish = time()
    sleep(0.5)  # to separate tqdm bar from further print
    print('Done in {:6.2f} sec'.format(finish - start))
    df.to_csv('../data/with_lyrics.csv')
    print('Saved dataset with lyrics to ../data/with_lyrics.csv')
if __name__ == '__main__':
    headers = {'Authorization': 'Bearer {}'.format(CLIENT_TOKEN)}  # include auth token
    # module-level session shared by all coroutines; disable client timeouts entirely
    session = aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=MAX_CONNECTIONS),
                                    headers=headers,
                                    timeout=aiohttp.ClientTimeout(total=None))
    try:
        main()
    finally:
        loop.run_until_complete(session.close())  # close the session inside the event loop
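The input file ../data/no_lyrics.csv is expected to have at least 'artist' and 'title' columns plus an index column (the script reads it with index_col=0 and writes the result back with an added 'lyrics' column). A minimal sketch of preparing such a file; the example rows are made up:

import pandas as pd

songs = pd.DataFrame({
    'artist': ['Radiohead', 'Nina Simone'],   # illustrative rows only
    'title': ['Karma Police', 'Feeling Good'],
})
songs.to_csv('../data/no_lyrics.csv')  # the default integer index becomes index_col=0 on read

Running the script then produces ../data/with_lyrics.csv with the same rows plus the scraped lyrics.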