Skip to content

Instantly share code, notes, and snippets.

@msukmanowsky
Last active October 23, 2017 13:40
Show Gist options
  • Select an option

  • Save msukmanowsky/b7fb7d1182f719416f82cbe53c11b26a to your computer and use it in GitHub Desktop.

Select an option

Save msukmanowsky/b7fb7d1182f719416f82cbe53c11b26a to your computer and use it in GitHub Desktop.
import asyncio
import time
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup
async def main(limit=20):
    """Fetch the school list, then the ratings of the first ``limit`` schools.

    Opens one ``aiohttp.ClientSession``, collects every school URL via
    ``get_school_urls``, keeps the first ``limit`` of them, and fetches their
    ratings concurrently with ``asyncio.gather``. Progress, timings, and the
    final list of rating dicts are printed to stdout.

    Args:
        limit: Number of schools (taken from the front of the list) whose
            ratings are fetched. Defaults to 20.

    Returns:
        None. Results are only printed.
    """
    async with aiohttp.ClientSession() as session:
        print('Fetching schools.')
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        school_urls = await get_school_urls(session)
        lap = time.perf_counter() - start
        print(f'Fetched {len(school_urls):,} schools in {lap:,.2f}s.')

        # Only look at the first `limit` schools.
        school_urls = school_urls[:limit]

        # NOTE (original author's observation): when a brand-new session is
        # created at this point the requests below were seen to overlap,
        # whereas reusing `session` made them appear to run one after another.
        # The cause is not established by anything in this file.
        print(f'Fetching first {limit} ratings.')
        start = time.perf_counter()
        # Build all coroutines first, then let gather() drive them together;
        # gather() returns the results in the same order as `school_urls`.
        pending = [get_school_rating(session, url) for url in school_urls]
        ratings = await asyncio.gather(*pending)
        lap = time.perf_counter() - start
        print(f'Fetched {len(ratings):,} ratings in {lap:,.2f}s.')
        print(ratings)
async def get_school_urls(session):
    """Return the absolute URL of every school listed on the ranking index page.

    Requests the elementary-school ranking page, locates the first ``<table>``
    nested inside the table selected by ``soup.find('table', 'rating')``,
    skips its first row, and reads the link found in the fourth cell of each
    remaining row.

    Args:
        session: An ``aiohttp.ClientSession`` (or compatible object) used to
            perform the GET request.

    Returns:
        A list of absolute school page URLs (strings), in page order.

    Raises:
        AttributeError, IndexError, KeyError: If the page does not have the
            expected table/row/link structure (no extra validation is done).
    """
    index_url = 'http://ontario.compareschoolrankings.org/elementary/SchoolsByRankLocationName.aspx'
    # Use the response as a context manager so it is always released back to
    # the session, even if reading the body raises.
    async with session.get(index_url, params={'schooltype': 'elementary'}) as response:
        # Final URL of the response; used as the base for resolving links.
        page_url = str(response.url)
        html = await response.text()

    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', 'rating').find_all('table')[0]
    rows = table.find_all('tr')

    school_urls = []
    for row in rows[1:]:  # rows[0] is skipped (presumably a header row)
        link = row.find_all('td')[3].find('a')
        # urljoin keeps the scheme, host and port of the page and resolves
        # root-relative, page-relative and already-absolute hrefs correctly,
        # unlike plain "scheme://host" + href concatenation.
        school_urls.append(urljoin(page_url, link.attrs['href']))
    return school_urls
async def get_school_rating(session, url):
    """Download one school page and extract its rating.

    The rating is read from the ``src`` attribute of the element whose id is
    ``ctl00_ContentPlaceHolder1_detailedReportCard_imgRating``: the text
    between the first and second ``=`` is converted to ``float``. Progress
    messages are printed before the request, after the response arrives, and
    after the HTML has been parsed.

    Args:
        session: An ``aiohttp.ClientSession`` (or compatible object).
        url: Address of the school page to fetch.

    Returns:
        A dict ``{'url': url, 'rating': rating}`` where ``rating`` is a float,
        or ``None`` when the rating element is not found on the page.
    """
    print(f'Starting fetch of {url}.')
    async with session.get(url) as resp:
        print(f'Fetched {url}, starting response text parsing.')
        html = await resp.text()
        document = BeautifulSoup(html, 'lxml')
        print(f'Finished repsonse text parsing for {url}.')

        badge = document.find(id='ctl00_ContentPlaceHolder1_detailedReportCard_imgRating')
        # The rating value sits right after the first '=' in the image src.
        score = float(badge.attrs['src'].split('=')[1]) if badge else None
        return dict(url=url, rating=score)
if __name__ == '__main__':
    # Script entry point: run main() to completion on a dedicated event loop.
    # The loop is created explicitly because asyncio.get_event_loop() is
    # deprecated when no loop is current and raises RuntimeError on newer
    # Python versions; new_event_loop()/set_event_loop() work on all of them.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    finally:
        # Always release the loop's resources, even if main() raised.
        loop.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment