Last active
October 23, 2017 13:40
-
-
Save msukmanowsky/b7fb7d1182f719416f82cbe53c11b26a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import time | |
| import asyncio | |
| import aiohttp | |
| from bs4 import BeautifulSoup | |
async def main(limit=20):
    """Fetch the list of school URLs, then fetch ratings for the first few.

    Opens one ``aiohttp.ClientSession`` that is shared by every request,
    prints progress and timing information, and finally prints the list of
    rating dicts returned by ``get_school_rating``.

    Args:
        limit: Maximum number of school URLs whose rating is fetched.
            Defaults to 20, which reproduces the original hard-coded value.
    """
    async with aiohttp.ClientSession() as session:
        print('Fetching schools.')
        # perf_counter() is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        school_urls = await get_school_urls(session)
        lap = time.perf_counter() - start
        print(f'Fetched {len(school_urls):,} schools in {lap:,.2f}s.')

        # Only keep the first `limit` schools.
        school_urls = school_urls[:limit]

        # NOTE (original author's observation): if a new session is
        # initiated here, the requests below complete asynchronously;
        # otherwise they appear to run synchronously.
        print(f'Fetching first {limit} ratings.')
        start = time.perf_counter()
        # Create every coroutine up front and let gather() drive them
        # concurrently; results come back in the same order as school_urls.
        ratings = await asyncio.gather(
            *(get_school_rating(session, url) for url in school_urls)
        )
        lap = time.perf_counter() - start
        print(f'Fetched {len(ratings):,} ratings in {lap:,.2f}s.')
        print(ratings)
async def get_school_urls(session):
    """Return the absolute URL of every school listed on the ranking page.

    Requests the elementary ``SchoolsByRankLocationName.aspx`` page, locates
    the first table nested inside the table with CSS class ``rating``, and
    reads the link found in the fourth cell of every row after the first.

    Args:
        session: An ``aiohttp.ClientSession`` (or compatible object) used to
            issue the GET request.

    Returns:
        A list of absolute school URLs, in page order.
    """
    # Function-scope import keeps this block self-contained without touching
    # the file-level import list.
    from urllib.parse import urljoin

    # `async with` guarantees the response is released back to the session
    # even if reading the body raises.
    async with session.get(
        'http://ontario.compareschoolrankings.org/elementary/SchoolsByRankLocationName.aspx',
        params={'schooltype': 'elementary'},
    ) as response:
        html = await response.text()
        # Final request URL; used as the base for resolving the page's links.
        base_url = str(response.url)

    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', 'rating').find_all('table')[0]
    rows = table.find_all('tr')

    # rows[0] is skipped (presumably the header row -- confirm against the
    # page markup). urljoin resolves each href against the page URL, so a
    # non-default port is preserved and hrefs without a leading '/' still
    # produce a valid URL.
    return [
        urljoin(base_url, row.find_all('td')[3].find('a').attrs['href'])
        for row in rows[1:]
    ]
async def get_school_rating(session, url):
    """Download one school's page and extract its rating.

    Progress messages are printed before the request, after the response
    arrives, and after the HTML has been parsed.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()``.
        url: Address of the school's report-card page.

    Returns:
        A dict with keys ``'url'`` (the input URL) and ``'rating'``: the
        float parsed from the second ``'='``-separated piece of the rating
        image's ``src`` attribute, or ``None`` when the image is not found.
    """
    print(f'Starting fetch of {url}.')
    async with session.get(url) as resp:
        print(f'Fetched {url}, starting response text parsing.')
        html = await resp.text()
        document = BeautifulSoup(html, 'lxml')
        print(f'Finished repsonse text parsing for {url}.')

        # The rating is encoded in the image's src attribute after an '='.
        badge = document.find(id='ctl00_ContentPlaceHolder1_detailedReportCard_imgRating')
        score = float(badge.attrs['src'].split('=')[1]) if badge else None
        return dict(url=url, rating=score)
if __name__ == '__main__':
    # Create the loop explicitly: calling asyncio.get_event_loop() with no
    # running loop is deprecated and fails on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    finally:
        # Always close the loop so its resources are released, even when
        # main() raises.
        loop.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment