msukmanowsky · October 23, 2017 13:40
diff --git a/fi_crawler.py b/fi_crawler.py
 import time
 import asyncio

 import aiohttp

 from bs4 import BeautifulSoup


 async def main():
    """Crawl the school index, then fetch ratings for the first 20 schools.

    Prints progress messages and wall-clock timings for both phases, and
    finally prints the list of results gathered from get_school_rating().
    """
    async with aiohttp.ClientSession() as session:
        print('Fetching schools.')
        began = time.time()
        school_urls = await get_school_urls(session)
        elapsed = time.time() - began
        print(f'Fetched {len(school_urls):,} schools in {elapsed:,.2f}s.')

        # Keep the sample small: only the first 20 schools are rated.
        school_urls = school_urls[:20]

        # Author's open question: when a brand-new session is created at this
        # point the requests below overlap, whereas with the shared session
        # they appear to complete one after another.
        print('Fetching first 20 ratings.')
        began = time.time()
        pending = (get_school_rating(session, url) for url in school_urls)
        ratings = await asyncio.gather(*pending)
        elapsed = time.time() - began
        print(f'Fetched {len(ratings):,} ratings in {elapsed:,.2f}s.')

    print(ratings)


 async def get_school_urls(session):
    """Return the absolute report URL for every school row on the index page.

    Args:
        session: Client session object; only its ``get()`` method is used,
            as an async context manager.

    Returns:
        A list of URL strings, one per table row after the first, in the
        order the rows appear on the page.

    Raises:
        AttributeError, IndexError, KeyError: If the page lacks the expected
            layout (no ``rating`` table, a row with fewer than four cells,
            or a fourth cell without a link carrying ``href``).
    """
    from urllib.parse import urljoin

    # Hold the response in a context manager, as get_school_rating() does, so
    # the connection is released even if reading the body raises.
    async with session.get(
            'http://ontario.compareschoolrankings.org/elementary/SchoolsByRankLocationName.aspx',
            params={'schooltype': 'elementary'}) as response:
        html = await response.text()
        base_url = str(response.url)

    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', 'rating').find_all('table')[0]
    rows = table.find_all('tr')
    school_urls = []
    # The first row is skipped (presumably the header row).
    for row in rows[1:]:
        td = row.find_all('td')[3]
        href = td.find('a').attrs['href']
        # urljoin keeps any port from the page URL and also resolves
        # page-relative hrefs, which plain scheme://host + href did not.
        school_urls.append(urljoin(base_url, href))
    return school_urls


 async def get_school_rating(session, url):
    """Fetch one school's page and extract the rating encoded in its image URL.

    Args:
        session: Client session object; ``session.get(url)`` is used as an
            async context manager.
        url: Address of the school page to fetch.

    Returns:
        A dict ``{'url': url, 'rating': rating}``. ``rating`` is a float
        taken from the text after the first ``=`` in the rating image's
        ``src``; it is ``None`` when the image is absent or its ``src`` is
        missing or does not contain a parseable number.
    """
    print(f'Starting fetch of {url}.')
    async with session.get(url) as response:
        print(f'Fetched {url}, starting response text parsing.')
        soup = BeautifulSoup(await response.text(), 'lxml')
        print(f'Finished response text parsing for {url}.')
        rating_img = soup.find(id='ctl00_ContentPlaceHolder1_detailedReportCard_imgRating')
        rating = None
        if rating_img:
            try:
                rating = float(rating_img.attrs['src'].split('=')[1])
            except (KeyError, IndexError, ValueError):
                # Treat a malformed image URL like a missing image, so one odd
                # page cannot raise out of the caller's asyncio.gather().
                rating = None
        return {
            'url': url,
            'rating': rating
        }


 if __name__ == '__main__':
    # Script entry point: run main() to completion on the loop returned by
    # asyncio.get_event_loop().
    asyncio.get_event_loop().run_until_complete(main())
	import time
	import asyncio

	import aiohttp

	from bs4 import BeautifulSoup


	async def main():
	    """Fetch the school index, then the ratings of the first 20 schools.

	    Progress and timing for each phase are printed, followed by the
	    gathered list of rating results.
	    """
	    async with aiohttp.ClientSession() as session:
	        print('Fetching schools.')
	        t0 = time.time()
	        school_urls = await get_school_urls(session)
	        took = time.time() - t0
	        print(f'Fetched {len(school_urls):,} schools in {took:,.2f}s.')

	        # Only get the first 20
	        school_urls = school_urls[:20]

	        # Author's observation: creating a new session here makes the
	        # requests below complete concurrently; reusing this one makes
	        # them look sequential.
	        print('Fetching first 20 ratings.')
	        t0 = time.time()
	        jobs = (get_school_rating(session, url) for url in school_urls)
	        ratings = await asyncio.gather(*jobs)
	        took = time.time() - t0
	        print(f'Fetched {len(ratings):,} ratings in {took:,.2f}s.')

	    print(ratings)


	async def get_school_urls(session):
	    """Collect the absolute URL of each school linked from the index page.

	    Args:
	        session: Client session object; its ``get()`` is used as an async
	            context manager.

	    Returns:
	        List of URL strings, one for each table row after the first.

	    Raises:
	        AttributeError, IndexError, KeyError: If the expected ``rating``
	            table, fourth cell, or link ``href`` is not present.
	    """
	    from urllib.parse import urljoin

	    index_url = 'http://ontario.compareschoolrankings.org/elementary/SchoolsByRankLocationName.aspx'
	    # Context-managed so the connection is released even when reading the
	    # body fails.
	    async with session.get(index_url, params={'schooltype': 'elementary'}) as response:
	        html = await response.text()
	        base_url = str(response.url)

	    soup = BeautifulSoup(html, 'lxml')
	    table = soup.find('table', 'rating').find_all('table')[0]
	    # The first row is skipped (presumably a header). urljoin resolves each
	    # href against the page URL, keeping any port and handling
	    # page-relative links.
	    return [
	        urljoin(base_url, row.find_all('td')[3].find('a').attrs['href'])
	        for row in table.find_all('tr')[1:]
	    ]


	async def get_school_rating(session, url):
	    """Download a school page and read its rating from the rating image.

	    Args:
	        session: Client session object; ``session.get(url)`` is used as
	            an async context manager.
	        url: School page address.

	    Returns:
	        ``{'url': url, 'rating': rating}`` where ``rating`` is the float
	        after the first ``=`` in the image ``src``, or ``None`` if the
	        image is missing or its ``src`` cannot be parsed.
	    """
	    print(f'Starting fetch of {url}.')
	    async with session.get(url) as response:
	        print(f'Fetched {url}, starting response text parsing.')
	        soup = BeautifulSoup(await response.text(), 'lxml')
	        print(f'Finished response text parsing for {url}.')
	        rating_img = soup.find(id='ctl00_ContentPlaceHolder1_detailedReportCard_imgRating')
	        if not rating_img:
	            rating = None
	        else:
	            try:
	                rating = float(rating_img.attrs['src'].split('=')[1])
	            except (KeyError, IndexError, ValueError):
	                # A malformed src is reported as "no rating" rather than
	                # raising out of the caller's asyncio.gather().
	                rating = None
	        return {
	            'url': url,
	            'rating': rating
	        }


	if __name__ == '__main__':
	    # Script entry point: run main() until it finishes on the loop
	    # returned by asyncio.get_event_loop().
	    asyncio.get_event_loop().run_until_complete(main())
No results found