@niazangels
Created April 12, 2022 20:57
Async Python crawler with a semaphore limit
# References:
# https://github.com/PrettyPrinted/youtube_video_code/blob/master/2020/12/31/How%20to%20Speed%20Up%20API%20Requests%20With%20Async%20Python/apiasync/script.py
# https://stackoverflow.com/questions/47934212/how-to-use-python-aiohttp-library-to-download-multiple-webpages
# https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
import asyncio
import os

import aiofiles
import aiohttp

URLS = []  # populate this with the URLs you want to crawl


async def main():
    tasks = []
    # Allow at most 20 downloads in flight at any one time.
    sem = asyncio.Semaphore(20)
    for id, url in enumerate(URLS):
        # asyncio.create_task() is the modern equivalent on Python 3.7+.
        task = asyncio.ensure_future(bound_save_webpage(id, url, sem))
        tasks.append(task)
    await asyncio.gather(*tasks)


async def bound_save_webpage(id, url, sem):
    # The semaphore gates entry: only 20 save_webpage() calls run concurrently.
    async with sem:
        await save_webpage(id, url)


async def save_webpage(id, url):
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                content = await response.read()
                print(f"Fetched {id}")
    except Exception as e:
        # Catch Exception rather than using a bare except, which would also
        # swallow KeyboardInterrupt and task cancellation.
        print(f"Failed {id}: {e}")
    else:
        async with aiofiles.open(f"webpages/{id}.html", mode="wb") as f:
            await f.write(content)
            print(f"Saved {id}")


os.makedirs("webpages", exist_ok=True)  # the file writes above expect this directory
asyncio.run(main())
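
One refinement worth noting, not part of the original gist: aiohttp's documentation recommends reusing a single ClientSession across requests rather than opening a new session per URL, since each session carries its own connection pool. Below is a minimal sketch of the same semaphore-bounded crawler with one shared session; the function name fetch_and_save is hypothetical, and it assumes the same URLS list and concurrency limit of 20.

import asyncio
import os

import aiofiles
import aiohttp

URLS = []  # populate this


async def fetch_and_save(session, sem, id, url):
    # The semaphore still bounds concurrency; the session (and its
    # connection pool) is shared by every task.
    async with sem:
        try:
            async with session.get(url) as response:
                content = await response.read()
        except Exception as e:
            print(f"Failed {id}: {e}")
            return
        async with aiofiles.open(f"webpages/{id}.html", mode="wb") as f:
            await f.write(content)
        print(f"Saved {id}")


async def main():
    sem = asyncio.Semaphore(20)
    # One session for the whole crawl, closed automatically on exit.
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_and_save(session, sem, id, url) for id, url in enumerate(URLS)]
        await asyncio.gather(*tasks)


os.makedirs("webpages", exist_ok=True)
asyncio.run(main())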