Created
January 9, 2017 22:36
-
-
Save hadware/1342e76ee910212450b7f9a06801357a to your computer and use it in GitHub Desktop.
A small scraper that uses asyncio to run many concurrent requests against an API. It uses a semaphore to limit the number of outgoing connections open at any one time.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import json | |
from os import path | |
import aiohttp | |
import async_timeout | |
# Numeric parts of the IMDb ids to look up; the "tt" prefix is supplied by api_url.
imdb_ids = [
    '0114319', '0112302', '0114576', '0113189', '0112346', '0112896', '0112453',
    '0113987', '0112760', '0112641', '0114388', '0113101', '0112281', '0113845',
]

# URL template for one lookup; %s is replaced by an entry of imdb_ids.
api_url = "http://www.omdbapi.com/?i=tt%s&plot=full&r=json"

# Filled in by retrieve_plot: maps an IMDb id to the "Plot" text of its response.
all_plots = {}
async def fetch(session, url):
    """Return the body of a GET request to ``url`` as text.

    Args:
        session: Object whose ``get(url)`` yields an async context manager
            for the response (the caller passes an ``aiohttp.ClientSession``).
        url: Address to request.

    Returns:
        The value of ``await response.text()``.

    The request and the body read both run under a 10 second timeout.
    """
    # async-timeout deprecated the plain ``with`` form and later removed it;
    # ``async with`` is the supported way to enter the timeout context.
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()
async def retrieve_plot(imdb_id, count):
    """Fetch the plot for one IMDb id and record it in ``all_plots``.

    Args:
        imdb_id: Id fragment substituted into ``api_url``.
        count: Index of this request; used only in the progress messages.

    A response that is not valid JSON, is not a JSON object, or has no
    ``"Plot"`` key is reported on stdout and skipped, so ``all_plots`` only
    ever contains ids whose plot was actually retrieved.
    """
    # Entering the semaphore takes one of its slots; once all slots are taken,
    # further coroutines wait here until a running one leaves the block.
    async with connection_limit:
        async with aiohttp.ClientSession() as session:
            print("Fetching %i (%s)" % (count, imdb_id))
            response_text = await fetch(session, api_url % imdb_id)
            print("Got %i (%s)" % (count, imdb_id))

    # Parsing needs no connection, so it happens after the slot is released.
    try:
        plot = json.loads(response_text)["Plot"]
    except (KeyError, TypeError, ValueError) as error:
        # Best effort: one unusable response must not abort the other requests,
        # but it should be visible instead of silently disappearing.
        print("No plot for %i (%s): %r" % (count, imdb_id, error))
        return
    all_plots[imdb_id] = plot
# Driver: schedule one retrieve_plot task per id, wait for all of them to
# finish, then write the collected plots to plots.json.
loop = asyncio.get_event_loop()
connection_limit = asyncio.Semaphore(10)  # at most 10 requests in flight at a time

tasks = [
    asyncio.ensure_future(retrieve_plot(imdb_id, i))
    for i, imdb_id in enumerate(imdb_ids)
]
loop.run_until_complete(asyncio.wait(tasks))

# json.dump writes str, so the file has to be opened in text mode ("w");
# the mode belongs to open(), not to the path.
with open("plots.json", "w") as jsonfile:
    json.dump(all_plots, jsonfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment