Last active
April 17, 2019 07:38
-
-
Save wtneal/377ed73f42f5da1acdbebc5eee976bd1 to your computer and use it in GitHub Desktop.
asyncio scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import aiofiles | |
import aiohttp | |
import logging | |
import re | |
import sys | |
import os | |
import lxml.html | |
# Module-level logger: plain stream handler at DEBUG so scraper progress is visible.
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)
# Browser-like User-Agent so imgur serves normal HTML pages.
# (HTTP header names are case-insensitive, so the upper-cased key still works.)
HEADERS = {
    'USER-AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36'
}
# Maximum number of images downloaded per run.
MAX_PICS = 10
# Subreddit mirrored via http://imgur.com/r/<SUBREDDIT>.
SUBREDDIT = 'earthporn'
class WorkerPool: | |
def __init__(self, loop, coro, worker_count, options): | |
self.loop = loop or asyncio.get_event_loop() | |
self.result = None | |
self.q = asyncio.Queue(loop=self.loop) | |
self.coro = coro | |
self.worker_count = worker_count | |
self.options = options | |
async def run(self): | |
workers = [asyncio.Task(self.coro(self.loop, self.q, self.options)) | |
for _ in range(self.worker_count)] | |
await self.q.join() | |
for w in workers: | |
w.cancel() | |
async def _fetch(loop, q, options):
    """Worker coroutine: consume ``(url, type)`` pairs from the queue forever.

    Item types:
      * ``'seed'`` — a subreddit listing page; its post links are re-queued
        as ``'post'`` items.
      * ``'post'`` — an individual imgur post page; its media URL is
        re-queued as an ``'img'`` item.
      * anything else (``'img'``) — the binary image itself, saved to disk.

    Runs until cancelled by ``WorkerPool.run()``; ``task_done()`` is called
    once per item so ``q.join()`` can detect completion.
    """
    # the loop makes sure the coroutine doesn't return and keeps going
    while True:
        try:
            ''' Pull a url from the queue and request it '''
            url, utype = await q.get()
            logger.debug('url: %s, url type: %s', url, utype)
            # One short-lived session per item keeps the worker simple; HEADERS
            # supplies the browser-like User-Agent defined at module level.
            async with aiohttp.ClientSession(loop=loop, headers=HEADERS) as session:
                async with session.get(url) as resp:
                    if utype == 'seed':
                        text = await resp.text()
                        for link in _parse_links(text):
                            q.put_nowait((link, 'post'))
                    elif utype == 'post':
                        text = await resp.text()
                        q.put_nowait((_get_image_link(text), 'img'))
                    else:
                        # 'img': write the response body under
                        # /tmp/<subreddit>/<mods-with-slashes-flattened>/.
                        outdir = os.path.join('/tmp', SUBREDDIT, options['mods'].replace('/', '_'))
                        await _get_image(resp, outdir)
            logger.debug('about to finish task, queue size: %s', q.qsize())
            q.task_done()
        except asyncio.CancelledError:
            # Normal shutdown: WorkerPool.run() cancels us after q.join().
            break
        except:
            # Log with traceback, then re-raise so the failure is not silent.
            logger.exception('error')
            raise
def _parse_links(text):
    """Extract up to MAX_PICS imgur post links from a subreddit listing page.

    Anchors whose ``class`` contains ``"title"`` are collected, filtered down
    to ``/r/<SUBREDDIT>/<7-char id>`` post paths, and made absolute.
    """
    logger.debug('getting post links')
    html = lxml.html.fromstring(text)
    # FIX: 'title' must be an XPath *string literal*. Unquoted, `title` is an
    # element-name node test (string value of a <title> child — almost always
    # empty), and contains(@class, '') is true, so every anchor matched.
    links = html.xpath("//a[contains(@class, 'title')]/@href")
    logger.debug(links)
    # Compile once instead of re-parsing the pattern for every link.
    post_re = re.compile('^/r/{}/[a-zA-Z0-9]{{7}}'.format(SUBREDDIT), re.I)
    links = ['http://imgur.com' + link for link in links if post_re.search(link)]
    logger.debug(links)
    return links[:MAX_PICS]
def _absolutize(link):
    """Make a protocol-relative link ('//host/...') fetchable over HTTP."""
    return 'http:' + link if link.startswith('//') else link


def _get_image_link(text):
    """Return the direct media URL from an imgur post page.

    Prefers the video content URL when the post is a video; otherwise falls
    back to the post image's ``src``.

    FIX: imgur emits protocol-relative links like ``//i.imgur.com/x.jpg``;
    aiohttp cannot connect to a scheme-less URL (it fails with
    "Cannot connect to host i.imgur.com:None"), so ``http:`` is prefixed.
    """
    logger.debug('getting actual image link')
    html = lxml.html.fromstring(text)
    video_link = html.xpath('//div[@class="post-image"]//video/following-sibling::meta[@itemprop="contentURL"]/@content')
    if video_link:
        logger.debug('returning video link')
        return _absolutize(video_link[0])
    link = html.xpath('//div[@class="post-image"]//img/@src')[0]
    return _absolutize(link)
async def _get_image(resp, outdir):
    """Stream the response body to ``<outdir>/<basename of the URL>``.

    FIX: in modern aiohttp ``resp.url`` is a ``yarl.URL`` object, not a str,
    so it is converted before splitting (older aiohttp returned a plain
    string, for which ``str()`` is a no-op).
    """
    logger.debug('saving image')
    fn = str(resp.url).split('/')[-1]
    # os.path.join instead of hand-built '{}/{}' path formatting.
    async with aiofiles.open(os.path.join(outdir, fn), 'wb') as f:
        data = await resp.read()
        await f.write(data)
def main(mods=''):
    """Scrape up to MAX_PICS images from imgur's /r/<SUBREDDIT> mirror.

    Args:
        mods: extra URL path such as ``'/top/all'`` appended to the seed URL;
            also used (with '/' flattened to '_') as the output folder name.

    Images are written to ``/tmp/<SUBREDDIT>/<mods>``.
    """
    outdir = os.path.join('/tmp', SUBREDDIT, mods.replace('/', '_'))
    try:
        # FIX: exist_ok makes reruns idempotent instead of relying on a bare
        # except; only genuine filesystem errors are reported now.
        os.makedirs(outdir, exist_ok=True)
    except OSError:
        # logger.warn is deprecated; warning() is the supported spelling.
        logger.warning("oops couldn't create folder")
    try:
        loop = asyncio.get_event_loop()
        wp = WorkerPool(loop, _fetch, 5, options={'sr': SUBREDDIT, 'mods': mods})
        seed_url = 'http://imgur.com/r/{}{}'.format(SUBREDDIT, mods)
        # Prime the queue with the listing page; workers fan out from there.
        wp.q.put_nowait((seed_url, 'seed'))
        loop.run_until_complete(wp.run())
    except KeyboardInterrupt:
        sys.stderr.flush()
    except Exception:
        # Narrowed from a bare except so SystemExit still propagates.
        logger.exception('error with loop')
    finally:
        loop.close()
if __name__ == '__main__':
    # Default run: the subreddit's "top of all time" listing.
    main(mods='/top/all')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello wtneal — running this code I get an error; please help me take a look:
about to finish task, queue size: 6
url: //i.imgur.com/sATBfbPr.jpg, url type: img
error
Traceback (most recent call last):
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\site-packages\aiohttp\connector.py", line 608, in _create_direct_connection
local_addr=self._local_addr)
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\base_events.py", line 695, in create_connection
raise exceptions[0]
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\base_events.py", line 682, in create_connection
yield from self.sock_connect(sock, address)
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\futures.py", line 361, in iter
yield self # This tells Task to wait for completion.
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\tasks.py", line 296, in _wakeup
future.result()
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\futures.py", line 274, in result
raise self._exception
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\asyncio\selector_events.py", line 414, in _sock_connect
sock.connect(address)
OSError: [WinError 10049] 在其上下文中,该请求的地址无效。
Exception:
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/Users/admin/PycharmProjects/untitled/new-aysncio/example-github/crawl-asyncio/web-asyncio.py", line 47, in _fetch
async with session.get(url) as resp:
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\site-packages\aiohttp\client.py", line 529, in aenter
self._resp = yield from self._coro
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\site-packages\aiohttp\client.py", line 165, in _request
conn = yield from self._connector.connect(req)
File "C:\Users\admin\AppData\Local\Programs\Python\Python35\lib\site-packages\aiohttp\connector.py", line 316, in connect
.format(key, exc.strerror)) from exc
aiohttp.errors.ClientOSError: [Errno 10049] Cannot connect to host i.imgur.com:None ssl:False [Can not connect to i.imgur.com:None [在其上下文中,该请求的地址无效。]]