Last active
June 12, 2018 08:21
-
-
Save limboinf/263552c4258b3378b73ca3bb78fd8324 to your computer and use it in GitHub Desktop.
tornado异步爬虫示例
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
""" | |
tornado异步爬虫示例 | |
""" | |
import time | |
from datetime import timedelta | |
from bs4 import BeautifulSoup | |
from tornado.httpclient import AsyncHTTPClient | |
from tornado import ioloop, gen, queues | |
# Shared work queue of URLs: main() puts the links in, run() takes them out
# and marks each one done so that main()'s join() can complete.
_q = queues.Queue()
@gen.coroutine
def fetch(url):
    """Download ``url`` asynchronously and return the HTTP response object.

    The request is made with ``raise_error=False``; per Tornado's
    documentation this asks the client to hand back the response instead of
    raising ``HTTPError`` for non-200 status codes, so callers must inspect
    the response themselves.

    :param url: address to request.
    :returns: (via ``gen.Return``) the response produced by
        ``AsyncHTTPClient.fetch``.
    """
    print('fetcing', url)
    http_client = AsyncHTTPClient()
    result = yield http_client.fetch(url, raise_error=False)
    # ``raise gen.Return`` is how a generator-based coroutine returns a value
    # on interpreters where ``return value`` is not allowed in a generator.
    raise gen.Return(result)
@gen.coroutine
def run():
    """Take one URL from the shared queue, fetch it and print its <title> tag.

    The queue item is always marked done once it has been taken, whether the
    fetch/parse succeeded or not, so that ``_q.join()`` can complete.

    A response without a body (``fetch`` uses ``raise_error=False``, so a
    failed request may come back as a response whose ``body`` is ``None``) is
    reported and skipped instead of being handed to BeautifulSoup, which
    would raise and terminate the calling worker.
    """
    # Take the item *before* entering the try block: task_done() must only be
    # called for an item that was actually obtained from the queue.
    url = yield _q.get()
    try:
        res = yield fetch(url)
        html = res.body
        if html is None:
            # Nothing to parse; report the failure and move on.
            print('fetch failed: %s (code=%s, error=%s)'
                  % (url, res.code, res.error))
            return
        soup = BeautifulSoup(html, "html.parser")
        print(str(soup.find('title')))
    finally:
        _q.task_done()
@gen.coroutine
def worker():
    """Process queued URLs one after another until the queue is empty.

    Each iteration delegates to ``run()``, which handles exactly one URL.
    An exception raised by ``run()`` propagates and ends this worker.
    """
    while True:
        if _q.empty():
            break
        yield run()
@gen.coroutine
def main():
    """Fill the queue with article URLs, start the workers and wait for them.

    Waits on ``_q.join`` with a 30 second timeout; per Tornado's
    documentation the join raises a timeout error if the queue has not been
    fully processed by then.
    """
    # Put 100 links into the queue.
    urls = ["http://www.jb51.net/article/%d.htm" % article_id
            for article_id in range(73000, 73100)]
    for url in urls:
        yield _q.put(url)

    # Start 100 concurrent workers (the original note describes this as
    # simulating 100 threads). Their futures are intentionally not yielded:
    # completion is tracked through the queue's join() below.
    for _unused in range(100):
        worker()

    wait_limit = timedelta(seconds=30)
    yield _q.join(timeout=wait_limit)
if __name__ == '__main__':
    # Run the crawl to completion on the current IOLoop and report how long
    # it took in wall-clock seconds.
    s = time.time()
    ioloop.IOLoop.current().run_sync(main)
    # A single pre-formatted argument prints the same text on Python 2 and
    # Python 3; the former ``print 'use time:', ...`` statement is a syntax
    # error on Python 3.
    print('use time: %s' % (time.time() - s))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment