Created
December 8, 2017 11:53
-
-
Save WyattJia/39f6ce8b811c253fd288f1ad0d7cbdad to your computer and use it in GitHub Desktop.
run in executor example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
from asyncio import FIRST_COMPLETED, ensure_future, get_event_loop, new_event_loop, sleep, wait
from collections import deque
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent header sent with every request.
HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/55.0.2883.95 Safari/537.36'}

# Matches runs of ASCII characters; joining the matches drops everything else.
ascii_re = re.compile('[\u0000-\u007F]+')
# Matches runs of whitespace. Written as a raw string because '\s' is an
# invalid escape sequence in an ordinary string literal.
space_re = re.compile(r'\s+')

counter = 0  # number of pages saved so far; also the next output file number
done = set()  # URL prefixes (first 30 characters) that were already queued
q = deque(maxlen=10000)  # crawl frontier; once full, append() discards the oldest entry
q.append('http://www.qq.com/')
done.add('http://www.qq.com/')

# Create the loop explicitly: get_event_loop() is deprecated when no loop is
# current and fails outright on recent Python versions.
loop = new_event_loop()
# Worker threads that run the blocking HTTP requests.
thread_pool = ThreadPoolExecutor(30)
async def get_page(url):
    """Fetch *url* on the worker thread pool and return the response body text.

    The blocking ``requests.get`` call (5 second timeout, ``HEADERS`` sent) is
    handed to ``thread_pool`` through ``loop.run_in_executor`` and awaited.

    Returns an empty string when the request fails for any reason.
    """

    def _download():
        body = ""
        try:
            body = requests.get(url, headers=HEADERS, timeout=5).text
        except requests.RequestException:
            # Request-level failures are expected while crawling; skip quietly.
            pass
        except Exception as err:
            # Anything else is unexpected, so report it before giving up.
            print(err)
        return body

    page = await loop.run_in_executor(thread_pool, _download)
    return page
async def job():
    """Crawl one queued URL: fetch it, save its ASCII text, and queue its links.

    Pops the next URL from ``q`` and downloads it with ``get_page``. The page
    is reduced to its ASCII characters with all whitespace removed, and
    ``<url>::<first 60000 characters>`` is written to ``dat/<counter>.txt``;
    ``counter`` is then incremented. Up to 40 new absolute http(s) links found
    in the page are appended to ``q``.

    Returns ``None``. It returns early when the queue is empty or the page
    could not be fetched. Any exception raised while processing is printed
    rather than propagated, so one bad page does not stop the crawl.
    """
    global counter
    if not q:
        # Another job already took the last URL; nothing to do.
        return
    try:
        url = q.popleft()
        text = await get_page(url)
        if text == '':
            return

        res = ''.join(ascii_re.findall(text))
        res = space_re.sub('', res)

        # Make sure the output directory exists instead of failing every save.
        os.makedirs('dat', exist_ok=True)
        # The body is already pure ASCII, but the URL may not be;
        # 'backslashreplace' keeps such a URL from aborting the whole page.
        with open(f'dat/{counter}.txt', 'w', encoding='ascii',
                  errors='backslashreplace') as f:
            f.write(url)
            f.write('::')
            f.write(res[:60000])
        counter += 1

        soup = BeautifulSoup(text, 'html.parser')
        added = 0
        for link in soup.find_all('a'):
            if added >= 40:
                break
            href = link.get('href')
            if href is None or not href.startswith(('http://', 'https://')):
                continue
            # Links are de-duplicated on their first 30 characters only.
            key = href[:30]
            if key in done:
                continue
            added += 1
            done.add(key)
            q.append(href)
        print(f'OK {counter}th')
    except Exception as e:
        print(e)
async def main():
    """Drive the crawl: keep up to 30 ``job`` tasks running until work runs out.

    The seed URL is crawled first so the queue has links in it. After that the
    loop keeps scheduling jobs while URLs are queued or jobs are still running,
    and stops scheduling once ``counter`` reaches 2,000,000 saved pages. Jobs
    still in flight are awaited before returning, so their work is not lost.
    """
    max_in_flight = 30
    page_limit = 2_000_000

    await job()  # crawl the seed URL to populate the queue

    in_flight = set()
    # An empty queue is not the end of the crawl while jobs are still running:
    # they may add new links when they finish.
    while counter < page_limit and (q or in_flight):
        # Every job pops exactly one URL as soon as it starts, so never start
        # more jobs than there are queued URLs.
        free_slots = max_in_flight - len(in_flight)
        for _ in range(min(free_slots, len(q))):
            in_flight.add(ensure_future(job()))
        # Sleep until at least one job finishes; keep the unfinished ones.
        _, in_flight = await wait(in_flight, return_when=FIRST_COMPLETED)

    if in_flight:
        # Page limit reached: let the remaining jobs finish cleanly.
        await wait(in_flight)
# Script entry point: run the crawl to completion, then release resources.
try:
    loop.run_until_complete(main())
finally:
    # Let the worker threads finish before closing the loop they report to.
    thread_pool.shutdown()
    loop.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment