Python async HTML crawling with a list of proxies
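The script below expects two plain-text files next to it, one entry per line (the file names come straight from the code): urls.txt with the URLs to crawl, and proxies.txt with HTTP proxies as host:port, which the crawler prefixes with "http://". Crawled pages are appended to crawled.jsonl as one JSON object per line. Purely hypothetical example contents:

urls.txt:
https://example.com/page/1
https://example.com/page/2

proxies.txt:
203.0.113.10:8080
203.0.113.45:3128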
#!/usr/bin/env python3
from time import time, sleep
from threading import Lock
import json
import random
import asyncio

import aiohttp

class UrlManager:
    """Keeps the set of URLs to crawl and appends crawled pages to a JSON-lines file."""

    def __init__(self, url_file):
        self.urls = {l.strip() for l in open(url_file)}
        self.writer = open('crawled.jsonl', 'w')
        self.lock = Lock()

    def get_url(self):
        return self.urls.pop()

    def add_url(self, url):
        self.urls.add(url)

    def save(self, url, html):
        # serialize writes so that records from concurrent tasks never interleave
        with self.lock:
            self.writer.write(json.dumps({'url': url, 'timestamp': time(), 'html': html}))
            self.writer.write('\n')

    def empty(self):
        return len(self.urls) == 0

    def close(self):
        self.writer.close()

class ProxyManager:
    """Hands out proxies from a shared pool; bad proxies are simply never put back."""

    def __init__(self, proxy_file):
        self.available = {l.strip() for l in open(proxy_file)}

    def get_proxy(self):
        """Get a proxy, marking it as in use by removing it from the pool."""
        while not self.available:
            # NOTE: time.sleep blocks the event loop, so this only helps if another
            # thread returns proxies; with a fully exhausted pool the crawl stalls
            print("waiting for valid proxies....")
            sleep(1)
        return self.available.pop()

    def invalidate_proxy(self, proxy):
        """Drop an active proxy once we find it is not valid (it is simply not re-added)."""

    def return_proxy(self, proxy):
        """Return a valid proxy to the pool."""
        self.available.add(proxy)

USER_AGENTS = [
    'Mozilla/5.0 (Linux; Android 4.4.4; One Build/KTU84L.H4) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/28.0.0.20.16;]',
    'Mozilla/5.0 (Linux; Android 4.4.4; One Build/KTU84L.H4) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 4.4.4; One Build/KTU84L.H4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.135 Mobile Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12D508 [FBAN/FBIOS;FBAV/27.0.0.10.12;FBBV/8291884;FBDV/iPhone7,1;FBMD/iPhone;FBSN/iPhone OS;FBSV/8.2;FBSS/3; FBCR/vodafoneIE;FBID/phone;FBLC/en_US;FBOP/5]',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12D508',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12D508',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508 Safari/600.1.4',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
]

HEADERS = {
    'pragma': 'no-cache',
    'dnt': '1',
    'accept-encoding': 'gzip, deflate, sdch, br',
    'accept-language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2',
    'accept': '*/*',
    'cache-control': 'no-cache',
}

url_manager = UrlManager('urls.txt')
proxy_manager = ProxyManager('proxies.txt')

async def crawl_page(url, client):
    # pick a random User-Agent for each page and merge in the shared headers
    headers = {
        'user-agent': random.choice(USER_AGENTS),
    }
    headers.update(HEADERS)

    retries = 0
    while retries < 3:
        retries += 1
        proxy = proxy_manager.get_proxy()
        print('crawling url %s with proxy %s' % (url, proxy))
        try:
            async with client.get(url, headers=headers, proxy="http://" + proxy, timeout=15) as resp:
                if resp.status != 200:
                    # the proxy reached the server but the response is unusable
                    print("Status error:", resp.status)
                    proxy_manager.invalidate_proxy(proxy)
                    return
                proxy_manager.return_proxy(proxy)
                text = await resp.text()
                url_manager.save(url, text)
                break
        except Exception:
            # timeouts and connection errors count against the proxy; retry with another one
            print("proxy %s does not work" % proxy)
            proxy_manager.invalidate_proxy(proxy)

async def batch_crawl_pages(sem, url, client):
    # the semaphore caps how many pages are fetched concurrently
    async with sem:
        await crawl_page(url, client)

async def crawl_pages(loop):
    tasks = []
    sem = asyncio.Semaphore(100)  # at most 100 requests in flight at once
    print("Crawling %d pages" % len(url_manager.urls))
    async with aiohttp.ClientSession(loop=loop) as client:
        for url in url_manager.urls:
            task = asyncio.ensure_future(batch_crawl_pages(sem, url, client))
            tasks.append(task)
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawl_pages(loop))
    url_manager.close()
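Each line of crawled.jsonl is a standalone JSON object, so results can be read back incrementally without loading the whole file. A minimal sketch of consuming the output (the url, timestamp, and html fields match what UrlManager.save writes above):

#!/usr/bin/env python3
import json

with open('crawled.jsonl') as f:
    for line in f:
        record = json.loads(line)
        # each record holds the crawled URL, a UNIX timestamp, and the raw HTML
        print(record['url'], record['timestamp'], len(record['html']))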