@cllu
Created October 29, 2018 23:52
Python async HTML crawling with a list of proxies
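The script below reads a list of URLs from urls.txt and a pool of proxies from proxies.txt, then crawls the pages concurrently with aiohttp: each request uses a randomly chosen user-agent and a proxy taken from the pool, at most 100 requests are in flight at a time (enforced by a semaphore), proxies that fail are dropped while working ones are returned to the pool, and every fetched page is appended to crawled.jsonl.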
#!/usr/bin/env python3
from time import time, sleep
from threading import Lock
import json
import random
import asyncio
import aiohttp

class UrlManager:
    def __init__(self, url_file):
        self.urls = {l.strip() for l in open(url_file)}
        self.writer = open('crawled.jsonl', 'w')
        self.lock = Lock()

    def get_url(self):
        return self.urls.pop()

    def add_url(self, url):
        self.urls.add(url)

    def save(self, url, html):
        # serialize writes to the output file
        with self.lock:
            self.writer.write(json.dumps({'url': url, 'timestamp': time(), 'html': html}))
            self.writer.write('\n')

    def empty(self):
        return len(self.urls) == 0

    def close(self):
        self.writer.close()

class ProxyManager:
    def __init__(self, proxy_file):
        self.available = {l.strip() for l in open(proxy_file)}

    def get_proxy(self):
        """Get a proxy, removing it from the pool while it is in use"""
        # set.pop() raises KeyError on an empty set, so wait until a proxy is returned.
        # Note: this blocks the event loop if the pool is ever exhausted.
        while not self.available:
            print("waiting for valid proxies....")
            sleep(1)
        return self.available.pop()

    def invalidate_proxy(self, proxy):
        """Drop an active proxy, since we found that it is not valid"""
        # get_proxy() already removed it from the pool, so simply never return it
        pass

    def return_proxy(self, proxy):
        """Return a valid proxy to the pool"""
        self.available.add(proxy)

USER_AGENTS = [
    'Mozilla/5.0 (Linux; Android 4.4.4; One Build/KTU84L.H4) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/28.0.0.20.16;]',
    'Mozilla/5.0 (Linux; Android 4.4.4; One Build/KTU84L.H4) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/33.0.0.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 4.4.4; One Build/KTU84L.H4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.135 Mobile Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12D508 [FBAN/FBIOS;FBAV/27.0.0.10.12;FBBV/8291884;FBDV/iPhone7,1;FBMD/iPhone;FBSN/iPhone OS;FBSV/8.2;FBSS/3; FBCR/vodafoneIE;FBID/phone;FBLC/en_US;FBOP/5]',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12D508',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12D508',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508 Safari/600.1.4',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
]

HEADERS = {
    'pragma': 'no-cache',
    'dnt': '1',
    'accept-encoding': 'gzip, deflate, sdch, br',
    'accept-language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2',
    'accept': '*/*',
    'cache-control': 'no-cache',
}

url_manager = UrlManager('urls.txt')
proxy_manager = ProxyManager('proxies.txt')

async def crawl_page(url, client):
    """Fetch a single URL through a random proxy, retrying up to 3 times with a fresh proxy"""
    headers = {
        'user-agent': random.choice(USER_AGENTS),
    }
    headers.update(HEADERS)

    retries = 0
    while retries < 3:
        retries += 1
        proxy = proxy_manager.get_proxy()
        print('crawling url %s with proxy %s' % (url, proxy))
        try:
            async with client.get(url, headers=headers, proxy="http://" + proxy, timeout=15) as resp:
                if resp.status != 200:
                    print("Status error:", resp.status)
                    proxy_manager.invalidate_proxy(proxy)
                    return
                proxy_manager.return_proxy(proxy)
                text = await resp.text()
                url_manager.save(url, text)
                break
        except Exception:
            print("proxy %s does not work" % proxy)
            proxy_manager.invalidate_proxy(proxy)

async def batch_crawl_pages(sem, url, client):
    # the semaphore caps the number of concurrent requests
    async with sem:
        await crawl_page(url, client)

async def crawl_pages(loop):
    tasks = []
    sem = asyncio.Semaphore(100)
    print("Crawling %d pages" % len(url_manager.urls))
    async with aiohttp.ClientSession(loop=loop) as client:
        for url in url_manager.urls:
            task = asyncio.ensure_future(batch_crawl_pages(sem, url, client))
            tasks.append(task)
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(crawl_pages(loop))
    url_manager.close()
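
Usage notes: urls.txt should contain one URL per line and proxies.txt one proxy per line in host:port form (the script prepends http:// when passing the proxy to aiohttp). Crawled pages are appended to crawled.jsonl as one JSON object per line with url, timestamp, and html fields.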