Created
July 10, 2013 21:49
-
-
Save AstraLuma/5970631 to your computer and use it in GitHub Desktop.
A basic spider to aggregate statuses
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python -i | |
import argparse | |
import collections | |
import Queue | |
import logging | |
import requests | |
import threading | |
import urlparse | |
from bs4 import BeautifulSoup | |
# Command-line interface: one or more root URLs plus the worker-thread count.
parser = argparse.ArgumentParser(description='Spider a site looking for errors')
parser.add_argument('url', nargs='+', help='Root URLs')
parser.add_argument(
    '--threads',
    type=int,
    default=4,
    help='How many threads to use',
)

# Crawl state shared by every worker thread.
# urls: URL -> None while the fetch is pending, then (status_code, referer).
urls = dict()
# statuses: how many responses were seen for each HTTP status code.
statuses = collections.Counter()
# A single requests session reused for every fetch.
session = requests.Session()
def download(url, referer=None, timeout=30):
    """Fetch *url*, record the outcome, and return the parsed page if usable.

    The result is stored in the module-level ``urls`` mapping as
    ``(status_code, referer)`` and tallied in ``statuses``.

    Args:
        url: Address to request through the shared ``session``.
        referer: URL of the page that linked here (``None`` for a root URL);
            only recorded, never sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the calling thread forever.

    Returns:
        A ``BeautifulSoup`` document when the response status is 200,
        otherwise ``None`` (including when the request itself failed).
    """
    logging.info("-> %s", url)
    try:
        r = session.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A connection failure, timeout or unsupported URL is itself a
        # finding for an error spider: record it rather than letting the
        # exception take down the calling worker thread.
        logging.warning("!! %s %s", url, exc)
        urls[url] = (None, referer)
        statuses['error'] += 1
        return None

    urls[url] = (r.status_code, referer)
    logging.info("<- %s %i", url, r.status_code)
    # NOTE(review): several worker threads update this counter without a
    # lock, so an increment can occasionally be lost.
    statuses[r.status_code] += 1

    if r.status_code == 200:
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        return BeautifulSoup(r.content, 'html.parser')
    return None
def worker(queue):
    """Process ``(url, referer)`` jobs from *queue* forever.

    Each URL is stripped of its fragment, skipped if it is already present in
    the module-level ``urls`` mapping, and otherwise fetched with
    ``download``. Every ``<a>`` link on a successfully parsed page is queued
    with the current page as its referer.

    Args:
        queue: Job queue of ``(url, referer)`` tuples. ``task_done()`` is
            called exactly once per job, whatever happens while handling it,
            so that ``queue.join()`` can return.

    The loop never returns; the function is meant to run in a daemon thread.
    """
    while True:
        url, referer = queue.get()
        try:
            url, _ = urlparse.urldefrag(url)
            if url in urls:
                continue
            # Mark as pending so other workers skip this URL.
            # NOTE(review): the check above and this assignment are not
            # atomic, so two workers can occasionally fetch the same URL.
            urls[url] = None

            doc = download(url, referer)
            if doc is None:
                continue

            for link in doc.find_all('a'):
                href = link.get('href')
                if not href:
                    # Anchors without a target resolve to the current page.
                    continue
                target, _ = urlparse.urldefrag(urlparse.urljoin(url, href))
                if urlparse.urlparse(target).scheme not in ('http', 'https'):
                    # mailto:, javascript:, tel: ... cannot be fetched.
                    continue
                # NOTE(review): links to other hosts are followed as well,
                # so the crawl is not confined to the root site.
                if target not in urls:
                    queue.put((target, url))
        except Exception:
            # Thread boundary: an escaping exception would end this worker,
            # and with every worker gone queue.join() would never return.
            logging.exception("Failed to process %s", url)
        finally:
            queue.task_done()
# Script body. It stays at module level on purpose: the shebang runs the
# interpreter with -i, so these names remain available for inspection once
# the crawl has finished.
args = parser.parse_args()
if args.threads < 1:
    # With no worker threads nothing would drain the queue and the
    # queue.join() below would block forever.
    parser.error('--threads must be at least 1')

logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)-8s %(message)s',
    level=logging.INFO,
)
# Only let errors through from the 'requests' logger.
logging.getLogger('requests').setLevel(logging.ERROR)

# Seed the job queue with the root URLs; they have no referer.
queue = Queue.Queue()
for url in args.url:
    queue.put((url, None))

# Daemon threads, so workers blocked on an empty queue do not keep the
# process alive after the crawl is done.
for i in range(args.threads):
    t = threading.Thread(target=worker, args=(queue,), name='Worker %u' % i)
    t.daemon = True
    t.start()

# Wait until every queued job has been marked done, then report the tally.
queue.join()
print(statuses)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment