Simple crawler written in Python
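The gist consists of two files: a small settings module and the crawler itself. Starting from a seed URL, the crawler fetches pages with a pool of gevent greenlets, respects robots.txt and meta-robots nofollow, extracts links from a, form, frame and iframe tags, and logs every discovered URL to a file.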
config.py, imported by the crawler as config:
# default starting url
seed = "http://opera.com/"
# number of worker greenlets
workers = 20
# max number of discovered URLs, None if no limit
limit = 10000
# user-agent name and logging name
name = 'crawlerlog'
# allowed url schemes
allowed_schemes = ['http', 'https']
# tags to follow and the attribute that carries the URL
urltags = {'a': 'href', 'form': 'action', 'frame': 'src', 'iframe': 'src'}
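To point the crawler at a different site, only the values need to change; the crawler expects exactly these six variable names. A minimal sketch of an alternative config.py for a small test crawl (all values below are illustrative):

# example config.py for a small test crawl (illustrative values)
seed = "http://example.com/"
workers = 5
limit = 500
name = 'testcrawl'                       # log output goes to testcrawl.log
allowed_schemes = ['http', 'https']
urltags = {'a': 'href'}                  # follow plain links only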
The crawler script itself:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import urllib2
import urlparse
import logging
from robotparser import RobotFileParser
from HTMLParser import HTMLParser
from gevent import Timeout, Greenlet
import gevent.pool as GeventPool
import gevent.queue as GeventQueue
import gevent.event as GeventEvent
import gevent.monkey
import config

gevent.monkey.patch_all()
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    filename=config.name + '.log', level=logging.INFO)
log = logging.getLogger()

discovered = set()   # every URL discovered so far
permissions = {}     # cached RobotFileParser per domain


class LogException(Exception):
    def __init__(self, val, url):
        self.value = val
        self.url = url

    def __str__(self):
        return (self.value + ': ' + self.url)
class ProcessPage(HTMLParser):
    def __init__(self, url, queue):
        HTMLParser.__init__(self)
        self.url = url
        self.queue = queue
        log.info('Parsing ' + self.url)

    def normalize_url(self, url):
        # resolve relative links against the current page and drop fragments
        url = urlparse.urljoin(self.url, url)
        url = urlparse.urldefrag(url)[0].lower()
        return url

    def can_follow(self, attrs):
        # obey <meta name="robots" content="nofollow">
        attrs = dict(attrs)
        if attrs.get('name') == 'robots' and 'content' in attrs:
            content = [x.strip() for x in attrs['content'].split(',')]
            if 'nofollow' in content and 'follow' not in content:
                return False
        return True

    def has_allowed_scheme(self, url):
        return (urlparse.urlparse(url).scheme in config.allowed_schemes)

    def handle_starttag(self, tag, attrs):
        if tag == 'meta' and not self.can_follow(attrs):
            raise LogException('Nofollow', self.url)
        if tag not in config.urltags:
            return
        attrs = dict(attrs)
        if config.urltags[tag] not in attrs:
            return
        url = self.normalize_url(attrs[config.urltags[tag]])
        if self.has_allowed_scheme(url) and (url not in discovered):
            log.info(self.url + ' -> ' + url)
            discovered.add(url)
            self.queue.put(url)
class Extractor(Greenlet):
    def __init__(self, url, queue):
        Greenlet.__init__(self)
        self.url = url
        self.queue = queue

    def can_read(self):
        # check robots.txt, caching one parser per domain
        domain = urlparse.urlparse(self.url).netloc
        robot_url = urlparse.urljoin('http://' + domain, 'robots.txt')
        try:
            if domain not in permissions:
                robot = RobotFileParser()
                robot.set_url(robot_url)
                robot.read()
                permissions[domain] = robot
            res = permissions[domain].can_fetch('*', self.url)
        except Exception:
            raise LogException('RobotError', robot_url)
        return res

    def get_source_code(self):
        if not self.can_read():
            raise LogException('ProtectedAddress', self.url)
        try:
            opener = urllib2.urlopen(self.url)
        except urllib2.HTTPError:
            raise LogException('HTTPError', self.url)
        except urllib2.URLError:
            raise LogException('URLError', self.url)
        opener_type = opener.info().gettype()
        if opener_type != "text/html":
            raise LogException('InvalidFormatException', self.url)
        content = opener.read().decode('ascii', 'ignore')
        return content

    def extract_links(self):
        # fetch the page and feed it to the HTML parser, 10 s budget per page
        with Timeout(10):
            data = self.get_source_code()
            if not data:
                return set()
            parser = ProcessPage(self.url, self.queue)
            parser.feed(data)
            parser.close()
class Crawler(object):
    def __init__(self):
        self.pool = GeventPool.Pool(config.workers)
        self.frontier = GeventQueue.Queue()
        self.flag_finished = GeventEvent.Event()
        self.start()

    def start(self):
        discovered.add(config.seed)
        self.frontier.put(config.seed)
        gevent.spawn(self.scheduler).join()

    def scheduler(self):
        while True:
            try:
                url = self.frontier.get_nowait()
            except GeventQueue.Empty:
                if self.pool.free_count() != self.pool.size:
                    # workers still running: wait for one to finish
                    self.flag_finished.wait()
                    self.flag_finished.clear()
                else:
                    # frontier empty and no worker running: crawl is done
                    self.pool.join()
                    return
            if url is not None and (config.limit is None or
                                    len(discovered) <= config.limit):
                self.pool.spawn(self.worker, url)
            url = None

    def worker(self, url):
        try:
            Extractor(url, self.frontier).extract_links()
        except LogException as ex:
            log.warning(ex)
        except Timeout as t:
            log.info('Timeout: ' + str(t))
        self.flag_finished.set()
def main():
    log.info('[-- Hello, world! --]')
    try:
        print '[-- Crawling in progress --]'
        Crawler()
    except KeyboardInterrupt:
        log.info('KeyboardInterrupt')
    except Exception as ex:
        log.exception(ex)
    finally:
        log.info('[-- Goodbye, world! --]\n')


if __name__ == "__main__":
    main()
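Both pieces run on Python 2 with gevent installed. Assuming the settings module is saved as config.py and the script above as crawler.py (the script name is not fixed by the gist; crawler.py is only an assumption here), a crawl is started with python crawler.py and progress is written to crawlerlog.log. The extraction step can also be exercised on its own, roughly like this:

# hypothetical standalone check of the link-extraction step (Python 2)
# assumes the script above was saved as crawler.py next to config.py
import gevent.queue
import crawler

frontier = gevent.queue.Queue()
crawler.Extractor('http://opera.com/', frontier).extract_links()
while not frontier.empty():
    print frontier.get()   # URLs found on the seed page, also logged to crawlerlog.log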