|
# -*- coding: utf-8 -*- |
|
# filename: crawler.py |
|
|
|
import sqlite3 |
|
import urllib2 |
|
from HTMLParser import HTMLParser |
|
from urlparse import urlparse |
|
|
|
|
|
class HREFParser(HTMLParser):
    """
    Parser that extracts hrefs

    Feed it HTML with ``feed()``; the non-empty ``href`` value of every
    ``<a>`` start tag seen so far is collected in ``self.hrefs`` (a set).
    """

    def __init__(self, *args, **kwargs):
        # Explicit base-class call rather than super(): this file imports the
        # Python 2 ``HTMLParser`` module, whose parser is an old-style class.
        HTMLParser.__init__(self, *args, **kwargs)
        # One set per parser instance. A class-level ``hrefs = set()`` would
        # be shared by every instance, so links found on one page would leak
        # into the results of every page parsed afterwards.
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        """
        Record the href of an anchor tag; missing or empty hrefs are ignored
        """
        if tag != 'a':
            return
        href = dict(attrs).get('href')
        if href:
            self.hrefs.add(href)
|
|
|
|
|
def get_local_links(html, domain):
    """
    Read through HTML content and return the set of link paths
    internal to the given domain

    html: page content fed to an HREFParser
    domain: network location (as in urlparse(...).netloc) considered local

    Only the path component is kept: no query, no hash.
    """
    parser = HREFParser()
    parser.feed(html)

    local_paths = set()
    for href in parser.hrefs:
        parts = urlparse(href)
        if parts.netloc:
            # Absolute or protocol-relative ('//host/path') link: it is local
            # only when it targets our domain. Checking the host first keeps
            # '//other.host/path' from passing the leading-slash test below.
            is_local = parts.netloc == domain
        else:
            # No host given: keep root-relative links only.
            is_local = href.startswith('/')
        if is_local:
            # purposefully using path, no query, no hash
            local_paths.add(parts.path)
    return local_paths
|
|
|
|
|
class CrawlerCache(object):
    """
    Crawler data caching per relative URL and domain.

    Pages are kept in the ``sites`` table (domain, url, content) of the
    SQLite database given to the constructor.
    """

    def __init__(self, db_file):
        """
        db_file: database name handed to sqlite3.connect
        """
        self.conn = sqlite3.connect(db_file)
        self.cursor = self.conn.cursor()
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS sites
            (domain text, url text, content text)''')
        self.conn.commit()

    def set(self, domain, url, data):
        """
        store the content for a given domain and relative url,
        replacing any content previously stored for that pair
        """
        # The table has no uniqueness constraint, so a bare INSERT would pile
        # up one row per call. Delete + insert run in a single transaction:
        # the connection context manager commits on success and rolls back
        # if either statement raises.
        with self.conn:
            self.cursor.execute(
                "DELETE FROM sites WHERE domain=? and url=?", (domain, url))
            self.cursor.execute(
                "INSERT INTO sites VALUES (?,?,?)", (domain, url, data))

    def get(self, domain, url):
        """
        return the content for a given domain and relative url,
        or None when nothing is stored for it
        """
        # Newest row first, so databases that already contain duplicate rows
        # still yield the most recently stored content.
        self.cursor.execute(
            "SELECT content FROM sites WHERE domain=? and url=? "
            "ORDER BY rowid DESC LIMIT 1",
            (domain, url))
        row = self.cursor.fetchone()
        if row:
            return row[0]
        return None

    def get_urls(self, domain):
        """
        return all the URLS within a domain, each listed once
        """
        self.cursor.execute(
            "SELECT DISTINCT url FROM sites WHERE domain=?", (domain,))
        # fetchall (rather than yielding row by row) so the shared cursor is
        # free again as soon as this call returns.
        return [row[0] for row in self.cursor.fetchall()]

    def close(self):
        """
        release the underlying database connection
        """
        self.conn.close()
|
|
|
|
|
class Crawler(object): |
|
def __init__(self, cache=None, depth=2): |
|
""" |
|
depth: how many time it will bounce from page one (optional) |
|
cache: a basic cache controller (optional) |
|
""" |
|
self.depth = depth |
|
self.content = {} |
|
self.cache = cache |
|
|
|
def crawl(self, url, no_cache=None): |
|
""" |
|
url: where we start crawling, should be a complete URL like |
|
'http://www.intel.com/news/' |
|
no_cache: function returning True if the url should be refreshed |
|
""" |
|
u_parse = urlparse(url) |
|
self.domain = u_parse.netloc |
|
self.content[self.domain] = {} |
|
self.scheme = u_parse.scheme |
|
self.no_cache = no_cache |
|
self._crawl([u_parse.path], self.depth) |
|
|
|
def set(self, url, html): |
|
self.content[self.domain][url] = html |
|
if self.is_cacheable(url): |
|
self.cache.set(self.domain, url, html) |
|
|
|
def get(self, url): |
|
page = None |
|
if self.is_cacheable(url): |
|
page = self.cache.get(self.domain, url) |
|
if page is None: |
|
page = self.curl(url) |
|
else: |
|
print "cached url... [%s] %s" % (self.domain, url) |
|
return page |
|
|
|
def is_cacheable(self, url): |
|
return self.cache and self.no_cache \ |
|
and not self.no_cache(url) |
|
|
|
def _crawl(self, urls, max_depth): |
|
n_urls = set() |
|
if max_depth: |
|
for url in urls: |
|
# do not crawl twice the same page |
|
if url not in self.content: |
|
html = self.get(url) |
|
self.set(url, html) |
|
n_urls = n_urls.union(get_local_links(html, self.domain)) |
|
self._crawl(n_urls, max_depth-1) |
|
|
|
def curl(self, url): |
|
""" |
|
return content at url. |
|
return empty string if response raise an HTTPError (not found, 500...) |
|
""" |
|
try: |
|
print "retrieving url... [%s] %s" % (self.domain, url) |
|
req = urllib2.Request('%s://%s%s' % (self.scheme, self.domain, url)) |
|
response = urllib2.urlopen(req) |
|
return response.read().decode('ascii', 'ignore') |
|
except urllib2.HTTPError, e: |
|
print "error [%s] %s: %s" % (self.domain, url, e) |
|
return '' |