Skip to content

Instantly share code, notes, and snippets.

Last active February 19, 2024 03:06
Show Gist options
  • Save typehorror/a34563fb078d9d2d15e8 to your computer and use it in GitHub Desktop.
Save typehorror/a34563fb078d9d2d15e8 to your computer and use it in GitHub Desktop.
Simple Website Crawler (in python)

Simple Website Crawler

The following gist is an extract of the article Building a simple crawler. It crawls a website starting from a given URL, following local links for a given number of bounces (the crawl depth).

Basic Usage

from crawler import Crawler
crawler = Crawler()
# displays the urls
print crawler.content[''].keys()

Advanced Usage

The following uses a cache (an SQLite database, crawler.db) and crawls to a depth of 3 from the home page. The no_cache parameter prevents '/' from being cached, forcing a fresh pull of the homepage each time the crawler is launched.

import re
from crawler import Crawler, CrawlerCache
crawler = Crawler(CrawlerCache('crawler.db'), depth=3)
crawler.crawl('', no_cache=re.compile('^/$').match)
# displays the urls
print crawler.content[''].keys()
# -*- coding: utf-8 -*-
# filename:
import sqlite3
import urllib2
from HTMLParser import HTMLParser
from urlparse import urlparse
class HREFParser(HTMLParser):
    """
    Parser that extracts hrefs

    After ``feed()`` has been called, ``hrefs`` holds the non-empty ``href``
    value of every ``<a>`` start tag seen by this parser instance.
    """

    def __init__(self, *args, **kwargs):
        # Explicit base-class call instead of super() so this works whether
        # or not the HTMLParser base is a new-style class.
        HTMLParser.__init__(self, *args, **kwargs)
        # One set per instance: a class-level set would be shared by every
        # parser and leak links from one page into the next.
        self.hrefs = set()

    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` tag; every other tag is ignored."""
        if tag != 'a':
            return
        href = dict(attrs).get('href')
        # Anchors without an href (or with an empty one) carry no link.
        if href:
            self.hrefs.add(href)
def get_local_links(html, domain):
    """
    Read through HTML content and return the set of link paths
    internal to the given domain

    html: the page markup to scan for ``<a href=...>`` links
    domain: the network location (host) that counts as "local"

    Only the path component of each link is kept; query strings and
    fragments are dropped on purpose.
    """
    parser = HREFParser()
    parser.feed(html)

    local_paths = set()
    for href in parser.hrefs:
        u_parse = urlparse(href)
        if u_parse.netloc:
            # Absolute or scheme-relative ("//host/path") link: only keep
            # the local urls. Checking netloc first stops "//other.host/x"
            # from passing as a root-relative path.
            is_local = u_parse.netloc == domain
        else:
            # No host given: root-relative paths belong to this domain.
            # Anything else (relative paths, mailto:, ...) is skipped.
            is_local = href.startswith('/')
        if is_local:
            # purposefully using path, no query, no hash
            local_paths.add(u_parse.path)
    return local_paths
class CrawlerCache(object):
    """
    Crawler data caching per relative URL and domain.

    Pages are stored in the ``sites`` table (domain, url, content) of an
    SQLite database.
    """

    def __init__(self, db_file):
        """
        db_file: path of the SQLite database file, created if missing
        """
        self.conn = sqlite3.connect(db_file)
        self.cursor = self.conn.cursor()
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS sites
            (domain text, url text, content text)''')
        self.conn.commit()

    def set(self, domain, url, data):
        """
        store the content for a given domain and relative url

        Any content previously stored for the same domain and url is
        replaced, so ``get`` always returns the most recent value.
        """
        # The connection context manager commits on success and rolls back
        # on error, so the delete and insert are applied together and the
        # stored page outlives this connection.
        with self.conn:
            self.cursor.execute(
                "DELETE FROM sites WHERE domain=? and url=?",
                (domain, url))
            self.cursor.execute(
                "INSERT INTO sites VALUES (?,?,?)",
                (domain, url, data))

    def get(self, domain, url):
        """
        return the content for a given domain and relative url,
        or None when nothing is stored for it
        """
        self.cursor.execute(
            "SELECT content FROM sites WHERE domain=? and url=?",
            (domain, url))
        row = self.cursor.fetchone()
        if row is None:
            return None
        return row[0]

    def get_urls(self, domain):
        """
        return all the URLS within a domain
        """
        self.cursor.execute("SELECT url FROM sites WHERE domain=?", (domain,))
        # fetchall (rather than yielding row by row) so the shared cursor
        # is free again as soon as this call returns.
        return [row[0] for row in self.cursor.fetchall()]
class Crawler(object):
    """
    Crawl the pages of one domain, following local links for a limited
    number of bounces, and keep the retrieved HTML in ``content``.

    ``content`` maps a domain to a dict of {relative url: html}.
    """

    def __init__(self, cache=None, depth=2):
        """
        depth: how many time it will bounce from page one (optional)
        cache: a basic cache controller (optional)
        """
        self.depth = depth
        self.content = {}
        self.cache = cache
        # Set by crawl(); initialised here so the other methods can be
        # called on a fresh instance without raising AttributeError.
        self.domain = None
        self.scheme = None
        self.no_cache = None

    def crawl(self, url, no_cache=None):
        """
        url: where we start crawling, should be a complete URL
             (scheme, domain and path)
        no_cache: function returning True if the url should be refreshed
        """
        u_parse = urlparse(url)
        self.domain = u_parse.netloc
        self.content[self.domain] = {}
        self.scheme = u_parse.scheme
        self.no_cache = no_cache
        self._crawl([u_parse.path], self.depth)

    def set(self, url, html):
        """
        Remember the html of a relative url for the current domain and
        store it in the cache when the url is cacheable.
        """
        self.content[self.domain][url] = html
        if not self.is_cacheable(url):
            return
        # Skip the write when the cache already holds this exact page
        # (typically because it was just read from it), so re-crawling
        # does not keep re-inserting the same content.
        if self.cache.get(self.domain, url) != html:
            self.cache.set(self.domain, url, html)

    def get(self, url):
        """
        Return the html of a relative url, from the cache when allowed
        and available, from the network otherwise.
        """
        page = None
        if self.is_cacheable(url):
            page = self.cache.get(self.domain, url)
        if page is None:
            return self.curl(url)
        print("cached url... [%s] %s" % (self.domain, url))
        return page

    def is_cacheable(self, url):
        """
        Return True when a cache is configured and the no_cache function
        (if any) does not ask for this url to be refreshed.
        """
        if self.cache is None:
            return False
        # Without a no_cache function every url may be cached.
        return self.no_cache is None or not self.no_cache(url)

    def _crawl(self, urls, max_depth):
        """
        Retrieve the given relative urls, then the local links found in
        them, and so on for at most max_depth rounds.
        """
        pages = self.content[self.domain]
        pending = urls
        for _ in range(max_depth or 0):
            if not pending:
                break
            found = set()
            for url in pending:
                # do not crawl twice the same page; content is keyed by
                # domain first, so look inside the current domain's pages
                if url in pages:
                    continue
                html = self.get(url)
                self.set(url, html)
                found.update(get_local_links(html, self.domain))
            pending = found

    def curl(self, url):
        """
        return content at url.
        return empty string if response raise an HTTPError (not found, 500...)
        """
        print("retrieving url... [%s] %s" % (self.domain, url))
        req = urllib2.Request('%s://%s%s' % (self.scheme, self.domain, url))
        try:
            response = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            print("error [%s] %s: %s" % (self.domain, url, e))
            return ''
        try:
            # Non-ASCII bytes are dropped rather than raising.
            return response.read().decode('ascii', 'ignore')
        finally:
            response.close()
# filename:
import re
from crawler import Crawler, CrawlerCache
if __name__ == "__main__":
    # Using SQLite as a cache to avoid pulling twice
    crawler = Crawler(CrawlerCache('crawler.db'))
    # Matches only the root path, so the homepage is always re-fetched.
    root_re = re.compile('^/$').match
    # NOTE(review): the start URLs are empty strings in this copy of the
    # script; replace each with the complete URL (scheme and domain) of a
    # site to crawl before running.
    start_urls = ('', '', '', '', '')
    for start_url in start_urls:
        crawler.crawl(start_url, no_cache=root_re)
Copy link

elythi0n commented Mar 8, 2017


Copy link

How to run the program at the command prompt Can you help me
Can I show it in interface !?

Copy link

vasug30 commented Jul 27, 2017

i am getting the below as an error.Could you please help me through it.

[vgupta@localhost ~]$ python
retrieving url... [] /
Traceback (most recent call last):
File "", line 7, in
File "/home/vgupta/", line 60, in crawl
File "/home/vgupta/", line 83, in _crawl
n_urls = n_urls.union(get_local_links(html,self.domain))
NameError: global name 'get_local_links' is not defined

Copy link

I'm getting the following error plz help
and thanks in advance : )

Traceback (most recent call last):
File "C:\Python27\lib\", line 62, in
import os
File "C:\Python27\lib\", line 400, in
import UserDict
File "C:\Python27\lib\", line 116, in
import _abcoll
File "C:\Python27\", line 11, in
from abc import ABCMeta, abstractmethod
File "C:\Users\electronicpeople\PycharmProjects\Sample\", line 2, in
from crawler import Crawler, CrawlerCache
ImportError: No module named crawler

Copy link


Anyone plz help with the following error it is very much urgent....

Traceback (most recent call last):
File "C:\Python27\lib\", line 62, in
import os
File "C:\Python27\lib\", line 400, in
import UserDict
File "C:\Python27\lib\", line 116, in
import _abcoll
File "C:\Python27\", line 11, in
from abc import ABCMeta, abstractmethod
File "C:\Users\electronicpeople\PycharmProjects\Sample\", line 2, in
from crawler import Crawler, CrawlerCache
ImportError: No module named crawler

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment