@jwickett
Created December 22, 2009 06:32
A multi-threaded Web crawler implemented in Python
import threading, urllib, urlparse
from HTMLParser import HTMLParser
import sys


class LinkHTMLParser(HTMLParser):
    A_TAG = "a"
    HREF_ATTRIBUTE = "href"

    def __init__(self):
        self.links = []
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Collect the value of every 'href' attribute found on an 'a' tag."""
        if tag == self.A_TAG:
            for (key, value) in attrs:
                if key == self.HREF_ATTRIBUTE:
                    self.links.append(value)

    def handle_endtag(self, tag):
        pass


class CrawlerThread(threading.Thread):
    def __init__(self, binarySemaphore, url, crawlDepth):
        self.binarySemaphore = binarySemaphore
        self.url = url
        self.crawlDepth = crawlDepth
        self.threadId = hash(self)  # use the object hash as a printable thread id
        threading.Thread.__init__(self)

    def run(self):
        """Print all of the links found at this thread's URL. Hold the shared
        binary semaphore while writing to STDOUT so that the threads' output
        does not interleave."""
        socket = urllib.urlopen(self.url)
        urlMarkUp = socket.read()
        linkHTMLParser = LinkHTMLParser()
        linkHTMLParser.feed(urlMarkUp)
        self.binarySemaphore.acquire()  # wait if another thread holds the semaphore
        print "Thread #%d: Reading from %s" % (self.threadId, self.url)
        print "Thread #%d: Crawl Depth = %d" % (self.threadId, self.crawlDepth)
        print "Thread #%d: Retrieved the following links..." % self.threadId
        urls = []
        for link in linkHTMLParser.links:
            link = urlparse.urljoin(self.url, link)  # resolve relative links
            urls.append(link)
            print "\t" + link
        print ""
        self.binarySemaphore.release()
        # Keep spawning crawler threads until the crawl depth drops to 1
        if self.crawlDepth > 1:
            for url in urls:
                CrawlerThread(self.binarySemaphore, url, self.crawlDepth - 1).start()


if __name__ == "__main__":
    binarySemaphore = threading.Semaphore(1)
    urls = [("http://www.google.com", 1), ("http://www.twitter.com", 2),
            ("http://www.facebook.com", 1), ("http://www.cnn.com", 1),
            ("http://www.nyt.com", 1), ("http://www.schwab.com", 1),
            ("http://www.bankofamerica.com", 1)]
    for (url, crawlDepth) in urls:
        CrawlerThread(binarySemaphore, url, crawlDepth).start()
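
To see what LinkHTMLParser and urlparse.urljoin produce in isolation, here is a minimal sketch, assuming the LinkHTMLParser class above is in scope (the HTML snippet and base URL are made up for illustration):

import urlparse

parser = LinkHTMLParser()
parser.feed('<a href="/about">About</a> <a href="http://example.com/">Example</a>')
base = "http://host.example/index.html"  # hypothetical page URL
for link in parser.links:
    # relative links resolve against the base; absolute ones pass through
    print urlparse.urljoin(base, link)
# Output:
#   http://host.example/about
#   http://example.com/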
@HardTacos

To avoid ASCII errors, include:

reload(sys)  
sys.setdefaultencoding('utf8')
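
An alternative that avoids changing the process-wide default encoding is to decode the response explicitly in run() before feeding the parser. A sketch, assuming the pages are UTF-8 (undecodable bytes are replaced rather than raising):

urlMarkUp = socket.read().decode('utf-8', 'replace')

Python 2's HTMLParser accepts unicode input, so the decoded string can be fed to the parser directly.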
