Created December 22, 2009 06:32
A multi-threaded Web crawler implemented in Python
import threading, urllib, urlparse
from HTMLParser import HTMLParser

class LinkHTMLParser(HTMLParser):
    """Collect every 'href' attribute found on an 'a' tag into self.links."""
    A_TAG = "a"
    HREF_ATTRIBUTE = "href"

    def __init__(self):
        self.links = []
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Add all 'href' links within 'a' tags to self.links"""
        if tag == self.A_TAG:
            for (key, value) in attrs:
                if key == self.HREF_ATTRIBUTE:
                    self.links.append(value)

class CrawlerThread(threading.Thread):
    def __init__(self, binarySemaphore, url, crawlDepth):
        self.binarySemaphore = binarySemaphore
        self.url = url
        self.crawlDepth = crawlDepth
        self.threadId = hash(self)
        threading.Thread.__init__(self)

    def run(self):
        """Print out all of the links on this thread's URL. Acquire the shared
        binary semaphore before writing to STDOUT so that output from
        different threads does not interleave."""
        socket = urllib.urlopen(self.url)
        urlMarkUp = socket.read()
        linkHTMLParser = LinkHTMLParser()
        linkHTMLParser.feed(urlMarkUp)
        self.binarySemaphore.acquire()  # wait if another thread has acquired and not yet released the semaphore
        print "Thread #%d: Reading from %s" % (self.threadId, self.url)
        print "Thread #%d: Crawl Depth = %d" % (self.threadId, self.crawlDepth)
        print "Thread #%d: Retrieved the following links..." % self.threadId
        urls = []
        for link in linkHTMLParser.links:
            # Resolve relative links against the page URL before printing
            link = urlparse.urljoin(self.url, link)
            urls.append(link)
            print "\t" + link
        print ""
        self.binarySemaphore.release()
        # Keep crawling the discovered URLs until the crawl depth drops below 1
        if self.crawlDepth > 1:
            for url in urls:
                CrawlerThread(self.binarySemaphore, url, self.crawlDepth - 1).start()

if __name__ == "__main__":
    binarySemaphore = threading.Semaphore(1)
    urls = [("http://www.google.com", 1), ("http://www.twitter.com", 2),
            ("http://www.facebook.com", 1), ("http://www.cnn.com", 1),
            ("http://www.nyt.com", 1), ("http://www.schwab.com", 1),
            ("http://www.bankofamerica.com", 1)]
    for (url, crawlDepth) in urls:
        CrawlerThread(binarySemaphore, url, crawlDepth).start()
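
The gist is written for Python 2 (print statements, urllib.urlopen, the HTMLParser module) and will not run on a modern interpreter. Below is a minimal sketch of the same crawler ported to Python 3, using html.parser, urllib.request, and urllib.parse. The port itself, the utf-8 "replace" decoding, and the snake_case names are my assumptions, not part of the original gist.

# A minimal Python 3 sketch of the crawler above (an assumption; the
# original targets Python 2). html.parser, urllib.request and
# urllib.parse replace the Python 2 modules.
import threading
import urllib.parse
import urllib.request
from html.parser import HTMLParser

class LinkHTMLParser(HTMLParser):
    """Collect every non-empty 'href' attribute found on an 'a' tag."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for key, value in attrs:
                if key == "href" and value:
                    self.links.append(value)

class CrawlerThread(threading.Thread):
    def __init__(self, semaphore, url, crawl_depth):
        super().__init__()
        self.semaphore = semaphore
        self.url = url
        self.crawl_depth = crawl_depth

    def run(self):
        # Bytes must be decoded in Python 3; assuming utf-8 with 'replace'
        # so an undecodable page does not kill the thread.
        markup = urllib.request.urlopen(self.url).read().decode("utf-8", "replace")
        parser = LinkHTMLParser()
        parser.feed(markup)
        urls = [urllib.parse.urljoin(self.url, link) for link in parser.links]
        with self.semaphore:  # serialize writes to stdout between threads
            print("%s: read %s (depth %d)" % (self.name, self.url, self.crawl_depth))
            for url in urls:
                print("\t" + url)
        if self.crawl_depth > 1:
            for url in urls:
                CrawlerThread(self.semaphore, url, self.crawl_depth - 1).start()

if __name__ == "__main__":
    semaphore = threading.Semaphore(1)
    CrawlerThread(semaphore, "http://www.google.com", 1).start()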