Created December 9, 2009 19:51
An attempt at a very simple, breadth-first web crawler. As the test file shows, it was supposed to work on Wikipedia; however, it doesn't (Wikipedia seems to send HTML that doesn't actually contain the article content).
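One quick way to check the claim about missing article content is to fetch a page directly and compare responses. The sketch below is not part of the gist, and the User-Agent idea is only an assumption (Wikipedia has been known to filter the default Python-urllib agent):

# diagnostic sketch (not part of the gist): see what Wikipedia actually returns
# to a plain urllib fetch versus a request that sends an explicit User-Agent.
import urllib
import urllib2

url = 'http://en.wikipedia.org/wiki/United_Nations'

plain = urllib.urlopen(url).read()
print('plain urllib fetch: %d bytes' % len(plain))

# assumption: the missing content is caused by Wikipedia filtering the default
# Python-urllib User-Agent; sending our own header may change the response
req = urllib2.Request(url, headers={'User-Agent': 'simple-bfs-crawler/0.1'})
withAgent = urllib2.urlopen(req).read()
print('urllib2 fetch with User-Agent: %d bytes' % len(withAgent))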
# linkparser.py -- HTMLParser subclass that collects the href of every <a> tag it sees
from HTMLParser import HTMLParser

class LinkParser(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    #end __init__()

    def reset(self):
        HTMLParser.reset(self)
        self.links = []
    #end reset()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    self.links.append(attr[1])
    # end handle_starttag()
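A minimal sanity check of LinkParser on an inline snippet (not part of the gist; the markup and hrefs below are made up for illustration):

from linkparser import LinkParser

lp = LinkParser()
lp.feed('<p><a href="http://example.com/">example</a> and <a href="/wiki/Example">relative</a></p>')
print(lp.links)    # ['http://example.com/', '/wiki/Example']

# reset() clears the collected links so one parser instance can be reused
lp.reset()
print(lp.links)    # []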
# test script: exercise LinkParser on a known page, then try the Wikipedia crawler
from __future__ import print_function   # so the bare print() below emits a blank line under Python 2
import linkparser
import urllib
from wikicrawler import WikiCrawler

lp = linkparser.LinkParser()
f = urllib.urlopen('http://docs.python.org/library/urllib.html')
lp.feed(f.read())
print(lp.links)

print('Testing wikicrawler...')
print()

search = 'disambiguation'
filterRegex = 'wikimediafoundation.org|wikipedia.org'
print("Searching for '" + search + "' filtering by '" + filterRegex + "'")

c = WikiCrawler('http://en.wikipedia.org/wiki/United_Nations', filterRegex)
c.crawlTo(search)
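crawlTo() returns the number of pages processed before a match (or None if the queue empties), so the final call above could capture its result; a small variation:

pagesProcessed = c.crawlTo(search)
if pagesProcessed is not None:
    print("Match found after processing " + str(pagesProcessed) + " pages")
else:
    print("Queue exhausted without a match")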
# wikicrawler.py -- breadth-first crawler built on LinkParser
from linkparser import LinkParser
import urllib
import re

class WikiCrawler:

    def __init__(self, url, filterRegex=''):
        self.startUrl = url
        self.filterRegex = re.compile(filterRegex)

    def crawlTo(self, regex):
        """Crawls breadth-first from startUrl and returns the number of pages
        processed up to and including the first URL that matches regex,
        or None if the queue empties without a match."""
        curUrl = self.startUrl
        queue = [curUrl]
        visitedPages = [curUrl]
        counter = 0
        matcher = re.compile(regex)
        while len(queue) > 0:
            counter += 1
            curUrl = queue.pop(0)
            print("Processing '" + self.getPlainUrl(curUrl) + "'...")
            if matcher.search(self.getPlainUrl(curUrl)) is not None:
                return counter
            # add the links from the current node onto the queue
            links = self.getLinks(curUrl)
            for link in links:
                if self.filterUrl(link) and link not in visitedPages:
                    queue.append(link)
                    visitedPages.append(link)
        #end while
    #end crawlTo()

    def filterUrl(self, url):
        """Returns True if the url should be processed."""
        allow = True
        plainUrl = self.getPlainUrl(url)
        if not url.startswith("http://"):
            allow = False
        if self.filterRegex.search(plainUrl) is None:
            allow = False
        return allow
    #end filterUrl()

    def getLinks(self, url):
        parser = LinkParser()
        if url is not None and len(url) > 0:
            try:
                f = urllib.urlopen(url)
                parser.feed(f.read())
            except Exception:
                # ignore pages that fail to download or parse
                pass
        #end if
        return parser.links
    #end getLinks()

    def getPlainUrl(self, url):
        """Returns url with any #targets or ?query arguments removed"""
        plainUrl = url.partition('#')[0]
        plainUrl = plainUrl.partition('?')[0]
        plainUrl = urllib.unquote(plainUrl)
        return plainUrl
    #end getPlainUrl()
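One likely reason the Wikipedia crawl stalls, beyond the response issue mentioned in the description: article links in Wikipedia's HTML are site-relative ('/wiki/...'), and filterUrl() rejects anything that doesn't start with 'http://'. Below is a sketch of one possible adjustment, resolving links against the page they came from with urlparse.urljoin; this is an assumption about the intended fix, not part of the gist:

from urlparse import urljoin    # Python 2 location of urljoin

def resolveLinks(baseUrl, links):
    """Turn relative hrefs into absolute URLs against the page they were found on."""
    return [urljoin(baseUrl, link) for link in links]

# inside crawlTo(), the queueing step could then become:
#   links = resolveLinks(curUrl, self.getLinks(curUrl))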