Created December 9, 2009 19:51
An attempt at a very simple, breadth-first web crawler. As the test file shows, it was supposed to work on Wikipedia; however, it doesn't (Wikipedia seems to send HTML that doesn't actually contain the article content).
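One quick way to check the claim about missing article content is to fetch a page directly and compare responses. The sketch below is not part of the gist, and the User-Agent idea is only an assumption (Wikipedia has been known to filter the default Python-urllib agent):

# diagnostic sketch (not part of the gist): see what Wikipedia actually returns
# to a plain urllib fetch versus a request that sends an explicit User-Agent.
import urllib
import urllib2

url = 'http://en.wikipedia.org/wiki/United_Nations'

plain = urllib.urlopen(url).read()
print('plain urllib fetch: %d bytes' % len(plain))

# assumption: the missing content is caused by Wikipedia filtering the default
# Python-urllib User-Agent; sending our own header may change the response
req = urllib2.Request(url, headers={'User-Agent': 'simple-bfs-crawler/0.1'})
withAgent = urllib2.urlopen(req).read()
print('urllib2 fetch with User-Agent: %d bytes' % len(withAgent))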
# linkparser.py -- HTMLParser subclass that collects the href of every <a> tag it sees
from HTMLParser import HTMLParser

class LinkParser(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    #end __init__()

    def reset(self):
        HTMLParser.reset(self)
        self.links = []
    #end reset()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    self.links.append(attr[1])
    # end handle_starttag()
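A minimal sanity check of LinkParser on an inline snippet (not part of the gist; the markup and hrefs below are made up for illustration):

from linkparser import LinkParser

lp = LinkParser()
lp.feed('<p><a href="http://example.com/">example</a> and <a href="/wiki/Example">relative</a></p>')
print(lp.links)    # ['http://example.com/', '/wiki/Example']

# reset() clears the collected links so one parser instance can be reused
lp.reset()
print(lp.links)    # []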
# test script: exercise LinkParser on a known page, then try the Wikipedia crawler
from __future__ import print_function   # so the bare print() below emits a blank line under Python 2
import linkparser
import urllib
from wikicrawler import WikiCrawler

lp = linkparser.LinkParser()
f = urllib.urlopen('http://docs.python.org/library/urllib.html')
lp.feed(f.read())
print(lp.links)

print('Testing wikicrawler...')
print()

search = 'disambiguation'
filterRegex = 'wikimediafoundation.org|wikipedia.org'
print("Searching for '" + search + "' filtering by '" + filterRegex + "'")

c = WikiCrawler('http://en.wikipedia.org/wiki/United_Nations', filterRegex)
c.crawlTo(search)
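crawlTo() returns the number of pages processed before a match (or None if the queue empties), so the final call above could capture its result; a small variation:

pagesProcessed = c.crawlTo(search)
if pagesProcessed is not None:
    print("Match found after processing " + str(pagesProcessed) + " pages")
else:
    print("Queue exhausted without a match")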
# wikicrawler.py -- breadth-first crawler built on LinkParser
from linkparser import LinkParser
import urllib
import re

class WikiCrawler:

    def __init__(self, url, filterRegex=''):
        self.startUrl = url
        self.filterRegex = re.compile(filterRegex)

    def crawlTo(self, regex):
        """Crawls breadth-first from startUrl and returns the number of pages
        processed up to and including the first URL that matches regex,
        or None if the queue empties without a match."""
        curUrl = self.startUrl
        queue = [curUrl]
        visitedPages = [curUrl]
        counter = 0
        matcher = re.compile(regex)
        while len(queue) > 0:
            counter += 1
            curUrl = queue.pop(0)
            print("Processing '" + self.getPlainUrl(curUrl) + "'...")
            if matcher.search(self.getPlainUrl(curUrl)) is not None:
                return counter
            # add the links from the current node onto the queue
            links = self.getLinks(curUrl)
            for link in links:
                if self.filterUrl(link) and link not in visitedPages:
                    queue.append(link)
                    visitedPages.append(link)
        #end while
    #end crawlTo()

    def filterUrl(self, url):
        """Returns True if the url should be processed."""
        allow = True
        plainUrl = self.getPlainUrl(url)
        if not url.startswith("http://"):
            allow = False
        if self.filterRegex.search(plainUrl) is None:
            allow = False
        return allow
    #end filterUrl()

    def getLinks(self, url):
        parser = LinkParser()
        if url is not None and len(url) > 0:
            try:
                f = urllib.urlopen(url)
                parser.feed(f.read())
            except Exception:
                # ignore pages that fail to download or parse
                pass
        #end if
        return parser.links
    #end getLinks()

    def getPlainUrl(self, url):
        """Returns url with any #targets or ?query arguments removed"""
        plainUrl = url.partition('#')[0]
        plainUrl = plainUrl.partition('?')[0]
        plainUrl = urllib.unquote(plainUrl)
        return plainUrl
    #end getPlainUrl()
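One likely reason the Wikipedia crawl stalls, beyond the response issue mentioned in the description: article links in Wikipedia's HTML are site-relative ('/wiki/...'), and filterUrl() rejects anything that doesn't start with 'http://'. Below is a sketch of one possible adjustment, resolving links against the page they came from with urlparse.urljoin; this is an assumption about the intended fix, not part of the gist:

from urlparse import urljoin    # Python 2 location of urljoin

def resolveLinks(baseUrl, links):
    """Turn relative hrefs into absolute URLs against the page they were found on."""
    return [urljoin(baseUrl, link) for link in links]

# inside crawlTo(), the queueing step could then become:
#   links = resolveLinks(curUrl, self.getLinks(curUrl))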