Created
February 15, 2012 15:38
-
-
Save ChrisMoney/aa7b005ab37bce81e70a to your computer and use it in GitHub Desktop.
Python --Find Links
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # program: spider.py | |
| # author: aahz | |
| # version: 1.1 | |
| # date: June 2006 | |
| # description: start on command line with URL argument | |
| # Finds pages within a web site. | |
| # These modules do ost of the work | |
| import sys | |
| import urllib2 | |
| import urlparse | |
| import htmllib, formatter | |
| from cStringIO | |
| def log_stdout(msg): | |
| """Print msg to the screen. """ | |
| print msg | |
| def get_page(url, log): | |
| """Retrieve URL and return content, log errors.""" | |
| try: | |
| page = urllib2.urlopen(url) | |
| except urllib.URLError: | |
| log("Error retieving: " + url) | |
| return ' ' | |
| body = page.read() | |
| page.close() | |
| return body | |
| def find_links(html) | |
| """Return a list of links in html.""" | |
| # We're using the parser just to get the HREFs | |
| writer = formatter.DumbWriter(StringIO()) | |
| f = formatter.AbstractFormatter(writer) | |
| parser = htmllib.HTMLParser(f) | |
| parser.feed(html) | |
| parser.close() | |
| return parser.anchorlist | |
| class Spider: | |
| """ | |
| The heart of this program, finds all links within a web site. | |
| run() contains the main loop | |
| process_page() retrieves each page and finds the links """ | |
| def __init__(self, startURL, log=None): | |
| # This method sets initial values | |
| self.URLs = set() | |
| self.URLs.add(startURL) | |
| self.include = startURL | |
| self._links_to_process = [startURL] | |
| if log is None: | |
| # Use log_stdout function if no log provided | |
| self.log = log_stdout | |
| else | |
| self.log = log | |
| def run(self): | |
| # Processes list of URLs one at a time | |
| while sef._links_to_process.pop() | |
| self.log("Retrieving: " + url) | |
| self.process_page(url) | |
| def url_in_site(self, link): | |
| # Checks wheter the link starts with the base URL | |
| return link.startswith(self.include) | |
| def process_page(self, url): | |
| # Retrieves page and finds links in it | |
| html = get_page(url, self.log) | |
| for link in find_links(html) | |
| # Handle relative links | |
| link = urlparse.urljoin(url, link) | |
| self.log("checking: " + link) | |
| # Make sure this is a new URL within current site | |
| if link not in self.URLs and self.url_in_site(link): | |
| self.URLs.add(link) | |
| self._links_to_process.append(link) | |
| if _name_ = = '_main_': | |
| # This code runs when script is started from command line | |
| startURL = sys.argv[1] | |
| spider = Spider(startURL) | |
| spider.run() | |
| for URL in sorted(spider.URLs): | |
| print URL |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment