A simple web crawler.
###############################################################################
# $Author: [email protected]
# $Expects: Python version 3.4.3
###############################################################################
"""Crawls a given URL and presents a simple sitemap based upon which
webpages are found. Each page is given as an individual WebResource
with a URL, a title and a list of links within the page.

From the command line:

    $ python crawl.py www.example.com

From within python:

    >>> from sitemap import crawl
    >>> for resource in crawl('www.example.com'):
    ...     print(resource)
    ...
    >>>
"""
###############################################################################
try:
    from argparse import ArgumentParser
    from urllib.parse import urlparse, urljoin
    from urllib.request import urlopen
    import re
except ImportError:
    print("Import Error: Expects Python 3.4.3")
    exit(1)
###############################################################################
class WebResource:
    """ Stores the title and links found at a crawled URL."""

    def __init__(self, url, title):
        """ A resource has a URL and a Title. Links are normally added
        via the add method.
        """
        self.url = url
        self.parsed_url = urlparse(url)
        self.title = title
        self.links = []

    def add(self, link_list):
        """ Takes a list of links, converts them to URLs based upon the
        WebResource's URL and appends them to the WebResource if they
        successfully pass the _filter method.
        """
        url_list = [urljoin(self.url, link) for link in link_list]
        self.links.extend(sorted(set(self._filter(url_list))))
    def _filter(self, url_list):
        """ Returns the URLs in 'url_list' that share the resource's
        hostname but have a different path. The effect is to filter out
        links within the same page and links to different hosts.

        Overload this method to modify what links are recorded against a
        WebResource.
        """
        non_page = lambda url: self.parsed_url.path != url.path
        local = lambda url: self.parsed_url.hostname == url.hostname
        suitable = lambda url: non_page(url) and local(url)
        return [url for url in url_list if suitable(urlparse(url))]
    def __str__(self):
        if len(self.links) == 0:
            link_str = ''
        else:
            link_str = '\t Links: {}'.format('\n\t\t'.join(self.links))
        return '{}\n\t Title: {}\n{}'.format(self.url,
                                             self.title,
                                             link_str)
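
# For illustration, a doctest-style sketch of how WebResource filters links
# (made-up URLs; only same-host links with a new path are recorded):
#
#     >>> r = WebResource('http://www.example.com/', 'Example')
#     >>> r.add(['/about', '#top', 'http://other.example/page'])
#     >>> r.links
#     ['http://www.example.com/about']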
###############################################################################
class CrawlQueue:
    """ Queues uncrawled links. """

    def __init__(self, start_url):
        """ Prepares a queue for a crawl starting from 'start_url'. """
        self._queue = []  # uncrawled URLs
        self._all = []    # stored to prevent duplicates
        self.insert([start_url])
    def __iter__(self):
        return self

    def __next__(self):
        """ Iterate until the queue is empty. """
        try:
            return self._queue.pop(0)
        except IndexError:
            raise StopIteration()

    def insert(self, url_list):
        """ Adds unique URLs in the given URL list to the queue. """
        unique = [url for url in set(url_list) if url not in self._all]
        self._all.extend(unique)
        self._queue.extend(unique)
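
# For illustration, a quick sketch of the queue's de-duplication
# (made-up URLs; the start URL is not queued a second time):
#
#     >>> q = CrawlQueue('http://www.example.com/')
#     >>> q.insert(['http://www.example.com/', 'http://www.example.com/about'])
#     >>> list(q)
#     ['http://www.example.com/', 'http://www.example.com/about']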
###############################################################################
def _is_html(response):
    """ Returns True for both HTML and XHTML HTTP responses. """
    content = response.getheader('Content-Type', default='Unknown')
    return 'text/html' in content or 'application/xhtml+xml' in content
##############################
def _get_title_from(html):
    """ Returns the content of the first <title> tag in 'html'."""
    result = re.search(r'<title>(.*)</title>', html, re.IGNORECASE)
    if result:
        return result.group(1)
    else:
        return ''
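
# For example (made-up markup), the match is case-insensitive:
#
#     >>> _get_title_from('<html><head><TITLE>Example</TITLE></head></html>')
#     'Example'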
##############################
def _get_links_from(html):
    """ Returns a list of the values of the href attribute from any
    anchor tags in 'html'.
    """
    return re.findall(r'<a[^>]+href=[\'"]?([^\'" >]+)', html, re.IGNORECASE)
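
# For example (made-up markup), quoted and unquoted href values are caught:
#
#     >>> _get_links_from('<a href="/about">About</a> <A HREF=faq.html>FAQ</A>')
#     ['/about', 'faq.html']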
##############################
def _get_root_url(url):
    """ Returns 'url' ensuring that it has a scheme and a path. """
    if '//' not in url:  # ensure proper scheme or netloc parsing
        return _get_root_url('//' + url)
    full_url = urlparse(url, 'http')
    if full_url.path == '':  # prevent duplicate visits to /
        return full_url.geturl() + '/'
    else:
        return full_url.geturl()
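
# For example (made-up addresses), a bare hostname gains a scheme and a path:
#
#     >>> _get_root_url('www.example.com')
#     'http://www.example.com/'
#     >>> _get_root_url('https://www.example.com/about')
#     'https://www.example.com/about'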
##############################
def crawl(root_url, resource_type=WebResource):
    """ Crawls the URL at 'root_url'. Yields a 'resource_type' for
    each resource crawled after adding any extracted links.
    Successfully added links are followed.
    """
    queue = CrawlQueue(_get_root_url(root_url))
    for url in queue:
        try:
            response = urlopen(url)
        except Exception:
            continue  # skip URLs that cannot be opened
        if response.status == 200 and _is_html(response):
            html = response.read().decode()
            resource = resource_type(url, _get_title_from(html))
            resource.add(_get_links_from(html))
            queue.insert(resource.links)
            yield resource
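
# A minimal sketch of customising the crawl: overriding _filter() on a
# WebResource subclass changes which links are recorded and followed.
# 'SameHostResource' below is a hypothetical name used only for illustration;
# it keeps every link on the same host, including in-page fragments:
#
#     class SameHostResource(WebResource):
#         def _filter(self, url_list):
#             same_host = lambda url: urlparse(url).hostname == self.parsed_url.hostname
#             return [url for url in url_list if same_host(url)]
#
#     for resource in crawl('www.example.com', resource_type=SameHostResource):
#         print(resource)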
###############################################################################
if __name__ == '__main__':
    # only one argument, the URL
    parser = ArgumentParser(description='Crawls a URL to obtain a sitemap.')
    parser.add_argument('URL', help='The URL to crawl.')
    args = parser.parse_args()
    for resource in crawl(args.URL):
        print(resource)
# eof