Last active
June 9, 2016 14:35
-
-
Save scottstamp/e585f7245caadc5dc0c8 to your computer and use it in GitHub Desktop.
Utility for checking anchor links for the Docker documentation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Check anchor links in the Docker documentation.

Starting from ``root``, the script fetches pages, follows the links it
finds, collects every ``id`` attribute and every ``#fragment`` referenced
by a link, and finally prints the fragments for which no matching ``id``
was seen.
"""

__author__ = "Scott Stamp <[email protected]>"

from HTMLParser import HTMLParser
from urlparse import urljoin
from sys import setrecursionlimit
import re

import requests

# Linked pages are fetched and parsed from inside the parser callback of the
# page that links to them, so the crawl nests one set of stack frames per
# page; the default recursion limit is raised to make room for that.
setrecursionlimit(10000)

# Base URL of the site to crawl.
root = 'http://localhost:8000'
class DataHolder(object):
    """Store one value under a configurable attribute name.

    ``set`` (and calling the instance, which is the same thing) stores the
    value and returns it, so a result can be saved and tested in a single
    expression::

        holder = DataHolder(attr_name='match')
        if holder(compute()):
            use(holder.match)

    Args:
        value: Initial value to store. Defaults to ``None``.
        attr_name: Name of the instance attribute that holds the value.
            Defaults to ``'value'``.
    """

    def __init__(self, value=None, attr_name='value'):
        self._attr_name = attr_name
        self.set(value)

    def __call__(self, value):
        """Store ``value`` and return it (shorthand for ``set``)."""
        return self.set(value)

    def set(self, value):
        """Store ``value`` under the configured attribute and return it."""
        setattr(self, self._attr_name, value)
        return value

    def get(self):
        """Return the currently stored value."""
        return getattr(self, self._attr_name)
class Parser(HTMLParser):
    """HTML parser that crawls the site under ``root`` while it parses.

    Feeding a page to a ``Parser`` records the ``id`` attributes and link
    fragments found on it and, for every not-yet-seen link that stays on the
    site, fetches the target and feeds it to a new ``Parser`` recursively.

    Args:
        origin: URL of the page being fed to this parser; used as the key
            under which its link fragments are recorded.

    NOTE(review): ``ids`` pools the ids of every crawled page, so a fragment
    is considered present if *any* page defines that id, not necessarily the
    page the link points to.
    """

    # Class-level on purpose: all Parser instances created during the
    # recursive crawl share these collections.
    ids = set()      # every ``id`` attribute value seen so far
    crawled = set()  # page URLs already fetched, without their fragment
    anchors = {}     # origin page URL -> set of fragment names it links to
    pages = set()    # not referenced by the visible code; kept as-is
    # No longer used by the parser itself; kept so existing references to
    # ``Parser.save_match`` keep working.
    save_match = DataHolder(attr_name='match')

    def __init__(self, origin):
        self.origin = origin
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Record the tag's link fragment and id, crawling on-site links."""
        attrs = dict(attrs)
        # A valueless attribute (``<a href>``) is reported as None, which
        # must not reach the string checks below.
        href = attrs.get('href')
        if href and self._is_site_link(href):
            self._record_fragment(href)
            self._crawl(href)
        if 'id' in attrs:
            self.ids.add(attrs['id'])

    @staticmethod
    def _is_site_link(href):
        """Return True for hrefs under ``root``, rooted paths, or fragments."""
        return (href.startswith((root, '/'))
                or re.match(r'#\S', href) is not None)

    def _record_fragment(self, href):
        """Remember the non-empty ``#fragment`` of ``href`` for this page."""
        fragment = href.rpartition('#')[2] if '#' in href else ''
        # ``/page#`` carries no anchor name; there is nothing to verify.
        if fragment:
            self.anchors.setdefault(self.origin, set()).add(fragment)

    def _crawl(self, href):
        """Fetch and parse the page ``href`` points to, once per page."""
        if href.startswith('#'):
            # Same-page link: the current page is already being parsed.
            return
        # Drop the fragment so ``/page#a`` and ``/page#b`` count as one page.
        url = urljoin(root, href).split('#', 1)[0]
        # ``//other.host/x`` starts with '/' but resolves off-site.
        if not url.startswith(root) or url in self.crawled:
            return
        self.crawled.add(url)
        Parser(url).feed(requests.get(url).content)
# Feeding the root page starts the crawl: the parser fetches and parses the
# linked pages itself while handling this one.
r = requests.get(root)
parser = Parser(root)
parser.feed(r.content)

# ``parser.anchors`` maps an origin page URL to the fragment names linked
# from it; report each fragment that matches no collected id. Sorting keeps
# the report order stable between runs.
for anchor in sorted(parser.anchors):
    for anchor_name in sorted(parser.anchors[anchor]):
        if anchor_name not in parser.ids:
            # Single-argument parenthesised form prints the same text
            # whether ``print`` is a statement or a function.
            print('Missing - ({0}): #{1}'.format(
                anchor.replace(root, ''), anchor_name))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Interesting enhancement to DataHolder. AFAICT, the original is here: http://code.activestate.com/recipes/66061/ I was going to ask about the changes, but since you "don't even know how the hell this works", I don't want to put you on the spot. 😉