Hacky spider for naucse.python.cz -- see https://github.com/pyvec/elsa/issues/33
import collections
import os
import re
import sys
from urllib.parse import urlparse, urljoin, urldefrag

import flask
import lxml.html
import requests
from flask_frozen import Freezer
from tidylib import tidy_document

import naucse as the_site
# Tidy warnings that are known and accepted; they should not fail the check.
TIDY_IGNORED_ERRORS = {
    r'^line \d+ column \d+ - Warning: <img> lacks "alt" attribute$',
    r'^line 9 column 9 - Warning: trimming empty <style>$',
    r'^line \d+ column \d+ - Warning: <script> proprietary attribute "(integrity|crossorigin)"$',

    # Notebooks:
    r'^line \d+ column 1 - Warning: <img> discarding newline in URI reference$',
    r'^line \d+ column 1 - Warning: <svg> proprietary attribute ".*:.*"$',
    r'^line \d+ column 1 - Warning: discarding unexpected XML declaration$',
    r'^line \d+ column 1 - Warning: <svg> attribute "(width|height)" has invalid value "\d+\.\d+"$',
    r'^line \d+ column 1 - Warning: <svg> anchor "[a-z0-9]+" already defined$',
    r'^line \d+ column \d+ - Warning: <th> proprietary attribute "halign"$',
}
def test_spider(client, app, check_external_links):
    """Check that all links work

    Spiders the site, making sure all internal links point to existing pages.
    Includes fragments: any #hash in a link must correspond to an existing
    element with that id.

    If check_external_links is true, checks external links as well.
    """
    to_visit = {'http://localhost/'}
    visited = set()
    external = set()

    # For each defragmented URL, the set of #fragments other pages link to.
    wanted_fragments = collections.defaultdict(set)
    # For each visited URL, the list of element ids found on that page.
    page_ids = {}

    def recording_url_for(*args, **kwargs):
        """Wrap flask.url_for so every URL generated by a template gets queued."""
        url = flask.url_for(*args, **kwargs)
        if 'apiref' in url:
            print('?' * 80, url, args, kwargs)
        if url not in visited:
            to_visit.add(urljoin('http://localhost/', url))
        return url

    app.jinja_env.globals['url_for'] = recording_url_for

    # Map each discovered URL to the pages that link to it, for error reporting.
    links_to = {}
    links_to['http://localhost/'] = set()
    while to_visit:
        # Visit URLs in sorted order so runs are deterministic.
        url = sorted(to_visit)[0]
        to_visit.remove(url)
        if url in visited:
            continue
        visited.add(url)
        links = []
        parsed = urlparse(url)
        if parsed.netloc == 'localhost':
            print('visit', url)
            page_ids[url] = []
            try:
                check_url(client, url, links, page_ids[url])
            except:
                # Report which pages linked here before re-raising.
                print('! from:', links_to.get(url))
                raise
            for link in links:
                fullurl = urljoin('http://localhost/', url)
                fullurl = urljoin(fullurl, link)
                if 'apiref' in url:
                    print('!' * 80, url, link, fullurl)
                result = urldefrag(fullurl)
                defrag = result.url
                fragment = result.fragment
                if fragment and urlparse(fullurl).netloc == 'localhost':
                    wanted_fragments[defrag].add(fragment)
                if defrag not in visited:
                    to_visit.add(defrag)
                links_to.setdefault(fullurl, set()).add(url)
        else:
            if parsed.scheme in ('http', 'https'):
                external.add(url)
            else:
                print('ignore', url)
    # Every fragment that was linked to must exist as an id on the target page.
    for url, fragments in wanted_fragments.items():
        missing = fragments - set(page_ids[url])
        if missing:
            raise AssertionError('Missing fragments for URL {}: {}'.format(
                url, missing))

    if check_external_links:
        for url in sorted(external):
            print('check', url)
            check_external_link(url)

    return visited
def check_url(client, url, links_out=None, ids_out=None):
    """Fetch a page and record its outgoing links and element ids."""
    if url == 'http://localhost/static/':
        return
    result = client.get(url)
    if result.status_code != 200:
        raise AssertionError("Got HTTP status {} when accessing {}".format(
            result.status_code, url))
    tree = lxml.html.document_fromstring(result.data)
    if links_out is not None:
        for element, attribute, link, pos in tree.iterlinks():
            links_out.append(link)
    if ids_out is not None:
        for element in tree.cssselect('*[id]'):
            ids_out.append(element.attrib['id'])
    if result.content_type.startswith('text/html'):
        check_tidy(url, result.data)
def check_external_link(url):
    status_code = requests.head(url).status_code
    if status_code not in (200, 301, 302):
        raise AssertionError("Got HTTP status {} when accessing {}".format(
            status_code, url))
def check_tidy(url, content):
    """Run HTML Tidy on the page and fail on any warning that isn't ignored."""
    document, errors = tidy_document(
        content,
        options={
            'anchor-as-name': 1,
            'numeric-entities': 0,
            'drop-empty-paras': 0,
            'enclose-block-text': 1,
            'enclose-text': 0,
            'fix-uri': 1,
            'merge-divs': 1,
            'break-before-br': 1,
            'punctuation-wrap': 1,
            'sort-attributes': 'alpha',
            'vertical-space': 1,
            'char-encoding': 'utf8',
            'drop-empty-elements': 'no',
        },
    )
    errors = [err for err in errors.splitlines()
              if not any(re.match(e, err) for e in TIDY_IGNORED_ERRORS)]
    if errors:
        for error in errors:
            print('{}: {}'.format(url, error))
        raise AssertionError('HTML check failed')
def main():
    app = the_site.app
    app.config['TRAP_HTTP_EXCEPTIONS'] = True
    freezer = getattr(the_site, 'freezer', Freezer(app))
    client = app.test_client()
    visited = set(test_spider(client, app, False))
    # Warn about frozen pages that the spider never reached via a link.
    for path in freezer.all_urls():
        url = urljoin('http://localhost/', path)
        if url not in visited:
            print('Unlinked frozen URL:', path)


if __name__ == '__main__':
    main()
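
The signature of test_spider(client, app, check_external_links) suggests it can also run under pytest, with those three names supplied as fixtures. Below is a minimal sketch of such a conftest.py; it assumes naucse exposes its Flask app as naucse.app (as main() above does), and the fixture bodies and the default of skipping external-link checks are my assumptions, not part of the gist.

# conftest.py -- hypothetical fixtures for running test_spider under pytest.
import pytest

import naucse


@pytest.fixture
def app():
    # Surface HTTP errors as exceptions, as main() above does.
    naucse.app.config['TRAP_HTTP_EXCEPTIONS'] = True
    return naucse.app


@pytest.fixture
def client(app):
    return app.test_client()


@pytest.fixture
def check_external_links():
    # Assumption: keep network-hitting external checks off by default.
    return False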