quick check for valid links on a webpage
#!/usr/bin/env python
'''
check_link.py

Parses a URL, and checks the links in it for validity.
Normally, does not check links to the source URL's pages.
Normally, only reports problem links.

Usage:
    check_link.py [-i] [-v] [-b BASEURL] <url>...

Options:
    -b BASEURL --base=BASEURL  checking multiple pages on a site?
                               set the base, and the subpages to check
                               ('/' will check BASEURL)
    -i --internal              also check links internal to the site;
    -v --verbose               report all link outcomes, good and bad;
    -h --help                  Show this message and exit;

Examples:
    check_link.py -b https://mysite.io books blog   # check only 2 subpages
    check_link.py -b https://mysite.io / /blog      # check home page too
'''
import sys
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
from colorama import init as colorama_init
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer
# this is a modification of a gist;
# it got me quickly started for now.
# Original:
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git
def check(address, netloc):
    global VERBOSE
    global FULL_GET
    global session
    msg = None  # the normal "ok" is no message
    # optimize which retrieve we use:
    retrieve = session.get if netloc in FULL_GET else session.head
    try:
        # NOTE: amazon denies requests from python scripts, so we use
        # a session with an updated 'User-Agent' throughout ('session');
        # amazon.com remembers if a session.get() came from a python agent,
        # and then denies later session.get() calls with a
        # 503 - Service Unavailable, even after the session updates
        # its 'User-Agent'.
        # OPTIMIZATION:
        # we try a light-weight session.head() call first;
        # if it fails for a domain with 405 (Method Not Allowed), then
        # we retry with a full session.get(), and record the netloc so
        # we always take the long way for that domain;
        resp = retrieve(address)
        if resp.status_code == 405:
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}{Fore.RESET}'
    if resp.status_code in \
            [301, 308,
             400, 401, 402, 403, 404, 405, 408, 409, 410,
             501, 502, 503]:
        msg = f'{resp.status_code} - {resp.reason} => {address}'
        # TODO: scrub other permanent redirection codes to include in this:
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n{" "*19}NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - ' \
              f'{resp.reason} => {address}{Fore.RESET}'
    return msg
def pattern_adjust(link_address, rbase=None):
    '''
    returns the "adjusted" address and netloc;
    don't follow local addresses, unless the
    option is set to follow internal addresses
    '''
    global website
    global INTERNAL
    # if we're checking local, might as well
    # check on-page anchors, too - for typos
    if link_address.startswith('#') and not INTERNAL:  # local anchor
        return (None, None)
    # create a local static var:
    # - depends on global "website"
    # if 'rbase' not in pattern_adjust.__dict__:
    #     pattern_adjust.rbase = urlsplit(website)
    r = urlsplit(link_address)
    # don't follow local:
    if not INTERNAL and \
            r.netloc == rbase.netloc:
        return (None, None)
    # NOTE: I don't really understand
    # what this is doing, so annotating:
    # if relative URL (local)
    # TODO: I am getting convinced that what this wants to do
    # should be done w/ a simple urljoin();
    # I'm also thinking this code branch isn't traversed;
    if r.scheme == '' and (r.netloc != '' or r.path != ''):
        # reconstitute - it won't be a full path
        d = urlunsplit(r)
        # This if seems exceedingly wonky
        if d.startswith('//'):
            # if it starts with '//', throw that away...
            # m = re.search('(?<=//)\S+', d)
            # d = m.group(0)
            # TODO: if r.netloc is empty, then this
            # could result in an incorrect URL:
            # => if address = foo.com/something - then ok
            # => if address is relative: ./static/something - then trouble
            return ("https://" + d[2:], r.netloc)
    elif r.scheme == '' and r.netloc == '':
        # is this what I want to do?
        # would I rather do urljoin(urlunsplit(rbase), link_address)?
        return (urljoin(website, link_address), r.netloc) \
            if INTERNAL else (None, None)
    else:
        return (link_address, r.netloc)
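

# Illustrative sketch (not called anywhere in this script): the TODO above
# suggests the relative/scheme-less handling could collapse into a single
# urljoin() against the page being scanned. The helper name and behavior
# below are an assumption for illustration, not part of the original gist.
def pattern_adjust_urljoin(link_address, rbase):
    '''
    resolve link_address against the page URL with urljoin(),
    then apply the same anchor/INTERNAL filtering as pattern_adjust()
    '''
    if link_address.startswith('#') and not INTERNAL:
        return (None, None)
    # urljoin handles relative paths, '//host/...' and absolute URLs alike
    absolute = urljoin(urlunsplit(rbase), link_address)
    netloc = urlsplit(absolute).netloc
    if not INTERNAL and netloc == rbase.netloc:
        return (None, None)
    return (absolute, netloc)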
def string_trunc(s, field_width=73, fill='.'):
    '''
    usage:
        s, f, w = string_trunc(longurl)
        print(f']{s:{f}<{w}}[')
    returns:
        a truncated (if needed) string,
        a fill char, and
        a matching field_width
    '''
    str_width = len(s)
    if str_width > field_width:
        # room for 3 fill chars
        return s[:field_width-3], fill, field_width
    else:
        # this doesn't work: need real values
        # return s, None, None
        # a width of zero seems to not do width,
        # but I need the returned str_width to clear the progress line;
        # fill could be anything (another "0", but...)
        return s, " ", str_width


def progress(msg):
    '''
    hack to print progress
    '''
    if 'w' not in progress.__dict__:
        progress.w = 0
    # clear previous progress line
    print(f'\r{" "*progress.w}', end='', file=sys.stderr)
    s, f, w = string_trunc(msg)
    print(f'\r{s:{f}<{w}}', end='', file=sys.stderr)
    progress.w = w
def extract_link(address):
    global link_status
    global session
    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
    # the partitioned pieces of the URL we're checking
    rbase = urlsplit(address)
    response = session.get(address)
    for key, value in tags.items():
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                # I'm jonesin' for some progress indicators
                progress(link[value])
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        # the '\r' is a hack to show stdout ok w/ progress msgs
                        print('\r', end='', file=sys.stderr)
                        print(link_status[p])
if __name__ == "__main__":
    arguments = docopt(__doc__)
    BASEURL = arguments['--base']
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']  # e.g. "https://davericho.com/books/"
    colorama_init()
    # to facilitate checking each link only once
    link_status = {}
    # sites which don't accept 'head' requests (dynamic);
    # populate with results of urlsplit().netloc
    # (a set, so check() can add() to it)
    FULL_GET = {'www.amazon.com'}
    # for places like amazon.com, which will deny python scripts:
    # Now - use this session throughout the script!
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        if BASEURL:
            website = urljoin(BASEURL, website)
        print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
        extract_link(website)
This is a bit of a work-in-progress - to check, non-recursively, the links on a URL for validity.
It's just starting to come along.
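
The core trick in check() is worth calling out on its own: try a cheap HEAD first, fall back to a full GET when a site answers 405, and keep a custom 'User-Agent' on the session so sites like amazon.com don't reject the python-requests default. Below is a minimal standalone sketch of just that piece, assuming only requests is installed; the quick_check name and the example.com URL are placeholders for illustration, not part of the gist.

#!/usr/bin/env python
'''minimal sketch of the HEAD-then-GET fallback used in check_link.py'''
import requests

session = requests.Session()
# some sites reject the default python-requests User-Agent outright
session.headers.update({'User-Agent': 'test'})


def quick_check(url):
    '''return the final status code, retrying with GET when HEAD is refused'''
    resp = session.head(url)
    if resp.status_code == 405:  # Method Not Allowed: retry with a full GET
        resp = session.get(url)
    return resp.status_code


if __name__ == '__main__':
    # placeholder URL, for illustration only
    print(quick_check('https://example.com/'))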