quick check for valid links on a webpage
#!/usr/bin/env python
'''
check_link.py

Parses a URL, and checks the links in it for validity.
Normally, does not check links to the source URL's pages.
Normally, only reports problem links.

Usage:
    check_link.py [-i] [-v] [-b BASEURL] <url>...

Options:
    -b BASEURL --base=BASEURL  checking multiple pages on a site?
                               set the base, and the subpages to check
                               ('/' will check BASEURL)
    -i --internal              also check links internal to the site;
    -v --verbose               report all link outcomes, good and bad;
    -h --help                  Show this message and exit;

Examples:
    check_link.py -b https://mysite.io books blog   # check only 2 subpages
    check_link.py -b https://mysite.io / /blog      # check home page too
'''
import sys
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
from colorama import init as colorama_init
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer
# this is a modification of a gist;
# it got me quickly started for now.
# Original:
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git
def check(address, netloc):
    global VERBOSE
    global FULL_GET
    global session
    msg = None  # the normal "ok" is no message
    # optimize which retrieve we use:
    retrieve = session.get if netloc in FULL_GET else session.head
    try:
        # NOTE: amazon denies requests from python scripts, so we use
        # a session with an updated 'User-Agent' throughout ('session');
        # amazon.com remembers if a session.get() came from a python agent,
        # and then denies later session.get() calls with a
        # 503 - Service Unavailable, even after the session updates
        # its 'User-Agent'.
        # OPTIMIZATION:
        # we try a light-weight session.head() call first;
        # if it fails for a domain with 405 (Method Not Allowed), then
        # we retry with a full session.get(), and record the netloc so
        # we always take the long way for that domain;
        resp = retrieve(address)
        if resp.status_code == 405:
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}{Fore.RESET}'
    if resp.status_code in \
            [301, 308,
             400, 401, 402, 403, 404, 405, 408, 409, 410,
             501, 502, 503]:
        msg = f'{resp.status_code} - {resp.reason} => {address}'
        # TODO: scrub other permanent redirection codes to include in this:
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n{" "*19}NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - ' \
              f'{resp.reason} => {address}{Fore.RESET}'
    return msg
def pattern_adjust(link_address, rbase=None):
    '''
    returns the "adjusted" address and netloc;
    don't follow local addresses, unless the
    option is set to follow internal addresses
    '''
    global website
    global INTERNAL
    # if we're checking local, might as well
    # check on-page anchors, too - for typos
    if link_address.startswith('#') and not INTERNAL:  # local anchor
        return (None, None)
    # create a local static var:
    # - depends on global "website"
    # if 'rbase' not in pattern_adjust.__dict__:
    #     pattern_adjust.rbase = urlsplit(website)
    r = urlsplit(link_address)
    # don't follow local:
    if not INTERNAL and \
            r.netloc == rbase.netloc:
        return (None, None)
    # NOTE: I don't really understand
    # what this is doing, so annotating:
    # if relative URL (local)
    # TODO: I am getting convinced that what this wants to do
    # should be done w/ a simple urljoin();
    # I'm also thinking this code branch isn't traversed;
    if r.scheme == '' and (r.netloc != '' or r.path != ''):
        # reconstitute - it won't be a full path
        d = urlunsplit(r)
        # This if seems exceedingly wonky
        if d.startswith('//'):
            # if it starts with '//', throw that away...
            # m = re.search('(?<=//)\S+', d)
            # d = m.group(0)
            # TODO: if r.netloc is empty, then this
            # could result in an incorrect URL:
            # => if address = foo.com/something - then ok
            # => if address is relative: ./static/something - then trouble
            return ("https://" + d[2:], r.netloc)
    elif r.scheme == '' and r.netloc == '':
        # is this what I want to do?
        # would I rather do urljoin(urlunsplit(rbase), link_address)?
        return (urljoin(website, link_address), r.netloc) \
            if INTERNAL else (None, None)
    else:
        return (link_address, r.netloc)
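

# Illustrative sketch (not called anywhere in this script): the TODO above
# suggests the relative/scheme-less handling could collapse into a single
# urljoin() against the page being scanned. The helper name and behavior
# below are an assumption for illustration, not part of the original gist.
def pattern_adjust_urljoin(link_address, rbase):
    '''
    resolve link_address against the page URL with urljoin(),
    then apply the same anchor/INTERNAL filtering as pattern_adjust()
    '''
    if link_address.startswith('#') and not INTERNAL:
        return (None, None)
    # urljoin handles relative paths, '//host/...' and absolute URLs alike
    absolute = urljoin(urlunsplit(rbase), link_address)
    netloc = urlsplit(absolute).netloc
    if not INTERNAL and netloc == rbase.netloc:
        return (None, None)
    return (absolute, netloc)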
def string_trunc(s, field_width=73, fill='.'):
    '''
    usage:
        s, f, w = string_trunc(longurl)
        print(f']{s:{f}<{w}}[')
    returns:
        a truncated (if needed) string,
        a fill char, and
        a matching field_width
    '''
    str_width = len(s)
    if str_width > field_width:
        # room for 3 fill chars
        return s[:field_width-3], fill, field_width
    else:
        # this doesn't work: need real values
        # return s, None, None
        # a width of zero seems to not do width,
        # but I need the returned str_width to clear the progress line;
        # fill could be anything (another "0", but...)
        return s, " ", str_width


def progress(msg):
    '''
    hack to print progress
    '''
    if 'w' not in progress.__dict__:
        progress.w = 0
    # clear previous progress line
    print(f'\r{" "*progress.w}', end='', file=sys.stderr)
    s, f, w = string_trunc(msg)
    print(f'\r{s:{f}<{w}}', end='', file=sys.stderr)
    progress.w = w
def extract_link(address):
    global link_status
    global session
    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
    # the partitioned pieces of the URL we're checking
    rbase = urlsplit(address)
    response = session.get(address)
    for key, value in tags.items():
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                # I'm jonesin' for some progress indicators
                progress(link[value])
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        # the '\r' is a hack to show stdout ok w/ progress msgs
                        print('\r', end='', file=sys.stderr)
                        print(link_status[p])
if __name__ == "__main__":
    arguments = docopt(__doc__)
    BASEURL = arguments['--base']
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']  # e.g. "https://davericho.com/books/"
    colorama_init()
    # to facilitate checking each link only once
    link_status = {}
    # sites which don't accept 'head' requests (dynamic);
    # populate with results of urlsplit().netloc
    # (a set, so check() can add() to it)
    FULL_GET = {'www.amazon.com'}
    # for places like amazon.com, which will deny python scripts:
    # Now - use this session throughout the script!
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        if BASEURL:
            website = urljoin(BASEURL, website)
        print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
        extract_link(website)
This is a bit of a work-in-progress - to check, non-recursively, the links on a URL for validity.
It's just starting to come along.
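
The core trick in check() is worth calling out on its own: try a cheap HEAD first, fall back to a full GET when a site answers 405, and keep a custom 'User-Agent' on the session so sites like amazon.com don't reject the python-requests default. Below is a minimal standalone sketch of just that piece, assuming only requests is installed; the quick_check name and the example.com URL are placeholders for illustration, not part of the gist.

#!/usr/bin/env python
'''minimal sketch of the HEAD-then-GET fallback used in check_link.py'''
import requests

session = requests.Session()
# some sites reject the default python-requests User-Agent outright
session.headers.update({'User-Agent': 'test'})


def quick_check(url):
    '''return the final status code, retrying with GET when HEAD is refused'''
    resp = session.head(url)
    if resp.status_code == 405:  # Method Not Allowed: retry with a full GET
        resp = session.get(url)
    return resp.status_code


if __name__ == '__main__':
    # placeholder URL, for illustration only
    print(quick_check('https://example.com/'))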