Last active
April 28, 2022 14:52
-
-
Save u8sand/33e6f81b7009af63f0a5a6bcb255251c to your computer and use it in GitHub Desktop.
Link-CK recursively finds links in text documents and tests each one to detect dead links.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import csv | |
import click | |
import pathlib | |
import requests | |
# Matches an http:// or https:// URL and keeps consuming characters until it
# reaches whitespace or one of the delimiters  ] ) | > " ' < } ` #  -- so a
# fragment ("#...") and closing Markdown/HTML punctuation are never part of
# the captured link. Trailing sentence punctuation such as "." is NOT excluded.
link_expr = re.compile(
    r'https?://[^\s\]\)\|\>\"\'\<\}`#]+'
)
def _expand_input(spec):
    """Turn the ``--input`` value into an iterable of paths to scan.

    ``-`` maps to ``/dev/stdin``; a value containing ``*`` is globbed relative
    to the current directory; a directory is walked recursively; anything else
    is treated as a single file path.
    """
    if spec == '-':
        return [pathlib.Path('/dev/stdin')]
    if '*' in spec:
        return pathlib.Path('.').glob(spec)
    root = pathlib.Path(spec)
    if root.is_dir():
        return root.rglob('*')
    return [root]


def _probe(fetch, link, timeout):
    """Request ``link`` with ``fetch`` and return its HTTP status code.

    If the request raises, the exception's class name is returned instead so
    it can be written to the ``status`` column.
    """
    try:
        return fetch(link, allow_redirects=True, timeout=timeout).status_code
    except Exception as e:
        # Deliberately broad: any failure to fetch is a reportable result for
        # this link, not a reason to abort the whole scan.
        return e.__class__.__name__


@click.command(help='''
Link-CK finds dead links recursively in text documents and tests them.
''')
@click.option('-i', '--input', type=str, default='-', show_default=True, help='Input file, directory or glob to search for links in')
@click.option('-o', '--output', type=click.File(mode='w'), default='-', show_default=True, help='Output file to write results to')
@click.option('-H', '--head', type=bool, is_flag=True, default=False, show_default=True, help='Use HEAD instead of GET')
@click.option('-t', '--timeout', type=int, default=1, show_default=True, help='Timeout for requests')
@click.option('-d', '--dead', type=bool, is_flag=True, default=False, show_default=True, help='Only report dead-links')
def link_ck(input, output, head, timeout, dead):
    """Scan files for http(s) links, request each one and write a TSV report.

    Args:
        input: ``-`` for stdin, a glob containing ``*``, a directory (walked
            recursively) or a single file path.
        output: Writable text file object that receives the report.
        head: When true, links are requested with HEAD instead of GET.
        timeout: Timeout passed to every request.
        dead: When true, rows whose status is an integer in 200-299 are
            omitted from the report.

    The report has the columns ``location`` (``path:line:column``, both
    1-based), ``link`` and ``status`` (HTTP status code, or the name of the
    exception raised by the request). Each distinct link is requested only
    once; later occurrences reuse the recorded status.
    """
    paths = _expand_input(input)
    fetch = requests.head if head else requests.get

    writer = csv.DictWriter(output, ['location', 'link', 'status'], delimiter='\t')
    statuses = {}  # link -> status already determined for it
    writer.writeheader()

    for path in paths:
        if path.is_dir():
            continue
        try:
            with path.open('r', encoding='utf-8') as fr:
                for line_number, line in enumerate(fr, start=1):
                    for m in link_expr.finditer(line):
                        link = m.group(0)
                        if link not in statuses:
                            statuses[link] = _probe(fetch, link, timeout)
                        status = statuses[link]

                        # With --dead, successful (2xx) responses are not reported.
                        # Exception names are strings and therefore always reported.
                        if dead and isinstance(status, int) and 200 <= status <= 299:
                            continue
                        writer.writerow({
                            'location': f'{path}:{line_number}:{m.start(0) + 1}',
                            'link': link,
                            'status': status,
                        })
        except UnicodeDecodeError:
            # Ignore non-utf-8 (presumably binary) files; rows already written
            # for earlier lines of the same file are kept.
            continue
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    link_ck()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment