Link-checker / link-reporter for DokuWiki text files (WIP; Python 3.6+)
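For context: the parser below scans for DokuWiki's double-bracket link syntax, [[url]] or [[url|title]], which can occur zero or more times per line. A hypothetical input line might look like:

    Useful reading: [[https://example.com/post|A good post]] and [[https://example.org]]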
# functions.py
import requests
from collections import namedtuple
from pprint import pprint
from urllib.parse import urlparse

# module-level so request()/request_head() can see it; set via process_file()
user_agent = ''

Link = namedtuple("Link", "url title")

def remove_newline(input_str):
    """Strip a single trailing newline, if present."""
    if input_str.endswith('\n'):
        return input_str[:-1]
    return input_str

def print_file(f, lines_to_print):
    """Print up to lines_to_print numbered lines, then the total line count."""
    line_count = 0
    for raw_line in f:
        line_count += 1
        if line_count <= lines_to_print:
            line = remove_newline(raw_line)
            print(f"{line_count} {line}")
    print(f"\nNumber of lines: {line_count}")

def extract_link_from(f, progress=None, start_at=1):
    # a single line may have 0 or more links; bare yields produce None,
    # which callers are expected to skip
    for line in f:
        if progress:
            progress['linecounter'] += 1
            if progress['linecounter'] < start_at:
                continue
        try:
            start_idx = -1
            while True:
                start_idx = line.index('[[', start_idx + 1)
                end_idx = line.index(']]', start_idx)
                link_str = line[start_idx + 2:end_idx]
                sep_idx = link_str.find('|')
                if sep_idx > -1:
                    url = link_str[:sep_idx]
                    title = link_str[sep_idx + 1:]
                else:
                    url = link_str
                    title = None
                if url.startswith('http'):
                    if url.startswith('https://web.archive.org'):
                        yield  # ignore Wayback Machine links
                    elif 'news.ycombinator.com' in url:
                        yield  # ignore Hacker News links
                    else:
                        yield Link(url, title)
                else:
                    yield  # internal wiki link, not an http(s) url
        except ValueError:
            # line has no (further) links
            yield
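
# Usage sketch (hypothetical input; the generator accepts any iterable of
# lines, not only a file object):
#   line = "See [[https://example.com|Example]] and [[wiki:page|Internal]]\n"
#   for link in extract_link_from([line]):
#       if link:
#           print(link.url, link.title)  # -> https://example.com Example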

def request(url):
    return requests.get(url, headers={'User-Agent': user_agent})

def request_head(url):
    # requests.head() does not follow redirects by default,
    # so 301/302 responses are reported as-is
    return requests.head(url, headers={'User-Agent': user_agent})

def extract_owner_repo(url):
    """Return (owner, repo) from a github.com repository url."""
    parsed_url = urlparse(url)
    path = parsed_url.path.strip("/")
    owner, repo = path.split("/")[:2]
    return owner, repo
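
# Example (hypothetical url): only the first two path segments are used,
# so deeper links still resolve to the repository:
#   extract_owner_repo("https://github.com/psf/requests/issues/42")
#   -> ('psf', 'requests')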

def process_file(f, user_agent, query_gh_api, progress=None, start_at=1):
    # 'user_agent' is both a parameter and a module-level global, and a
    # parameter cannot itself be declared global, hence this helper
    def set_user_agent(ua):
        global user_agent
        user_agent = ua
    set_user_agent(user_agent)

    urls = []
    for link in extract_link_from(f, progress, start_at):
        if not link:
            continue
        urls.append(link.url)
        try:
            r = request_head(link.url)
        except requests.exceptions.SSLError:
            # must be caught before ConnectionError, its parent class
            print(f"SSL {link.url}")
            continue
        except requests.exceptions.ConnectionError:
            print(f"ERR {link.url}")
            continue
        headers = dict(r.headers.items())
        if r.status_code == 200:
            if query_gh_api and ('github.com' in link.url) and ('gist.' not in link.url):
                try:
                    owner, repo = extract_owner_repo(link.url)
                except ValueError:
                    # most likely not a repo url
                    continue
                # Known intermittent failures of the API call below:
                # - requests.exceptions.JSONDecodeError: comes up erratically,
                #   rerunning the script should help
                # - KeyError: GitHub API limit reached(?); unfortunately comes
                #   up too easily
                r2 = requests.get(f"https://api.github.com/repos/{owner}/{repo}/commits",
                                  params={'per_page': '1'},
                                  headers={'User-Agent': user_agent})
                if r2.status_code == 403:
                    print(f"403 FORBIDDEN github.com/{owner}/{repo}")
                    continue
                if r2.status_code != 404:
                    print(f"github.com/{owner}/{repo} latest commit at: "
                          f"{r2.json()[0]['commit']['author']['date']}")
                    continue
        if r.status_code == 405:
            # server rejects HEAD; retry with a full GET
            r2 = request(link.url)
            if r2.status_code != 200:
                print(f"{r2.status_code} {link.url}")
            continue
        print(f"{r.status_code} {link.url}")
        if r.status_code in (301, 302):
            try:
                new_url = headers['Location']
            except KeyError:
                print("-> ???")
                continue
            if new_url.startswith('http'):
                try:
                    r2 = request_head(new_url)
                    print(f"-> {r2.status_code} {new_url}")
                except requests.exceptions.SSLError:
                    print(f"-> SSL {new_url}")
                except requests.exceptions.ConnectionError:
                    print(f"-> ERR {new_url}")
            else:
                # relative redirect target; report it for a manual check
                print(f"-> TRY {new_url}")
        if r.status_code == 403:
            pprint(headers, indent=4)
    # print(f"Found {len(urls)} urls")

# CLI entry point (a separate file in this gist; it imports from functions.py).
# Note: on WSL, Windows path separators will be "eaten", so you must use forward slashes!
import logging

import defopt

from functions import process_file, print_file

MAX_LINES_TO_PRINT = 6  # only used for debugging (see print_file)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0'
progress = {'linecounter': 0}  # a mutable dict lets the functions module update the counter in place

def main(filepath, *, user_agent=USER_AGENT, github=False, n=1):
    """
    Wiki-link-reporter v0.2

    :param str filepath: Path to file, e.g. data/pages/foo.txt
    :param str user_agent: User-Agent string (from your browser)
    :param bool github: Query GitHub API for info on the latest commit
    :param int n: Start working from line N
    """
    try:
        with open(filepath, encoding='utf-8') as f:
            # print_file(f, MAX_LINES_TO_PRINT)  # debugging helper
            process_file(f, user_agent, query_gh_api=github, progress=progress, start_at=n)
    except Exception:
        logging.exception(f"Was on line {progress['linecounter']}")  # logs a traceback as well

if __name__ == '__main__':
    defopt.run(main)
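
The gist does not show this script's filename; assuming it is saved as report.py next to functions.py, defopt turns the signature of main() into a command line roughly like this (exact flag spellings are generated by defopt from the parameter names):

    python report.py data/pages/foo.txt --github --n 20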