Last active
May 24, 2025 18:01
-
-
Save twerp/21da5ec99dc0614270403da36d522714 to your computer and use it in GitHub Desktop.
Link checker / link reporter for DokuWiki text files (WIP; Python 3.6)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import random | |
import requests | |
import time | |
from collections import namedtuple | |
from datetime import datetime | |
from pprint import pprint | |
from urllib.parse import urlparse | |
# Module-level User-Agent header value read by request() and request_head();
# process_file() overwrites it before sending any request.
user_agent = ""

# One wiki link: the target URL and its display title (None when the link
# has no "|title" part).
Link = namedtuple("Link", ["url", "title"])
def remove_newline(input_str):
    """Return *input_str* without its final line feed, if it ends with one.

    At most one trailing line-feed character is removed; every other
    character (including a carriage return in front of it) is kept.
    """
    return input_str[:-1] if input_str.endswith('\n') else input_str
def print_file(f, lines_to_print):
    """Print the first *lines_to_print* lines of *f*, then the total line count.

    Each printed line is prefixed with its 1-based line number and has its
    trailing newline removed. The whole of *f* is consumed so that the final
    summary reports the number of lines it contained.
    """
    total = 0
    for total, raw_line in enumerate(f, start=1):
        if total <= lines_to_print:
            print(f"{total} {remove_newline(raw_line)}")
    print(f"\nNumber of lines: {total}")
def extract_link_from(f, progress=None, start_at=1):
    """Scan the lines of *f* for ``[[url|title]]`` links and yield the results.

    A single line may contain zero or more links. For every processed line
    the generator yields, in order:

    * ``Link(url, title)`` for each link whose URL starts with ``http``
      (``title`` is ``None`` when the link has no ``|title`` part);
    * ``None`` for each link that is ignored: URLs not starting with
      ``http``, Wayback Machine links and Hacker News links;
    * one final ``None`` once no further complete ``[[...]]`` pair is found
      on the line (so a line without links yields exactly one ``None``).

    Callers are expected to skip the ``None`` items.

    :param f: iterable of text lines (e.g. an open text file)
    :param progress: optional dict whose ``'linecounter'`` entry is
        incremented for every line read, so the caller can see how far the
        scan got
    :param start_at: 1-based number of the first line to process; earlier
        lines are counted but yield nothing
    """
    # Count in the caller's dict when one is supplied; otherwise fall back to
    # a private counter so that start_at is honoured without a progress dict.
    tracker = progress if progress else {'linecounter': 0}

    for line in f:
        tracker['linecounter'] += 1
        if tracker['linecounter'] < start_at:
            continue

        search_from = 0
        while True:
            start_idx = line.find('[[', search_from)
            end_idx = line.find(']]', start_idx) if start_idx != -1 else -1
            if end_idx == -1:
                # No (further) complete link on this line.
                yield
                break

            link_str = line[start_idx + 2:end_idx]
            url, separator, title = link_str.partition('|')
            if not separator:
                title = None

            is_reportable = (
                url.startswith('http')
                and not url.startswith('https://web.archive.org')  # Wayback Machine
                and 'news.ycombinator.com' not in url  # Hacker News
            )
            if is_reportable:
                yield Link(url, title)
            else:
                yield

            # Resume just past the opening brackets of the link just handled.
            search_from = start_idx + 1
def request(url):
    """Send a GET request to *url* using the module-level ``user_agent``.

    Returns whatever ``requests.get`` returns; any exception it raises
    propagates to the caller.
    """
    ua_header = {'User-Agent': user_agent}
    return requests.get(url, headers=ua_header)
def request_head(url):
    """Send a HEAD request to *url* using the module-level ``user_agent``.

    Returns whatever ``requests.head`` returns; any exception it raises
    propagates to the caller.
    """
    ua_header = {'User-Agent': user_agent}
    return requests.head(url, headers=ua_header)
def extract_owner_repo(url):
    """Return the first two path segments of *url* as ``(owner, repo)``.

    Leading and trailing slashes of the path are ignored and any further
    segments are discarded.

    :raises ValueError: if the path has fewer than two segments
    """
    segments = urlparse(url).path.strip("/").split("/")
    owner, repo = segments[:2]  # unpacking fails with ValueError on short paths
    return owner, repo
def process_file(f, user_agent, query_gh_api, progress=None, start_at=1):
    """Probe every external link found in *f* and print a report.

    Each link produced by ``extract_link_from`` is checked with a HEAD
    request:

    * a failed connection prints ``ERR <url>`` (``SSL <url>`` for TLS errors);
    * status 200 prints nothing, unless *query_gh_api* is true and the URL
      contains ``github.com`` but not ``gist.``; then the date of the
      repository's latest commit is fetched from the GitHub API and printed,
      followed by a randomised pause;
    * status 405 is retried with GET and reported only if that is not 200;
    * every other status is printed; for 301/302 the ``Location`` target is
      probed once with HEAD, and for 403 the response headers are dumped.

    :param f: iterable of text lines (e.g. an open text file)
    :param user_agent: User-Agent header value; it is also stored in the
        module-level ``user_agent`` used by ``request``/``request_head``
    :param query_gh_api: whether to query the GitHub API for GitHub links
    :param progress: optional progress dict, passed to ``extract_link_from``
    :param start_at: first line to process, passed to ``extract_link_from``
    :raises Exception: when 60 GitHub API requests have already been made
        within the current one-hour window
    """
    def set_user_agent(ua):
        # ``user_agent`` is a parameter of process_file, so it cannot be
        # declared global there; this nested function can declare it.
        global user_agent
        user_agent = ua

    set_user_agent(user_agent)

    start_time = datetime.now()
    gh_api_request_count = 0
    urls = []
    for link in extract_link_from(f, progress, start_at):
        if not link:
            continue
        urls.append(link.url)

        try:
            r = request_head(link.url)
        except requests.exceptions.SSLError:
            # SSLError subclasses ConnectionError, so it must be caught first.
            print(f"SSL {link.url}")
            continue
        except requests.exceptions.ConnectionError:
            print(f"ERR {link.url}")
            continue
        headers = dict(r.headers.items())

        if r.status_code == 200:
            if query_gh_api and ('github.com' in link.url) and ('gist.' not in link.url):
                # total_seconds() is used because timedelta.seconds wraps
                # around every 24 hours.
                elapsed = (datetime.now() - start_time).total_seconds()
                if elapsed < 3600 and gh_api_request_count > 59:
                    raise Exception(f"GitHub API limits reached ({datetime.now()})")
                if elapsed >= 3600:
                    start_time = datetime.now()
                    gh_api_request_count = 0
                try:
                    owner, repo = extract_owner_repo(link.url)
                except ValueError:
                    # most likely not a repo url
                    continue
                # Known failure modes of the API call below (not handled):
                #  - requests.exceptions.JSONDecodeError: comes up
                #    erratically, rerunning the script should help
                #  - KeyError: GitHub API limit reached(?); unfortunately
                #    comes up too easily
                req_start = datetime.now()
                r2 = requests.get(f"https://api.github.com/repos/{owner}/{repo}/commits",
                                  params={'per_page': '1'},
                                  headers={'User-Agent': user_agent})
                req_end = datetime.now()
                gh_api_request_count += 1
                if r2.status_code == 403:
                    print(f"403 FORBIDDEN github.com/{owner}/{repo}")
                    continue
                if r2.status_code != 404:
                    print(f"github.com/{owner}/{repo} latest commit at: {r2.json()[0]['commit']['author']['date']}")
                # Pause for the request's duration plus a random extra delay.
                sleep_seconds = (req_end - req_start).seconds
                sleep_seconds += random.randint(1, 11)
                print(f"Sleeping for {sleep_seconds} seconds..")
                time.sleep(sleep_seconds)
            continue

        if r.status_code == 405:
            # HEAD was rejected; retry with GET.
            r2 = request(link.url)
            if r2.status_code != 200:
                print(f"{r2.status_code} {link.url}")
            continue

        print(f"{r.status_code} {link.url}")

        if r.status_code == 301 or r.status_code == 302:
            # r.headers is case-insensitive, unlike the plain dict copy above.
            new_url = r.headers.get('Location')
            if new_url is None:
                print("-> ???")
                continue
            if new_url.startswith('http'):
                try:
                    r2 = request_head(new_url)
                except requests.exceptions.SSLError:
                    print(f"-> SSL {new_url}")
                except requests.exceptions.ConnectionError:
                    print(f"-> ERR {new_url}")
                else:
                    print(f"-> {r2.status_code} {new_url}")
            else:
                # Relative redirect target: not followed, only reported.
                print(f"-> TRY {new_url}")

        if r.status_code == 403:
            pprint(headers, indent=4)
    # print(f"Found {len(urls)} urls")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE: on WSL, Windows path separators get "eaten", so always write file
# paths with forward slashes!
MAX_LINES_TO_PRINT = 6  # only used for debugging
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) '
    'Gecko/20100101 Firefox/123.0'
)
# A dict (rather than a plain int) so that code in the functions module can
# update the counter in place.
progress = {'linecounter': 0}
import defopt | |
import logging | |
from functions import process_file, print_file | |
def main(filepath, *, user_agent=USER_AGENT, github=False, n=1):
    """
    Wiki-link-reporter v0.2

    :param str filepath: Path to file, e.g. data/pages/foo.txt
    :param str user_agent: User-Agent string (from your browser)
    :param bool github: Query GitHub API for info on the latest commits
    :param int n: Start working from line N
    """
    # NOTE: this function is handed to defopt.run(), which reads the
    # docstring above, so the ``:param`` fields are functional and not just
    # documentation. The blank line after the summary keeps the field list a
    # separate reST block.
    try:
        with open(filepath, encoding='utf-8') as f:
            # Debug aid: print_file(f, MAX_LINES_TO_PRINT)
            process_file(f, user_agent, query_gh_api=github, progress=progress, start_at=n)
    except Exception:
        # Top-level boundary: log the traceback together with the last line
        # reached, so a later run can resume from there via ``n``.
        logging.exception("Was on line %s", progress['linecounter'])
# Script entry point: only when executed directly (not when imported) is
# main handed to defopt.run().
if __name__ == '__main__':
    defopt.run(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment