Last active
March 27, 2023 01:11
-
-
Save glenn-jocher/85b1fe5325d73971783e16b7721b81f7 to your computer and use it in GitHub Desktop.
Find Broken Links in Repo Issues/PRs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ultralytics YOLO 🚀, GPL-3.0 license
# https://gist.github.com/glenn-jocher/85b1fe5325d73971783e16b7721b81f7
import contextlib | |
import re | |
import warnings | |
from collections import Counter | |
import requests | |
from tqdm import tqdm | |
# Matches a raw URL as a run of non-whitespace characters; delimiters and trailing
# punctuation are removed afterwards by _clean_url(). A trailing lookahead is not
# used because it would reject URLs that sit at the very end of a comment.
_URL_PATTERN = re.compile(r'https?://\S+')

# Seconds to wait for a GitHub API reply before giving up (avoids hanging forever)
_API_TIMEOUT = 30


def _clean_url(url: str) -> str:
    """Trim Markdown/HTML delimiters and trailing punctuation from a raw URL match."""
    # Cut at the first character sequence that closes a Markdown/HTML link.
    # NOTE: splitting on '#' also drops URL fragments, and ')' truncates URLs containing parentheses.
    for sep in ('">', ']', ')', '#'):
        url = url.split(sep)[0]
    # Strip all trailing punctuation in one pass so the result does not depend on its order
    return url.rstrip('.,>)`"\'')


def _link_status(url: str, cache: dict):
    """
    Returns the HTTP status code for `url`, or None if the request could not be completed.

    Results are stored in `cache` (url -> status) so each distinct URL is requested only once.
    """
    if url not in cache:
        status = None
        # Deliberately best-effort: malformed URLs, DNS failures and timeouts are skipped, not reported
        with contextlib.suppress(Exception):
            code = requests.head(url, timeout=3).status_code  # return head only (lighter and faster)
            if code >= 400:  # some servers reject HEAD (e.g. 405 Method Not Allowed), so confirm with GET
                code = requests.get(url, timeout=10).status_code
            status = code  # only recorded when every request above completed
        cache[url] = status
    return cache[url]


def find_broken_links(repo_owner: str, repo_name: str, token: str):
    """
    Finds and counts the most common broken links in the comments of all issues and PRs in a repository.

    Every page of issues (open and closed) is fetched from the GitHub API, the comments of each issue are
    scanned for URLs, and each URL is requested. URLs answering with a status code >= 400 are printed
    together with the issue they appeared in, followed by a summary sorted by number of occurrences.
    URLs whose request raises (timeout, DNS failure, invalid URL) are silently skipped.

    Args:
        repo_owner: Owner (user or organization) of the repository.
        repo_name: Name of the repository.
        token: GitHub API token. If empty, no Authorization header is sent.

    Returns:
        None. Results are printed to stdout. If the first page of issues cannot be retrieved,
        an error is printed and nothing else is reported.
    """
    warnings.filterwarnings("ignore")

    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:  # an empty token would produce a malformed Authorization header
        headers["Authorization"] = f"token {token}"

    messages = []
    links = []
    status_cache = {}  # url -> status code (or None), shared across all comments
    page = 1
    while True:
        # Send a GET request to the GitHub API to retrieve one page of issues for the repository
        url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/issues?state=all&per_page=100&page={page}"
        page_response = requests.get(url, headers=headers, timeout=_API_TIMEOUT)

        # Check if the request was successful
        if page_response.status_code != 200:
            print(f"Error retrieving issues: {page_response.text}")
            if page == 1:
                return
            break  # keep and report what was collected from earlier pages

        issues = page_response.json()
        if not issues:  # an empty page marks the end of the listing
            break

        # Process each issue
        for issue in tqdm(issues, desc=f"Processing issues (page {page})"):
            # Retrieve the comments associated with the issue
            comments_response = requests.get(issue["comments_url"], headers=headers, timeout=_API_TIMEOUT)

            # Check if the request was successful
            if comments_response.status_code != 200:
                print(f"Error retrieving comments for issue {issue['number']}: {comments_response.text}")
                continue

            # Process each comment in the issue
            for comment in comments_response.json():
                # Ignore comments from bots and GitHub actions (user may be null)
                user = comment.get("user") or {}
                user_type = (user.get("type") or "").lower()
                user_login = (user.get("login") or "").lower()
                if "bot" in user_type or "github" in user_login:
                    continue

                # Extract all URLs from the comment text (body may be null or empty)
                body = comment.get("body") or ""
                for raw_url in _URL_PATTERN.findall(body):
                    link = _clean_url(raw_url)
                    status = _link_status(link, status_cache)
                    if status is not None and status >= 400:
                        messages.append(f"Broken link {status}: {link} (issue #{issue['number']})")
                        links.append(f"{status} {link}")

        # Move on to the next page of issues
        page += 1

    # Count the occurrences of each broken link
    link_counts = Counter(links)
    print('\n'.join(messages))
    print("\nThe most common broken links are:")
    for link, count in link_counts.most_common():
        print(f"{count} occurrences: {link}")
if __name__ == "__main__":
    import os

    # Run the scan only when executed as a script, not when this module is imported.
    # The token is read from the GITHUB_TOKEN environment variable so it is never stored
    # in the file; when the variable is unset an empty token is passed, as before.
    find_broken_links(
        repo_owner='ultralytics',
        repo_name='ultralytics',
        token=os.environ.get('GITHUB_TOKEN', ''),
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment