glenn-jocher · March 27, 2023 01:11
diff --git a/find_broken_links.py b/find_broken_links.py
 # Ultralytics YOLO 🚀, GPL-3.0 license
 # https://gist.github.com/glenn-jocher/85b1fe5325d73971783e16b7721b81f7

 import contextlib
 import re
 import warnings
 from collections import Counter

 import requests
 from tqdm import tqdm


 # Matches an http(s) URL up to the next whitespace character or the end of the text, so links that
 # end a comment or contain a port (http://host:8080) are captured whole.
 _URL_PATTERN = re.compile(r"https?://\S+")
 # Markdown/HTML markup that may be glued to a URL; everything from its first occurrence onwards is dropped.
 _URL_TERMINATORS = ('">', "]", ")", "#")
 # Punctuation that commonly trails a URL in prose (includes the full-width comma).
 _URL_TRAILING_CHARS = ".,>)`\"'，"


 def _clean_url(raw_url: str) -> str:
    """Return raw_url with glued-on markup and trailing punctuation removed."""
    for terminator in _URL_TERMINATORS:
        raw_url = raw_url.split(terminator)[0]
    # Strip after splitting so punctuation exposed by the split (e.g. "site.com.](x") is removed as well;
    # a single rstrip over the whole character set handles any ordering of trailing characters.
    return raw_url.rstrip(_URL_TRAILING_CHARS)


 def _link_status(url: str, cache: dict):
    """
    Return the HTTP status code for url, or None when the request itself could not be completed.

    Results are memoised in cache so a link repeated across many comments is only requested once.
    """
    if url not in cache:
        status = None
        # Deliberately best-effort: malformed URLs, DNS failures and timeouts are skipped, not reported.
        with contextlib.suppress(Exception):
            response = requests.head(url, timeout=3)  # return head only (lighter and faster)
            if response.status_code >= 400:  # some servers reject HEAD (e.g. 405), so confirm with GET
                response = requests.get(url, timeout=10)
            status = response.status_code
        cache[url] = status
    return cache[url]


 def find_broken_links(repo_owner: str, repo_name: str, token: str) -> None:
    """
    Find and count the most common broken links in the comments of all issues and PRs in a repository.

    Pages through the repository's issues endpoint, downloads the comments of every entry, extracts the
    http(s) URLs from comments not written by bots, requests each URL and prints every link that answers
    with a status code >= 400, followed by a summary sorted by number of occurrences.

    Args:
        repo_owner: Account or organisation that owns the repository.
        repo_name: Name of the repository.
        token: GitHub API token. When empty, no Authorization header is sent.

    Returns:
        None. All results are printed to stdout.
    """
    warnings.filterwarnings("ignore")  # NOTE: this silences warnings for the whole process, not just this call

    issues_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/issues"
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:  # do not send an empty credential
        headers["Authorization"] = f"token {token}"

    messages = []
    links = []
    status_cache = {}
    page = 1
    while True:
        # Every page is checked, so an error payload is never mistaken for a list of issues
        page_params = {"state": "all", "per_page": 100, "page": page}
        issues_response = requests.get(issues_url, headers=headers, params=page_params)
        if issues_response.status_code != 200:
            print(f"Error retrieving issues: {issues_response.text}")
            if page == 1:
                return  # nothing was collected, so there is nothing to summarise
            break  # report what was gathered before the failure

        issues = issues_response.json()
        if not issues:  # an empty page marks the end of the listing
            break

        for issue in tqdm(issues, desc=f"Processing issues (page {page})"):
            # Retrieve the comments associated with the issue (largest page size instead of the default)
            comments_response = requests.get(issue["comments_url"], headers=headers, params={"per_page": 100})
            if comments_response.status_code != 200:
                print(f"Error retrieving comments for issue {issue['number']}: {comments_response.text}")
                continue

            for comment in comments_response.json():
                # Ignore comments from bots and GitHub actions; tolerate a missing/null author
                user = comment.get("user") or {}
                user_type = (user.get("type") or "").lower()
                user_login = (user.get("login") or "").lower()
                if "bot" in user_type or "github" in user_login:
                    continue

                # Extract and check every URL in the comment text (body may be null)
                for raw_url in _URL_PATTERN.findall(comment.get("body") or ""):
                    url = _clean_url(raw_url)
                    status = _link_status(url, status_cache)
                    if status is not None and status >= 400:
                        messages.append(f"Broken link {status}: {url} (issue #{issue['number']})")
                        links.append(f"{status} {url}")

        # Move on to the next page of issues
        page += 1

    # Count the occurrences of each broken link
    link_counts = Counter(links)
    print('\n'.join(messages))
    print("\nThe most common broken links are:")
    for link, count in link_counts.most_common():
        print(f"{count} occurrences: {link}")


 # Run the scan against the ultralytics/ultralytics repository (the token is left blank here).
 find_broken_links(
    repo_owner="ultralytics",
    repo_name="ultralytics",
    token="",
 )
	# Ultralytics YOLO 🚀, GPL-3.0 license
	# https://gist.github.com/glenn-jocher/85b1fe5325d73971783e16b7721b81f7

	import contextlib
	import re
	import warnings
	from collections import Counter

	import requests
	from tqdm import tqdm


	# Matches an http(s) URL up to the next whitespace character or the end of the text, so links that
	# end a comment or contain a port (http://host:8080) are captured whole.
	_URL_PATTERN = re.compile(r"https?://\S+")
	# Markdown/HTML markup that may be glued to a URL; everything from its first occurrence onwards is dropped.
	_URL_TERMINATORS = ('">', "]", ")", "#")
	# Punctuation that commonly trails a URL in prose (includes the full-width comma).
	_URL_TRAILING_CHARS = ".,>)`\"'，"


	def _clean_url(raw_url: str) -> str:
	    """Return raw_url with glued-on markup and trailing punctuation removed."""
	    for terminator in _URL_TERMINATORS:
	        raw_url = raw_url.split(terminator)[0]
	    # Strip after splitting so punctuation exposed by the split (e.g. "site.com.](x") is removed as well;
	    # a single rstrip over the whole character set handles any ordering of trailing characters.
	    return raw_url.rstrip(_URL_TRAILING_CHARS)


	def _link_status(url: str, cache: dict):
	    """
	    Return the HTTP status code for url, or None when the request itself could not be completed.

	    Results are memoised in cache so a link repeated across many comments is only requested once.
	    """
	    if url not in cache:
	        status = None
	        # Deliberately best-effort: malformed URLs, DNS failures and timeouts are skipped, not reported.
	        with contextlib.suppress(Exception):
	            response = requests.head(url, timeout=3)  # return head only (lighter and faster)
	            if response.status_code >= 400:  # some servers reject HEAD (e.g. 405), so confirm with GET
	                response = requests.get(url, timeout=10)
	            status = response.status_code
	        cache[url] = status
	    return cache[url]


	def find_broken_links(repo_owner: str, repo_name: str, token: str) -> None:
	    """
	    Find and count the most common broken links in the comments of all issues and PRs in a repository.

	    Pages through the repository's issues endpoint, downloads the comments of every entry, extracts the
	    http(s) URLs from comments not written by bots, requests each URL and prints every link that answers
	    with a status code >= 400, followed by a summary sorted by number of occurrences.

	    Args:
	        repo_owner: Account or organisation that owns the repository.
	        repo_name: Name of the repository.
	        token: GitHub API token. When empty, no Authorization header is sent.

	    Returns:
	        None. All results are printed to stdout.
	    """
	    warnings.filterwarnings("ignore")  # NOTE: this silences warnings for the whole process, not just this call

	    issues_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/issues"
	    headers = {"Accept": "application/vnd.github.v3+json"}
	    if token:  # do not send an empty credential
	        headers["Authorization"] = f"token {token}"

	    messages = []
	    links = []
	    status_cache = {}
	    page = 1
	    while True:
	        # Every page is checked, so an error payload is never mistaken for a list of issues
	        page_params = {"state": "all", "per_page": 100, "page": page}
	        issues_response = requests.get(issues_url, headers=headers, params=page_params)
	        if issues_response.status_code != 200:
	            print(f"Error retrieving issues: {issues_response.text}")
	            if page == 1:
	                return  # nothing was collected, so there is nothing to summarise
	            break  # report what was gathered before the failure

	        issues = issues_response.json()
	        if not issues:  # an empty page marks the end of the listing
	            break

	        for issue in tqdm(issues, desc=f"Processing issues (page {page})"):
	            # Retrieve the comments associated with the issue (largest page size instead of the default)
	            comments_response = requests.get(issue["comments_url"], headers=headers, params={"per_page": 100})
	            if comments_response.status_code != 200:
	                print(f"Error retrieving comments for issue {issue['number']}: {comments_response.text}")
	                continue

	            for comment in comments_response.json():
	                # Ignore comments from bots and GitHub actions; tolerate a missing/null author
	                user = comment.get("user") or {}
	                user_type = (user.get("type") or "").lower()
	                user_login = (user.get("login") or "").lower()
	                if "bot" in user_type or "github" in user_login:
	                    continue

	                # Extract and check every URL in the comment text (body may be null)
	                for raw_url in _URL_PATTERN.findall(comment.get("body") or ""):
	                    url = _clean_url(raw_url)
	                    status = _link_status(url, status_cache)
	                    if status is not None and status >= 400:
	                        messages.append(f"Broken link {status}: {url} (issue #{issue['number']})")
	                        links.append(f"{status} {url}")

	        # Move on to the next page of issues
	        page += 1

	    # Count the occurrences of each broken link
	    link_counts = Counter(links)
	    print('\n'.join(messages))
	    print("\nThe most common broken links are:")
	    for link, count in link_counts.most_common():
	        print(f"{count} occurrences: {link}")


	# Run the scan against the ultralytics/ultralytics repository (the token is left blank here).
	find_broken_links(
	    repo_owner="ultralytics",
	    repo_name="ultralytics",
	    token="",
	)