Skip to content

Instantly share code, notes, and snippets.

@labuladong
Last active October 19, 2022 08:23
Show Gist options
  • Save labuladong/7fc744fb1214fe406550aacd03bfa20e to your computer and use it in GitHub Desktop.
Save labuladong/7fc744fb1214fe406550aacd03bfa20e to your computer and use it in GitHub Desktop.
Helps detect broken Markdown links in the Apache Pulsar documentation.
# %%
import re
from pathlib import Path
from typing import Union
# TODO: read the docs root from user input instead of hard-coding it.
# Root directory that is searched recursively for '*.md' files by the scan below.
BASE_DIR = Path('/Users/labuladong/IdeaProjects/pulsar/site2/docs/')
class LinkChecker:
    """Parse one markdown link (or image) and run the generic sanity checks.

    Attributes:
        raw: the link text exactly as it was passed in, e.g. ``[title](target)``.
        title: the text between the square brackets.
        link: the text between the parentheses.
        file_path: path of the file the link was found in.
    """

    def __init__(self, s: str, path: Union[str, Path]):
        """Split ``s`` into title and link.

        Args:
            s: link text of the form ``[title](link)`` or ``![title](link)``.
            path: file that contains the link.

        Raises:
            AssertionError: if ``s`` does not start with a link pattern.
        """
        m = re.match(r'!?\[(.*?)\]\((.*?)\)', s)
        if not m:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped when Python runs with -O.
            raise AssertionError(f"{str(path)} -> {s} is not a universal pattern")
        self.raw = s
        self.title = m.group(1)
        self.link = m.group(2)
        self.file_path = Path(path)

    def __eq__(self, other):
        """Two checkers are equal when their raw link text is equal."""
        if not isinstance(other, LinkChecker):
            # Let Python fall back to its default handling instead of
            # failing on a missing `.raw` attribute.
            return NotImplemented
        return self.raw == other.raw

    def __hash__(self):
        # Kept consistent with __eq__ so checkers stay usable in sets/dicts.
        return hash(self.raw)

    def __str__(self):
        return self.raw

    def __repr__(self):
        return f"{type(self).__name__}({self.raw!r}, {str(self.file_path)!r})"

    def is_valid(self) -> bool:
        """Return True when ``invalid_reason()`` reports no problem."""
        return not self.invalid_reason()

    def invalid_reason(self) -> Union[None, str]:
        """Return a description of the first problem found, or None if valid."""
        if not self.link:
            return f"{self.raw} missing link"
        # Images (leading '!') are allowed to have an empty title.
        if not self.raw.startswith('!') and not self.title:
            return f"{self.raw} missing title"
        # This checks the file that contains the link, not the link target.
        if not self.file_path.exists():
            return f"{self.raw} path not exists"
        return None
# like '[this page](schema-manage.md#set-schema-compatibility-check-strategy)'
class MarkdownLinkChecker(LinkChecker):
    """Checker for links that target a markdown file and/or a heading anchor.

    Handles targets of the form ``other.md``, ``other.md#anchor`` and
    ``#anchor`` (an anchor inside the file that contains the link).

    Attributes:
        md_link_path: target markdown path with leading '/' removed, or the
            containing file's own name when the link is anchor-only.
        md_anchor: the ``#anchor`` part including the '#', or None.
    """

    def __init__(self, s: str, path: Union[str, Path]):
        """Parse ``s`` into a markdown path and an optional anchor.

        Args:
            s: link text such as ``[this page](schema-manage.md#some-anchor)``.
            path: file that contains the link.

        Raises:
            AssertionError: if ``s`` is not a link to a ``.md`` file or anchor.
        """
        super().__init__(s, path)
        # Accept an optional leading '!' just like the base-class pattern.
        m = re.match(r'!?\[(.*?)\]\((.*?\.md)?(#.*?)?\)', s)
        if not m:
            # Explicit raise so the check is not stripped under `python -O`.
            raise AssertionError(f"{str(path)} -> {s} is not a markdown pattern")
        self.md_link_path = m.group(2)
        if self.md_link_path:
            # The target is later joined onto the containing file's directory,
            # so a leading '/' must go or the join would discard that directory.
            self.md_link_path = self.md_link_path.lstrip('/')
        else:
            # Anchor-only link: it points into the containing file itself.
            # Use self.file_path because `path` may be a plain str.
            self.md_link_path = self.file_path.name
        self.md_anchor = m.group(3)

    def invalid_reason(self) -> Union[None, str]:
        """Return the first problem found with this link, or None if valid.

        Runs the generic checks first, then verifies that the target file
        exists and, when an anchor is given, that the target contains a
        heading whose slug equals the anchor.
        """
        # generic link errors
        reason = super().invalid_reason()
        if reason:
            return reason
        # no usable markdown path (e.g. the containing path has no file name)
        if not self.md_link_path:
            return f"{self.raw} is not a valid markdown path"
        link_abs_path: Path = self.file_path.parent / self.md_link_path
        # target file does not exist
        if not link_abs_path.exists():
            return f"{self.md_link_path} not exist"
        # check the anchor; the target is only read when there is one
        if self.md_anchor:
            title_set = MarkdownLinkChecker.get_title_set(link_abs_path)
            if self.md_anchor.lstrip('#') not in title_set:
                return f"{self.md_link_path} doesn't have anchor: {self.md_anchor}"
        return None

    @staticmethod
    def get_title_set(md_file: Path) -> set[str]:
        """Return the anchor slugs of all ``#`` headings in ``md_file``.

        A slug is the heading text with the characters ``()`'"/?:.,*``
        removed, lower-cased, and with spaces replaced by '-'.
        """
        content = md_file.read_text(encoding='utf-8')
        # Anchored per line, so a heading on the last line (no trailing
        # newline) is found and a '#' in the middle of a line is ignored.
        headings = re.findall(r'^#+[ \t]+(.*)$', content, flags=re.MULTILINE)
        # 'This is a title' -> 'this-is-a-title'
        return {
            re.sub(r'[()`\'\"/?:.,*]', '', heading.strip()).lower().replace(' ', '-')
            for heading in headings
        }
# todo: handle links like '[this page](https://www.google.com)'
class HttpLinkChecker(LinkChecker):
    """Placeholder for links like '[this page](https://www.google.com)'.

    Adds nothing to LinkChecker yet, so only the inherited checks run.
    """
# todo: handle links like '[this page](/tool/admin)'
class OtherLinkChecker(LinkChecker):
    """Placeholder for links like '[this page](/tool/admin)'.

    Adds nothing to LinkChecker yet, so only the inherited checks run.
    """
# %%
# Scan every markdown file under BASE_DIR and collect its invalid links.
md_files = list(BASE_DIR.glob('**/*.md'))
# Maps a file's path relative to BASE_DIR -> list of checkers that failed.
# The relative path is used instead of the bare file name because the glob is
# recursive: same-named files in different directories would otherwise
# overwrite each other's results.
file_name_to_checker_list = dict()
for file in md_files:
    text = file.read_text(encoding='utf-8')
    # find every link / image in the markdown file
    links = re.findall(r'!?\[.*?\]\(.*?\)', text)
    if not links:
        continue
    checkers = []
    for link in links:
        # Dispatch on simple substrings of the raw link text.
        if 'http' in link:
            checkers.append(HttpLinkChecker(link, file))
        elif '.md' in link or '(#' in link:
            checkers.append(MarkdownLinkChecker(link, file))
        else:
            checkers.append(OtherLinkChecker(link, file))
    wrong_checkers = [checker for checker in checkers if not checker.is_valid()]
    if not wrong_checkers:
        continue
    file_name_to_checker_list[str(file.relative_to(BASE_DIR))] = wrong_checkers

# Print one report per file that has at least one invalid link.
for file_name, checker_list in file_name_to_checker_list.items():
    report_lines = [f'`{file_name}` has {len(checker_list)} errors:\n']
    report_lines.extend(
        f"\t{checker.raw} -> {checker.invalid_reason()}\n"
        for checker in checker_list
    )
    print(''.join(report_lines))
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment