Last active
October 19, 2022 08:23
-
-
Save labuladong/7fc744fb1214fe406550aacd03bfa20e to your computer and use it in GitHub Desktop.
Helps detect broken Markdown links in the Apache Pulsar documentation.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
import os
import re
from pathlib import Path
from typing import Union
# Root of the documentation tree to scan.
# Defaults to the original author's local checkout; set the PULSAR_DOCS_DIR
# environment variable to scan a different directory without editing the file.
BASE_DIR = Path(
    os.environ.get(
        'PULSAR_DOCS_DIR',
        '/Users/labuladong/IdeaProjects/pulsar/site2/docs/',
    )
)
class LinkChecker:
    """Base checker for one Markdown link or image found in a file.

    Parses text of the form ``[title](link)`` or ``![title](link)`` and
    implements the generic checks shared by every kind of link. Two checkers
    compare (and hash) equal when their raw link text is equal.
    """

    def __init__(self, s: str, path: Union[str, Path]):
        """Parse ``s`` and remember the file it was found in.

        Args:
            s: Raw link text, e.g. ``[this page](schema-manage.md)``.
            path: Path of the file that contains the link.

        Raises:
            AssertionError: If ``s`` does not start with a Markdown link or
                image.
        """
        m = re.match(r'!?\[(.*?)\]\((.*?)\)', s)
        if m is None:
            # Raised explicitly instead of via ``assert`` so the validation
            # still runs under ``python -O``; the exception type is unchanged.
            raise AssertionError(f"{str(path)} -> {s} is not a universal pattern")
        # The link text exactly as it appeared in the file.
        self.raw = s
        # Text between the square brackets.
        self.title = m.group(1)
        # Text between the parentheses.
        self.link = m.group(2)
        # File that contains the link, always normalised to a Path.
        self.file_path = Path(path)

    def __eq__(self, other):
        """Compare by raw link text; defer to ``other`` for foreign types."""
        if not isinstance(other, LinkChecker):
            return NotImplemented
        return self.raw == other.raw

    def __hash__(self):
        """Hash consistently with ``__eq__`` so checkers work in sets/dicts."""
        return hash(self.raw)

    def __str__(self):
        """Return the raw link text."""
        return self.raw

    def is_valid(self) -> bool:
        """Return True when ``invalid_reason`` finds nothing wrong."""
        return not self.invalid_reason()

    def invalid_reason(self) -> Union[None, str]:
        """Describe the first problem found with this link.

        Returns:
            A human-readable message, or ``None`` when the link passes all
            generic checks.
        """
        if not self.link:
            return f"{self.raw} missing link"
        # Images (``![](...)``) are allowed to have an empty title.
        if not self.raw.startswith('!') and not self.title:
            return f"{self.raw} missing title"
        if not self.file_path.exists():
            return f"{self.raw} path not exists"
        return None
# like '[this page](schema-manage.md#set-schema-compatibility-check-strategy)' | |
class MarkdownLinkChecker(LinkChecker):
    """Checker for links to Markdown files and/or heading anchors.

    Handles links such as
    ``[this page](schema-manage.md#set-schema-compatibility-check-strategy)``
    as well as anchor-only links such as ``[above](#some-heading)``.
    """

    def __init__(self, s: str, path: Union[str, Path]):
        """Parse the Markdown file path and anchor out of the link.

        Args:
            s: Raw link text.
            path: Path of the file that contains the link.

        Raises:
            AssertionError: If ``s`` is not a link whose target is an optional
                ``*.md`` path followed by an optional ``#anchor``.
        """
        super().__init__(s, path)
        # ``!?`` keeps this pattern in step with the base-class pattern, which
        # also accepts image syntax.
        m = re.match(r'!?\[(.*?)\]\((.*?\.md)?(#.*?)?\)', s)
        if m is None:
            # Explicit raise so the validation survives ``python -O``.
            raise AssertionError(f"{str(path)} -> {s} is not a markdown pattern")
        self.md_link_path = m.group(2)
        if self.md_link_path:
            # Leading slashes are dropped so the target is resolved relative
            # to the directory of the containing file.
            self.md_link_path = self.md_link_path.lstrip('/')
        else:
            # Anchor-only link: it points into the containing file itself.
            # Use the normalised ``self.file_path`` because ``path`` may be a
            # plain string, which has no ``.name`` attribute.
            self.md_link_path = self.file_path.name
        # Includes the leading '#', or None when the link has no anchor.
        self.md_anchor = m.group(3)

    def invalid_reason(self) -> Union[None, str]:
        """Describe the first problem found with this Markdown link.

        Runs the generic checks first, then verifies that the target file
        exists and, when an anchor is given, that the target file contains a
        heading producing that anchor.

        Returns:
            A human-readable message, or ``None`` when the link is valid.
        """
        # universal link error
        reason = super().invalid_reason()
        if reason:
            return reason
        # wrong md format
        if not self.md_link_path:
            return f"{self.md_link_path} in not a valid markdown path"
        link_abs_path: Path = self.file_path.parent / self.md_link_path
        # file not exist
        if not link_abs_path.exists():
            return f"{self.md_link_path} not exist"
        # check anchor
        if self.md_anchor:
            title_set = MarkdownLinkChecker.get_title_set(link_abs_path)
            if self.md_anchor.lstrip('#') not in title_set:
                return f"{self.md_link_path} doesn't have anchor: {self.md_anchor}"
        return None

    @staticmethod
    def get_title_set(md_file: Path) -> set[str]:
        """Return the anchor slugs of all headings in ``md_file``.

        A heading such as ``## This is a title`` yields ``this-is-a-title``:
        a fixed set of punctuation characters is removed, the text is
        lower-cased and spaces become hyphens.

        Args:
            md_file: Markdown file to scan; read as UTF-8.

        Returns:
            The set of anchor slugs found in the file.
        """
        content = md_file.read_text(encoding='utf-8')
        # Anchor the pattern to line starts so a '#' in the middle of a line
        # is not mistaken for a heading, and use '$' rather than '\n' so a
        # heading on the last line without a trailing newline is still found.
        headings = re.findall(
            r'^ {0,3}#{1,6}[ \t]+(.*)$', content, re.MULTILINE
        )
        slugs = set()
        for heading in headings:
            # 'This is a title' -> 'this-is-a-title'
            cleaned = re.sub(r'[()`\'\"/?:.,*]', '', heading.strip())
            slugs.add(cleaned.lower().replace(' ', '-'))
        return slugs
# todo: handle links like '[this page](https://www.google.com)' | |
class HttpLinkChecker(LinkChecker):
    """Placeholder for links to HTTP(S) URLs.

    Adds nothing to ``LinkChecker`` yet, so only the generic checks apply.
    """
# todo: handle links like '[this page](/tool/admin)' | |
class OtherLinkChecker(LinkChecker):
    """Placeholder for links that are neither HTTP nor Markdown targets.

    Adds nothing to ``LinkChecker`` yet, so only the generic checks apply.
    """
# %% | |
# Scan every Markdown file under BASE_DIR, build one checker per link and
# collect the checkers that report a problem, grouped by file.
# Sorted so the report order is the same on every run.
md_files = sorted(BASE_DIR.glob('**/*.md'))
# Maps a file's path relative to BASE_DIR to its list of failing checkers.
# The relative path (not just the file name) is used as the key because the
# glob is recursive and two directories may contain files with the same name.
file_name_to_checker_list = dict()
for md_file in md_files:
    text = md_file.read_text(encoding='utf-8')
    checkers = []
    # Group 0 is the whole link, group 1 is the target inside the parentheses.
    for link_match in re.finditer(r'!?\[.*?\]\((.*?)\)', text):
        link = link_match.group(0)
        target = link_match.group(1)
        # Classify on the target only, so words such as "http" or ".md" in
        # the link title cannot send a link to the wrong checker.
        if target.startswith(('http://', 'https://')):
            checkers.append(HttpLinkChecker(link, md_file))
        elif '.md' in target or target.startswith('#'):
            checkers.append(MarkdownLinkChecker(link, md_file))
        else:
            checkers.append(OtherLinkChecker(link, md_file))
    wrong_checkers = [checker for checker in checkers if not checker.is_valid()]
    if not wrong_checkers:
        continue
    file_name_to_checker_list[md_file.relative_to(BASE_DIR).as_posix()] = wrong_checkers

# Print one report per file that has at least one broken link.
for file_name, checker_list in file_name_to_checker_list.items():
    report_lines = [f'`{file_name}` has {len(checker_list)} errors:\n']
    report_lines.extend(
        f"\t{checker.raw} -> {checker.invalid_reason()}\n"
        for checker in checker_list
    )
    print(''.join(report_lines))
# %% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment