labuladong · October 19, 2022 08:23
diff --git a/detector.py b/detector.py
 # %%
 import re
 from pathlib import Path
 from typing import Union

 # Root directory that is searched (recursively) for markdown files to check.
 # TODO: read this from user input instead of hard-coding a local path.
 BASE_DIR = Path('/Users/labuladong/IdeaProjects/pulsar/site2/docs/')

 class LinkChecker:
    """Parse a single markdown link and run the checks shared by every link kind.

    ``raw`` holds the link text as given, ``title`` and ``link`` hold the
    bracket and parenthesis contents, and ``file_path`` is the file the link
    was found in.
    """

    def __init__(self, s: str, path: Union[str, Path]):
        """Split ``s`` (``[title](link)`` or ``![title](link)``) into its parts.

        Args:
            s: Raw link text; it must start with the link itself.
            path: File that contains the link.

        Raises:
            AssertionError: If ``s`` does not start with a markdown link.
        """
        m = re.match(r'!?\[(.*?)\]\((.*?)\)', s)
        # Raised explicitly rather than via ``assert`` so the validation is
        # not stripped under ``python -O``; the exception type is unchanged.
        if m is None:
            raise AssertionError(f"{str(path)} -> {s} is not a universal pattern")
        self.raw = s
        self.title = m.group(1)
        self.link = m.group(2)
        self.file_path = Path(path)

    def __eq__(self, other):
        """Two checkers are equal when their raw link text is equal."""
        # Defer to the other operand instead of failing on a missing ``.raw``.
        if not isinstance(other, LinkChecker):
            return NotImplemented
        return self.raw == other.raw

    def __hash__(self):
        """Hash on the raw text so instances stay hashable alongside ``__eq__``."""
        return hash(self.raw)

    def __str__(self):
        return self.raw

    def is_valid(self) -> bool:
        """Return True when ``invalid_reason()`` reports no problem."""
        return not self.invalid_reason()

    def invalid_reason(self) -> Union[None, str]:
        """Return a description of the first problem found, or None if valid."""
        if not self.link:
            return f"{self.raw} missing link"
        # Image links (leading '!') are allowed to have an empty title.
        if not self.raw.startswith('!') and not self.title:
            return f"{self.raw} missing title"
        if not self.file_path.exists():
            return f"{self.raw} path not exists"
        return None


 # like '[this page](schema-manage.md#set-schema-compatibility-check-strategy)'
 class MarkdownLinkChecker(LinkChecker):
    """Check links to markdown files and/or heading anchors.

    Handles links such as
    ``[this page](schema-manage.md#set-schema-compatibility-check-strategy)``
    and anchor-only links such as ``[here](#some-heading)``.
    """

    def __init__(self, s: str, path: Union[str, Path]):
        """Parse the ``*.md`` path and ``#anchor`` parts of the link target.

        Args:
            s: Raw link text.
            path: File that contains the link.

        Raises:
            AssertionError: If ``s`` is not a link whose target is an optional
                ``*.md`` path followed by an optional ``#anchor``.
        """
        super().__init__(s, path)

        # '!?' mirrors the base-class pattern so image-style links are accepted.
        m = re.match(r'!?\[(.*?)\]\((.*?\.md)?(#.*?)?\)', s)
        if m is None:
            raise AssertionError(f"{str(path)} -> {s} is not a markdown pattern")

        md_path = m.group(2)
        if md_path:
            # The target is resolved relative to the linking file's directory,
            # so a leading '/' is dropped.
            self.md_link_path = md_path.lstrip('/')
        else:
            # Anchor-only link: it points into the linking file itself.
            # ``self.file_path`` is always a Path, unlike ``path`` which may be a str.
            self.md_link_path = self.file_path.name

        self.md_anchor = m.group(3)

    def invalid_reason(self) -> Union[None, str]:
        """Return the first problem found with this markdown link, or None."""
        # universal link error
        reason = super().invalid_reason()
        if reason:
            return reason

        # wrong md format
        if not self.md_link_path:
            return f"{self.md_link_path} in not a valid markdown path"

        link_abs_path: Path = self.file_path.parent / self.md_link_path
        # file not exist
        if not link_abs_path.exists():
            return f"{self.md_link_path} not exist"

        # check anchor; the target file is only read when there is one to look up
        if self.md_anchor:
            title_set = MarkdownLinkChecker.get_title_set(link_abs_path)
            if self.md_anchor.lstrip('#') not in title_set:
                return f"{self.md_link_path} doesn't have anchor: {self.md_anchor}"
        return None

    @staticmethod
    def get_title_set(md_file: Path) -> set[str]:
        """Return the anchor slugs derived from the headings in ``md_file``.

        A heading is a run of '#' followed by whitespace and the title text.
        The slug is the title with some punctuation removed, lower-cased and
        with spaces replaced by '-', e.g. 'This is a title' -> 'this-is-a-title'.
        """
        content = md_file.read_text(encoding='utf-8')
        # '(?:\n|$)' also accepts a heading on the last line of a file that
        # has no trailing newline.
        headings = re.findall(r'#+\s+(.*)(?:\n|$)', content)
        slugs = set()
        for heading in headings:
            cleaned = re.sub(r'[()`\'\"/?:.,*]', '', heading.strip())
            slugs.add(cleaned.lower().replace(' ', '-'))
        return slugs


 # todo: handle links like '[this page](https://www.google.com)'
 class HttpLinkChecker(LinkChecker):
    """Checker for HTTP links; adds nothing yet, so only the inherited checks run."""


 # todo: handle links like '[this page](/tool/admin)'
 class OtherLinkChecker(LinkChecker):
    """Checker for every remaining link kind; only the inherited checks run."""


 # %%
 md_files = list(BASE_DIR.glob('**/*.md'))

 # Shape of a link target that MarkdownLinkChecker can parse: an optional
 # '*.md' path followed by an optional '#anchor'.
 MD_TARGET_PATTERN = re.compile(r'(?:.*?\.md)?(?:#.*?)?')

 file_name_to_checker_list = dict()
 for file in md_files:
    text = file.read_text(encoding='utf-8')
    # filter all link in md file
    links = re.findall(r'!?\[.*?\]\(.*?\)', text)
    if not links:
        continue

    checkers = []
    for link in links:
        # Classify on the link target only, so words such as 'http' or '.md'
        # in the title cannot misroute the link.
        target = link.partition('](')[2][:-1]
        if target.startswith(('http://', 'https://')):
            checkers.append(HttpLinkChecker(link, file))
        elif target and MD_TARGET_PATTERN.fullmatch(target):
            checkers.append(MarkdownLinkChecker(link, file))
        else:
            checkers.append(OtherLinkChecker(link, file))

    wrong_checkers = [checker for checker in checkers if not checker.is_valid()]
    if not wrong_checkers:
        continue
    # Key by the path relative to BASE_DIR: the glob is recursive, so the bare
    # file name is not unique and same-named files would overwrite each other.
    file_name_to_checker_list[file.relative_to(BASE_DIR).as_posix()] = wrong_checkers

 for file_name, checker_list in file_name_to_checker_list.items():
    report_lines = [f'`{file_name}` has {len(checker_list)} errors:\n']
    report_lines.extend(
        f"\t{checker.raw} -> {checker.invalid_reason()}\n" for checker in checker_list
    )

    print(''.join(report_lines))


 # %%
	# %%
	import re
	from pathlib import Path
	from typing import Union

	# Root directory of the markdown docs to check.
	# TODO: read this from user input instead of hard-coding a local path.
	BASE_DIR = Path('/Users/labuladong/IdeaProjects/pulsar/site2/docs/')

	class LinkChecker:
		"""Parse a single markdown link and run the checks shared by every link kind.

		``raw`` holds the link text as given, ``title`` and ``link`` hold the
		bracket and parenthesis contents, and ``file_path`` is the file the link
		was found in.
		"""

		def __init__(self, s: str, path: Union[str, Path]):
			"""Split ``s`` (``[title](link)`` or ``![title](link)``) into its parts.

			Args:
				s: Raw link text; it must start with the link itself.
				path: File that contains the link.

			Raises:
				AssertionError: If ``s`` does not start with a markdown link.
			"""
			# '.*?' (not '.?') so titles and targets of any length are captured.
			m = re.match(r'!?\[(.*?)\]\((.*?)\)', s)
			# Raised explicitly rather than via ``assert`` so the validation is
			# not stripped under ``python -O``; the exception type is unchanged.
			if m is None:
				raise AssertionError(f"{str(path)} -> {s} is not a universal pattern")
			self.raw = s
			self.title = m.group(1)
			self.link = m.group(2)
			self.file_path = Path(path)

		def __eq__(self, other):
			"""Two checkers are equal when their raw link text is equal."""
			# Defer to the other operand instead of failing on a missing ``.raw``.
			if not isinstance(other, LinkChecker):
				return NotImplemented
			return self.raw == other.raw

		def __hash__(self):
			"""Hash on the raw text so instances stay hashable alongside ``__eq__``."""
			return hash(self.raw)

		def __str__(self):
			return self.raw

		def is_valid(self) -> bool:
			"""Return True when ``invalid_reason()`` reports no problem."""
			return not self.invalid_reason()

		def invalid_reason(self) -> Union[None, str]:
			"""Return a description of the first problem found, or None if valid."""
			if not self.link:
				return f"{self.raw} missing link"
			# Image links (leading '!') are allowed to have an empty title.
			if not self.raw.startswith('!') and not self.title:
				return f"{self.raw} missing title"
			if not self.file_path.exists():
				return f"{self.raw} path not exists"
			return None


	# like '[this page](schema-manage.md#set-schema-compatibility-check-strategy)'
	class MarkdownLinkChecker(LinkChecker):
		"""Check links to markdown files and/or heading anchors.

		Handles links such as
		``[this page](schema-manage.md#set-schema-compatibility-check-strategy)``
		and anchor-only links such as ``[here](#some-heading)``.
		"""

		def __init__(self, s: str, path: Union[str, Path]):
			"""Parse the ``*.md`` path and ``#anchor`` parts of the link target.

			Args:
				s: Raw link text.
				path: File that contains the link.

			Raises:
				AssertionError: If ``s`` is not a link whose target is an optional
					``*.md`` path followed by an optional ``#anchor``.
			"""
			super().__init__(s, path)

			# '.*?' (not '.?') so titles and paths of any length are captured;
			# '!?' mirrors the base-class pattern so image-style links are accepted.
			m = re.match(r'!?\[(.*?)\]\((.*?\.md)?(#.*?)?\)', s)
			if m is None:
				raise AssertionError(f"{str(path)} -> {s} is not a markdown pattern")

			md_path = m.group(2)
			if md_path:
				# The target is resolved relative to the linking file's directory,
				# so a leading '/' is dropped.
				self.md_link_path = md_path.lstrip('/')
			else:
				# Anchor-only link: it points into the linking file itself.
				# ``self.file_path`` is always a Path, unlike ``path`` which may be a str.
				self.md_link_path = self.file_path.name

			self.md_anchor = m.group(3)

		def invalid_reason(self) -> Union[None, str]:
			"""Return the first problem found with this markdown link, or None."""
			# universal link error
			reason = super().invalid_reason()
			if reason:
				return reason

			# wrong md format
			if not self.md_link_path:
				return f"{self.md_link_path} in not a valid markdown path"

			link_abs_path: Path = self.file_path.parent / self.md_link_path
			# file not exist
			if not link_abs_path.exists():
				return f"{self.md_link_path} not exist"

			# check anchor; the target file is only read when there is one to look up
			if self.md_anchor:
				title_set = MarkdownLinkChecker.get_title_set(link_abs_path)
				if self.md_anchor.lstrip('#') not in title_set:
					return f"{self.md_link_path} doesn't have anchor: {self.md_anchor}"
			return None

		@staticmethod
		def get_title_set(md_file: Path) -> set[str]:
			"""Return the anchor slugs derived from the headings in ``md_file``.

			A heading is a run of '#' followed by whitespace and the title text.
			The slug is the title with some punctuation removed, lower-cased and
			with spaces replaced by '-', e.g. 'This is a title' -> 'this-is-a-title'.
			"""
			content = md_file.read_text(encoding='utf-8')
			# '(?:\n|$)' also accepts a heading on the last line of a file that
			# has no trailing newline.
			headings = re.findall(r'#+\s+(.*)(?:\n|$)', content)
			slugs = set()
			for heading in headings:
				cleaned = re.sub(r'[()`\'\"/?:.,*]', '', heading.strip())
				slugs.add(cleaned.lower().replace(' ', '-'))
			return slugs


	# todo: handle links like '[this page](https://www.google.com)'
	class HttpLinkChecker(LinkChecker):
		"""Checker for HTTP links; adds nothing yet, so only the inherited checks run."""


	# todo: handle links like '[this page](/tool/admin)'
	class OtherLinkChecker(LinkChecker):
		"""Checker for every remaining link kind; only the inherited checks run."""


	# %%
	# '**/*.md' (not '*/.md') so markdown files at every depth are found.
	md_files = list(BASE_DIR.glob('**/*.md'))

	# Shape of a link target that MarkdownLinkChecker can parse: an optional
	# '*.md' path followed by an optional '#anchor'.
	MD_TARGET_PATTERN = re.compile(r'(?:.*?\.md)?(?:#.*?)?')

	file_name_to_checker_list = dict()
	for file in md_files:
		text = file.read_text(encoding='utf-8')
		# filter all link in md file ('.*?', not '.?', so any length matches)
		links = re.findall(r'!?\[.*?\]\(.*?\)', text)
		if not links:
			continue

		checkers = []
		for link in links:
			# Classify on the link target only, so words such as 'http' or '.md'
			# in the title cannot misroute the link.
			target = link.partition('](')[2][:-1]
			if target.startswith(('http://', 'https://')):
				checkers.append(HttpLinkChecker(link, file))
			elif target and MD_TARGET_PATTERN.fullmatch(target):
				checkers.append(MarkdownLinkChecker(link, file))
			else:
				checkers.append(OtherLinkChecker(link, file))

		wrong_checkers = [checker for checker in checkers if not checker.is_valid()]
		if not wrong_checkers:
			continue
		# Key by the path relative to BASE_DIR: the glob is recursive, so the bare
		# file name is not unique and same-named files would overwrite each other.
		file_name_to_checker_list[file.relative_to(BASE_DIR).as_posix()] = wrong_checkers

	for file_name, checker_list in file_name_to_checker_list.items():
		report_lines = [f'`{file_name}` has {len(checker_list)} errors:\n']
		report_lines.extend(
			f"\t{checker.raw} -> {checker.invalid_reason()}\n" for checker in checker_list
		)

		print(''.join(report_lines))


	# %%