davidgilbertson · July 15, 2024 23:26 · MLCole · Jul 12, 2024
diff --git a/markdown_to_unicode.py b/markdown_to_unicode.py
 # Licensed under the MIT-0 License. See https://opensource.org/licenses/MIT-0 for details.
 import re


 def markdown_to_unicode(md_text):
     """
     Render a small subset of Markdown as plain text with Unicode styling.

     Text wrapped in ``**`` is rewritten with bold characters and text wrapped
     in ``*`` with italic characters; the asterisks themselves are dropped.
     A line starting with ``- `` or ``* `` becomes a ``• `` bullet. A heading
     line (one to six ``#`` followed by a space) is rendered entirely in bold
     and keeps its ``#`` prefix. Lines from an opening code fence up to, but
     not including, the closing fence are returned unchanged.

     Only ASCII letters (and, for bold, ASCII digits) have styled
     counterparts; every other character passes through as-is.

     Parameters
     ----------
     md_text : str
         The input Markdown text to be converted.

     Returns
     -------
     str
         The converted text with Unicode characters.
     """
     # Yes you could be clever and generate the below by checking the codepoint offsets, like ('𝑎' - 'a')
     # But you would soon learn that 'ℎ' is special and there's 5 different offsets and it's simpler
     # just to list them out in a map.

     # fmt: off
     bold_map = {
         "0": "𝟬", "1": "𝟭", "2": "𝟮", "3": "𝟯", "4": "𝟰", "5": "𝟱", "6": "𝟲", "7": "𝟳", "8": "𝟴", "9": "𝟵",
         "a": "𝗮", "b": "𝗯", "c": "𝗰", "d": "𝗱", "e": "𝗲", "f": "𝗳", "g": "𝗴", "h": "𝗵", "i": "𝗶", "j": "𝗷", "k": "𝗸", "l": "𝗹", "m": "𝗺", "n": "𝗻", "o": "𝗼", "p": "𝗽", "q": "𝗾", "r": "𝗿", "s": "𝘀", "t": "𝘁", "u": "𝘂", "v": "𝘃", "w": "𝘄", "x": "𝘅", "y": "𝘆", "z": "𝘇",
         "A": "𝗔", "B": "𝗕", "C": "𝗖", "D": "𝗗", "E": "𝗘", "F": "𝗙", "G": "𝗚", "H": "𝗛", "I": "𝗜", "J": "𝗝", "K": "𝗞", "L": "𝗟", "M": "𝗠", "N": "𝗡", "O": "𝗢", "P": "𝗣", "Q": "𝗤", "R": "𝗥", "S": "𝗦", "T": "𝗧", "U": "𝗨", "V": "𝗩", "W": "𝗪", "X": "𝗫", "Y": "𝗬", "Z": "𝗭",
     }

     italic_map = {
         "a": "𝑎", "b": "𝑏", "c": "𝑐", "d": "𝑑", "e": "𝑒", "f": "𝑓", "g": "𝑔", "h": "ℎ", "i": "𝑖", "j": "𝑗", "k": "𝑘", "l": "𝑙", "m": "𝑚", "n": "𝑛", "o": "𝑜", "p": "𝑝", "q": "𝑞", "r": "𝑟", "s": "𝑠", "t": "𝑡", "u": "𝑢", "v": "𝑣", "w": "𝑤", "x": "𝑥", "y": "𝑦", "z": "𝑧",
         "A": "𝐴", "B": "𝐵", "C": "𝐶", "D": "𝐷", "E": "𝐸", "F": "𝐹", "G": "𝐺", "H": "𝐻", "I": "𝐼", "J": "𝐽", "K": "𝐾", "L": "𝐿", "M": "𝑀", "N": "𝑁", "O": "𝑂", "P": "𝑃", "Q": "𝑄", "R": "𝑅", "S": "𝑆", "T": "𝑇", "U": "𝑈", "V": "𝑉", "W": "𝑊", "X": "𝑋", "Y": "𝑌", "Z": "𝑍",
     }
     # fmt: on

     # Translation tables let str.translate do the per-character swap in one pass.
     bold_table = str.maketrans(bold_map)
     italic_table = str.maketrans(italic_map)

     # Emphasised text must be non-empty and must neither start nor end with
     # whitespace or an asterisk. Without that restriction a stray "**", a
     # "***" rule, or spaced-out asterisks such as "2 * 3 * 4" would be
     # mistaken for emphasis and silently lose their asterisks.
     bold_pattern = re.compile(r"\*\*(?=[^\s*])(.+?)(?<=[^\s*])\*\*")
     italic_pattern = re.compile(r"\*(?=[^\s*])(.+?)(?<=[^\s*])\*")
     heading_pattern = re.compile(r"#{1,6} ")

     def handle_styles(text):
         # Bold runs first so that "**" is never read as two italic markers.
         text = bold_pattern.sub(lambda m: m.group(1).translate(bold_table), text)
         return italic_pattern.sub(lambda m: m.group(1).translate(italic_table), text)

     lines = md_text.split("\n")
     in_code = False
     for i, line in enumerate(lines):
         if line.startswith("```"):
             in_code = not in_code

         if in_code:
             # Opening fence and code body stay exactly as written. The closing
             # fence flips the flag back and is handled like ordinary text.
             continue

         if line.startswith(("- ", "* ")):
             line = f"• {line[2:]}"

         if heading_pattern.match(line):
             # Treat all headings the same, but keep the '#'
             line = line.translate(bold_table)
         else:
             line = handle_styles(line)

         lines[i] = line

     return "\n".join(lines)


 if __name__ == "__main__":
     # The sample is assembled from one string per line instead of a
     # triple-quoted literal, so the indentation of this file can never leak
     # into the Markdown. Headings, bullets and code fences are only
     # recognised when they sit at the very start of a line.
     markdown_text = "\n".join(
         [
             "",
             "Normal, **bold** and *italic* text.",
             "",
             "# Heading 1",
             "- Bullet point 1",
             "- Bullet point 2 with **bold** and *italics* and **more bold**",
             "* Star-based bullet *with italics* and **bold text** ",
             "",
             "```py",
             "# Comment in a code block",
             'x = "A string with *italics* markers that should be ignored"',
             "```",
             "",
             "## Now a level 2 heading",
             "And some more text",
             "",
         ]
     )

     print(markdown_to_unicode(markdown_text))
	# Licensed under the MIT-0 License. See https://opensource.org/licenses/MIT-0 for details.
	import re


	def markdown_to_unicode(md_text):
	    """
	    Render a small subset of Markdown as plain text with Unicode styling.

	    Text wrapped in ``**`` is rewritten with bold characters and text wrapped
	    in ``*`` with italic characters; the asterisks themselves are dropped.
	    A line starting with ``- `` or ``* `` becomes a ``• `` bullet. A heading
	    line (one to six ``#`` followed by a space) is rendered entirely in bold
	    and keeps its ``#`` prefix. Lines from an opening code fence up to, but
	    not including, the closing fence are returned unchanged.

	    Only ASCII letters (and, for bold, ASCII digits) have styled
	    counterparts; every other character passes through as-is.

	    Parameters
	    ----------
	    md_text : str
	        The input Markdown text to be converted.

	    Returns
	    -------
	    str
	        The converted text with Unicode characters.
	    """
	    # Yes you could be clever and generate the below by checking the codepoint offsets, like ('𝑎' - 'a')
	    # But you would soon learn that 'ℎ' is special and there's 5 different offsets and it's simpler
	    # just to list them out in a map.

	    # fmt: off
	    bold_map = {
	        "0": "𝟬", "1": "𝟭", "2": "𝟮", "3": "𝟯", "4": "𝟰", "5": "𝟱", "6": "𝟲", "7": "𝟳", "8": "𝟴", "9": "𝟵",
	        "a": "𝗮", "b": "𝗯", "c": "𝗰", "d": "𝗱", "e": "𝗲", "f": "𝗳", "g": "𝗴", "h": "𝗵", "i": "𝗶", "j": "𝗷", "k": "𝗸", "l": "𝗹", "m": "𝗺", "n": "𝗻", "o": "𝗼", "p": "𝗽", "q": "𝗾", "r": "𝗿", "s": "𝘀", "t": "𝘁", "u": "𝘂", "v": "𝘃", "w": "𝘄", "x": "𝘅", "y": "𝘆", "z": "𝘇",
	        "A": "𝗔", "B": "𝗕", "C": "𝗖", "D": "𝗗", "E": "𝗘", "F": "𝗙", "G": "𝗚", "H": "𝗛", "I": "𝗜", "J": "𝗝", "K": "𝗞", "L": "𝗟", "M": "𝗠", "N": "𝗡", "O": "𝗢", "P": "𝗣", "Q": "𝗤", "R": "𝗥", "S": "𝗦", "T": "𝗧", "U": "𝗨", "V": "𝗩", "W": "𝗪", "X": "𝗫", "Y": "𝗬", "Z": "𝗭",
	    }

	    italic_map = {
	        "a": "𝑎", "b": "𝑏", "c": "𝑐", "d": "𝑑", "e": "𝑒", "f": "𝑓", "g": "𝑔", "h": "ℎ", "i": "𝑖", "j": "𝑗", "k": "𝑘", "l": "𝑙", "m": "𝑚", "n": "𝑛", "o": "𝑜", "p": "𝑝", "q": "𝑞", "r": "𝑟", "s": "𝑠", "t": "𝑡", "u": "𝑢", "v": "𝑣", "w": "𝑤", "x": "𝑥", "y": "𝑦", "z": "𝑧",
	        "A": "𝐴", "B": "𝐵", "C": "𝐶", "D": "𝐷", "E": "𝐸", "F": "𝐹", "G": "𝐺", "H": "𝐻", "I": "𝐼", "J": "𝐽", "K": "𝐾", "L": "𝐿", "M": "𝑀", "N": "𝑁", "O": "𝑂", "P": "𝑃", "Q": "𝑄", "R": "𝑅", "S": "𝑆", "T": "𝑇", "U": "𝑈", "V": "𝑉", "W": "𝑊", "X": "𝑋", "Y": "𝑌", "Z": "𝑍",
	    }
	    # fmt: on

	    # Translation tables let str.translate do the per-character swap in one pass.
	    bold_table = str.maketrans(bold_map)
	    italic_table = str.maketrans(italic_map)

	    # Emphasised text must be non-empty and must neither start nor end with
	    # whitespace or an asterisk. Without that restriction a stray "**", a
	    # "***" rule, or spaced-out asterisks such as "2 * 3 * 4" would be
	    # mistaken for emphasis and silently lose their asterisks.
	    bold_pattern = re.compile(r"\*\*(?=[^\s*])(.+?)(?<=[^\s*])\*\*")
	    italic_pattern = re.compile(r"\*(?=[^\s*])(.+?)(?<=[^\s*])\*")
	    heading_pattern = re.compile(r"#{1,6} ")

	    def handle_styles(text):
	        # Bold runs first so that "**" is never read as two italic markers.
	        text = bold_pattern.sub(lambda m: m.group(1).translate(bold_table), text)
	        return italic_pattern.sub(lambda m: m.group(1).translate(italic_table), text)

	    lines = md_text.split("\n")
	    in_code = False
	    for i, line in enumerate(lines):
	        if line.startswith("```"):
	            in_code = not in_code

	        if in_code:
	            # Opening fence and code body stay exactly as written. The closing
	            # fence flips the flag back and is handled like ordinary text.
	            continue

	        if line.startswith(("- ", "* ")):
	            line = f"• {line[2:]}"

	        if heading_pattern.match(line):
	            # Treat all headings the same, but keep the '#'
	            line = line.translate(bold_table)
	        else:
	            line = handle_styles(line)

	        lines[i] = line

	    return "\n".join(lines)


	if __name__ == "__main__":
	    # The sample is assembled from one string per line instead of a
	    # triple-quoted literal, so the indentation of this file can never leak
	    # into the Markdown. Headings, bullets and code fences are only
	    # recognised when they sit at the very start of a line.
	    markdown_text = "\n".join(
	        [
	            "",
	            "Normal, **bold** and *italic* text.",
	            "",
	            "# Heading 1",
	            "- Bullet point 1",
	            "- Bullet point 2 with **bold** and *italics* and **more bold**",
	            "* Star-based bullet *with italics* and **bold text**",
	            "",
	            "```py",
	            "# Comment in a code block",
	            'x = "A string with *italics* markers that should be ignored"',
	            "```",
	            "",
	            "## Now a level 2 heading",
	            "And some more text",
	            "",
	        ]
	    )

	    print(markdown_to_unicode(markdown_text))