Skip to content

Instantly share code, notes, and snippets.

@davidgilbertson
Last active July 15, 2024 23:26
Show Gist options
  • Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.
Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.
Convert markdown text to unicode characters where possible
# Licensed under the MIT-0 License. See https://opensource.org/licenses/MIT-0 for details.
import re
def _offset_map(first_char, count, target_start):
    """Map `count` consecutive characters starting at `first_char` onto the
    `count` consecutive code points starting at `target_start`."""
    base = ord(first_char)
    return {chr(base + i): chr(target_start + i) for i in range(count)}


# The styled alphabets are generated from code points rather than written as
# literal characters, so this file stays pure ASCII and cannot be damaged by
# an editor, paste, or transfer that mishandles the encoding.
_BOLD_MAP = {
    **_offset_map("0", 10, 0x1D7EC),  # MATHEMATICAL SANS-SERIF BOLD DIGIT ZERO..
    **_offset_map("a", 26, 0x1D5EE),  # MATHEMATICAL SANS-SERIF BOLD SMALL A..
    **_offset_map("A", 26, 0x1D5D4),  # MATHEMATICAL SANS-SERIF BOLD CAPITAL A..
}
_ITALIC_MAP = {
    **_offset_map("a", 26, 0x1D44E),  # MATHEMATICAL ITALIC SMALL A..
    **_offset_map("A", 26, 0x1D434),  # MATHEMATICAL ITALIC CAPITAL A..
    # The slot an italic "h" would occupy (U+1D455) is unassigned; Unicode
    # uses PLANCK CONSTANT instead, so it must override the computed entry.
    "h": "\u210e",
}
_BOLD_TABLE = str.maketrans(_BOLD_MAP)
_ITALIC_TABLE = str.maketrans(_ITALIC_MAP)

_BULLET = "\u2022"

# Emphasis delimiters must hug their content: the opening marker is followed
# by a non-space and the closing marker is preceded by one. This keeps stray
# asterisks such as "2 * 3 * 4" or an indented "* item" from being consumed.
_BOLD_RE = re.compile(r"\*\*(?=\S)(.+?)(?<=\S)\*\*")
# A single-asterisk delimiter must additionally not touch another asterisk,
# so leftover "**" runs are never half-matched as italics.
_ITALIC_RE = re.compile(r"(?<!\*)\*(?![\s*])(.+?)(?<![\s*])\*(?!\*)")
_HEADING_RE = re.compile(r"#{1,6} ")


def _style_inline(text):
    """Replace **bold** and *italic* spans in one line with styled characters."""
    # Bold first, so that "**" is not mistaken for two italic delimiters.
    text = _BOLD_RE.sub(lambda m: m.group(1).translate(_BOLD_TABLE), text)
    return _ITALIC_RE.sub(lambda m: m.group(1).translate(_ITALIC_TABLE), text)


def markdown_to_unicode(md_text):
    """
    Converts Markdown text to Unicode by transforming Markdown syntax for
    bold and italic text into their corresponding Unicode characters,
    handling lists and headings, and leaving fenced code blocks untouched.

    Processing is line based:

    * Lines starting with three backticks toggle "code block" mode. Fence
      lines and everything between them are returned unchanged.
    * A leading "- " or "* " is replaced by a bullet character (U+2022).
    * A line starting with one to six '#' followed by a space is a heading:
      the '#' prefix is kept, emphasis markers are dropped and every ASCII
      letter and digit on the line is made bold.
    * On any other line, ``**text**`` becomes bold and ``*text*`` becomes
      italic. Characters without a styled counterpart (punctuation,
      non-ASCII text, digits inside italics) are left as they are.

    Parameters
    ----------
    md_text : str
        The input Markdown text to be converted.

    Returns
    -------
    str
        The converted text with Unicode characters.
    """
    lines = md_text.split("\n")
    in_code = False
    for i, line in enumerate(lines):
        if line.startswith("```"):
            # Opening and closing fences are both emitted verbatim.
            in_code = not in_code
            continue
        if in_code:
            continue

        if line.startswith(("- ", "* ")):
            line = f"{_BULLET} {line[2:]}"

        if _HEADING_RE.match(line):
            # Treat all headings the same, but keep the '#'. The whole line is
            # bolded, so emphasis markers would only survive as literal
            # asterisks; strip them first.
            line = _BOLD_RE.sub(r"\1", line)
            line = _ITALIC_RE.sub(r"\1", line)
            line = line.translate(_BOLD_TABLE)
        else:
            line = _style_inline(line)
        lines[i] = line
    return "\n".join(lines)
if __name__ == "__main__":
    # Small demo: a sample document exercising inline styles, a heading,
    # both bullet markers and a fenced code block whose contents must be
    # left alone.
    demo_source = """
Normal, **bold** and *italic* text.
# Heading 1
- Bullet point 1
- Bullet point 2 with **bold** and *italics* and **more bold**
* Star-based bullet *with italics* and **bold text**
```py
# Comment in a code block
x = "A string with *italics* markers that should be ignored"
```
## Now a level 2 heading
And some more text
"""
    rendered = markdown_to_unicode(demo_source)
    print(rendered)
@davidgilbertson
Copy link
Author

It's a long long way from complete coverage of Markdown, but it makes LLM output text more readable for the major use cases.

image

@MLCole
Copy link

MLCole commented Jul 12, 2024

Neat!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment