Last active
July 15, 2024 23:26
-
-
Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.
Convert markdown text to unicode characters where possible
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Licensed under the MIT-0 License. See https://opensource.org/licenses/MIT-0 for details.
import re
def _build_style_table(lower_start, upper_start, digit_start=None, overrides=None):
    """Build a ``str.translate`` table from ASCII alphanumerics to a styled alphabet.

    Parameters
    ----------
    lower_start, upper_start : int
        Code points of the styled "a" and "A"; the other 25 letters follow
        contiguously.
    digit_start : int, optional
        Code point of the styled "0". Omit when the style has no digits.
    overrides : dict of str to str, optional
        Per-character exceptions applied after the contiguous runs.

    Returns
    -------
    dict
        Mapping of ASCII code point to styled character.
    """
    runs = [("a", lower_start, 26), ("A", upper_start, 26)]
    if digit_start is not None:
        runs.append(("0", digit_start, 10))
    table = {}
    for ascii_start, styled_start, length in runs:
        for offset in range(length):
            table[ord(ascii_start) + offset] = chr(styled_start + offset)
    for char, styled in (overrides or {}).items():
        table[ord(char)] = styled
    return table


# The tables are derived from code points (rather than listed as literal
# characters) so this source stays pure ASCII and cannot be corrupted by a
# wrong-encoding round trip.
# Bold: Mathematical Sans-Serif Bold letters and digits.
_BOLD_TABLE = _build_style_table(0x1D5EE, 0x1D5D4, digit_start=0x1D7EC)
# Italic: Mathematical Italic letters. The slot where "h" would be (U+1D455)
# is unassigned; Unicode uses PLANCK CONSTANT (U+210E) instead. There are no
# italic digits, so digits pass through unchanged.
_ITALIC_TABLE = _build_style_table(0x1D44E, 0x1D434, overrides={"h": "\u210e"})

_BULLET = "\u2022"
_FENCE = "```"
# Stand-in for an inline code span while emphasis is being processed.
_CODE_MASK = "\x00"

_CODE_SPAN_RE = re.compile(r"`[^`]+`")
_BULLET_RE = re.compile(r"(\s*)[-*] ")
_HEADING_RE = re.compile(r"#{1,6} ")
# The content must start and end with a character that is neither whitespace
# nor "*". That keeps "2 * 3 * 4", "a ** b" and "***" rules intact.
_BOLD_RE = re.compile(r"\*\*(?=[^\s*])(.+?)(?<=[^\s*])\*\*")
_ITALIC_RE = re.compile(r"\*(?=[^\s*])(.+?)(?<=[^\s*])\*")


def _style_inline(text):
    """Replace ``**bold**`` / ``*italic*`` runs in one line, leaving inline code alone."""
    code_spans = []
    # Mask inline code so that stars inside it (e.g. `*args`) are never read
    # as emphasis markers. Skipped if the text already contains the mask
    # character, since restoring would then be ambiguous.
    if _CODE_MASK not in text:
        code_spans = _CODE_SPAN_RE.findall(text)
        if code_spans:
            text = _CODE_SPAN_RE.sub(_CODE_MASK, text)

    # Bold first: otherwise the italic pattern would consume one star of "**".
    text = _BOLD_RE.sub(lambda m: m.group(1).translate(_BOLD_TABLE), text)
    text = _ITALIC_RE.sub(lambda m: m.group(1).translate(_ITALIC_TABLE), text)

    # Styling never adds, removes or reorders masks, so restore left to right.
    for span in code_spans:
        text = text.replace(_CODE_MASK, span, 1)
    return text


def markdown_to_unicode(md_text: str) -> str:
    """
    Convert Markdown text to plain text styled with Unicode characters.

    Bold and italic runs are replaced with the matching Unicode "mathematical"
    letters, list markers become bullets, headings are rendered in bold (the
    leading '#' characters are kept), and fenced code blocks and inline code
    spans are passed through untouched.

    Parameters
    ----------
    md_text : str
        The input Markdown text to be converted.

    Returns
    -------
    str
        The converted text with Unicode characters.
    """
    converted = []
    in_code = False
    for line in md_text.split("\n"):
        # Fences may be indented (e.g. inside a list item).
        if line.lstrip().startswith(_FENCE):
            in_code = not in_code
            converted.append(line)
            continue
        if in_code:
            converted.append(line)
            continue

        bullet = _BULLET_RE.match(line)
        if bullet:
            # Keep the indentation so nested lists stay nested.
            line = f"{bullet.group(1)}{_BULLET} {line[bullet.end():]}"

        is_heading = _HEADING_RE.match(line) is not None
        line = _style_inline(line)
        if is_heading:
            # Treat all headings the same, but keep the '#'. Emphasis markers
            # were already consumed above, so no literal stars are left over.
            line = line.translate(_BOLD_TABLE)
        converted.append(line)
    return "\n".join(converted)
if __name__ == "__main__":
    # Small demo covering inline styles, a heading, both bullet markers and a
    # fenced code block whose contents must come through unchanged.
    markdown_text = """
Normal, **bold** and *italic* text.
# Heading 1
- Bullet point 1
- Bullet point 2 with **bold** and *italics* and **more bold**
* Star-based bullet *with italics* and **bold text**
```py
# Comment in a code block
x = "A string with *italics* markers that should be ignored"
```
## Now a level 2 heading
And some more text
"""
    print(markdown_to_unicode(markdown_text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It's a long long way from complete coverage of Markdown, but it makes LLM output text more readable for the major use cases.