Skip to content

Instantly share code, notes, and snippets.

@davidgilbertson
Last active July 15, 2024 23:26
Show Gist options
  • Select an option

  • Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.

Select an option

Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.
Convert Markdown text to Unicode characters where possible
import re
def _styled_table(lower_start, upper_start, digit_start=None):
    """Build a ``str.translate`` table from ASCII to a styled Unicode alphabet.

    Args:
        lower_start: Code point of the styled counterpart of ``"a"``.
        upper_start: Code point of the styled counterpart of ``"A"``.
        digit_start: Code point of the styled counterpart of ``"0"``, or
            ``None`` when the style has no digits.

    Returns:
        A dict mapping ASCII ordinals to single styled characters.
    """
    table = {}
    for offset in range(26):
        table[ord("a") + offset] = chr(lower_start + offset)
        table[ord("A") + offset] = chr(upper_start + offset)
    if digit_start is not None:
        for offset in range(10):
            table[ord("0") + offset] = chr(digit_start + offset)
    return table


# The tables are generated from code points (instead of being spelled out as
# literal characters) so this source stays pure ASCII and cannot be corrupted
# by a wrong encoding on save, copy or paste.
# Bold uses the "Mathematical Sans-Serif Bold" letters and digits.
_BOLD_TABLE = _styled_table(0x1D5EE, 0x1D5D4, 0x1D7EC)
# Italic uses the "Mathematical Italic" letters; that style has no digits.
_ITALIC_TABLE = _styled_table(0x1D44E, 0x1D434)
# The slot where italic "h" would sit (U+1D455) is reserved: the character
# lives at U+210E (PLANCK CONSTANT) instead, so it needs an explicit override.
_ITALIC_TABLE[ord("h")] = "\u210e"

_BULLET = "\u2022"
_BOLD_RE = re.compile(r"\*\*(.*?)\*\*")
_ITALIC_RE = re.compile(r"\*(.*?)\*")
_HEADING_RE = re.compile(r"#{1,6} ")


def markdown_to_unicode(md_text):
    """Render a small subset of Markdown using styled Unicode characters.

    Handled constructs, line by line:

    * fenced code blocks (lines starting with three backticks) are passed
      through untouched, including the fence lines themselves;
    * a leading ``"- "`` or ``"* "`` becomes a bullet character;
    * headings (1-6 ``#`` followed by a space) are rendered entirely in bold,
      keeping the ``#`` markers;
    * on every other line ``**text**`` becomes bold and ``*text*`` becomes
      italic, with the asterisks removed.

    Only ASCII letters (and, for bold, digits) have styled counterparts; every
    other character is left unchanged.

    Args:
        md_text: The Markdown source text.

    Returns:
        The converted text with the same number of lines as the input.
    """
    converted = []
    in_code = False
    for line in md_text.split("\n"):
        if line.startswith("```"):
            in_code = not in_code

        if in_code:
            # Opening fence and code lines are emitted verbatim. The closing
            # fence falls through, but contains nothing the rules below alter.
            converted.append(line)
            continue

        if line.startswith(("- ", "* ")):
            line = f"{_BULLET} {line[2:]}"

        if _HEADING_RE.match(line):
            # Treat all heading levels the same, but keep the '#'.
            line = line.translate(_BOLD_TABLE)
        else:
            # Bold must be resolved first, otherwise the italic pattern would
            # consume the double asterisks as two empty italic spans.
            line = _BOLD_RE.sub(lambda m: m.group(1).translate(_BOLD_TABLE), line)
            line = _ITALIC_RE.sub(lambda m: m.group(1).translate(_ITALIC_TABLE), line)

        converted.append(line)

    return "\n".join(converted)
if __name__ == "__main__":
    # Demo: run a sample covering inline styles, headings, both bullet
    # markers and a fenced code block through the converter and print it.
    sample = """
Normal, **bold** and *italic* text.
# Heading 1
- Bullet point 1
- Bullet point 2 with **bold** and *italic* and **more bold**
* Star-based bullet *with italics* and **bold text**
```py
# Comment in a code block
x = "A string with *italics* markers that should be ignored"
```
## Now a level 2 heading
And some more text
"""
    rendered = markdown_to_unicode(sample)
    print(rendered)
@davidgilbertson
Copy link
Copy Markdown
Author

It's a long long way from complete coverage of Markdown, but it makes LLM output text more readable for the major use cases.

image

@MLCole
Copy link
Copy Markdown

MLCole commented Jul 12, 2024

Neat!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment