Last active
July 15, 2024 23:26
-
-
Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.
Convert markdown text to unicode characters where possible
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
def markdown_to_unicode(md_text):
    """Render a small subset of Markdown as plain text with Unicode styling.

    Handled constructs:

    * ``**bold**`` -> Mathematical Sans-Serif Bold letters and digits.
    * ``*italic*`` -> Mathematical Italic letters (Unicode has no italic
      digits, so digits are left as-is).
    * ``# Heading`` (levels 1-6) -> the whole line in bold; the ``#`` marks
      are kept, redundant ``**``/``*`` emphasis markers are dropped.
    * ``- item`` / ``* item`` bullets, optionally indented -> U+2022 bullet.
    * Fenced code blocks (lines starting with three backticks) and their
      contents are passed through untouched.

    Args:
        md_text: Markdown source text with newline-separated lines.

    Returns:
        The converted text, with the same number of lines as the input.
    """

    def styled_alphabet(first_plain, first_styled, count):
        # Map `count` consecutive ASCII characters onto `count` consecutive
        # styled code points.
        start = ord(first_plain)
        return {chr(start + i): chr(first_styled + i) for i in range(count)}

    # The tables are built from code points (not literal glyphs) so the source
    # stays pure ASCII and cannot be damaged by an encoding round-trip.
    bold_map = {
        **styled_alphabet("0", 0x1D7EC, 10),  # sans-serif bold digits
        **styled_alphabet("a", 0x1D5EE, 26),  # sans-serif bold small letters
        **styled_alphabet("A", 0x1D5D4, 26),  # sans-serif bold capitals
    }
    italic_map = {
        **styled_alphabet("a", 0x1D44E, 26),  # mathematical italic small
        **styled_alphabet("A", 0x1D434, 26),  # mathematical italic capitals
        # U+1D455 (where italic 'h' would sit) is reserved; the real glyph
        # is U+210E PLANCK CONSTANT.
        "h": "\u210e",
    }

    # Emphasis content must start and end with something other than
    # whitespace or '*'. That keeps "2 * 3 * 4", a bare "**" and "***" rules
    # from being mistaken for emphasis (and silently eaten).
    bold_re = re.compile(r"\*\*(?=[^\s*])(.+?)(?<=[^\s*])\*\*")
    italic_re = re.compile(r"\*(?=[^\s*])(.+?)(?<=[^\s*])\*")
    bullet_re = re.compile(r"([ \t]*)[-*] ")
    heading_re = re.compile(r"#{1,6} ")

    def replace(text, char_map):
        return "".join(char_map.get(c, c) for c in text)

    def handle_styles(text):
        # Bold first so that '**' is not consumed as two italic markers.
        text = bold_re.sub(lambda m: replace(m.group(1), bold_map), text)
        text = italic_re.sub(lambda m: replace(m.group(1), italic_map), text)
        return text

    def strip_emphasis(text):
        # Drop emphasis markers but keep their content unchanged.
        return italic_re.sub(r"\1", bold_re.sub(r"\1", text))

    converted = []
    in_code = False
    for line in md_text.split("\n"):
        if line.lstrip().startswith("```"):
            # Opening and closing fences are both emitted verbatim.
            in_code = not in_code
            converted.append(line)
            continue
        if in_code:
            converted.append(line)
            continue

        bullet = bullet_re.match(line)
        if bullet:
            line = f"{bullet.group(1)}\u2022 {line[bullet.end():]}"

        if heading_re.match(line):
            # Treat all headings the same, but keep the '#'. The whole line
            # is bold already, so inner emphasis markers are just noise.
            line = replace(strip_emphasis(line), bold_map)
        else:
            line = handle_styles(line)
        converted.append(line)

    return "\n".join(converted)
if __name__ == "__main__":
    # Example usage: convert a sample covering every supported construct
    # (inline bold/italic, headings, both bullet styles, a fenced code block)
    # and print the result.
    markdown_text = """
Normal, **bold** and *italic* text.
# Heading 1
- Bullet point 1
- Bullet point 2 with **bold** and *italic* and **more bold**
* Star-based bullet *with italics* and **bold text**
```py
# Comment in a code block
x = "A string with *italics* markers that should be ignored"
```
## Now a level 2 heading
And some more text
"""
    print(markdown_to_unicode(markdown_text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It's a long, long way from complete coverage of Markdown, but it makes LLM output text more readable for the major use cases.