Last active
July 15, 2024 23:26
-
-
Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.
Revisions
-
davidgilbertson revised this gist
Jul 15, 2024 . 1 changed file with 20 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,8 +1,24 @@ # Licensed under the MIT-0 License. See https://opensource.org/licenses/MIT-0 for details. import re def markdown_to_unicode(md_text): """ Converts Markdown text to Unicode by transforming Markdown syntax for bold and italic text into their corresponding Unicode characters, handling lists and headings, and preserving the formatting within code blocks. Parameters ---------- md_text : str The input Markdown text to be converted. Returns ------- str The converted text with Unicode characters. """ # Yes you could be clever and generate the below by checking the codepoint offsets, like ('๐' - 'a') # But you would soon learn that 'โ' is special and there's 5 different offsets and its simpler # just to list them out in a map. @@ -17,7 +33,6 @@ def markdown_to_unicode(md_text): "a": "๐", "b": "๐", "c": "๐", "d": "๐", "e": "๐", "f": "๐", "g": "๐", "h": "โ", "i": "๐", "j": "๐", "k": "๐", "l": "๐", "m": "๐", "n": "๐", "o": "๐", "p": "๐", "q": "๐", "r": "๐", "s": "๐ ", "t": "๐ก", "u": "๐ข", "v": "๐ฃ", "w": "๐ค", "x": "๐ฅ", "y": "๐ฆ", "z": "๐ง", "A": "๐ด", "B": "๐ต", "C": "๐ถ", "D": "๐ท", "E": "๐ธ", "F": "๐น", "G": "๐บ", "H": "๐ป", "I": "๐ผ", "J": "๐ฝ", "K": "๐พ", "L": "๐ฟ", "M": "๐", "N": "๐", "O": "๐", "P": "๐", "Q": "๐", "R": "๐ ", "S": "๐", "T": "๐", "U": "๐", "V": "๐", "W": "๐", "X": "๐", "Y": "๐", "Z": "๐", } # fmt: on def replace(text, char_map): @@ -60,18 +75,19 @@ def handle_styles(text): if __name__ == "__main__": markdown_text = """ Normal, **bold** and *italic* text. 
# Heading 1 - Bullet point 1 - Bullet point 2 with **bold** and *italics* and **more bold** * Star-based bullet *with italics* and **bold text** ```py # Comment in a code block x = "A string with *italics* markers that should be ignored" ``` ## Now a level 2 heading And some more text """ -
davidgilbertson created this gist
Jul 11, 2024 .There are no files selected for viewing
import re

# The styled glyphs are produced from code points rather than pasted in as
# literal characters: this file must survive being copied through tools that
# mis-decode non-ASCII text, which silently corrupts literal glyph tables.
# There are five independent offsets (bold digits, bold upper, bold lower,
# italic upper, italic lower) plus one irregular character: italic "h" is
# U+210E (PLANCK CONSTANT), because U+1D455 is reserved in Unicode.
_BOLD_DIGIT_BASE = 0x1D7EC  # MATHEMATICAL SANS-SERIF BOLD DIGIT ZERO
_BOLD_UPPER_BASE = 0x1D5D4  # MATHEMATICAL SANS-SERIF BOLD CAPITAL A
_BOLD_LOWER_BASE = 0x1D5EE  # MATHEMATICAL SANS-SERIF BOLD SMALL A
_ITALIC_UPPER_BASE = 0x1D434  # MATHEMATICAL ITALIC CAPITAL A
_ITALIC_LOWER_BASE = 0x1D44E  # MATHEMATICAL ITALIC SMALL A
_ITALIC_SMALL_H = 0x210E  # PLANCK CONSTANT (the italic "h")

_BULLET = "\u2022"


def _offset_table(ranges):
    """Build a ``str.translate`` table from ``(first_char, count, base)`` runs."""
    table = {}
    for first_char, count, base in ranges:
        start = ord(first_char)
        for i in range(count):
            table[start + i] = base + i
    return table


_BOLD_TABLE = _offset_table(
    (
        ("0", 10, _BOLD_DIGIT_BASE),
        ("A", 26, _BOLD_UPPER_BASE),
        ("a", 26, _BOLD_LOWER_BASE),
    )
)
_ITALIC_TABLE = _offset_table(
    (
        ("A", 26, _ITALIC_UPPER_BASE),
        ("a", 26, _ITALIC_LOWER_BASE),
    )
)
_ITALIC_TABLE[ord("h")] = _ITALIC_SMALL_H

# Bold must be matched before italic, otherwise "**x**" is consumed as two
# empty italic spans.
_BOLD_RE = re.compile(r"\*\*(.*?)\*\*")
_ITALIC_RE = re.compile(r"\*(.*?)\*")
_HEADING_RE = re.compile(r"#{1,6} ")


def _handle_styles(text):
    """Replace ``**bold**`` and ``*italic*`` spans with styled Unicode letters."""
    text = _BOLD_RE.sub(lambda m: m.group(1).translate(_BOLD_TABLE), text)
    text = _ITALIC_RE.sub(lambda m: m.group(1).translate(_ITALIC_TABLE), text)
    return text


def markdown_to_unicode(md_text):
    """
    Convert a subset of Markdown to plain text styled with Unicode characters.

    ``**bold**`` spans become Mathematical Sans-Serif Bold letters and digits,
    ``*italic*`` spans become Mathematical Italic letters (there are no italic
    digits, so digits inside italics are left as-is). Lines starting with
    ``- `` or ``* `` become bullet lines, heading lines (``#`` to ``######``
    followed by a space) are rendered entirely in bold with the ``#`` markers
    kept, and everything between triple-backtick fences is left untouched.

    Parameters
    ----------
    md_text : str
        The input Markdown text to be converted.

    Returns
    -------
    str
        The converted text, with the same number of lines as the input.
    """
    converted = []
    in_code = False
    for line in md_text.split("\n"):
        if line.startswith("```"):
            # Fence lines toggle code mode and are emitted verbatim.
            in_code = not in_code
            converted.append(line)
            continue
        if in_code:
            converted.append(line)
            continue

        if line.startswith(("- ", "* ")):
            line = f"{_BULLET} {line[2:]}"

        if _HEADING_RE.match(line):
            # Treat all headings the same, but keep the '#'
            line = line.translate(_BOLD_TABLE)
        else:
            line = _handle_styles(line)

        converted.append(line)

    return "\n".join(converted)


if __name__ == "__main__":
    # Example usage
    markdown_text = """
Normal, **bold** and *italic* text.

# Heading 1

- Bullet point 1
- Bullet point 2 with **bold** and *italic* and **more bold**
* Star-based bullet *with italics* and **bold text**

```py
# Comment in a code block
x = "A string with *italics* markers that should be ignored"
```

## Now a level 2 heading
And some more text
"""

    print(markdown_to_unicode(markdown_text))