davidgilbertson · July 15, 2024 23:26 · Jul 15, 2024 · Jul 11, 2024
diff --git a/markdown_to_unicode.py b/markdown_to_unicode.py
@@ -1,8 +1,24 @@
+# Licensed under the MIT-0 License. See https://opensource.org/licenses/MIT-0 for details.
 import re
 
 
 def markdown_to_unicode(md_text):
-    # Yes you could be clever and generate these by checking the codepoint offsets, like ('𝑎' - 'a')
+    """
+    Convert Markdown text to plain text styled with Unicode characters: bold and
+    italic spans are replaced by look-alike Unicode characters, list bullets become "•",
+    headings are rendered in bold, and lines inside code blocks are left unchanged.
+
+    Parameters
+    ----------
+    md_text : str
+        The input Markdown text to be converted.
+
+    Returns
+    -------
+    str
+        The converted text with Unicode characters.
+    """
+    # Yes, you could be clever and generate the maps below by checking the codepoint offsets, like ('𝑎' - 'a').
     # But you would soon learn that 'ℎ' is special and there are 5 different offsets, and it's simpler
     # just to list them out in a map.
 
@@ -17,7 +33,6 @@ def markdown_to_unicode(md_text):
         "a": "𝑎", "b": "𝑏", "c": "𝑐", "d": "𝑑", "e": "𝑒", "f": "𝑓", "g": "𝑔", "h": "ℎ", "i": "𝑖", "j": "𝑗", "k": "𝑘", "l": "𝑙", "m": "𝑚", "n": "𝑛", "o": "𝑜", "p": "𝑝", "q": "𝑞", "r": "𝑟", "s": "𝑠", "t": "𝑡", "u": "𝑢", "v": "𝑣", "w": "𝑤", "x": "𝑥", "y": "𝑦", "z": "𝑧",
         "A": "𝐴", "B": "𝐵", "C": "𝐶", "D": "𝐷", "E": "𝐸", "F": "𝐹", "G": "𝐺", "H": "𝐻", "I": "𝐼", "J": "𝐽", "K": "𝐾", "L": "𝐿", "M": "𝑀", "N": "𝑁", "O": "𝑂", "P": "𝑃", "Q": "𝑄", "R": "𝑅", "S": "𝑆", "T": "𝑇", "U": "𝑈", "V": "𝑉", "W": "𝑊", "X": "𝑋", "Y": "𝑌", "Z": "𝑍",
     }
-
     # fmt: on
 
     def replace(text, char_map):
@@ -60,18 +75,19 @@ def handle_styles(text):
 
 
 if __name__ == "__main__":
-    # Example usage
     markdown_text = """
 Normal, **bold** and *italic* text.
 
 # Heading 1
 - Bullet point 1
-- Bullet point 2 with **bold** and *italic* and **more bold**
+- Bullet point 2 with **bold** and *italics* and **more bold**
 * Star-based bullet *with italics* and **bold text** 
+
 ```py
 # Comment in a code block
 x = "A string with *italics* markers that should be ignored"
 ```
+
 ## Now a level 2 heading
 And some more text
     """

diff --git a/markdown_to_unicode.py b/markdown_to_unicode.py
@@ -0,0 +1,79 @@
+import re
+
+
+def markdown_to_unicode(md_text):
+    # Yes you could be clever and generate these by checking the codepoint offsets, like ('𝑎' - 'a')
+    # But you would soon learn that 'ℎ' is special and there are 5 different offsets, and it's simpler
+    # just to list them out in a map.
+
+    # fmt: off
+    bold_map = {
+        "0": "𝟬", "1": "𝟭", "2": "𝟮", "3": "𝟯", "4": "𝟰", "5": "𝟱", "6": "𝟲", "7": "𝟳", "8": "𝟴", "9": "𝟵",
+        "a": "𝗮", "b": "𝗯", "c": "𝗰", "d": "𝗱", "e": "𝗲", "f": "𝗳", "g": "𝗴", "h": "𝗵", "i": "𝗶", "j": "𝗷", "k": "𝗸", "l": "𝗹", "m": "𝗺", "n": "𝗻", "o": "𝗼", "p": "𝗽", "q": "𝗾", "r": "𝗿", "s": "𝘀", "t": "𝘁", "u": "𝘂", "v": "𝘃", "w": "𝘄", "x": "𝘅", "y": "𝘆", "z": "𝘇",
+        "A": "𝗔", "B": "𝗕", "C": "𝗖", "D": "𝗗", "E": "𝗘", "F": "𝗙", "G": "𝗚", "H": "𝗛", "I": "𝗜", "J": "𝗝", "K": "𝗞", "L": "𝗟", "M": "𝗠", "N": "𝗡", "O": "𝗢", "P": "𝗣", "Q": "𝗤", "R": "𝗥", "S": "𝗦", "T": "𝗧", "U": "𝗨", "V": "𝗩", "W": "𝗪", "X": "𝗫", "Y": "𝗬", "Z": "𝗭",
+    }
+
+    italic_map = {
+        "a": "𝑎", "b": "𝑏", "c": "𝑐", "d": "𝑑", "e": "𝑒", "f": "𝑓", "g": "𝑔", "h": "ℎ", "i": "𝑖", "j": "𝑗", "k": "𝑘", "l": "𝑙", "m": "𝑚", "n": "𝑛", "o": "𝑜", "p": "𝑝", "q": "𝑞", "r": "𝑟", "s": "𝑠", "t": "𝑡", "u": "𝑢", "v": "𝑣", "w": "𝑤", "x": "𝑥", "y": "𝑦", "z": "𝑧",
+        "A": "𝐴", "B": "𝐵", "C": "𝐶", "D": "𝐷", "E": "𝐸", "F": "𝐹", "G": "𝐺", "H": "𝐻", "I": "𝐼", "J": "𝐽", "K": "𝐾", "L": "𝐿", "M": "𝑀", "N": "𝑁", "O": "𝑂", "P": "𝑃", "Q": "𝑄", "R": "𝑅", "S": "𝑆", "T": "𝑇", "U": "𝑈", "V": "𝑉", "W": "𝑊", "X": "𝑋", "Y": "𝑌", "Z": "𝑍",
+    }
+
+    # fmt: on
+
+    def replace(text, char_map):
+        return "".join(char_map.get(c, c) for c in text)
+
+    def handle_styles(text):
+        text = re.sub(
+            r"\*\*(.*?)\*\*",
+            lambda m: replace(m.group(1), bold_map),
+            text,
+        )
+        text = re.sub(
+            r"\*(.*?)\*",
+            lambda m: replace(m.group(1), italic_map),
+            text,
+        )
+        return text
+
+    lines = md_text.split("\n")
+    in_code = False
+    for i, line in enumerate(lines):
+        if line.startswith("```"):
+            in_code = not in_code
+
+        if in_code:
+            continue
+
+        if line.startswith(("- ", "* ")):
+            line = f"• {line[2:]}"
+
+        if re.match("#{1,6} ", line):
+            # Treat all headings the same, but keep the '#'
+            line = replace(line, bold_map)
+        else:
+            line = handle_styles(line)
+
+        lines[i] = line
+
+    return "\n".join(lines)
+
+
+if __name__ == "__main__":
+    # Example usage
+    markdown_text = """
+Normal, **bold** and *italic* text.
+
+# Heading 1
+- Bullet point 1
+- Bullet point 2 with **bold** and *italic* and **more bold**
+* Star-based bullet *with italics* and **bold text** 
+```py
+# Comment in a code block
+x = "A string with *italics* markers that should be ignored"
+```
+## Now a level 2 heading
+And some more text
+    """
+
+    print(markdown_to_unicode(markdown_text))
No results found