Skip to content

Instantly share code, notes, and snippets.

@davidgilbertson
Last active July 15, 2024 23:26
Show Gist options
  • Select an option

  • Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.

Select an option

Save davidgilbertson/9a4ec4caf3a35712819d15d116592aa5 to your computer and use it in GitHub Desktop.

Revisions

  1. davidgilbertson revised this gist Jul 15, 2024. 1 changed file with 20 additions and 4 deletions.
    24 changes: 20 additions & 4 deletions markdown_to_unicode.py
    Original file line number Diff line number Diff line change
    @@ -1,8 +1,24 @@
    # Licensed under the MIT-0 License. See https://opensource.org/licenses/MIT-0 for details.
    import re


    def markdown_to_unicode(md_text):
    # Yes, you could be clever and generate these by checking the codepoint offsets, like ('𝑎' - 'a')
    """
    Converts Markdown text to Unicode by transforming Markdown syntax for
    bold and italic text into their corresponding Unicode characters,
    handling lists and headings, and preserving the formatting within code blocks.
    Parameters
    ----------
    md_text : str
    The input Markdown text to be converted.
    Returns
    -------
    str
    The converted text with Unicode characters.
    """
    # Yes, you could be clever and generate the maps below by checking the codepoint offsets, like ('𝑎' - 'a'),
    # but you would soon learn that 'ℎ' is special and there are 5 different offsets, so it's simpler
    # just to list them out in a map.

    @@ -17,7 +33,6 @@ def markdown_to_unicode(md_text):
    "a": "๐‘Ž", "b": "๐‘", "c": "๐‘", "d": "๐‘‘", "e": "๐‘’", "f": "๐‘“", "g": "๐‘”", "h": "โ„Ž", "i": "๐‘–", "j": "๐‘—", "k": "๐‘˜", "l": "๐‘™", "m": "๐‘š", "n": "๐‘›", "o": "๐‘œ", "p": "๐‘", "q": "๐‘ž", "r": "๐‘Ÿ", "s": "๐‘ ", "t": "๐‘ก", "u": "๐‘ข", "v": "๐‘ฃ", "w": "๐‘ค", "x": "๐‘ฅ", "y": "๐‘ฆ", "z": "๐‘ง",
    "A": "๐ด", "B": "๐ต", "C": "๐ถ", "D": "๐ท", "E": "๐ธ", "F": "๐น", "G": "๐บ", "H": "๐ป", "I": "๐ผ", "J": "๐ฝ", "K": "๐พ", "L": "๐ฟ", "M": "๐‘€", "N": "๐‘", "O": "๐‘‚", "P": "๐‘ƒ", "Q": "๐‘„", "R": "๐‘…", "S": "๐‘†", "T": "๐‘‡", "U": "๐‘ˆ", "V": "๐‘‰", "W": "๐‘Š", "X": "๐‘‹", "Y": "๐‘Œ", "Z": "๐‘",
    }

    # fmt: on

    def replace(text, char_map):
    @@ -60,18 +75,19 @@ def handle_styles(text):


    if __name__ == "__main__":
    # Example usage
    markdown_text = """
    Normal, **bold** and *italic* text.
    # Heading 1
    - Bullet point 1
    - Bullet point 2 with **bold** and *italic* and **more bold**
    - Bullet point 2 with **bold** and *italics* and **more bold**
    * Star-based bullet *with italics* and **bold text**
    ```py
    # Comment in a code block
    x = "A string with *italics* markers that should be ignored"
    ```
    ## Now a level 2 heading
    And some more text
    """
  2. davidgilbertson created this gist Jul 11, 2024.
    79 changes: 79 additions & 0 deletions markdown_to_unicode.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,79 @@
    import re


    def _styled_alphabet(lower_start, upper_start, digit_start=None, overrides=None):
        """Build a ``str.translate`` table from ASCII letters/digits to styled code points.

        ``lower_start`` / ``upper_start`` are the code points of the styled "a" / "A";
        ``digit_start`` (optional) is the code point of the styled "0".  ``overrides``
        maps individual ASCII characters to a replacement that does not follow the
        contiguous range.
        """
        table = {}
        for offset in range(26):
            table[ord("a") + offset] = chr(lower_start + offset)
            table[ord("A") + offset] = chr(upper_start + offset)
        if digit_start is not None:
            for offset in range(10):
                table[ord("0") + offset] = chr(digit_start + offset)
        if overrides:
            table.update({ord(char): styled for char, styled in overrides.items()})
        return table


    # The tables are derived from code points (rather than pasted as literal
    # characters) so this source stays pure ASCII and cannot be corrupted by a
    # wrong-encoding round trip.
    #
    # Mathematical Sans-Serif Bold: a-z from U+1D5EE, A-Z from U+1D5D4, 0-9 from U+1D7EC.
    _BOLD_TABLE = _styled_alphabet(0x1D5EE, 0x1D5D4, digit_start=0x1D7EC)

    # Mathematical Italic: a-z from U+1D44E, A-Z from U+1D434; there are no italic digits.
    # The slot where italic "h" would fall (U+1D455) is unassigned, so the italic "h"
    # is U+210E PLANCK CONSTANT instead.
    _ITALIC_TABLE = _styled_alphabet(0x1D44E, 0x1D434, overrides={"h": "\u210e"})

    _BOLD_RE = re.compile(r"\*\*(.*?)\*\*")
    _ITALIC_RE = re.compile(r"\*(.*?)\*")
    _HEADING_RE = re.compile(r"#{1,6} ")
    _FENCE = "```"
    _BULLET = "\u2022"


    def _apply_inline_styles(text):
        """Replace ``**bold**`` and ``*italic*`` spans with their styled characters."""
        # Bold must run first: otherwise the italic pattern would consume the
        # doubled asterisks one pair at a time.
        text = _BOLD_RE.sub(lambda m: m.group(1).translate(_BOLD_TABLE), text)
        return _ITALIC_RE.sub(lambda m: m.group(1).translate(_ITALIC_TABLE), text)


    def markdown_to_unicode(md_text: str) -> str:
        """
        Convert a subset of Markdown to plain text styled with Unicode characters.

        Processing is line based:

        * Lines inside a fenced code block, and the fence lines themselves, are
          left untouched.
        * A leading ``- `` or ``* `` list marker becomes a bullet character.
        * A heading line (1-6 ``#`` followed by a space) is rendered entirely in
          bold; the ``#`` characters are kept.
        * On every other line, ``**text**`` becomes bold and ``*text*`` becomes
          italic, with the asterisks removed.

        Only ASCII letters (and, for bold, digits) have styled equivalents; any
        other character passes through unchanged.

        Parameters
        ----------
        md_text : str
            The input Markdown text to be converted.

        Returns
        -------
        str
            The converted text with Unicode characters.
        """
        converted = []
        in_code = False
        for line in md_text.split("\n"):
            if line.startswith(_FENCE):
                # Opening and closing fences are both emitted verbatim.
                in_code = not in_code
                converted.append(line)
                continue

            if in_code:
                converted.append(line)
                continue

            if line.startswith(("- ", "* ")):
                line = f"{_BULLET} {line[2:]}"

            if _HEADING_RE.match(line):
                # Treat all headings the same, but keep the '#'
                line = line.translate(_BOLD_TABLE)
            else:
                line = _apply_inline_styles(line)

            converted.append(line)

        return "\n".join(converted)


    if __name__ == "__main__":
        # Example usage.
        # The sample is assembled from explicit lines (instead of one indented
        # triple-quoted literal) so that headings, bullets and code fences are
        # guaranteed to start at column 0, which is what the converter checks for.
        markdown_text = "\n".join(
            [
                "",
                "Normal, **bold** and *italic* text.",
                "# Heading 1",
                "- Bullet point 1",
                "- Bullet point 2 with **bold** and *italic* and **more bold**",
                "* Star-based bullet *with italics* and **bold text**",
                "```py",
                "# Comment in a code block",
                'x = "A string with *italics* markers that should be ignored"',
                "```",
                "## Now a level 2 heading",
                "And some more text",
                "",
            ]
        )

        print(markdown_to_unicode(markdown_text))