davidgilbertson · July 15, 2024 23:26 · davidgilbertson · Jul 11, 2024 · MLCole · Jul 12, 2024
diff --git a/markdown_to_unicode.py b/markdown_to_unicode.py
 import re


 def markdown_to_unicode(md_text):
    """Render a small subset of Markdown as Unicode-styled plain text.

    The text is processed line by line:

    * ``**bold**`` spans become sans-serif bold letters and digits and
      ``*italic*`` spans become mathematical italic letters; the asterisk
      markers are dropped. Characters without a styled counterpart (for
      example digits inside italics) are left as they are.
    * Heading lines (one to six ``#`` followed by a space) are rendered
      entirely in bold; the leading ``#`` characters are kept.
    * A leading ``- `` or ``* `` is replaced by a ``•`` bullet.
    * Fenced code blocks (lines starting with three backticks) are copied
      through unchanged, fences included.

    Args:
        md_text: Markdown source as a single newline-separated string.

    Returns:
        The converted text, with exactly as many lines as the input.
    """
    # Yes you could be clever and generate these by checking the codepoint offsets, like ('𝑎' - 'a')
    # But you would soon learn that 'ℎ' is special and there's 5 different offsets and it's simpler
    # just to list them out in a map.

    # fmt: off
    bold_map = {
        "0": "𝟬", "1": "𝟭", "2": "𝟮", "3": "𝟯", "4": "𝟰", "5": "𝟱", "6": "𝟲", "7": "𝟳", "8": "𝟴", "9": "𝟵",
        "a": "𝗮", "b": "𝗯", "c": "𝗰", "d": "𝗱", "e": "𝗲", "f": "𝗳", "g": "𝗴", "h": "𝗵", "i": "𝗶", "j": "𝗷", "k": "𝗸", "l": "𝗹", "m": "𝗺", "n": "𝗻", "o": "𝗼", "p": "𝗽", "q": "𝗾", "r": "𝗿", "s": "𝘀", "t": "𝘁", "u": "𝘂", "v": "𝘃", "w": "𝘄", "x": "𝘅", "y": "𝘆", "z": "𝘇",
        "A": "𝗔", "B": "𝗕", "C": "𝗖", "D": "𝗗", "E": "𝗘", "F": "𝗙", "G": "𝗚", "H": "𝗛", "I": "𝗜", "J": "𝗝", "K": "𝗞", "L": "𝗟", "M": "𝗠", "N": "𝗡", "O": "𝗢", "P": "𝗣", "Q": "𝗤", "R": "𝗥", "S": "𝗦", "T": "𝗧", "U": "𝗨", "V": "𝗩", "W": "𝗪", "X": "𝗫", "Y": "𝗬", "Z": "𝗭",
    }

    italic_map = {
        "a": "𝑎", "b": "𝑏", "c": "𝑐", "d": "𝑑", "e": "𝑒", "f": "𝑓", "g": "𝑔", "h": "ℎ", "i": "𝑖", "j": "𝑗", "k": "𝑘", "l": "𝑙", "m": "𝑚", "n": "𝑛", "o": "𝑜", "p": "𝑝", "q": "𝑞", "r": "𝑟", "s": "𝑠", "t": "𝑡", "u": "𝑢", "v": "𝑣", "w": "𝑤", "x": "𝑥", "y": "𝑦", "z": "𝑧",
        "A": "𝐴", "B": "𝐵", "C": "𝐶", "D": "𝐷", "E": "𝐸", "F": "𝐹", "G": "𝐺", "H": "𝐻", "I": "𝐼", "J": "𝐽", "K": "𝐾", "L": "𝐿", "M": "𝑀", "N": "𝑁", "O": "𝑂", "P": "𝑃", "Q": "𝑄", "R": "𝑅", "S": "𝑆", "T": "𝑇", "U": "𝑈", "V": "𝑉", "W": "𝑊", "X": "𝑋", "Y": "𝑌", "Z": "𝑍",
    }
    # fmt: on

    # Translation tables let str.translate do the per-character substitution.
    bold_table = str.maketrans(bold_map)
    italic_table = str.maketrans(italic_map)

    # An opening marker must be followed by, and a closing marker preceded by,
    # a character that is neither whitespace nor '*' (a simplified form of
    # CommonMark's flanking rule). This keeps free-standing asterisks such as
    # "2 * 3 * 4" or a "***" rule from being swallowed as empty styling.
    bold_re = re.compile(r"\*\*(?=[^\s*])(.+?)(?<=[^\s*])\*\*")
    italic_re = re.compile(r"\*(?=[^\s*])(.+?)(?<=[^\s*])\*")
    heading_re = re.compile(r"#{1,6} ")

    def handle_styles(text):
        # Bold first, so that '**' is not consumed as two italic markers.
        text = bold_re.sub(lambda m: m.group(1).translate(bold_table), text)
        return italic_re.sub(lambda m: m.group(1).translate(italic_table), text)

    lines = md_text.split("\n")
    in_code = False
    for i, line in enumerate(lines):
        if line.startswith("```"):
            # A fence line opens or closes a code block and is kept verbatim.
            in_code = not in_code
            continue

        if in_code:
            continue

        if line.startswith(("- ", "* ")):
            line = f"• {line[2:]}"

        if heading_re.match(line):
            # Treat all headings the same, but keep the '#'
            line = line.translate(bold_table)
        else:
            line = handle_styles(line)

        lines[i] = line

    return "\n".join(lines)


 if __name__ == "__main__":
    import textwrap

    # Example usage. The sample is indented to line up with the code and then
    # dedented, because headings, bullets and code fences are only recognised
    # at the very start of a line.
    markdown_text = textwrap.dedent(
        """
        Normal, **bold** and *italic* text.

        # Heading 1
        - Bullet point 1
        - Bullet point 2 with **bold** and *italic* and **more bold**
        * Star-based bullet *with italics* and **bold text** 
        ```py
        # Comment in a code block
        x = "A string with *italics* markers that should be ignored"
        ```
        ## Now a level 2 heading
        And some more text
        """
    )

    print(markdown_to_unicode(markdown_text))
	import re


	def markdown_to_unicode(md_text):
	    """Render a small subset of Markdown as Unicode-styled plain text.

	    The text is processed line by line:

	    * ``**bold**`` spans become sans-serif bold letters and digits and
	      ``*italic*`` spans become mathematical italic letters; the asterisk
	      markers are dropped. Characters without a styled counterpart (for
	      example digits inside italics) are left as they are.
	    * Heading lines (one to six ``#`` followed by a space) are rendered
	      entirely in bold; the leading ``#`` characters are kept.
	    * A leading ``- `` or ``* `` is replaced by a ``•`` bullet.
	    * Fenced code blocks (lines starting with three backticks) are copied
	      through unchanged, fences included.

	    Args:
	        md_text: Markdown source as a single newline-separated string.

	    Returns:
	        The converted text, with exactly as many lines as the input.
	    """
	    # Yes you could be clever and generate these by checking the codepoint offsets, like ('𝑎' - 'a')
	    # But you would soon learn that 'ℎ' is special and there's 5 different offsets and it's simpler
	    # just to list them out in a map.

	    # fmt: off
	    bold_map = {
	        "0": "𝟬", "1": "𝟭", "2": "𝟮", "3": "𝟯", "4": "𝟰", "5": "𝟱", "6": "𝟲", "7": "𝟳", "8": "𝟴", "9": "𝟵",
	        "a": "𝗮", "b": "𝗯", "c": "𝗰", "d": "𝗱", "e": "𝗲", "f": "𝗳", "g": "𝗴", "h": "𝗵", "i": "𝗶", "j": "𝗷", "k": "𝗸", "l": "𝗹", "m": "𝗺", "n": "𝗻", "o": "𝗼", "p": "𝗽", "q": "𝗾", "r": "𝗿", "s": "𝘀", "t": "𝘁", "u": "𝘂", "v": "𝘃", "w": "𝘄", "x": "𝘅", "y": "𝘆", "z": "𝘇",
	        "A": "𝗔", "B": "𝗕", "C": "𝗖", "D": "𝗗", "E": "𝗘", "F": "𝗙", "G": "𝗚", "H": "𝗛", "I": "𝗜", "J": "𝗝", "K": "𝗞", "L": "𝗟", "M": "𝗠", "N": "𝗡", "O": "𝗢", "P": "𝗣", "Q": "𝗤", "R": "𝗥", "S": "𝗦", "T": "𝗧", "U": "𝗨", "V": "𝗩", "W": "𝗪", "X": "𝗫", "Y": "𝗬", "Z": "𝗭",
	    }

	    italic_map = {
	        "a": "𝑎", "b": "𝑏", "c": "𝑐", "d": "𝑑", "e": "𝑒", "f": "𝑓", "g": "𝑔", "h": "ℎ", "i": "𝑖", "j": "𝑗", "k": "𝑘", "l": "𝑙", "m": "𝑚", "n": "𝑛", "o": "𝑜", "p": "𝑝", "q": "𝑞", "r": "𝑟", "s": "𝑠", "t": "𝑡", "u": "𝑢", "v": "𝑣", "w": "𝑤", "x": "𝑥", "y": "𝑦", "z": "𝑧",
	        "A": "𝐴", "B": "𝐵", "C": "𝐶", "D": "𝐷", "E": "𝐸", "F": "𝐹", "G": "𝐺", "H": "𝐻", "I": "𝐼", "J": "𝐽", "K": "𝐾", "L": "𝐿", "M": "𝑀", "N": "𝑁", "O": "𝑂", "P": "𝑃", "Q": "𝑄", "R": "𝑅", "S": "𝑆", "T": "𝑇", "U": "𝑈", "V": "𝑉", "W": "𝑊", "X": "𝑋", "Y": "𝑌", "Z": "𝑍",
	    }
	    # fmt: on

	    # Translation tables let str.translate do the per-character substitution.
	    bold_table = str.maketrans(bold_map)
	    italic_table = str.maketrans(italic_map)

	    # An opening marker must be followed by, and a closing marker preceded by,
	    # a character that is neither whitespace nor '*' (a simplified form of
	    # CommonMark's flanking rule). This keeps free-standing asterisks such as
	    # "2 * 3 * 4" or a "***" rule from being swallowed as empty styling.
	    bold_re = re.compile(r"\*\*(?=[^\s*])(.+?)(?<=[^\s*])\*\*")
	    italic_re = re.compile(r"\*(?=[^\s*])(.+?)(?<=[^\s*])\*")
	    heading_re = re.compile(r"#{1,6} ")

	    def handle_styles(text):
	        # Bold first, so that '**' is not consumed as two italic markers.
	        text = bold_re.sub(lambda m: m.group(1).translate(bold_table), text)
	        return italic_re.sub(lambda m: m.group(1).translate(italic_table), text)

	    lines = md_text.split("\n")
	    in_code = False
	    for i, line in enumerate(lines):
	        if line.startswith("```"):
	            # A fence line opens or closes a code block and is kept verbatim.
	            in_code = not in_code
	            continue

	        if in_code:
	            continue

	        if line.startswith(("- ", "* ")):
	            line = f"• {line[2:]}"

	        if heading_re.match(line):
	            # Treat all headings the same, but keep the '#'
	            line = line.translate(bold_table)
	        else:
	            line = handle_styles(line)

	        lines[i] = line

	    return "\n".join(lines)


	if __name__ == "__main__":
	    import textwrap

	    # Example usage. The sample is indented to line up with the code and then
	    # dedented, because headings, bullets and code fences are only recognised
	    # at the very start of a line.
	    markdown_text = textwrap.dedent(
	        """
	        Normal, **bold** and *italic* text.

	        # Heading 1
	        - Bullet point 1
	        - Bullet point 2 with **bold** and *italic* and **more bold**
	        * Star-based bullet *with italics* and **bold text**
	        ```py
	        # Comment in a code block
	        x = "A string with *italics* markers that should be ignored"
	        ```
	        ## Now a level 2 heading
	        And some more text
	        """
	    )

	    print(markdown_to_unicode(markdown_text))
No results found