Skip to content

Instantly share code, notes, and snippets.

@jroweboy
Last active April 5, 2026 20:15
Show Gist options
  • Select an option

  • Save jroweboy/b688d673aac602d4b17a3cccffe134f2 to your computer and use it in GitHub Desktop.

Select an option

Save jroweboy/b688d673aac602d4b17a3cccffe134f2 to your computer and use it in GitHub Desktop.
CA65 Macro for UTF8 Character Mapping
;/*****************************************************************************/
;/* */
;/* utfmap.inc */
;/* */
;/* Macros emulating charmap for UTF-8 characters */
;/* */
;/* */
;/* */
;/* (C) 2026 jroweboy */
;/* EMail: jroweboy@gmail.com */
;/* */
;/* */
;/* This software is provided 'as-is', without any expressed or implied */
;/* warranty. In no event will the authors be held liable for any damages */
;/* arising from the use of this software. */
;/* */
;/* Permission is granted to anyone to use this software for any purpose, */
;/* including commercial applications, and to alter it and redistribute it */
;/* freely, subject to the following restrictions: */
;/* */
;/* 1. The origin of this software must not be misrepresented; you must not */
;/* claim that you wrote the original software. If you use this software */
;/* in a product, an acknowledgment in the product documentation would be */
;/* appreciated but is not required. */
;/* 2. Altered source versions must be plainly marked as such, and must not */
;/* be misrepresented as being the original software. */
;/* 3. This notice may not be removed or altered from any source */
;/* distribution. */
;/* */
;/*****************************************************************************/
.ifndef __UTFMAP_H
__UTFMAP_H = 1
;; --
; OPTIONS
; Uncomment or set one of the following before including this header
;
; __UTFMAP_UNMAPPED_DATA - How to handle unmapped characters
; 0 = (Default) Output the bytes for any characters that are not mapped
; 1 = Skip the character and log a warning in the console output for the skipped character
; 2 = Generate an assembly error when an unknown character is encountered
; __UTFMAP_UNMAPPED_DATA = 0
; __UTFMAP_UNMAPPED_DATA = 1
; __UTFMAP_UNMAPPED_DATA = 2
; Fall back to mode 0 (emit the original bytes) unless the including file
; already chose a mode before pulling in this header.
.if .not .defined(__UTFMAP_UNMAPPED_DATA)
    __UTFMAP_UNMAPPED_DATA = 0
.endif
; --
; UTF8 version of .charmap
; Maps a single valid utf8 character Str to the value Chr that utfstr emits for it.
;   Str - string holding exactly one UTF-8 encoded character
;   Chr - replacement value (a single 8bit number)
; The mapping is kept in an assembler variable named __UTF8_MAPPING_<hex>, where
; <hex> is the character's encoded bytes packed into one integer. Because it is
; assigned with .set, mapping the same character again replaces the old value.
.macro utfmap Str, Chr
  ; Braces keep Str a single macro argument even when it is a string expression
  ; that contains commas.
  __next_utf8_character {Str}
  .if __VALIDATED_LENGTH = 0
    ; Not valid UTF-8. __next_utf8_character has already reported the error,
    ; so do not define a mapping for the bogus value 0.
  .elseif __VALIDATED_LENGTH <> .strlen(Str)
    ; More than one character was passed.
    ; Erroring out here matches the behavior of ca65 .charmap
    .error "Illegal utf8 character constant"
  .else
    .ident(.sprintf("__UTF8_MAPPING_%08X", __VALIDATED_CHARACTER)) .set Chr
  .endif
.endmacro
; --
; Generates a byte string using the previously defined mapping from a utf8 string.
; The mapping must be included in the same compilation unit BEFORE calling this function.
;   Str - UTF-8 string to convert
; Characters that have no mapping are handled according to __UTFMAP_UNMAPPED_DATA:
; 0 emits the character's original bytes, 1 skips it and prints a message,
; any other value raises an assembly error.
.macro utfstr Str
  .local Next
  ; Next = byte offset at which the next UTF-8 character starts
  Next .set 0
  ; Walk every byte of the input string
  .repeat .strlen(Str), I
    ; Only act on the first byte of a character; the bytes that were consumed
    ; as part of the previous character are skipped.
    .if I >= Next
      ; Read the character starting at byte I. Braces keep Str a single macro
      ; argument even when it is a string expression that contains commas.
      __next_utf8_character {Str}, I
      .if __VALIDATED_LENGTH = 0
        ; Invalid sequence; the error has already been reported. Emit nothing
        ; and step forward a single byte so that Next keeps up with I and the
        ; characters that follow are still parsed at the correct offsets.
        Next .set I + 1
      .else
        .ifdef .ident(.sprintf("__UTF8_MAPPING_%08X", __VALIDATED_CHARACTER))
          ; Found a match! Output the mapping byte
          __output_bytes_be {.ident(.sprintf("__UTF8_MAPPING_%08X", __VALIDATED_CHARACTER))}
        .else
          ; No mapping, so either output the original utf8 character or skip/error according to the user option
          .if __UTFMAP_UNMAPPED_DATA = 0
            __output_bytes_be {__VALIDATED_CHARACTER}
          .elseif __UTFMAP_UNMAPPED_DATA = 1
            .out .sprintf("Skipped unmapped UTF8 character: 0x%X in string %s at position %d", __VALIDATED_CHARACTER, Str, I)
          .else
            .error .sprintf("Unmapped UTF8 character: 0x%X in string %s at position %d", __VALIDATED_CHARACTER, Str, I)
          .endif
        .endif
        Next .set I + __VALIDATED_LENGTH
      .endif
    .endif
  .endrepeat
.endmacro
; --
; Similar to .asciiz, adds a null terminator to the end of the string
;   Str - UTF-8 string to convert; it is handed to utfstr unchanged
.macro utfstrz Str
  ; Braces keep Str a single macro argument even when it is a string expression
  ; that contains commas.
  utfstr {Str}
  .byte $00
.endmacro
; --
; Helper macro for outputting a 32 bit int in big endian order while skipping
; empty values
;   Byt - constant expression in the range 0..$FFFFFFFF
; Leading zero bytes are not emitted. The lowest byte is always emitted, so a
; value of 0 produces a single $00.
.macro __output_bytes_be Byt
  ; Each test asks "is any bit at or above this byte set?" using shift + mask
  ; instead of a magnitude comparison. A comparison such as Byt >= (1 << 24)
  ; needs more than 31 bits of positive range, which is not guaranteed if
  ; expressions are evaluated as signed 32 bit values (a top byte >= $80 would
  ; then look negative). Byt is parenthesised so that an operator expression
  ; passed as the argument is shifted as a whole.
  .if (((Byt) >> 24) & $FF) <> 0
    .byte (((Byt) >> 24) & $FF)
  .endif
  .if (((Byt) >> 16) & $FFFF) <> 0
    .byte (((Byt) >> 16) & $FF)
  .endif
  .if (((Byt) >> 8) & $FFFFFF) <> 0
    .byte (((Byt) >> 8) & $FF)
  .endif
  .byte ((Byt) & $FF)
.endmacro
; --
; Helper macro for reading a single UTF-8 character from a string
;   Str - string to read from
;   I   - (optional) byte offset of the character's first byte; 0 when omitted
; Stores a result in the globals __VALIDATED_LENGTH and __VALIDATED_CHARACTER:
;   __VALIDATED_LENGTH    - number of bytes the character occupies (1-4),
;                           or 0 if no well formed sequence starts at the offset
;   __VALIDATED_CHARACTER - the character's encoded bytes packed into one integer,
;                           first byte in the most significant position (this is
;                           the raw encoding, not the decoded code point),
;                           or 0 on failure
; Raises an assembly error on failure. Only the bit patterns of the lead byte and
; of the continuation bytes are checked.
.macro __next_utf8_character Str, I
  .local Offset, Need
  .ifnblank I
    Offset = I
  .else
    Offset = 0
  .endif
  __VALIDATED_CHARACTER .set 0
  __VALIDATED_LENGTH .set 0

  ; Need = sequence length announced by the lead byte. It stays 0 when the
  ; offset is past the end of the string or the byte is not a valid lead byte.
  Need .set 0
  .if Offset < .strlen(Str)
    .if (.strat(Str, Offset) & %10000000) = 0
      Need .set 1
    .elseif (.strat(Str, Offset) & %11100000) = %11000000
      Need .set 2
    .elseif (.strat(Str, Offset) & %11110000) = %11100000
      Need .set 3
    .elseif (.strat(Str, Offset) & %11111000) = %11110000
      Need .set 4
    .endif
  .endif

  ; Only look at the continuation bytes when all of them lie inside the string,
  ; so a truncated sequence is reported by the error below and not by a
  ; .strat range error.
  .if (Need > 0) .and (Offset + Need <= .strlen(Str))
    .if Need = 1
      ; 1 byte character
      __VALIDATED_LENGTH .set 1
      __VALIDATED_CHARACTER .set .strat(Str, Offset+0)
    .elseif Need = 2
      .if (.strat(Str, Offset+1) & %11000000) = %10000000
        ; 2 byte character
        __VALIDATED_LENGTH .set 2
        __VALIDATED_CHARACTER .set (.strat(Str, Offset+0) << 8) | (.strat(Str, Offset+1))
      .endif
    .elseif Need = 3
      .if (.strat(Str, Offset+1) & %11000000) = %10000000 .and (.strat(Str, Offset+2) & %11000000) = %10000000
        ; 3 byte character
        __VALIDATED_LENGTH .set 3
        __VALIDATED_CHARACTER .set (.strat(Str, Offset+0) << 16) | (.strat(Str, Offset+1) << 8) | (.strat(Str, Offset+2))
      .endif
    .else
      .if (.strat(Str, Offset+1) & %11000000) = %10000000 .and (.strat(Str, Offset+2) & %11000000) = %10000000 .and (.strat(Str, Offset+3) & %11000000) = %10000000
        ; 4 byte character
        __VALIDATED_LENGTH .set 4
        __VALIDATED_CHARACTER .set (.strat(Str, Offset+0) << 24) | (.strat(Str, Offset+1) << 16) | (.strat(Str, Offset+2) << 8) | (.strat(Str, Offset+3))
      .endif
    .endif
  .endif

  .if __VALIDATED_LENGTH = 0
    .error .sprintf("Unable to parse UTF-8 character in string %s at position %d", Str, Offset)
  .endif
.endmacro
.endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment