Last active
April 5, 2026 20:15
-
-
Save jroweboy/b688d673aac602d4b17a3cccffe134f2 to your computer and use it in GitHub Desktop.
CA65 Macro for UTF8 Character Mapping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ;/*****************************************************************************/ | |
| ;/* */ | |
| ;/* utfmap.inc */ | |
| ;/* */ | |
| ;/* Macros emulating charmap for UTF-8 characters */ | |
| ;/* */ | |
| ;/* */ | |
| ;/* */ | |
| ;/* (C) 2026 jroweboy */ | |
| ;/* EMail: jroweboy@gmail.com */ | |
| ;/* */ | |
| ;/* */ | |
| ;/* This software is provided 'as-is', without any expressed or implied */ | |
| ;/* warranty. In no event will the authors be held liable for any damages */ | |
| ;/* arising from the use of this software. */ | |
| ;/* */ | |
| ;/* Permission is granted to anyone to use this software for any purpose, */ | |
| ;/* including commercial applications, and to alter it and redistribute it */ | |
| ;/* freely, subject to the following restrictions: */ | |
| ;/* */ | |
| ;/* 1. The origin of this software must not be misrepresented; you must not */ | |
| ;/* claim that you wrote the original software. If you use this software */ | |
| ;/* in a product, an acknowledgment in the product documentation would be */ | |
| ;/* appreciated but is not required. */ | |
| ;/* 2. Altered source versions must be plainly marked as such, and must not */ | |
| ;/* be misrepresented as being the original software. */ | |
| ;/* 3. This notice may not be removed or altered from any source */ | |
| ;/* distribution. */ | |
| ;/* */ | |
| ;/*****************************************************************************/ | |
| .ifndef __UTFMAP_H | |
| __UTFMAP_H = 1 | |
;; --
; OPTIONS
; Define the symbol below BEFORE including this header to override the default.
;
; __UTFMAP_UNMAPPED_DATA - what utfstr does with a character that has no mapping
;   0 = (Default) Output the character's own UTF-8 bytes unchanged
;   1 = Skip the character and print a message to the console output
;   2 = Generate an assembly error (any value other than 0 or 1 behaves this way)
;
; Example:
;   __UTFMAP_UNMAPPED_DATA = 1
;   .include "utfmap.inc"
.if .not .defined(__UTFMAP_UNMAPPED_DATA)
    ; Nothing was chosen by the includer: fall back to pass-through mode
    __UTFMAP_UNMAPPED_DATA = 0
.endif
; --
; UTF8 version of .charmap
; Maps the single UTF-8 character contained in Str to the value Chr.
;
; Parameters:
;   Str - string literal holding exactly one UTF-8 character
;   Chr - value that utfstr emits in place of that character. Values above
;         $FF are emitted by utfstr as several bytes, most significant first.
;
; The mapping is kept in a symbol named __UTF8_MAPPING_xxxxxxxx, where xxxxxxxx
; is the character's packed UTF-8 byte sequence in hex. The symbol is assigned
; with .set, so mapping the same character again replaces the earlier value.
;
; Side effects: overwrites __VALIDATED_LENGTH and __VALIDATED_CHARACTER.
.macro utfmap Str, Chr
    __next_utf8_character Str
    .if __VALIDATED_LENGTH = .strlen(Str)
        ; Str is exactly one well-formed character: register the mapping
        .ident(.sprintf("__UTF8_MAPPING_%08X", __VALIDATED_CHARACTER)) .set Chr
    .else
        ; Str is empty, malformed, or holds more than one character.
        ; Erroring out here matches the behavior of ca65 .charmap.
        ; No mapping symbol is created in this case.
        .error "Illegal utf8 character constant"
    .endif
.endmacro
; --
; Generates a byte string from a UTF-8 string using the mappings previously
; defined with utfmap.
; The mappings must be defined in the same compilation unit BEFORE this macro
; is invoked.
;
; Parameters:
;   Str - string literal to convert
;
; Each character with a mapping is replaced by its mapped value. Characters
; without a mapping are handled according to __UTFMAP_UNMAPPED_DATA.
; A malformed sequence is reported by __next_utf8_character and produces no
; output of its own.
;
; Side effects: overwrites __VALIDATED_LENGTH and __VALIDATED_CHARACTER.
.macro utfstr Str
    .local Next
    ; Byte offset at which the next UTF-8 character begins
    Next .set 0
    ; Walk every byte offset of the input string
    .repeat .strlen(Str), I
        ; Offsets below Next belong to a character that was already consumed
        .if I >= Next
            ; Decode the character that starts at this offset
            __next_utf8_character Str, I
            ; A length of 0 means the decode failed (an error was already
            ; raised), so there is nothing meaningful to look up or emit
            .if __VALIDATED_LENGTH > 0
                .ifdef .ident(.sprintf("__UTF8_MAPPING_%08X", __VALIDATED_CHARACTER))
                    ; Found a match! Output the mapped value
                    __output_bytes_be {.ident(.sprintf("__UTF8_MAPPING_%08X", __VALIDATED_CHARACTER))}
                .else
                    ; No mapping: pass through, skip, or fail per the user option
                    .if __UTFMAP_UNMAPPED_DATA = 0
                        __output_bytes_be {__VALIDATED_CHARACTER}
                    .elseif __UTFMAP_UNMAPPED_DATA = 1
                        .out .sprintf("Skipped unmapped UTF8 character: 0x%X in string %s at position %d", __VALIDATED_CHARACTER, Str, I)
                    .else
                        .error .sprintf("Unmapped UTF8 character: 0x%X in string %s at position %d", __VALIDATED_CHARACTER, Str, I)
                    .endif
                .endif
            .endif
            ; Resume after this character. Anchoring on I (rather than the
            ; previous Next) keeps the offset correct even after a failed
            ; decode, where the length is 0 and the next byte is retried.
            Next .set I + __VALIDATED_LENGTH
        .endif
    .endrepeat
.endmacro
; --
; Zero-terminated variant of utfstr, analogous to .asciiz:
; emits the converted string followed by a single null byte.
;
; Parameters:
;   Str - string literal to convert
.macro utfstrz Str
    utfstr Str
    ; String terminator
    .byte 0
.endmacro
; --
; Helper macro that emits a 32 bit value in big endian order, omitting leading
; zero bytes. The lowest byte is always emitted, so the value 0 yields one $00.
;
; Parameters:
;   Byt - constant expression to emit (wrap it in {} when it contains commas)
;
; The "is there anything at or above this byte" tests use a shift and a
; compare against zero instead of `Byt >= (1 << N)`. A packed 4 byte UTF-8
; sequence always has bit 31 set; where the assembler evaluates expressions as
; signed 32 bit integers such a value is negative and a >= comparison would
; drop its upper three bytes.
; NOTE(review): as a consequence, negative values are emitted as 4 bytes.
; Byt is parenthesized on every use so that an argument containing operators
; is shifted as a whole.
.macro __output_bytes_be Byt
    .if ((Byt) >> 24) <> 0
        .byte (((Byt) >> 24) & $ff)
    .endif
    .if ((Byt) >> 16) <> 0
        .byte (((Byt) >> 16) & $ff)
    .endif
    .if ((Byt) >> 8) <> 0
        .byte (((Byt) >> 8) & $ff)
    .endif
    .byte ((Byt) & $ff)
.endmacro
; --
; Helper macro that reads one UTF-8 character from Str, starting at byte
; offset I (offset 0 when I is omitted).
;
; Parameters:
;   Str - string literal to read from
;   I   - optional byte offset of the character's first byte
;
; Results are stored in two global .set symbols:
;   __VALIDATED_LENGTH    - number of bytes the character occupies (1-4),
;                           or 0 when no well-formed sequence was found
;   __VALIDATED_CHARACTER - the character's raw UTF-8 bytes packed big endian
;                           into one integer (lead byte most significant).
;                           This is the encoded form, not the decoded Unicode
;                           code point.
;
; Raises an assembly error when the bytes at the offset are not a well-formed
; sequence, including a sequence cut off by the end of the string.
; Only the lead byte pattern and the %10xxxxxx pattern of the continuation
; bytes are checked; overlong encodings are not rejected.
.macro __next_utf8_character Str, I
    .local Offset, Avail, Lead
    .ifnblank I
        Offset = I
    .else
        Offset = 0
    .endif
    ; Bytes available from Offset to the end of the string. Every .strat below
    ; is guarded by it using nested .if blocks: the operands of .and are all
    ; evaluated, and an out-of-range .strat index would be reported by the
    ; assembler as an additional error of its own.
    Avail = .strlen(Str) - Offset

    __VALIDATED_CHARACTER .set 0
    __VALIDATED_LENGTH .set 0

    .if Avail >= 1
        Lead = .strat(Str, Offset+0)
        .if (Lead & %10000000) = 0
            ; 1 byte character: 0xxxxxxx
            __VALIDATED_LENGTH .set 1
            __VALIDATED_CHARACTER .set Lead
        .elseif (Lead & %11100000) = %11000000
            ; 2 byte character: 110xxxxx 10xxxxxx
            .if Avail >= 2
                .if (.strat(Str, Offset+1) & %11000000) = %10000000
                    __VALIDATED_LENGTH .set 2
                    __VALIDATED_CHARACTER .set (Lead << 8) | (.strat(Str, Offset+1))
                .endif
            .endif
        .elseif (Lead & %11110000) = %11100000
            ; 3 byte character: 1110xxxx 10xxxxxx 10xxxxxx
            .if Avail >= 3
                .if (.strat(Str, Offset+1) & %11000000) = %10000000 .and (.strat(Str, Offset+2) & %11000000) = %10000000
                    __VALIDATED_LENGTH .set 3
                    __VALIDATED_CHARACTER .set (Lead << 16) | (.strat(Str, Offset+1) << 8) | (.strat(Str, Offset+2))
                .endif
            .endif
        .elseif (Lead & %11111000) = %11110000
            ; 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            .if Avail >= 4
                .if (.strat(Str, Offset+1) & %11000000) = %10000000 .and (.strat(Str, Offset+2) & %11000000) = %10000000 .and (.strat(Str, Offset+3) & %11000000) = %10000000
                    __VALIDATED_LENGTH .set 4
                    __VALIDATED_CHARACTER .set (Lead << 24) | (.strat(Str, Offset+1) << 16) | (.strat(Str, Offset+2) << 8) | (.strat(Str, Offset+3))
                .endif
            .endif
        .endif
    .endif

    ; Length is still 0 if the lead byte was invalid, a continuation byte was
    ; malformed, or the string ended before the sequence was complete
    .if __VALIDATED_LENGTH = 0
        .error .sprintf("Unable to parse UTF-8 character in string %s at position %d", Str, Offset)
    .endif
.endmacro
| .endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment