Created
November 19, 2015 23:09
-
-
Save stanislaw/f62c36823242c4ffea1b to your computer and use it in GitHub Desktop.
Revisions
-
conradkleinespel revised this gist
May 28, 2013. 1 changed file with 0 additions and 54 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters. Diff columns: Original file line number | Diff line number | Diff line change. @@ -1,54 +0,0 @@ -
conradkleinespel revised this gist
May 28, 2013. 2 changed files with 48 additions and 1 deletion. There are no files selected for viewing.
/*
 * [Gist revision page banner, kept verbatim]
 * This file contains hidden or bidirectional Unicode text that may be
 * interpreted or compiled differently than what appears below. To review,
 * open the file in an editor that reveals hidden Unicode characters.
 * Learn more about bidirectional Unicode characters
 * Original file line number | Diff line number | Diff line change
 *
 * Diff context for utf8_replace (the page only shows these fragments of it):
 *
 * @@ -142,7 +142,7 @@ char * utf8_replace(char * needle, char * replace, char * haystack) {
 *     int32_t diff = (int32_t) (len_replace - len_needle);
 *     char * new_string = calloc((len + diff + 1), sizeof(char));
 *     char * pos = strstr(haystack, needle);
 * @@ -162,5 +162,48 @@ char * utf8_replace(char * needle, char * replace, char * haystack) {
 *     // Copy the remainder of the initial string
 *     memcpy(new_string + num_shifts + len_replace, pos + len_needle, len - num_shifts - len_needle);
 *     return new_string;
 * }
 */

// Replace every non-overlapping occurrence of `needle` in `haystack` with
// `replace`, scanning the haystack once from left to right.
//
// All three arguments must be NUL-terminated strings. The result is a newly
// allocated NUL-terminated string that the caller must free(); NULL is
// returned when the allocation fails.
//
// Text produced by a replacement is never searched again, so the call always
// terminates, including when `replace` contains `needle` ("a" -> "aa").
// An empty `needle` matches nothing and yields a plain copy of `haystack`.
// Because matching works on whole byte sequences, multi-byte UTF-8 characters
// are never split as long as the arguments are themselves valid UTF-8.
char * utf8_replace_all(char * needle, char * replace, char * haystack) {
    const size_t len_needle = strlen(needle);
    const size_t len_replace = strlen(replace);
    const size_t len = strlen(haystack);

    // First pass: count the matches so the result can be allocated once.
    size_t num_matches = 0;
    if (len_needle > 0) {
        const char * hit = strstr(haystack, needle);
        while (hit != NULL) {
            num_matches++;
            hit = strstr(hit + len_needle, needle);
        }
    }

    const size_t new_len = len - num_matches * len_needle + num_matches * len_replace;
    char * new_string = malloc(new_len + 1);
    if (new_string == NULL) {
        return NULL;
    }

    // Second pass: copy the text between matches, substituting each match.
    char * out = new_string;
    const char * src = haystack;
    if (len_needle > 0) {
        const char * hit = strstr(src, needle);
        while (hit != NULL) {
            const size_t prefix = (size_t) (hit - src);
            memcpy(out, src, prefix);
            out += prefix;
            memcpy(out, replace, len_replace);
            out += len_replace;
            src = hit + len_needle;
            hit = strstr(src, needle);
        }
    }

    // Copy what is left after the last match, terminating NUL included.
    memcpy(out, src, strlen(src) + 1);

    return new_string;
}

// Escape the null bytes found in the first `num` bytes of `s`: each 0x0 byte
// becomes the two characters '\' and '0', every other byte is copied as is.
//
// `num` is the wanted length of the input, not including any terminating null
// byte. The result is a newly allocated NUL-terminated string that the caller
// must free(); NULL is returned when `num` is too large to size the buffer or
// when the allocation fails.
char * utf8_escape_null_bytes(const char * s, size_t num) {
    // Worst case every byte is a null byte and doubles in size; make sure
    // num * 2 + 1 cannot wrap around.
    if (num > ((size_t) -1 - 1) / 2) {
        return NULL;
    }

    // calloc zero-fills the buffer, so the result is always NUL-terminated.
    char * new_string = calloc(num * 2 + 1, sizeof(char));
    if (new_string == NULL) {
        return NULL;
    }

    size_t out = 0;
    for (size_t in = 0; in < num; in++) {
        if (s[in] == 0x0) {
            new_string[out++] = '\\';
            new_string[out++] = '0';
        } else {
            new_string[out++] = s[in];
        }
    }

    return new_string;
}

/*
 * [Gist revision page banner, kept verbatim]
 * This file contains hidden or bidirectional Unicode text that may be
 * interpreted or compiled differently than what appears below.
 */
To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -20,6 +20,10 @@ char * utf8_remove_trailing_newline(char * s); char * utf8_remove_char(char * s, size_t n); char * utf8_add_char(char * s, char * c, size_t n); char * utf8_replace(char * needle, char * replace, char * haystack); char * utf8_replace_all(char * needle, char * replace, char * haystack); size_t utf8_num_bytes(char * s); // Escape the null bytes in the given string that has the given length char * utf8_escape_null_bytes(const char * s, size_t num); #endif -
conradkleinespel revised this gist
May 28, 2013. 3 changed files with 3 additions and 3 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -48,8 +48,8 @@ int32_t utf8_is_continuation(char c) { return (c & 0xc0) == 0x80; } size_t utf8_strlen(char * s) { size_t i = 0, len = 0; while(s[i]) { if ( ! utf8_is_continuation(s[i])) ++len; ++i; This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -11,7 +11,7 @@ int32_t utf8_is_continuation(char c); int32_t utf8_validate(char * s); size_t utf8_strlen(char * s); int32_t utf8_is_single_byte(char * c); int32_t utf8_is_double_byte(char * c); int32_t utf8_is_triple_byte(char * c); File renamed without changes. -
conradkleinespel revised this gist
May 28, 2013. 1 changed file with 54 additions and 0 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,54 @@ #include <stdio.h> #include <stdlib.h> #include <assert.h> #include "utf8.h" int main() { assert(utf8_is_continuation(0x80)); assert( ! utf8_is_continuation(0xf0)); assert(utf8_validate("Hello world, je parle français, héhé !")); { char s[2] = { 0xf0, 0x0 }; assert( ! utf8_validate(s)); } assert(utf8_strlen("ça é") == 4); { char s[2] = { 0xf0, 0x0 }; assert(utf8_num_bytes(s) == 0); } assert(utf8_num_bytes("hello") == 1); assert(utf8_num_bytes("éello") == 2); { char s[4] = { 0xe0, 0x80, 0x80, 0x0 }; assert(utf8_num_bytes(s) == 3); } { char s[5] = { 0xf0, 0x80, 0x80, 0x80, 0x0 }; assert(utf8_num_bytes(s) == 4); } { char * s = utf8_remove_char("Helro", 2); assert(strcmp(s, "Hero") == 0); free(s); } { char * s = utf8_add_char("Jrémy", "é", 1); assert(strcmp(s, "Jérémy") == 0); free(s); } { char * s = utf8_replace("déjeuner", "dîner", "Conrad veut déjeuner !"); assert(strcmp(s, "Conrad veut dîner !") == 0); free(s); } return 0; } -
conradkleinespel created this gist
May 28, 2013. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,166 @@ // // utf8.c // training // // Created by Conrad Kleinespel on 5/27/13. // Copyright (c) 2013 Conrad Kleinespel. All rights reserved. // #include <stdio.h> #include <stdlib.h> #include <string.h> #include "utf8.h" int32_t utf8_validate(char * s) { int32_t i = 0; size_t len = strlen(s); while (i < len) { size_t num_bytes = utf8_num_bytes(s + i); if (num_bytes) { i += num_bytes; } else { return 0; } } return 1; } int32_t utf8_is_single_byte(char * c) { return (c[0] & 0x80) == 0x0; } int32_t utf8_is_double_byte(char * c) { return (c[0] & 0xe0) == 0xc0 && utf8_is_continuation(c[1]); } int32_t utf8_is_triple_byte(char * c) { return (c[0] & 0xf0) == 0xe0 && utf8_is_continuation(c[1]) && utf8_is_continuation(c[2]); } int32_t utf8_is_quadruple_byte(char * c) { return (c[0] & 0xf8) == 0xf0 && utf8_is_continuation(c[1]) && utf8_is_continuation(c[2]) && utf8_is_continuation(c[3]); } int32_t utf8_is_continuation(char c) { return (c & 0xc0) == 0x80; } int32_t utf8_strlen(char * s) { int32_t i = 0, len = 0; while(s[i]) { if ( ! 
utf8_is_continuation(s[i])) ++len; ++i; } return len; } char * utf8_remove_trailing_newline(char * s) { size_t len = strlen(s); char * new_string = NULL; if (s[len - 1] == '\n') { new_string = malloc((len) * sizeof(char)); memcpy(new_string, s, len); new_string[len - 1] = 0x0; } else { new_string = malloc((len + 1) * sizeof(char)); strcpy(new_string, s); } return new_string; } size_t utf8_num_bytes(char * s) { size_t len = strlen(s), num_bytes = 0; // is valid single byte (ie 0xxx xxxx) if (len >= 1 && utf8_is_single_byte(s)) { num_bytes = 1; // or is valid double byte (ie 110x xxxx and continuation byte) } else if (len >= 2 && utf8_is_double_byte(s)) { num_bytes = 2; // or is valid tripple byte (ie 1110 xxxx and continuation byte) } else if (len >= 3 && utf8_is_triple_byte(s)) { num_bytes = 3; // or is valid tripple byte (ie 1111 0xxx and continuation byte) } else if (len >= 4 && utf8_is_quadruple_byte(s)) { num_bytes = 4; } return num_bytes; } char * utf8_remove_char(char * s, size_t n) { size_t len = strlen(s); if (len < n) { exit(EXIT_FAILURE); } size_t num_shifts = utf8_num_bytes(s + n); char * new_string = NULL; new_string = malloc(len * sizeof(char)); memcpy(new_string, s, n); memcpy(new_string + n, s + n + num_shifts, len - n - num_shifts + 1); return new_string; } char * utf8_add_char(char * s, char * c, size_t n) { size_t len = strlen(s); if (len < n) { exit(EXIT_FAILURE); } size_t num_shifts = utf8_num_bytes(c); char * new_string = NULL; new_string = malloc((len + num_shifts + 1) * sizeof(char)); // copy the begining of the string memcpy(new_string, s, n); // add the new char memcpy(new_string + n, c, num_shifts); // copy the remaining characters memcpy(new_string + n + num_shifts, s + n, len - n + 1); return new_string; } char * utf8_replace(char * needle, char * replace, char * haystack) { size_t len_replace = strlen(replace), len_needle = strlen(needle), len = strlen(haystack); int32_t diff = (int32_t) (len_replace - len_needle); char * new_string = 
malloc((len + diff) * sizeof(char)); char * pos = strstr(haystack, needle); if (pos == NULL) { strcpy(new_string, haystack); return new_string; } size_t num_shifts = pos - haystack; // Add begining of the string memcpy(new_string, haystack, num_shifts); // Copy the replacement in place of the needle memcpy(new_string + num_shifts, replace, len_replace); // Copy the remainder of the initial string memcpy(new_string + num_shifts + len_replace, pos + len_needle, len - num_shifts - len_needle); return new_string; } This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,25 @@ // // utf8.h // training // // Created by Conrad Kleinespel on 5/27/13. // Copyright (c) 2013 Conrad Kleinespel. All rights reserved. // #ifndef training_utf8_h #define training_utf8_h int32_t utf8_is_continuation(char c); int32_t utf8_validate(char * s); int32_t utf8_strlen(char * s); int32_t utf8_is_single_byte(char * c); int32_t utf8_is_double_byte(char * c); int32_t utf8_is_triple_byte(char * c); int32_t utf8_is_quadruple_byte(char * c); char * utf8_remove_trailing_newline(char * s); char * utf8_remove_char(char * s, size_t n); char * utf8_add_char(char * s, char * c, size_t n); char * utf8_replace(char * needle, char * replace, char * haystack); size_t utf8_num_bytes(char * s); #endif