Skip to content

Instantly share code, notes, and snippets.

@stanislaw
Created November 19, 2015 23:09
Show Gist options
  • Select an option

  • Save stanislaw/f62c36823242c4ffea1b to your computer and use it in GitHub Desktop.

Select an option

Save stanislaw/f62c36823242c4ffea1b to your computer and use it in GitHub Desktop.

Revisions

  1. @conradkleinespel conradkleinespel revised this gist May 28, 2013. 1 changed file with 0 additions and 54 deletions.
    54 changes: 0 additions & 54 deletions z_test.c
    Original file line number Diff line number Diff line change
    @@ -1,54 +0,0 @@
    #include <stdio.h>
    #include <stdlib.h>
    #include <assert.h>
    #include "utf8.h"

    /* Smoke test for the utf8 helpers. Every check is an assert, so the
       program returns 0 only when all of them hold. */
    int main() {
        /* Byte fixtures that cannot be spelled as plain string literals. */
        char lone_lead[2] = { 0xf0, 0x0 };
        char three_bytes[4] = { 0xe0, 0x80, 0x80, 0x0 };
        char four_bytes[5] = { 0xf0, 0x80, 0x80, 0x80, 0x0 };
        char * edited = NULL;

        /* Continuation-byte classification. */
        assert(utf8_is_continuation(0x80));
        assert( ! utf8_is_continuation(0xf0));

        /* Whole-string validation: well-formed text passes, a lead byte
           with no continuation bytes does not. */
        assert(utf8_validate("Hello world, je parle français, héhé !"));
        assert( ! utf8_validate(lone_lead));

        /* Length is counted in characters, not bytes. */
        assert(utf8_strlen("ça é") == 4);

        /* Byte width of the first character. */
        assert(utf8_num_bytes(lone_lead) == 0);
        assert(utf8_num_bytes("hello") == 1);
        assert(utf8_num_bytes("éello") == 2);
        assert(utf8_num_bytes(three_bytes) == 3);
        assert(utf8_num_bytes(four_bytes) == 4);

        /* Editing helpers return heap strings that the caller frees. */
        edited = utf8_remove_char("Helro", 2);
        assert(strcmp(edited, "Hero") == 0);
        free(edited);

        edited = utf8_add_char("Jrémy", "é", 1);
        assert(strcmp(edited, "Jérémy") == 0);
        free(edited);

        edited = utf8_replace("déjeuner", "dîner", "Conrad veut déjeuner !");
        assert(strcmp(edited, "Conrad veut dîner !") == 0);
        free(edited);

        return 0;
    }
  2. @conradkleinespel conradkleinespel revised this gist May 28, 2013. 2 changed files with 48 additions and 1 deletion.
    45 changes: 44 additions & 1 deletion utf8.c
    Original file line number Diff line number Diff line change
    @@ -142,7 +142,7 @@ char * utf8_replace(char * needle, char * replace, char * haystack) {

    int32_t diff = (int32_t) (len_replace - len_needle);

    char * new_string = malloc((len + diff) * sizeof(char));
    char * new_string = calloc((len + diff + 1), sizeof(char));

    char * pos = strstr(haystack, needle);

    @@ -162,5 +162,48 @@ char * utf8_replace(char * needle, char * replace, char * haystack) {
    // Copy the remainder of the initial string
    memcpy(new_string + num_shifts + len_replace, pos + len_needle, len - num_shifts - len_needle);

    return new_string;
    }

    /*
     * Return a newly allocated copy of `haystack` in which every
     * non-overlapping occurrence of `needle` (scanning left to right) is
     * replaced by `replace`.
     *
     * The haystack is scanned exactly once and replaced text is never
     * re-scanned, so the call terminates even when `replace` itself contains
     * `needle`. An empty `needle` matches nothing and yields a plain copy.
     *
     * Returns NULL if the result size would overflow or the allocation
     * fails. The caller owns the returned string and must free() it.
     */
    char * utf8_replace_all(char * needle, char * replace, char * haystack) {
        size_t len_needle = strlen(needle);
        size_t len_replace = strlen(replace);
        size_t len = strlen(haystack);
        size_t count = 0;

        // First pass: count the occurrences so the result can be sized exactly.
        if (len_needle > 0) {
            for (char * hit = strstr(haystack, needle); hit != NULL; hit = strstr(hit + len_needle, needle)) {
                ++count;
            }
        }

        // Refuse sizes that would wrap around when the text grows.
        if (len_replace > len_needle
            && count > ((size_t) -1 - len - 1) / (len_replace - len_needle)) {
            return NULL;
        }

        size_t new_len = len - count * len_needle + count * len_replace;
        char * new_string = malloc(new_len + 1);
        if (new_string == NULL) {
            return NULL;
        }

        // Second pass: copy each gap between matches, then the replacement.
        char * out = new_string;
        char * src = haystack;
        for (size_t k = 0; k < count; ++k) {
            char * hit = strstr(src, needle);
            size_t gap = (size_t) (hit - src);

            memcpy(out, src, gap);
            out += gap;
            memcpy(out, replace, len_replace);
            out += len_replace;

            src = hit + len_needle;
        }

        // Copy whatever follows the last match, terminating null byte included.
        memcpy(out, src, len - (size_t) (src - haystack) + 1);

        return new_string;
    }

    /*
     * Return a newly allocated, null-terminated copy of the first `num`
     * bytes of `s` in which every 0x00 byte is replaced by the two
     * characters backslash and '0'. All other bytes are copied unchanged.
     *
     * `num` is the number of bytes to read from `s`; it does not include any
     * terminating null byte.
     *
     * Returns NULL if the worst-case size would overflow or the allocation
     * fails. The caller owns the returned string and must free() it.
     */
    char * utf8_escape_null_bytes(const char * s, size_t num) {
        // Worst case every input byte is a null byte and doubles in size.
        if (num > ((size_t) -1 - 1) / 2) {
            return NULL;
        }

        // calloc zero-fills, which also provides the terminator.
        char * new_string = calloc(num * 2 + 1, sizeof(char));
        if (new_string == NULL) {
            return NULL;
        }

        size_t out = 0;
        for (size_t in = 0; in < num; ++in) {
            if (s[in] == 0x0) {
                new_string[out++] = '\\';
                new_string[out++] = '0';
            } else {
                new_string[out++] = s[in];
            }
        }

        return new_string;
    }
    4 changes: 4 additions & 0 deletions utf8.h
    Original file line number Diff line number Diff line change
    @@ -20,6 +20,10 @@ char * utf8_remove_trailing_newline(char * s);
    char * utf8_remove_char(char * s, size_t n);
    char * utf8_add_char(char * s, char * c, size_t n);
    char * utf8_replace(char * needle, char * replace, char * haystack);
    char * utf8_replace_all(char * needle, char * replace, char * haystack);
    size_t utf8_num_bytes(char * s);

    // Escape the null bytes in the given string that has the given length
    char * utf8_escape_null_bytes(const char * s, size_t num);

    #endif
  3. @conradkleinespel conradkleinespel revised this gist May 28, 2013. 3 changed files with 3 additions and 3 deletions.
    4 changes: 2 additions & 2 deletions utf8.c
    Original file line number Diff line number Diff line change
    @@ -48,8 +48,8 @@ int32_t utf8_is_continuation(char c) {
    return (c & 0xc0) == 0x80;
    }

    int32_t utf8_strlen(char * s) {
    int32_t i = 0, len = 0;
    size_t utf8_strlen(char * s) {
    size_t i = 0, len = 0;
    while(s[i]) {
    if ( ! utf8_is_continuation(s[i])) ++len;
    ++i;
    2 changes: 1 addition & 1 deletion utf8.h
    Original file line number Diff line number Diff line change
    @@ -11,7 +11,7 @@

    int32_t utf8_is_continuation(char c);
    int32_t utf8_validate(char * s);
    int32_t utf8_strlen(char * s);
    size_t utf8_strlen(char * s);
    int32_t utf8_is_single_byte(char * c);
    int32_t utf8_is_double_byte(char * c);
    int32_t utf8_is_triple_byte(char * c);
    File renamed without changes.
  4. @conradkleinespel conradkleinespel revised this gist May 28, 2013. 1 changed file with 54 additions and 0 deletions.
    54 changes: 54 additions & 0 deletions test.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,54 @@
    #include <stdio.h>
    #include <stdlib.h>
    #include <assert.h>
    #include "utf8.h"

    /* Smoke test for the utf8 helpers. Every check is an assert, so the
       program returns 0 only when all of them hold. */
    int main() {
        /* Byte fixtures that cannot be spelled as plain string literals. */
        char lone_lead[2] = { 0xf0, 0x0 };
        char three_bytes[4] = { 0xe0, 0x80, 0x80, 0x0 };
        char four_bytes[5] = { 0xf0, 0x80, 0x80, 0x80, 0x0 };
        char * edited = NULL;

        /* Continuation-byte classification. */
        assert(utf8_is_continuation(0x80));
        assert( ! utf8_is_continuation(0xf0));

        /* Whole-string validation: well-formed text passes, a lead byte
           with no continuation bytes does not. */
        assert(utf8_validate("Hello world, je parle français, héhé !"));
        assert( ! utf8_validate(lone_lead));

        /* Length is counted in characters, not bytes. */
        assert(utf8_strlen("ça é") == 4);

        /* Byte width of the first character. */
        assert(utf8_num_bytes(lone_lead) == 0);
        assert(utf8_num_bytes("hello") == 1);
        assert(utf8_num_bytes("éello") == 2);
        assert(utf8_num_bytes(three_bytes) == 3);
        assert(utf8_num_bytes(four_bytes) == 4);

        /* Editing helpers return heap strings that the caller frees. */
        edited = utf8_remove_char("Helro", 2);
        assert(strcmp(edited, "Hero") == 0);
        free(edited);

        edited = utf8_add_char("Jrémy", "é", 1);
        assert(strcmp(edited, "Jérémy") == 0);
        free(edited);

        edited = utf8_replace("déjeuner", "dîner", "Conrad veut déjeuner !");
        assert(strcmp(edited, "Conrad veut dîner !") == 0);
        free(edited);

        return 0;
    }
  5. @conradkleinespel conradkleinespel created this gist May 28, 2013.
    166 changes: 166 additions & 0 deletions utf8.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,166 @@
    //
    // utf8.c
    // training
    //
    // Created by Conrad Kleinespel on 5/27/13.
    // Copyright (c) 2013 Conrad Kleinespel. All rights reserved.
    //

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "utf8.h"

    /*
     * Check that the null-terminated string `s` can be split, from start to
     * end, into sequences that utf8_num_bytes() accepts.
     *
     * Returns 1 when the whole string is consumed this way (an empty string
     * passes trivially) and 0 as soon as utf8_num_bytes() reports 0 for the
     * sequence starting at the current position.
     */
    int32_t utf8_validate(char * s) {
        // size_t index: it is compared against and advanced by size_t values.
        size_t i = 0;
        size_t len = strlen(s);

        while (i < len) {
            size_t num_bytes = utf8_num_bytes(s + i);

            if (num_bytes == 0) {
                return 0;
            }

            i += num_bytes;
        }

        return 1;
    }

    /* Return 1 when the byte at c[0] has its top bit clear (0xxx xxxx),
       i.e. it is a one-byte sequence, and 0 otherwise. */
    int32_t utf8_is_single_byte(char * c) {
        const unsigned char lead = (unsigned char) c[0];

        return lead < 0x80;
    }

    /* Return 1 when c[0] is a two-byte lead (110x xxxx) and c[1] is a
       continuation byte, 0 otherwise. c[1] is only read when the lead
       byte matches. */
    int32_t utf8_is_double_byte(char * c) {
        if ((c[0] & 0xe0) != 0xc0) {
            return 0;
        }

        return utf8_is_continuation(c[1]) != 0;
    }

    /* Return 1 when c[0] is a three-byte lead (1110 xxxx) followed by two
       continuation bytes, 0 otherwise. Bytes are examined left to right and
       the check stops at the first one that does not match. */
    int32_t utf8_is_triple_byte(char * c) {
        if ((c[0] & 0xf0) != 0xe0) {
            return 0;
        }
        if ( ! utf8_is_continuation(c[1])) {
            return 0;
        }

        return utf8_is_continuation(c[2]) != 0;
    }

    /* Return 1 when c[0] is a four-byte lead (1111 0xxx) followed by three
       continuation bytes, 0 otherwise. Bytes are examined left to right and
       the check stops at the first one that does not match. */
    int32_t utf8_is_quadruple_byte(char * c) {
        if ((c[0] & 0xf8) != 0xf0) {
            return 0;
        }
        if ( ! utf8_is_continuation(c[1])) {
            return 0;
        }
        if ( ! utf8_is_continuation(c[2])) {
            return 0;
        }

        return utf8_is_continuation(c[3]) != 0;
    }

    /* Return 1 when `c` is a continuation byte, i.e. its two most
       significant bits are 10 (10xx xxxx), and 0 otherwise. */
    int32_t utf8_is_continuation(char c) {
        const unsigned char byte = (unsigned char) c;

        return (byte >> 6) == 0x2;
    }

    /* Count the characters in the null-terminated string `s` by counting
       every byte that is not a continuation byte. */
    int32_t utf8_strlen(char * s) {
        int32_t count = 0;

        for (char * p = s; *p; ++p) {
            if (utf8_is_continuation(*p)) {
                continue;
            }
            ++count;
        }

        return count;
    }

    /*
     * Return a newly allocated copy of `s` with one trailing '\n' removed,
     * if there is one. Strings that do not end in '\n', including the empty
     * string, are copied unchanged.
     *
     * Returns NULL if the allocation fails. The caller owns the returned
     * string and must free() it.
     */
    char * utf8_remove_trailing_newline(char * s) {
        size_t len = strlen(s);

        // Only look at the last byte when there is one: s[len - 1] with
        // len == 0 would read before the start of the string.
        if (len > 0 && s[len - 1] == '\n') {
            --len;
        }

        char * new_string = malloc((len + 1) * sizeof(char));
        if (new_string == NULL) {
            return NULL;
        }

        memcpy(new_string, s, len);
        new_string[len] = 0x0;

        return new_string;
    }

    /*
     * Return the number of bytes (1 to 4) occupied by the sequence that
     * starts at s[0], or 0 when `s` is empty or its first bytes match none
     * of the single/double/triple/quadruple-byte patterns.
     *
     * Only the lead-byte and continuation-byte bit patterns are checked.
     */
    size_t utf8_num_bytes(char * s) {
        // At most four bytes matter, so count up to four instead of running
        // strlen() over the whole remaining string on every call.
        size_t avail = 0;
        while (avail < 4 && s[avail] != 0x0) {
            ++avail;
        }

        // is valid single byte (ie 0xxx xxxx)
        if (avail >= 1 && utf8_is_single_byte(s)) {
            return 1;
        }

        // or is valid double byte (ie 110x xxxx and continuation byte)
        if (avail >= 2 && utf8_is_double_byte(s)) {
            return 2;
        }

        // or is valid triple byte (ie 1110 xxxx and continuation bytes)
        if (avail >= 3 && utf8_is_triple_byte(s)) {
            return 3;
        }

        // or is valid quadruple byte (ie 1111 0xxx and continuation bytes)
        if (avail >= 4 && utf8_is_quadruple_byte(s)) {
            return 4;
        }

        return 0;
    }

    /*
     * Return a newly allocated copy of `s` with the character that starts at
     * byte offset `n` removed. The width of that character is taken from
     * utf8_num_bytes(s + n); when it reports 0 (for example when n is the
     * end of the string) nothing is removed and the result is a plain copy.
     *
     * Terminates the process with EXIT_FAILURE when `n` is greater than the
     * byte length of `s`. Returns NULL if the allocation fails. The caller
     * owns the returned string and must free() it.
     */
    char * utf8_remove_char(char * s, size_t n) {
        size_t len = strlen(s);
        if (len < n) {
            exit(EXIT_FAILURE);
        }

        size_t num_shifts = utf8_num_bytes(s + n);

        // Bytes that follow the removed character, terminator excluded.
        size_t tail = len - n - num_shifts;

        // Size the buffer from what is actually written: `len` bytes is one
        // short whenever num_shifts is 0.
        char * new_string = malloc((n + tail + 1) * sizeof(char));
        if (new_string == NULL) {
            return NULL;
        }

        memcpy(new_string, s, n);
        // The +1 carries the terminating null byte across.
        memcpy(new_string + n, s + n + num_shifts, tail + 1);

        return new_string;
    }

    /*
     * Return a newly allocated copy of `s` with the first character of `c`
     * inserted at byte offset `n`. The number of bytes taken from `c` is
     * utf8_num_bytes(c); when it reports 0 nothing is inserted and the
     * result is a plain copy of `s`.
     *
     * Terminates the process with EXIT_FAILURE when `n` is greater than the
     * byte length of `s`. Returns NULL if the allocation fails. The caller
     * owns the returned string and must free() it.
     */
    char * utf8_add_char(char * s, char * c, size_t n) {
        size_t len = strlen(s);
        if (len < n) {
            exit(EXIT_FAILURE);
        }

        size_t num_shifts = utf8_num_bytes(c);

        char * new_string = malloc((len + num_shifts + 1) * sizeof(char));
        if (new_string == NULL) {
            return NULL;
        }

        // copy the beginning of the string
        memcpy(new_string, s, n);

        // add the new char
        memcpy(new_string + n, c, num_shifts);

        // copy the remaining characters, terminating null byte included
        memcpy(new_string + n + num_shifts, s + n, len - n + 1);

        return new_string;
    }

    /*
     * Return a newly allocated copy of `haystack` in which the first
     * occurrence of `needle` is replaced by `replace`. When `needle` does
     * not occur, the result is a plain copy of `haystack`.
     *
     * The result is always null-terminated. Returns NULL if the allocation
     * fails. The caller owns the returned string and must free() it.
     */
    char * utf8_replace(char * needle, char * replace, char * haystack) {
        size_t
        len_replace = strlen(replace),
        len_needle = strlen(needle),
        len = strlen(haystack);

        char * pos = strstr(haystack, needle);

        if (pos == NULL) {
            // Nothing to replace: size the copy from the haystack itself,
            // not from the needle/replacement length difference.
            char * copy = malloc((len + 1) * sizeof(char));
            if (copy == NULL) {
                return NULL;
            }
            memcpy(copy, haystack, len + 1);
            return copy;
        }

        size_t num_shifts = (size_t) (pos - haystack);

        // Bytes that follow the needle, terminator excluded.
        size_t tail = len - num_shifts - len_needle;

        char * new_string = malloc((num_shifts + len_replace + tail + 1) * sizeof(char));
        if (new_string == NULL) {
            return NULL;
        }

        // Add beginning of the string
        memcpy(new_string, haystack, num_shifts);

        // Copy the replacement in place of the needle
        memcpy(new_string + num_shifts, replace, len_replace);

        // Copy the remainder of the initial string, terminating null byte included
        memcpy(new_string + num_shifts + len_replace, pos + len_needle, tail + 1);

        return new_string;
    }
    25 changes: 25 additions & 0 deletions utf8.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,25 @@
    //
    // utf8.h
    // training
    //
    // Created by Conrad Kleinespel on 5/27/13.
    // Copyright (c) 2013 Conrad Kleinespel. All rights reserved.
    //

    #ifndef training_utf8_h
    #define training_utf8_h

    // The prototypes below use size_t and int32_t, so pull in their
    // definitions here instead of relying on the including file.
    #include <stddef.h>
    #include <stdint.h>

    // Non-zero when c has the continuation-byte pattern 10xx xxxx.
    int32_t utf8_is_continuation(char c);

    // 1 when the whole string splits into sequences accepted by
    // utf8_num_bytes(), 0 otherwise.
    int32_t utf8_validate(char * s);

    // Number of characters in s (bytes that are not continuation bytes).
    int32_t utf8_strlen(char * s);

    // Non-zero when the bytes at c match the 1-, 2-, 3- or 4-byte pattern.
    int32_t utf8_is_single_byte(char * c);
    int32_t utf8_is_double_byte(char * c);
    int32_t utf8_is_triple_byte(char * c);
    int32_t utf8_is_quadruple_byte(char * c);

    // The functions below return a newly allocated string that the caller
    // must free(). n is a byte offset into s.
    char * utf8_remove_trailing_newline(char * s);
    char * utf8_remove_char(char * s, size_t n);
    char * utf8_add_char(char * s, char * c, size_t n);
    char * utf8_replace(char * needle, char * replace, char * haystack);

    // Byte width (1 to 4) of the sequence starting at s, or 0 when s is
    // empty or matches none of the patterns.
    size_t utf8_num_bytes(char * s);

    #endif