stanislaw · November 19, 2015 23:09 · May 28, 2013 · May 28, 2013 · May 28, 2013 · May 28, 2013
diff --git a/z_test.c b/z_test.c
@@ -1,54 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include "utf8.h"
-
-int main() {
-    assert(utf8_is_continuation(0x80));
-    assert( ! utf8_is_continuation(0xf0));
-
-    assert(utf8_validate("Hello world, je parle français, héhé !"));
-    {
-        char s[2] = { 0xf0, 0x0 };
-        assert( ! utf8_validate(s));
-    }
-
-    assert(utf8_strlen("ça é") == 4);
-
-    {
-        char s[2] = { 0xf0, 0x0 };
-        assert(utf8_num_bytes(s) == 0);
-    }
-    assert(utf8_num_bytes("hello") == 1);
-    assert(utf8_num_bytes("éello") == 2);
-
-    {
-        char s[4] = { 0xe0, 0x80, 0x80, 0x0 };
-        assert(utf8_num_bytes(s) == 3);
-    }
-
-    {
-        char s[5] = { 0xf0, 0x80, 0x80, 0x80, 0x0 };
-        assert(utf8_num_bytes(s) == 4);
-    }
-
-    {
-        char * s = utf8_remove_char("Helro", 2);
-        assert(strcmp(s, "Hero") == 0);
-        free(s);
-    }
-
-    {
-        char * s = utf8_add_char("Jrémy", "é", 1);
-        assert(strcmp(s, "Jérémy") == 0);
-        free(s);
-    }
-
-    {
-        char * s = utf8_replace("déjeuner", "dîner", "Conrad veut déjeuner !");
-        assert(strcmp(s, "Conrad veut dîner !") == 0);
-        free(s);
-    }
-
-    return 0;
-}

diff --git a/utf8.c b/utf8.c
@@ -142,7 +142,7 @@ char * utf8_replace(char * needle, char * replace, char * haystack) {
 
     int32_t diff = (int32_t) (len_replace - len_needle);
 
-    char * new_string = malloc((len + diff) * sizeof(char));
+    char * new_string = calloc((len + diff + 1), sizeof(char));
 
     char * pos = strstr(haystack, needle);
 
@@ -162,5 +162,48 @@ char * utf8_replace(char * needle, char * replace, char * haystack) {
     // Copy the remainder of the initial string
     memcpy(new_string + num_shifts + len_replace, pos + len_needle, len - num_shifts - len_needle);
 
+    return new_string;
+}
+
+char * utf8_replace_all(char * needle, char * replace, char * haystack) {
+    char
+        * new_string = utf8_replace(needle, replace, haystack),
+        * old_new_string = NULL;
+
+    while (strstr(new_string, needle) != NULL) {
+        old_new_string = new_string;
+        new_string = utf8_replace(needle, replace, new_string);
+        free(old_new_string);
+    }
+
+    return new_string;
+}
+
+// the length here is the wanted length of the string, not including the terminating null byte
+char * utf8_escape_null_bytes(const char * s, size_t num) {
+    char * new_string = NULL;
+    // double the amount of available space in case we have only null bytes
+    size_t new_size = (num * 2 + 1) * sizeof(char);
+    new_string = malloc(new_size);
+    memset(new_string, '\0', new_size);
+
+    // count number of null bytes
+    size_t
+        num_null_bytes = 0,
+        num_from_s = 0;
+
+    while (num_from_s < num) {
+
+        if (s[num_from_s] == 0x0) {
+            new_string[num_from_s + num_null_bytes] = '\\';
+            new_string[num_from_s + num_null_bytes + 1] = '0';
+            num_null_bytes++;
+        } else {
+            new_string[num_from_s + num_null_bytes] = s[num_from_s];
+        }
+
+        num_from_s++;
+    }
+
     return new_string;
 }
diff --git a/utf8.h b/utf8.h
@@ -20,6 +20,10 @@ char * utf8_remove_trailing_newline(char * s);
 char * utf8_remove_char(char * s, size_t n);
 char * utf8_add_char(char * s, char * c, size_t n);
 char * utf8_replace(char * needle, char * replace, char * haystack);
+char * utf8_replace_all(char * needle, char * replace, char * haystack);
 size_t utf8_num_bytes(char * s);
 
+// Escape the null bytes in the given string that has the given length
+char * utf8_escape_null_bytes(const char * s, size_t num);
+
 #endif
diff --git a/utf8.c b/utf8.c
@@ -48,8 +48,8 @@ int32_t utf8_is_continuation(char c) {
     return (c & 0xc0) == 0x80;
 }
 
-int32_t utf8_strlen(char * s) {
-    int32_t i = 0, len = 0;
+size_t utf8_strlen(char * s) {
+    size_t i = 0, len = 0;
     while(s[i]) {
         if ( ! utf8_is_continuation(s[i])) ++len;
         ++i;

diff --git a/utf8.h b/utf8.h
@@ -11,7 +11,7 @@
 
 int32_t utf8_is_continuation(char c);
 int32_t utf8_validate(char * s);
-int32_t utf8_strlen(char * s);
+size_t utf8_strlen(char * s);
 int32_t utf8_is_single_byte(char * c);
 int32_t utf8_is_double_byte(char * c);
 int32_t utf8_is_triple_byte(char * c);

diff --git a/test.c → z_test.c b/test.c → z_test.c
diff --git a/test.c b/test.c
@@ -0,0 +1,54 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "utf8.h"
+
+int main() {
+    assert(utf8_is_continuation(0x80));
+    assert( ! utf8_is_continuation(0xf0));
+
+    assert(utf8_validate("Hello world, je parle français, héhé !"));
+    {
+        char s[2] = { 0xf0, 0x0 };
+        assert( ! utf8_validate(s));
+    }
+
+    assert(utf8_strlen("ça é") == 4);
+
+    {
+        char s[2] = { 0xf0, 0x0 };
+        assert(utf8_num_bytes(s) == 0);
+    }
+    assert(utf8_num_bytes("hello") == 1);
+    assert(utf8_num_bytes("éello") == 2);
+
+    {
+        char s[4] = { 0xe0, 0x80, 0x80, 0x0 };
+        assert(utf8_num_bytes(s) == 3);
+    }
+
+    {
+        char s[5] = { 0xf0, 0x80, 0x80, 0x80, 0x0 };
+        assert(utf8_num_bytes(s) == 4);
+    }
+
+    {
+        char * s = utf8_remove_char("Helro", 2);
+        assert(strcmp(s, "Hero") == 0);
+        free(s);
+    }
+
+    {
+        char * s = utf8_add_char("Jrémy", "é", 1);
+        assert(strcmp(s, "Jérémy") == 0);
+        free(s);
+    }
+
+    {
+        char * s = utf8_replace("déjeuner", "dîner", "Conrad veut déjeuner !");
+        assert(strcmp(s, "Conrad veut dîner !") == 0);
+        free(s);
+    }
+
+    return 0;
+}
diff --git a/utf8.c b/utf8.c
@@ -0,0 +1,166 @@
+//
+//  utf8.c
+//  training
+//
+//  Created by Conrad Kleinespel on 5/27/13.
+//  Copyright (c) 2013 Conrad Kleinespel. All rights reserved.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "utf8.h"
+
+int32_t utf8_validate(char * s) {
+    int32_t i = 0;
+    size_t len = strlen(s);
+
+    while (i < len) {        
+        size_t num_bytes = utf8_num_bytes(s + i);
+
+        if (num_bytes) {
+            i += num_bytes;
+        } else {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+int32_t utf8_is_single_byte(char * c) {
+    return (c[0] & 0x80) == 0x0;
+}
+
+int32_t utf8_is_double_byte(char * c) {
+    return (c[0] & 0xe0) == 0xc0 && utf8_is_continuation(c[1]);
+}
+
+int32_t utf8_is_triple_byte(char * c) {
+    return (c[0] & 0xf0) == 0xe0 && utf8_is_continuation(c[1]) && utf8_is_continuation(c[2]);
+}
+
+int32_t utf8_is_quadruple_byte(char * c) {
+    return (c[0] & 0xf8) == 0xf0 && utf8_is_continuation(c[1]) && utf8_is_continuation(c[2]) && utf8_is_continuation(c[3]);
+}
+
+int32_t utf8_is_continuation(char c) {
+    return (c & 0xc0) == 0x80;
+}
+
+int32_t utf8_strlen(char * s) {
+    int32_t i = 0, len = 0;
+    while(s[i]) {
+        if ( ! utf8_is_continuation(s[i])) ++len;
+        ++i;
+    }
+    return len;
+}
+
+char * utf8_remove_trailing_newline(char * s) {
+    size_t len = strlen(s);
+    char * new_string = NULL;
+
+    if (s[len - 1] == '\n') {
+        new_string = malloc((len) * sizeof(char));
+        memcpy(new_string, s, len);
+        new_string[len - 1] = 0x0;
+    } else {
+        new_string = malloc((len + 1) * sizeof(char));
+        strcpy(new_string, s);
+    }
+
+    return new_string;
+}
+
+size_t utf8_num_bytes(char * s) {
+    size_t len = strlen(s), num_bytes = 0;
+
+    // is valid single byte (ie 0xxx xxxx)
+    if (len >= 1 && utf8_is_single_byte(s)) {
+        num_bytes = 1;
+
+    // or is valid double byte (ie 110x xxxx and continuation byte)
+    } else if (len >= 2 && utf8_is_double_byte(s)) {
+        num_bytes = 2;
+
+    // or is valid triple byte (ie 1110 xxxx and two continuation bytes)
+    } else if (len >= 3 && utf8_is_triple_byte(s)) {
+        num_bytes = 3;
+
+    // or is valid quadruple byte (ie 1111 0xxx and three continuation bytes)
+    } else if (len >= 4 && utf8_is_quadruple_byte(s)) {
+        num_bytes = 4;
+    }
+
+    return num_bytes;
+}
+
+char * utf8_remove_char(char * s, size_t n) {
+    size_t len = strlen(s);
+    if (len < n) {
+        exit(EXIT_FAILURE);
+    }
+
+    size_t num_shifts = utf8_num_bytes(s + n);
+    char * new_string = NULL;
+    new_string = malloc(len * sizeof(char));
+
+    memcpy(new_string, s, n);
+    memcpy(new_string + n, s + n + num_shifts, len - n - num_shifts + 1);
+
+    return new_string;
+}
+
+char * utf8_add_char(char * s, char * c, size_t n) {
+    size_t len = strlen(s);
+    if (len < n) {
+        exit(EXIT_FAILURE);
+    }
+
+    size_t num_shifts = utf8_num_bytes(c);
+    char * new_string = NULL;
+    new_string = malloc((len + num_shifts + 1) * sizeof(char));
+
+    // copy the beginning of the string
+    memcpy(new_string, s, n);
+
+    // add the new char
+    memcpy(new_string + n, c, num_shifts);
+
+    // copy the remaining characters
+    memcpy(new_string + n + num_shifts, s + n, len - n + 1);
+
+    return new_string;
+}
+
+char * utf8_replace(char * needle, char * replace, char * haystack) {
+    size_t
+        len_replace = strlen(replace),
+        len_needle = strlen(needle),
+        len = strlen(haystack);
+
+    int32_t diff = (int32_t) (len_replace - len_needle);
+
+    char * new_string = malloc((len + diff) * sizeof(char));
+
+    char * pos = strstr(haystack, needle);
+
+    if (pos == NULL) {
+        strcpy(new_string, haystack);
+        return new_string;
+    }
+
+    size_t num_shifts = pos - haystack;
+
+    // Add beginning of the string
+    memcpy(new_string, haystack, num_shifts);
+
+    // Copy the replacement in place of the needle
+    memcpy(new_string + num_shifts, replace, len_replace);
+
+    // Copy the remainder of the initial string
+    memcpy(new_string + num_shifts + len_replace, pos + len_needle, len - num_shifts - len_needle);
+
+    return new_string;
+}
diff --git a/utf8.h b/utf8.h
@@ -0,0 +1,25 @@
+//
+//  utf8.h
+//  training
+//
+//  Created by Conrad Kleinespel on 5/27/13.
+//  Copyright (c) 2013 Conrad Kleinespel. All rights reserved.
+//
+
+#ifndef training_utf8_h
+#define training_utf8_h
+
+int32_t utf8_is_continuation(char c);
+int32_t utf8_validate(char * s);
+int32_t utf8_strlen(char * s);
+int32_t utf8_is_single_byte(char * c);
+int32_t utf8_is_double_byte(char * c);
+int32_t utf8_is_triple_byte(char * c);
+int32_t utf8_is_quadruple_byte(char * c);
+char * utf8_remove_trailing_newline(char * s);
+char * utf8_remove_char(char * s, size_t n);
+char * utf8_add_char(char * s, char * c, size_t n);
+char * utf8_replace(char * needle, char * replace, char * haystack);
+size_t utf8_num_bytes(char * s);
+
+#endif
No results found