Last active
August 29, 2015 13:59
-
-
Save Inndy/10484418 to your computer and use it in GitHub Desktop.
Count UTF-8 character byte length and split.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
// Size of the file buffer used by main().
#define BUFFER_SIZE 4096

// Reference: http://en.wikipedia.org/wiki/UTF-8
// Tag bits found at the top of a UTF-8 byte.  UTF8_MASK_C marks a
// continuation (body) byte; UTF8_MASK_n marks the lead byte of an n-byte
// sequence.
#define UTF8_MASK_C 0x080 // 0b10000000
#define UTF8_MASK_2 0x0C0 // 0b11000000
#define UTF8_MASK_3 0x0E0 // 0b11100000
#define UTF8_MASK_4 0x0F0 // 0b11110000
#define UTF8_MASK_5 0x0F8 // 0b11111000
#define UTF8_MASK_6 0x0FC // 0b11111100

// Extend a tag by one bit so that the 0 bit terminating the run of 1s is
// part of the comparison (e.g. 0xC0 -> 0xE0).
#define UTF8_MKMASK(X) ((X) | ((X) >> 1))

// Test leading bytes: true when X carries exactly the tag M.
#define UTF8_TEST_L(X, M) (((X) & UTF8_MKMASK(M)) == (M))

// Test body bytes: true only for 10xxxxxx.  Bit 6 is compared as well so
// that lead bytes (11xxxxxx) are not accepted as continuation bytes.
#define UTF8_TEST_B(X) (((X) & UTF8_MKMASK(UTF8_MASK_C)) == UTF8_MASK_C)
// Check body bytes.
// Verifies that the `l` bytes starting at `data` are all UTF-8 continuation
// bytes, i.e. of the form 10xxxxxx.
// Returns 1 when every byte qualifies (including when l <= 0, where there is
// nothing to check) and 0 otherwise.
int utf8_check_bytes(unsigned char * data, int l)
{
    // A continuation byte has its top two bits equal to 10; testing only the
    // top bit would also accept lead bytes such as 0xC3.
    enum { BODY_MASK = 0xC0, BODY_TAG = 0x80 };
    int i;

    if (data == NULL)
        return l <= 0;

    for (i = 0; i < l; i++) {
        if ((data[i] & BODY_MASK) != BODY_TAG)
            return 0;
    }
    return 1;
}
// get character size | |
int utf8_get_char_size(unsigned char data) { | |
if (data) { | |
if (UTF8_TEST_L(data, UTF8_MASK_6)) { | |
return 6; | |
} else if (UTF8_TEST_L(data, UTF8_MASK_5)) { | |
return 5; | |
} else if (UTF8_TEST_L(data, UTF8_MASK_4)) { | |
return 4; | |
} else if (UTF8_TEST_L(data, UTF8_MASK_3)) { | |
return 3; | |
} else if (UTF8_TEST_L(data, UTF8_MASK_2)) { | |
return 2; | |
} else if ((data & UTF8_MASK_C) == 0) { | |
return 1; | |
} | |
} else { | |
return 0; | |
} | |
return -1; | |
} | |
// Examine the UTF-8 character that starts at `data`.
//   buffer  - may be NULL; otherwise receives the character's raw bytes
//             (no terminator is appended), so it needs room for 6 bytes
//   out_len - may be NULL; otherwise receives
//               > 0  the character's length in bytes,
//                 0  when `data` points at the string terminator,
//                -1  when the first byte cannot start a character,
//                -2  when one of the following body bytes is invalid
// Returns the position just past the character on success, or NULL when the
// string has ended or the character is invalid.
unsigned char * utf8_get_char(unsigned char * data, unsigned char * buffer,
                              int * out_len)
{
    int size = utf8_get_char_size(*data);

    // A multi-byte lead must be followed by size - 1 valid body bytes.
    if (size > 1 && !utf8_check_bytes(data + 1, size - 1))
        size = -2;

    if (out_len != NULL)
        *out_len = size;

    if (size <= 0)
        return NULL;

    if (buffer != NULL)
        memcpy(buffer, data, size);

    return data + size;
}
// Count characters in UTF-8.
// Counts the complete, valid characters stored in the first `buffer_size`
// bytes of `data`.  Counting stops at the first NUL byte, at the first
// invalid sequence, or at a character that would extend past the end of the
// buffer.  Returns 0 for a NULL buffer or a non-positive size.
int utf8_char_count(unsigned char * data, int buffer_size) {
    unsigned char *p, *end;
    int n = 0;

    if (data == NULL || buffer_size <= 0)
        return 0;

    p = data;
    end = data + buffer_size;

    while (p < end) {
        // Look only at the lead byte first, so that no byte at or beyond
        // `end` is read while validating the character.
        int len = utf8_get_char_size(*p);
        unsigned char *next;

        if (len <= 0 || len > end - p)
            break;

        next = utf8_get_char(p, NULL, NULL);
        if (next == NULL)
            break;

        n++;
        p = next;
    }
    return n;
}
int main () | |
{ | |
unsigned char buffer[BUFFER_SIZE], data[16] = { 0 }; | |
unsigned char *p = buffer; | |
char filename[1024]; | |
int flen, clen; | |
scanf("%1000s", filename); | |
FILE* fp = fopen(filename, "rb"); | |
if (!fp) { | |
fprintf(stderr, "Open file failed."); | |
return -1; | |
} | |
fseek(fp, 0, SEEK_END); | |
flen = ftell(fp); | |
fseek(fp, 0, SEEK_SET); | |
if (flen < BUFFER_SIZE) { | |
fread(buffer, flen, 1, fp); | |
int total_chars = utf8_char_count(buffer, sizeof(buffer)); | |
printf("total char counts = %d\n", total_chars); | |
while (1) { | |
p = utf8_get_char(p, data, &clen); | |
if (p) { | |
data[clen] = '\0'; | |
printf("Len = %d, Char: %s\n", clen, data); | |
} else { | |
break; | |
} | |
} | |
} else { | |
fprintf(stderr, "File is too large (%d)", flen); | |
} | |
fclose(fp); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment