Inndy · August 29, 2015 13:59
diff --git a/utf8_split_char.c b/utf8_split_char.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 // Size in bytes of the buffer that holds the input file
 #define BUFFER_SIZE 4096

 // Reference: http://en.wikipedia.org/wiki/UTF-8
 #define UTF8_MASK_C 0x080 // 0b10000000

 #define UTF8_MASK_2 0x0C0 // 0b11000000
 #define UTF8_MASK_3 0x0E0 // 0b11100000
 #define UTF8_MASK_4 0x0F0 // 0b11110000
 #define UTF8_MASK_5 0x0F8 // 0b11111000
 #define UTF8_MASK_6 0x0FC // 0b11111100
 // Widen a lead-byte pattern by one bit so that the 0 bit which ends the
 // run of leading 1s is part of the comparison as well
 #define UTF8_MKMASK(X) ((X) | ((X) >> 1))
 // Test leading bytes
 #define UTF8_TEST_L(X, M) (((X) & UTF8_MKMASK(M)) == (M))
 // Test body bytes: a continuation byte is 10xxxxxx, so the top two bits
 // are compared (checking only the top bit would also accept lead bytes)
 #define UTF8_TEST_B(X) (((X) & UTF8_MASK_2) == UTF8_MASK_C)

 // Verify a run of body bytes.
 // Returns 1 when each of the l bytes starting at data satisfies
 // UTF8_TEST_B, and 0 as soon as one byte does not (l == 0 yields 1).
 int utf8_check_bytes(unsigned char * data, int l)
 {
 	const unsigned char *cursor = data;

 	for (; l-- != 0; cursor++) {
 		if (UTF8_TEST_B(*cursor))
 			continue;
 		return 0;
 	}
 	return 1;
 }

 // Report the sequence length announced by a leading byte.
 // Returns 0 for the NUL byte, 2..6 when the byte matches the
 // corresponding UTF8_MASK_n lead pattern, 1 when the high bit is clear,
 // and -1 when none of these apply.
 int utf8_get_char_size(unsigned char data) {
 	// Lead-byte patterns ordered from the 6-byte form down to the 2-byte form
 	static const int lead_patterns[] = {
 		UTF8_MASK_6, UTF8_MASK_5, UTF8_MASK_4, UTF8_MASK_3, UTF8_MASK_2
 	};
 	int size;

 	if (data == 0)
 		return 0;

 	for (size = 6; size >= 2; size--) {
 		if (UTF8_TEST_L(data, lead_patterns[6 - size]))
 			return size;
 	}

 	// High bit clear means a single-byte character
 	return (data & UTF8_MASK_C) == 0 ? 1 : -1;
 }

 // Extract the character that starts at data.
 // buffer (optional, may be NULL) receives the character's bytes without
 // a terminator; it needs room for at least 6 bytes.
 // out_len (optional, may be NULL) receives:
 //    > 0  size of the character in bytes
 //      0  data points at the string terminator
 //     -1  the leading byte is not recognised
 //     -2  a body byte failed utf8_check_bytes
 // Returns the position just past the character on success, or NULL when
 // the string is terminated or the sequence is rejected.
 unsigned char * utf8_get_char(unsigned char * data, unsigned char * buffer, 
 	int * out_len)
 {
 	int size = utf8_get_char_size(*data);

 	// Multi-byte sequence: every byte after the lead must be a body byte
 	if (size > 1 && !utf8_check_bytes(data + 1, size - 1)) {
 		size = -2;
 	}

 	if (out_len != NULL) {
 		*out_len = size;
 	}

 	if (size <= 0) {
 		return NULL;
 	}

 	if (buffer != NULL) {
 		memcpy(buffer, data, (size_t)size);
 	}
 	return data + size;
 }

 // Count characters in UTF-8
 // Counts the characters held in the first buffer_size bytes of data.
 // Counting stops at the string terminator, at the first sequence that
 // utf8_get_char rejects, or at a character that would extend past the
 // end of the buffer. No byte at or beyond data + buffer_size is read.
 // Returns 0 when data is NULL or buffer_size is not positive.
 int utf8_char_count(unsigned char * data, int buffer_size) {
 	unsigned char *p = data, *end;
 	int n = 0;

 	if (data == NULL || buffer_size <= 0)
 		return 0;
 	end = data + buffer_size;

 	while (p < end) {
 		// Look at the lead byte first so the body bytes are only
 		// validated when the whole sequence fits inside the buffer
 		int len = utf8_get_char_size(*p);
 		if (len <= 0 || len > end - p)
 			break;

 		unsigned char *next = utf8_get_char(p, NULL, NULL);
 		if (next == NULL)
 			break;
 		p = next;
 		n++;
 	}
 	return n;
 }

 // Reads a file name from stdin, loads that file (it must be smaller than
 // BUFFER_SIZE bytes), prints the number of UTF-8 characters it holds and
 // then every character together with its size in bytes.
 // Returns -1 when the name cannot be read or a file operation fails,
 // 0 otherwise (including the "file is too large" case).
 int main ()
 {
 	unsigned char buffer[BUFFER_SIZE], data[16] = { 0 };
 	unsigned char *p = buffer;
 	char filename[1024];
 	long flen = -1; // ftell() reports a long; -1 marks "size unknown"
 	int clen;

 	if (scanf("%1000s", filename) != 1) {
 		fprintf(stderr, "Read file name failed.");
 		return -1;
 	}
 	FILE* fp = fopen(filename, "rb");
 	if (!fp) {
 		fprintf(stderr, "Open file failed.");
 		return -1;
 	}

 	if (fseek(fp, 0, SEEK_END) == 0)
 		flen = ftell(fp);
 	if (flen < 0 || fseek(fp, 0, SEEK_SET) != 0) {
 		fprintf(stderr, "Get file size failed.");
 		fclose(fp);
 		return -1;
 	}

 	if (flen < BUFFER_SIZE) {
 		size_t nread = fread(buffer, 1, (size_t)flen, fp);
 		if (ferror(fp)) {
 			fprintf(stderr, "Read file failed.");
 			fclose(fp);
 			return -1;
 		}
 		// The decoder stops at a NUL byte, so terminate the data that was
 		// actually read; nread <= flen < BUFFER_SIZE keeps this in bounds
 		buffer[nread] = '\0';

 		int total_chars = utf8_char_count(buffer, (int)nread);
 		printf("total char counts = %d\n", total_chars);
 		while (1) {
 			p = utf8_get_char(p, data, &clen);
 			if (p) {
 				data[clen] = '\0';
 				printf("Len = %d, Char: %s\n", clen, data);
 			} else {
 				break;
 			}
 		}
 	} else {
 		// %ld because the size is kept as the long that ftell() returned
 		fprintf(stderr, "File is too large (%ld)", flen);
 	}

 	fclose(fp);
 	return 0;
 }
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	// Size in bytes of the buffer that holds the input file
	#define BUFFER_SIZE 4096

	// Reference: http://en.wikipedia.org/wiki/UTF-8
	#define UTF8_MASK_C 0x080 // 0b10000000

	#define UTF8_MASK_2 0x0C0 // 0b11000000
	#define UTF8_MASK_3 0x0E0 // 0b11100000
	#define UTF8_MASK_4 0x0F0 // 0b11110000
	#define UTF8_MASK_5 0x0F8 // 0b11111000
	#define UTF8_MASK_6 0x0FC // 0b11111100
	// Widen a lead-byte pattern by one bit so that the 0 bit which ends the
	// run of leading 1s is part of the comparison as well
	#define UTF8_MKMASK(X) ((X) | ((X) >> 1))
	// Test leading bytes
	#define UTF8_TEST_L(X, M) (((X) & UTF8_MKMASK(M)) == (M))
	// Test body bytes: a continuation byte is 10xxxxxx, so the top two bits
	// are compared (checking only the top bit would also accept lead bytes)
	#define UTF8_TEST_B(X) (((X) & UTF8_MASK_2) == UTF8_MASK_C)

	// Verify a run of body bytes.
	// Returns 1 when each of the l bytes starting at data satisfies
	// UTF8_TEST_B, and 0 as soon as one byte does not (l == 0 yields 1).
	int utf8_check_bytes(unsigned char * data, int l)
	{
		const unsigned char *cursor = data;

		for (; l-- != 0; cursor++) {
			if (UTF8_TEST_B(*cursor))
				continue;
			return 0;
		}
		return 1;
	}

	// Report the sequence length announced by a leading byte.
	// Returns 0 for the NUL byte, 2..6 when the byte matches the
	// corresponding UTF8_MASK_n lead pattern, 1 when the high bit is clear,
	// and -1 when none of these apply.
	int utf8_get_char_size(unsigned char data) {
		// Lead-byte patterns ordered from the 6-byte form down to the 2-byte form
		static const int lead_patterns[] = {
			UTF8_MASK_6, UTF8_MASK_5, UTF8_MASK_4, UTF8_MASK_3, UTF8_MASK_2
		};
		int size;

		if (data == 0)
			return 0;

		for (size = 6; size >= 2; size--) {
			if (UTF8_TEST_L(data, lead_patterns[6 - size]))
				return size;
		}

		// High bit clear means a single-byte character
		return (data & UTF8_MASK_C) == 0 ? 1 : -1;
	}

	// Extract the character that starts at data.
	// buffer (optional, may be NULL) receives the character's bytes without
	// a terminator; it needs room for at least 6 bytes.
	// out_len (optional, may be NULL) receives:
	//    > 0  size of the character in bytes
	//      0  data points at the string terminator
	//     -1  the leading byte is not recognised
	//     -2  a body byte failed utf8_check_bytes
	// Returns the position just past the character on success, or NULL when
	// the string is terminated or the sequence is rejected.
	unsigned char * utf8_get_char(unsigned char * data, unsigned char * buffer,
		int * out_len)
	{
		int size = utf8_get_char_size(*data);

		// Multi-byte sequence: every byte after the lead must be a body byte
		if (size > 1 && !utf8_check_bytes(data + 1, size - 1)) {
			size = -2;
		}

		if (out_len != NULL) {
			*out_len = size;
		}

		if (size <= 0) {
			return NULL;
		}

		if (buffer != NULL) {
			memcpy(buffer, data, (size_t)size);
		}
		return data + size;
	}

	// Count characters in UTF-8
	// Counts the characters held in the first buffer_size bytes of data.
	// Counting stops at the string terminator, at the first sequence that
	// utf8_get_char rejects, or at a character that would extend past the
	// end of the buffer. No byte at or beyond data + buffer_size is read.
	// Returns 0 when data is NULL or buffer_size is not positive.
	int utf8_char_count(unsigned char * data, int buffer_size) {
		// Both cursors are pointers into the buffer
		unsigned char *p = data, *end;
		int n = 0;

		if (data == NULL || buffer_size <= 0)
			return 0;
		end = data + buffer_size;

		while (p < end) {
			// Look at the lead byte first so the body bytes are only
			// validated when the whole sequence fits inside the buffer
			int len = utf8_get_char_size(*p);
			if (len <= 0 || len > end - p)
				break;

			unsigned char *next = utf8_get_char(p, NULL, NULL);
			if (next == NULL)
				break;
			p = next;
			n++;
		}
		return n;
	}

	// Reads a file name from stdin, loads that file (it must be smaller than
	// BUFFER_SIZE bytes), prints the number of UTF-8 characters it holds and
	// then every character together with its size in bytes.
	// Returns -1 when the name cannot be read or a file operation fails,
	// 0 otherwise (including the "file is too large" case).
	int main ()
	{
		unsigned char buffer[BUFFER_SIZE], data[16] = { 0 };
		unsigned char *p = buffer;
		char filename[1024];
		long flen = -1; // ftell() reports a long; -1 marks "size unknown"
		int clen;

		if (scanf("%1000s", filename) != 1) {
			fprintf(stderr, "Read file name failed.");
			return -1;
		}
		FILE* fp = fopen(filename, "rb");
		if (!fp) {
			fprintf(stderr, "Open file failed.");
			return -1;
		}

		if (fseek(fp, 0, SEEK_END) == 0)
			flen = ftell(fp);
		if (flen < 0 || fseek(fp, 0, SEEK_SET) != 0) {
			fprintf(stderr, "Get file size failed.");
			fclose(fp);
			return -1;
		}

		if (flen < BUFFER_SIZE) {
			size_t nread = fread(buffer, 1, (size_t)flen, fp);
			if (ferror(fp)) {
				fprintf(stderr, "Read file failed.");
				fclose(fp);
				return -1;
			}
			// The decoder stops at a NUL byte, so terminate the data that was
			// actually read; nread <= flen < BUFFER_SIZE keeps this in bounds
			buffer[nread] = '\0';

			int total_chars = utf8_char_count(buffer, (int)nread);
			printf("total char counts = %d\n", total_chars);
			while (1) {
				p = utf8_get_char(p, data, &clen);
				if (p) {
					data[clen] = '\0';
					printf("Len = %d, Char: %s\n", clen, data);
				} else {
					break;
				}
			}
		} else {
			// %ld because the size is kept as the long that ftell() returned
			fprintf(stderr, "File is too large (%ld)", flen);
		}

		fclose(fp);
		return 0;
	}
No results found