Last active
August 29, 2015 13:57
-
-
Save chao-he/9668215 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <string.h> | |
#include <stdint.h> | |
#include <ctype.h> | |
/* Short aliases for the fixed-width unsigned integer types. */
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;

/*
 * Count the leading 1 bits of the byte c.
 *
 * For a UTF-8 lead byte this is the length of the sequence in bytes.
 * Other results: 0 for an ASCII byte (0xxxxxxx), 1 for a continuation
 * byte (10xxxxxx), 5..8 for bytes that never appear in UTF-8.
 */
static inline int get_utf8_bytes(char c)
{
    /*
     * Shift an unsigned copy: plain char may be signed, and left-shifting
     * a negative promoted value is undefined behaviour.
     */
    unsigned int byte = (unsigned char)c;
    int bits = 0;

    while (byte & 0x80u) {
        byte <<= 1;
        ++bits;
    }
    return bits;
}

/*
 * Decode one UTF-8 sequence starting at *s and advance *s past the bytes
 * that were consumed.
 *
 * Returns the decoded code point. Returns 0 without touching *s when s or
 * *s is NULL, or when *s already points at the terminating NUL, so a
 * return value of 0 always means "nothing left to read".
 *
 * Input that cannot be returned as a valid 16-bit code point yields
 * U+FFFD (REPLACEMENT CHARACTER):
 *   - a stray continuation byte or an impossible lead byte (1 byte is
 *     consumed);
 *   - a sequence cut short by a non-continuation byte or by the NUL
 *     terminator (only the bytes before the offending one are consumed,
 *     so *s never moves past the terminator);
 *   - an overlong encoding or an encoded surrogate (whole sequence is
 *     consumed);
 *   - a well-formed 4-byte sequence, i.e. a code point above U+FFFF that
 *     does not fit in u16 (whole sequence is consumed).
 */
u16 get_utf8_char(const char **s)
{
    const u16 replacement = 0xFFFD;

    if (!s || !*s || !**s)
        return 0;

    const u8 *p = (const u8 *)*s;
    int bytes = get_utf8_bytes(**s);

    /* No leading 1 bit: plain ASCII, the byte is the code point. */
    if (bytes == 0) {
        *s += 1;
        return p[0];
    }

    /* Continuation byte in lead position, or a lead byte UTF-8 forbids. */
    if (bytes == 1 || bytes > 4) {
        *s += 1;
        return replacement;
    }

    /* The lead byte carries 5, 4 or 3 payload bits for 2, 3 or 4 bytes. */
    u32 code = p[0] & (0xFFu >> (bytes + 1));

    for (int i = 1; i < bytes; ++i) {
        /*
         * Checked one byte at a time so that a NUL terminator stops the
         * scan before anything behind it is read.
         */
        if ((p[i] & 0xC0u) != 0x80u) {
            *s += i;
            return replacement;
        }
        code = (code << 6) | (p[i] & 0x3Fu);
    }
    *s += bytes;

    if (bytes == 4                                /* beyond U+FFFF        */
        || (bytes == 2 && code < 0x80u)           /* overlong 2-byte form */
        || (bytes == 3 && code < 0x800u)          /* overlong 3-byte form */
        || (code >= 0xD800u && code <= 0xDFFFu))  /* UTF-16 surrogate     */
        return replacement;

    return (u16)code;
}
/*
 * Classify a 16-bit code point as Chinese, Japanese, Korean or unknown.
 *
 * Ranges reported as CTYPE_CJK_CH:
 *   2E80-2EFF  CJK Radicals Supplement
 *   2F00-2FDF  Kangxi Radicals
 *   31C0-31EF  CJK Strokes
 *   3300-33FF  CJK Compatibility
 *   4DC0-4DFF  Yijing Hexagram Symbols
 *   4E00-9FBF  CJK Unified Ideographs
 *   F900-FAFF  CJK Compatibility Ideographs
 *   FE30-FE4F  CJK Compatibility Forms
 *
 * Ranges reported as CTYPE_CJK_JP:
 *   3040-309F  Hiragana
 *   30A0-30FF  Katakana
 *   31F0-31FF  Katakana Phonetic Extensions
 *
 * Ranges reported as CTYPE_CJK_KR:
 *   1100-11FF  Hangul Jamo
 *   3130-318F  Hangul Compatibility Jamo
 *   AC00-D7AF  Hangul Syllables
 *
 * Ranges deliberately left out (reported as CTYPE_UNKNOWN):
 *   3000-303F  CJK Symbols and Punctuation
 *   3200-32FF  Enclosed CJK Letters and Months
 *   FF00-FFEF  Fullwidth ASCII and punctuation (treated as basic Latin)
 *
 * NOTE(review): 3400-4DBF (CJK Unified Ideographs Extension A) is also
 * reported as CTYPE_UNKNOWN. It was listed without an "ignore" mark but its
 * test was disabled in the code -- confirm whether it should be matched.
 * NOTE(review): the CJK Unified Ideographs block in current Unicode runs to
 * 9FFF; the upper bound 9FBF is kept as it was.
 */
#define CTYPE_UNKNOWN 0
#define CTYPE_CJK_CH 1
#define CTYPE_CJK_JP 2
#define CTYPE_CJK_KR 3

/*
 * Return the CTYPE_* class of code point u.
 *
 * u is looked up in a table of inclusive [first, last] ranges; the ranges
 * do not overlap, so at most one entry can match. Anything outside the
 * table is CTYPE_UNKNOWN.
 */
int utf8_char_type(u16 u)
{
    static const struct {
        u16 first;
        u16 last;
        int type;
    } ranges[] = {
        { 0x1100, 0x11FF, CTYPE_CJK_KR },
        { 0x2E80, 0x2EFF, CTYPE_CJK_CH },
        { 0x2F00, 0x2FDF, CTYPE_CJK_CH },
        { 0x3040, 0x309F, CTYPE_CJK_JP },
        { 0x30A0, 0x30FF, CTYPE_CJK_JP },
        { 0x3130, 0x318F, CTYPE_CJK_KR },
        { 0x31C0, 0x31EF, CTYPE_CJK_CH },
        { 0x31F0, 0x31FF, CTYPE_CJK_JP },
        { 0x3300, 0x33FF, CTYPE_CJK_CH },
        { 0x4DC0, 0x4DFF, CTYPE_CJK_CH },
        { 0x4E00, 0x9FBF, CTYPE_CJK_CH },
        { 0xAC00, 0xD7AF, CTYPE_CJK_KR },
        { 0xF900, 0xFAFF, CTYPE_CJK_CH },
        { 0xFE30, 0xFE4F, CTYPE_CJK_CH },
    };

    for (size_t i = 0; i < sizeof ranges / sizeof ranges[0]; ++i) {
        if (u >= ranges[i].first && u <= ranges[i].last)
            return ranges[i].type;
    }
    return CTYPE_UNKNOWN;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment