Skip to content

Instantly share code, notes, and snippets.

@chao-he
Last active August 29, 2015 13:57
Show Gist options
  • Save chao-he/9668215 to your computer and use it in GitHub Desktop.
Save chao-he/9668215 to your computer and use it in GitHub Desktop.
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <ctype.h>
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;

/*
 * Count the leading 1 bits of a UTF-8 byte.
 *
 * Returns 0 for an ASCII byte (0xxxxxxx), 1 for a continuation byte
 * (10xxxxxx) and 2..4 for the lead byte of a 2..4 byte sequence.  Bytes
 * that can never start a valid sequence (0xF8..0xFF) yield 5..8.
 */
static inline int get_utf8_bytes(char c)
{
    /*
     * Work on an unsigned copy: plain char may be signed, and left-shifting
     * a negative value is undefined behaviour.
     */
    unsigned int byte = (unsigned char)c;
    int bits = 0;

    while (byte & 0x80u) {
        byte = (byte << 1) & 0xFFu;
        ++bits;
    }
    return bits;
}

/*
 * Decode one UTF-8 sequence from *s and advance *s past it.
 *
 * Returns the decoded code point, or 0 when s or *s is NULL or *s points at
 * the terminating NUL (in which case *s is left untouched).
 *
 * Only code points up to U+FFFF fit in the u16 result.  A 4-byte sequence,
 * a stray continuation byte, an invalid lead byte or a truncated sequence
 * yields U+FFFD (REPLACEMENT CHARACTER) instead of a code point.  Malformed
 * input always consumes at least one byte, and the NUL terminator is never
 * stepped over.
 *
 * Overlong encodings and UTF-16 surrogate values are not rejected.
 */
u16 get_utf8_char(const char **s)
{
    enum {
        UTF8_CONT_MASK = 0xC0,      /* top two bits of a byte           */
        UTF8_CONT_TAG = 0x80,       /* ... are 10 for continuation bytes */
        UTF8_REPLACEMENT = 0xFFFD
    };

    if (!s || !*s || !**s) {
        return 0;
    }

    const unsigned char *p = (const unsigned char *)*s;
    const int bytes = get_utf8_bytes(**s);
    unsigned int code = UTF8_REPLACEMENT;
    int used = 1;

    switch (bytes) {
    case 0:
        /* Plain ASCII: the byte is the code point. */
        code = p[0];
        break;
    case 1:
        /* Continuation byte with no lead byte: malformed, skip it. */
        break;
    case 2:
        if ((p[1] & UTF8_CONT_MASK) == UTF8_CONT_TAG) {
            code = ((p[0] & 0x1Fu) << 6) | (p[1] & 0x3Fu);
            used = 2;
        }
        break;
    case 3:
        /* && short-circuits, so p[2] is only read when p[1] is not NUL. */
        if ((p[1] & UTF8_CONT_MASK) == UTF8_CONT_TAG
            && (p[2] & UTF8_CONT_MASK) == UTF8_CONT_TAG) {
            code = ((p[0] & 0x0Fu) << 12)
                 | ((p[1] & 0x3Fu) << 6)
                 | (p[2] & 0x3Fu);
            used = 3;
        }
        break;
    default:
        /*
         * 4-byte sequences (beyond U+FFFF) and invalid lead bytes: skip the
         * lead byte and the continuation bytes that belong to it, stopping
         * at the first byte that is not a continuation (including NUL).
         */
        while (used < bytes
               && (p[used] & UTF8_CONT_MASK) == UTF8_CONT_TAG) {
            ++used;
        }
        break;
    }

    *s += used;
    return (u16)code;
}
/*
 * Classify a decoded code point as Chinese, Japanese or Korean script.
 *
 * Ranges reported as CTYPE_CJK_CH:
 *   2E80-2EFF  CJK Radicals Supplement
 *   2F00-2FDF  Kangxi Radicals
 *   31C0-31EF  CJK Strokes
 *   3300-33FF  CJK Compatibility
 *   4DC0-4DFF  Yijing Hexagram Symbols
 *   4E00-9FBF  CJK Unified Ideographs
 *   F900-FAFF  CJK Compatibility Ideographs
 *   FE30-FE4F  CJK Compatibility Forms
 *
 * Ranges reported as CTYPE_CJK_JP:
 *   3040-309F  Hiragana
 *   30A0-30FF  Katakana
 *   31F0-31FF  Katakana Phonetic Extensions
 *
 * Ranges reported as CTYPE_CJK_KR:
 *   AC00-D7AF  Hangul Syllables
 *   1100-11FF  Hangul Jamo
 *   3130-318F  Hangul Compatibility Jamo
 *
 * Not matched (reported as CTYPE_UNKNOWN like every other code point):
 *   3000-303F  CJK Symbols and Punctuation
 *   3200-32FF  Enclosed CJK Letters and Months
 *   3400-4DBF  CJK Unified Ideographs Extension A
 *   FF00-FFEF  Fullwidth ASCII and punctuation (treated as basic Latin)
 */
#define CTYPE_UNKNOWN 0
#define CTYPE_CJK_CH 1
#define CTYPE_CJK_JP 2
#define CTYPE_CJK_KR 3
int utf8_char_type(u16 u)
{
    /* Inclusive code point ranges; none of them overlap. */
    static const struct {
        u16 first;
        u16 last;
        int type;
    } ranges[] = {
        { 0x1100, 0x11FF, CTYPE_CJK_KR },
        { 0x2E80, 0x2EFF, CTYPE_CJK_CH },
        { 0x2F00, 0x2FDF, CTYPE_CJK_CH },
        { 0x3040, 0x309F, CTYPE_CJK_JP },
        { 0x30A0, 0x30FF, CTYPE_CJK_JP },
        { 0x3130, 0x318F, CTYPE_CJK_KR },
        { 0x31C0, 0x31EF, CTYPE_CJK_CH },
        { 0x31F0, 0x31FF, CTYPE_CJK_JP },
        { 0x3300, 0x33FF, CTYPE_CJK_CH },
        { 0x4DC0, 0x4DFF, CTYPE_CJK_CH },
        { 0x4E00, 0x9FBF, CTYPE_CJK_CH },
        { 0xAC00, 0xD7AF, CTYPE_CJK_KR },
        { 0xF900, 0xFAFF, CTYPE_CJK_CH },
        { 0xFE30, 0xFE4F, CTYPE_CJK_CH },
    };

    for (size_t i = 0; i < sizeof ranges / sizeof ranges[0]; ++i) {
        if (u >= ranges[i].first && u <= ranges[i].last) {
            return ranges[i].type;
        }
    }
    return CTYPE_UNKNOWN;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment