Created
October 19, 2013 14:31
-
-
Save Liutos/7056664 to your computer and use it in GitHub Desktop.
在UTF-8和Code Point之间转换的代码及配套工具
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
/* Most significant bit of an 8-bit byte. */
#define MASK 0x80

/**
 * Count the consecutive 1 bits at the top of a byte, starting from the most
 * significant bit.
 *
 * For the first byte of a UTF-8 sequence the result is 0 for a single-byte
 * character, 1 for a continuation byte, and the sequence length for a
 * multi-byte lead byte.
 *
 * @param byte  the byte to inspect
 * @return the number of leading 1 bits, in the range 0..8
 */
int count1(char byte) {
    /*
     * Work on an unsigned copy: plain char may be signed, and left-shifting
     * a negative value is undefined behaviour.
     */
    unsigned char bits = (unsigned char)byte;
    int count = 0;

    while ((bits & MASK) == MASK) {
        count++;
        /* Keep only the low 8 bits so the loop always terminates. */
        bits = (unsigned char)((bits << 1) & 0xFF);
    }
    return count;
}
/**
 * Extract the number formed by the low n bits of a byte.
 *
 * The byte is treated as an unsigned 8-bit value, so the result never
 * contains sign-extension bits.
 *
 * @param byte  the source byte
 * @param n     how many low-order bits to keep; n <= 0 yields 0 and
 *              n >= 8 yields the whole byte
 * @return the selected bits as a value in the range 0..255
 */
int get_low_bits(char byte, int n) {
    if (n <= 0) {
        return 0;
    }
    if (n > 8) {
        n = 8; /* a byte has no more than 8 bits to offer */
    }

    unsigned int mask = (1u << n) - 1u;
    return (int)((unsigned char)byte & mask);
}
/**
 * Print a byte to stdout in binary, followed by a newline.
 *
 * Leading zeros are not printed (there is no width alignment); a zero byte
 * prints as a single "0".
 *
 * @param byte  the byte to print; it is interpreted as an unsigned 8-bit value
 */
void print_binary(char byte) {
    /*
     * Use an unsigned value: right-shifting a negative signed char never
     * reaches zero, which would make a shift-until-zero loop run forever.
     */
    unsigned int bits = (unsigned char)byte & 0xFFu;

    if (bits == 0) {
        puts("0");
        return;
    }

    /* Find the highest set bit so that leading zeros are skipped. */
    unsigned int probe = 0x80u;
    while ((bits & probe) == 0) {
        probe >>= 1;
    }

    for (; probe != 0; probe >>= 1) {
        putchar((bits & probe) ? '1' : '0');
    }
    putchar('\n');
}
/**
 * Decode the code point of the UTF-8 encoded character that starts at str.
 *
 * The bit layout follows the table in the Wikipedia UTF-8 article: the lead
 * byte announces the sequence length (1 to 4 bytes) and carries the top
 * payload bits; every following byte has the form 10xxxxxx and contributes
 * six more bits.
 *
 * @param str  pointer to the first byte of a UTF-8 encoded character
 * @return the decoded code point, or -1 when the first byte is not a valid
 *         lead byte for a 1..4 byte sequence or when a following byte is not
 *         a continuation byte (which includes a string that ends early)
 */
int get_code_point(char *str) {
    /* Read bytes as unsigned so the masks below see values 0..255. */
    const unsigned char *bytes = (const unsigned char *)str;
    unsigned int lead = bytes[0];
    int length;
    int code_point;

    if (lead < 0x80) {
        return (int)lead; /* single-byte character */
    }

    if ((lead & 0xE0) == 0xC0) {        /* 110xxxxx */
        length = 2;
        code_point = (int)(lead & 0x1F);
    } else if ((lead & 0xF0) == 0xE0) { /* 1110xxxx */
        length = 3;
        code_point = (int)(lead & 0x0F);
    } else if ((lead & 0xF8) == 0xF0) { /* 11110xxx */
        length = 4;
        code_point = (int)(lead & 0x07);
    } else {
        return -1; /* continuation byte or unsupported lead byte */
    }

    for (int i = 1; i < length; i++) {
        /*
         * Check before using the byte: a NUL terminator is not a
         * continuation byte, so a truncated string stops here instead of
         * being read past its end.
         */
        if ((bytes[i] & 0xC0) != 0x80) {
            return -1;
        }
        code_point = (code_point << 6) | (bytes[i] & 0x3F);
    }
    return code_point;
}
/**
 * Encode a code point, given as an unsigned integer, as a UTF-8 byte
 * sequence, following the table in the Wikipedia UTF-8 article.
 *
 * The result is a freshly allocated, NUL-terminated string of 1 to 4 encoded
 * bytes; the caller owns it and must release it with free().
 *
 * Values below 0x200000 are accepted. No check is made for surrogates or for
 * values above U+10FFFF.
 *
 * @param cp  the code point to encode
 * @return the encoded string, or NULL if memory allocation fails. For
 *         cp >= 0x200000 the function prints "Error happens..." and
 *         terminates the program with exit status 1.
 */
char *code_point_to_utf8(unsigned int cp) {
    /* Marker bits of the lead byte, indexed by sequence length. */
    static const unsigned char lead_marker[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
    size_t length;

    if (cp < 0x80) {
        length = 1;
    } else if (cp < 0x0800) {
        length = 2;
    } else if (cp < 0x10000) {
        length = 3;
    } else if (cp < 0x200000) {
        length = 4;
    } else {
        printf("Error happens...");
        exit(1);
    }

    /* One extra zeroed byte so the result is a proper C string. */
    char *str = calloc(length + 1, sizeof *str);
    if (str == NULL) {
        return NULL;
    }

    /* Fill the continuation bytes from the end, six payload bits each. */
    for (size_t i = length - 1; i > 0; i--) {
        str[i] = (char)(0x80 | (cp & 0x3F));
        cp >>= 6;
    }

    /*
     * The range checks above guarantee that the bits left in cp fit into the
     * payload part of the lead byte for this length.
     */
    str[0] = (char)(lead_marker[length] | cp);
    return str;
}
/*
 * Small demonstration of the helpers in this file.
 * The text after each "=>" marker is the expected output of that statement.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    char *str = "汉";
    printf("count1('汉') == %d\n", count1(*str)); // => 3
    print_binary(get_low_bits(8, 8));  // => 1000
    print_binary(get_low_bits(13, 3)); // => 101
    printf("code point of %s is %x\n", str, (unsigned int)get_code_point(str)); // => 6c49
    printf("code point of %s is %d\n", str, get_code_point(str)); // => 27721

    /* Keep the heap result in its own variable so it can be released. */
    char *encoded = code_point_to_utf8(27721);
    if (encoded == NULL) {
        return 1;
    }

    /* The lead byte tells how many bytes were produced (0 means one byte). */
    int len = count1(encoded[0]);
    len = (len == 0) ? 1 : len;
    for (int i = 0; i < len; i++) {
        printf("%c", encoded[i]);
    } // => 汉
    putchar('\n');

    free(encoded);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment