Skip to content

Instantly share code, notes, and snippets.

@opsJson
Last active July 29, 2022 22:14
Show Gist options
  • Save opsJson/b785cb90f766f3cdfe519202418b6fa2 to your computer and use it in GitHub Desktop.
Save opsJson/b785cb90f766f3cdfe519202418b6fa2 to your computer and use it in GitHub Desktop.
UTF-8 codec in C
/*
 * Encode one Unicode code point as a NUL-terminated UTF-8 byte string.
 *
 * codepoint: the value to encode.
 * encoded:   output buffer with room for at least 5 bytes (up to four
 *            encoded bytes followed by the terminating 0).
 *
 * Valid input is 0..0x10FFFF excluding the surrogate range
 * 0xD800..0xDFFF. For anything else (including negative values) the
 * function writes an empty string (encoded[0] == 0), so the caller never
 * sees stale buffer contents or an ill-formed byte sequence.
 *
 * Note that code point 0 also produces encoded[0] == 0.
 */
void utf8_encode(int codepoint, unsigned char encoded[5]) {
    unsigned int cp;

    if (codepoint < 0 || codepoint > 0x10FFFF ||
        (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
        encoded[0] = 0;
        return;
    }

    /* Non-negative from here on; use unsigned arithmetic for the shifts. */
    cp = (unsigned int)codepoint;

    if (cp < 0x80) {
        /* 0xxxxxxx */
        encoded[0] = (unsigned char)cp;
        encoded[1] = 0;
    } else if (cp < 0x800) {
        /* 110xxxxx 10xxxxxx */
        encoded[0] = (unsigned char)(0xC0 | (cp >> 6));
        encoded[1] = (unsigned char)(0x80 | (cp & 0x3F));
        encoded[2] = 0;
    } else if (cp < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        encoded[0] = (unsigned char)(0xE0 | (cp >> 12));
        encoded[1] = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
        encoded[2] = (unsigned char)(0x80 | (cp & 0x3F));
        encoded[3] = 0;
    } else {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        encoded[0] = (unsigned char)(0xF0 | (cp >> 18));
        encoded[1] = (unsigned char)(0x80 | ((cp >> 12) & 0x3F));
        encoded[2] = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
        encoded[3] = (unsigned char)(0x80 | (cp & 0x3F));
        encoded[4] = 0;
    }
}
/*
 * Decode the single UTF-8 sequence that starts at encoded[0].
 *
 * encoded: bytes of one UTF-8 sequence; at most four bytes are examined,
 *          and examination stops at the first byte that is not a valid
 *          continuation byte (which includes a terminating 0).
 *
 * Returns the decoded code point, or 0 when the input is not a well-formed
 * sequence: a stray continuation byte or an unsupported lead byte, a
 * missing/invalid continuation byte, an overlong form, a surrogate
 * (0xD800..0xDFFF), or a value above 0x10FFFF. A leading 0 byte also
 * decodes to 0, so callers that must tell the two apart should check
 * encoded[0] themselves.
 */
unsigned int utf8_decode(unsigned char encoded[5]) {
    unsigned int header = (unsigned int)encoded[0];
    unsigned int decoded;
    unsigned int smallest; /* lowest code point that needs this many bytes */
    int trailing;          /* number of continuation bytes expected */
    int i;

    if (header <= 0x7F) {
        return header;
    }
    if (header < 0xC0) {
        /* 10xxxxxx is a continuation byte and cannot start a sequence. */
        return 0;
    }

    if (header <= 0xDF) {
        decoded = header & 0x1F;
        trailing = 1;
        smallest = 0x80;
    } else if (header <= 0xEF) {
        decoded = header & 0x0F;
        trailing = 2;
        smallest = 0x800;
    } else if (header <= 0xF7) {
        decoded = header & 0x07;
        trailing = 3;
        smallest = 0x10000;
    } else {
        return 0;
    }

    for (i = 1; i <= trailing; i++) {
        /* Checking before moving on keeps us from reading past a NUL. */
        if ((encoded[i] & 0xC0) != 0x80) {
            return 0;
        }
        decoded = (decoded << 6) | (unsigned int)(encoded[i] & 0x3F);
    }

    if (decoded < smallest) {
        return 0; /* overlong encoding */
    }
    if (decoded > 0x10FFFF || (decoded >= 0xD800 && decoded <= 0xDFFF)) {
        return 0; /* outside the set of Unicode scalar values */
    }
    return decoded;
}
/*///////////////////////////////////
Testing:
///////////////////////////////////*/
#include <stdio.h>
/*
 * Round-trip smoke test.
 *
 * Encodes the 1000 consecutive code points starting at 0x1F300, appends
 * each encoding to "unicode_test.txt" (with a newline whenever i + 1 is a
 * multiple of 100), and prints the decoded value beside the original.
 *
 * Returns 0 on success. Returns 1 if the file cannot be opened, if writing
 * or closing it fails, or if any decoded value differs from its input.
 */
int main(void) {
    int i;
    int failed = 0;
    /* unsigned char matches the parameter type of the codec functions */
    unsigned char encoded[5] = {0};
    FILE *fp;

    if ((fp = fopen("unicode_test.txt", "w")) == NULL) {
        return 1;
    }

    for (i = 0x1F300; i < 0x1F300 + 1000; i++) {
        unsigned int decoded;

        utf8_encode(i, encoded);
        fprintf(fp, "%s", (const char *)encoded);

        decoded = utf8_decode(encoded);
        printf("0x%x == 0x%x\n", decoded, (unsigned int)i);
        if (decoded != (unsigned int)i) {
            failed = 1;
        }

        if ((i + 1) % 100 == 0) {
            fprintf(fp, "\n");
        }
    }

    /* A failed write may only surface on the stream or at close time. */
    if (ferror(fp)) {
        failed = 1;
    }
    if (fclose(fp) != 0) {
        failed = 1;
    }
    return failed;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment