Created
August 13, 2013 04:07
-
-
Save mtornwall/6217806 to your computer and use it in GitHub Desktop.
naïve utf8 encode/decode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* A single Unicode code point, stored in 32 bits. */
typedef uint32_t rune;
/**
 * Encode an array of Unicode code points as a NUL-terminated UTF-8 string.
 *
 * @param runes  Code points to encode; exactly `length` elements are read.
 * @param buf    Output buffer. No bounds checking is performed: the caller
 *               must supply room for the encoded bytes plus the terminating
 *               NUL (4 * length + 1 bytes always suffices).
 * @param length Number of code points in `runes`.
 * @return Pointer to the terminating NUL written into `buf`, or NULL if a
 *         code point cannot be encoded per RFC 3629 (greater than U+10FFFF,
 *         or a surrogate in U+D800..U+DFFF). On failure the bytes of the
 *         preceding code points have already been written and `buf` is not
 *         NUL-terminated.
 */
uint8_t *
encode(const rune *runes, uint8_t *buf, size_t length)
{
    for (size_t i = 0; i < length; i++) {
        rune r = runes[i];
        unsigned n;

        if (r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF)) {
            /* Out of range or a surrogate: both are forbidden by RFC 3629. */
            return NULL;
        } else if (r >= 0x10000) {
            n = 4;
        } else if (r >= 0x800) {
            n = 3;
        } else if (r >= 0x80) {
            n = 2;
        } else {
            /* ASCII is encoded as itself. */
            *buf++ = (uint8_t) r;
            continue;
        }

        /* The high-order bits must come first, so fill the continuation
         * bytes from the last one backwards, peeling six bits at a time.
         */
        for (unsigned k = n; k > 1; k--) {
            buf[k - 1] = (uint8_t) (0x80 | (r & 0x3F));
            r >>= 6;
        }

        /* Lead byte: n one-bits, a zero bit, then the remaining payload. */
        buf[0] = (uint8_t) ((0xFF << (8 - n)) | (r & (0x7Fu >> n)));
        buf += n;
    }

    *buf = 0;
    return buf;
}
/**
 * Decode a NUL-terminated UTF-8 string into Unicode code points.
 *
 * @param u      NUL-terminated UTF-8 input.
 * @param buf    Output array. No bounds checking is performed: the caller
 *               must supply room for one rune per input byte.
 * @param length Out-parameter receiving the number of code points stored in
 *               `buf`. On failure it holds the count decoded before the
 *               offending sequence.
 * @return `buf` on success, or NULL if the input is not well-formed UTF-8:
 *         a stray continuation byte, a lead byte of 0xF8 or above, a
 *         truncated sequence, an overlong encoding, a surrogate
 *         (U+D800..U+DFFF), or a value above U+10FFFF.
 */
rune *
decode(const uint8_t *u, rune *buf, size_t *length)
{
    /* Smallest code point that legitimately needs an n-byte sequence;
     * anything smaller encoded in n bytes is an overlong form.
     */
    static const rune min_for_len[5] = { 0, 0, 0x80, 0x800, 0x10000 };

    *length = 0;

    while (*u) {
        rune r;

        if (*u < 0x80) {
            /* ASCII decodes to itself. */
            r = *u++;
        } else {
            uint8_t lead = *u++;
            unsigned n;

            /* The number of leading one-bits gives the sequence length. */
            if ((lead & 0xE0) == 0xC0) {
                n = 2;
            } else if ((lead & 0xF0) == 0xE0) {
                n = 3;
            } else if ((lead & 0xF8) == 0xF0) {
                n = 4;
            } else {
                /* Stray continuation byte (10xxxxxx) or 0xF8..0xFF. */
                return NULL;
            }

            /* The first few payload bits come from the lead byte. */
            r = lead & (0x7Fu >> n);

            /* Consume the continuation bytes. A NUL terminator fails the
             * 10xxxxxx check, so a truncated sequence never reads past the
             * end of the string.
             */
            for (unsigned k = 1; k < n; k++) {
                if ((*u & 0xC0) != 0x80) {
                    return NULL;
                }
                r = (r << 6) | (rune) (*u++ & 0x3F);
            }

            if (r < min_for_len[n] || r > 0x10FFFF ||
                (r >= 0xD800 && r <= 0xDFFF)) {
                return NULL;
            }
        }

        buf[(*length)++] = r;
    }

    return buf;
}
int main() | |
{ | |
char utf8[1024]; | |
rune runes[1024]; | |
size_t length; | |
fgets(utf8,sizeof(utf8),stdin); | |
decode(utf8, runes, &length); | |
for (size_t i = 0; i < length; i++) { | |
printf("%x ", runes[i]); | |
} | |
memset(utf8,0,sizeof(utf8)); | |
encode(runes,utf8,length); | |
printf("\n%s\n",utf8); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment