Skip to content

Instantly share code, notes, and snippets.

@shwangdev
Created October 14, 2011 07:32
Show Gist options
  • Save shwangdev/1286475 to your computer and use it in GitHub Desktop.
Save shwangdev/1286475 to your computer and use it in GitHub Desktop.
#include "utf8.h"
#include <limits.h>
/* Forward declarations of the per-encoding converters. They are needed
 * because the xml_encodings table refers to these functions before their
 * definitions appear in this file.
 */
inline static unsigned short xml_encode_iso_8859_1(unsigned char);
inline static char xml_decode_iso_8859_1(unsigned short);
inline static unsigned short xml_encode_us_ascii(unsigned char);
inline static char xml_decode_us_ascii(unsigned short);
/*
 * Allocate `size` bytes or terminate the program.
 *
 * Never returns NULL: when the allocation fails, "Out of memory!" is written
 * to stderr and the process exits with status 1. A request for 0 bytes is
 * rounded up to 1 byte, because malloc(0) is allowed to return NULL and that
 * must not be mistaken for memory exhaustion.
 *
 * The caller releases the result with free().
 */
static void *emalloc(size_t size)
{
    void *p = malloc(size != 0 ? size : 1);

    if (p == NULL) {
        fprintf(stderr, "Out of memory!\n");
        exit(1);
    }
    return p;
}
/*
 * Resize the allocation at `ptr` to `size` bytes or terminate the program.
 *
 * `ptr` may be NULL, in which case this behaves like emalloc(size). Never
 * returns NULL: when the reallocation fails, "Out of memory!" is written to
 * stderr and the process exits with status 1. A request for 0 bytes is
 * rounded up to 1 byte, because realloc(ptr, 0) may free `ptr` and return
 * NULL, which must not be mistaken for memory exhaustion.
 *
 * The caller releases the result with free().
 */
static void *erealloc(void *ptr, size_t size)
{
    void *p = realloc(ptr, size != 0 ? size : 1);

    if (p == NULL) {
        fprintf(stderr, "Out of memory!\n");
        exit(1);
    }
    return p;
}
/* Table of the encodings this module knows about. xml_get_encoding()
 * searches it with a case-insensitive name comparison; the table ends with
 * an entry whose name is NULL.
 *
 * Each entry is { name, decoding function, encoding function }.
 * ISO-8859-1 and US-ASCII supply both converters. UTF-8 supplies neither,
 * which makes xml_utf8_encode() and xml_utf8_decode() copy the data through
 * unchanged.
 */
xml_encoding xml_encodings[] = {
{ "ISO-8859-1", xml_decode_iso_8859_1, xml_encode_iso_8859_1 },
{ "US-ASCII", xml_decode_us_ascii, xml_encode_us_ascii },
{ "UTF-8", NULL, NULL },
{ NULL, NULL, NULL }
};
/* Encoder for ISO-8859-1: returns the byte value unchanged, widened to
 * unsigned short.
 */
static inline unsigned short xml_encode_iso_8859_1(unsigned char c)
{
    unsigned short widened = c;

    return widened;
}
/* Decoder for ISO-8859-1: values up to 0xff are returned as a single char;
 * anything larger is replaced by '?'.
 */
static inline char xml_decode_iso_8859_1(unsigned short c)
{
    if (c > 0xff) {
        return '?';
    }
    return (char)c;
}
/* Encoder for US-ASCII: returns the byte value unchanged, widened to
 * unsigned short. No range check is applied to the input byte.
 */
static inline unsigned short xml_encode_us_ascii(unsigned char c)
{
    unsigned short widened = c;

    return widened;
}
/* Decoder for US-ASCII: values up to 0x7f are returned as a single char;
 * anything larger is replaced by '?'.
 */
static inline char xml_decode_us_ascii(unsigned short c)
{
    if (c > 0x7f) {
        return '?';
    }
    return (char)c;
}
/*
 * Look up an entry of the xml_encodings table by name.
 *
 * The comparison ignores case (strcasecmp). Returns a pointer to the
 * matching table entry, or NULL when `name` is NULL or matches no entry.
 */
static xml_encoding *xml_get_encoding(const XML_Char *name)
{
    xml_encoding *enc;

    /* strcasecmp() must not be handed a null pointer. */
    if (name == NULL) {
        return NULL;
    }

    /* The table ends with an entry whose name is NULL. */
    for (enc = xml_encodings; enc->name != NULL; enc++) {
        if (strcasecmp(name, enc->name) == 0) {
            return enc;
        }
    }
    return NULL;
}
/*
 * Convert `len` bytes at `s` from the single-byte encoding named by
 * `encoding` into UTF-8.
 *
 * s        - input bytes; need not be NUL-terminated.
 * len      - number of input bytes; must not be negative.
 * newlen   - receives the length of the result, excluding the terminating
 *            NUL; set to 0 on failure.
 * encoding - name of the source encoding, looked up with xml_get_encoding().
 *
 * Returns a NUL-terminated buffer obtained from emalloc()/erealloc(), which
 * the caller releases with free(). Returns NULL when `s` or `encoding` is
 * NULL, the encoding is unknown, or `len` is negative or so large that the
 * result length would not fit in an int. When the encoding has no encoder
 * function, the input is copied unchanged.
 */
static char *xml_utf8_encode(const char *s, int len, int *newlen,
                             const XML_Char *encoding)
{
    unsigned short (*encoder)(unsigned char);
    xml_encoding *enc;
    char *newbuf;
    int out = 0;
    int i;

    *newlen = 0;
    if (s == NULL || len < 0 || encoding == NULL) {
        return NULL;
    }

    enc = xml_get_encoding(encoding);
    if (enc == NULL) {
        /* If the source encoding was unknown, fail. */
        return NULL;
    }
    encoder = enc->encoding_function;

    if (encoder == NULL) {
        /* If no encoder function was specified, return the data as-is. */
        newbuf = emalloc((size_t)len + 1);
        memcpy(newbuf, s, (size_t)len);
        newbuf[len] = '\0';
        *newlen = len;
        return newbuf;
    }

    /* The encoder returns an unsigned short, so every input byte becomes a
     * code point of at most 0xffff, i.e. at most three UTF-8 bytes. Reject
     * lengths whose worst-case output (plus the NUL) would not fit in int.
     */
    if (len > (INT_MAX - 1) / 3) {
        return NULL;
    }
    newbuf = emalloc((size_t)len * 3 + 1);

    for (i = 0; i < len; i++) {
        unsigned int c = encoder((unsigned char)s[i]);

        if (c < 0x80) {
            /* 0xxxxxxx */
            newbuf[out++] = (char)c;
        } else if (c < 0x800) {
            /* 110xxxxx 10xxxxxx */
            newbuf[out++] = (char)(0xc0 | (c >> 6));
            newbuf[out++] = (char)(0x80 | (c & 0x3f));
        } else {
            /* 1110xxxx 10xxxxxx 10xxxxxx: every continuation byte carries
             * the 10 prefix, not the lead-byte prefix. */
            newbuf[out++] = (char)(0xe0 | (c >> 12));
            newbuf[out++] = (char)(0x80 | ((c >> 6) & 0x3f));
            newbuf[out++] = (char)(0x80 | (c & 0x3f));
        }
    }
    newbuf[out] = '\0';
    *newlen = out;

    /* Give back the unused tail of the worst-case allocation. */
    return erealloc(newbuf, (size_t)out + 1);
}
/*
 * Convert `len` bytes of UTF-8 at `s` into the single-byte encoding named by
 * `encoding`.
 *
 * s        - input bytes; need not be NUL-terminated.
 * len      - number of input bytes; must not be negative.
 * newlen   - receives the length of the result, excluding the terminating
 *            NUL; set to 0 on failure.
 * encoding - name of the target encoding, looked up with xml_get_encoding().
 *
 * Returns a NUL-terminated buffer obtained from emalloc()/erealloc(), which
 * the caller releases with free(). Returns NULL when `s` is NULL or `len` is
 * negative. When the encoding is NULL, unknown, or has no decoder function,
 * the input is copied unchanged.
 *
 * Every UTF-8 sequence produces exactly one output byte. A sequence cut
 * short by the end of the input, and any code point above 0xffff (which does
 * not fit the decoder's unsigned short argument), becomes '?'. Sequences are
 * classified by their lead byte only; continuation bytes are not validated.
 */
static char *xml_utf8_decode(const XML_Char *s, int len, int *newlen,
                             const XML_Char *encoding)
{
    char (*decoder)(unsigned short) = NULL;
    xml_encoding *enc;
    char *newbuf;
    int pos;
    int out = 0;

    *newlen = 0;
    if (s == NULL || len < 0) {
        return NULL;
    }

    enc = (encoding != NULL) ? xml_get_encoding(encoding) : NULL;
    if (enc != NULL) {
        decoder = enc->decoding_function;
    }

    /* One output byte per input sequence, so the result is never longer
     * than the input. */
    newbuf = emalloc((size_t)len + 1);

    if (decoder == NULL) {
        /* If the target encoding was unknown, or no decoder function
         * was specified, return the UTF-8-encoded data as-is.
         */
        memcpy(newbuf, s, (size_t)len);
        newbuf[len] = '\0';
        *newlen = len;
        return newbuf;
    }

    pos = len;
    while (pos > 0) {
        unsigned int c = (unsigned char)s[0];
        int seq_len;

        if (c >= 0xf0) {
            seq_len = 4;            /* four bytes encoded, 21 bits */
        } else if (c >= 0xe0) {
            seq_len = 3;            /* three bytes encoded, 16 bits */
        } else if (c >= 0xc0) {
            seq_len = 2;            /* two bytes encoded, 11 bits */
        } else {
            seq_len = 1;
        }

        if (seq_len > pos) {
            /* Truncated sequence: emit '?' and consume only the bytes
             * that exist, so `s` never moves past the end of the input. */
            c = '?';
            seq_len = pos;
        } else if (seq_len == 4) {
            c = ((unsigned int)(s[0] & 0x07) << 18)
              | ((unsigned int)(s[1] & 0x3f) << 12)
              | ((unsigned int)(s[2] & 0x3f) << 6)
              |  (unsigned int)(s[3] & 0x3f);
        } else if (seq_len == 3) {
            /* A three-byte lead carries four payload bits. */
            c = ((unsigned int)(s[0] & 0x0f) << 12)
              | ((unsigned int)(s[1] & 0x3f) << 6)
              |  (unsigned int)(s[2] & 0x3f);
        } else if (seq_len == 2) {
            c = ((unsigned int)(s[0] & 0x3f) << 6)
              |  (unsigned int)(s[1] & 0x3f);
        }

        s += seq_len;
        pos -= seq_len;

        /* Do not let a code point above 0xffff wrap around when narrowed
         * to the decoder's unsigned short parameter. */
        newbuf[out++] = (c > 0xffff) ? '?' : decoder((unsigned short)c);
    }

    if (out < len) {
        newbuf = erealloc(newbuf, (size_t)out + 1);
    }
    newbuf[out] = '\0';
    *newlen = out;
    return newbuf;
}
/* Public functions */
/*
 * Convert a NUL-terminated ISO-8859-1 string to UTF-8.
 *
 * Returns a newly allocated, NUL-terminated string that the caller releases
 * with free(). An empty input yields an empty string. Returns NULL when
 * `str` is NULL, when its length does not fit in an int, or when the
 * conversion fails.
 */
char *utf8_encode(const char *str)
{
    size_t n;
    int out_len = 0;

    if (str == NULL) {
        return NULL;
    }

    n = strlen(str);
    if (n == 0) {
        /* Nothing to convert: hand back an owned empty string so the
         * return value is always defined. */
        char *empty = emalloc(1);

        empty[0] = '\0';
        return empty;
    }
    if (n > (size_t)INT_MAX) {
        return NULL;
    }

    return xml_utf8_encode(str, (int)n, &out_len, "ISO-8859-1");
}
/*
 * Convert a NUL-terminated UTF-8 string to ISO-8859-1.
 *
 * Returns a newly allocated, NUL-terminated string that the caller releases
 * with free(). An empty input yields an empty string. Returns NULL when
 * `str` is NULL, when its length does not fit in an int, or when the
 * conversion fails.
 */
char *utf8_decode(const char *str)
{
    size_t n;
    int out_len = 0;

    if (str == NULL) {
        return NULL;
    }

    n = strlen(str);
    if (n == 0) {
        /* Nothing to convert: hand back an owned empty string so the
         * return value is always defined. */
        char *empty = emalloc(1);

        empty[0] = '\0';
        return empty;
    }
    if (n > (size_t)INT_MAX) {
        return NULL;
    }

    return xml_utf8_decode(str, (int)n, &out_len, "ISO-8859-1");
}
#ifndef _UTF8_H
#define _UTF8_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef char XML_Char;
typedef struct {
XML_Char *name;
char (*decoding_function)(unsigned short);
unsigned short (*encoding_function)(unsigned char);
} xml_encoding;
/* Convert the NUL-terminated ISO-8859-1 string `in` to UTF-8. The result is
 * NUL-terminated and comes from the malloc family, so the caller releases it
 * with free().
 */
char *utf8_encode (const char *in);
/* Convert the NUL-terminated UTF-8 string `in` to ISO-8859-1; values above
 * 0xff are replaced by '?'. The result is NUL-terminated and comes from the
 * malloc family, so the caller releases it with free().
 */
char *utf8_decode (const char *in);
/* NOTE(review): the remaining declarations are `static`, i.e. they have
 * internal linkage. Any other source file that includes this header gets
 * declarations without definitions - confirm whether they belong in the
 * implementation file instead.
 */
static char *xml_utf8_decode (const XML_Char *, int, int *, const XML_Char *);
static char *xml_utf8_encode (const char *s, int len, int *newlen,
const XML_Char *encoding);
static void *emalloc (size_t size);
static void *erealloc (void* ptr, size_t size);
#endif /* _UTF8_H */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment