Created
October 14, 2011 07:32
-
-
Save shwangdev/1286475 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "utf8.h"

#include <limits.h>
#include <strings.h>
/*
 * Forward declarations of the per-character converters. They are needed
 * because the xml_encodings table refers to these functions before their
 * definitions appear further down in this file.
 */
static inline unsigned short xml_encode_iso_8859_1(unsigned char c);
static inline char xml_decode_iso_8859_1(unsigned short c);
static inline unsigned short xml_encode_us_ascii(unsigned char c);
static inline char xml_decode_us_ascii(unsigned short c);
/*
 * Allocate `size` bytes or terminate the process.
 *
 * Never returns NULL: on allocation failure a message is written to stderr
 * and the process exits with status 1. A request for zero bytes is rounded
 * up to one byte, because malloc(0) is allowed to return NULL and that must
 * not be mistaken for an out-of-memory condition.
 *
 * The returned block is released with free().
 */
static void *emalloc(size_t size)
{
    void *p = malloc(size != 0 ? size : 1);

    if (p == NULL) {
        fprintf(stderr, "Out of memory!\n");
        exit(1);
    }
    return p;
}
/*
 * Resize the block `ptr` to `size` bytes or terminate the process.
 *
 * `ptr` may be NULL, in which case this behaves like an allocation.
 * Never returns NULL: on failure a message is written to stderr and the
 * process exits with status 1. A zero `size` is rounded up to one byte,
 * because realloc(ptr, 0) may free the block and return NULL, which must
 * not be reported as running out of memory.
 *
 * The returned block is released with free().
 */
static void *erealloc(void *ptr, size_t size)
{
    void *p = realloc(ptr, size != 0 ? size : 1);

    if (p == NULL) {
        fprintf(stderr, "Out of memory!\n");
        exit(1);
    }
    return p;
}
/* All the encoding functions are set to NULL right now, since all | |
* the encoding is currently done internally by expat/xmltok. | |
*/ | |
xml_encoding xml_encodings[] = { | |
{ "ISO-8859-1", xml_decode_iso_8859_1, xml_encode_iso_8859_1 }, | |
{ "US-ASCII", xml_decode_us_ascii, xml_encode_us_ascii }, | |
{ "UTF-8", NULL, NULL }, | |
{ NULL, NULL, NULL } | |
}; | |
/*
 * Convert one ISO-8859-1 byte to the 16-bit character value used by the
 * UTF-8 encoder. The byte value is returned unchanged, widened to
 * unsigned short.
 */
static inline unsigned short xml_encode_iso_8859_1(unsigned char c)
{
    return (unsigned short)c;
}
/*
 * Convert a 16-bit character value to a single ISO-8859-1 byte.
 * Values above 0xff cannot be represented and yield '?'.
 */
static inline char xml_decode_iso_8859_1(unsigned short c)
{
    if (c > 0xff)
        return '?';
    return (char)c;
}
/*
 * Convert one US-ASCII byte to the 16-bit character value used by the
 * UTF-8 encoder. The byte value is returned unchanged, widened to
 * unsigned short; bytes above 0x7f are not filtered here.
 */
static inline unsigned short xml_encode_us_ascii(unsigned char c)
{
    return (unsigned short)c;
}
/*
 * Convert a 16-bit character value to a single US-ASCII byte.
 * Values above 0x7f cannot be represented and yield '?'.
 */
static inline char xml_decode_us_ascii(unsigned short c)
{
    if (c > 0x7f)
        return '?';
    return (char)c;
}
static xml_encoding *xml_get_encoding(const XML_Char *name) | |
{ | |
xml_encoding *enc = &xml_encodings[0]; | |
while (enc && enc->name) { | |
if (strcasecmp(name, enc->name) == 0) | |
return enc; | |
enc++; | |
} | |
return NULL; | |
} | |
static char *xml_utf8_encode(const char *s, int len, int *newlen, | |
const XML_Char *encoding) | |
{ | |
int pos = len; | |
char *newbuf; | |
unsigned int c; | |
unsigned short (*encoder)(unsigned char) = NULL; | |
xml_encoding *enc = xml_get_encoding(encoding); | |
*newlen = 0; | |
if (enc) | |
encoder = enc->encoding_function; | |
else | |
/* If the target encoding was unknown, fail */ | |
return NULL; | |
if (encoder == NULL) { | |
/* If no encoder function was specified, return the data as-is. | |
*/ | |
newbuf = (char*)emalloc(len + 1); | |
memcpy(newbuf, s, len); | |
*newlen = len; | |
newbuf[*newlen] = '\0'; | |
return newbuf; | |
} | |
/* This is the theoretical max (will never get beyond len * 2 as long | |
* as we are converting from single-byte characters, though) */ | |
newbuf = emalloc(len); | |
while (pos > 0) { | |
c = encoder ? encoder((unsigned char)(*s)) : (unsigned short)(*s); | |
if (c < 0x80) | |
newbuf[(*newlen)++] = (char) c; | |
else if (c < 0x800) { | |
newbuf[(*newlen)++] = (0xc0 | (c >> 6)); | |
newbuf[(*newlen)++] = (0x80 | (c & 0x3f)); | |
} | |
else if (c < 0x10000) { | |
newbuf[(*newlen)++] = (0xe0 | (c >> 12)); | |
newbuf[(*newlen)++] = (0xc0 | ((c >> 6) & 0x3f)); | |
newbuf[(*newlen)++] = (0x80 | (c & 0x3f)); | |
} | |
else if (c < 0x200000) { | |
newbuf[(*newlen)++] = (0xf0 | (c >> 18)); | |
newbuf[(*newlen)++] = (0xe0 | ((c >> 12) & 0x3f)); | |
newbuf[(*newlen)++] = (0xc0 | ((c >> 6) & 0x3f)); | |
newbuf[(*newlen)++] = (0x80 | (c & 0x3f)); | |
} | |
pos--; | |
s++; | |
} | |
newbuf[*newlen] = 0; | |
newbuf = erealloc(newbuf, (*newlen)+1); | |
return newbuf; | |
} | |
static char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, | |
const XML_Char *encoding) | |
{ | |
int pos = len; | |
char *newbuf = emalloc(len + 1); | |
unsigned short c; | |
char (*decoder)(unsigned short) = NULL; | |
xml_encoding *enc = xml_get_encoding(encoding); | |
*newlen = 0; | |
if (enc) | |
decoder = enc->decoding_function; | |
if (decoder == NULL) { | |
/* If the target encoding was unknown, or no decoder function | |
* was specified, return the UTF-8-encoded data as-is. | |
*/ | |
memcpy(newbuf, s, len); | |
*newlen = len; | |
newbuf[*newlen] = '\0'; | |
return newbuf; | |
} | |
while (pos > 0) { | |
c = (unsigned char)(*s); | |
if (c >= 0xf0) { /* four bytes encoded, 21 bits */ | |
if(pos-4 >= 0) | |
c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63); | |
else | |
c = '?'; | |
s += 4; | |
pos -= 4; | |
} | |
else if (c >= 0xe0) { /* three bytes encoded, 16 bits */ | |
if(pos-3 >= 0) | |
c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63); | |
else | |
c = '?'; | |
s += 3; | |
pos -= 3; | |
} | |
else if (c >= 0xc0) { /* two bytes encoded, 11 bits */ | |
if(pos-2 >= 0) | |
c = ((s[0]&63)<<6) | (s[1]&63); | |
else | |
c = '?'; | |
s += 2; | |
pos -= 2; | |
} | |
else { | |
s++; | |
pos--; | |
} | |
newbuf[*newlen] = decoder ? decoder(c) : c; | |
++*newlen; | |
} | |
if (*newlen < len) | |
newbuf = erealloc(newbuf, *newlen + 1); | |
newbuf[*newlen] = '\0'; | |
return newbuf; | |
} | |
/*
 * Public function: convert a NUL-terminated ISO-8859-1 string to UTF-8.
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free(). An empty input yields a newly allocated empty string. Returns
 * NULL when `str` is NULL or its length does not fit in an int.
 */
char *utf8_encode(const char *str)
{
    size_t n;
    int outlen;
    char *out;

    if (str == NULL)
        return NULL;

    n = strlen(str);
    if (n == 0) {
        /* Nothing to convert; still hand back something the caller can free. */
        out = emalloc(1);
        out[0] = '\0';
        return out;
    }
    if (n > (size_t)INT_MAX)
        return NULL;

    return xml_utf8_encode(str, (int)n, &outlen, "ISO-8859-1");
}
/*
 * Public function: convert a NUL-terminated UTF-8 string to ISO-8859-1.
 *
 * Returns a newly allocated, NUL-terminated string that the caller must
 * free(). An empty input yields a newly allocated empty string. Returns
 * NULL when `str` is NULL or its length does not fit in an int.
 */
char *utf8_decode(const char *str)
{
    size_t n;
    int outlen;
    char *out;

    if (str == NULL)
        return NULL;

    n = strlen(str);
    if (n == 0) {
        /* Nothing to convert; still hand back something the caller can free. */
        out = emalloc(1);
        out[0] = '\0';
        return out;
    }
    if (n > (size_t)INT_MAX)
        return NULL;

    return xml_utf8_decode(str, (int)n, &outlen, "ISO-8859-1");
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef _UTF8_H | |
#define _UTF8_H | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
/* Character type used for encoding names and UTF-8 text. */
typedef char XML_Char;

/*
 * Description of one supported encoding: its name plus optional
 * per-character converter functions. A NULL function pointer means the
 * encoding has no converter in that direction.
 */
typedef struct {
    XML_Char *name;                                     /* encoding name */
    char (*decoding_function)(unsigned short);          /* 16-bit value -> byte */
    unsigned short (*encoding_function)(unsigned char); /* byte -> 16-bit value */
} xml_encoding;
/*
 * Convert the NUL-terminated ISO-8859-1 string `in` to UTF-8.
 * The result is a NUL-terminated string allocated with the malloc family;
 * the caller releases it with free().
 */
char *utf8_encode(const char *in);

/*
 * Convert the NUL-terminated UTF-8 string `in` to ISO-8859-1.
 * The result is a NUL-terminated string allocated with the malloc family;
 * the caller releases it with free().
 */
char *utf8_decode(const char *in);

/*
 * The file-local helpers (xml_utf8_encode, xml_utf8_decode, emalloc,
 * erealloc) are deliberately not declared here. They have internal linkage,
 * and a `static` prototype in a header would leave every including
 * translation unit with a declared-but-undefined function.
 */
#endif /* _UTF8_H */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment