Last active
April 18, 2017 00:35
-
-
Save zephray/c4a7798dc2becf23c24ea08465135015 to your computer and use it in GitHub Desktop.
UTF8 to CP1252
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]> | |
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. | |
// Copyright (c) 2017 ZephRay <[email protected]> | |
// | |
// utf8to1252 - almost equivalent to iconv -f utf-8 -t windows-1252, but better | |
// | |
// What can this program do?
// Sometimes you will run into a "double UTF-8 encoded" file, most often one
// that came from a MySQL dump. If that happens, this program may be able to
// fix your file!
// What does this program do?
// In my case, I stored something in UTF-8 in my database, for example the
// Chinese character '检', which is encoded as the UTF-8 sequence 0xe6 0xa3 0x80.
// Then, when it was exported, it was encoded with UTF-8 again! This actually
// consists of two steps: first each byte is mapped to a Unicode code point
// (charset conversion), then the code points are encoded as UTF-8. I am not
// sure why MySQL picked CP1252 rather than ISO-8859-1 (which would make more
// sense since it is compatible with Unicode), but this is what happened.
// So 0xe6 and 0xa3 were converted to 0x00e6 and 0x00a3, respectively.
// 0x80 is tricky: 0x80 in CP1252 is converted to 0x20ac in Unicode.
// Finally, 0x00e6 0x00a3 0x20ac was encoded into UTF-8:
// 0xc3 0xa6 0xc2 0xa3 0xe2 0x82 0xac.
// So, in short, MySQL treated the UTF-8 encoded string as a CP1252 ANSI
// string and encoded it with UTF-8 again. To recover from that, simply do a
// UTF-8 to CP1252 conversion and we should be fine. But I found that iconv's
// Windows-1252 charset does not handle all 256 byte values correctly (they are
// not really characters, just bytes of a UTF-8 sequence), so I wrote this
// program to fix that.
// So, as you can see, this program reads a file, decodes it from UTF-8 to
// Unicode code points, then converts each decoded code point to a CP1252
// byte, covering the whole 0x00-0xFF range. The double-encoded UTF-8 file is
// then single-encoded again: problem solved.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/* Decoder states that callers compare against the value returned by decode(). */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup data for the table-driven UTF-8 decoder (Bjoern Hoehrmann's DFA).
 *
 *   utf8d[0 .. 255]    maps an input byte to its character class.
 *   utf8d[256 .. 399]  maps (state * 16 + class) to the next state.
 */
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feed one byte to the UTF-8 decoder.
 *
 * state  in/out decoder state; must start as UTF8_ACCEPT and afterwards only
 *        hold values previously stored by this function.
 * codep  in/out code point accumulator; holds a complete code point whenever
 *        the function returns UTF8_ACCEPT.
 * byte   next input byte; only the low 8 bits are used.
 *
 * Returns the new state: UTF8_ACCEPT when a code point was completed,
 * UTF8_REJECT when the input is not valid UTF-8, and any other value while
 * more continuation bytes are expected. Once UTF8_REJECT has been stored the
 * table keeps the decoder in that state until the caller resets *state.
 */
uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  uint32_t type;

  /* The class table has exactly 256 entries; never index past it. */
  byte &= 0xffu;
  type = utf8d[byte];

  /* A lead byte keeps only its payload bits (selected by its class); a
     continuation byte contributes its low six bits to the accumulator. */
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xffu >> type) & byte;

  *state = utf8d[256 + *state*16 + type];
  return *state;
}
/*
 * Read a whole file into a newly allocated buffer.
 *
 * filename  path of the file to read (opened in binary mode).
 * buffer    receives a malloc()ed buffer holding the file contents; the
 *           caller owns it and must free() it. Set to NULL on failure.
 *
 * Returns the number of bytes actually read. 0 is returned both for an empty
 * file (in which case *buffer is non-NULL) and on failure (*buffer is NULL).
 */
long fileLoad(const char * filename, unsigned char * * buffer) {
  FILE *file;
  unsigned char *data = NULL;
  long size;
  long count = 0;

  if (buffer == NULL)
    return 0;
  *buffer = NULL;

  if (filename == NULL)
    return 0;

  file = fopen(filename, "rb");
  if (file == NULL)
    return 0;

  /* Determine the file size by seeking to the end and back. */
  if (fseek(file, 0, SEEK_END) != 0)
    goto done;
  size = ftell(file);
  if (size < 0)
    goto done;
  if (fseek(file, 0, SEEK_SET) != 0)
    goto done;

  /* malloc(0) may legally return NULL, so always request at least one byte;
     that keeps "empty file" distinguishable from "allocation failed". */
  data = malloc(size > 0 ? (size_t)size : 1);
  if (data == NULL)
    goto done;

  count = (long)fread(data, 1, (size_t)size, file);
  if (ferror(file)) {
    free(data);
    data = NULL;
    count = 0;
  }

done:
  fclose(file);
  *buffer = data;
  return count;
}
/*
 * Convert a Unicode code point to a CP1252 byte.
 *
 * Code points up to 0xff are passed through unchanged, which also covers the
 * 0x80-0x9f range so that every byte value 0x00-0xff can be produced. The
 * characters CP1252 places in 0x80-0x9f are mapped back from their Unicode
 * code points by the switch below.
 *
 * Any other code point cannot be represented: a diagnostic is written to
 * stderr and 0x00 is returned.
 */
uint8_t UNICODEtoCP1252(uint16_t chr) {
  if (chr <= 0xff)
    return (uint8_t)chr;

  switch (chr) {
    case 0x20ac: return 0x80;
    case 0x201a: return 0x82;
    case 0x0192: return 0x83;
    case 0x201e: return 0x84;
    case 0x2026: return 0x85;
    case 0x2020: return 0x86;
    case 0x2021: return 0x87;
    case 0x02c6: return 0x88;
    case 0x2030: return 0x89;
    case 0x0160: return 0x8a;
    case 0x2039: return 0x8b;
    case 0x0152: return 0x8c;
    case 0x017d: return 0x8e;
    case 0x2018: return 0x91;
    case 0x2019: return 0x92;
    case 0x201c: return 0x93;
    case 0x201d: return 0x94;
    case 0x2022: return 0x95;
    case 0x2013: return 0x96;
    case 0x2014: return 0x97;
    case 0x02dc: return 0x98;
    case 0x2122: return 0x99;
    case 0x0161: return 0x9a;
    case 0x203a: return 0x9b;
    case 0x0153: return 0x9c;
    case 0x017e: return 0x9e;
    case 0x0178: return 0x9f;
    default:
      /* Diagnostics go to stderr so they never mix with the converted data. */
      fprintf(stderr, "Your file is probably NOT a UTF-8 encoded CP1252 file.\n");
      return 0x00;
  }
}
int main(int argc, char* argv[]) { | |
uint32_t codepoint; | |
uint32_t state = 0; | |
uint8_t *s; | |
uint8_t chr; | |
long size, pos; | |
FILE *fp; | |
if (argc != 2) { | |
printf("utf8to1251 input.file\n"); | |
return 0; | |
} | |
size = fileLoad(argv[1], &s); | |
for (pos=0; pos<size; pos++) { | |
if (!decode(&state, &codepoint, *s)) { | |
chr = UNICODEtoCP1252(codepoint); | |
printf("%c", chr); | |
} | |
s++; | |
} | |
if (state != UTF8_ACCEPT) | |
printf(stderr, "Your file is probably NOT encoded in UTF-8.\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment