zephray · April 18, 2017 00:35
diff --git a/utf8to1252.c b/utf8to1252.c
 // Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 // Copyright (c) 2017 ZephRay <[email protected]>
 //
 // utf8to1252 - almost equivalent to iconv -f utf-8 -t windows-1252, but better
 //
 // What can this program do?
 // Sometimes you will encounter a "double UTF-8 encoded" file, in most cases
 // from a MySQL dump. If this happens, you may have some luck using this
 // program to fix your file!

 // What does this program do?
 // In my case, I stored something in UTF-8 in my database, for example the
 // Chinese character '检', which is encoded as the UTF-8 sequence 0xe6 0xa3 0x80.
 // Then, when it was exported, it was encoded with UTF-8 again! Actually this
 // consists of two steps: first each byte is mapped to a Unicode code point
 // (charset conversion), then that code point is encoded as UTF-8. I am not
 // sure why MySQL picked CP1252 rather than ISO-8859-1 (which would make more
 // sense, since it is compatible with Unicode), but this is what happened. So
 // 0xe6 and 0xa3 were converted to 0x00e6 and 0x00a3, respectively. 0x80 is
 // tricky: 0x80 in CP1252 is converted to 0x20ac in Unicode. Finally,
 // 0x00e6 0x00a3 0x20ac was encoded into UTF-8: 0xc3 0xa6 0xc2 0xa3 0xe2 0x82
 // 0xac.
 // So, in short, MySQL treated the UTF-8 encoded string as a CP1252 ANSI
 // string and encoded it with UTF-8 again. To recover from that, simply do a
 // UTF-8 to CP1252 conversion and we should be fine. But I found that iconv's
 // Windows-1252 charset does not handle all 256 byte values correctly (they
 // are not really characters here, just the bytes of a UTF-8 sequence), so I
 // wrote this to fix that.
 // So, as you can see, this program reads a file, decodes it from UTF-8 to
 // Unicode, and then tries to convert each decoded Unicode code point to a
 // CP1252 char, which can fill the whole 0x00-0xFF space. The double-encoded
 // UTF-8 file is then single-encoded; problem solved.

 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>

 #define UTF8_ACCEPT 0
 #define UTF8_REJECT 1

 // Lookup table driving the UTF-8 decoder in decode().
 // The first 256 entries (rows "00..1f" through "f0..ff") are indexed by the
 // input byte and give that byte's type. The remaining entries (rows "s0.."
 // onward) are indexed as 256 + state*16 + type and give the next decoder state.
 static const uint8_t utf8d[] = {
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
 	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
 	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
 	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
 	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
 	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
 	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
 	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
 	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
 };

 // Feed one input byte to the table-driven UTF-8 decoder.
 //
 // state: in/out decoder state; callers start it at UTF8_ACCEPT.
 // codep: in/out accumulator for the code point being assembled.
 // byte:  next input byte; used directly as an index into utf8d, so it must
 //        be in the range 0..255.
 //
 // Returns the updated *state. The caller in this file treats a return of
 // UTF8_ACCEPT (0) as "a complete code point is now available in *codep".
 uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
 	const uint32_t cls = utf8d[byte];

 	if (*state == UTF8_ACCEPT) {
 		// Starting a new sequence: keep only the payload bits of this byte.
 		*codep = (0xff >> cls) & (byte);
 	} else {
 		// Inside a sequence: shift in the low six bits of this byte.
 		*codep = (*codep << 6) | (byte & 0x3fu);
 	}

 	// Transition rows start at offset 256, sixteen entries per state.
 	const uint32_t next = utf8d[256 + *state * 16 + cls];
 	*state = next;
 	return next;
 }

 // Read an entire file into a newly allocated buffer.
 //
 // filename: path of the file to read (opened in binary mode).
 // buffer:   out parameter. On success it receives a malloc()ed block holding
 //           the file contents, which the caller must free(). It is set to
 //           NULL whenever nothing was loaded.
 //
 // Returns the number of bytes read, or 0 if the file is empty or could not
 // be opened, measured, allocated for, or read.
 long fileLoad(const char * filename, unsigned char * * buffer) {
 	FILE *pFile;
 	long lSize;
 	long lRead = 0;

 	if (buffer == NULL) {
 		return 0;
 	}
 	*buffer = NULL;

 	pFile = fopen(filename, "rb");
 	if (pFile == NULL) {
 		return 0;
 	}

 	// Measure the file by seeking to its end; ftell() returns -1 on error.
 	if (fseek(pFile, 0, SEEK_END) != 0) {
 		goto done;
 	}
 	lSize = ftell(pFile);
 	if (lSize <= 0) {
 		goto done;
 	}
 	if (fseek(pFile, 0, SEEK_SET) != 0) {
 		goto done;
 	}

 	// Check the allocation itself (*buffer), not the out-pointer.
 	*buffer = malloc((size_t)lSize);
 	if (*buffer == NULL) {
 		goto done;
 	}

 	lRead = (long)fread(*buffer, 1, (size_t)lSize, pFile);
 	if (lRead == 0) {
 		// Nothing was read: do not hand back an uninitialized buffer.
 		free(*buffer);
 		*buffer = NULL;
 	}

 done:
 	// Close the file on every path, including the success path.
 	fclose(pFile);
 	return lRead;
 }

 // Map a Unicode code point to its Windows-1252 (CP1252) byte.
 //
 // chr: code point in the Basic Multilingual Plane.
 //
 // Code points 0x0000..0x00ff are returned unchanged. This deliberately
 // includes 0x80..0x9f, so every byte value 0x00..0xff can be produced, even
 // the ones the switch below has no entry for (0x81, 0x8d, 0x8f, 0x90, 0x9d).
 // Higher code points are translated through the switch below.
 //
 // Returns the CP1252 byte. For a code point with no mapping, a diagnostic is
 // written to stderr and 0x00 is returned.
 uint8_t UNICODEtoCP1252(uint16_t chr) {
 	if (chr <= 0xff)
 		return (uint8_t)chr;

 	switch (chr) {
 		case 0x20ac: return 0x80; // EURO SIGN
 		case 0x201a: return 0x82; // SINGLE LOW-9 QUOTATION MARK
 		case 0x0192: return 0x83; // LATIN SMALL LETTER F WITH HOOK
 		case 0x201e: return 0x84; // DOUBLE LOW-9 QUOTATION MARK
 		case 0x2026: return 0x85; // HORIZONTAL ELLIPSIS
 		case 0x2020: return 0x86; // DAGGER
 		case 0x2021: return 0x87; // DOUBLE DAGGER
 		case 0x02c6: return 0x88; // MODIFIER LETTER CIRCUMFLEX ACCENT
 		case 0x2030: return 0x89; // PER MILLE SIGN
 		case 0x0160: return 0x8a; // LATIN CAPITAL LETTER S WITH CARON
 		case 0x2039: return 0x8b; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 		case 0x0152: return 0x8c; // LATIN CAPITAL LIGATURE OE
 		case 0x017d: return 0x8e; // LATIN CAPITAL LETTER Z WITH CARON
 		case 0x2018: return 0x91; // LEFT SINGLE QUOTATION MARK
 		case 0x2019: return 0x92; // RIGHT SINGLE QUOTATION MARK
 		case 0x201c: return 0x93; // LEFT DOUBLE QUOTATION MARK
 		case 0x201d: return 0x94; // RIGHT DOUBLE QUOTATION MARK
 		case 0x2022: return 0x95; // BULLET
 		case 0x2013: return 0x96; // EN DASH
 		case 0x2014: return 0x97; // EM DASH
 		case 0x02dc: return 0x98; // SMALL TILDE
 		case 0x2122: return 0x99; // TRADE MARK SIGN
 		case 0x0161: return 0x9a; // LATIN SMALL LETTER S WITH CARON
 		case 0x203a: return 0x9b; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 		case 0x0153: return 0x9c; // LATIN SMALL LIGATURE OE
 		case 0x017e: return 0x9e; // LATIN SMALL LETTER Z WITH CARON
 		case 0x0178: return 0x9f; // LATIN CAPITAL LETTER Y WITH DIAERESIS
 		default:
 			// fprintf, not printf: the first argument is the stream.
 			fprintf(stderr, "Your file is probably NOT a UTF-8 encoded CP1252 file.\n");
 			return 0x00;
 	}
 }

 int main(int argc, char* argv[]) {
 	uint32_t codepoint;
 	uint32_t state = 0;
 	uint8_t *s;
 	uint8_t chr;
 	long size, pos;
 	FILE *fp;

 	if (argc != 2) {
 		printf("utf8to1251 input.file\n");
 		return 0;
 	}

 	size = fileLoad(argv[1], &s);

 	for (pos=0; pos<size; pos++) {
 		if (!decode(&state, &codepoint, *s)) {			
 			chr = UNICODEtoCP1252(codepoint);
 			printf("%c", chr);
 		}
 		s++;
 	}

 	if (state != UTF8_ACCEPT)
 		printf(stderr, "Your file is probably NOT encoded in UTF-8.\n");

 	return 0;
 }
	// Copyright (c) 2008-2009 Bjoern Hoehrmann <[email protected]>
	// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
	// Copyright (c) 2017 ZephRay <[email protected]>
	//
	// utf8to1252 - almost equivalent to iconv -f utf-8 -t windows-1252, but better
	//
	// What can this program do?
	// Sometimes you will encounter a "double UTF-8 encoded" file, in most cases
	// from a MySQL dump. If this happens, you may have some luck using this
	// program to fix your file!

	// What does this program do?
	// In my case, I stored something in UTF-8 in my database, for example the
	// Chinese character '检', which is encoded as the UTF-8 sequence 0xe6 0xa3 0x80.
	// Then, when it was exported, it was encoded with UTF-8 again! Actually this
	// consists of two steps: first each byte is mapped to a Unicode code point
	// (charset conversion), then that code point is encoded as UTF-8. I am not
	// sure why MySQL picked CP1252 rather than ISO-8859-1 (which would make more
	// sense, since it is compatible with Unicode), but this is what happened. So
	// 0xe6 and 0xa3 were converted to 0x00e6 and 0x00a3, respectively. 0x80 is
	// tricky: 0x80 in CP1252 is converted to 0x20ac in Unicode. Finally,
	// 0x00e6 0x00a3 0x20ac was encoded into UTF-8: 0xc3 0xa6 0xc2 0xa3 0xe2 0x82
	// 0xac.
	// So, in short, MySQL treated the UTF-8 encoded string as a CP1252 ANSI
	// string and encoded it with UTF-8 again. To recover from that, simply do a
	// UTF-8 to CP1252 conversion and we should be fine. But I found that iconv's
	// Windows-1252 charset does not handle all 256 byte values correctly (they
	// are not really characters here, just the bytes of a UTF-8 sequence), so I
	// wrote this to fix that.
	// So, as you can see, this program reads a file, decodes it from UTF-8 to
	// Unicode, and then tries to convert each decoded Unicode code point to a
	// CP1252 char, which can fill the whole 0x00-0xFF space. The double-encoded
	// UTF-8 file is then single-encoded; problem solved.

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>

	#define UTF8_ACCEPT 0
	#define UTF8_REJECT 1

	// Lookup table driving the UTF-8 decoder in decode().
	// The first 256 entries (rows "00..1f" through "f0..ff") are indexed by the
	// input byte and give that byte's type. The remaining entries (rows "s0.."
	// onward) are indexed as 256 + state*16 + type and give the next decoder state.
	static const uint8_t utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
	};

	// Feed one input byte to the table-driven UTF-8 decoder.
	//
	// state: in/out decoder state; callers start it at UTF8_ACCEPT.
	// codep: in/out accumulator for the code point being assembled.
	// byte:  next input byte; used directly as an index into utf8d, so it must
	//        be in the range 0..255.
	//
	// Returns the updated *state. The caller in this file treats a return of
	// UTF8_ACCEPT (0) as "a complete code point is now available in *codep".
	uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
		uint32_t type = utf8d[byte];

		// Both pointers must be dereferenced: the values they point to are
		// what gets read and updated, not the pointers themselves.
		*codep = (*state != UTF8_ACCEPT) ?
			(byte & 0x3fu) | (*codep << 6) :
			(0xff >> type) & (byte);

		// Transition rows start at offset 256, sixteen entries per state.
		*state = utf8d[256 + *state*16 + type];
		return *state;
	}

	// Read an entire file into a newly allocated buffer.
	//
	// filename: path of the file to read (opened in binary mode).
	// buffer:   out parameter. On success it receives a malloc()ed block holding
	//           the file contents, which the caller must free(). It is set to
	//           NULL whenever nothing was loaded.
	//
	// Returns the number of bytes read, or 0 if the file is empty or could not
	// be opened, measured, allocated for, or read.
	long fileLoad(const char * filename, unsigned char * * buffer) {
		FILE *pFile;
		long lSize;
		long lRead = 0;

		if (buffer == NULL) {
			return 0;
		}
		*buffer = NULL;

		pFile = fopen(filename, "rb");
		if (pFile == NULL) {
			return 0;
		}

		// Measure the file by seeking to its end; ftell() returns -1 on error.
		if (fseek(pFile, 0, SEEK_END) != 0) {
			goto done;
		}
		lSize = ftell(pFile);
		if (lSize <= 0) {
			goto done;
		}
		if (fseek(pFile, 0, SEEK_SET) != 0) {
			goto done;
		}

		// Store the allocation through the out-pointer and check the
		// allocation itself (*buffer), not the out-pointer.
		*buffer = malloc((size_t)lSize);
		if (*buffer == NULL) {
			goto done;
		}

		lRead = (long)fread(*buffer, 1, (size_t)lSize, pFile);
		if (lRead == 0) {
			// Nothing was read: do not hand back an uninitialized buffer.
			free(*buffer);
			*buffer = NULL;
		}

	done:
		// Close the file on every path, including the success path.
		fclose(pFile);
		return lRead;
	}

	// Map a Unicode code point to its Windows-1252 (CP1252) byte.
	//
	// chr: code point in the Basic Multilingual Plane.
	//
	// Code points 0x0000..0x00ff are returned unchanged. This deliberately
	// includes 0x80..0x9f, so every byte value 0x00..0xff can be produced, even
	// the ones the switch below has no entry for (0x81, 0x8d, 0x8f, 0x90, 0x9d).
	// Higher code points are translated through the switch below.
	//
	// Returns the CP1252 byte. For a code point with no mapping, a diagnostic is
	// written to stderr and 0x00 is returned.
	uint8_t UNICODEtoCP1252(uint16_t chr) {
		if (chr <= 0xff)
			return (uint8_t)chr;

		switch (chr) {
			case 0x20ac: return 0x80; // EURO SIGN
			case 0x201a: return 0x82; // SINGLE LOW-9 QUOTATION MARK
			case 0x0192: return 0x83; // LATIN SMALL LETTER F WITH HOOK
			case 0x201e: return 0x84; // DOUBLE LOW-9 QUOTATION MARK
			case 0x2026: return 0x85; // HORIZONTAL ELLIPSIS
			case 0x2020: return 0x86; // DAGGER
			case 0x2021: return 0x87; // DOUBLE DAGGER
			case 0x02c6: return 0x88; // MODIFIER LETTER CIRCUMFLEX ACCENT
			case 0x2030: return 0x89; // PER MILLE SIGN
			case 0x0160: return 0x8a; // LATIN CAPITAL LETTER S WITH CARON
			case 0x2039: return 0x8b; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
			case 0x0152: return 0x8c; // LATIN CAPITAL LIGATURE OE
			case 0x017d: return 0x8e; // LATIN CAPITAL LETTER Z WITH CARON
			case 0x2018: return 0x91; // LEFT SINGLE QUOTATION MARK
			case 0x2019: return 0x92; // RIGHT SINGLE QUOTATION MARK
			case 0x201c: return 0x93; // LEFT DOUBLE QUOTATION MARK
			case 0x201d: return 0x94; // RIGHT DOUBLE QUOTATION MARK
			case 0x2022: return 0x95; // BULLET
			case 0x2013: return 0x96; // EN DASH
			case 0x2014: return 0x97; // EM DASH
			case 0x02dc: return 0x98; // SMALL TILDE
			case 0x2122: return 0x99; // TRADE MARK SIGN
			case 0x0161: return 0x9a; // LATIN SMALL LETTER S WITH CARON
			case 0x203a: return 0x9b; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
			case 0x0153: return 0x9c; // LATIN SMALL LIGATURE OE
			case 0x017e: return 0x9e; // LATIN SMALL LETTER Z WITH CARON
			case 0x0178: return 0x9f; // LATIN CAPITAL LETTER Y WITH DIAERESIS
			default:
				// fprintf, not printf: the first argument is the stream.
				fprintf(stderr, "Your file is probably NOT a UTF-8 encoded CP1252 file.\n");
				return 0x00;
		}
	}

	int main(int argc, char* argv[]) {
	uint32_t codepoint;
	uint32_t state = 0;
	uint8_t *s;
	uint8_t chr;
	long size, pos;
	FILE *fp;

	if (argc != 2) {
	printf("utf8to1251 input.file\n");
	return 0;
	}

	size = fileLoad(argv[1], &s);

	for (pos=0; pos<size; pos++) {
	if (!decode(&state, &codepoint, *s)) {
	chr = UNICODEtoCP1252(codepoint);
	printf("%c", chr);
	}
	s++;
	}

	if (state != UTF8_ACCEPT)
	printf(stderr, "Your file is probably NOT encoded in UTF-8.\n");

	return 0;
	}