Created
March 21, 2021 10:04
-
-
Save jtanx/06f07bc9bd0782c1107959072bf77478 to your computer and use it in GitHub Desktop.
fontforge encoding comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <utype.h> | |
#include <chardata.h> | |
#include <encoding.h> | |
static int umodenc(int enc, int modtype) | |
{ | |
if (modtype == -1) | |
return (-1); | |
if (modtype <= 1 /* Unicode */) | |
{ | |
/* No conversion needed, already unicode */; | |
} | |
else if (modtype == 2 /* SJIS */) | |
{ | |
if (enc <= 127) | |
{ | |
/* Latin */ | |
if (enc == '\\') | |
enc = 0xa5; /* Yen */ | |
} | |
else if (enc >= 161 && enc <= 223) | |
{ | |
/* Katakana */ | |
enc = unicode_from_jis201[enc]; | |
} | |
else if (enc < 255) | |
{ | |
/* This is erroneous as I understand SJIS */ | |
enc = 0; | |
} | |
else if (enc >= 0xeaa5) | |
{ | |
/* Encoded value is outside SJIS range */ | |
/* If this happens, it's likely that it's actually CP932 encoded */ | |
/* Todo: Detect CP932 encoding earlier and apply that instead of SJIS */ | |
enc = 0; | |
} | |
else | |
{ | |
int ch1 = enc >> 8, ch2 = enc & 0xff; | |
if (ch1 >= 129 && ch1 <= 159) | |
ch1 -= 112; | |
else | |
ch1 -= 176; | |
ch1 <<= 1; | |
if (ch2 >= 159) | |
ch2 -= 126; | |
else if (ch2 > 127) | |
{ | |
--ch1; | |
ch2 -= 32; | |
} | |
else | |
{ | |
--ch1; | |
ch2 -= 31; | |
} | |
if (ch1 < 0x21 || ch2 < 0x21 || ch1 > 0x7e || ch2 > 0x7e) | |
enc = 0; | |
else | |
enc = unicode_from_jis208[(ch1 - 0x21) * 94 + (ch2 - 0x21)]; | |
} | |
} | |
else if (modtype == 3 /* GB2312 offset by 0x8080, parse just like wansung */) | |
{ | |
if (enc > 0xa1a1) | |
{ | |
enc -= 0xa1a1; | |
enc = (enc >> 8) * 94 + (enc & 0xff); | |
enc = unicode_from_gb2312[enc]; | |
if (enc == 0) | |
enc = -1; | |
} | |
else if (enc > 0x100) | |
enc = 0; | |
} | |
else if (modtype == 4 /* BIG5 */) | |
{ /* old ms docs say big5 is modtype==3, but new ones say 4 */ | |
if (enc > 0x8100) | |
enc = unicode_from_big5hkscs[enc - 0x8100]; | |
else if (enc > 0x100) | |
enc = 0; | |
} | |
else if (modtype == 5 /* Wansung == KSC 5601-1987, I hope */) | |
{ | |
if (enc > 0xa1a1) | |
{ | |
enc -= 0xa1a1; | |
enc = (enc >> 8) * 94 + (enc & 0xff); | |
enc = unicode_from_ksc5601[enc]; | |
if (enc == 0) | |
enc = -1; | |
} | |
else if (enc > 0x100) | |
enc = 0; | |
} | |
else if (modtype == 6 /* Johab */) | |
{ | |
if (enc > 0x8400) | |
enc = unicode_from_johab[enc - 0x8400]; | |
else if (enc > 0x100) | |
enc = 0; | |
} | |
if (enc == 0) | |
enc = -1; | |
return (enc); | |
} | |
int main() | |
{ | |
// Encoding* enc = FindOrMakeEncoding("UCS4"); //1 | |
Encoding* enc = FindOrMakeEncoding("sjis"); //2 | |
// Encoding* enc = FindOrMakeEncoding("gb2312pk"); //3 | |
// Encoding* enc = FindOrMakeEncoding("big5"); //4 | |
// Encoding* enc = FindOrMakeEncoding("wansung"); //5 | |
// Encoding* enc = FindOrMakeEncoding("JOHAB"); //6 | |
if (enc == NULL) { | |
printf("WTF\n"); | |
return 1; | |
} | |
for (int i = 0; i < 65535; ++i) | |
{ | |
int a = umodenc(i, 6); | |
int b = UniFromEnc(i, enc); | |
b = b == 0 ? -1 : b; | |
if (a != b && a != -1 || b == 0x96e9) { | |
printf("0x%x: 0x%x vs 0x%x\n", i, a, b); | |
} | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <chardata.h> | |
#include <encoding.h> | |
#include <utype.h> | |
static int umodenc(int enc, int modtype) | |
{ | |
if (modtype == -1) | |
return (-1); | |
if (modtype <= 1 /* Unicode */) | |
{ | |
/* No conversion needed, already unicode */; | |
} | |
else if (modtype == 2 /* SJIS */) | |
{ | |
if (enc <= 127) | |
{ | |
/* Latin */ | |
if (enc == '\\') | |
enc = 0xa5; /* Yen */ | |
} | |
else if (enc >= 161 && enc <= 223) | |
{ | |
/* Katakana */ | |
enc = unicode_from_jis201[enc]; | |
} | |
else if (enc < 255) | |
{ | |
/* This is erroneous as I understand SJIS */ | |
enc = -1; | |
} | |
else if (enc >= 0xeaa5) | |
{ | |
/* Encoded value is outside SJIS range */ | |
/* If this happens, it's likely that it's actually CP932 encoded */ | |
/* Todo: Detect CP932 encoding earlier and apply that instead of SJIS */ | |
enc = -1; | |
} | |
else | |
{ | |
int ch1 = enc >> 8, ch2 = enc & 0xff; | |
if (ch1 >= 129 && ch1 <= 159) | |
ch1 -= 112; | |
else | |
ch1 -= 176; | |
ch1 <<= 1; | |
if (ch2 >= 159) | |
ch2 -= 126; | |
else if (ch2 > 127) | |
{ | |
--ch1; | |
ch2 -= 32; | |
} | |
else | |
{ | |
--ch1; | |
ch2 -= 31; | |
} | |
if (ch1 < 0x21 || ch2 < 0x21 || ch1 > 0x7e || ch2 > 0x7e) | |
enc = -1; | |
else | |
enc = unicode_from_jis208[(ch1 - 0x21) * 94 + (ch2 - 0x21)]; | |
} | |
} | |
else if (modtype == 3 /* GB2312 offset by 0x8080, parse just like wansung */) | |
{ | |
if (enc > 0xa1a1) | |
{ | |
enc -= 0xa1a1; | |
enc = (enc >> 8) * 94 + (enc & 0xff); | |
enc = unicode_from_gb2312[enc]; | |
if (enc == 0) | |
enc = -1; | |
} | |
else if (enc > 0x100) | |
enc = -1; | |
} | |
else if (modtype == 4 /* BIG5 */) | |
{ /* old ms docs say big5 is modtype==3, but new ones say 4 */ | |
if (enc > 0x8100) | |
enc = unicode_from_big5hkscs[enc - 0x8100]; | |
else if (enc > 0x100) | |
enc = -1; | |
} | |
else if (modtype == 5 /* Wansung == KSC 5601-1987, I hope */) | |
{ | |
if (enc > 0xa1a1) | |
{ | |
enc -= 0xa1a1; | |
enc = (enc >> 8) * 94 + (enc & 0xff); | |
enc = unicode_from_ksc5601[enc]; | |
if (enc == 0) | |
enc = -1; | |
} | |
else if (enc > 0x100) | |
enc = -1; | |
} | |
else if (modtype == 6 /* Johab */) | |
{ | |
if (enc > 0x8400) | |
enc = unicode_from_johab[enc - 0x8400]; | |
else if (enc > 0x100) | |
enc = -1; | |
} | |
if (enc == 0) | |
enc = -1; | |
return (enc); | |
} | |
static int umodenc2(int enc, int modtype) | |
{ | |
if (modtype == -1) | |
return -1; | |
if (modtype <= 1 /* Unicode */) | |
{ | |
/* No conversion needed, already unicode */; | |
} | |
else if (modtype == 2 /* SJIS */) | |
{ | |
// Apart from these, cp932 is a strict superset of sjis | |
if (enc == '\\') | |
enc = 0xa5; /* Yen */ | |
else if (enc == 0x7e) | |
enc = 0x7e; /* Tilde */ | |
else | |
{ | |
static Encoding* ed; | |
if (!ed) | |
{ | |
ed = FindOrMakeEncoding("cp932"); | |
} | |
enc = UniFromEnc(enc, ed); | |
} | |
} | |
else if (modtype == 3 /* GB2312 offset by 0x8080, parse just like wansung */) | |
{ | |
if (enc > 0xa1a1) | |
{ | |
static Encoding* ed; | |
if (!ed) | |
{ | |
ed = FindOrMakeEncoding("gb2312pk"); | |
} | |
enc = UniFromEnc(enc, ed); | |
} | |
else if (enc > 0x100) | |
enc = -1; | |
} | |
else if (modtype == 4 /* BIG5 */) | |
{ /* old ms docs say big5 is modtype==3, but new ones say 4 */ | |
if (enc > 0x8100) | |
{ | |
static Encoding* ed; | |
if (!ed) | |
{ | |
ed = FindOrMakeEncoding("cp950"); //does not include hkscs extensions... | |
} | |
enc = UniFromEnc(enc, ed); | |
} | |
else if (enc > 0x100) | |
enc = -1; | |
} | |
else if (modtype == 5 /* Wansung == KSC 5601-1987, I hope */) | |
{ | |
if (enc > 0xa1a1) | |
{ | |
static Encoding* ed; | |
if (!ed) | |
{ | |
ed = FindOrMakeEncoding("cp949"); | |
} | |
enc = UniFromEnc(enc, ed); | |
} | |
else if (enc > 0x100) | |
enc = -1; | |
} | |
else if (modtype == 6 /* Johab */) | |
{ | |
if (enc > 0x8400) | |
{ | |
static Encoding* ed; | |
if (!ed) | |
{ | |
ed = FindOrMakeEncoding("cp1361"); | |
} | |
enc = UniFromEnc(enc, ed); | |
} | |
else if (enc > 0x100) | |
enc = -1; | |
} | |
if (enc == 0) | |
enc = -1; | |
return (enc); | |
} | |
/*
 * Test driver: runs every code below 65535 through both umodenc() and
 * umodenc2() with modtype 6 (Johab) and prints one line per code, either
 * the agreed value or the two differing values.
 *
 * Earlier variants of this driver (disabled in the original) compared
 * against UniFromEnc() on "UCS4", "sjis", "gb2312pk", "big5", "wansung" or
 * "JOHAB" (modtypes 1..6 respectively), and compared UniFromEnc() on
 * "gb2312pk" with UniFromEnc() on "cp936" over the same range.
 *
 * Author's note carried over from the original: JIS208 is broke.
 */
int main()
{
    int code = 0;

    while (code < 65535)
    {
        const int legacy = umodenc(code, 6);
        const int viaenc = umodenc2(code, 6);

        if (legacy == viaenc)
        {
            /* Both implementations agree on this code. */
            printf("0x%x: ok 0x%x\n", code, legacy);
        }
        else
        {
            /* Mismatch: show the legacy result first, then the new one. */
            printf("0x%x: 0x%x vs 0x%x\n", code, legacy, viaenc);
        }
        ++code;
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment