Created
November 21, 2023 10:07
-
-
Save colrichie/ed63ec85ba088614594b54e70ea6fd18 to your computer and use it in GitHub Desktop.
Simple Textdata Converter from CESU-8 to UTF-8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*####################################################################
#
# CESU8toUTF8.c - Simple Textdata Converter from CESU-8 to UTF-8
#
# Usage : cat your_textfile.txt | ./CESU8toUTF8 > converted_text.txt
#
# How to compile me : cc -O3 -o CESU8toUTF8 CESU8toUTF8.c
#
#
# Written by Colonel Richie (@colrichie) on 2023-11-21
#
####################################################################*/
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <locale.h> | |
#define MAXBUF 1048576 | |
int find_cesu8(char* pszLine) { | |
int i; | |
unsigned char* puc; | |
puc = (unsigned char*)pszLine; | |
for (i=0; i<strlen(pszLine); ) { | |
if (puc[i ]!=0xED ){i++ ;continue;} | |
if (puc[i+1]<0xA0 || puc[i+1]>0xAF){i+=2;continue;} | |
if (puc[i+2]<0x80 || puc[i+2]>0xBF){i+=3;continue;} | |
if (puc[i+3]!=0xED ){i+=4;continue;} | |
if (puc[i+4]<0xB0 || puc[i+4]>0xBF){i+=5;continue;} | |
if (puc[i+5]<0x80 || puc[i+5]>0xBF){i+=6;continue;} | |
return i; | |
} | |
return -1; | |
} | |
int main(void) { | |
FILE *fp; | |
char szBuf[MAXBUF], *pszBuf; | |
int i, iOfs; | |
char szUTF8[4]; | |
setlocale(LC_CTYPE, ""); | |
fp = stdin; | |
while(fgets(szBuf, MAXBUF, fp) != NULL) { | |
pszBuf=szBuf; | |
while ((iOfs=find_cesu8(pszBuf))>=0) { | |
for (i=0; i<iOfs; i++) {putchar(*pszBuf);pszBuf++;} | |
pszBuf[1]++; | |
szUTF8[0] = 0xF0 | ((pszBuf[1] & 0x1C)>>2) ; | |
szUTF8[1] = 0x80 | ((pszBuf[1] & 0x03)<<4) | ((pszBuf[2] & 0x3C)>>2); | |
szUTF8[2] = 0x80 | ((pszBuf[2] & 0x03)<<4) | ( pszBuf[4] & 0x0F ); | |
szUTF8[3] = pszBuf[5]; ; | |
putchar(szUTF8[0]);putchar(szUTF8[1]);putchar(szUTF8[2]);putchar(szUTF8[3]); | |
pszBuf+=6; | |
} | |
while (*pszBuf!=0) {putchar(*pszBuf);pszBuf++;} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment