Skip to content

Instantly share code, notes, and snippets.

@yangacer
Created July 6, 2011 14:11
Show Gist options
  • Save yangacer/1067311 to your computer and use it in GitHub Desktop.
Save yangacer/1067311 to your computer and use it in GitHub Desktop.
ICU charset detection
// compile with: g++ icu-detect.cpp -I /usr/local/include -L /usr/local/lib -licui18n -licuuc -licudata
#include <cstdio>
#include <string>
#include <cassert>
#include "unicode/ucsdet.h"
#include "unicode/uclean.h"
int main(int argc, char **argv)
{
using namespace std;
UErrorCode u_glob_status = U_ZERO_ERROR;
char buf[4096];
FILE* fp(0);
size_t read=0;
string data;
u_init(&u_glob_status);
fp = fopen(argv[1], "rb");
assert(argc > 1);
assert(U_SUCCESS(u_glob_status));
assert(fp != 0);
while(0 < (read = fread(buf, 1, 4096, fp)))
data.append(buf, read);
fclose(fp);
UErrorCode uerr = U_ZERO_ERROR;
UCharsetDetector *ucd = ucsdet_open ( &uerr );
ucsdet_setText(ucd, data.c_str(), data.size(), &uerr);
UCharsetMatch const * match = ucsdet_detect(ucd, &uerr);
printf("Name: %s\n", ucsdet_getName(match, &uerr));
printf("Lang: %s\n", ucsdet_getLanguage(match, &uerr));
printf("Confidence: %u\n", ucsdet_getConfidence(match, &uerr));
ucsdet_close(ucd);
u_cleanup(); // keep valgrind happy!!
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment