Skip to content

Instantly share code, notes, and snippets.

@qb20nh
Created October 20, 2021 11:03
Show Gist options
  • Save qb20nh/b315fea0625fa9e56b07d09f23a54a08 to your computer and use it in GitHub Desktop.
Save qb20nh/b315fea0625fa9e56b07d09f23a54a08 to your computer and use it in GitHub Desktop.
Automatically fix Hangul encoding
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
/**
 * Heuristic repair of Korean (Hangul) text that was decoded with the wrong character set.
 *
 * <p>The broken text is re-encoded with each candidate charset, the resulting bytes are decoded
 * as UTF-8, and the most plausible readable result is returned.
 */
public class ValueExample {

    /** Charset names tried when re-encoding the broken text, in priority order. */
    private static final List<String> CANDIDATE_CHARSETS =
            Arrays.asList("utf-8", "euc-kr", "ksc5601", "x-windows-949", "iso-8859-1");

    /** Entropy differences smaller than this are treated as a tie. */
    private static final double ENTROPY_EPSILON = 1E-5;

    /**
     * Guesses which charset the text was wrongly decoded with and returns the repaired text.
     *
     * <p>For every supported candidate charset the input is encoded with that charset and the
     * bytes are decoded as UTF-8. Results containing anything other than printable ASCII or
     * Hangul are discarded. Among the remaining results the one with the highest Shannon
     * entropy wins; on a tie (within {@link #ENTROPY_EPSILON}) the earlier candidate is kept.
     * Preferring higher entropy rejects degenerate results such as {@code "??????"}, which is
     * what an encoder produces when it cannot map any of the input characters.
     *
     * @param broken the wrongly decoded text; must not be {@code null}
     * @return an entry whose key is the charset name used for re-encoding and whose value is
     *         the repaired text
     * @throws NullPointerException if {@code broken} is {@code null}
     * @throws NoSuchElementException if no candidate charset yields readable text
     */
    public static Map.Entry<String, String> guessAndCorrectEncoding(final String broken) {
        Objects.requireNonNull(broken, "broken");

        Map.Entry<String, String> best = null;
        double bestEntropy = 0.0;
        for (String charsetName : CANDIDATE_CHARSETS) {
            // The Korean charsets are optional in some runtimes; skip them instead of failing.
            if (!Charset.isSupported(charsetName)) {
                continue;
            }
            final byte[] rawBytes = broken.getBytes(Charset.forName(charsetName));
            final String repaired = new String(rawBytes, StandardCharsets.UTF_8);
            if (containsNonKeyboardCharacters(repaired)) {
                continue;
            }
            final double entropy = getEntropy(repaired);
            // Replace the current best only when the new candidate is clearly better, so that
            // near-ties keep the earlier (higher priority) charset.
            if (best == null || entropy - bestEntropy >= ENTROPY_EPSILON) {
                best = new AbstractMap.SimpleEntry<String, String>(charsetName, repaired);
                bestEntropy = entropy;
            }
        }

        if (best == null) {
            throw new NoSuchElementException(
                    "No candidate charset produced text made only of printable ASCII and Hangul");
        }
        return best;
    }

    /**
     * Tells whether the string holds any code point outside printable ASCII and Hangul.
     *
     * @param s the string to inspect
     * @return {@code true} if at least one code point is not accepted by
     *         {@link #isKeyboardCodePoint(int)}
     */
    private static boolean containsNonKeyboardCharacters(String s) {
        return !s.codePoints().allMatch(ValueExample::isKeyboardCodePoint);
    }

    /**
     * Accepts printable ASCII (U+0020..U+007E), Hangul compatibility jamo (U+3131..U+318F)
     * and precomposed Hangul syllables (U+AC00..U+D7A3).
     */
    private static boolean isKeyboardCodePoint(int codePoint) {
        return (codePoint >= 0x0020 && codePoint <= 0x007E)
                || (codePoint >= 0x3131 && codePoint <= 0x318F)
                || (codePoint >= 0xAC00 && codePoint <= 0xD7A3);
    }

    /**
     * Computes the Shannon entropy, in bits per code point, of the string.
     *
     * @param s the string to measure
     * @return the entropy; {@code 0.0} for an empty string or a string of one repeated symbol
     */
    private static double getEntropy(String s) {
        final int total = s.codePointCount(0, s.length());
        if (total == 0) {
            return 0.0;
        }
        final Map<Integer, Integer> frequencies = new HashMap<>();
        s.codePoints().forEach(cp -> frequencies.merge(cp, 1, Integer::sum));

        double entropy = 0.0;
        for (int count : frequencies.values()) {
            final double p = 1.0 * count / total;
            entropy -= p * Math.log(p) / Math.log(2);
        }
        return entropy;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment