Created
October 20, 2021 11:03
-
-
Save qb20nh/b315fea0625fa9e56b07d09f23a54a08 to your computer and use it in GitHub Desktop.
Automatically fix hangul encoding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.UnsupportedEncodingException; | |
import java.util.AbstractMap; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
/**
 * Heuristic repair of text (mainly Korean/Hangul) that was decoded with the wrong charset.
 *
 * <p>The broken string is re-encoded with each candidate charset and the resulting bytes are
 * decoded as UTF-8. Candidates that contain characters outside printable ASCII and the accepted
 * Hangul ranges are discarded; among the survivors the one with the highest per-code-point
 * entropy wins.
 */
public class ValueExample {

    /** Charsets tried, in priority order, as the encoding the text was wrongly decoded with. */
    private static final List<String> CANDIDATE_CHARSETS =
            Arrays.asList("utf-8", "euc-kr", "ksc5601", "x-windows-949", "iso-8859-1");

    /** Entropy differences smaller than this are treated as a tie. */
    private static final double ENTROPY_EPSILON = 1E-5;

    // Inclusive code point ranges accepted by containsNonKeyboardCharacters.
    private static final int ASCII_FIRST = 32;              // U+0020 (space)
    private static final int ASCII_LAST = 126;              // U+007E ('~')
    private static final int HANGUL_JAMO_FIRST = 12593;     // U+3131
    private static final int HANGUL_JAMO_LAST = 12687;      // U+318F
    private static final int HANGUL_SYLLABLE_FIRST = 44032; // U+AC00
    private static final int HANGUL_SYLLABLE_LAST = 55204;  // U+D7A4

    /**
     * Guesses which charset {@code broken} was wrongly decoded with and returns the repaired text.
     *
     * @param broken the possibly mis-decoded text; must not be {@code null}
     * @return an entry whose key is the candidate charset name that produced the best repair and
     *         whose value is the repaired text; when several candidates tie on entropy, the one
     *         listed first wins
     * @throws NullPointerException if {@code broken} is {@code null}
     * @throws java.util.NoSuchElementException if no candidate yields text made up only of
     *         accepted characters
     */
    public static Map.Entry<String, String> guessAndCorrectEncoding(final String broken) {
        if (broken == null) {
            throw new NullPointerException("broken must not be null");
        }

        Map.Entry<String, String> best = null;
        double bestEntropy = 0.0;
        for (String charset : CANDIDATE_CHARSETS) {
            final String repaired = reinterpretAsUtf8(broken, charset);
            // Skip charsets this JVM does not provide and repairs that still look like garbage.
            if (repaired == null || containsNonKeyboardCharacters(repaired)) {
                continue;
            }
            final double entropy = getEntropy(repaired);
            // Replace the current best only on a clear improvement so ties keep the earlier charset.
            if (best == null || compareDouble(entropy, bestEntropy) > 0) {
                best = new AbstractMap.SimpleEntry<String, String>(charset, repaired);
                bestEntropy = entropy;
            }
        }

        if (best == null) {
            throw new java.util.NoSuchElementException(
                    "No candidate charset produced a plausible repair of the input");
        }
        return best;
    }

    /**
     * Encodes {@code text} with {@code charset} and decodes the resulting bytes as UTF-8.
     *
     * @return the re-decoded text, or {@code null} if the charset is not supported by this JVM
     */
    private static String reinterpretAsUtf8(String text, String charset) {
        try {
            return new String(text.getBytes(charset), "utf-8");
        } catch (UnsupportedEncodingException ignored) {
            // The candidate cannot be evaluated on this JVM; the caller skips it.
            return null;
        }
    }

    /**
     * Compares two doubles in natural order, treating values closer than
     * {@link #ENTROPY_EPSILON} as equal.
     *
     * @return {@code 0} if the values are within the tolerance, a positive number if
     *         {@code d1 > d2}, a negative number if {@code d1 < d2}
     */
    private static int compareDouble(double d1, double d2) {
        final double diff = d1 - d2;
        // Use the absolute difference: without it every negative diff would count as "equal".
        if (Math.abs(diff) < ENTROPY_EPSILON) {
            return 0;
        }
        return diff > 0 ? 1 : -1;
    }

    /**
     * Reports whether {@code s} contains any code point outside the accepted ranges:
     * printable ASCII (U+0020..U+007E), U+3131..U+318F and U+AC00..U+D7A4.
     */
    private static boolean containsNonKeyboardCharacters(String s) {
        final int length = s.length();
        for (int codepoint, offset = 0; offset < length; offset += Character.charCount(codepoint)) {
            codepoint = s.codePointAt(offset);
            final boolean printableAscii = codepoint >= ASCII_FIRST && codepoint <= ASCII_LAST;
            final boolean hangulJamo =
                    codepoint >= HANGUL_JAMO_FIRST && codepoint <= HANGUL_JAMO_LAST;
            final boolean hangulSyllable =
                    codepoint >= HANGUL_SYLLABLE_FIRST && codepoint <= HANGUL_SYLLABLE_LAST;
            if (!(printableAscii || hangulJamo || hangulSyllable)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Computes the Shannon entropy of {@code s}, in bits per code point.
     *
     * @return the entropy; {@code 0.0} for an empty string
     */
    private static double getEntropy(String s) {
        final List<Integer> cps = getCodePoints(s);
        final Map<Integer, Integer> freq = new HashMap<>();
        for (Integer cp : cps) {
            freq.merge(cp, 1, Integer::sum);
        }
        final double total = cps.size();
        double entropy = 0.0;
        for (int count : freq.values()) {
            final double p = count / total;
            entropy -= p * Math.log(p) / Math.log(2);
        }
        return entropy;
    }

    /** Returns the code points of {@code s} in order, one list element per code point. */
    private static List<Integer> getCodePoints(String s) {
        final List<Integer> cp = new ArrayList<>();
        final int length = s.length();
        for (int codepoint, offset = 0; offset < length; offset += Character.charCount(codepoint)) {
            codepoint = s.codePointAt(offset);
            cp.add(codepoint);
        }
        return cp;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment