Created
August 10, 2020 10:21
-
-
Save tbvinh/fab60499cfa30dc86a032fcf18bf7063 to your computer and use it in GitHub Desktop.
Convert VNI to Unicode Vietnamese characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
javac convert.java
java convert file1.txt file2.txt
*/
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
/**
 * Command-line tool that rewrites a text file from the VNI Vietnamese convention
 * (a base letter followed by a separate mark character) into precomposed Unicode.
 *
 * <p>Usage: {@code java convert <input> <output>}. The input file is decoded as UTF-16
 * and the output file is written as UTF-8, one converted line per input line.
 */
public class convert {

    /**
     * Two-character VNI sequences (base letter + mark). Entry {@code i} is replaced by
     * entry {@code i} of {@link #UNICODE_FOR_PAIRS}.
     */
    private static final String[] VNI_PAIRS = {"O\u00C2", "o\u00E2", "y\u00F5", "Y\u00D5", "y\u00FB", "Y\u00DB",
        "y\u00F8", "Y\u00D8", "\u00F6\u00EF", "\u00D6\u00CF", "\u00F6\u00F5", "\u00D6\u00D5",
        "\u00F6\u00FB", "\u00D6\u00DB", "\u00F6\u00F8", "\u00D6\u00D8", "\u00F6\u00F9",
        "\u00D6\u00D9", "u\u00FB", "U\u00DB", "u\u00EF", "U\u00CF", "\u00F4\u00EF", "\u00D4\u00CF",
        "\u00F4\u00F5", "\u00D4\u00D5", "\u00F4\u00FB", "\u00D4\u00DB", "\u00F4\u00F8",
        "\u00D4\u00D8", "\u00F4\u00F9", "\u00D4\u00D9", "o\u00E4", "O\u00C4", "o\u00E3", "O\u00C3",
        "o\u00E5", "O\u00C5", "o\u00E0", "O\u00C0", "o\u00E1", "O\u00C1", "o\u00FB", "O\u00DB",
        "o\u00EF", "O\u00CF", "e\u00E4", "E\u00C4", "e\u00E3", "E\u00C3", "e\u00E5", "E\u00C5",
        "e\u00E0", "E\u00C0", "e\u00E1", "E\u00C1", "e\u00F5", "E\u00D5", "e\u00FB", "E\u00DB",
        "e\u00EF", "E\u00CF", "a\u00EB", "A\u00CB", "a\u00FC", "A\u00DC", "a\u00FA", "A\u00DA",
        "a\u00E8", "A\u00C8", "a\u00E9", "A\u00C9", "a\u00E4", "A\u00C4", "a\u00E3", "A\u00C3",
        "a\u00E5", "A\u00C5", "a\u00E0", "A\u00C0", "a\u00E1", "A\u00C1", "a\u00FB", "A\u00DB",
        "a\u00EF", "A\u00CF", "u\u00F5", "U\u00D5", "a\u00EA", "A\u00CA", "y\u00F9", "u\u00F9",
        "u\u00F8", "o\u00F5", "o\u00F9", "o\u00F8", "e\u00E2", "e\u00F9", "e\u00F8", "a\u00F5",
        "a\u00E2", "a\u00F9", "a\u00F8", "Y\u00D9", "U\u00D9", "U\u00D8", "O\u00D5", "O\u00D9",
        "O\u00D8", "E\u00C2", "E\u00D9", "E\u00D8", "A\u00D5", "A\u00C2", "A\u00D9", "A\u00D8"};

    /**
     * Precomposed Unicode replacements, parallel to {@link #VNI_PAIRS}.
     *
     * <p>The first two entries map straight to O-circumflex (U+00D4) and o-circumflex
     * (U+00F4). No temporary placeholder is needed because the conversion never re-reads
     * text it has already produced.
     */
    private static final String[] UNICODE_FOR_PAIRS = {"\u00D4", "\u00F4", "\u1EF9", "\u1EF8", "\u1EF7", "\u1EF6",
        "\u1EF3", "\u1EF2", "\u1EF1", "\u1EF0", "\u1EEF", "\u1EEE", "\u1EED", "\u1EEC", "\u1EEB",
        "\u1EEA", "\u1EE9", "\u1EE8", "\u1EE7", "\u1EE6", "\u1EE5", "\u1EE4", "\u1EE3", "\u1EE2",
        "\u1EE1", "\u1EE0", "\u1EDF", "\u1EDE", "\u1EDD", "\u1EDC", "\u1EDB", "\u1EDA", "\u1ED9",
        "\u1ED8", "\u1ED7", "\u1ED6", "\u1ED5", "\u1ED4", "\u1ED3", "\u1ED2", "\u1ED1", "\u1ED0",
        "\u1ECF", "\u1ECE", "\u1ECD", "\u1ECC", "\u1EC7", "\u1EC6", "\u1EC5", "\u1EC4", "\u1EC3",
        "\u1EC2", "\u1EC1", "\u1EC0", "\u1EBF", "\u1EBE", "\u1EBD", "\u1EBC", "\u1EBB", "\u1EBA",
        "\u1EB9", "\u1EB8", "\u1EB7", "\u1EB6", "\u1EB5", "\u1EB4", "\u1EB3", "\u1EB2", "\u1EB1",
        "\u1EB0", "\u1EAF", "\u1EAE", "\u1EAD", "\u1EAC", "\u1EAB", "\u1EAA", "\u1EA9", "\u1EA8",
        "\u1EA7", "\u1EA6", "\u1EA5", "\u1EA4", "\u1EA3", "\u1EA2", "\u1EA1", "\u1EA0", "\u0169",
        "\u0168", "\u0103", "\u0102", "\u00FD", "\u00FA", "\u00F9", "\u00F5", "\u00F3", "\u00F2",
        "\u00EA", "\u00E9", "\u00E8", "\u00E3", "\u00E2", "\u00E1", "\u00E0", "\u00DD", "\u00DA",
        "\u00D9", "\u00D5", "\u00D3", "\u00D2", "\u00CA", "\u00C9", "\u00C8", "\u00C3", "\u00C2",
        "\u00C1", "\u00C0"};

    /** Lookup from a two-character VNI sequence to its Unicode replacement; built once. */
    private static final Map<String, String> PAIR_TABLE = buildPairTable();

    /**
     * Program entry point.
     *
     * @param args exactly two elements: the input file path and the output file path;
     *             any other count prints a usage line and does nothing
     */
    public static void main(String[] args) {
        System.out.println("Hello World");
        if (args.length != 2) {
            System.out.println("command <File1> <file2>");
            return;
        }
        try {
            conFiles(new File(args[0]), new File(args[1]));
            // Only claim success when the whole file was actually converted.
            System.out.println("DONE");
        } catch (IOException ex) {
            System.err.println("Conversion failed: " + ex);
        }
    }

    /**
     * Converts one string from VNI to precomposed Unicode in a single left-to-right pass.
     *
     * <p>At each position a two-character table match wins; otherwise the character is
     * mapped on its own (or copied unchanged). Because every decision is made against the
     * original input, a character produced by one replacement can never be combined with
     * a neighbour by a later replacement (e.g. {@code a} followed by VNI u-grave stays
     * "a" + u-grave instead of collapsing into a-acute).
     *
     * @param str the VNI text; must not be {@code null}
     * @return the converted text
     */
    private static String convertVni2Unicode(String str) {
        final int length = str.length();
        final StringBuilder converted = new StringBuilder(length);
        int pos = 0;
        while (pos < length) {
            final char current = str.charAt(pos);
            // Cheap pre-filter: only attempt a table lookup when the next char can be a mark.
            if (pos + 1 < length && isVniMark(str.charAt(pos + 1))) {
                final String composed = PAIR_TABLE.get(str.substring(pos, pos + 2));
                if (composed != null) {
                    converted.append(composed);
                    pos += 2;
                    continue;
                }
            }
            converted.append(convertSingle(current));
            pos++;
        }
        return converted.toString();
    }

    /**
     * Maps a character that stands for a Vietnamese letter by itself, i.e. one that was not
     * consumed as part of a two-character sequence.
     *
     * @param c the character to map
     * @return the Unicode letter, or {@code c} itself when no mapping applies
     */
    private static char convertSingle(char c) {
        switch (c) {
            case '\u00D1': return '\u0110'; // DD
            case '\u00F1': return '\u0111'; // dd
            case '\u00D3': return '\u0128'; // I~
            case '\u00F3': return '\u0129'; // i~
            case '\u00D2': return '\u1ECA'; // I.
            case '\u00F2': return '\u1ECB'; // i.
            case '\u00C6': return '\u1EC8'; // I?
            case '\u00E6': return '\u1EC9'; // i?
            case '\u00CE': return '\u1EF4'; // Y.
            case '\u00EE': return '\u1EF5'; // y.
            case '\u00D4': return '\u01A0'; // O+
            case '\u00F4': return '\u01A1'; // o+
            case '\u00D6': return '\u01AF'; // U+
            case '\u00F6': return '\u01B0'; // u+
            default: return c;
        }
    }

    /**
     * Tells whether {@code c} can be the second character of a table entry. Every mark in
     * {@link #VNI_PAIRS} lies in U+00C0..U+00FF; {@link #buildPairTable()} enforces this.
     */
    private static boolean isVniMark(char c) {
        return c >= '\u00C0' && c <= '\u00FF';
    }

    /**
     * Builds {@link #PAIR_TABLE} from the two parallel arrays and checks that they are
     * consistent with each other and with {@link #isVniMark(char)}.
     */
    private static Map<String, String> buildPairTable() {
        if (VNI_PAIRS.length != UNICODE_FOR_PAIRS.length) {
            throw new IllegalStateException("VNI and Unicode tables differ in length: "
                    + VNI_PAIRS.length + " vs " + UNICODE_FOR_PAIRS.length);
        }
        final Map<String, String> table = new HashMap<>(VNI_PAIRS.length * 2);
        for (int i = 0; i < VNI_PAIRS.length; i++) {
            final String pair = VNI_PAIRS[i];
            if (pair.length() != 2 || !isVniMark(pair.charAt(1))) {
                throw new IllegalStateException("Malformed VNI table entry at index " + i);
            }
            table.put(pair, UNICODE_FOR_PAIRS[i]);
        }
        return table;
    }

    /**
     * Applies a list of literal replacements one after another: {@code pattern[0]} is
     * replaced throughout the text, then {@code pattern[1]} is replaced in that result,
     * and so on. Later patterns therefore also see text produced by earlier replacements.
     *
     * @param text    the text to process
     * @param pattern literal strings to search for
     * @param replace replacements, parallel to {@code pattern}
     * @return the text after all replacements
     * @throws IllegalArgumentException if {@code replace} has fewer entries than {@code pattern}
     */
    static String replaceString(String text, final String[] pattern, final String[] replace) {
        if (replace.length < pattern.length) {
            throw new IllegalArgumentException("replace has " + replace.length
                    + " entries but pattern has " + pattern.length);
        }
        String current = text;
        for (int i = 0; i < pattern.length; i++) {
            current = replaceString(current, pattern[i], replace[i]);
        }
        return current;
    }

    /**
     * Replaces every occurrence of the literal {@code pattern} in {@code text}.
     *
     * <p>Delegates to {@link String#replace(CharSequence, CharSequence)}, so an empty
     * pattern terminates (the hand-rolled loop it replaces never advanced in that case).
     *
     * @param text    the text to process
     * @param pattern literal string to search for (not a regular expression)
     * @param replace the replacement text
     * @return the text with all occurrences replaced
     */
    static String replaceString(String text, final String pattern, final String replace) {
        return text.replace(pattern, replace);
    }

    /**
     * Reads {@code infile} as UTF-16 line by line, converts each line, and writes the
     * result to {@code outfile} as UTF-8 using the platform line separator.
     *
     * @param infile  the file to read
     * @param outfile the file to create or overwrite
     * @throws IOException if either file cannot be opened, read or written; both streams
     *                     are closed in every case
     */
    private static void conFiles(File infile, File outfile) throws IOException {
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(infile), StandardCharsets.UTF_16));
             BufferedWriter writer = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(outfile), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(convertVni2Unicode(line));
                writer.newLine();
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment