Created
July 3, 2018 08:50
-
-
Save rafgugi/529146755a705c724d96a314b842ed60 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package Token; | |
import java.io.BufferedReader; | |
import java.io.BufferedWriter; | |
import java.io.DataOutputStream; | |
import java.io.File; | |
import java.io.FileNotFoundException; | |
import java.io.FileOutputStream; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.io.OutputStreamWriter; | |
import java.nio.file.Files; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.StringTokenizer; | |
import java.util.Map.Entry; | |
/**
 * Word-level dictionary coder.
 *
 * <p>The program reads a text file, splits it into tokens on the characters in
 * {@code tokenlimiter}, assigns every distinct token a 16-bit code (most frequent
 * token gets code 0), writes the token stream as big-endian two-byte codes, and
 * finally decodes that file back into text.
 *
 * <p>Delimiters are not stored anywhere, so the decoded file is the plain
 * concatenation of the tokens without the separators that were in the input.
 */
public class Token {
    /** Token/frequency entries sorted by descending frequency; set when the dictionary is built. */
    private static List<Entry<String, Integer>> tableData = null;
    /** Occurrence count of every token seen in the input. */
    private static final Map<String, Integer> map = new HashMap<String, Integer>();
    /** token -> code. */
    private static final Map<String, Short> dictionary_encode = new HashMap<>();
    /** code -> token. */
    private static final Map<Short, String> dictionary_decode = new HashMap<>();
    private static final String table_encode = "bahan/table-encode.txt";
    private static final String inputFile = "bahan/test-input.txt";
    private static final String outputEncode = "bahan/test-encode";
    private static final String outputDecode = "bahan/test-decode";
    /** Radix of one output byte. */
    private static final int ONEBYTE = 256;
    /** Number of bytes written per token code. */
    private static final int HOWMANY = 2;
    /** Codes are stored as {@code short}, so at most 2^16 distinct tokens can be represented. */
    private static final int MAX_CODES = 1 << Short.SIZE;
    /** Characters that separate tokens; they are dropped by the tokenizer. */
    private static final String tokenlimiter = " !#$%&'()*+,-./:;<=>?@[\"\\\n]_`{|}~";
    /** Code of the dictionary entry registered most recently (kept for compatibility). */
    static short exch;

    /**
     * Runs the full pipeline: tokenize the input, build (or load) the dictionary,
     * write the encoded file, then decode it again.
     *
     * @param args unused
     * @throws IOException if the dictionary, encoded or decoded file cannot be written or read
     * @throws IllegalStateException if the input has more distinct tokens than fit in 16 bits,
     *         or a token is missing from a loaded dictionary
     */
    public static void main(String[] args) throws IOException {
        String urls = usingBufferedReader(inputFile);
        // true: derive the dictionary from the input; false: load it from table_encode.
        boolean generateDictData = true;

        // The output file is opened (and truncated) up front, before any other work.
        DataOutputStream out = null;
        try {
            out = new DataOutputStream(new FileOutputStream(outputEncode));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            System.exit(1);
        }

        // try-with-resources guarantees the encoded file is closed even if a step fails.
        try (DataOutputStream encoded = out) {
            List<String> alphanumeric = tokenize(urls, generateDictData);
            if (generateDictData) {
                printFrequencies();
                buildDictionary();
            } else {
                readDictionary();
            }
            encode(alphanumeric, encoded);
        }

        decode();
        System.out.println("<DONE>");
    }

    /** Splits {@code text} into tokens, optionally counting occurrences into {@code map}. */
    private static List<String> tokenize(String text, boolean countFrequencies) {
        List<String> tokens = new ArrayList<>();
        StringTokenizer multiTokenizer = new StringTokenizer(text, tokenlimiter);
        while (multiTokenizer.hasMoreTokens()) {
            String word = multiTokenizer.nextToken();
            tokens.add(word);
            if (countFrequencies) {
                Integer seen = map.get(word);
                map.put(word, seen == null ? 1 : seen + 1);
            }
        }
        return tokens;
    }

    /** Prints every token with its occurrence count, one per line. */
    private static void printFrequencies() {
        for (Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " " + entry.getValue());
        }
    }

    /** Sorts tokens by descending frequency, assigns codes in that order and saves the table. */
    private static void buildDictionary() throws IOException {
        tableData = entriesSortedByValues(map);
        int length = tableData.size();
        // Fail before touching the dictionary file: codes beyond 16 bits would collide.
        if (length > MAX_CODES) {
            throw new IllegalStateException(
                    "Too many distinct tokens (" + length + "); at most " + MAX_CODES + " are supported");
        }
        try (BufferedWriter dic_file = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(table_encode))))) {
            for (int i = 0; i < length; i++) {
                String token = tableData.get(i).getKey();
                // Show the two bytes that will represent this code.
                short[] save_byte = int2SortArray(i);
                System.out.print(save_byte[0] + " " + save_byte[1] + ":");
                System.out.print(i + ": ");
                registerToken(token, i);
                System.out.println(token);
                // The line number in the table file is the token's code.
                dic_file.write(token);
                dic_file.newLine();
            }
        }
    }

    /** Stores {@code token} under code {@code index} in both lookup tables. */
    private static void registerToken(String token, int index) {
        if (index >= MAX_CODES) {
            throw new IllegalStateException(
                    "Dictionary overflow: more than " + MAX_CODES + " entries");
        }
        // Indices above Short.MAX_VALUE wrap to negative shorts; the byte conversion
        // in int2SortArray / the decoder maps them back consistently.
        exch = (short) index;
        dictionary_encode.put(token, exch);
        dictionary_decode.put(exch, token);
    }

    /** Writes the code of every token to {@code out}, most significant byte first. */
    private static void encode(List<String> tokens, DataOutputStream out) throws IOException {
        for (String token : tokens) {
            Short code = dictionary_encode.get(token);
            if (code == null) {
                throw new IllegalStateException("Token not in dictionary: " + token);
            }
            // Treat the code as unsigned so codes >= 32768 keep their bit pattern.
            short[] bytes = int2SortArray(code & 0xFFFF);
            for (short b : bytes) {
                out.writeByte(b);
            }
        }
    }

    /** Reads the encoded file back and writes the corresponding tokens to the decode file. */
    private static void decode() throws IOException {
        byte[] isi_file = Files.readAllBytes(new File(outputEncode).toPath());
        System.out.println(isi_file.length);
        if (isi_file.length % HOWMANY != 0) {
            throw new IOException("Encoded file length " + isi_file.length
                    + " is not a multiple of " + HOWMANY);
        }
        try (BufferedWriter dec_file = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(outputDecode))))) {
            short[] split = new short[HOWMANY];
            for (int i = 0; i < isi_file.length; i += HOWMANY) {
                // Collect the bytes of one code as unsigned values.
                for (int j = 0; j < HOWMANY; j++) {
                    split[j] = unsignedToBytes(isi_file[i + j]);
                }
                short bytes = (short) sortArray2Int(split);
                String string = dictionary_decode.get(bytes);
                if (string == null) {
                    System.err.println("Out of bound: " + bytes);
                    // Unknown code is a failure, so report a non-zero status.
                    System.exit(1);
                }
                dec_file.write(string);
            }
        }
    }

    /**
     * Returns the entries of {@code map} sorted by descending value.
     *
     * <p>The sort is stable, so entries with equal values keep the map's iteration order.
     *
     * @param map the map whose entries are sorted; it is not modified
     * @return a new list of the map's entries, largest value first
     */
    static <K, V extends Comparable<? super V>> List<Entry<K, V>> entriesSortedByValues(Map<K, V> map) {
        List<Entry<K, V>> sortedEntries = new ArrayList<Entry<K, V>>(map.entrySet());
        Collections.sort(sortedEntries, new Comparator<Entry<K, V>>() {
            @Override
            public int compare(Entry<K, V> e1, Entry<K, V> e2) {
                return e2.getValue().compareTo(e1.getValue());
            }
        });
        return sortedEntries;
    }

    /**
     * Reads a whole text file into a string, terminating every line with {@code "\n"}.
     *
     * <p>On an I/O error the stack trace is printed and whatever was read so far is returned.
     *
     * @param filePath path of the file to read
     * @return the file content with normalized line endings
     */
    public static String usingBufferedReader(String filePath) {
        StringBuilder contentBuilder = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
            String sCurrentLine;
            while ((sCurrentLine = br.readLine()) != null) {
                contentBuilder.append(sCurrentLine).append("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return contentBuilder.toString();
    }

    /**
     * Loads the dictionary from the table file: the token on line {@code i}
     * (zero-based) gets code {@code i}.
     *
     * <p>I/O errors are reported with a stack trace and otherwise ignored.
     *
     * @throws IOException declared for compatibility; I/O errors are handled internally
     * @throws IllegalStateException if the file has more lines than there are 16-bit codes
     */
    public static void readDictionary() throws IOException {
        try (BufferedReader b = new BufferedReader(new FileReader(new File(table_encode)))) {
            String readLine;
            int i = 0;
            while ((readLine = b.readLine()) != null) {
                registerToken(readLine, i);
                i++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts {@code x} into its {@code dimension} lowest digits in base {@code radix},
     * most significant digit first.
     *
     * <p>Digits are always in {@code [0, radix)}; negative values wrap around
     * (floor modulo), so with radix 256 the result is the two's-complement byte pattern.
     *
     * @param x         value to convert
     * @param dimension number of digits to produce
     * @param radix     base of the digits; expected to be positive
     * @return array of {@code dimension} digits, index 0 being the most significant
     */
    public static short[] int2SortArray(int x, int dimension, int radix) {
        short[] save_byte = new short[dimension];
        for (int j = dimension - 1; j >= 0; j--) {
            int digit = x % radix;
            int rest = x / radix;
            if (digit < 0) {
                // Java's % and / truncate toward zero; shift to floor semantics.
                digit += radix;
                rest -= 1;
            }
            save_byte[j] = (short) digit;
            x = rest;
        }
        return save_byte;
    }

    /**
     * Converts {@code x} into {@code HOWMANY} bytes (as unsigned values), big-endian.
     *
     * @param x value to convert
     * @return the byte values, most significant first
     */
    public static short[] int2SortArray(int x) {
        return int2SortArray(x, HOWMANY, ONEBYTE);
    }

    /**
     * Combines the first {@code dimension} digits of {@code bytes} (most significant first)
     * into a single integer in base {@code one_byte}.
     *
     * @param bytes     digits, index 0 being the most significant
     * @param dimension number of digits to read
     * @param one_byte  base of the digits
     * @return the combined value
     */
    public static int sortArray2Int(short[] bytes, int dimension, int one_byte) {
        int x = 0;
        for (int i = 0; i < dimension; i++) {
            x = x * one_byte + bytes[i];
        }
        return x;
    }

    /**
     * Combines {@code HOWMANY} big-endian byte values into one integer.
     *
     * @param bytes byte values, most significant first
     * @return the combined value
     */
    public static int sortArray2Int(short[] bytes) {
        return sortArray2Int(bytes, HOWMANY, ONEBYTE);
    }

    /**
     * Returns the unsigned value (0..255) of a byte.
     *
     * @param b the byte
     * @return {@code b} interpreted as unsigned
     */
    public static short unsignedToBytes(byte b) {
        return (short) (b & 0xFF);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment