Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save anhldbk/9896ff9748672f02280f to your computer and use it in GitHub Desktop.
Save anhldbk/9896ff9748672f02280f to your computer and use it in GitHub Desktop.
Sequence Handler for Vietnamese (Mallet)
import cc.mallet.pipe.Input2CharSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.types.Instance;
import cc.mallet.types.SingleInstanceIterator;
import cc.mallet.types.TokenSequence;
import vn.hus.nlp.tokenizer.VietTokenizer;
import vn.viettel.cyberspace.commons.Utils;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
/**
 * Mallet pipe that replaces the data of an {@link Instance} with a
 * {@link TokenSequence} of Vietnamese words.
 *
 * <p>The instance data is converted with {@code toString()}, segmented by
 * {@link VietTokenizer}, split on whitespace and then filtered: punctuation,
 * stop words (read from {@code data/stoplists/vi.txt}) and all-digit tokens are
 * dropped, {@code '_'} is replaced by a space and the token is lower-cased.
 *
 * <p>Created by anhld on 7/13/15.
 */
public class CharSequence2VietnameseTokenSequence extends Pipe implements Serializable {

    /** Stop-word list, one entry per line. */
    private static final String STOP_WORD_FILE = "data/stoplists/vi.txt";

    /** Characters that make up tokens treated as pure punctuation. */
    private static final String SPECIAL_CHARACTERS = ":“”~`!@#$%^&*()-{[]}|/?.>,<\\'\"";

    /** Matches tokens made only of ASCII digits; compiled once instead of per token. */
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    private VietTokenizer tokenizer = new VietTokenizer();

    /** Lower-cased, trimmed stop words; a set so that lookups do not scan the whole list. */
    private Set<String> stopWords = loadStopWords();

    /**
     * Reads the stop-word file and returns its non-blank lines, lower-cased and trimmed.
     * Trimming also removes the {@code '\r'} left behind by CRLF line endings.
     */
    private static Set<String> loadStopWords() {
        String content = Utils.readFile(STOP_WORD_FILE).toLowerCase(Locale.ROOT);
        Set<String> words = new HashSet<>();
        for (String line : content.split("\n")) {
            String word = line.trim();
            if (!word.isEmpty()) {
                words.add(word);
            }
        }
        return words;
    }

    /**
     * Tokenizes the carrier's data and stores the surviving tokens back into the carrier.
     *
     * @param carrier instance whose data is read via {@code toString()}
     * @return the same instance, its data replaced by a {@link TokenSequence}
     */
    public Instance pipe(Instance carrier) {
        TokenSequence ts = new TokenSequence();
        for (String token : tokenize(carrier.getData().toString())) {
            if (token.length() < 2) {
                continue;
            }
            // NOTE(review): only tokens containing a space are kept, i.e. tokens that had
            // a '_' before filter() replaced it. Presumably these are the multi-syllable
            // words joined by the tokenizer - confirm that dropping every single-syllable
            // word is intended.
            if (!token.contains(" ")) {
                continue;
            }
            ts.add(token);
        }
        carrier.setData(ts);
        return carrier;
    }

    /**
     * Segments {@code input} with the Vietnamese tokenizer, splits every returned
     * segment on whitespace and passes the result through {@link #filter(List)}.
     *
     * @param input raw text
     * @return the filtered tokens; an empty list (never {@code null}) when the
     *         tokenizer produces nothing
     */
    public List<String> tokenize(String input) {
        List<String> words = new ArrayList<>();
        String[] segments = tokenizer.tokenize(input);
        if (segments == null) {
            return words;
        }
        // Consume every segment, not just the first, so that no text is silently
        // dropped when the tokenizer returns more than one element.
        for (String segment : segments) {
            if (segment == null) {
                continue;
            }
            StringTokenizer splitter = new StringTokenizer(segment);
            while (splitter.hasMoreTokens()) {
                words.add(splitter.nextToken());
            }
        }
        return filter(words);
    }

    /**
     * Drops punctuation, stop words and all-digit tokens, and normalises the rest
     * by lower-casing and replacing {@code '_'} with a space.
     *
     * @param tokens raw tokens; {@code null} is treated as empty
     * @return a new list holding the normalised tokens that survived
     */
    public List<String> filter(List<String> tokens) {
        List<String> kept = new ArrayList<>();
        if (tokens == null) {
            return kept;
        }
        for (String token : tokens) {
            if (token == null || isPunctuation(token)) {
                continue;
            }
            if (DIGITS.matcher(token).matches()) {
                continue; // number, we don't care
            }
            String lowered = token.toLowerCase(Locale.ROOT);
            String word = lowered.replace('_', ' ');
            // The stop list is lower-cased, so compare lower-cased forms. Both the
            // '_' and the space spelling are checked because a multi-word entry may
            // be written either way in the list.
            if (stopWords.contains(lowered) || stopWords.contains(word)) {
                continue;
            }
            kept.add(word);
        }
        return kept;
    }

    /** Returns true when every character of {@code token} is a special character (or it is empty). */
    private static boolean isPunctuation(String token) {
        for (int i = 0; i < token.length(); i++) {
            if (SPECIAL_CHARACTERS.indexOf(token.charAt(i)) < 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Runs each file named in {@code args} through the pipe and prints its tokens.
     *
     * @param args paths of the files to tokenize
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            return;
        }
        try {
            // Build the pipeline once; constructing the pipe creates a tokenizer and
            // reads the stop list, which need not be repeated for every file.
            SerialPipes p = new SerialPipes(new Pipe[]{
                    new Input2CharSequence(),
                    new CharSequence2VietnameseTokenSequence()
            });
            for (int i = 0; i < args.length; i++) {
                Instance carrier = new Instance(new File(args[i]), null, null, null);
                carrier = p.newIteratorFrom(new SingleInstanceIterator(carrier)).next();
                TokenSequence ts = (TokenSequence) carrier.getData();
                System.out.println("===");
                System.out.println(args[i]);
                System.out.println(ts.toString());
            }
        } catch (Exception e) {
            System.out.println(e);
            e.printStackTrace();
        }
    }

    // Serialization
    private static final long serialVersionUID = 1;
    private static final int CURRENT_SERIAL_VERSION = 0;

    /** Writes the format version followed by the tokenizer; the stop list is not stored. */
    private void writeObject(ObjectOutputStream out) throws IOException {
        out.writeInt(CURRENT_SERIAL_VERSION);
        out.writeObject(tokenizer);
    }

    /**
     * Restores the tokenizer and reloads the stop list. Field initializers do not
     * run during deserialization, so without the reload {@code stopWords} would be
     * {@code null} in a deserialized pipe.
     */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.readInt(); // format version; must be consumed, only one format exists
        tokenizer = (VietTokenizer) in.readObject();
        stopWords = loadStopWords();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment