Created
July 31, 2015 09:49
-
-
Save anhldbk/9896ff9748672f02280f to your computer and use it in GitHub Desktop.
Sequence Handler for Vietnamese (Mallet)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cc.mallet.pipe.Input2CharSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.types.Instance;
import cc.mallet.types.SingleInstanceIterator;
import cc.mallet.types.TokenSequence;
import vn.hus.nlp.tokenizer.VietTokenizer;
import vn.viettel.cyberspace.commons.Utils;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
/** | |
* Created by anhld on 7/13/15. | |
*/ | |
public class CharSequence2VietnameseTokenSequence extends Pipe implements Serializable { | |
private VietTokenizer tokenizer = new VietTokenizer(); | |
private List<String> stopWords = null; | |
private String specialCharacter = ":“”~`!@#$%^&*()-{[]}|/?.>,<\\'\""; | |
{ | |
String input = Utils.readFile("data/stoplists/vi.txt").toLowerCase(); | |
stopWords = Arrays.asList(input.split("\n")); | |
} | |
public Instance pipe(Instance carrier) { | |
List<String> tokens = tokenize(carrier.getData().toString()); | |
TokenSequence ts = new TokenSequence(); | |
for (String token : tokens) { | |
if(token.length() < 2){ | |
continue; | |
} | |
if(!token.contains(" ")){ | |
continue; | |
} | |
ts.add(token); | |
} | |
carrier.setData(ts); | |
return carrier; | |
} | |
public List<String> tokenize(String input) { | |
String[] tokens = tokenizer.tokenize(input); | |
if ((tokens == null) || (tokens.length == 0)) { | |
return null; | |
} | |
StringTokenizer defaultTokenizer = new StringTokenizer(tokens[0]); | |
List<String> ret = new ArrayList<>(); | |
while (defaultTokenizer.hasMoreTokens()) { | |
ret.add(defaultTokenizer.nextToken()); | |
} | |
ret = filter(ret); | |
return ret; | |
} | |
public List<String> filter(List<String> tokens) { | |
List<String> ret = new ArrayList<>(); | |
for (String token : tokens) { | |
if (specialCharacter.contains(token)) { | |
continue; | |
} | |
if (stopWords.contains(token)) { | |
continue; | |
} | |
if (token.matches("[0-9]+")) { | |
continue; // number, we don't care | |
} | |
ret.add(token.replace('_', ' ').toLowerCase()); | |
} | |
return ret; | |
} | |
public static void main(String[] args) { | |
try { | |
for (int i = 0; i < args.length; i++) { | |
Instance carrier = new Instance(new File(args[i]), null, null, null); | |
SerialPipes p = new SerialPipes(new Pipe[]{ | |
new Input2CharSequence(), | |
new CharSequence2VietnameseTokenSequence() | |
} | |
); | |
carrier = p.newIteratorFrom(new SingleInstanceIterator(carrier)).next(); | |
TokenSequence ts = (TokenSequence) carrier.getData(); | |
System.out.println("==="); | |
System.out.println(args[i]); | |
System.out.println(ts.toString()); | |
} | |
} catch (Exception e) { | |
System.out.println(e); | |
e.printStackTrace(); | |
} | |
} | |
// Serialization | |
private static final long serialVersionUID = 1; | |
private static final int CURRENT_SERIAL_VERSION = 0; | |
private void writeObject(ObjectOutputStream out) throws IOException { | |
out.writeInt(CURRENT_SERIAL_VERSION); | |
out.writeObject(tokenizer); | |
} | |
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { | |
int version = in.readInt(); | |
tokenizer = (VietTokenizer) in.readObject(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment