Skip to content

Instantly share code, notes, and snippets.

@igorbrigadir
Created September 28, 2015 18:39
Show Gist options
  • Save igorbrigadir/77f94859788db94d8cb1 to your computer and use it in GitHub Desktop.
Save igorbrigadir/77f94859788db94d8cb1 to your computer and use it in GitHub Desktop.
Turns a text file containing one document per line into CoNLL-X formatted sentences, using the Stanford POS tagger and neural-network dependency parser. There's probably a much better way of doing this though...
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.util.CoreMap;
/**
 * Turns a text file with one document (a few sentences) per line into
 * CoNLL-X formatted dependency parses, using the Stanford POS tagger and
 * the neural-network dependency parser.
 *
 * <p>NOTE: both models are loaded eagerly in static initializers, so merely
 * touching this class loads the tagger and parser from the hard-coded paths
 * below.
 */
public class RunDependencyParser {

    /** Path to the left3words distsim POS tagger model from the Stanford parser distribution. */
    static final String taggerPath =
            "/stanford-parser-full-2015-04-20/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    static MaxentTagger tagger = new MaxentTagger(taggerPath);

    /** Path to the neural-network dependency parser model (English, Universal Dependencies). */
    static final String modelPath =
            "/stanford-parser-full-2015-04-20/models/parser/nndep/english_UD.gz";
    static DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    /**
     * Parses every line of the input file as one document and appends its
     * CoNLL-X representation to the output file.
     *
     * <p>Usage: {@code RunDependencyParser [inputFile [outputFile]]}.
     * Defaults to {@code /input.txt} and {@code /output.txt} when the
     * corresponding argument is absent, preserving the original behavior.
     *
     * @param args optional input path ({@code args[0]}) and output path ({@code args[1]})
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        File input = new File(args.length > 0 ? args[0] : "/input.txt");
        File output = new File(args.length > 1 ? args[1] : "/output.txt");
        // Pass UTF-8 explicitly: the charset-less FileUtils overloads are
        // deprecated and fall back to the platform default encoding, which is
        // not portable and silently corrupts non-ASCII text on some systems.
        for (String document : FileUtils.readLines(input, StandardCharsets.UTF_8)) {
            // Append (last argument true) so output accumulates across documents.
            FileUtils.writeStringToFile(output, getConllXString(document), StandardCharsets.UTF_8, true);
        }
    }

    /**
     * Converts a chunk of text into CoNLL-X format: the text is split into
     * sentences, each sentence is POS-tagged and dependency-parsed, and the
     * parses are concatenated with a blank line between sentences.
     *
     * @param text raw text containing one or more sentences
     * @return the CoNLL-X representation of every sentence, each followed by a blank line
     */
    public static String getConllXString(String text) {
        StringBuilder sb = new StringBuilder();
        // DocumentPreprocessor performs sentence splitting and tokenization.
        DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
        for (List<HasWord> sentence : tokenizer) {
            // POS-tag the sentence, then predict its dependency structure.
            List<TaggedWord> tagged = tagger.tagSentence(sentence);
            GrammaticalStructure gs = parser.predict(tagged);
            // Render the dependencies in CoNLL-X; the CoreMap supplies the tokens.
            String s = GrammaticalStructure.dependenciesToCoNLLXString(gs, getCoreMap(tagged));
            sb.append(s);
            // CoNLL-X separates sentences with a blank line.
            sb.append("\n\n");
        }
        return sb.toString();
    }

    /**
     * Wraps a sentence (a list of tagged tokens) in a {@link CoreMap} carrying
     * a {@code TokensAnnotation}, as required by
     * {@code GrammaticalStructure.dependenciesToCoNLLXString}.
     *
     * @param sentence tokens of one sentence; each must carry a POS tag
     * @return a CoreMap whose TokensAnnotation holds the tokens, indexed from 1
     * @throws IllegalArgumentException if any token lacks a POS tag
     */
    public static CoreMap getCoreMap(List<? extends HasWord> sentence) {
        CoreLabel sentenceLabel = new CoreLabel();
        List<CoreLabel> tokens = new ArrayList<>(sentence.size());
        // CoNLL-X token indices are 1-based.
        int i = 1;
        for (HasWord wd : sentence) {
            CoreLabel label;
            if (wd instanceof CoreLabel) {
                // Already a CoreLabel: reuse it, but it must carry a tag.
                label = (CoreLabel) wd;
                if (label.tag() == null) {
                    throw new IllegalArgumentException("Parser requires words " + "with part-of-speech tag annotations");
                }
            } else {
                // Build a CoreLabel from the bare word; the tag is mandatory.
                label = new CoreLabel();
                label.setValue(wd.word());
                label.setWord(wd.word());
                if (!(wd instanceof HasTag)) {
                    throw new IllegalArgumentException("Parser requires words " + "with part-of-speech tag annotations");
                }
                label.setTag(((HasTag) wd).tag());
            }
            label.setIndex(i);
            i++;
            tokens.add(label);
        }
        sentenceLabel.set(CoreAnnotations.TokensAnnotation.class, tokens);
        return sentenceLabel;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment