Created
September 28, 2015 18:39
-
-
Save igorbrigadir/77f94859788db94d8cb1 to your computer and use it in GitHub Desktop.
Turn 1 document per line text file into CoNLL-X formatted sentences. There's probably a much better way of doing this though...
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.util.CoreMap;
| public class RunDependencyParser { | |
| static final String taggerPath = "/stanford-parser-full-2015-04-20/pos-tagger/english-left3words/english-left3words-distsim.tagger"; | |
| static MaxentTagger tagger = new MaxentTagger(taggerPath); | |
| static final String modelPath = "/stanford-parser-full-2015-04-20/models/parser/nndep/english_UD.gz"; | |
| static DependencyParser parser = DependencyParser.loadFromModelFile(modelPath); | |
| /* | |
| * Input file is 1 document (a few sentences) per line. | |
| */ | |
| public static void main(String[] args) throws IOException { | |
| File input = new File("/input.txt"); | |
| File output = new File("/output.txt"); | |
| for (String document : FileUtils.readLines(input)) { | |
| // Append output file | |
| FileUtils.writeStringToFile(output, getConllXString(document), true); | |
| } | |
| } | |
| /* | |
| * Get a chunk of text in CoNLL-X Format: | |
| */ | |
| public static String getConllXString(String text) { | |
| StringBuilder sb = new StringBuilder(); | |
| // Split text into sentences: | |
| DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text)); | |
| for (List<HasWord> sentence : tokenizer) { | |
| // POS Tag & Parse each sentence | |
| List<TaggedWord> tagged = tagger.tagSentence(sentence); | |
| GrammaticalStructure gs = parser.predict(tagged); | |
| // CoNLL-X Output format: | |
| String s = GrammaticalStructure.dependenciesToCoNLLXString(gs, getCoreMap(tagged)); | |
| sb.append(s); | |
| sb.append("\n\n"); | |
| } | |
| return sb.toString(); | |
| } | |
| /* | |
| * Turn a sentence (List of annotated tokens) into a CoreMap object: | |
| */ | |
| public static CoreMap getCoreMap(List<? extends HasWord> sentence) { | |
| CoreLabel sentenceLabel = new CoreLabel(); | |
| List<CoreLabel> tokens = new ArrayList<>(); | |
| int i = 1; | |
| for (HasWord wd : sentence) { | |
| CoreLabel label; | |
| if (wd instanceof CoreLabel) { | |
| label = (CoreLabel) wd; | |
| if (label.tag() == null) { | |
| throw new IllegalArgumentException("Parser requires words " + "with part-of-speech tag annotations"); | |
| } | |
| } else { | |
| label = new CoreLabel(); | |
| label.setValue(wd.word()); | |
| label.setWord(wd.word()); | |
| if (!(wd instanceof HasTag)) { | |
| throw new IllegalArgumentException("Parser requires words " + "with part-of-speech tag annotations"); | |
| } | |
| label.setTag(((HasTag) wd).tag()); | |
| } | |
| label.setIndex(i); | |
| i++; | |
| tokens.add(label); | |
| } | |
| sentenceLabel.set(CoreAnnotations.TokensAnnotation.class, tokens); | |
| return sentenceLabel; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment