package foo;

import edu.stanford.nlp.fsm.ExactGrammarCompactor;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.NumberRangeFileFilter;
import edu.stanford.nlp.io.NumberRangesFileFilter;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.parser.ViterbiParser;
import edu.stanford.nlp.parser.KBestViterbiParser;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.international.arabic.ArabicTreebankLanguagePack;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Numberer;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.ScoredObject;
import edu.stanford.nlp.util.Timing;

import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.zip.GZIPOutputStream;
public class RunStanfordParser {

    /**
     * @param args Arg1 - full path of the serialized Stanford Parser grammar (englishPCFG.ser.gz), Arg2 - file to parse
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        // Input format: path to the serialized grammar, and path to the file to parse
        String parserFileOrUrl = args[0];
        String fileToParse = args[1];

        LexicalizedParser lp = new LexicalizedParser(parserFileOrUrl); // create a new parser
        //lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want

        // Open the input file; each line is tokenized and parsed separately
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileToParse)));
        StringReader sr;   // each line is re-read through its own reader because the tokenizer works on a Reader, not a String
        PTBTokenizer tkzr; // tokenizer object
        WordStemmer ls = new WordStemmer(); // stemmer/lemmatizer object

        // Read the file line by line
        String strLine;
        while ((strLine = br.readLine()) != null) {
            System.out.println("Tokenizing and Parsing: " + strLine); // print current line to console

            // Run the Stanford tokenizer over the current line
            sr = new StringReader(strLine);
            tkzr = PTBTokenizer.newPTBTokenizer(sr);
            List toks = tkzr.tokenize();
            System.out.println("tokens: " + toks);

            Tree parse = (Tree) lp.apply(toks); // finally, we actually get to parse something

            // Output Option 1: print various data by accessing it programmatically
            // Get words, stemmed words and POS tags
            ArrayList<String> words = new ArrayList<String>();
            ArrayList<String> stems = new ArrayList<String>();
            ArrayList<String> tags = new ArrayList<String>();

            // Get words and tags
            for (TaggedWord tw : parse.taggedYield()) {
                words.add(tw.word());
                tags.add(tw.tag());
            }

            // Get stems
            ls.visitTree(parse); // apply the stemmer to the tree
            for (TaggedWord tw : parse.taggedYield()) {
                stems.add(tw.word());
            }

            // Get the dependency structure
            TreebankLanguagePack tlp = new PennTreebankLanguagePack();
            GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
            GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
            Collection tdl = gs.typedDependenciesCollapsed();

            // And print!
            System.out.println("words: " + words);
            System.out.println("POStags: " + tags);
            System.out.println("stemmedWordsAndTags: " + stems);
            System.out.println("typedDependencies: " + tdl);

            // Output Option 2: print various data using TreePrint
            // Available TreePrint formats include:
            //   "penn"                        constituency parse
            //   "oneline"
            //   rootLabelOnlyFormat
            //   "words"
            //   "wordsAndTags"                unstemmed words and POS tags
            //   "dependencies"                unlabeled dependency parse
            //   "typedDependencies"           labeled dependency parse
            //   "typedDependenciesCollapsed"
            //   "latexTree"
            //   "collocations"
            //   "semanticGraph"
            // Print using TreePrint with the chosen options:
            //TreePrint tp = new TreePrint("wordsAndTags,typedDependencies");
            //tp.printTree(parse);
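            // A minimal sketch of Output Option 2, kept commented out so it does not change the
            // output above; the format string is just one plausible combination of the options
            // listed, and the PrintWriter target is an illustrative choice:
            //TreePrint tp2 = new TreePrint("penn,typedDependenciesCollapsed");
            //tp2.printTree(parse, new PrintWriter(System.out, true));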
            System.out.println(); // blank line to separate the output for each input line
        }
    }
}
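// A minimal usage sketch for the RunStanfordParser class above (hypothetical file and jar names;
// englishPCFG.ser.gz is the serialized grammar shipped with the Stanford Parser distribution, and
// the exact classpath depends on the parser version used):
//
//   javac -cp stanford-parser.jar foo/RunStanfordParser.java
//   java  -cp stanford-parser.jar:. foo.RunStanfordParser englishPCFG.ser.gz sentences.txt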
package foo;

// Standard Java libraries
import java.io.*;
import java.util.*;

import syntax.Protein;
import syntax.AnnotationType;
import syntax.Token;
import util.Pair;
import util.Util;

// Stanford Parser
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.PTBTokenizer;          // tokenizer
import edu.stanford.nlp.process.DocumentPreprocessor;  // sentence splitter

public class RunStanfordParser2 {

    // Stanford Parser components
    DocumentPreprocessor splitter; // sentence splitter
    PTBTokenizer tkzr;             // tokenizer object
    WordStemmer ls;                // stemmer/lemmatizer object
    LexicalizedParser lp;          // parser object
    StringReader sr;
    /**
     * Construct a wrapper around the Stanford Parser for creating .dep and .morph files
     * @param parserFileOrUrl path to the serialized Stanford Parser grammar
     */
    public RunStanfordParser2(String parserFileOrUrl) {
        splitter = new DocumentPreprocessor();
        ls = new WordStemmer();
        lp = new LexicalizedParser(parserFileOrUrl);
        //lp.setOptionFlags(new String[]{"-maxLength", "80", "-retainTmpSubcategories"}); // set max sentence length if you want
    }
    /**
     * Return a filename without its extension
     * @param fullpath path to a file
     * @return the bare filename, with directory and extension stripped
     */
    private static String getFileNameFromPath(String fullpath) {
        String filename = new File(fullpath).getName();
        return filename.substring(0, filename.lastIndexOf('.'));
    }

    /**
     * Does a file with the expected suffix exist? If not, throw an exception.
     * @param path directory containing the file
     * @param filename filename without suffix
     * @param suffix expected suffix (e.g. ".txt")
     * @throws Exception if the file is missing
     */
    public void existsFileWithSuffix(String path, String filename, String suffix) throws Exception {
        File fSS = new File(path + filename + suffix);
        if (!fSS.exists())
            throw new Exception("\n" + fSS.toString() + "\n" + "missing preprocessed file with suffix -" + suffix);
    }
    public void processGENIAfiles(String inputPath, String outputPath) throws Exception {
        // Create the output directory if it doesn't exist
        if (!(new File(outputPath).exists())) { new File(outputPath).mkdirs(); }

        // Append a slash to the paths for ease of file reading/creation
        if (!outputPath.endsWith("/")) { outputPath = outputPath + "/"; }
        if (!inputPath.endsWith("/")) { inputPath = inputPath + "/"; }

        // Look at the .txt files in the input path
        File dir = new File(inputPath);
        FileFilter fileFilter = new FileFilter() {
            public boolean accept(File file) {
                return file.isFile() && file.getName().endsWith(".txt");
            }
        };
        File[] files = dir.listFiles(fileFilter);
        System.out.println("Processing " + files.length + " files...");

        int count = 0;
        int total = files.length;
        for (File f : files) {
            String filename = getFileNameFromPath(f.toString());
            System.out.println("File:" + filename);

            // Check that all necessary preprocessed files exist
            existsFileWithSuffix(inputPath, filename, ".txt");
            existsFileWithSuffix(inputPath, filename, ".standoff");
            existsFileWithSuffix(inputPath, filename, ".tagNew");
            existsFileWithSuffix(inputPath, filename, ".a1");
            String txtFile = inputPath + filename + ".txt";
            String standoffFile = inputPath + filename + ".standoff";
            String tagFile = inputPath + filename + ".tagNew";
            //String tagFile = inputPath+filename+".tag";
            String a1File = inputPath + filename + ".a1";

            // Workflow:
            // 1. Load the standoff file to get sentence offsets
            // 2. Load the text file as a whole (for word-offset calculation after the parsing step)
            // 3. Run the parser loop: read each set of tokens in the tag file (one sentence per block) and process it there
            HashMap<Integer, int[]> standoff = loadStandoffFile(standoffFile);
            String fullText = Util.readFileAsString(txtFile);
            parseTagFile(tagFile, standoff, fullText, outputPath, filename); // parser loop; see that method for the bulk of the text processing
            count++;
            System.out.println("parsed file " + count + "/" + total);
        }
    }
    /**
     * HACK ALERT: GENIA standoff data is sometimes missing for the last sentence; this hack fixes it
     * by falling back to "end of previous sentence + 1" through the end of the text.
     * @param standoff sentence number -> {start, stop} character offsets
     * @param index sentence number
     * @param fullText full text of the current document
     * @return a (start, stop) character-offset pair for the sentence
     */
    private Pair<Integer, Integer> getStandOff(HashMap<Integer, int[]> standoff, int index, String fullText) {
        int[] standOffArr = standoff.get(index);
        int start = 0;
        int stop = 0;
        try {
            start = standOffArr[0];
            stop = standOffArr[1];
        }
        catch (Exception e) {
            start = standoff.get(index - 1)[1] + 1;
            stop = fullText.length();
        }
        return new Pair<Integer, Integer>(start, stop);
    }
    /**
     * Parse the tag file produced by the GENIA tagger, do some processing, and reconcile this with the .a1 file.
     * Then print the .dep and .morph output files.
     * @param path path to the tag file for the current document
     * @param standoff sentence offsets for all sentences in the current document
     * @param fullText full text of the current document
     * @param outputPath path for the output files
     * @param filename filename without suffix for the current document
     * @throws Exception
     */
    private void parseTagFile(String path, HashMap<Integer, int[]> standoff, String fullText, String outputPath, String filename) throws Exception {
        int sentenceCounter = 0;
        try {
            // Prepare the output files
            File fDep = new File(outputPath + filename + ".dep");
            File fMorph = new File(outputPath + filename + ".morph");
            fDep.createNewFile();
            fMorph.createNewFile();
            BufferedWriter depFile = new BufferedWriter(new FileWriter(fDep));
            BufferedWriter morphFile = new BufferedWriter(new FileWriter(fMorph));

            // Reader for the tag file of the current document
            File f = new File(path);
            BufferedReader reader = new BufferedReader(new FileReader(f));
            String line = null;
            int tokenIndex = 0;
            ArrayList<Token> tokens = new ArrayList<Token>();
            while ((line = reader.readLine()) != null) {
                // NOTE: The input file is a series of token lines, with a blank line separating the tokens of different sentences.
                // We accumulate tokens until we hit a blank line, and then process the sentence.
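                // A hypothetical tag-file line (tab-separated; the values are illustrative only,
                // real lines come from the GENIA tagger):  binding<TAB>binding<TAB>NN<TAB>I-NP<TAB>O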
                if (!line.isEmpty()) {
                    String elements[] = line.split("\\t"); // text, stem, pos, chunkTag, neTag
                    String tokText = elements[0];
                    String tokStem = elements[1];
                    String tokPOS = elements[2];
                    String nounChunk = elements[3];
                    String neChunk = elements[4];
                    Token tok = new Token(tokenIndex, tokText, tokStem, tokPOS, nounChunk, neChunk);
                    tokens.add(tok);
                    tokenIndex++;
                }
                else { // empty line: parse the sentence whose tokens were collected
                    //System.out.println("sentence#="+sentenceCounter);
                    Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair = null;
                    try {
                        // Workflow:
                        // 1. Get sentence start and stop offsets
                        // 2. Parse the sentence
                        // 3. Add GENIA tagging information to the parsed tokens
                        // 4. Reconcile the parsed sentence with all applicable entities in the entity file
                        // 5. Print the .dep and .morph files
                        // 6. In case of a failed parse, print the .morph file and an empty entry in the .dep file

                        // Get standoff data
                        Pair<Integer, Integer> st = getStandOff(standoff, sentenceCounter, fullText);
                        int start = st.a;
                        int stop = st.b;

                        // Parse
                        pair = parseSentence(tokens, start, stop, fullText);

                        // Add GENIA info to the parsed tokens: noun and named-entity chunking
                        // NOTE: Currently, tokens omitted from the parse representation (prepositions, etc.) are not added back in
                        HashMap<Integer, Token> parsedToks = pair.a;
                        addGENIAInfoToTokens(tokens, parsedToks);

                        // Print the sentence out
                        printSentenceToDepAndMorph(pair, depFile, morphFile);
                    } catch (Exception e) {
                        // When we cannot parse a sentence:
                        // 1. Catch the exception and print an error message
                        System.err.println("Bad Parse on " + filename + ".txt" + ", sentence #" + (sentenceCounter + 1));
                        // 2. Print the tokens out to the .morph file anyway
                        // 3. Make an empty entry in the .dep file
                        printSentenceToDepAndMorph(tokens, depFile, morphFile);
                    }
                    sentenceCounter++;
                    tokens = new ArrayList<Token>();
                    tokenIndex = 0;
                    System.out.print("."); // progress counter
                }
            }
            System.out.println("."); // end of progress counter

            // Close input and output files
            reader.close();
            depFile.close();
            morphFile.close();
            System.out.println("\t" + "created " + fDep.getPath());
            System.out.println("\t" + "created " + fMorph.getPath());
        }
        catch (Exception e) {
            System.err.println("Fatal Parse Error - skipping file " + filename + ".txt" + ", sentence #" + (sentenceCounter + 1));
        }
    }
    /**
     * Add GENIA tagger info to the parsed tokens, specifically noun-chunk and named-entity-chunk tags.
     * @param tokens tokens read from the GENIA tag file
     * @param wordMap parsed tokens, indexed by dependency index
     */
    private void addGENIAInfoToTokens(ArrayList<Token> tokens, HashMap<Integer, Token> wordMap) {
        // NOTE: tokens and wordMap do not hold the same number of objects (parsing drops prepositions and such from wordMap),
        // so len(tokens) >= len(wordMap.values)
        int j = 0;
        for (int i = 0; i < tokens.size(); i++) {
            Token tok = tokens.get(i);
            j = i + 1; // dependency indices start at 1, not 0, so shift the token indices to match
            if (wordMap.containsKey(j)) {
                Token parsedTok = wordMap.get(j);
                parsedTok.chunkTag = tok.chunkTag;
                parsedTok.neTag = tok.neTag;
            }
            // else {
            //     // We could add non-parsed words to the wordMap because they may carry GENIA noun- or NE-chunk tags
            //     // NOTE: offsets are not calculated in the parser stage for these words
            //     // NOTE: a -1 offset means disregard!!!
            //     Object[] wordArr = {tok.partOfSpeech, tok.text, tok.stem, -1, -1, tok.chunkTag, tok.neTag};
            //     wordMap.put(j, wordArr);
            // }
        }
    }
    /**
     * Run the Stanford Parser on one sentence (a list of GENIA-tagged tokens).
     * NOTE: The start/stop offsets are needed to calculate per-word character offsets.
     * @param tok tokens of the sentence to parse
     * @param start index of the sentence's starting point in fullText
     * @param stop index of the end of the sentence in fullText
     * @param fullText the text the sentence is part of
     * @return a pair of (index -> Token map, list of {relation, governor index, dependent index} arrays)
     * @throws Exception
     */
    private Pair<HashMap<Integer, Token>, ArrayList<Object[]>> parseSentence(ArrayList<Token> tok, int start, int stop, String fullText) throws Exception {
        //System.out.println("start,stop=<"+start+","+stop+">");

        // Create parser input from the GENIA-tagged input
        ArrayList<Word> toksW = new ArrayList<Word>();
        for (Token t : tok) {
            toksW.add(new Word(t.text));
        }
        Tree parse = (Tree) lp.apply((List) toksW); // finally, we actually get to parse something

        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        //Collection tdl = gs.typedDependenciesCollapsed(); // NOTE: using un-collapsed dependencies
        Collection tdl = gs.typedDependencies();

        // Data structures
        HashMap<Integer, Token> wordMap = new HashMap<Integer, Token>(); // holds the values we write to .morph (index -> token with POS tag, word, stem, start offset, end offset)
        ArrayList<Object[]> relnList = new ArrayList<Object[]>();        // for .dep; holds arrays of the form { relation, governor index, dependent index }
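        // For a hypothetical sentence fragment "MAD-3 masks the signal", one relnList entry might be
        // {"nsubj", 2, 1} (the words and indices are illustrative only); it is later written to the
        // .dep file as the tab-separated line: nsubj<TAB>2<TAB>1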
        // We walk over the dependency parse and pull out the indices, then do the same for the stemmed parse

        // Unstemmed parse
        for (Iterator<TypedDependency> iter = tdl.iterator(); iter.hasNext(); ) {
            TypedDependency var = iter.next();
            TreeGraphNode dep = var.dep();
            TreeGraphNode gov = var.gov();
            // All useful information for a node in the tree
            String reln = var.reln().getShortName();
            int depIdx = dep.index();
            int govIdx = gov.index();
            Object[] relnArr = {reln, govIdx, depIdx};
            relnList.add(relnArr);
            Token depTok = new Token(depIdx, dep.label().tag(), dep.label().value());
            Token govTok = new Token(govIdx, gov.label().tag(), gov.label().value());
            wordMap.put(depIdx, depTok);
            wordMap.put(govIdx, govTok);
        }

        // Stemmed parse: get the stems
        ls.visitTree(parse); // apply the stemmer to the tree
        gs = gsf.newGrammaticalStructure(parse);
        tdl = gs.typedDependenciesCollapsed();
        for (Iterator<TypedDependency> iter = tdl.iterator(); iter.hasNext(); ) {
            TypedDependency var = iter.next();
            TreeGraphNode dep = var.dep();
            TreeGraphNode gov = var.gov();
            int depIdx = dep.index();
            if (wordMap.containsKey(depIdx))
                wordMap.get(depIdx).stem = dep.value();
            int govIdx = gov.index();
            if (wordMap.containsKey(govIdx))
                wordMap.get(govIdx).stem = gov.value();
        }

        calculateWordOffsets(wordMap, fullText, start, stop, toksW);

        Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair = new Pair(wordMap, relnList);
        return pair;
    }
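    // Output formats written by the two methods below:
    //   .morph : one token per line, tab-separated:
    //            index, POS, text, stem, startOffset, endOffset, chunkTag, neTag,
    //            isAcronym, isPartialAcronymMatch, isProteinAcronymMatch, acronym, acronymExpandedText, proteinAcronym
    //   .dep   : one dependency per line, tab-separated: relation, governor index, dependent index
    // Sentences are separated by blank lines in both files.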
    private void printSentenceToDepAndMorph(Pair<HashMap<Integer, Token>, ArrayList<Object[]>> pair, BufferedWriter depFile, BufferedWriter morphFile) throws IOException {
        HashMap<Integer, Token> tokens = pair.a;
        ArrayList<Object[]> relnList = pair.b;

        // Print .morph
        List<Integer> keys = new ArrayList<Integer>(tokens.keySet());
        Collections.sort(keys); // get token indices, in order
        for (Integer key : keys) {
            Token tok = tokens.get(key);
            morphFile.write(tok.index + "\t");
            morphFile.write(tok.pos + "\t");
            morphFile.write(tok.text + "\t");
            morphFile.write(tok.stem + "\t");
            morphFile.write(tok.startOffset + "\t");
            morphFile.write(tok.endOffset + "\t");
            morphFile.write(tok.chunkTag + "\t");
            morphFile.write(tok.neTag + "\t");
            morphFile.write(tok.isAcronym + "\t");
            morphFile.write(tok.isPartialAcronymMatch + "\t");
            morphFile.write(tok.isProteinAcronymMatch + "\t");
            morphFile.write(tok.acronym + "\t");
            morphFile.write(tok.acronymExpandedText + "\t");
            morphFile.write(tok.proteinAcronym + "\t");
            morphFile.write("\n");
        }
        morphFile.newLine();

        // Print .dep
        for (Object[] relnArr : relnList) {
            String output = relnArr[0] + "\t" + relnArr[1] + "\t" + relnArr[2];
            depFile.write(output + "\n");
        }
        depFile.newLine();
    }
    private void printSentenceToDepAndMorph(ArrayList<Token> toks, BufferedWriter depFile, BufferedWriter morphFile) throws IOException {
        // Fallback for sentences that failed to parse: write the raw tokens to .morph and leave an empty entry in .dep
        for (Token tok : toks) {
            morphFile.write(tok.index + "\t");
            morphFile.write(tok.pos + "\t");
            morphFile.write(tok.text + "\t");
            morphFile.write(tok.stem + "\t");
            morphFile.write(tok.startOffset + "\t");
            morphFile.write(tok.endOffset + "\t");
            morphFile.write(tok.chunkTag + "\t");
            morphFile.write(tok.neTag + "\t");
            morphFile.write(tok.isAcronym + "\t");
            morphFile.write(tok.isPartialAcronymMatch + "\t");
            morphFile.write(tok.isProteinAcronymMatch + "\t");
            morphFile.write(tok.acronym + "\t");
            morphFile.write(tok.acronymExpandedText + "\t");
            morphFile.write(tok.proteinAcronym + "\t");
            morphFile.write("\n");
        }
        morphFile.newLine();

        // Print an empty .dep entry
        depFile.newLine();
    }
    /**
     * Load a GENIA sentence-splitter standoff file (gives the sentence boundaries).
     * @param path path to the .standoff file
     * @return hashmap of sentence_number -> {start, stop} character offsets
     * @throws Exception
     */
    private HashMap<Integer, int[]> loadStandoffFile(String path) throws Exception {
        HashMap<Integer, int[]> standoff = new HashMap<Integer, int[]>();
        File f = new File(path);
        BufferedReader reader = new BufferedReader(new FileReader(f));
        String line = null;
        int sentenceCounter = 0;
        while ((line = reader.readLine()) != null) {
            if (!line.isEmpty()) {
                String elements[] = line.split("\\t"); // format: start offset, end offset
                int[] startstop = { Integer.parseInt(elements[0]), Integer.parseInt(elements[1]) };
                standoff.put(sentenceCounter, startstop);
                sentenceCounter++;
            }
        }
        reader.close();
        return standoff;
    }
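    // Hypothetical .standoff contents (tab-separated character offsets, one sentence per line;
    // the numbers are illustrative only):
    //   0      141
    //   142    307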
    /**
     * Is this a whitespace character?
     * @param c character
     * @return true if c is a newline, space, or carriage return
     */
    private boolean isWhiteSpace(char c) { return c == '\n' || c == ' ' || c == '\r'; }
    /**
     * Walk through the text and match each non-whitespace token of the tokenized sentence until completion.
     * Some words (like prepositions and parentheses) aren't in the parse, so we need to be able to skip over them without breaking.
     * NOTE: The whole thing is predicated on the fact that we loop over the sentence, the tokens and the parse in order,
     * so the first word X or Y we see is the same in all three.
     * @param wordMap results of the dependency parse (parse-index -> word mappings)
     * @param text block of text
     * @param startIdx start of the sentence
     * @param stopIdx end of the sentence
     * @param sent0 sentence tokens
     * @throws Exception
     */
    private void calculateWordOffsets(HashMap<Integer, Token> wordMap, String text, int startIdx, int stopIdx, List<Word> sent0) throws Exception {
        // Parsed-word indices, sorted smallest to largest
        Object[] keys = wordMap.keySet().toArray();
        Arrays.sort(keys);
        int wordArrIdx = 0;

        // Get into the correct position
        int offset = startIdx;
        int start = 0;
        text = text.substring(startIdx, stopIdx);

        for (Word word : sent0) {
            String w = word.toString();
            String originalW = edu.stanford.nlp.process.PTBTokenizer.ptbToken2Text(w);

            // First trim off any leading whitespace
            while (isWhiteSpace(text.charAt(0))) {
                text = text.substring(1);
                offset++; // increment the offset counter
            }

            // Now see if our word matches
            start = offset;
            char firstChar = originalW.charAt(0);
            if (firstChar == text.charAt(0)) {
                // Is this the word in the token?
                if (text.startsWith(originalW, 0)) {
                    int len = originalW.length();
                    offset += len;
                    text = text.substring(len);

                    // Is this token a word in the parse?
                    Token tok = wordMap.get(keys[wordArrIdx]);
                    String wParse = tok.text;
                    wParse = wParse.replaceAll("\\\\/", "/"); // undo the PTB escaping of "/" as "\/" (in a Java regex literal, "\\\\" matches a single backslash)
                    //System.out.println("wordmap=/"+wParse+"/ vs hasword=/"+originalW+"/"+" wordArrIdx="+wordArrIdx);
                    if (wParse.equals(originalW)) {
                        //System.out.println("\t"+originalW+" ("+start+","+offset+")");
                        tok.startOffset = start;
                        tok.endOffset = offset;
                        wordArrIdx++;
                        if (wordArrIdx >= keys.length)
                            break;
                    }
                }
            } else {
                System.out.println("w:" + word + ", originalW:" + originalW);
                System.out.println("firstChar:" + firstChar + ", text.charAt(0):" + text.charAt(0));
                throw new Exception("unknown token");
            }
        }
    }
    /**
     * Turn Stanford Parser sentence-splitter output into a string (for a single sentence)
     * @param s sentence as a list of word strings
     * @return the sentence as a space-separated string
     */
    public static String join(List<String> s) {
        if (s.isEmpty()) return "";
        String delimiter = " ";
        Iterator<String> iter = s.iterator();
        StringBuilder buffer = new StringBuilder(iter.next());
        while (iter.hasNext()) buffer.append(delimiter).append(iter.next());
        return buffer.toString();
    }
}
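// A minimal driver sketch for the RunStanfordParser2 wrapper above (hypothetical paths; it assumes
// the GENIA-preprocessed .txt/.standoff/.tagNew/.a1 files already sit in the input directory):
//
//   RunStanfordParser2 wrapper = new RunStanfordParser2("englishPCFG.ser.gz");
//   wrapper.processGENIAfiles("genia/input", "genia/output");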