allanj · April 10, 2019 09:54
diff --git a/dep_parse.java b/dep_parse.java
 package corenlp.process;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.List;

 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.parser.nndep.DependencyParser;
 import edu.stanford.nlp.trees.GrammaticalStructure;
 import edu.stanford.nlp.trees.TypedDependency;
 import statistics.RAWF;

 public class Converter {

 	/** Marker for {@code entityColumn}: take the last field of each line. */
 	private static final int LAST_COLUMN = -1;

 	/** Location of the parser model handed to {@code DependencyParser.loadFromModelFile}. */
 	String modelPath;

 	/**
 	 * Creates a converter that parses with the given model.
 	 *
 	 * @param modelPath location of the dependency-parser model
 	 */
 	public Converter(String modelPath) {
 		this.modelPath = modelPath;
 	}

 	/**
 	 * Converts a space-separated, blank-line-delimited file (intended for CoNLL-2003)
 	 * into dependency-annotated output.
 	 * <p>
 	 * Field 0 is read as the word, field 1 as the POS tag and field 2 as the label that
 	 * is copied through to the last output column.
 	 * NOTE(review): the standard CoNLL-2003 distribution has a chunk tag at field 2 and
 	 * the entity tag at field 3 -- confirm the input really is word/POS/entity.
 	 *
 	 * @param path      input file
 	 * @param writePath output file
 	 * @throws IOException if reading or writing fails, or a line has too few fields
 	 */
 	public void readData(String path, String writePath) throws IOException {
 		convert(path, writePath, " ", 0, 1, 2);
 	}

 	/**
 	 * Converts a tab-separated, blank-line-delimited file (used here for OntoNotes
 	 * {@code .conllx} data) into dependency-annotated output.
 	 * <p>
 	 * Field 1 is read as the word, field 3 as the POS tag and the last field as the
 	 * label that is copied through to the last output column.
 	 *
 	 * @param path      input file
 	 * @param writePath output file
 	 * @throws IOException if reading or writing fails, or a line has too few fields
 	 */
 	public void readOntoNotes(String path, String writePath) throws IOException {
 		convert(path, writePath, "\t", 1, 3, LAST_COLUMN);
 	}

 	/**
 	 * Shared conversion loop: reads sentences separated by blank lines, parses each one
 	 * and writes it out, followed by a blank line.
 	 *
 	 * @param path         input file
 	 * @param writePath    output file
 	 * @param delimiter    regular expression splitting a line into fields
 	 * @param wordColumn   index of the word field
 	 * @param tagColumn    index of the POS-tag field
 	 * @param entityColumn index of the label field, or {@link #LAST_COLUMN}
 	 * @throws IOException if reading or writing fails, or a line has too few fields
 	 */
 	private void convert(String path, String writePath, String delimiter,
 			int wordColumn, int tagColumn, int entityColumn) throws IOException {
 		DependencyParser parser = DependencyParser.loadFromModelFile(this.modelPath);
 		int minFields = Math.max(Math.max(wordColumn, tagColumn), entityColumn) + 1;

 		// try-with-resources: both streams are closed even if parsing throws.
 		try (BufferedReader br = RAWF.reader(path);
 				PrintWriter pw = RAWF.writer(writePath)) {
 			List<CoreLabel> words = new ArrayList<>();
 			List<String> entities = new ArrayList<>();
 			int lineNumber = 0;
 			String line;
 			while ((line = br.readLine()) != null) {
 				lineNumber++;
 				// Whitespace-only lines count as sentence boundaries too.
 				if (line.trim().isEmpty()) {
 					writeSentence(parser, words, entities, pw);
 					words = new ArrayList<>();
 					entities = new ArrayList<>();
 					continue;
 				}

 				String[] values = line.split(delimiter);
 				if (values.length < minFields) {
 					throw new IOException("Malformed line " + lineNumber + " in " + path
 							+ ": expected at least " + minFields + " fields but found "
 							+ values.length);
 				}
 				int entityIndex = entityColumn == LAST_COLUMN ? values.length - 1 : entityColumn;
 				entities.add(values[entityIndex]);

 				CoreLabel token = new CoreLabel();
 				token.setWord(values[wordColumn]);
 				token.setTag(values[tagColumn]);
 				words.add(token);
 			}
 			// Flush the final sentence when the file does not end with a blank line.
 			writeSentence(parser, words, entities, pw);

 			// PrintWriter never throws on write failure; surface it explicitly.
 			if (pw.checkError()) {
 				throw new IOException("Failed writing " + writePath);
 			}
 		}
 	}

 	/**
 	 * Parses one sentence and writes one tab-separated line per token: 1-based index,
 	 * word, {@code _}, tag, tag, {@code _}, head index, relation short name,
 	 * {@code _}, {@code _}, label. A blank line terminates the sentence.
 	 * Does nothing for an empty sentence, so repeated blank input lines neither reach
 	 * the parser nor produce extra blank output lines.
 	 */
 	private static void writeSentence(DependencyParser parser, List<CoreLabel> words,
 			List<String> entities, PrintWriter pw) {
 		if (words.isEmpty()) {
 			return;
 		}
 		GrammaticalStructure gs = parser.predict(words);
 		int[] heads = new int[words.size()];
 		String[] depLabels = new String[words.size()];
 		for (TypedDependency dep : gs.typedDependencies()) {
 			// Parser indices are 1-based; the arrays are indexed by 0-based token position.
 			int position = dep.dep().index() - 1;
 			heads[position] = dep.gov().index();
 			depLabels[position] = dep.reln().getShortName();
 		}

 		for (int p = 0; p < words.size(); p++) {
 			CoreLabel word = words.get(p);
 			pw.println((p + 1) + "\t" + word.word() + "\t_\t" + word.tag() + "\t" + word.tag()
 					+ "\t_\t" + heads[p] + "\t" + depLabels[p] + "\t_\t_\t" + entities.get(p));
 		}
 		pw.println();
 	}

 	/**
 	 * Re-parses the OntoNotes train/dev/test splits once with the "SD" model and once
 	 * with the "UD" model, writing {@code <split>.pred<type>.conllx} files.
 	 *
 	 * @param args unused
 	 * @throws IOException if any conversion fails
 	 */
 	public static void main(String... args) throws IOException {
 		String[] parserTypes = new String[]{"SD", "UD"};
 		for (String type : parserTypes) {
 			String path = "edu/stanford/nlp/models/parser/nndep/english_" + type + ".gz";
 			Converter conv = new Converter(path);
 			String suffix = type.toLowerCase();

 			// CoNLL-2003 conversion, currently disabled:
 //			conv.readData("data/conll2003/train.txt", "data/conll2003/train." + suffix + ".conllx");
 //			conv.readData("data/conll2003/dev.txt", "data/conll2003/dev." + suffix + ".conllx");
 //			conv.readData("data/conll2003/test.txt", "data/conll2003/test." + suffix + ".conllx");

 			conv.readOntoNotes("data/ontonotes/train.sd.conllx", "data/ontonotes/train.pred" + suffix + ".conllx");
 			conv.readOntoNotes("data/ontonotes/dev.sd.conllx", "data/ontonotes/dev.pred" + suffix + ".conllx");
 			conv.readOntoNotes("data/ontonotes/test.sd.conllx", "data/ontonotes/test.pred" + suffix + ".conllx");
 		}
 	}
 }
	package corenlp.process;

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.PrintWriter;
	import java.util.ArrayList;
	import java.util.List;

	import edu.stanford.nlp.ling.CoreLabel;
	import edu.stanford.nlp.parser.nndep.DependencyParser;
	import edu.stanford.nlp.trees.GrammaticalStructure;
	import edu.stanford.nlp.trees.TypedDependency;
	import statistics.RAWF;

	public class Converter {

		/** Marker for {@code entityColumn}: take the last field of each line. */
		private static final int LAST_COLUMN = -1;

		/** Location of the parser model handed to {@code DependencyParser.loadFromModelFile}. */
		String modelPath;

		/**
		 * Creates a converter that parses with the given model.
		 *
		 * @param modelPath location of the dependency-parser model
		 */
		public Converter(String modelPath) {
			this.modelPath = modelPath;
		}

		/**
		 * Converts a space-separated, blank-line-delimited file (intended for CoNLL-2003)
		 * into dependency-annotated output.
		 * <p>
		 * Field 0 is read as the word, field 1 as the POS tag and field 2 as the label that
		 * is copied through to the last output column.
		 * NOTE(review): the standard CoNLL-2003 distribution has a chunk tag at field 2 and
		 * the entity tag at field 3 -- confirm the input really is word/POS/entity.
		 *
		 * @param path      input file
		 * @param writePath output file
		 * @throws IOException if reading or writing fails, or a line has too few fields
		 */
		public void readData(String path, String writePath) throws IOException {
			convert(path, writePath, " ", 0, 1, 2);
		}

		/**
		 * Converts a tab-separated, blank-line-delimited file (used here for OntoNotes
		 * {@code .conllx} data) into dependency-annotated output.
		 * <p>
		 * Field 1 is read as the word, field 3 as the POS tag and the last field as the
		 * label that is copied through to the last output column.
		 *
		 * @param path      input file
		 * @param writePath output file
		 * @throws IOException if reading or writing fails, or a line has too few fields
		 */
		public void readOntoNotes(String path, String writePath) throws IOException {
			convert(path, writePath, "\t", 1, 3, LAST_COLUMN);
		}

		/**
		 * Shared conversion loop: reads sentences separated by blank lines, parses each one
		 * and writes it out, followed by a blank line.
		 *
		 * @param path         input file
		 * @param writePath    output file
		 * @param delimiter    regular expression splitting a line into fields
		 * @param wordColumn   index of the word field
		 * @param tagColumn    index of the POS-tag field
		 * @param entityColumn index of the label field, or {@link #LAST_COLUMN}
		 * @throws IOException if reading or writing fails, or a line has too few fields
		 */
		private void convert(String path, String writePath, String delimiter,
				int wordColumn, int tagColumn, int entityColumn) throws IOException {
			DependencyParser parser = DependencyParser.loadFromModelFile(this.modelPath);
			int minFields = Math.max(Math.max(wordColumn, tagColumn), entityColumn) + 1;

			// try-with-resources: both streams are closed even if parsing throws.
			try (BufferedReader br = RAWF.reader(path);
					PrintWriter pw = RAWF.writer(writePath)) {
				List<CoreLabel> words = new ArrayList<>();
				List<String> entities = new ArrayList<>();
				int lineNumber = 0;
				String line;
				while ((line = br.readLine()) != null) {
					lineNumber++;
					// Whitespace-only lines count as sentence boundaries too.
					if (line.trim().isEmpty()) {
						writeSentence(parser, words, entities, pw);
						words = new ArrayList<>();
						entities = new ArrayList<>();
						continue;
					}

					String[] values = line.split(delimiter);
					if (values.length < minFields) {
						throw new IOException("Malformed line " + lineNumber + " in " + path
								+ ": expected at least " + minFields + " fields but found "
								+ values.length);
					}
					int entityIndex = entityColumn == LAST_COLUMN ? values.length - 1 : entityColumn;
					entities.add(values[entityIndex]);

					CoreLabel token = new CoreLabel();
					token.setWord(values[wordColumn]);
					token.setTag(values[tagColumn]);
					words.add(token);
				}
				// Flush the final sentence when the file does not end with a blank line.
				writeSentence(parser, words, entities, pw);

				// PrintWriter never throws on write failure; surface it explicitly.
				if (pw.checkError()) {
					throw new IOException("Failed writing " + writePath);
				}
			}
		}

		/**
		 * Parses one sentence and writes one tab-separated line per token: 1-based index,
		 * word, {@code _}, tag, tag, {@code _}, head index, relation short name,
		 * {@code _}, {@code _}, label. A blank line terminates the sentence.
		 * Does nothing for an empty sentence, so repeated blank input lines neither reach
		 * the parser nor produce extra blank output lines.
		 */
		private static void writeSentence(DependencyParser parser, List<CoreLabel> words,
				List<String> entities, PrintWriter pw) {
			if (words.isEmpty()) {
				return;
			}
			GrammaticalStructure gs = parser.predict(words);
			int[] heads = new int[words.size()];
			String[] depLabels = new String[words.size()];
			for (TypedDependency dep : gs.typedDependencies()) {
				// Parser indices are 1-based; the arrays are indexed by 0-based token position.
				int position = dep.dep().index() - 1;
				heads[position] = dep.gov().index();
				depLabels[position] = dep.reln().getShortName();
			}

			for (int p = 0; p < words.size(); p++) {
				CoreLabel word = words.get(p);
				pw.println((p + 1) + "\t" + word.word() + "\t_\t" + word.tag() + "\t" + word.tag()
						+ "\t_\t" + heads[p] + "\t" + depLabels[p] + "\t_\t_\t" + entities.get(p));
			}
			pw.println();
		}

		/**
		 * Re-parses the OntoNotes train/dev/test splits once with the "SD" model and once
		 * with the "UD" model, writing {@code <split>.pred<type>.conllx} files.
		 *
		 * @param args unused
		 * @throws IOException if any conversion fails
		 */
		public static void main(String... args) throws IOException {
			String[] parserTypes = new String[]{"SD", "UD"};
			for (String type : parserTypes) {
				String path = "edu/stanford/nlp/models/parser/nndep/english_" + type + ".gz";
				Converter conv = new Converter(path);
				String suffix = type.toLowerCase();

				// CoNLL-2003 conversion, currently disabled:
				// conv.readData("data/conll2003/train.txt", "data/conll2003/train." + suffix + ".conllx");
				// conv.readData("data/conll2003/dev.txt", "data/conll2003/dev." + suffix + ".conllx");
				// conv.readData("data/conll2003/test.txt", "data/conll2003/test." + suffix + ".conllx");

				conv.readOntoNotes("data/ontonotes/train.sd.conllx", "data/ontonotes/train.pred" + suffix + ".conllx");
				conv.readOntoNotes("data/ontonotes/dev.sd.conllx", "data/ontonotes/dev.pred" + suffix + ".conllx");
				conv.readOntoNotes("data/ontonotes/test.sd.conllx", "data/ontonotes/test.pred" + suffix + ".conllx");
			}
		}
	}