This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package uk.co.fues.submission.classifier.nlp; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import opennlp.tools.cmdline.parser.ParserTool; | |
import opennlp.tools.parser.Parse; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static Set<String> removeRepeatedSentances(String text) { | |
String [] paras = text.split("\n+"); | |
List<String> paragraphs = Arrays.asList(paras); | |
for(int i = 0; i<paragraphs.size(); i++) { | |
String paragraph = paragraphs.get(i); | |
paragraphs.set(i, clearWhitespace(paragraph)); | |
} | |
Set<String> ret = new HashSet<String>(); | |
for(String para:paragraphs) { | |
ret.add(para); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static String trimShortSentanceSequences(Set<String> paragraphs) { | |
List<String> sens = new ArrayList<String>(); | |
List<Integer> counts = new ArrayList<Integer>(); | |
int cachecount = 0; | |
for(String sentances:paragraphs) { | |
counts.add(sentances.split(" ").length); | |
sens.add(sentances); | |
} | |
List<String> killindex = new ArrayList<String>(); | |
for(int i = 0; i<counts.size(); i++) { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static String removeSingleSentanceSequences(Set<String> paragraphs) { | |
// get single sentances in a row, likely to be menu items | |
List<String> sens = new ArrayList<String>(); | |
List<Integer> counts = new ArrayList<Integer>(); | |
int cachecount = 0; | |
for(String sentances:paragraphs) { | |
counts.add(sentances.split(" ").length); | |
sens.add(sentances); //just making efficient use of iteration to build list | |
} | |
List<String> killindex = new ArrayList<String>(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static String removeSingleSentanceSequences(Set<String> paragraphs) { | |
// get single sentances in a row, likely to be menu items | |
List<String> sens = new ArrayList<String>(); | |
List<Integer> counts = new ArrayList<Integer>(); | |
int cachecount = 0; | |
for(String sentances:paragraphs) { | |
counts.add(sentances.split(" ").length); | |
sens.add(sentances); //just making efficient use of iteration to build list | |
} | |
List<String> killindex = new ArrayList<String>(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static String removeSingleSentanceSequences(Set<String> paragraphs) { | |
// get single sentances in a row, likely to be menu items | |
List<String> sens = new ArrayList<String>(); | |
List<Integer> counts = new ArrayList<Integer>(); | |
int cachecount = 0; | |
for(String sentances:paragraphs) { | |
counts.add(sentances.split(" ").length); | |
sens.add(sentances); //just making efficient use of iteration to build list | |
} | |
List<String> killindex = new ArrayList<String>(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static String removeSingleSentanceSequences(Set<String> paragraphs) { | |
// get single sentances in a row, likely to be menu items | |
List<String> sens = new ArrayList<String>(); | |
List<Integer> counts = new ArrayList<Integer>(); | |
int cachecount = 0; | |
for(String sentances:paragraphs) { | |
counts.add(sentances.split(" ").length); | |
sens.add(sentances); //just making efficient use of iteration to build list | |
} | |
List<String> killindex = new ArrayList<String>(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static String removeSingleSentanceSequences(Set<String> paragraphs) { | |
// get single sentances in a row, likely to be menu items | |
List<String> sens = new ArrayList<String>(); | |
List<Integer> counts = new ArrayList<Integer>(); | |
int cachecount = 0; | |
for(String sentances:paragraphs) { | |
counts.add(sentances.split(" ").length); | |
sens.add(sentances); //just making efficient use of iteration to build list | |
} | |
List<String> killindex = new ArrayList<String>(); |