Skip to content

Instantly share code, notes, and snippets.

@joyita
Created September 3, 2012 16:38
Show Gist options
  • Save joyita/3610599 to your computer and use it in GitHub Desktop.
Save joyita/3610599 to your computer and use it in GitHub Desktop.
/**
 * Strips runs of consecutive short paragraphs (likely navigation/menu items)
 * from a document and returns the remaining paragraphs joined back together.
 *
 * <p>A paragraph is "short" when splitting it on single spaces yields fewer
 * than 15 tokens. Whenever three or more short paragraphs follow each other
 * directly, every paragraph of that run is dropped. A paragraph of 15 or more
 * tokens ends the current run, so isolated short paragraphs that are separated
 * by longer text are kept.
 *
 * <p>"Consecutive" is defined by the iteration order of {@code paragraphs};
 * pass an order-preserving set (for example a {@code LinkedHashSet}) if the
 * paragraphs must be examined in document order.
 *
 * @param paragraphs the paragraphs of the document; may be {@code null} or empty
 * @return the surviving paragraphs, each followed by {@code "\n"}; an empty
 *         string when there is nothing to return
 */
private static String removeSingleSentanceSequences(Set<String> paragraphs) {
    // Arbitrary decision: fewer than 15 words = short paragraph.
    final int shortParagraphWords = 15;
    // Three short paragraphs in a row are treated as a menu-like sequence.
    final int minRunLength = 3;

    if (paragraphs == null || paragraphs.isEmpty()) {
        return "";
    }

    // Snapshot into a list so paragraphs can be addressed by position.
    List<String> ordered = new ArrayList<String>(paragraphs);
    boolean[] drop = new boolean[ordered.size()];

    int runLength = 0;
    for (int i = 0; i < ordered.size(); i++) {
        boolean isShort = ordered.get(i).split(" ").length < shortParagraphWords;
        if (!isShort) {
            // A long paragraph breaks the sequence; the run must start over.
            runLength = 0;
            continue;
        }
        runLength++;
        if (runLength == minRunLength) {
            // The run just reached the threshold: mark it retroactively.
            for (int j = i - minRunLength + 1; j <= i; j++) {
                drop[j] = true;
            }
        } else if (runLength > minRunLength) {
            // The run is still going: keep removing its members.
            drop[i] = true;
        }
    }

    // Rebuild the document; "\n" is the paragraph splitter.
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < ordered.size(); i++) {
        if (!drop[i]) {
            builder.append(ordered.get(i)).append("\n");
        }
    }
    return builder.toString();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment