Skip to content

Instantly share code, notes, and snippets.

@joyita
Created September 3, 2012 17:07
Show Gist options
  • Save joyita/3610899 to your computer and use it in GitHub Desktop.
Save joyita/3610899 to your computer and use it in GitHub Desktop.
Parsing spam
/**
 * Splits {@code text} into paragraphs and drops exact duplicates.
 *
 * <p>The text is split on runs of one or more newline characters, each piece is
 * passed through {@code clearWhitespace}, and the cleaned pieces are collected
 * into a set so that a paragraph repeated verbatim is kept only once. Pieces
 * that are empty after cleaning are discarded.
 *
 * <p>The returned set iterates in the order paragraphs first appear in
 * {@code text}. Callers that look at neighbouring paragraphs (for example to
 * detect several short paragraphs in a row) depend on that order, which a plain
 * {@code HashSet} does not guarantee.
 *
 * @param text the raw document text; must not be {@code null}
 * @return the distinct, cleaned, non-empty paragraphs in document order
 */
private static Set<String> removeRepeatedSentances(String text) {
    // Insertion-ordered set: de-duplicates while keeping document order.
    // Fully qualified so this method does not rely on an extra import.
    Set<String> ret = new java.util.LinkedHashSet<String>();
    for (String paragraph : text.split("\n+")) {
        // NOTE(review): clearWhitespace is defined elsewhere; presumably it
        // trims/normalises whitespace — confirm against its definition.
        ret.add(clearWhitespace(paragraph));
    }
    // Paragraphs that were whitespace-only collapse to "" and carry no content.
    ret.remove("");
    return ret;
}
/**
 * Rebuilds a document from {@code paragraphs}, dropping every run of three or
 * more consecutive short paragraphs.
 *
 * <p>A paragraph is "short" when splitting it on single spaces yields fewer
 * than 30 tokens (an arbitrary threshold). Paragraphs are visited in the
 * iteration order of {@code paragraphs}; a long paragraph ends the current
 * run, so short paragraphs separated by long ones are never removed together
 * and long paragraphs are never removed. Once a run reaches three short
 * paragraphs, the whole run (including any further short paragraphs that
 * directly follow) is removed.
 *
 * @param paragraphs the paragraphs of the document, iterated in document order;
 *                   must not be {@code null}
 * @return the surviving paragraphs, each followed by {@code "\n"}; the empty
 *         string if nothing survives
 */
private static String trimShortSentanceSequences(Set<String> paragraphs) {
    // Arbitrary decision: fewer than 30 words = short paragraph.
    final int shortParagraphWords = 30;
    // Three short paragraphs in a row are treated as noise.
    final int minRunLength = 3;

    List<String> sens = new ArrayList<String>(paragraphs);
    // Index-based kill flags avoid value-based List.remove(Object) lookups.
    boolean[] kill = new boolean[sens.size()];

    int run = 0; // length of the current run of consecutive short paragraphs
    for (int i = 0; i < sens.size(); i++) {
        boolean isShort = sens.get(i).split(" ").length < shortParagraphWords;
        if (!isShort) {
            // A long paragraph breaks the run; without this reset, short
            // paragraphs that are not adjacent would be counted together.
            run = 0;
            continue;
        }
        run++;
        if (run == minRunLength) {
            // The run just qualified: mark it from its first paragraph.
            for (int j = i - minRunLength + 1; j <= i; j++) {
                kill[j] = true;
            }
        } else if (run > minRunLength) {
            // Still inside an already-qualified run: remove this one too.
            kill[i] = true;
        }
    }

    // Rebuild the document; "\n" was used as the paragraph splitter.
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < sens.size(); i++) {
        if (!kill[i]) {
            builder.append(sens.get(i)).append("\n");
        }
    }
    return builder.toString();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment