Skip to content

Instantly share code, notes, and snippets.

@joyita
joyita / gist:3736164
Created September 17, 2012 08:24
PhraseTokeniser
package uk.co.fues.submission.classifier.nlp;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.Parse;
@joyita
joyita / gist:3610899
Created September 3, 2012 17:07
Parsing spam
private static Set<String> removeRepeatedSentances(String text) {
String [] paras = text.split("\n+");
List<String> paragraphs = Arrays.asList(paras);
for(int i = 0; i<paragraphs.size(); i++) {
String paragraph = paragraphs.get(i);
paragraphs.set(i, clearWhitespace(paragraph));
}
Set<String> ret = new HashSet<String>();
for(String para:paragraphs) {
ret.add(para);
@joyita
joyita / gist:3610897
Created September 3, 2012 17:06
Parsing spam
private static String trimShortSentanceSequences(Set<String> paragraphs) {
List<String> sens = new ArrayList<String>();
List<Integer> counts = new ArrayList<Integer>();
int cachecount = 0;
for(String sentances:paragraphs) {
counts.add(sentances.split(" ").length);
sens.add(sentances);
}
List<String> killindex = new ArrayList<String>();
for(int i = 0; i<counts.size(); i++) {
private static String removeSingleSentanceSequences(Set<String> paragraphs) {
// get single sentences in a row, likely to be menu items
List<String> sens = new ArrayList<String>();
List<Integer> counts = new ArrayList<Integer>();
int cachecount = 0;
for(String sentances:paragraphs) {
counts.add(sentances.split(" ").length);
sens.add(sentances); //just making efficient use of iteration to build list
}
List<String> killindex = new ArrayList<String>();
private static String removeSingleSentanceSequences(Set<String> paragraphs) {
// get single sentences in a row, likely to be menu items
List<String> sens = new ArrayList<String>();
List<Integer> counts = new ArrayList<Integer>();
int cachecount = 0;
for(String sentances:paragraphs) {
counts.add(sentances.split(" ").length);
sens.add(sentances); //just making efficient use of iteration to build list
}
List<String> killindex = new ArrayList<String>();
private static String removeSingleSentanceSequences(Set<String> paragraphs) {
// get single sentences in a row, likely to be menu items
List<String> sens = new ArrayList<String>();
List<Integer> counts = new ArrayList<Integer>();
int cachecount = 0;
for(String sentances:paragraphs) {
counts.add(sentances.split(" ").length);
sens.add(sentances); //just making efficient use of iteration to build list
}
List<String> killindex = new ArrayList<String>();
private static String removeSingleSentanceSequences(Set<String> paragraphs) {
// get single sentences in a row, likely to be menu items
List<String> sens = new ArrayList<String>();
List<Integer> counts = new ArrayList<Integer>();
int cachecount = 0;
for(String sentances:paragraphs) {
counts.add(sentances.split(" ").length);
sens.add(sentances); //just making efficient use of iteration to build list
}
List<String> killindex = new ArrayList<String>();
private static String removeSingleSentanceSequences(Set<String> paragraphs) {
// get single sentences in a row, likely to be menu items
List<String> sens = new ArrayList<String>();
List<Integer> counts = new ArrayList<Integer>();
int cachecount = 0;
for(String sentances:paragraphs) {
counts.add(sentances.split(" ").length);
sens.add(sentances); //just making efficient use of iteration to build list
}
List<String> killindex = new ArrayList<String>();