Created
April 28, 2015 23:35
-
-
Save shimondoodkin/48097c83097106e4dd72 to your computer and use it in GitHub Desktop.
steaming Frequent Terms from Keywords. basic algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.nini; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.HashMap; | |
import java.util.LinkedList; | |
import java.util.List; | |
import java.util.Map.Entry; | |
import com.nini.Repl.Keywords.Pair; | |
public class Repl { | |
public static void main(String[] args) { | |
System.out.println("start"); | |
Keywords terms_extractor=new Keywords(); | |
String [] text="1 1 2 1 1".split("\\s+"); | |
for(String s :text) | |
{ | |
System.out.println("add word "+s); | |
terms_extractor.AddWord(s); | |
} | |
/* | |
System.out.println("result"); | |
ArrayList<Keywords.Pair> result=terms_extractor.extractNoDuplicates(); | |
for(Keywords.Pair p :result) | |
{ | |
System.out.println(p.ch+" = "+p.count); | |
} | |
*/ | |
System.out.println("end"); | |
} | |
static class Keywords | |
{ | |
/* | |
* Architecture: | |
* trail is the current nwords list. | |
* prevtrail simulates the trail as it was nwords before. | |
* | |
* */ | |
public int nwords=5; | |
private LinkedList<String> trail=new LinkedList<String>(); | |
private LinkedList<String> prevtrail=new LinkedList<String>(); | |
public HashMap<String,Short> counter=new HashMap<String,Short> (); | |
public HashMap<String,Short> tempcounter=new HashMap<String,Short> (); | |
public Keywords() | |
{ | |
reset(); | |
} | |
public void reset() | |
{ | |
trail.clear(); | |
prevtrail.clear(); | |
counter.clear(); | |
//run=0; | |
} | |
//private int run=0; | |
private String prevword=null; | |
private String prevsequence=null; | |
public void AddWord(String word) | |
{ | |
//run++; | |
String sequence=""; | |
if(prevword!=null) | |
{ | |
prevtrail.addLast(prevword); | |
} | |
if(trail.size()==nwords) | |
{ | |
trail.removeLast(); | |
if(prevtrail.size()>nwords)prevtrail.removeFirst(); | |
//printList("prevtrail"+run,prevtrail); | |
sequence=""; | |
//for(int i=trail.size()-1;i>=0;i--) | |
tempcounter.clear(); | |
for(String ch : prevtrail) | |
{ | |
//String ch=trail.get(i); | |
sequence=sequence+ch+","; | |
//System.out.println("remove sequence="+sequence); | |
if(counter.containsKey(sequence)) | |
{ | |
short count=counter.get(sequence).shortValue(); | |
if(count==1) | |
counter.remove(sequence); | |
else | |
{ | |
if(tempcounter.containsKey(sequence)) System.out.println("error remove same key again: "+ch); | |
tempcounter.put(sequence,--count); | |
} | |
} | |
else | |
System.out.println("error removing not exist key: "+ch); | |
// counter.put(sequence,(short) 1); | |
} | |
for(Entry<String, Short> ch : tempcounter.entrySet()) | |
{ | |
counter.put(ch.getKey(),ch.getValue()); | |
} | |
} | |
trail.addFirst(word); | |
prevword=word; | |
//printList("trail"+run,trail); | |
sequence=""; | |
//for(int i=trail.size()-1;i>=0;i--) | |
tempcounter.clear(); | |
for(String ch : trail) | |
{ | |
//String ch=trail.get(i); | |
sequence=ch+","+sequence; | |
//System.out.println("sequence="+sequence); | |
if(counter.containsKey(sequence)) | |
{ | |
short count=counter.get(sequence).shortValue(); | |
if(tempcounter.containsKey(sequence)) System.out.println("error add same key again: "+ch); | |
tempcounter.put(sequence,++count); | |
} | |
else | |
counter.put(sequence,(short) 1); | |
} | |
for(Entry<String, Short> ch : tempcounter.entrySet()) | |
{ | |
counter.put(ch.getKey(),ch.getValue()); | |
} | |
prevsequence=sequence; | |
printListPair("result",extractNoDuplicates()); | |
System.out.println(""); | |
} | |
public static class DistanceString implements Comparable<DistanceString> { | |
public DistanceString(String value){this.value=value;} | |
public String value; | |
@Override | |
public int compareTo(DistanceString another) { | |
return value.compareTo(another.value); | |
} | |
} | |
public static class Pair { | |
public Pair(String c, short n){ch=c;count=n;} | |
public String ch; | |
public short count; | |
} | |
private Comparator<Pair> abc_sort= new Comparator<Pair>(){ | |
public int compare(Pair s1, Pair s2) { | |
return s1.ch.compareTo(s2.ch); | |
} | |
}; | |
private Comparator<Pair> length_sort= new Comparator<Pair>(){ | |
public int compare(Pair s1, Pair s2) { | |
final int l1=s1.ch.length(); | |
final int l2=s1.ch.length(); | |
if (l1<l2) return -1; | |
if (l1>l2) return 1; | |
//if (l1==l2) return 0; | |
return 0; | |
} | |
}; | |
public ArrayList<Pair> extractNoDuplicates() {return extractNoDuplicates(1,(short) 1);} | |
public ArrayList<Pair> extractNoDuplicates(int minwords,short minrepetition) | |
{ | |
ArrayList<Pair> terms=new ArrayList<Pair> (); | |
for(Entry<String, Short> p :counter.entrySet()) | |
{ | |
terms.add(new Pair(p.getKey(), p.getValue())); | |
} | |
Collections.sort(terms,abc_sort); | |
Collections.sort(terms,length_sort); | |
for(int i=0;i<terms.size()-1;) | |
{ | |
Pair pprev = terms.get(i); | |
Pair pnext = terms.get(i+1); | |
if(pnext.ch.startsWith(pprev.ch)) | |
{ | |
if(pprev.count==pnext.count) | |
terms.remove(i); | |
else | |
// { | |
// pprev.count=(short) (pprev.count-(pnext.count+1)); | |
i++; | |
// } | |
} | |
// maybe without length sort Collections.sort(terms,length_sort); | |
// else if(pprev.ch.startsWith(pnext.ch)) | |
// { | |
// if(pprev.count==pnext.count) | |
// terms.remove(i+1); | |
// else | |
// { | |
// pnext.count=(short) (pnext.count-pprev.count); | |
// i+=2; | |
// } | |
// } | |
else | |
i++; | |
} | |
for(int i=terms.size()-1;i>=0;i--) | |
{ | |
Pair pprev = terms.get(i); | |
if(pprev.count<=minrepetition || pprev.ch.split(",").length<=minwords || (!prevsequence.endsWith(pprev.ch)))terms.remove(i); | |
} | |
return terms; | |
} | |
} | |
public static void printList(String name, List<String> list){ | |
String o=""; | |
boolean first=true; | |
for(String elem : list){ | |
if(first)first=false; else o+=", "; | |
o+=elem; | |
} | |
System.out.println(name+"=["+o+"]"); | |
} | |
public static void printListPair(String name, List<Pair> list){ | |
boolean first=true; | |
System.out.println(name+"=["); | |
for(Pair p :list) | |
{ | |
System.out.println(p.ch+" = "+p.count+(first?"":",")); | |
if(first)first=false; | |
} | |
System.out.println("]"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment