Skip to content

Instantly share code, notes, and snippets.

@yuikns
Last active December 23, 2016 20:53
Show Gist options
  • Save yuikns/84f2d4a95c340efe67ee60c8b507d5f2 to your computer and use it in GitHub Desktop.
Save yuikns/84f2d4a95c340efe67ee60c8b507d5f2 to your computer and use it in GitHub Desktop.
package com.argcv.util;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
/**
*
* @author yu
*
* @param <E>
* doc type
*/
public class Index<E> {
int maxDocId;
Map<Integer, E> doc;
Map<Integer, Integer> meta;
Map<String, Map<Integer, Integer>> index;
public Index() {
maxDocId = 0;
doc = new HashMap<Integer, E>();
meta = new HashMap<Integer, Integer>();
index = new HashMap<String, Map<Integer, Integer>>();
}
@SuppressWarnings("unchecked")
public int insert(String s) {
return insert((E) s, s);
}
public synchronized int insert(E d, String s) {
doc.put(maxDocId, d);
Map<String, Integer> tokens = Strings.tokenExtractor(s);
Iterator<Entry<String, Integer>> it = tokens.entrySet().iterator();
int tokenSz = 0;
while (it.hasNext()) {
Entry<String, Integer> next = it.next();
tokenSz += next.getValue();
Map<Integer, Integer> terms = index.containsKey(next.getKey()) ? index.get(next.getKey()) : new HashMap<Integer, Integer>();
terms.put(maxDocId, next.getValue());
index.put(next.getKey(), terms);
}
meta.put(maxDocId, tokenSz);
doc.put(maxDocId, d);
maxDocId++;
return maxDocId - 1; // return doc id
}
public E get(Integer docId) {
return doc.get(docId);
}
public int size() {
return maxDocId;
}
public List<Pair<E, Double>> query(String s) {
List<Pair<E, Double>> result = new ArrayList<Pair<E, Double>>();
Map<String, Integer> tokens = Strings.tokenExtractor(s);
Iterator<Entry<String, Integer>> it = tokens.entrySet().iterator();
Map<Integer, Double> rst = new HashMap<Integer, Double>();
while (it.hasNext()) {
Entry<String, Integer> next = it.next();
if (index.containsKey(next.getKey())) {
Iterator<Entry<Integer, Integer>> idx = index.get(next.getKey()).entrySet().iterator();
int dscct = index.get(next.getKey()).size();
while (idx.hasNext()) {
Entry<Integer, Integer> next2 = idx.next();
if (rst.containsKey(next2.getKey())) {
rst.put(next2.getKey(), rst.get(next2.getKey()) + tfidf(next2.getValue(), meta.get(next2.getKey()), maxDocId, dscct));
} else {
rst.put(next2.getKey(), tfidf(next2.getValue(), meta.get(next2.getKey()), maxDocId, dscct));
}
}
}
}
Iterator<Entry<Integer, Double>> it2 = rst.entrySet().iterator();
while (it2.hasNext()) {
Entry<Integer, Double> next = it2.next();
result.add(new Pair<E, Double>(doc.get(next.getKey()), next.getValue()));
}
Collections.sort(result, new Comparator<Pair<E, Double>>() {
@Override
public int compare(Pair<E, Double> o1, Pair<E, Double> o2) {
return o2.getValue().compareTo(o1.getValue());
}
});
return result;
}
// # tf-idf k in document D
// # stid : size of term k in D
// # atsid : all term size in D
// # ads : all document size
// # dscct : document size contains current term
public double tfidf(int stid, int atsid, int ads, int dscct) {
if (ads == 0 || atsid == 0 || dscct == 0)
return 0;
return ((double) stid / atsid) * Math.log((double) ads / dscct) / 2.302585;
}
// public static void main(String[] args) {
// Index<String> idx = new Index<String>();
// idx.insert("aaa bbb ccc ddd");
// idx.insert("bbb ccc ddd");
// idx.insert("ddd eee fff");
// List<Pair<String, Double>> query = idx.query("fff");
// for (Pair<String, Double> q : query) {
// System.out.println(q.getValue() + "\t" + q.getKey());
// }
// }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment