Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Created April 28, 2013 11:15
Show Gist options
  • Save mocobeta/5476604 to your computer and use it in GitHub Desktop.
Save mocobeta/5476604 to your computer and use it in GitHub Desktop.
Lucene API で TF-IDF 値を計算するテスト
package termvector;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
/**
 * Command-line demo that prints, for each document of a Lucene index, the
 * terms of the "content" field with the highest TF-IDF scores.
 */
public class TermVectorTest {

    /** Maximum number of terms printed per document. */
    private static final int TOP_TERMS = 10;

    /**
     * Opens the index stored in the "tumblrdata" directory, prints the
     * per-document TF-IDF ranking and releases the index resources.
     *
     * @param args ignored
     * @throws IOException if the index cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        Directory directory = FSDirectory.open(new File("tumblrdata"));
        try {
            IndexReader reader = DirectoryReader.open(directory);
            try {
                showTfIDf(reader);
            } finally {
                // Close the reader even when reading the index fails.
                reader.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Walks over every document number below {@code reader.maxDoc()} and
     * prints the document's "id" and "title" stored fields followed by its
     * top terms, sorted by descending TF-IDF.
     *
     * <p>Documents that have no term vector for the "content" field are
     * skipped.
     *
     * @param reader reader over the index to inspect
     * @throws IOException if reading the index fails
     */
    private static void showTfIDf(IndexReader reader) throws IOException {
        int maxDoc = reader.maxDoc();
        for (int i = 0; i < maxDoc; i++) {
            // Fetch the term vector of the "content" field.
            Terms vector = reader.getTermVector(i, "content");
            if (vector == null) {
                // No term vector was stored for this document/field;
                // without this guard the iteration below would throw a
                // NullPointerException.
                continue;
            }

            List<TermFreq> list = collectTermFreqs(reader, vector, maxDoc);

            Document doc = reader.document(i);
            System.out.println(doc.get("id") + ": " + doc.get("title"));

            Collections.sort(list, new TermFreqComparatorByTFIDF());
            for (int j = 0; j < TOP_TERMS && j < list.size(); j++) {
                System.out.println("\t" + list.get(j));
            }
        }
    }

    /**
     * Builds one {@link TermFreq} per term of a single document's term
     * vector, filled with the term count, the document frequency and the
     * sum of all term counts of the vector.
     *
     * @param reader reader used to look up document frequencies
     * @param vector term vector of the "content" field of one document
     * @param maxDoc value of {@code reader.maxDoc()}, used as the document
     *               total in the IDF computation
     * @return statistics for every term of the vector, in iteration order
     * @throws IOException if reading the index fails
     */
    private static List<TermFreq> collectTermFreqs(IndexReader reader, Terms vector, int maxDoc)
            throws IOException {
        List<TermFreq> list = new ArrayList<TermFreq>();
        TermsEnum itr = vector.iterator(null);
        long tcSum = 0;
        BytesRef ref;
        // Iterate over the terms of the vector.
        while ((ref = itr.next()) != null) {
            String term = ref.utf8ToString(); // decode the UTF-8 term bytes
            // Number of occurrences of the term within this document.
            long tc = itr.totalTermFreq();
            TermFreq freq = new TermFreq(term, maxDoc);
            freq.setTc(tc);
            // Number of documents in which the term occurs.
            freq.setDf(reader.docFreq(new Term("content", term)));
            list.add(freq);
            tcSum += tc;
        }
        // The total is only known after the full pass; it is the TF denominator.
        for (TermFreq freq : list) {
            freq.setTcSum(tcSum);
        }
        return list;
    }
}
/**
 * Holds the frequency statistics of one term within one document and
 * computes the term's TF-IDF score from them.
 */
class TermFreq {
    /** The term text. */
    private final String term;
    /** Number of occurrences of the term in the document. */
    private long tc;
    /** Total number of occurrences of all terms in the document. */
    private long tcSum;
    /** Number of documents in which the term occurs. */
    private int df;
    /** Total number of documents in the index. */
    private final int maxDoc;

    /**
     * @param term   the term text
     * @param maxDoc total number of documents in the index
     */
    TermFreq(String term, int maxDoc) {
        this.term = term;
        this.maxDoc = maxDoc;
    }

    /** @return the term text */
    String getTerm() { return term; }

    /** @param tc number of occurrences of the term in the document */
    void setTc(long tc) { this.tc = tc; }

    /** @return number of occurrences of the term in the document */
    long getTc() { return tc; }

    /** @param tcSum total number of occurrences of all terms in the document */
    void setTcSum(long tcSum) { this.tcSum = tcSum; }

    /** @param df number of documents in which the term occurs */
    void setDf(int df) { this.df = df; }

    /** @return number of documents in which the term occurs */
    int getDf() { return df; }

    /**
     * Computes the TF-IDF score as {@code (tc / tcSum) * ln(maxDoc / df)}.
     *
     * @return the score, or {@code 0.0} when {@code tcSum}, {@code df} or
     *         {@code maxDoc} is not positive (statistics missing), so that
     *         the result is never NaN or infinite because of a zero divisor
     */
    double calcTFIDF() {
        if (tcSum <= 0 || df <= 0 || maxDoc <= 0) {
            // Dividing by zero here would yield NaN/Infinity, which breaks
            // sorting and is meaningless as a score.
            return 0.0;
        }
        double tf = (double) tc / (double) tcSum;
        double idf = Math.log((double) maxDoc / (double) df);
        return tf * idf;
    }

    /** @return the term followed by a tab and its TF-IDF score */
    @Override
    public String toString() {
        return term + "\tTF-IDF: " + calcTFIDF();
    }
}
/**
 * Comparator that orders {@link TermFreq} instances by descending TF-IDF
 * score.
 *
 * <p>NaN scores are ordered after every other score and are equal to each
 * other. Treating NaN as equal to everything (what plain {@code <} /
 * {@code >} checks do) is not transitive and violates the
 * {@link Comparator} contract.
 */
class TermFreqComparatorByTFIDF implements Comparator<TermFreq> {
    /**
     * @param freq1 first term statistics
     * @param freq2 second term statistics
     * @return a negative value if {@code freq1} has the larger score, a
     *         positive value if {@code freq2} has the larger score, and
     *         zero if both scores are equal
     */
    @Override
    public int compare(TermFreq freq1, TermFreq freq2) {
        double tfidf1 = freq1.calcTFIDF();
        double tfidf2 = freq2.calcTFIDF();
        boolean nan1 = Double.isNaN(tfidf1);
        boolean nan2 = Double.isNaN(tfidf2);
        if (nan1 || nan2) {
            // Push undefined scores to the end of the ranking.
            if (nan1 && nan2) {
                return 0;
            }
            return nan1 ? 1 : -1;
        }
        // Arguments are swapped to obtain descending order.
        return Double.compare(tfidf2, tfidf1);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment