Created
April 28, 2013 11:15
-
-
Save mocobeta/5476604 to your computer and use it in GitHub Desktop.
Lucene API で TF-IDF 値を計算するテスト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package termvector; | |
import java.io.File; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.List; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.index.DirectoryReader; | |
import org.apache.lucene.index.IndexReader; | |
import org.apache.lucene.index.Term; | |
import org.apache.lucene.index.Terms; | |
import org.apache.lucene.index.TermsEnum; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.FSDirectory; | |
import org.apache.lucene.util.BytesRef; | |
public class TermVectorTest { | |
public static void main(String[] args) throws IOException { | |
Directory directory = FSDirectory.open(new File("tumblrdata")); | |
IndexReader reader = DirectoryReader.open(directory); | |
showTfIDf(reader); | |
reader.close(); | |
} | |
private static void showTfIDf(IndexReader reader) throws IOException { | |
int maxDoc = reader.maxDoc(); | |
for (int i = 0; i < maxDoc; i++) { | |
// "content" フィールドの Term Vector を取得する | |
Terms vector = reader.getTermVector(i, "content"); | |
TermsEnum itr = vector.iterator(null); | |
BytesRef ref = null; | |
List<TermFreq> list = new ArrayList<TermFreq>(); | |
long tcSum = 0; | |
// Vector 中の単語をイテレート | |
while((ref = itr.next()) != null) { | |
String term = ref.utf8ToString(); // UTF-8 文字列 | |
TermFreq freq = new TermFreq(term, maxDoc); | |
// ドキュメント内の単語出現回数 | |
freq.setTc(itr.totalTermFreq()); | |
// 単語の出現するドキュメント数 | |
freq.setDf(reader.docFreq(new Term("content", term))); | |
list.add(freq); | |
tcSum += itr.totalTermFreq(); | |
} | |
// ドキュメント内の全単語の出現回数をセット (TF値の計算で使用) | |
for (TermFreq freq : list) { freq.setTcSum(tcSum); } | |
Document doc = reader.document(i); | |
System.out.println(doc.get("id") + ": " + doc.get("title")); | |
Collections.sort(list, new TermFreqComparatorByTFIDF()); | |
for (int j = 0; j < 10 && j < list.size(); j++) { | |
System.out.println("\t" + list.get(j)); | |
} | |
} | |
} | |
} | |
/**
 * Holds the frequency statistics of a single term within one document and
 * computes its TF-IDF value from them.
 */
class TermFreq {
    /** The term text. */
    private final String term;
    /** Number of occurrences of the term in the document. */
    private long tc;
    /** Total number of occurrences of all terms in the document. */
    private long tcSum;
    /** Number of documents in which the term occurs. */
    private int df;
    /** Total number of documents in the index. */
    private final int maxDoc;

    /**
     * @param term   the term text
     * @param maxDoc total number of documents in the index
     */
    TermFreq(String term, int maxDoc) {
        this.term = term;
        this.maxDoc = maxDoc;
    }

    /** @return the term text */
    String getTerm() {
        return term;
    }

    /** @param tc number of occurrences of the term in the document */
    void setTc(long tc) {
        this.tc = tc;
    }

    /** @return number of occurrences of the term in the document */
    long getTc() {
        return tc;
    }

    /** @param tcSum total number of occurrences of all terms in the document */
    void setTcSum(long tcSum) {
        this.tcSum = tcSum;
    }

    /** @param df number of documents in which the term occurs */
    void setDf(int df) {
        this.df = df;
    }

    /** @return number of documents in which the term occurs */
    int getDf() {
        return df;
    }

    /**
     * Computes the TF-IDF value as {@code (tc / tcSum) * ln(maxDoc / df)}.
     *
     * @return the TF-IDF value, or {@code 0.0} when {@code tcSum}, {@code df}
     *         or {@code maxDoc} is not positive (the formula would otherwise
     *         divide by zero and yield NaN or an infinity)
     */
    double calcTFIDF() {
        if (tcSum <= 0 || df <= 0 || maxDoc <= 0) {
            return 0.0;
        }
        double tf = (double) tc / (double) tcSum;
        double idf = Math.log((double) maxDoc / (double) df);
        return tf * idf;
    }

    /** @return the term followed by a tab and its TF-IDF value */
    @Override
    public String toString() {
        return term + "\tTF-IDF: " + calcTFIDF();
    }
}
/** TFIDF値の大きい順に並べるComparator */ | |
class TermFreqComparatorByTFIDF implements Comparator<TermFreq> { | |
@Override | |
public int compare(TermFreq freq1, TermFreq freq2) { | |
double tfidf1 = freq1.calcTFIDF(); | |
double tfidf2 = freq2.calcTFIDF(); | |
if (tfidf1 > tfidf2) { | |
return -1; | |
} else if (tfidf1 < tfidf2){ | |
return 1; | |
} else { | |
return 0; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment