Created
June 8, 2012 15:40
-
-
Save anvie/2896225 to your computer and use it in GitHub Desktop.
Lucene Value Scorer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package tests; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.WhitespaceAnalyzer; | |
import org.apache.lucene.document.Document; | |
import org.apache.lucene.document.Field; | |
import org.apache.lucene.document.NumericField; | |
import org.apache.lucene.index.IndexWriter; | |
import org.apache.lucene.queryParser.QueryParser; | |
import org.apache.lucene.search.IndexSearcher; | |
import org.apache.lucene.search.Query; | |
import org.apache.lucene.search.ScoreDoc; | |
import org.apache.lucene.search.TopDocs; | |
import org.apache.lucene.search.function.CustomScoreQuery; | |
import org.apache.lucene.search.function.IntFieldSource; | |
import org.apache.lucene.search.function.ValueSourceQuery; | |
import org.apache.lucene.store.Directory; | |
import org.apache.lucene.store.RAMDirectory; | |
import org.apache.lucene.util.Version; | |
import junit.framework.TestCase; | |
public class AgeAndContentScoreQueryTest extends TestCase | |
{ | |
public class AgeAndContentScoreQuery extends CustomScoreQuery | |
{ | |
protected float peakX; | |
protected float sigma; | |
public AgeAndContentScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery, float peakX, float sigma) { | |
super(subQuery, valSrcQuery); | |
this.setStrict(true); // do not normalize score values from ValueSourceQuery! | |
this.peakX = peakX; // age for which the age-relevance is best | |
this.sigma = sigma; | |
} | |
@Override | |
public float customScore(int doc, float subQueryScore, float valSrcScore){ | |
// subQueryScore is td-idf score from content query | |
float contentScore = subQueryScore; | |
// valSrcScore is a value of date-of-birth field, represented as a float | |
// let's convert age value to gaussian-like age relevance score | |
float x = (2011 - valSrcScore); // age | |
float ageScore = (float) Math.exp(-Math.pow(x - peakX, 2) / 2*sigma*sigma); | |
float finalScore = ageScore * contentScore; | |
System.out.println("#contentScore: " + contentScore); | |
System.out.println("#ageValue: " + (int)valSrcScore); | |
System.out.println("#ageScore: " + ageScore); | |
System.out.println("#finalScore: " + finalScore); | |
System.out.println("+++++++++++++++++"); | |
return finalScore; | |
} | |
} | |
protected Directory directory; | |
protected Analyzer analyzer = new WhitespaceAnalyzer(); | |
protected String fieldNameContent = "content"; | |
protected String fieldNameDOB = "dob"; | |
protected void setUp() throws Exception | |
{ | |
directory = new RAMDirectory(); | |
analyzer = new WhitespaceAnalyzer(); | |
// indexed documents | |
String[] contents = {"foo baz1", "foo baz2 baz3", "baz4"}; | |
int[] dobs = {1991, 1981, 1987}; // date of birth | |
IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED); | |
for (int i = 0; i < contents.length; i++) | |
{ | |
Document doc = new Document(); | |
doc.add(new Field(fieldNameContent, contents[i], Field.Store.YES, Field.Index.ANALYZED)); // store & index | |
doc.add(new NumericField(fieldNameDOB, Field.Store.YES, true).setIntValue(dobs[i])); // store & index | |
writer.addDocument(doc); | |
} | |
writer.close(); | |
} | |
public void testSearch() throws Exception | |
{ | |
String inputTextQuery = "foo bar"; | |
float peak = 27.0f; | |
float sigma = 0.1f; | |
QueryParser parser = new QueryParser(Version.LUCENE_30, fieldNameContent, analyzer); | |
Query contentQuery = parser.parse(inputTextQuery); | |
ValueSourceQuery dobQuery = new ValueSourceQuery( new IntFieldSource(fieldNameDOB) ); | |
// or: FieldScoreQuery dobQuery = new FieldScoreQuery(fieldNameDOB,Type.INT); | |
CustomScoreQuery finalQuery = new AgeAndContentScoreQuery(contentQuery, dobQuery, peak, sigma); | |
IndexSearcher searcher = new IndexSearcher(directory); | |
TopDocs docs = searcher.search(finalQuery, 10); | |
System.out.println("\nDocuments found:\n"); | |
for(ScoreDoc match : docs.scoreDocs) | |
{ | |
Document d = searcher.doc(match.doc); | |
System.out.println("CONTENT: " + d.get(fieldNameContent) ); | |
System.out.println("D.O.B.: " + d.get(fieldNameDOB) ); | |
System.out.println("SCORE: " + match.score ); | |
System.out.println("-----------------"); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment