import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

// Groups a related set of terms, typically from the results of some auto-expansion,
// and provides the average DocFreq of the set in order to avoid Lucene's IDF ranking
// favouring the rarest interpretation, which is often a poor choice for auto-expanded
// terms, e.g. the terms produced by a fuzzy query or by trying alternative fields.
class CommonIDFContext {
    int commonDf = -1;
    Map<Term, Integer> balancedDfs;
    List<Term> commonTerms = new ArrayList<Term>();

    public void add(Term unbalancedQueryTerm) {
        commonTerms.add(unbalancedQueryTerm);
    }

    // Gets a "balanced" docFreq for a Term by using the averaged DF across related Terms. Rather than a
    // completely level playing field returning the average for all related terms, a small bias is added for
    // the more popular terms. Lucene naturally favours the rarest terms, so this "popularity" bias is
    // (somewhat perversely) manifested as smaller docFreqs. This approach allows us to use a balanced DF
    // that plays nicely with other sets of terms but, where all other things are equal, the small
    // "popularity" bias helps ensure that the various terms in this set are ranked correctly in relation
    // to each other (i.e. the most likely interpretation comes first).
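    // Worked example (hypothetical numbers, added for illustration): for three alternative terms with
    // raw docFreqs of 1000, 50 and 2, the average is (1000 + 50 + 2) / 3 = 350 (integer division), so
    // the balanced docFreqs come out as 350, 351 and 352 respectively - the most popular term keeps the
    // smallest value and therefore gets the largest IDF.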
    public int getBalancedDf(IndexReader reader, Term queryTerm) throws IOException {
        if (balancedDfs == null) {
            balancedDfs = new HashMap<Term, Integer>();
            long totalDocFreq = 0;
            int totalNumNonZeroDfTerms = 0;
            int[] dfs = new int[commonTerms.size()];
            for (int i = 0; i < commonTerms.size(); i++) {
                dfs[i] = reader.docFreq(commonTerms.get(i));
                if (dfs[i] > 0) {
                    totalDocFreq += dfs[i];
                    totalNumNonZeroDfTerms++;
                }
            }
            // Average the DF only for those fields that actually have the term
            if (totalNumNonZeroDfTerms == 0) {
                commonDf = 0;
            } else {
                commonDf = (int) (totalDocFreq / totalNumNonZeroDfTerms);
            }
            // Rather than share a common DF for all fields, we introduce a
            // little bias towards the more popular interpretation.
            // We use the average as the basis for DF and add 1 to the doc freq
            // for every other field that is more popular.
            for (int i = 0; i < dfs.length; i++) {
                if (dfs[i] == 0) {
                    continue;
                }
                Term thisTerm = commonTerms.get(i);
                int numOfOtherFieldsMorePopular = 0;
                for (int j = 0; j < dfs.length; j++) {
                    if ((dfs[j] > 0) && (dfs[i] < dfs[j])) {
                        // Other field is more popular
                        numOfOtherFieldsMorePopular++;
                    }
                }
                // Adjust from the average DF - add more docs (effectively
                // penalizing) where there are more popular alternatives
                balancedDfs.put(thisTerm, commonDf + numOfOtherFieldsMorePopular);
            }
        }
        Integer result = balancedDfs.get(queryTerm);
        if (result == null) {
            return 0;
        }
        return result;
    }
}
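
// Usage sketch (an addition, not part of the original gist): the index path, field names and query
// word below are hypothetical placeholders, and the reader-opening calls assume Lucene 5+ style APIs
// (FSDirectory.open(Path), DirectoryReader). The same word is tried against two alternative fields
// and the balanced document frequency is read back for each interpretation.
class CommonIDFContextExample {
    public static void main(String[] args) throws Exception {
        try (IndexReader reader = org.apache.lucene.index.DirectoryReader.open(
                org.apache.lucene.store.FSDirectory.open(java.nio.file.Paths.get("path/to/index")))) {
            // Two alternative interpretations of the same user word
            Term titleTerm = new Term("title", "quick");
            Term bodyTerm = new Term("body", "quick");

            CommonIDFContext context = new CommonIDFContext();
            context.add(titleTerm);
            context.add(bodyTerm);

            // Both terms now share roughly the same docFreq; the more popular interpretation
            // keeps the smaller value and so scores marginally higher under IDF
            System.out.println("title balanced df = " + context.getBalancedDf(reader, titleTerm));
            System.out.println("body balanced df  = " + context.getBalancedDf(reader, bodyTerm));
        }
    }
}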