import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

// Groups a related set of terms, typically from the results of some auto-expansion,
// and provides the average DocFreq of the set in order to avoid Lucene's IDF ranking
// favouring the rarest interpretation, which is often a poor choice for auto-expanded
// terms, e.g. the terms produced by a fuzzy query or by trying alternative fields.
class CommonIDFContext {
    int commonDf = -1;
    Map<Term, Integer> balancedDfs;
    List<Term> commonTerms = new ArrayList<Term>();

    public void add(Term unbalancedQueryTerm) {
        commonTerms.add(unbalancedQueryTerm);
    }

    // Gets a "balanced" docFreq for a Term by using the averaged DF across related Terms. Rather than a
    // completely level playing field returning the average for all related terms, a small bias is added for
    // the more popular terms. Lucene naturally favours the rarest terms, so this "popularity" bias is
    // (somewhat perversely) manifested as smaller docFreqs. This approach allows us to use a balanced DF
    // that plays nicely with other sets of terms but, where all other things are equal, the small
    // "popularity" bias helps ensure that the various terms in this set are ranked correctly in relation
    // to each other (i.e. the most likely interpretation comes first).
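    // Worked example (hypothetical numbers, added for illustration): for three alternative terms with
    // raw docFreqs of 1000, 50 and 2, the average is (1000 + 50 + 2) / 3 = 350 (integer division), so
    // the balanced docFreqs come out as 350, 351 and 352 respectively - the most popular term keeps the
    // smallest value and therefore gets the largest IDF.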
    public int getBalancedDf(IndexReader reader, Term queryTerm) throws IOException {
        if (balancedDfs == null) {
            balancedDfs = new HashMap<Term, Integer>();
            long totalDocFreq = 0;
            int totalNumNonZeroDfTerms = 0;
            int[] dfs = new int[commonTerms.size()];
            for (int i = 0; i < commonTerms.size(); i++) {
                dfs[i] = reader.docFreq(commonTerms.get(i));
                if (dfs[i] > 0) {
                    totalDocFreq += dfs[i];
                    totalNumNonZeroDfTerms++;
                }
            }
            // Average the DF only for those fields that actually have the term
            if (totalNumNonZeroDfTerms == 0) {
                commonDf = 0;
            } else {
                commonDf = (int) (totalDocFreq / totalNumNonZeroDfTerms);
            }
            // Rather than share a common DF for all fields, we introduce a
            // little bias towards the more popular interpretation.
            // We use the average as the basis for DF and add 1 to the doc freq
            // for every other field that is more popular.
            for (int i = 0; i < dfs.length; i++) {
                if (dfs[i] == 0) {
                    continue;
                }
                Term thisTerm = commonTerms.get(i);
                int numOfOtherFieldsMorePopular = 0;
                for (int j = 0; j < dfs.length; j++) {
                    if ((dfs[j] > 0) && (dfs[i] < dfs[j])) {
                        // Other field is more popular
                        numOfOtherFieldsMorePopular++;
                    }
                }
                // Adjust from the average DF - add more docs (effectively
                // penalizing) where there are more popular alternatives
                balancedDfs.put(thisTerm, commonDf + numOfOtherFieldsMorePopular);
            }
        }
        Integer result = balancedDfs.get(queryTerm);
        if (result == null) {
            return 0;
        }
        return result;
    }
}
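
// Usage sketch (an addition, not part of the original gist): the index path, field names and query
// word below are hypothetical placeholders, and the reader-opening calls assume Lucene 5+ style APIs
// (FSDirectory.open(Path), DirectoryReader). The same word is tried against two alternative fields
// and the balanced document frequency is read back for each interpretation.
class CommonIDFContextExample {
    public static void main(String[] args) throws Exception {
        try (IndexReader reader = org.apache.lucene.index.DirectoryReader.open(
                org.apache.lucene.store.FSDirectory.open(java.nio.file.Paths.get("path/to/index")))) {
            // Two alternative interpretations of the same user word
            Term titleTerm = new Term("title", "quick");
            Term bodyTerm = new Term("body", "quick");

            CommonIDFContext context = new CommonIDFContext();
            context.add(titleTerm);
            context.add(bodyTerm);

            // Both terms now share roughly the same docFreq; the more popular interpretation
            // keeps the smaller value and so scores marginally higher under IDF
            System.out.println("title balanced df = " + context.getBalancedDf(reader, titleTerm));
            System.out.println("body balanced df  = " + context.getBalancedDf(reader, bodyTerm));
        }
    }
}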