Created
April 3, 2014 09:17
-
-
Save markharwood/9951177 to your computer and use it in GitHub Desktop.
Precision/Recall measures for a query using aggs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//================================ | |
// Here a script for gathering the precision/recall stats for a query (see http://en.wikipedia.org/wiki/Precision_and_recall) | |
// A candidate classifier query's effectiveness is determined by counting hits on pre-classified content | |
// If we compute the F-measure we can potentially use it as the fitness function for a genetic algo that mutates our query | |
// (introducing phrases, minShouldMatch clauses etc) to move us towards our target goal of balancing precision/recall in our classifier. | |
//================================= | |
// Our candidate query for classifying documents in a category | |
var candidateQuery={ "terms": {"body": ["vs", "shr", "cts", "net", "revs", "note", "loss", "mths", "shrs", "avg", "profit"]}}; | |
// Our filter criteria for identifying documents in our target category | |
var categoryTest= { "term" : { "topics" : "earn" } }; | |
// The name of our category field (used for summarising false positives) | |
var categoryField="topics"; | |
var queryJson={ | |
"query" :candidateQuery, | |
"aggs" : { | |
"globals":{ | |
"global":{}, | |
"aggs":{ | |
"requiredHits":{ | |
// The count of docs in this bucket minus that of "truePositive" bucket gives our false negative figure | |
"filter" : categoryTest, | |
}, | |
"requiredMisses":{ | |
// The count of docs in this bucket minus that of "falsePositive" bucket gives our true negative figure | |
"filter":{ "bool" : {"mustNot" : categoryTest } } | |
} | |
} | |
}, | |
"truePositive" : { | |
"filter" : categoryTest, | |
"aggs":{ | |
//For documents with >1 classification this will summarise any "also-connected" categories | |
"relatedCategories":{ | |
"terms" : {"field" : categoryField} | |
} | |
} | |
}, | |
"falsePositive":{ | |
"filter":{ | |
"bool" : { "mustNot" : categoryTest } | |
}, | |
"aggs":{ | |
//This will list all of the category fields we got a false positive on - identifies categories that are "close" | |
"fpCollateral":{"terms" : {"field" : categoryField} } | |
} | |
} | |
} | |
}; | |
return queryJson; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment