Created
September 15, 2012 03:23
-
-
Save bwhite/3726239 to your computer and use it in GitHub Desktop.
Ranking Metrics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Information Retrieval metrics | |
Useful Resources: | |
http://www.cs.utexas.edu/~mooney/ir-course/slides/Evaluation.ppt | |
http://www.nii.ac.jp/TechReports/05-014E.pdf | |
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf | |
http://hal.archives-ouvertes.fr/docs/00/72/67/60/PDF/07-busa-fekete.pdf | |
Learning to Rank for Information Retrieval (Tie-Yan Liu) | |
""" | |
import numpy as np | |
def mean_reciprocal_rank(rs): | |
"""Score is reciprocal of the rank of the first relevant item | |
First element is 'rank 1'. Relevance is binary (nonzero is relevant). | |
Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank | |
>>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]] | |
>>> mean_reciprocal_rank(rs) | |
0.61111111111111105 | |
>>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]]) | |
>>> mean_reciprocal_rank(rs) | |
0.5 | |
>>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]] | |
>>> mean_reciprocal_rank(rs) | |
0.75 | |
Args: | |
rs: Iterator of relevance scores (list or numpy) in rank order | |
(first element is the first item) | |
Returns: | |
Mean reciprocal rank | |
""" | |
rs = (np.asarray(r).nonzero()[0] for r in rs) | |
return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs]) | |
def r_precision(r): | |
"""Score is precision after all relevant documents have been retrieved | |
Relevance is binary (nonzero is relevant). | |
>>> r = [0, 0, 1] | |
>>> r_precision(r) | |
0.33333333333333331 | |
>>> r = [0, 1, 0] | |
>>> r_precision(r) | |
0.5 | |
>>> r = [1, 0, 0] | |
>>> r_precision(r) | |
1.0 | |
Args: | |
r: Relevance scores (list or numpy) in rank order | |
(first element is the first item) | |
Returns: | |
R Precision | |
""" | |
r = np.asarray(r) != 0 | |
z = r.nonzero()[0] | |
if not z.size: | |
return 0. | |
return np.mean(r[:z[-1] + 1]) | |
def precision_at_k(r, k): | |
"""Score is precision @ k | |
Relevance is binary (nonzero is relevant). | |
>>> r = [0, 0, 1] | |
>>> precision_at_k(r, 1) | |
0.0 | |
>>> precision_at_k(r, 2) | |
0.0 | |
>>> precision_at_k(r, 3) | |
0.33333333333333331 | |
>>> precision_at_k(r, 4) | |
Traceback (most recent call last): | |
File "<stdin>", line 1, in ? | |
ValueError: Relevance score length < k | |
Args: | |
r: Relevance scores (list or numpy) in rank order | |
(first element is the first item) | |
Returns: | |
Precision @ k | |
Raises: | |
ValueError: len(r) must be >= k | |
""" | |
assert k >= 1 | |
r = np.asarray(r)[:k] != 0 | |
if r.size != k: | |
raise ValueError('Relevance score length < k') | |
return np.mean(r) | |
def average_precision(r): | |
"""Score is average precision (area under PR curve) | |
Relevance is binary (nonzero is relevant). | |
>>> r = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1] | |
>>> delta_r = 1. / sum(r) | |
>>> sum([sum(r[:x + 1]) / (x + 1.) * delta_r for x, y in enumerate(r) if y]) | |
0.7833333333333333 | |
>>> average_precision(r) | |
0.78333333333333333 | |
Args: | |
r: Relevance scores (list or numpy) in rank order | |
(first element is the first item) | |
Returns: | |
Average precision | |
""" | |
r = np.asarray(r) != 0 | |
out = [precision_at_k(r, k + 1) for k in range(r.size) if r[k]] | |
if not out: | |
return 0. | |
return np.mean(out) | |
def mean_average_precision(rs): | |
"""Score is mean average precision | |
Relevance is binary (nonzero is relevant). | |
>>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1]] | |
>>> mean_average_precision(rs) | |
0.78333333333333333 | |
>>> rs = [[1, 1, 0, 1, 0, 1, 0, 0, 0, 1], [0]] | |
>>> mean_average_precision(rs) | |
0.39166666666666666 | |
Args: | |
rs: Iterator of relevance scores (list or numpy) in rank order | |
(first element is the first item) | |
Returns: | |
Mean average precision | |
""" | |
return np.mean([average_precision(r) for r in rs]) | |
def dcg_at_k(r, k, method=0): | |
"""Score is discounted cumulative gain (dcg) | |
Relevance is positive real values. Can use binary | |
as the previous methods. | |
Example from | |
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf | |
>>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0] | |
>>> dcg_at_k(r, 1) | |
3.0 | |
>>> dcg_at_k(r, 1, method=1) | |
3.0 | |
>>> dcg_at_k(r, 2) | |
5.0 | |
>>> dcg_at_k(r, 2, method=1) | |
4.2618595071429155 | |
>>> dcg_at_k(r, 10) | |
9.6051177391888114 | |
>>> dcg_at_k(r, 11) | |
9.6051177391888114 | |
Args: | |
r: Relevance scores (list or numpy) in rank order | |
(first element is the first item) | |
k: Number of results to consider | |
method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...] | |
If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...] | |
Returns: | |
Discounted cumulative gain | |
""" | |
r = np.asfarray(r)[:k] | |
if r.size: | |
if method == 0: | |
return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) | |
elif method == 1: | |
return np.sum(r / np.log2(np.arange(2, r.size + 2))) | |
else: | |
raise ValueError('method must be 0 or 1.') | |
return 0. | |
def ndcg_at_k(r, k, method=0): | |
"""Score is normalized discounted cumulative gain (ndcg) | |
Relevance is positive real values. Can use binary | |
as the previous methods. | |
Example from | |
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf | |
>>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0] | |
>>> ndcg_at_k(r, 1) | |
1.0 | |
>>> r = [2, 1, 2, 0] | |
>>> ndcg_at_k(r, 4) | |
0.9203032077642922 | |
>>> ndcg_at_k(r, 4, method=1) | |
0.96519546960144276 | |
>>> ndcg_at_k([0], 1) | |
0.0 | |
>>> ndcg_at_k([1], 2) | |
1.0 | |
Args: | |
r: Relevance scores (list or numpy) in rank order | |
(first element is the first item) | |
k: Number of results to consider | |
method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...] | |
If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...] | |
Returns: | |
Normalized discounted cumulative gain | |
""" | |
dcg_max = dcg_at_k(sorted(r, reverse=True), k, method) | |
if not dcg_max: | |
return 0. | |
return dcg_at_k(r, k, method) / dcg_max | |
if __name__ == "__main__": | |
import doctest | |
doctest.testmod() |
is IDCG calculated across all the queries or IDCG for each query for calculating NDCG?
Has anyone made this into a pypi package? If not @bwhite, would you mind if I went ahead and made it into one? I'd of course accredit you. I'm just tired of copying and pasting this code because it is super useful haha
Made this into a cute little pypi package if anyone is interested: https://github.com/ncoop57/cute_ranking
i'm trying to learning about ndcg, but there's one thing i don't understand about the code. What is "method" variable ? is that something that produce same two maximum weights ?
method variable distinguishes between two ways of giving weights to relevances while calculating DCG
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@lucidyan , @cuteapi
The code is correct if you assume that the ranking list contains all the relevant documents that need to be retrieved. In your example, the query with ranking list r=[1,0,0] retrieves 3 documents, but only one is relevant, which is in the top position, so your Average Precision is 1.0. Note that Mean Average Precision assumes that each query is independent of each other, and in your example, there is no reason to believe that every query has to retrieve always 3 relevant documents. For your example r=[1,1,0] and r=[1,0,0] the relevant documents are 2 and 1 respectively, because the code assumes that the total number of 1's in your list is the total number of relevant documents (there are supposed to be no misses in the ranked list).
This is a strong assumption in this code, but it does not make the implementation incorrect. You need to be aware that the ranking list that you pass has to contain all the positions where relevant documents appear. If that is not the case, you need to use another implementation that takes into account recall, which is the missing piece in this code.