-
-
Save dondon2475848/8243a0611462219c0b42ecd6d39d8fdb to your computer and use it in GitHub Desktop.
Ranking Metrics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
程式參考自: | |
https://gist.github.com/bwhite/3726239 | |
https://gist.github.com/gumption/b54278ec9bab2c0e0472816d1d7663be | |
差異:新增「 sum (2^rel_i - 1) / log2(i + 1) 」的版本 | |
作者:Jie Dondon | |
版本:ndcg_dondon_20180201_v2 | |
""" | |
import numpy as np | |
def dcg_at_k(r, k, method=0): | |
"""Score is discounted cumulative gain (dcg) | |
Relevance is positive real values. Can use binary as the previous methods. | |
There is a typographical error on the formula referenced in the original definition of this function: | |
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf | |
log2(i) should be log2(i+1) | |
The formulas here are derived from | |
https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Discounted_Cumulative_Gain | |
The formulas return the same results when r contains only binary values | |
>>> r = [2,3,2,3,1,1] | |
>>> dcg_at_k(r,6,1) | |
12.674304175856518 | |
>>> r = [3,3,2,2,1,1] | |
>>> dcg_at_k(r,6,1) | |
14.951597943562946 | |
Args: | |
r: Relevance scores (list or numpy array) in rank order | |
(first element is the most relevant item) | |
k: Number of results to consider | |
method: If 0 then sum rel_i / log2(i + 1) [not log2(i)] | |
If 1 then sum (2^rel_i - 1) / log2(i + 1) | |
Returns: | |
Discounted cumulative gain | |
""" | |
r = np.asfarray(r)[:k] | |
if r.size: | |
if method == 0: | |
return np.sum(r / np.log2(np.arange(2, r.size + 2))) | |
elif method == 1 : | |
return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2))) | |
else: | |
raise ValueError('method must in [0,1]') | |
return 0. | |
def ndcg_at_k(r, k, method=0): | |
"""Score is normalized discounted cumulative gain (ndcg) | |
Relevance is positive real values. Can use binary | |
as the previous methods. | |
Example from | |
2013-Introduction to Information Retrieval Evaluation p.3 | |
(http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf) | |
↑ 此份文件的公式有誤,導致提供的結果也是錯誤的 | |
2012-微軟亞洲研究院-武威-機器學習及排序學習基礎 p.36 | |
>>> r = [2, 1, 2, 0] | |
>>> ndcg_at_k(r,4,0) | |
0.96519546960144276 | |
>>> r = [2,3,2,3,1,1] | |
0.84768893757694552 | |
Args: | |
r: Relevance scores (list or numpy array) in rank order | |
(first element is the most relevant item) | |
k: Number of results to consider | |
method: If 0 then sum rel_i / log2(i + 1) [not log2(i)] | |
If 1 then sum (2^rel_i - 1) / log2(i + 1) | |
Returns: | |
Normalized discounted cumulative gain | |
""" | |
dcg_max = dcg_at_k(sorted(r, reverse=True), k, method) | |
if not dcg_max: | |
return 0. | |
return dcg_at_k(r, k, method) / dcg_max |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment