Skip to content

Instantly share code, notes, and snippets.

@tokestermw
Last active August 31, 2016 17:46
Show Gist options
  • Save tokestermw/e3b8474d1aed94d661a8085d248a804f to your computer and use it in GitHub Desktop.
Save tokestermw/e3b8474d1aed94d661a8085d248a804f to your computer and use it in GitHub Desktop.
use probability distributions instead of ratios or counts
from __future__ import division
def average(x):
    """Return the arithmetic mean of the sized collection *x*.

    Relies on true division (Python 3, or ``from __future__ import
    division`` on Python 2) so that integer inputs yield a fractional mean.

    Raises:
        ZeroDivisionError: if *x* is empty.
    """
    return sum(x) / len(x)
def wilson_lower_bound(count, total, z_score=1.96):
    """Return the lower bound of the Wilson score interval for count/total.

    Ref: http://www.evanmiller.org/how-not-to-sort-by-average-rating.html

    Args:
        count: Number of positive observations; must satisfy
            ``0 <= count <= total``.
        total: Total number of observations.
        z_score: Normal quantile for the desired confidence level
            (1.96 is the usual two-sided 95% value).

    Returns:
        The lower bound as a non-negative float; ``0.0`` when ``total``
        is 0 (no observations).

    Raises:
        AssertionError: if ``count`` is outside ``[0, total]``.
    """
    assert 0 <= count <= total, "count must satisfy 0 <= count <= total"
    if total == 0:
        # No data at all: the most pessimistic estimate is zero.
        return 0.0
    z_sq = z_score ** 2
    phat = 1.0 * count / total  # observed proportion, forced to float
    base = phat + z_sq / (2 * total)
    diff = z_score * ((phat * (1 - phat) + z_sq / (4 * total)) / total) ** 0.5
    norm = 1 + z_sq / total
    # When count == 0, base and diff are mathematically equal; floating-point
    # rounding can leave a tiny negative residue, so clamp at zero.
    return max(0.0, (base - diff) / norm)
if __name__ == '__main__':
    # Self-check: samples with the same mean but different sizes should be
    # ranked differently by the Wilson lower bound.
    a = [0, 0, 0, 1, 1, 0, 1, 1]  # mean 0.5, 8 observations
    b = [0, 1]                    # mean 0.5, 2 observations
    c = a * 2                     # mean 0.5, 16 observations
    d = [0, 1, 1, 1, 1, 1, 1, 1]  # higher mean, 8 observations

    avg_a = average(a)
    wlb_a = wilson_lower_bound(sum(a), len(a))

    avg_b = average(b)
    wlb_b = wilson_lower_bound(sum(b), len(b))

    avg_c = average(c)
    wlb_c = wilson_lower_bound(sum(c), len(c))

    avg_d = average(d)
    wlb_d = wilson_lower_bound(sum(d), len(d))

    assert avg_a == avg_b == avg_c, "averages should be equal"
    assert wlb_c > wlb_a > wlb_b, (
        "but wilson lower bound should be higher when there is more data (i.e. lower variance)"
    )
    assert wlb_d > wlb_c and avg_d > avg_c, "d should be highest"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment