cheekybastard · December 15, 2015 08:29
diff --git a/coment_ranking.py b/coment_ranking.py
 # Rewritten code from /r2/r2/lib/db/_sorts.pyx
 # http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
 # http://blog.reddit.com/2009/10/reddits-new-comment-sorting-system.html
 # http://possiblywrong.wordpress.com/2011/06/05/reddits-comment-ranking-algorithm/
 """
 Randall Munroe of xkcd is the idea guy behind Reddit's best ranking. He has written a great blog post about it:

 You should read his blog post as it explains the algorithm in a very understandable way. The outline of his blog post is following:

    Using the hot algorithm for comments isn't that smart since it seems to be heavily biased toward comments posted early
    In a comment system you want to rank the best comments highest regardless of their submission time
    A solution for this has been found in 1927 by Edwin B. Wilson and it's called "Wilson score interval", Wilson's score interval can be made into "the confidence sort"
    The confidence sort treats the vote count as a statistical sampling of a hypothetical full vote by everyone - like in an opinion poll
    How Not To Sort By Average Rating outlines the confidence ranking in higher detail, definitely recommended reading!
    http://www.evanmiller.org/how-not-to-sort-by-average-rating.html

 http://blog.reddit.com/2009/10/reddits-new-comment-sorting-system.html
    If a comment has one upvote and zero downvotes, it has a 100% upvote rate, but since there's not very much data, the system will keep it near the bottom. But if it has 10 upvotes and only 1 downvote, the system might have enough confidence to place it above something with 40 upvotes and 20 downvotes -- figuring that by the time it's also gotten 40 upvotes, it's almost certain it will have fewer than 20 downvotes. And the best part is that if it's wrong (which it is 15% of the time), it will quickly get more data, since the comment with less data is near the top.

 """

 from math import sqrt

 def _confidence(ups, downs):
    n = ups + downs

    if n == 0:
        return 0

    z = 1.0 #1.0 = 85%, 1.6 = 95%
    phat = float(ups) / n
    return sqrt(phat+z*z/(2*n)-z*((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)

 def confidence(ups, downs):
    if ups + downs == 0:
        return 0
    else:
        return _confidence(ups, downs)
diff --git a/comments_ranking2.py b/comments_ranking2.py
 # From /r2/r2/lib/db/_sorts.pyx
 # http://possiblywrong.wordpress.com/2011/06/05/reddits-comment-ranking-algorithm/

 from math import sqrt
 
 def confidence_fixed(ups, downs):
    if ups == 0:
        return -downs
    n = ups + downs
    z = 1.64485 #1.0 = 85%, 1.6 = 95%
    phat = float(ups) / n
    return (phat+z*z/(2*n)-z*sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)
diff --git a/hot_ranking.py b/hot_ranking.py
 # Rewritten code from /r2/r2/lib/db/_sorts.pyx
 # http://www.seomoz.org/blog/reddit-stumbleupon-delicious-and-hacker-news-algorithms-exposed
 """
 Conclusion of Reddit's story ranking

    Submission time is a very important parameter, generally newer stories will rank higher than older
    The first 10 upvotes count as high as the next 100. E.g. a story that has 10 upvotes and a story that has 50 upvotes will have a similar ranking
    Controversial stories that get similar amounts of upvotes and downvotes will get a low ranking compared to stories that mainly get upvotes

 """

 from datetime import datetime, timedelta
 from math import log

 epoch = datetime(1970, 1, 1)

 def epoch_seconds(date):
    """Returns the number of seconds from the epoch to date."""
    td = date - epoch
    return td.days * 86400 + td.seconds + (float(td.microseconds) / 1000000)

 def score(ups, downs):
    return ups - downs

 def hot(ups, downs, date):
    """The hot formula. Should match the equivalent function in postgres."""
    s = score(ups, downs)
    order = log(max(abs(s), 1), 10)
    sign = 1 if s > 0 else -1 if s < 0 else 0
    seconds = epoch_seconds(date) - 1134028003
    return round(order + sign * seconds / 45000, 7)
diff --git a/Reddits_Algorithm.py b/Reddits_Algorithm.py
 # Rewritten code from /r2/r2/lib/db/_sorts.pyx
 # http://amix.dk/blog/post/19588 # How Reddit ranking algorithms work
 # http://amix.dk/blog/post/19574 # How Hacker News ranking algorithm works
 # http://www.evanmiller.org/how-not-to-sort-by-average-rating.html

 from math import sqrt

 def _confidence(ups, downs):
    n = ups + downs

    if n == 0:
        return 0

    z = 1.0 #1.0 = 85%, 1.6 = 95%
    phat = float(ups) / n
    return sqrt(phat+z*z/(2*n)-z*((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n)

 def confidence(ups, downs):
    if ups + downs == 0:
        return 0
    else:
        return _confidence(ups, downs)
	# Rewritten code from /r2/r2/lib/db/_sorts.pyx
	# http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
	# http://blog.reddit.com/2009/10/reddits-new-comment-sorting-system.html
	# http://possiblywrong.wordpress.com/2011/06/05/reddits-comment-ranking-algorithm/
	"""
	Randall Munroe of xkcd is the idea guy behind Reddit's best ranking. He has written a great blog post about it:

	You should read his blog post as it explains the algorithm in a very understandable way. The outline of his blog post is following:

	Using the hot algorithm for comments isn't that smart since it seems to be heavily biased toward comments posted early
	In a comment system you want to rank the best comments highest regardless of their submission time
	A solution for this has been found in 1927 by Edwin B. Wilson and it's called "Wilson score interval", Wilson's score interval can be made into "the confidence sort"
	The confidence sort treats the vote count as a statistical sampling of a hypothetical full vote by everyone - like in an opinion poll
	How Not To Sort By Average Rating outlines the confidence ranking in higher detail, definitely recommended reading!
	http://www.evanmiller.org/how-not-to-sort-by-average-rating.html

	http://blog.reddit.com/2009/10/reddits-new-comment-sorting-system.html
	If a comment has one upvote and zero downvotes, it has a 100% upvote rate, but since there's not very much data, the system will keep it near the bottom. But if it has 10 upvotes and only 1 downvote, the system might have enough confidence to place it above something with 40 upvotes and 20 downvotes -- figuring that by the time it's also gotten 40 upvotes, it's almost certain it will have fewer than 20 downvotes. And the best part is that if it's wrong (which it is 15% of the time), it will quickly get more data, since the comment with less data is near the top.

	"""

	from math import sqrt

	def _confidence(ups, downs):
	n = ups + downs

	if n == 0:
	return 0

	z = 1.0 #1.0 = 85%, 1.6 = 95%
	phat = float(ups) / n
	return sqrt(phat+zz/(2n)-z((phat(1-phat)+zz/(4n))/n))/(1+z*z/n)

	def confidence(ups, downs):
	if ups + downs == 0:
	return 0
	else:
	return _confidence(ups, downs)
	# From /r2/r2/lib/db/_sorts.pyx
	# http://possiblywrong.wordpress.com/2011/06/05/reddits-comment-ranking-algorithm/

	from math import sqrt

	def confidence_fixed(ups, downs):
	if ups == 0:
	return -downs
	n = ups + downs
	z = 1.64485 #1.0 = 85%, 1.6 = 95%
	phat = float(ups) / n
	return (phat+zz/(2n)-zsqrt((phat(1-phat)+zz/(4n))/n))/(1+z*z/n)
	# Rewritten code from /r2/r2/lib/db/_sorts.pyx
	# http://www.seomoz.org/blog/reddit-stumbleupon-delicious-and-hacker-news-algorithms-exposed
	"""
	Conclusion of Reddit's story ranking

	Submission time is a very important parameter, generally newer stories will rank higher than older
	The first 10 upvotes count as high as the next 100. E.g. a story that has 10 upvotes and a story that has 50 upvotes will have a similar ranking
	Controversial stories that get similar amounts of upvotes and downvotes will get a low ranking compared to stories that mainly get upvotes

	"""

	from datetime import datetime, timedelta
	from math import log

	epoch = datetime(1970, 1, 1)

	def epoch_seconds(date):
	"""Returns the number of seconds from the epoch to date."""
	td = date - epoch
	return td.days * 86400 + td.seconds + (float(td.microseconds) / 1000000)

	def score(ups, downs):
	return ups - downs

	def hot(ups, downs, date):
	"""The hot formula. Should match the equivalent function in postgres."""
	s = score(ups, downs)
	order = log(max(abs(s), 1), 10)
	sign = 1 if s > 0 else -1 if s < 0 else 0
	seconds = epoch_seconds(date) - 1134028003
	return round(order + sign * seconds / 45000, 7)
	# Rewritten code from /r2/r2/lib/db/_sorts.pyx
	# http://amix.dk/blog/post/19588 # How Reddit ranking algorithms work
	# http://amix.dk/blog/post/19574 # How Hacker News ranking algorithm works
	# http://www.evanmiller.org/how-not-to-sort-by-average-rating.html

	from math import sqrt

	def _confidence(ups, downs):
	n = ups + downs

	if n == 0:
	return 0

	z = 1.0 #1.0 = 85%, 1.6 = 95%
	phat = float(ups) / n
	return sqrt(phat+zz/(2n)-z((phat(1-phat)+zz/(4n))/n))/(1+z*z/n)

	def confidence(ups, downs):
	if ups + downs == 0:
	return 0
	else:
	return _confidence(ups, downs)