Hypergeometric random variate generators implemented in Julia
# Hypergeometric random variate generators
#
# (c) 2013 Jiahao Chen <[email protected]>
#
# This file implements several random variate generators as described in the
# reference paper.
#
# Algorithm  Function
# HBU        randhg_hbu
# HIN        randhg_hin
# HALIAS     randhg_halias
# H2PE       randhg_h2pe
#
# Reference:
# "Computer generation of hypergeometric random variates",
# V. Kachitvichyanukul and B. Schmeiser, J. Stat. Comput. Simul. 22 (2),
# 1985, 127-145, doi:10.1080/00949658508810839
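#
# Example usage (the parameter values below are illustrative, not taken from
# the reference; each call returns one integer variate from the distribution
# with n1 successes, n2 failures, and sample size k):
#   x = randhg_hin(50, 450, 100)          #inverse transformation
#   y = randhg_h2pe(50, 450, 100)         #H2PE with the one-sided approximations
#   z = randhg_h2pe(50, 450, 100, false)  #H2PE with exact evaluation of f only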
using Distributions
#################
# Algorithm HBU #
#################
#Naive generation using sampling without replacement
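#Each draw removes one ball from an urn that currently holds T1 white and
#T-T1 black balls; a draw is white with probability T1/T, and x counts the
#white balls seen so far.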
function randhg_hbu_core(n1::Int, n2::Int, k::Int)
    n = n1 + n2
    T, T1 = n, n1
    x = 0
    for J = 1:min(k, n-k) #one iteration per ball drawn
        u = rand()
        if u <= (T1/T)
            x += 1
            if x == n1 return x end
            T1 -= 1
        end
        T -= 1
    end
    x
end
#For large k, swap the arguments around using symmetry of the hypergeometric
#distribution
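#If X ~ Hypergeometric(n1, n2, k) then n1 - X ~ Hypergeometric(n1, n2, n1+n2-k),
#so a draw for k > (n1+n2)/2 can be recovered from a draw with the smaller
#sample size n1+n2-k.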
function randhg_hbu(n1::Int, n2::Int, k::Int)
    2k > (n1 + n2) ? n1 - randhg_hbu_core(n1, n2, n1+n2-k) : randhg_hbu_core(n1, n2, k)
end
#################
# Algorithm HIN #
#################
#Inverse transformation
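#Starting from the smallest point of the support, accumulate the pmf until the
#uniform variate u is exhausted, updating the probability with the ratio
#  p(x+1)/p(x) = (n1-x)(k-x) / ((x+1)(n2-k+1+x)),
#which follows from p(x) = C(n1,x) C(n2,k-x) / C(n,k).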
function randhg_hin_core(n1::Int, n2::Int, k::Int)
    n = n1+n2
    if k < n2
        #p = factorial(n2)*factorial(n-k)/(factorial(n)*factorial(n2-k))
        p = exp(lfact(n2)+lfact(n-k)-lfact(n)-lfact(n2-k))
        x = 0
    else
        #p = factorial(n1)*factorial(k)/(factorial(n)*factorial(k-n2))
        p = exp(lfact(n1)+lfact(k)-lfact(n)-lfact(k-n2))
        x = k-n2
    end
    u = rand()
    while u > p #Typo in paper
        u -= p
        p *= (n1-x)*(k-x)/((x+1)*(n2-k+1+x))
        x += 1
        if x == n error("Inverse transformation did not converge") end
    end
    x
end
function randhg_hin(n1::Int, n2::Int, k::Int)
    if n1 <= n2
        if 2k <= (n1 + n2)
            return randhg_hin_core(n1, n2, k)
        else
            return n1-randhg_hin_core(n1, n2, n1+n2-k)
        end
    else
        if 2k <= (n1 + n2)
            return k-randhg_hin_core(n2, n1, k)
        else
            return k-n2+randhg_hin_core(n2, n1, n1+n2-k)
        end
    end
end
####################
# Algorithm HALIAS #
####################
# The alias method
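# Walker's method precomputes, for each of the m mass points, an alias A[i] and
# a cutoff F[i]; a variate is then generated from a single uniform by picking a
# cell i and returning either i or its alias, depending on the cutoff.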
#XXX This doesn't work
function randhg_halias(n1::Int, n2::Int, k::Int)
    #Step 1
    IL = max(0, k-n2)
    IU = min(k, n2)
    IL, IU = min(IL, IU), max(IL, IU)
    m = IU-IL+1
    #Step 2. Calculate the hypergeometric probabilities for all mass points I_L...I_U
    probabilities = pmf(HyperGeometric(n1, n2, k), IL:IU) #This really should be lowercase G
    #Step 3. Set up the alias table A and the table of its cutoff values F
    A, F, _ = arbran(probabilities)
    #To generate one hypergeometric random variate:
    #Step 4.
    u = rand() * m
    i = int(ceil(u))
    #Step 5.
    x = (u > F[i]) ? A[i] : i
    #Step 6.
    x+IL-1
end
# Compute the aliases and cutoff values for the desired probability distribution.
# Input:
#   e: the desired probability values (n = length(e) mass points)
# Output:
#   a: alias values
#   f: cutoff values
#   p: reconstructed probabilities
# Citation:
# "An Efficient Method for Generating Discrete Random Variables with General
# Distributions"
# A. J. Walker, ACM Trans. Math. Software 3 (3), 1977, 253-256
# doi:10.1145/355744.355749
function arbran{X<:Real}(e::Vector{X})
    n = length(e)
    error = .1e-5
    #Initialize output
    a = [1:n]
    f = zeros(n)
    b = e - 1/n
    #Find largest positive and negative differences and their positions in b
    c, k = minimum(b), indmin(b)
    d, l = maximum(b), indmax(b)
    #Test whether the sum of differences in b has become significant
    if sum(abs(b)) >= error
        #Assign alias and cutoff values
        a[k], f[k], b[k], b[l] = l, 1+c*n, 0, c+d
    end
    #Computation of alias and cutoff values complete
    #Now reconstruct the probabilities
    p = f/n
    for i=1:n, j=1:n
        if (a[j]==i) p[i] += (1-f[j])/n end
    end
    a, f, p
end
##################
# Algorithm H2PE #
##################
# H2PE (Hypergeometric 2-Point Exponential) generator
#
# The new algorithm developed in Kachitvichyanukul and Schmeiser (1985).
#
# This takes an optional argument do_approx which controls whether or not to
# use the one-sided approximations when evaluating the rescaled hypergeometric
# probability mass function f. The default, do_approx=true, is a faithful
# implementation of the H2PE algorithm; do_approx=false always uses the exact
# evaluation.
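#
# The rescaled pmf f (normalized so that f(M)=1 at the mode M) is enclosed by a
# piecewise envelope: a rectangle of width 2D around the mode (region 1) and
# left and right exponential tails (regions 2 and 3). Candidates are sampled
# from the envelope and accepted or rejected against f.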
function randhg_h2pe(n1✽::Int, n2✽::Int, k✽::Int, do_approx::Bool)
    USE_HIN_THRESH = 10
    #Step 0. Set up constants as functions of n1✽, n2✽, k✽
    #0.0 Determine parameter set for more efficient generation
    n = n1✽ + n2✽
    n1, n2 = n1✽>n2✽ ? (n2✽, n1✽) : (n1✽, n2✽)
    k = 2k✽<=n ? k✽ : n-k✽
    #0.1 Set up constants
    M = floor((k+1)*(n1+1)/(n+2))
    if (M-max(0,k-n2)) < USE_HIN_THRESH return randhg_hin(n1✽, n2✽, k✽) end
    A = lfact(M) + lfact(n1-M) + lfact(k-M) + lfact(n2-k+M)
    D = 1.5*sqrt((n-k)*k*n1*n2/((n-1)*n*n)) + 0.5
    xL, xR = M-D+0.5, M+D+0.5
    kL = exp(A-lfact(xL)-lfact(n1-xL)-lfact(k-xL)-lfact(n2-k+xL))
    #The paper has a sign error in the last term of kR
    kR = exp(A-lfact(xR-1)-lfact(n1-xR+1)-lfact(k-xR+1)-lfact(n2-k+xR-1))
    λL = -log(xL*(n2-k+xL)) + log((n1-xL+1)*(k-xL+1))
    λR = -log((n1-xR+1)*(k-xR+1)) + log(xR*(n2-k+xR))
    p1 = 2D
    p2 = p1 + kL/λL #Typo in paper
    p3 = p2 + kR/λR
    y = 0 #Need to define outside the loop to obey Julia scoping rules
    #Step 1. Begin logic to generate one hypergeometric variate y.
    #        Generate u~U(0,p3) for selecting the region,
    #        v~U(0,1) for the accept/reject decision.
    #        If region 1 is selected, generate a uniform variate between xL and xR.
    while true #Keep sampling until we find an acceptable variate
        u, v = rand()*p3, rand()
        if u > p1
            #Step 2. Region 2, the left exponential tail
            if u <= p2
                y = int(floor(xL + log(v)/λL))
                if y < max(0, k-n2) continue end
                v *= (u-p1)*λL
            else
                #Step 3. Region 3, the right exponential tail
                y = int(floor(xR - log(v)/λR))
                if y > min(n1, k) continue end
                v *= (u-p2)*λR
            end
        else
            y = int(floor(xL + u))
        end
        #Step 4. Acceptance/rejection comparison
        #4.0 Test for the appropriate method of evaluating f(y)
        if !(do_approx && (M >= 100 && y > 50)) #Use exact method
            #4.1 Evaluate f(y) recursively via
            #\[
            #  f(y+1) = f(y) \frac{(n1-y)(k-y)}{(y+1)(n2-k+y+1)}
            #\]
            #starting the recursion from the mode, where f(M) = 1
            f = 1.0
            if M < y
                for i = M+1:y
                    f *= (n1-i+1)*(k-i+1)/(i*(n2-k+i))
                end
            elseif M > y
                for i = y+1:M
                    f *= i*(n2-k+i)/((n1-i+1)*(k-i+1))
                end
            end
            if v > f continue else break end
        else #Use approximate method
            #4.2 Squeezing. Check the value of ln(v) against upper and lower
            #bounds of ln(f).
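            #Cheap polynomial bounds on ln(f) accept or reject most candidates
            #without evaluating the four lfact terms; only candidates that fall
            #between the two bounds reach the exact test in step 4.3.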
            x, y1, yM = y, y+1, y-M
            yn, yk, nk = n1-y+1, k-y+1, n2-k+y1
            R, S, T = -yM/y1, yM/yn, yM/yk
            E, G = -yM/nk, yn * yk / (y1 * nk) - 1
            DG = G >= 0 ? 1 : 1 + G
            GU = G*(1 + G*(-0.5+G/3.0))
            GL = GU - G^4/(4DG)
            XM, Xn, Xk = M+0.5, n1-M+0.5, k-M+0.5
            nM = n2-k+XM
            Ub = XM*R*(1+R*(-0.5+R/3.0)) +
                 Xn*S*(1+S*(-0.5+S/3.0)) +
                 Xk*T*(1+T*(-0.5+T/3.0)) +
                 nM*E*(1+E*(-0.5+E/3.0)) + y*GU - M*GL + 0.0034
            Av = log(v)
            if Av > Ub continue end
            DR = XM * R^4
            if R < 0 DR /= 1+R end
            DS = Xn * S^4
            if S < 0 DS /= 1+S end
            DT = Xk * T^4
            if T < 0 DT /= 1+T end
            DE = nM * E^4
            if E < 0 DE /= 1+E end
            if Av < Ub-0.25*(DR + DS + DT + DE) + (y+M)*(GL-GU)-0.0078 break end
            #4.3 Final acceptance/rejection test
            if Av <= A-lfact(y)-lfact(n1-y)-lfact(k-y)-lfact(n2-k+y) break end
        end #one-sided approximations
    end #keep sampling until break condition
    #Step 5. Return the appropriate random variate
    if 2k✽ < n #5.1
        return n1✽ <= n2✽ ? y : k✽-y
    else #5.2
        return n1✽ <= n2✽ ? n1✽-y : k✽-n2✽+y
    end
end
#The default is to do the approximation
randhg_h2pe(n1::Int, n2::Int, k::Int) = randhg_h2pe(n1, n2, k, true)
#######################################################
# A very simple test driver to compare the algorithms #
#######################################################
NN = 100000 #Number of trials
for (n1, n2, k) in [(20, 20, 20), (100, 100, 20), (1000, 1000, 20),
        (10000, 100, 1000), (100, 100000, 10000), (1000, 10000, 1000),
        (10000, 1000, 1000), (1000, 100000, 10000), (10000, 100, 10000)] #The parameters used in Table 1
    @printf("\nGenerating %d random hypergeometric variates with n1=%d, n2=%d, k=%d\n",
        NN, n1, n2, k)
    @printf("Algorithm mean \tstd. dev.\t runtime\n")
    x1 = @elapsed z1 = [randhg_hbu(n1, n2, k) for t=1:NN]
    @printf("HBU\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z1), std(z1), x1)
    x2 = @elapsed z2 = [randhg_hin(n1, n2, k) for t=1:NN]
    @printf("HIN\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z2), std(z2), x2)
    #x3 = @elapsed z3 = [randhg_halias(n1, n2, k) for t=1:NN]
    #@printf("HALIAS\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z3), std(z3), x3)
    x3 = @elapsed z3 = [randhg_h2pe(n1, n2, k) for t=1:NN]
    @printf("H2PE\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z3), std(z3), x3)
    #Exact variant of H2PE
    x4 = @elapsed z4 = [randhg_h2pe(n1, n2, k, false) for t=1:NN]
    @printf("H2PEx\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z4), std(z4), x4)
    #The rng in Distributions
    Z = HyperGeometric(n1, n2, k)
    x5 = @elapsed z5 = [rand(Z) for t=1:NN]
    @printf("Julia\t%10.5f\t%10.5f\t (%10.5f seconds)\n", mean(z5), std(z5), x5)
end