Last active
August 29, 2015 14:01
-
-
Save binarybana/60edb3c3a7f925f7bc2d to your computer and use it in GitHub Desktop.
Benchmarking Sampling Code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using StatsBase | |
randsubseq2(A::AbstractVector, m::Integer) = randsubseq2!(Array(eltype(A),m), A) | |
function randsubseq2!(S::AbstractVector, A::AbstractArray) | |
m = length(S) | |
m == 0 && return S # needed for correctness: code below requires m > 0 | |
n = length(A) | |
m == n && return copy!(S, A) | |
0 <= m <= n || throw(DomainError()) | |
i = 0; j = 0 | |
while true | |
if (n - i)*rand() <= m - j | |
S[j += 1] = A[i += 1] | |
j == m && return S | |
else | |
i += 1 | |
end | |
end | |
end | |
vitter(A::AbstractArray, k::Integer) = vitter!(A,Array(eltype(A),k)) | |
function vitter!(A::AbstractArray, x::AbstractArray) | |
N = length(A) | |
n = length(x) | |
V_prime = exp(log(rand())/n) | |
quant1 = N-n+1 | |
quant2 = quant1/N | |
alpha_inverse = 15 | |
threshold = alpha_inverse * n | |
ai=1 | |
local S | |
while n>1 && threshold<N | |
while true | |
local X | |
while true | |
X = N * (1.0 - V_prime) | |
S = itrunc(X) | |
S < quant1 && break | |
V_prime = exp(log(rand())/n) | |
end | |
y = rand()/quant2 | |
LHS = exp(log(y)/(n-1)) | |
RHS = ((quant1 - S)/quant1) * (N/(N-X)) | |
if LHS <= RHS # Accept S | |
V_prime = LHS/RHS | |
break | |
end | |
if n-1>S | |
bottom = N-n | |
limit = N-S | |
else | |
bottom = N-S-1 | |
limit = quant1 | |
end | |
for top=N-1:-1:limit | |
y *= top/bottom | |
bottom = bottom - 1 | |
end | |
if exp(log(y)/(n-1)) <= N/(N-X) | |
# Accept S | |
V_prime = exp(log(rand())/(n-1)) | |
break | |
end | |
V_prime = exp(log(rand())/n) | |
end | |
# Select (S+1)st record | |
N -= S+1 | |
n -= 1 | |
x[end-n] = A[ai+=S] | |
quant1 -= S | |
quant2 = quant1/N | |
threshold -= alpha_inverse | |
end | |
if n>1 # Algo A | |
top = N - n | |
for n = n-1:-1:2 # FIXME: this n-1 was originally n | |
V = rand() | |
S = 1 | |
quot = top/N | |
while quot > V | |
S = S+1 | |
top = top-1 | |
N = N-1 | |
quot *= top/N | |
end | |
x[end-n] = A[ai+=S] | |
N-=1 | |
end | |
end | |
#One left: | |
S = itrunc(N*V_prime) | |
x[end] = A[ai+=S] | |
end | |
function rand_first_index(n, k) | |
r = rand() | |
p = k/n | |
i = 1 | |
while p < r | |
i += 1 | |
p += (1-p)k/(n-(i-1)) | |
end | |
return i | |
end | |
ordered_sample_norep{T}(xs::AbstractArray{T}, k::Integer) = ordered_sample_norep!(xs, Array(T,k)) | |
function ordered_sample_norep!(xs::AbstractArray, target::AbstractArray) | |
n = length(xs) | |
k = length(target) | |
i = 0 | |
for j in 1:k | |
step = rand_first_index(n, k) | |
n -= step | |
i += step | |
target[j] = xs[i] | |
k -= 1 | |
end | |
return target | |
end | |
function floyd1{T}(a::AbstractArray{T}, m::Integer) | |
n = length(a) | |
0 <= m <= n || throw(DomainError()) | |
# Floyd's Õ(m) algorithm from J. Bentley and B. Floyd, "Programming pearls: A sample of brilliance," | |
# Commun. ACM Magazine 30, no. 9, 754--757 (1987). | |
# Adapted from https://github.com/JuliaLang/julia/issues/6714#issuecomment-42057923 | |
#s = IntSet() | |
s = Set{Int}() | |
x = Array(T, m) | |
for (xind,j) = enumerate(n-m+1:n) | |
i = rand(1:j) | |
ind = i in s ? j : i | |
x[xind] = a[ind] | |
push!(s, ind) | |
end | |
x | |
end | |
function floyd1p{T}(a::AbstractArray{T}, m::Integer) | |
n = length(a) | |
0 <= m <= n || throw(DomainError()) | |
# Floyd's Õ(m) algorithm from J. Bentley and B. Floyd, "Programming pearls: A sample of brilliance," | |
# Commun. ACM Magazine 30, no. 9, 754--757 (1987). | |
# Adapted from https://github.com/JuliaLang/julia/issues/6714#issuecomment-42057923 | |
#s = IntSet() | |
s = Set{Int}() | |
x = T[] | |
sizehint(x, m) | |
for j = n-m+1:n | |
i = rand(1:j) | |
if i in s | |
#use j | |
push!(x,a[j]) | |
push!(s,j) | |
else | |
unshift!(x,a[j]) | |
push!(s,i) | |
end | |
end | |
x | |
end | |
# warmup | |
sample(1:1000, 100) | |
floyd1(1:1000, 100) | |
floyd1p(1:1000, 100) | |
ordered_sample_norep(1:1000, 100) | |
vitter(1:1000, 100) | |
N=10_000 | |
krange = 5:5:200 | |
navg=200 | |
using PyPlot | |
close("all") | |
labels=["sample ordered", | |
"sample unordered", | |
"randsubseq2", | |
"vitter", | |
"floyd ordered", | |
"floyd unordered"] | |
#"one-more-min ordered"] | |
times = {Float64[] for x=1:length(labels)} | |
for i=krange | |
gc_disable() | |
meas = [median([@elapsed sample(1:N, i, replace=false, ordered=true) for x=1:navg]), | |
median([@elapsed sample(1:N, i, replace=false) for x=1:navg]), | |
median([@elapsed randsubseq2(1:N, i) for x=1:navg]), | |
median([@elapsed vitter(1:N, i) for x=1:navg]), | |
median([@elapsed floyd1(1:N, i) for x=1:navg]), | |
median([@elapsed floyd1p(1:N, i) for x=1:navg])] | |
#median([@elapsed ordered_sample_norep(1:N, i) for x=1:10])] | |
gc_enable() | |
gc() | |
[push!(x,t) for (x,t) in zip(times,meas)] | |
end | |
[plot([krange],t,label=l,linewidth=5, alpha=0.7) for (t,l) in zip(times,labels)] | |
legend(loc="best") | |
title("From $(1:N), sample ...") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment