Skip to content

Instantly share code, notes, and snippets.

@jiahao
Created November 25, 2015 16:03
Show Gist options
  • Select an option

  • Save jiahao/19cee463d4a8334495e6 to your computer and use it in GitHub Desktop.

Select an option

Save jiahao/19cee463d4a8334495e6 to your computer and use it in GitHub Desktop.
An implementation of Vitter's Method D for sampling a random subset from an arbitrary iterable in doi:10.1145/23002.23003
function randsubset(iterable, n::Int, T::Int=22)
state = start(iterable)
sample = eltype(iterable)[]
sizehint!(sample, n)
for j=1:n #Make first n record candidates for the sample
x, state = next(iterable, state)
done(iterable, state) && j<n && error("Requested sample of size $n but only $j items available")
push!(sample, x)
end
t = n #Number of records processed so far
#Use Algorithm X until t is large
#T=22
thresh = T*n
S = 0
while !done(iterable, state) && t ≤ thresh
#Find smallest S satisfying (4.1)
#i.e. ((t+1-n)/(t+1))^ceil(S+1) ≤ V
V = rand()
S = 0
quot = (t-n)/t
while quot > V
S += 1
t += 1
quot *= (t-n)/t
end
for i=1:S #Skip S records
_, state = next(iterable, state)
if done(iterable, state)
t -= S-i
@goto returnsample
end
end
if !done(iterable, state) #Make next record a candidate
M = rand(1:n) #Replace this one at random
x, state = next(iterable, state)
sample[M] = x
end
t += 1
end
#Process the rest of the records using the rejection technique
W = exp(-log(rand())/n)
term = t-n+1 #Always equal to t-n+1
while !done(iterable, state)
while true
U = rand()
X = t*(W-1)
S = floor(Integer, X)
#Test if U ≤ h(S) / cg(X) in the manner of (6.3)
lhs = exp(log(((U*(((t+1)/term)^2)) * (term + S))/(t+X))/n)
rhs = (((t + X)/(term + S)) * term)/t
if lhs ≤ rhs
W = rhs/lhs
break
end
#Test if Y ≤ f(S)/cg(X)
y = (((U*(t+1))/term)*(t + S + 1))/(t + X)
if n < S
denom = t
numer_lim = term + S
else
denom = t - n + S
numer_lim = t + 1
end
for numer = t+S:-1:numer_lim
y *= numer/denom
denom -= 1
end
W = exp(-log(rand())/n)
if exp(log(y)/n) ≤ (t + X)/t
break
end
end
for i=1:S #Skip S records
_, state = next(iterable, state)
if done(iterable, state)
t += i
@goto returnsample
end
end
if !done(iterable, state) #Make next record a candidate
M = rand(1:n) #Replace this one at random
x, state = next(iterable, state)
sample[M] = x
end
t += S + 1
term += S + 1
end
@label returnsample
sample, t
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment