Created
November 25, 2015 16:03
-
-
Save jiahao/19cee463d4a8334495e6 to your computer and use it in GitHub Desktop.
An implementation of Vitter's Method D for sampling a random subset from an arbitrary iterable in doi:10.1145/23002.23003
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| function randsubset(iterable, n::Int, T::Int=22) | |
| state = start(iterable) | |
| sample = eltype(iterable)[] | |
| sizehint!(sample, n) | |
| for j=1:n #Make first n record candidates for the sample | |
| x, state = next(iterable, state) | |
| done(iterable, state) && j<n && error("Requested sample of size $n but only $j items available") | |
| push!(sample, x) | |
| end | |
| t = n #Number of records processed so far | |
| #Use Algorithm X until t is large | |
| #T=22 | |
| thresh = T*n | |
| S = 0 | |
| while !done(iterable, state) && t ≤ thresh | |
| #Find smallest S satisfying (4.1) | |
| #i.e. ((t+1-n)/(t+1))^ceil(S+1) ≤ V | |
| V = rand() | |
| S = 0 | |
| quot = (t-n)/t | |
| while quot > V | |
| S += 1 | |
| t += 1 | |
| quot *= (t-n)/t | |
| end | |
| for i=1:S #Skip S records | |
| _, state = next(iterable, state) | |
| if done(iterable, state) | |
| t -= S-i | |
| @goto returnsample | |
| end | |
| end | |
| if !done(iterable, state) #Make next record a candidate | |
| M = rand(1:n) #Replace this one at random | |
| x, state = next(iterable, state) | |
| sample[M] = x | |
| end | |
| t += 1 | |
| end | |
| #Process the rest of the records using the rejection technique | |
| W = exp(-log(rand())/n) | |
| term = t-n+1 #Always equal to t-n+1 | |
| while !done(iterable, state) | |
| while true | |
| U = rand() | |
| X = t*(W-1) | |
| S = floor(Integer, X) | |
| #Test if U ≤ h(S) / cg(X) in the manner of (6.3) | |
| lhs = exp(log(((U*(((t+1)/term)^2)) * (term + S))/(t+X))/n) | |
| rhs = (((t + X)/(term + S)) * term)/t | |
| if lhs ≤ rhs | |
| W = rhs/lhs | |
| break | |
| end | |
| #Test if Y ≤ f(S)/cg(X) | |
| y = (((U*(t+1))/term)*(t + S + 1))/(t + X) | |
| if n < S | |
| denom = t | |
| numer_lim = term + S | |
| else | |
| denom = t - n + S | |
| numer_lim = t + 1 | |
| end | |
| for numer = t+S:-1:numer_lim | |
| y *= numer/denom | |
| denom -= 1 | |
| end | |
| W = exp(-log(rand())/n) | |
| if exp(log(y)/n) ≤ (t + X)/t | |
| break | |
| end | |
| end | |
| for i=1:S #Skip S records | |
| _, state = next(iterable, state) | |
| if done(iterable, state) | |
| t += i | |
| @goto returnsample | |
| end | |
| end | |
| if !done(iterable, state) #Make next record a candidate | |
| M = rand(1:n) #Replace this one at random | |
| x, state = next(iterable, state) | |
| sample[M] = x | |
| end | |
| t += S + 1 | |
| term += S + 1 | |
| end | |
| @label returnsample | |
| sample, t | |
| end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment