Skip to content

Instantly share code, notes, and snippets.

@atremblay
Created January 13, 2016 13:40
Show Gist options
  • Save atremblay/b8c83609444bb17f8c02 to your computer and use it in GitHub Desktop.
Save atremblay/b8c83609444bb17f8c02 to your computer and use it in GitHub Desktop.
Reservoir sampling python
import random
import numpy as np
# http://erikerlandson.github.io/blog/2015/11/20/very-fast-reservoir-sampling/
def fast_reservoir_sampling(file_handle, N=1000000, callable=None):
sample = []
if callable is None:
callable = lambda x: x
j = N
for n, line in enumerate(file_handle):
if n < N:
sample.append(callable(line))
else:
if n < j:
continue
p = N / n
g = np.random.geometric(p)
j = j + g
replace = random.randint(0, N-1)
sample[replace] = callable(line)
return sample
import random
def reservoir_sampling(file_handle, N=1000000, callable=None):
sample = []
if callable is None:
callable = lambda x: x
for n, line in enumerate(file_handle):
if n < N:
sample.append(callable(line))
elif n >= N and random.random() < N/float(n+1):
replace = random.randint(0,len(sample)-1)
sample[replace] = callable(line)
return sample
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment