Skip to content

Instantly share code, notes, and snippets.

@kdubovikov
Last active October 8, 2017 11:56
Show Gist options
  • Save kdubovikov/81b8460653ba8d75c611a59759631d31 to your computer and use it in GitHub Desktop.
Save kdubovikov/81b8460653ba8d75c611a59759631d31 to your computer and use it in GitHub Desktop.
Fast random subset sampling from large arrays
import timeit
import numpy as np
from collections import Counter
from tqdm import tqdm
def get_sample(arr, n_iter=None, sample_size=10,
               fast=True):
    """Get a random sample of ``sample_size`` elements from ``arr``.

    In fast mode the array is treated as a pre-shuffled pool: iteration
    ``n_iter`` returns the ``n_iter``-th consecutive window of
    ``sample_size`` elements, and the pool is reshuffled in place whenever
    a window reaches the end of the array. This avoids the cost of
    ``np.random.choice`` on every call.

    Note that the array is only shuffled when a window reaches its end, so
    windows taken before the first shuffle follow the input order. Shuffle
    ``arr`` beforehand if its initial order is not already random.

    Parameters
    ----------
    arr: np.array
        array to sample from. Shuffled in place in fast mode.
    n_iter: int
        current iteration number; selects the window in fast mode.
        ``None`` is treated as 0. Ignored when ``fast`` is False.
    sample_size: int
        sample size
    fast: bool
        use sampling optimized for fast consecutive samples
        from the same array.

    Returns
    -------
    sample: np.array
        sample from arr of length sample_size. In fast mode this is a
        slice of ``arr``; for a NumPy array that slice shares memory with
        ``arr``, so copy it if it must survive a later reshuffle.

    Raises
    ------
    ValueError
        in fast mode, if ``sample_size`` is negative or larger than
        ``len(arr)``.
    """
    if not fast:
        return np.random.choice(arr, sample_size, replace=False)

    # Use the length of the array actually passed in rather than a
    # module-level global, so the function works for any array.
    n_items = len(arr)
    if not 0 <= sample_size <= n_items:
        raise ValueError(
            "sample_size must be between 0 and len(arr) "
            "(got sample_size=%r, len(arr)=%r)" % (sample_size, n_items)
        )
    if n_items == 0:
        # Nothing to sample from (and the modulo below would divide by 0).
        return arr[:0]

    if n_iter is None:
        n_iter = 0

    # find the index we last sampled from
    start_idx = (n_iter * sample_size) % n_items
    if start_idx + sample_size >= n_items:
        # shuffle array if we have reached the end and repeat again
        np.random.shuffle(arr)
        # Pin the window to the tail so it is never shorter than
        # sample_size; after the shuffle any window is a valid sample.
        start_idx = n_items - sample_size
    return arr[start_idx:start_idx + sample_size]
def collect_samples(arr,
                    sample_size,
                    n_samples,
                    fast=False):
    """
    Collect several samples from arr.

    Calls ``get_sample`` once per sample, passing the sample index as the
    iteration number, and stacks the results row by row.

    Parameters
    ----------
    arr: np.array
        array to sample from.
    sample_size: int
        sample size.
    n_samples: int
        number of samples to take.
    fast: bool
        use sampling optimized for fast consecutive samples
        from the same array.

    Returns
    -------
    samples: np.ndarray
        sample matrix of shape (n_samples, sample_size), with the same
        dtype as ``arr`` so sampled values are stored without truncation.
    """
    # Exactly n_samples rows (no spare zero row), and every row is
    # overwritten below, so the buffer does not need zero-filling.
    samples = np.empty((n_samples, sample_size), dtype=np.asarray(arr).dtype)
    for sample_n in range(n_samples):
        samples[sample_n] = get_sample(arr,
                                       n_iter=sample_n,
                                       sample_size=sample_size,
                                       fast=fast)
    return samples
# Benchmark: time 10 runs of drawing 10 fast samples of 1000 elements each
# from a 3,000,000-element int64 array.
n = 3000000
arr = np.arange(n, dtype=np.int64)
timeit.timeit(
    stmt="collect_samples(arr, 1000, 10, fast=True)",
    setup="from __main__ import collect_samples, arr",
    number=10,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment