Last active
October 8, 2017 11:56
-
-
Save kdubovikov/81b8460653ba8d75c611a59759631d31 to your computer and use it in GitHub Desktop.
Fast random subset sampling from large arrays
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit | |
import numpy as np | |
from collections import Counter | |
from tqdm import tqdm | |
def get_sample(arr, n_iter=None, sample_size=10,
               fast=True):
    """Get random sample from arr.

    In fast mode the sample is a contiguous window of ``arr`` selected by
    ``n_iter``; ``arr`` is shuffled in place each time the window reaches
    the end of the array. Note that ``arr`` is not shuffled before the
    first pass, so windows taken before the first shuffle reflect the
    initial order of ``arr``.

    Parameters
    ----------
    arr: np.array
        array to sample from. Shuffled in place in fast mode whenever
        the sampling window reaches its end.
    n_iter: int
        current iteration number; selects the window in fast mode.
        ``None`` is treated as 0. Ignored when ``fast`` is False.
    sample_size: int
        sample size
    fast: bool
        use sampling optimized for fast consecutive samples
        from the same array.

    Returns
    -------
    sample: np.array
        sample from arr of length sample_size. In fast mode this is a
        slice of ``arr`` (for a NumPy array, a view that changes when
        ``arr`` is shuffled again), so copy it if it must outlive the
        next call.

    Raises
    ------
    ValueError
        if sample_size is negative or larger than len(arr).
    """
    if not fast:
        # np.random.choice itself raises ValueError when sample_size
        # exceeds the population size with replace=False.
        return np.random.choice(arr, sample_size, replace=False)

    # Use the length of the array actually passed in rather than relying
    # on a module-level global.
    size = len(arr)
    if not 0 <= sample_size <= size:
        raise ValueError(
            "sample_size must be between 0 and len(arr) "
            "(got sample_size=%d, len(arr)=%d)" % (sample_size, size)
        )
    if size == 0:
        # Nothing to sample from; avoids a modulo-by-zero below.
        return arr[:0]
    if n_iter is None:
        n_iter = 0

    # find the index we last sampled from
    start_idx = (n_iter * sample_size) % size
    if start_idx + sample_size >= size:
        # shuffle array if we have reached the end and repeat again
        np.random.shuffle(arr)
        # After a shuffle every window is an equally random subset, so
        # pull the window back inside the array to keep the sample at
        # full length instead of returning a truncated slice.
        start_idx = size - sample_size
    return arr[start_idx:start_idx + sample_size]
def collect_samples(arr,
                    sample_size,
                    n_samples,
                    fast=False):
    """
    Collect several samples from arr.

    Calls ``get_sample`` once per sample, passing the sample's index as
    the iteration number, and copies each result into one row of the
    output matrix.

    Parameters
    ----------
    arr: np.array
        array to sample from.
    sample_size: int
        sample size.
    n_samples: int
        number of samples to take.
    fast: bool
        use sampling optimized for fast consecutive samples
        from the same array.

    Returns
    -------
    samples: np.ndarray
        sample matrix of shape (n_samples, sample_size) with the same
        dtype as arr.
    """
    # Exactly one row per sample (no trailing all-zero row), and keep the
    # source dtype so values are not silently narrowed or truncated.
    samples = np.empty((n_samples, sample_size), dtype=np.asarray(arr).dtype)
    for sample_n in range(n_samples):
        # Row assignment copies the data, so later in-place shuffles of
        # arr do not alter samples already collected.
        samples[sample_n] = get_sample(arr,
                                       n_iter=sample_n,
                                       sample_size=sample_size,
                                       fast=fast)
    return samples
# Benchmark setup: a 3,000,000-element int64 array holding 0..n-1.
n = 3000000
# np.arange builds the array directly instead of materializing a
# 3M-element Python list first; the resulting values and dtype are the same.
arr = np.arange(n, dtype=np.int64)
# Time 10 runs of collecting 10 samples of 1000 elements in fast mode.
# The setup string imports from __main__, so this must be executed as the
# main script (or in an interactive session), not imported as a module.
timeit.timeit(stmt="collect_samples(arr, 1000, 10, fast=True)", setup="from __main__ import collect_samples, arr", number=10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment