Skip to content

Instantly share code, notes, and snippets.

@Pagliacii
Created September 15, 2023 01:44
Show Gist options
  • Save Pagliacii/957fd72ac34bdfc1fa66c3622244c52b to your computer and use it in GitHub Desktop.
Save Pagliacii/957fd72ac34bdfc1fa66c3622244c52b to your computer and use it in GitHub Desktop.
Reservoir sampling for multiple sample sets
#!/usr/bin/env python3
# coding=utf-8
import random
import time
def get_elems(total, num_types=3):
    """Lazily generate ``total`` synthetic elements with a random type tag.

    Args:
        total: Number of elements to produce. ``range(total)`` drives the
            loop, so a value of 0 (or less) yields nothing.
        num_types: Number of distinct type tags. Each element's ``"type"``
            is drawn uniformly from ``0 .. num_types - 1``. Defaults to 3,
            which reproduces the original hard-coded ``randint(0, 2)``.
            Must be at least 1, otherwise ``random.randint`` raises
            ``ValueError`` when the first element is requested.

    Yields:
        dict: ``{"type": <int tag>, "num": <0-based position>, "data": {}}``.
        A fresh, empty ``"data"`` dict is created for every element.
    """
    for i in range(total):
        yield {
            "type": random.randint(0, num_types - 1),
            "num": i,
            "data": {},
        }
elems_size = 100_000
sample_size = 10
num_samples = 3  # Number of sample sets to generate


def reservoir_sample_by_type(elems, sample_size, num_samples):
    """Draw one uniform reservoir sample per element type in a single pass.

    Every element is routed to the reservoir selected by ``elem["type"]``
    and each reservoir is maintained independently (Algorithm R).

    Args:
        elems: Iterable of mappings that have an integer ``"type"`` key in
            ``range(num_samples)``. It is consumed exactly once, so a
            generator is fine.
        sample_size: Maximum number of elements kept per sample set.
        num_samples: Number of sample sets (one per type value).

    Returns:
        list[list]: ``num_samples`` lists. List ``t`` holds
        ``min(sample_size, count of type-t elements)`` elements of type
        ``t``; there is no ``None`` padding.

    Raises:
        ValueError: If an element's type is outside ``range(num_samples)``.
    """
    samples = [[] for _ in range(num_samples)]
    seen = [0] * num_samples  # elements of each type encountered so far
    for elem in elems:
        kind = elem["type"]
        if not 0 <= kind < num_samples:
            raise ValueError(
                f"element type {kind!r} is outside range({num_samples})"
            )
        reservoir = samples[kind]
        count = seen[kind]
        seen[kind] = count + 1
        if count < sample_size:
            # Fill phase: the first `sample_size` elements of a type are
            # stored unconditionally. Drawing a random slot here would let
            # early elements overwrite each other, leaving holes and
            # under-representing the start of the stream.
            reservoir.append(elem)
            continue
        # Replacement phase: this is element number `count + 1` of its
        # type; keep it with probability sample_size / (count + 1) by
        # drawing a slot from 0..count and accepting only in-range slots.
        slot = random.randint(0, count)
        if slot < sample_size:
            reservoir[slot] = elem
    return samples


def main():
    """Sample the generated stream and print each sample set's numbers."""
    # Getting a time-based seed
    random.seed(time.time())
    samples = reservoir_sample_by_type(
        get_elems(elems_size), sample_size, num_samples
    )
    # Printing the samples
    for n, sample in enumerate(samples):
        print(f"Sample set {n + 1}:")
        for elem in sample:
            print(elem["num"], end=" ")
        print()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment