Created
April 25, 2020 10:58
-
-
Save echasnovski/2008041c73fe07944de6f841db80b71e to your computer and use it in GitHub Desktop.
Simple class for doing "cycled sampling": iterative sampling when currently sampled element is put in the beginning of sampling pool.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
# Used only for exploration | |
import numpy as np | |
class CycleSample: | |
def __init__(self, x, weights=None, preshuffle=True): | |
self.pool = list(x) | |
if preshuffle: | |
random.shuffle(self.pool) | |
self._n = len(x) | |
self._inds = list(range(self._n)) | |
# For future, it can be useful for `weights` to be a function that is | |
# applied to current pool before each draw. For example, this enables | |
# usage "Don't sample recent `k` items and sample rest with probability | |
# proportional to items' lengths". This will introduce significant | |
# overhead, though. | |
self.weights = weights | |
def _pop(self, index): | |
"""Return item at index and place it at the beginning""" | |
res = self.pool.pop(index) | |
self.pool.insert(0, res) | |
return res | |
def draw(self, size=1): | |
"""Draw sample | |
Draw sample based on the following algorithm: | |
- Select position in the current pool, at which element will be drawn. | |
- Pop this elment from the pool: extract it, put at the beginning of | |
the pool, and return it. | |
- Repeat. | |
Parameters | |
---------- | |
size : int, optional | |
Size of sample to draw, by default 1 | |
Returns | |
------- | |
draw : list | |
List of drawn elements | |
""" | |
inds = random.choices(self._inds, weights=self.weights, k=size) | |
return [self._pop(i) for i in inds] | |
def __str__(self): | |
return ( | |
"Cycle sampling object with current pool " | |
"(from most to least recently used):\n" | |
f"{self.pool}" | |
) | |
def count_period_repeats(x, periods=[1]): | |
x = np.asarray(x) | |
return {st: np.sum(x[:-st] == x[st:]) for st in periods} | |
x = list(range(10)) | |
n_sample = 10000 | |
n_recent = 3 | |
# `n_recent` most recently drawn elements won't be drawn | |
not_n_recent_weights = [0] * n_recent + [1] * (len(x) - n_recent) | |
not_n_recent = CycleSample(x, not_n_recent_weights) | |
rand_draw_not_n_recent = np.array(not_n_recent.draw(n_sample)) | |
print(np.bincount(rand_draw_not_n_recent)) | |
print(count_period_repeats(rand_draw_not_n_recent, periods=range(1, 11))) | |
# `n_recent` most recently drawn elements won't be drawn AND the recent - | |
# following "human logic" (the least recently drawn element will have bigger | |
# probability to be chosen) | |
not_n_recent_human_weights = [0] * n_recent + list(range(1, len(x) - n_recent + 1)) | |
not_n_recent_human = CycleSample(x, not_n_recent_human_weights) | |
rand_draw_not_n_recent_human = np.array(not_n_recent_human.draw(n_sample)) | |
print(np.bincount(rand_draw_not_n_recent_human)) | |
print(count_period_repeats(rand_draw_not_n_recent_human, periods=range(1, 11))) | |
#%% First experiments | |
from collections import Counter | |
import numpy as np | |
import matplotlib | |
import matplotlib.pyplot as plt | |
class HumanRandom: | |
def __init__(self, x): | |
self.pool = list(np.random.permutation(x)) | |
self.weights = np.arange(1, len(x) + 1) | |
self.weights = self.weights / np.sum(self.weights) | |
def draw(self, size): | |
inds = np.random.choice(np.arange(len(self.pool)), size=size, p=self.weights) | |
return [self.pop_to_front(i) for i in inds] | |
def pop_to_front(self, index): | |
res = self.pool.pop(index) | |
self.pool.insert(0, res) | |
return res | |
def __str__(self): | |
return f"Human Random with pool: {repr(self.pool)}" | |
# Tests for implementation correctness | |
x = np.arange(4) | |
humrand = HumanRandom(x) | |
str(humrand) | |
c = Counter() | |
for _ in range(10000): | |
pool_before = humrand.pool.copy() | |
res = humrand.draw(1) | |
c[pool_before.index(res)] += 1 | |
## Should be approximately in proportion 1:2:3:4 | |
dict(sorted(c.items(), key=lambda x: x[1])) | |
# Tests for draw properties | |
x = np.arange(20) | |
random_draw = np.random.choice(x, size=100000) | |
humrandom_draw = np.array(HumanRandom(x).draw(100000)) | |
## "Human random" has less "empty spots" which indicates for "more random", | |
## i.e. less streaks without certain number | |
plt.plot(random_draw) | |
plt.plot(humrandom_draw) | |
print(f"Simple random bin count: {np.bincount(random_draw)}") | |
print(f"Human random bin count: {np.bincount(humrandom_draw)}") | |
## Average distances between consecutive draws of one element | |
def count_avg_dist(x, element): | |
return np.mean(np.diff((x == element).nonzero())) | |
print({el: count_avg_dist(random_draw, el) for el in x}) | |
print({el: count_avg_dist(humrandom_draw, el) for el in x}) | |
## Number of consecutive repeats | |
def count_steps_repeats(x, steps=[1]): | |
return {st: np.sum(x[:-st] == x[st:]) for st in steps} | |
## Using "human random" significantly reduces number of repeats with step less | |
## than number of initial elements | |
steps = np.arange(1, 21) | |
random_step_repeats = count_steps_repeats(random_draw, steps) | |
humrandom_step_repeats = count_steps_repeats(humrandom_draw, steps) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment