alexgarel · April 11, 2022 14:45
diff --git a/test_stream_random_selection.py b/test_stream_random_selection.py
 import collections
 import random
 import statistics

 def run_exp(k, N):
    """Run one experiment: stream-select k items among the integers 0..N-1.

    Every incoming item draws a uniformly random rank among all the items
    seen so far (itself included). Only the k best-ranked items are kept:
    an item whose rank falls outside that window is discarded right away,
    and an accepted item pushes the current last one out. This yields the
    same list as inserting every item into a full random ordering and
    slicing its first k entries, but never stores more than k items.

    Args:
        k: number of items to keep; must be non-negative.
        N: length of the simulated stream.

    Returns:
        A list of min(k, N) distinct integers taken from range(N).

    Raises:
        ValueError: if k is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    result = []
    for seen, item in enumerate(range(N), start=1):
        # One draw per item, accepted or not, so the sequence of random
        # numbers consumed does not depend on k.
        index = random.randrange(seen)
        # Compare against k (not N): index < seen <= N always holds, so a
        # test against N would accept every item.
        if index < k:
            result.insert(index, item)
            # The insertion may have pushed one item past the window.
            del result[k:]
    return result

 def run_exps(k, N):
    """Run N * 100 experiments, each selecting k items among N, and tally the picks.

    Prints the expected number of picks per item (k * 100, i.e. N * 100 runs
    times a k / N chance per run when k <= N) next to the mean and the sample
    standard deviation of the observed per-item counts.

    Returns:
        A collections.Counter mapping each picked item to its pick count.
    """
    picks = collections.Counter()
    total_runs = 100 * N
    for _ in range(total_runs):
        picks.update(run_exp(k, N))
    counts = picks.values()
    mean = statistics.mean(counts)
    std_dev = statistics.stdev(counts)
    print("Expected value:", k * 100, "Mean:", mean, "Std dev:", std_dev)
    return picks

 data = run_exps(k=10, N=200)
 # Sample output from previous runs:
 # Expected value: 1000 Mean: 1000 Std dev: 29.911595035837518
 # Expected value: 1000 Mean: 1000 Std dev: 30.925912440888588
 # Expected value: 1000 Mean: 1000 Std dev: 28.366765181003117

 # Larger run, left disabled because it is slow:
 # data = run_exps(20, 2000)  # takes around 5 min on my cpu !
 # Expected value: 2000 Mean: 2000 Std dev: 43.85401449461209
	import collections
	import statistics

	def run_exp(k, N):
	    """Run one experiment: stream-select k items among the integers 0..N-1.

	    Every incoming item draws a uniformly random rank among all the items
	    seen so far (itself included). Only the k best-ranked items are kept:
	    an item whose rank falls outside that window is discarded right away,
	    and an accepted item pushes the current last one out. This yields the
	    same list as inserting every item into a full random ordering and
	    slicing its first k entries, but never stores more than k items.

	    Args:
	        k: number of items to keep; must be non-negative.
	        N: length of the simulated stream.

	    Returns:
	        A list of min(k, N) distinct integers taken from range(N).

	    Raises:
	        ValueError: if k is negative.
	    """
	    if k < 0:
	        raise ValueError("k must be non-negative")
	    result = []
	    for seen, item in enumerate(range(N), start=1):
	        # One draw per item, accepted or not, so the sequence of random
	        # numbers consumed does not depend on k.
	        index = random.randrange(seen)
	        # Compare against k (not N): index < seen <= N always holds, so a
	        # test against N would accept every item.
	        if index < k:
	            result.insert(index, item)
	            # The insertion may have pushed one item past the window.
	            del result[k:]
	    return result

	def run_exps(k, N):
	    """Run N * 100 experiments, each selecting k items among N, and tally the picks.

	    Prints the expected number of picks per item (k * 100, i.e. N * 100 runs
	    times a k / N chance per run when k <= N) next to the mean and the sample
	    standard deviation of the observed per-item counts.

	    Returns:
	        A collections.Counter mapping each picked item to its pick count.
	    """
	    num = N * 100
	    c = collections.Counter()
	    for _ in range(num):
	        c.update(run_exp(k, N))
	    counts = c.values()
	    print("Expected value:", k * 100, "Mean:", statistics.mean(counts), "Std dev:", statistics.stdev(counts))
	    return c

	data = run_exps(k=10, N=200)
	# Sample output from previous runs:
	# Expected value: 1000 Mean: 1000 Std dev: 29.911595035837518
	# Expected value: 1000 Mean: 1000 Std dev: 30.925912440888588
	# Expected value: 1000 Mean: 1000 Std dev: 28.366765181003117

	# Larger run, left disabled because it is slow:
	# data = run_exps(20, 2000) # takes around 5 min on my cpu !
	# Expected value: 2000 Mean: 2000 Std dev: 43.85401449461209