Last active
August 29, 2015 14:00
-
-
Save thecapacity/80cf9b4a3809a05ce248 to your computer and use it in GitHub Desktop.
Testing parms for efficiently searching a given population for duplicates with a given sample size and number of iterations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from optparse import OptionParser | |
import collections | |
import itertools | |
import random | |
import math | |
import profile | |
import sys | |
def has_dups(sample): | |
if len(sample) != len(set(sample)): return True | |
else: return False | |
def run_test(population, overlap, sample_size, iterations): | |
results = { 'items_seen': set(), 'dupes_found': set(), | |
'pop_size': len(population) } | |
add = math.floor(len(pop) * overlap) | |
results['dupes_added'] = int(add) | |
overlap = (random.choice(population) for i in xrange(int(add))) | |
population.extend(overlap) | |
random.shuffle(population) | |
samples = (itertools.combinations(population, sample_size)) | |
s = None | |
for i in xrange(iterations): | |
try: | |
s = next(samples) | |
except Exception as e: | |
print "Sample:", s | |
print "Cobinations Exhausted @Itr:", i | |
print e | |
break | |
results['items_seen'] = results['items_seen'].union(s) | |
if has_dups(s): | |
dupes = [x for x, y in collections.Counter(s).items() if y > 1] | |
results['dupes_found'] = results['dupes_found'].union(dupes) | |
results['dupes_found'] = len(results['dupes_found'] ) | |
results['items_seen'] = len(results['items_seen'] ) | |
results['work'] = sample_size * iterations | |
results['success_rate'] = results['dupes_found'] / add | |
results['efficiency'] = results['success_rate'] / results['work'] | |
return results | |
if __name__ == "__main__": | |
parser = OptionParser() | |
parser.add_option("-n", "--num_sample", dest="n", type="int", | |
help="How many items are sampled in each iteration", | |
default = 10) | |
parser.add_option("-p", "--pop_size", dest="pop_size", type="int", | |
help="How big is the overall population", | |
default = 100) | |
parser.add_option("-i", "--iterations", dest="i", type="int", | |
help="How many times to sample (i.e. total iterations)", | |
default = 1000) | |
parser.add_option("-o", "--overlaop", dest="overlap", type="float", | |
help="Decimal amount of overlap to create (i.e. dupes)", | |
default = .10) | |
(options, args) = parser.parse_args() | |
pop = range(options.pop_size) | |
results = run_test(pop, options.overlap, options.n, options.i) | |
print results | |
#profile.run("results = run_test(pop, overlap, n, i)") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment