Created
August 2, 2017 16:01
-
-
Save pganssle/fe45b08092f2ba56e29295f56377d1f6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import string | |
from timeit import timeit | |
def _recombine_skipped_queue(tokens, skipped_idxs):
    """Join runs of consecutive skipped tokens, buffering indices in a queue.

    Indices are visited in ascending order and collected while they remain
    consecutive; each completed run is concatenated into a single string.

    :param tokens: Sequence of string tokens.
    :param skipped_idxs: Iterable of integer indices into ``tokens``. It may
        be a set or an unsorted list; it is sorted before use.
    :return: List with one concatenated string per run of consecutive
        indices, ordered by position in ``tokens``.

    >>> tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
    >>> skipped_idxs = {0, 1, 2, 5}
    >>> _recombine_skipped_queue(tokens, skipped_idxs)
    ['foo bar', 'baz']
    """
    skipped_tokens = []
    idx_queue = []

    # Sort explicitly: the run detection below compares each index with the
    # previously queued one, which is only meaningful in ascending order, and
    # set iteration order is not something to rely on.
    for idx in sorted(skipped_idxs):
        if idx_queue and idx - 1 != idx_queue[-1]:
            # Gap found: flush the finished run and start a new one.
            skipped_tokens.append(''.join(map(tokens.__getitem__, idx_queue)))
            idx_queue = []

        idx_queue.append(idx)

    # Flush whatever run was still being collected when the input ended.
    if idx_queue:
        skipped_tokens.append(''.join(map(tokens.__getitem__, idx_queue)))

    return skipped_tokens
def _recombine_skipped(tokens, skipped_idxs):
    """Join runs of consecutive skipped tokens by extending the last result.

    Each index is compared with its predecessor in sorted order: if the two
    are adjacent the token is appended to the most recent output string,
    otherwise it starts a new output string.

    :param tokens: Sequence of string tokens.
    :param skipped_idxs: Iterable of integer indices into ``tokens``. It may
        be a set or an unsorted list; it is sorted before use.
    :return: List with one concatenated string per run of consecutive
        indices, ordered by position in ``tokens``.

    >>> tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
    >>> skipped_idxs = {0, 1, 2, 5}
    >>> _recombine_skipped(tokens, skipped_idxs)
    ['foo bar', 'baz']
    """
    skipped_tokens = []

    # Keep the sorted copy so the "previous index" lookup below refers to the
    # same ordering that is being iterated (and works for sets, which cannot
    # be subscripted).
    sorted_idxs = sorted(skipped_idxs)

    for i, idx in enumerate(sorted_idxs):
        if i > 0 and idx - 1 == sorted_idxs[i - 1]:
            skipped_tokens[-1] = skipped_tokens[-1] + tokens[idx]
        else:
            skipped_tokens.append(tokens[idx])

    return skipped_tokens
def _recombine_skipped_set(tokens, skipped_idxs):
    """Join runs of consecutive skipped tokens using membership tests.

    For every index, in ascending order, the token is appended to the most
    recent output string when ``idx - 1`` is also a skipped index; otherwise
    it starts a new output string.

    :param tokens: Sequence of string tokens.
    :param skipped_idxs: Collection of integer indices into ``tokens``.
        Any container supporting ``in`` works; a set keeps each membership
        test constant-time.
    :return: List with one concatenated string per run of consecutive
        indices, ordered by position in ``tokens``.

    >>> tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
    >>> skipped_idxs = {0, 1, 2, 5}
    >>> _recombine_skipped_set(tokens, skipped_idxs)
    ['foo bar', 'baz']
    """
    skipped_tokens = []

    # sorted() already returns a new list, so no intermediate list() copy.
    for idx in sorted(skipped_idxs):
        if idx - 1 in skipped_idxs:
            # The predecessor was handled on the previous iteration, so the
            # last output string is the run this token belongs to.
            skipped_tokens[-1] = skipped_tokens[-1] + tokens[idx]
        else:
            skipped_tokens.append(tokens[idx])

    return skipped_tokens
def get_rand_tokens(n_token_sets=500, token_population=300):
    """Generate random ``(tokens, skipped_idxs)`` pairs for the benchmarks.

    A vocabulary of ``token_population`` random ASCII-letter strings (1 to 20
    characters each) is built first. Every generated pair then consists of a
    list of 1 to 15 tokens drawn with replacement from that vocabulary and a
    sorted list of distinct indices into that token list (anywhere from none
    to all of them).

    :param n_token_sets: Number of pairs to generate.
    :param token_population: Size of the vocabulary tokens are drawn from.
    :return: List of ``(tokens, skipped_idxs)`` tuples.
    """
    letters = string.ascii_letters

    # Vocabulary: for each word the length is drawn first, then its letters.
    vocabulary = []
    for _ in range(token_population):
        word_len = random.randint(1, 20)
        vocabulary.append(
            ''.join(random.choice(letters) for _ in range(word_len)))

    generated = []
    for _ in range(n_token_sets):
        size = random.randint(1, 15)
        n_skipped = random.randint(0, size)
        positions = list(range(size))

        words = [random.choice(vocabulary) for _ in range(size)]

        skipped = random.sample(positions, k=n_skipped)
        skipped.sort()

        generated.append((words, skipped))

    return generated
def assert_same_result(token_set):
    """Check that the list-based and set-based implementations agree.

    For every ``(tokens, skipped_idxs)`` pair, ``_recombine_skipped`` is
    called with the indices as given and ``_recombine_skipped_set`` with the
    same indices converted to a set.

    :param token_set: Iterable of ``(tokens, skipped_idxs)`` pairs.
    :raises AssertionError: If the two results differ for any pair; the
        message shows the inputs and both outputs.
    """
    template = ('Failure with:\n'
                ' tokens == {}\n'
                ' skipped_idxs == {}\n'
                ' lversion == {}\n'
                ' sversion == {}')

    for tokens, skipped_idxs in token_set:
        as_set = set(skipped_idxs)

        lversion = _recombine_skipped(tokens, skipped_idxs)
        sversion = _recombine_skipped_set(tokens, as_set)

        assert lversion == sversion, template.format(
            tokens, skipped_idxs, lversion, sversion)
def test_token_recombination(token_sets):
    """Run ``_recombine_skipped`` once for every argument tuple given.

    Results are discarded; this is the workload that gets timed.

    :param token_sets: Iterable of argument tuples, each unpacked into the
        call.
    """
    for call_args in token_sets:
        _recombine_skipped(*call_args)
def test_token_recombination_queue(token_sets):
    """Run ``_recombine_skipped_queue`` once for every argument tuple given.

    Results are discarded; this is the workload that gets timed.

    :param token_sets: Iterable of argument tuples, each unpacked into the
        call.
    """
    for call_args in token_sets:
        _recombine_skipped_queue(*call_args)
def test_token_recombination_set(token_sets_set):
    """Run ``_recombine_skipped_set`` once for every argument tuple given.

    Results are discarded; this is the workload that gets timed.

    :param token_sets_set: Iterable of argument tuples, each unpacked into
        the call.
    """
    for call_args in token_sets_set:
        _recombine_skipped_set(*call_args)
def print_results(loop_sets, N_loops, N_sets):
    """Print the mean time per loop (ms) and per token set (us).

    :param loop_sets: Iterable of per-loop durations, in seconds.
    :param N_loops: Number of timed loops; the summed duration is divided by
        this to get the per-loop mean.
    :param N_sets: Number of token sets processed in each loop; the per-loop
        mean is divided by this to get the per-set mean.
    :raises ZeroDivisionError: If ``N_loops`` or ``N_sets`` is zero.
    """
    # Use the argument that was passed in rather than reaching for a
    # module-level variable that may or may not exist.
    ms_per_loop = sum(loop_sets) * 1000 / N_loops
    us_per_set = (ms_per_loop / N_sets) * 1000

    print('{} loops, {} sets: {:0.3f} ms per loop, {:0.3f} us per set'.format(
        N_loops, N_sets, ms_per_loop, us_per_set))
if __name__ == "__main__":
    from timeit import default_timer as timer

    # Sanity check: every implementation must reproduce the known answer for
    # the documented example before anything is timed. Comparing against a
    # literal (rather than one implementation against another) also catches
    # the case where two implementations are wrong in the same way.
    tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
    skipped_idxs = [0, 1, 2, 5]
    skipped_idxs_set = set(skipped_idxs)
    expected = ["foo bar", "baz"]

    assert _recombine_skipped(tokens, skipped_idxs) == expected
    assert _recombine_skipped_queue(tokens, skipped_idxs) == expected
    assert _recombine_skipped_set(tokens, skipped_idxs_set) == expected

    # Test with random sets
    N_sets = 500
    N_loops = 1000

    token_sets = get_rand_tokens(N_sets)
    # Same data with the indices pre-converted to sets, so the conversion
    # cost is not included in the timing of the set-based implementation.
    token_sets_set = [(toks, set(idxs)) for toks, idxs in token_sets]

    assert_same_result(token_sets)

    # (label, workload function, positional arguments for the workload)
    runs = [('extend', test_token_recombination, (token_sets,)),
            ('queue', test_token_recombination_queue, (token_sets,)),
            ('set', test_token_recombination_set, (token_sets_set,))]

    for name, func, args in runs:
        print('Running {} timing test'.format(name))

        # One wall-clock duration (seconds) per full pass over the data.
        loop_times = []
        for _ in range(N_loops):
            t_start = timer()
            func(*args)
            t_end = timer()

            loop_times.append(t_end - t_start)

        print_results(loop_times, N_loops, N_sets)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment