Created
May 17, 2011 17:18
-
-
Save evansd/976885 to your computer and use it in GitHub Desktop.
Some useful iterator functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For more explanation and usage examples, see: | |
# http://www.drhevans.com/blog/posts/331-comparing-large-data-sets-in-python/ | |
from itertools import imap, izip, count, repeat, tee | |
import heapq | |
def full_outer_join(*iterables, **kwargs): | |
""" | |
Perform a full outer join on a sequence of sorted iterables, where the | |
key function supplies the join condition. | |
full_outer_join(iterables... [, key=lambda x:(x,)][, missing=None]) | |
Where an iterable has no matching item for a particular key, the value | |
of the keyword argument "missing" will be returned (by default: None). | |
Yields tuples of the form: (key, list_of_values_with_that_key) | |
Note: each of the iterables must already be in ascending order by key. | |
>>> for i in full_outer_join([1,2,3,5], [4], [1,1,3,5,5,6]): | |
... print i | |
... | |
((1,), [1, None, 1]) | |
((1,), [None, None, 1]) | |
((2,), [2, None, None]) | |
((3,), [3, None, 3]) | |
((4,), [None, 4, None]) | |
((5,), [5, None, 5]) | |
((5,), [None, None, 5]) | |
((6,), [None, None, 6]) | |
""" | |
key = kwargs.pop('key', None) | |
missing = kwargs.pop('missing', None) | |
# Decorate each iterator in the following format: | |
# - key: The result of applying the key function to the value | |
# - counter: An always incrementing counter which acts as a tie- | |
# breaker so that elements with the same key get returned | |
# in their original order | |
# - iter_num: So we know which iterator each value came from | |
# - value: The original value yielded from the iterator | |
counter = count() | |
decorated = [ | |
izip(imap(key, iter1), counter, repeat(iter_num), iter2) | |
for iter_num, (iter1, iter2) in enumerate(map(tee, iterables)) | |
] | |
empty_output = [missing] * len(iterables) | |
output = empty_output[:] | |
# heapq.merge does most of the actual work here | |
for key, counter, iter_num, value in heapq.merge(*decorated): | |
try: | |
# If the key has changed, or we've already seen this key from | |
# this particular iterator, yield and reset the output array | |
if key != prev_key or output[iter_num] is not missing: | |
yield (prev_key, output) | |
output[:] = empty_output | |
# Allow for there being no prev_key on first run of loop | |
except NameError: | |
pass | |
output[iter_num] = value | |
prev_key = key | |
# If there's output pending, yield it | |
if output != empty_output: | |
yield (key, output) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment