Skip to content

Instantly share code, notes, and snippets.

@evansd
Created May 17, 2011 17:18
Show Gist options
  • Save evansd/976885 to your computer and use it in GitHub Desktop.
Save evansd/976885 to your computer and use it in GitHub Desktop.
Some useful iterator functions
# For more explanation and usage examples, see:
# http://www.drhevans.com/blog/posts/331-comparing-large-data-sets-in-python/
from itertools import imap, izip, count, repeat, tee
import heapq
def full_outer_join(*iterables, **kwargs):
"""
Perform a full outer join on a sequence of sorted iterables, where the
key function supplies the join condition.
full_outer_join(iterables... [, key=lambda x:(x,)][, missing=None])
Where an iterable has no matching item for a particular key, the value
of the keyword argument "missing" will be returned (by default: None).
Yields tuples of the form: (key, list_of_values_with_that_key)
Note: each of the iterables must already be in ascending order by key.
>>> for i in full_outer_join([1,2,3,5], [4], [1,1,3,5,5,6]):
... print i
...
((1,), [1, None, 1])
((1,), [None, None, 1])
((2,), [2, None, None])
((3,), [3, None, 3])
((4,), [None, 4, None])
((5,), [5, None, 5])
((5,), [None, None, 5])
((6,), [None, None, 6])
"""
key = kwargs.pop('key', None)
missing = kwargs.pop('missing', None)
# Decorate each iterator in the following format:
# - key: The result of applying the key function to the value
# - counter: An always incrementing counter which acts as a tie-
# breaker so that elements with the same key get returned
# in their original order
# - iter_num: So we know which iterator each value came from
# - value: The original value yielded from the iterator
counter = count()
decorated = [
izip(imap(key, iter1), counter, repeat(iter_num), iter2)
for iter_num, (iter1, iter2) in enumerate(map(tee, iterables))
]
empty_output = [missing] * len(iterables)
output = empty_output[:]
# heapq.merge does most of the actual work here
for key, counter, iter_num, value in heapq.merge(*decorated):
try:
# If the key has changed, or we've already seen this key from
# this particular iterator, yield and reset the output array
if key != prev_key or output[iter_num] is not missing:
yield (prev_key, output)
output[:] = empty_output
# Allow for there being no prev_key on first run of loop
except NameError:
pass
output[iter_num] = value
prev_key = key
# If there's output pending, yield it
if output != empty_output:
yield (key, output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment