evansd · May 17, 2011 17:18
diff --git a/heapqutils.py b/heapqutils.py
 # For more explanation and usage examples, see:
 # http://www.drhevans.com/blog/posts/331-comparing-large-data-sets-in-python/

 from itertools import imap, izip, count, repeat, tee
 import heapq

 def full_outer_join(*iterables, **kwargs):
    """
    Perform a full outer join on a sequence of sorted iterables, where the
    key function supplies the join condition.

    full_outer_join(iterables... [, key=lambda x:(x,)][, missing=None])

    Where an iterable has no matching item for a particular key, the value
    of the keyword argument "missing" will be returned (by default: None).

    Yields tuples of the form: (key, list_of_values_with_that_key)

    Note: each of the iterables must already be in ascending order by key.

    >>> for i in full_outer_join([1,2,3,5], [4], [1,1,3,5,5,6]):
    ...     print i
    ... 
    ((1,), [1, None, 1])
    ((1,), [None, None, 1])
    ((2,), [2, None, None])
    ((3,), [3, None, 3])
    ((4,), [None, 4, None])
    ((5,), [5, None, 5])
    ((5,), [None, None, 5])
    ((6,), [None, None, 6])
    """
    key = kwargs.pop('key', None)
    missing = kwargs.pop('missing', None)
    # Decorate each iterator in the following format:
    #  - key: The result of applying the key function to the value
    #  - counter: An always incrementing counter which acts as a tie-
    #             breaker so that elements with the same key get returned
    #             in their original order
    #  - iter_num: So we know which iterator each value came from
    #  - value: The original value yielded from the iterator
    counter = count()
    decorated = [
        izip(imap(key, iter1), counter, repeat(iter_num), iter2)
        for iter_num, (iter1, iter2) in enumerate(map(tee, iterables))
    ]
    empty_output = [missing] * len(iterables)
    output = empty_output[:]
    # heapq.merge does most of the actual work here
    for key, counter, iter_num, value in heapq.merge(*decorated):
        try:
            # If the key has changed, or we've already seen this key from
            # this particular iterator, yield and reset the output array
            if key != prev_key or output[iter_num] is not missing:
              yield (prev_key, output)
              output[:] = empty_output
        # Allow for there being no prev_key on first run of loop
        except NameError:
            pass
        output[iter_num] = value
        prev_key = key
    # If there's output pending, yield it
    if output != empty_output:
        yield (key, output)
	# For more explanation and usage examples, see:
	# http://www.drhevans.com/blog/posts/331-comparing-large-data-sets-in-python/

	from itertools import imap, izip, count, repeat, tee
	import heapq

	def full_outer_join(iterables, *kwargs):
	"""
	Perform a full outer join on a sequence of sorted iterables, where the
	key function supplies the join condition.

	full_outer_join(iterables... [, key=lambda x:(x,)][, missing=None])

	Where an iterable has no matching item for a particular key, the value
	of the keyword argument "missing" will be returned (by default: None).

	Yields tuples of the form: (key, list_of_values_with_that_key)

	Note: each of the iterables must already be in ascending order by key.

	>>> for i in full_outer_join([1,2,3,5], [4], [1,1,3,5,5,6]):
	... print i
	...
	((1,), [1, None, 1])
	((1,), [None, None, 1])
	((2,), [2, None, None])
	((3,), [3, None, 3])
	((4,), [None, 4, None])
	((5,), [5, None, 5])
	((5,), [None, None, 5])
	((6,), [None, None, 6])
	"""
	key = kwargs.pop('key', None)
	missing = kwargs.pop('missing', None)
	# Decorate each iterator in the following format:
	# - key: The result of applying the key function to the value
	# - counter: An always incrementing counter which acts as a tie-
	# breaker so that elements with the same key get returned
	# in their original order
	# - iter_num: So we know which iterator each value came from
	# - value: The original value yielded from the iterator
	counter = count()
	decorated = [
	izip(imap(key, iter1), counter, repeat(iter_num), iter2)
	for iter_num, (iter1, iter2) in enumerate(map(tee, iterables))
	]
	empty_output = [missing] * len(iterables)
	output = empty_output[:]
	# heapq.merge does most of the actual work here
	for key, counter, iter_num, value in heapq.merge(*decorated):
	try:
	# If the key has changed, or we've already seen this key from
	# this particular iterator, yield and reset the output array
	if key != prev_key or output[iter_num] is not missing:
	yield (prev_key, output)
	output[:] = empty_output
	# Allow for there being no prev_key on first run of loop
	except NameError:
	pass
	output[iter_num] = value
	prev_key = key
	# If there's output pending, yield it
	if output != empty_output:
	yield (key, output)