Skip to content

Instantly share code, notes, and snippets.

@benburrill
Created January 4, 2018 10:33
Show Gist options
  • Save benburrill/798f982defb864ef0cc9d2180da01ff0 to your computer and use it in GitHub Desktop.
Save benburrill/798f982defb864ef0cc9d2180da01ff0 to your computer and use it in GitHub Desktop.
Iterator diffing for /u/Skaperen
"""
Iterator diffing: a response to
https://www.reddit.com/r/learnpython/comments/7o0swm/interleaving_iterators/
MIT License:
Copyright (c) 2017 Ben Burrill
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import itertools as it
from collections import namedtuple, deque
def itdiff(a, b, *, lookahead=1000):
"""
A naïve function for iterable diffing
Returns Diff objects representing the differences between a and b
lookahead is the maximum number of consecutive items that can be
considered as a single insertion or deletion.
Memory concerns:
Auxiliary memory is theoretically bounded by `lookahead`, but
this function may create tees of tees of tees of tees, which
will probably take up a small amount of memory each time due to
the cost of the abstraction. However, this only happens when
"rem" or "change" diffs are encountered (not "add" diffs), and I
believe it may be possible to write this function in a more
memory-efficient way.
Subtleties:
This function readily (ab)uses the fact that a tee'd iterator
will get exhausted as far as the farthest reached point by one
of its tees. Keep that in mind if you are trying to comprehend
"where" each iterator is at a given time.
Final notes:
I have no clue what I'm doing. I don't know anything about
diffing algorithms, and I'm sure there is a better way to go
about this.
"""
a = iter(a)
b = iter(b)
bubble = ()
while True:
a_copy, a_head = it.tee(a)
b_copy, b_head = it.tee(b)
try:
na = next(a_head)
except StopIteration:
yield from map(Diff.add, b_head)
return
try:
nb = next(b_head)
except StopIteration:
yield from map(Diff.rem, a_head)
return
if bubble:
if na != nb:
# If we start in a bubble, all we can do is mark the
# diff as changed and continue on.
yield Diff.change(na, nb)
else:
bubble = ()
elif na != nb:
bubble = (na, nb)
try:
yield from insertions(b_copy, na, lookahead=lookahead)
bubble = ()
# b is exhausted up to and including na
continue
except TooManyInsertions:
b = b_head
try:
yield from map(Diff.invert, insertions(a_copy, nb, lookahead=lookahead))
bubble = ()
# a is exhausted up to and including nb
continue
except TooManyInsertions:
a = a_head
yield Diff.change(na, nb)
class Diff(namedtuple("Diff", ["kind", "old", "new"])):
""" Silly class """
@classmethod
def add(cls, item):
return cls("add", None, item)
@classmethod
def rem(cls, item):
return cls("rem", item, None)
@classmethod
def change(cls, old, new):
return cls("change", old, new)
def invert(self):
kind = {
"add": "rem",
"rem": "add"
}.get(self.kind, self.kind)
return type(self)(kind, self.new, self.old)
def upto(iterable, stop):
""" everything up to and including stop """
for item in iterable:
yield item
if item == stop:
break
class TooManyInsertions(Exception):
""" Internal signaling exception, you shouldn't ever see it """
def insertions(iterable, marker, *, lookahead):
"""
This function only really makes sense in the context of itdiff.
"""
known = deque(upto(it.islice(iterable, lookahead), marker))
if known.pop() == marker: # Found it!
yield from map(Diff.add, known)
else:
raise TooManyInsertions
if __name__ == "__main__":
# Example: an infinite stream with infinite insertions
for diff in itdiff(it.repeat(0), it.cycle([0, 42, 1114111])):
print(diff)
@benburrill
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment