Created
January 4, 2018 10:33
-
-
Save benburrill/798f982defb864ef0cc9d2180da01ff0 to your computer and use it in GitHub Desktop.
Iterator diffing for /u/Skaperen
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Iterator diffing: a response to | |
https://www.reddit.com/r/learnpython/comments/7o0swm/interleaving_iterators/ | |
MIT License: | |
Copyright (c) 2017 Ben Burrill | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. | |
""" | |
import itertools as it | |
from collections import namedtuple, deque | |
def itdiff(a, b, *, lookahead=1000): | |
""" | |
A naïve function for iterable diffing | |
Returns Diff objects representing the differences between a and b | |
lookahead is the maximum number of consecutive items that can be | |
considered as a single insertion or deletion. | |
Memory concerns: | |
Auxiliary memory is theoretically bounded by `lookahead`, but | |
this function may create tees of tees of tees of tees, which | |
will probably take up a small amount of memory each time due to | |
the cost of the abstraction. However, this only happens when | |
"rem" or "change" diffs are encountered (not "add" diffs), and I | |
believe it may be possible to write this function in a more | |
memory-efficient way. | |
Subtleties: | |
This function readily (ab)uses the fact that a tee'd iterator | |
will get exhausted as far as the farthest reached point by one | |
of its tees. Keep that in mind if you are trying to comprehend | |
"where" each iterator is at a given time. | |
Final notes: | |
I have no clue what I'm doing. I don't know anything about | |
diffing algorithms, and I'm sure there is a better way to go | |
about this. | |
""" | |
a = iter(a) | |
b = iter(b) | |
bubble = () | |
while True: | |
a_copy, a_head = it.tee(a) | |
b_copy, b_head = it.tee(b) | |
try: | |
na = next(a_head) | |
except StopIteration: | |
yield from map(Diff.add, b_head) | |
return | |
try: | |
nb = next(b_head) | |
except StopIteration: | |
yield from map(Diff.rem, a_head) | |
return | |
if bubble: | |
if na != nb: | |
# If we start in a bubble, all we can do is mark the | |
# diff as changed and continue on. | |
yield Diff.change(na, nb) | |
else: | |
bubble = () | |
elif na != nb: | |
bubble = (na, nb) | |
try: | |
yield from insertions(b_copy, na, lookahead=lookahead) | |
bubble = () | |
# b is exhausted up to and including na | |
continue | |
except TooManyInsertions: | |
b = b_head | |
try: | |
yield from map(Diff.invert, insertions(a_copy, nb, lookahead=lookahead)) | |
bubble = () | |
# a is exhausted up to and including nb | |
continue | |
except TooManyInsertions: | |
a = a_head | |
yield Diff.change(na, nb) | |
class Diff(namedtuple("Diff", ["kind", "old", "new"])): | |
""" Silly class """ | |
@classmethod | |
def add(cls, item): | |
return cls("add", None, item) | |
@classmethod | |
def rem(cls, item): | |
return cls("rem", item, None) | |
@classmethod | |
def change(cls, old, new): | |
return cls("change", old, new) | |
def invert(self): | |
kind = { | |
"add": "rem", | |
"rem": "add" | |
}.get(self.kind, self.kind) | |
return type(self)(kind, self.new, self.old) | |
def upto(iterable, stop): | |
""" everything up to and including stop """ | |
for item in iterable: | |
yield item | |
if item == stop: | |
break | |
class TooManyInsertions(Exception): | |
""" Internal signaling exception, you shouldn't ever see it """ | |
def insertions(iterable, marker, *, lookahead): | |
""" | |
This function only really makes sense in the context of itdiff. | |
""" | |
known = deque(upto(it.islice(iterable, lookahead), marker)) | |
if known.pop() == marker: # Found it! | |
yield from map(Diff.add, known) | |
else: | |
raise TooManyInsertions | |
if __name__ == "__main__": | |
# Example: an infinite stream with infinite insertions | |
for diff in itdiff(it.repeat(0), it.cycle([0, 42, 1114111])): | |
print(diff) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://gist.github.com/benburrill/798f982defb864ef0cc9d2180da01ff0#file-itdiff-py-L79 is a bug probably.