Skip to content

Instantly share code, notes, and snippets.

@HelloZeroNet
Last active March 31, 2016 10:53
Show Gist options
  • Save HelloZeroNet/0eed5fc05227ab793e7d to your computer and use it in GitHub Desktop.
Save HelloZeroNet/0eed5fc05227ab793e7d to your computer and use it in GitHub Desktop.
Fast diff and undiff in python
def diff(old, new):
# via https://github.com/paulgb/simplediff/blob/master/python/LICENSE by Paul Butler
# Create a map from old values to their indices
old_index_map = dict()
for i, val in enumerate(old):
old_index_map.setdefault(val,list()).append(i)
# Find the largest substring common to old and new.
# We use a dynamic programming approach here.
#
# We iterate over each value in the `new` list, calling the
# index `inew`. At each iteration, `overlap[i]` is the
# length of the largest suffix of `old[:i]` equal to a suffix
# of `new[:inew]` (or unset when `old[i]` != `new[inew]`).
#
# At each stage of iteration, the new `overlap` (called
# `_overlap` until the original `overlap` is no longer needed)
# is built from the old one.
#
# If the length of overlap exceeds the largest substring
# seen so far (`sub_length`), we update the largest substring
# to the overlapping strings.
overlap = dict()
# `sub_start_old` is the index of the beginning of the largest overlapping
# substring in the old list. `sub_start_new` is the index of the beginning
# of the same substring in the new list. `sub_length` is the length that
# overlaps in both.
# These track the largest overlapping substring seen so far, so naturally
# we start with a 0-length substring.
sub_start_old = 0
sub_start_new = 0
sub_length = 0
for inew, val in enumerate(new):
_overlap = dict()
for iold in old_index_map.get(val,list()):
# now we are considering all values of iold such that
# `old[iold] == new[inew]`.
_overlap[iold] = (iold and overlap.get(iold - 1, 0)) + 1
if(_overlap[iold] > sub_length):
# this is the largest substring seen so far, so store its
# indices
sub_length = _overlap[iold]
sub_start_old = iold - sub_length + 1
sub_start_new = inew - sub_length + 1
overlap = _overlap
if sub_length == 0:
# If no common substring is found, we return an insert and delete...
return (old and [('-', len(old))] or []) + (new and [('+', new)] or [])
else:
# ...otherwise, the common substring is unchanged and we recursively
# diff the text before and after that substring
return diff(old[ : sub_start_old], new[ : sub_start_new]) + \
[('=', sub_length)] + \
diff(old[sub_start_old + sub_length : ], new[sub_start_new + sub_length : ])
#lines_from = ["Hello", "Bello", "Ize", "Mize"]
#lines_to = ["Hello", "Mello", "Ize", "Aze", "Laze", "Mize"]
lines_from = list(open("users_archive2.json"))
lines_to = list(open("users_archive.json"))
import time, json
s = time.time()
diff_out = diff(lines_from, lines_to)
#print diff(list(json_from), list(json_to))
print "Diff", time.time()-s
for line in diff_out:
print line
# Undiff
def undiff(lines, diffs):
i = 0
for action, param in diffs:
if action == "=": # Same
i += param
elif action == "-": # Delete
del(lines[i:i+param])
elif action == "+":
for add_line in param:
lines.insert(i, add_line)
i += 1
return len(lines) == i
s = time.time()
for i in range(1000):
lines_from_copy = lines_from[:]
assert lines_from_copy != lines_to
assert undiff(lines_from_copy, diff_out)
assert lines_from_copy == lines_to
print "Undiff x 1000", time.time()-s
import difflib
def diff(old, new):
matcher = difflib.SequenceMatcher(None, old, new)
diffs = []
for tag, old_from, old_to, new_from, new_to in matcher.get_opcodes():
if tag == "insert":
diffs.append(("+", new[new_from:new_to]))
elif tag == "equal":
diffs.append(("=", old_to-old_from))
elif tag == "delete":
diffs.append(("-", old_to-old_from))
elif tag == "replace":
diffs.append(("-", old_to-old_from))
diffs.append(("+", new[new_from:new_to]))
return diffs
def patch(old, diffs):
i = 0
for action, param in diffs:
if action == "=": # Same lines
i += param
elif action == "-": # Delete lines
del(old[i:i + param])
elif action == "+": # Add lines
for add_line in param:
old.insert(i, add_line)
i += 1
return len(old) == i
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment