Last active
March 31, 2016 10:53
-
-
Save HelloZeroNet/0eed5fc05227ab793e7d to your computer and use it in GitHub Desktop.
Fast diff and undiff in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def diff(old, new): | |
# via https://github.com/paulgb/simplediff/blob/master/python/LICENSE by Paul Butler | |
# Create a map from old values to their indices | |
old_index_map = dict() | |
for i, val in enumerate(old): | |
old_index_map.setdefault(val,list()).append(i) | |
# Find the largest substring common to old and new. | |
# We use a dynamic programming approach here. | |
# | |
# We iterate over each value in the `new` list, calling the | |
# index `inew`. At each iteration, `overlap[i]` is the | |
# length of the largest suffix of `old[:i]` equal to a suffix | |
# of `new[:inew]` (or unset when `old[i]` != `new[inew]`). | |
# | |
# At each stage of iteration, the new `overlap` (called | |
# `_overlap` until the original `overlap` is no longer needed) | |
# is built from the old one. | |
# | |
# If the length of overlap exceeds the largest substring | |
# seen so far (`sub_length`), we update the largest substring | |
# to the overlapping strings. | |
overlap = dict() | |
# `sub_start_old` is the index of the beginning of the largest overlapping | |
# substring in the old list. `sub_start_new` is the index of the beginning | |
# of the same substring in the new list. `sub_length` is the length that | |
# overlaps in both. | |
# These track the largest overlapping substring seen so far, so naturally | |
# we start with a 0-length substring. | |
sub_start_old = 0 | |
sub_start_new = 0 | |
sub_length = 0 | |
for inew, val in enumerate(new): | |
_overlap = dict() | |
for iold in old_index_map.get(val,list()): | |
# now we are considering all values of iold such that | |
# `old[iold] == new[inew]`. | |
_overlap[iold] = (iold and overlap.get(iold - 1, 0)) + 1 | |
if(_overlap[iold] > sub_length): | |
# this is the largest substring seen so far, so store its | |
# indices | |
sub_length = _overlap[iold] | |
sub_start_old = iold - sub_length + 1 | |
sub_start_new = inew - sub_length + 1 | |
overlap = _overlap | |
if sub_length == 0: | |
# If no common substring is found, we return an insert and delete... | |
return (old and [('-', len(old))] or []) + (new and [('+', new)] or []) | |
else: | |
# ...otherwise, the common substring is unchanged and we recursively | |
# diff the text before and after that substring | |
return diff(old[ : sub_start_old], new[ : sub_start_new]) + \ | |
[('=', sub_length)] + \ | |
diff(old[sub_start_old + sub_length : ], new[sub_start_new + sub_length : ]) | |
#lines_from = ["Hello", "Bello", "Ize", "Mize"] | |
#lines_to = ["Hello", "Mello", "Ize", "Aze", "Laze", "Mize"] | |
lines_from = list(open("users_archive2.json")) | |
lines_to = list(open("users_archive.json")) | |
import time, json | |
s = time.time() | |
diff_out = diff(lines_from, lines_to) | |
#print diff(list(json_from), list(json_to)) | |
print "Diff", time.time()-s | |
for line in diff_out: | |
print line | |
# Undiff | |
def undiff(lines, diffs): | |
i = 0 | |
for action, param in diffs: | |
if action == "=": # Same | |
i += param | |
elif action == "-": # Delete | |
del(lines[i:i+param]) | |
elif action == "+": | |
for add_line in param: | |
lines.insert(i, add_line) | |
i += 1 | |
return len(lines) == i | |
s = time.time() | |
for i in range(1000): | |
lines_from_copy = lines_from[:] | |
assert lines_from_copy != lines_to | |
assert undiff(lines_from_copy, diff_out) | |
assert lines_from_copy == lines_to | |
print "Undiff x 1000", time.time()-s | |
import difflib | |
def diff(old, new): | |
matcher = difflib.SequenceMatcher(None, old, new) | |
diffs = [] | |
for tag, old_from, old_to, new_from, new_to in matcher.get_opcodes(): | |
if tag == "insert": | |
diffs.append(("+", new[new_from:new_to])) | |
elif tag == "equal": | |
diffs.append(("=", old_to-old_from)) | |
elif tag == "delete": | |
diffs.append(("-", old_to-old_from)) | |
elif tag == "replace": | |
diffs.append(("-", old_to-old_from)) | |
diffs.append(("+", new[new_from:new_to])) | |
return diffs | |
def patch(old, diffs): | |
i = 0 | |
for action, param in diffs: | |
if action == "=": # Same lines | |
i += param | |
elif action == "-": # Delete lines | |
del(old[i:i + param]) | |
elif action == "+": # Add lines | |
for add_line in param: | |
old.insert(i, add_line) | |
i += 1 | |
return len(old) == i |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment