Created
July 13, 2017 19:37
-
-
Save hughdbrown/8b9ca8877de6a214fafd514880cb3779 to your computer and use it in GitHub Desktop.
A Python script that diffs, column by column, the CSV files that share a name in two different directories
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
import os | |
import os.path | |
import sys | |
from csv import DictReader | |
class DictDiffer(object):
    """Compare the keys and values of two dictionaries.

    The comparison is expressed relative to ``past_dict``:

    * ``added()``     -- keys present only in ``current_dict``
    * ``removed()``   -- keys present only in ``past_dict``
    * ``changed()``   -- keys present in both whose values differ
    * ``unchanged()`` -- keys present in both whose values are equal

    Key sets are captured once, at construction time.
    """

    def __init__(self, current_dict, past_dict):
        """Store both dictionaries and precompute their key sets."""
        self.current_dict = current_dict
        self.past_dict = past_dict
        self.current_keys = set(current_dict.keys())
        self.past_keys = set(past_dict.keys())
        # Keys shared by both dictionaries; every query below builds on this.
        self.intersect = self.current_keys & self.past_keys

    def added(self):
        """Return the set of keys found only in the current dictionary."""
        return self.current_keys - self.intersect

    def removed(self):
        """Return the set of keys found only in the past dictionary."""
        return self.past_keys - self.intersect

    def changed(self):
        """Return the set of shared keys whose values differ."""
        return {
            key for key in self.intersect
            if self.past_dict[key] != self.current_dict[key]
        }

    def unchanged(self):
        """Return the set of shared keys whose values compare equal."""
        return {
            key for key in self.intersect
            if self.past_dict[key] == self.current_dict[key]
        }
def rowdiff(row1, row2):
    """Return the set of keys present in both rows whose values differ.

    Keys that exist in only one of the two rows are not reported.
    """
    return DictDiffer(row1, row2).changed()
def csvdiff(filename, dir1, dir2):
    """Print the column-level differences between two same-named CSV files.

    The files ``dir1/filename`` and ``dir2/filename`` are read with
    ``DictReader`` and compared row by row.  A separator line and the file
    name are always printed; for every row pair with differing values,
    ``line N`` is printed followed by each differing column with the value
    from ``dir1`` and then the value from ``dir2``.  ``N`` is the 1-based
    index of the data row (the header row is not counted).

    If one file has more data rows than the other, a final line reports how
    many extra rows the longer file has instead of ignoring them.

    Args:
        filename: Name of the file, relative to both directories.
        dir1: First directory.
        dir2: Second directory.
    """
    filename1 = os.path.join(dir1, filename)
    filename2 = os.path.join(dir2, filename)
    # Context managers close both handles, even if the second open() or the
    # comparison itself raises.
    with open(filename1) as file1, open(filename2) as file2:
        csv1 = DictReader(file1)
        csv2 = DictReader(file2)
        print("-" * 30)
        print(filename)
        # Unique sentinel: lets us tell which reader ran out first, which
        # zip() cannot do without losing a row.
        exhausted = object()
        i = 0
        while True:
            row1 = next(csv1, exhausted)
            row2 = next(csv2, exhausted)
            if row1 is exhausted or row2 is exhausted:
                break
            i += 1
            diffs = rowdiff(row1, row2)
            if diffs:
                print("line {0}".format(i))
                # Sort for reproducible output; key=str tolerates the None
                # key DictReader uses for surplus fields.
                for column in sorted(diffs, key=str):
                    print("{0}:\n\t{1}\n\t{2}".format(
                        column, row1[column], row2[column]))
        # Report rows that have no counterpart in the other file; the row
        # already fetched above counts as the first extra one.
        if row1 is not exhausted:
            extra = 1 + sum(1 for _ in csv1)
            print("{0} has {1} more row(s) than {2}".format(
                filename1, extra, filename2))
        elif row2 is not exhausted:
            extra = 1 + sum(1 for _ in csv2)
            print("{0} has {1} more row(s) than {2}".format(
                filename2, extra, filename1))
def diff(dir1, dir2):
    """Run ``csvdiff`` on every regular file whose name is in both directories.

    Names are visited in sorted order so the output is reproducible.  Names
    that are not regular files in both directories (for example, a
    subdirectory present in both) are skipped, because they cannot be
    opened as CSV files.

    Args:
        dir1: First directory to compare.
        dir2: Second directory to compare.
    """
    common = set(os.listdir(dir1)).intersection(os.listdir(dir2))
    for filename in sorted(common):
        path1 = os.path.join(dir1, filename)
        path2 = os.path.join(dir2, filename)
        if not (os.path.isfile(path1) and os.path.isfile(path2)):
            continue
        csvdiff(filename, dir1, dir2)
if __name__ == '__main__':
    # Script entry point: compare the two directories named on the command
    # line.  Fail with a usage message instead of an IndexError when fewer
    # than two directories are given; extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit("usage: {0} DIR1 DIR2".format(sys.argv[0]))
    dir1 = sys.argv[1]
    dir2 = sys.argv[2]
    diff(dir1, dir2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment