Skip to content

Instantly share code, notes, and snippets.

@csmoore
Created May 5, 2014 14:16
Show Gist options
  • Save csmoore/6da35a72f4fae54490c5 to your computer and use it in GitHub Desktop.
Save csmoore/6da35a72f4fae54490c5 to your computer and use it in GitHub Desktop.
Brute force set comparison of 2 .csv's
#-------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
# See the license for details but you know the general guidance: this is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#-------------------------------------------------------------------------------
# Name: CsvSetComparer.py
# Usage: python.exe CsvSetComparer.py [Input File:"Truth"(.csv)] [Input File:"Test"(.csv)]
# {Output Diff File (.csv)="diffs.csv"} {ColumnsToCompare=cols}
# Description: *Brute force* set comparison of 2 .csv's for entries in a "Test" csv's columns that
# do not exist in a "Truth" csv's column. Outputs the diffs from "Test" into
# an output "diff.csv" file. Use [Number of] ColumnsToCompare to compare only the 1st few columns
# Important: this script assumes the csv's have a 1st/header row (that will get ignored)
#-------------------------------------------------------------------------------
import csv
import os
import sys
### Params:
### 1 - "Truth Data" input file (.csv)
### 2 - "Compare/Test Data" input file (.csv)
### 3 - Output Diff File (.csv)
### 4 - Columns to compare=number of left columns to check,default=all (use if you don't need to check all cols)
useDefaults = False
if len(sys.argv) < 3 :
print 'Usage: [Input File:"Truth"(.csv)] [Input File:"Test"(.csv)] ' + \
'{Output Diff File (.csv)="diffs.csv"} {ColumnsToCompare=cols}'
# useDefaults = True # For IDE testing
sys.exit(0)
if useDefaults :
currentPath = os.path.dirname(__file__)
dataPath = os.path.normpath(os.path.join(currentPath, r"Data/"))
outputDiffs = os.path.normpath(os.path.join(dataPath, r"Diffs.txt"))
truthCsv = os.path.normpath(os.path.join(dataPath, r"True.csv"))
testCsv = os.path.normpath(os.path.join(dataPath, r"Test.csv"))
leftColumnsToCompare = 5
else :
truthCsv = sys.argv[1]
testCsv = sys.argv[2]
if not (os.path.exists(truthCsv) and os.path.exists(testCsv)) :
print 'Input files do not exist: '
print ' [Input File:"Truth"(.csv)] : ' + truthCsv
print ' [Input File:"Test"(.csv)] : ' + testCsv
sys.exit(0)
if len(sys.argv) > 3 :
outputDiffs = sys.argv[3]
else :
if not useDefaults :
outputDiffs = "Diffs.csv"
diffs = open(outputDiffs, 'w')
f1 = open(truthCsv, "r")
headingRow1 = f1.readline().strip();
maxNumberOfColumns = len(headingRow1.split(","))
# set number of columns to compare
if useDefaults :
pass
else :
if (len(sys.argv) > 4) :
leftColumnsToCompare = int(sys.argv[4]) # get from command line
else :
leftColumnsToCompare = maxNumberOfColumns # use count from header row
if (leftColumnsToCompare < 1) or (leftColumnsToCompare > maxNumberOfColumns) :
print 'ColumnsToCompare exceeds Max/Min: ' + str(leftColumnsToCompare) + \
', Max=' + str(maxNumberOfColumns) + ', Min=1'
sys.exit(0)
diffs.write(headingRow1+",Column that Differs from Truth\n") # add a col to output for which col differs
f1.close()
diffs.close()
for index in range(0, leftColumnsToCompare) :
f1 = open(truthCsv, "rb")
f2 = open(testCsv, "rb")
headingRow1 = f1.readline();
headingRow2 = f2.readline();
# Set solution borrowed from here: http://stackoverflow.com/questions/21718235/compare-two-csv-file-python
reader1, reader2 = [csv.reader(f, delimiter=',') for f in [f1,f2]]
print('Checking Column: ' + str(index))
m1 = {row[index]:row[0:leftColumnsToCompare] for row in reader1}
m2 = {row[index]:row[0:leftColumnsToCompare] for row in reader2}
# TODO: decide if set intersect or set difference (or other) best suits your individual needs
# intersect = set(m1.keys()) & set(m2.keys()) # = new set with elements common to m1 and m2
difference = set(m2.keys()) - set(m1.keys()) # = new set with elements in m2 but not in m1
with open(outputDiffs, 'ab') as f_out:
writer = csv.writer(f_out, delimiter=',', dialect='excel' )
for k in difference:
print('Found Diff in Column: ' + str(index))
# keep in case we do intersect (and need to tell between/print True & Test)
# m1[k].append("TRUE")
# m1[k].append(index)
# writer.writerow(m1[k])
# m2[k].append("TEST")
m2[k].append(index)
writer.writerow(m2[k])
f1.close
f2.close
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment