Created
May 5, 2014 14:16
-
-
Save csmoore/6da35a72f4fae54490c5 to your computer and use it in GitHub Desktop.
Brute force set comparison of 2 .csv's
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#------------------------------------------------------------------------------- | |
# Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 | |
# See the license for details but you know the general guidance: this is distributed on an | |
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
#------------------------------------------------------------------------------- | |
# Name: CsvSetComparer.py | |
# Usage: python.exe CsvSetComparer.py [Input File:"Truth"(.csv)] [Input File:"Test"(.csv)] | |
# {Output Diff File (.csv)="diffs.csv"} {ColumnsToCompare=cols} | |
# Description: *Brute force* set comparison of 2 .csv's for entries in a "Test" csv's columns that | |
# do not exist in a "Truth" csv's column. Outputs the diffs from "Test" into | |
# an output "diff.csv" file. Use [Number of] ColumnsToCompare to compare only the 1st few columns | |
# Important: this script assumes the csv's have a 1st/header row (that will get ignored) | |
#------------------------------------------------------------------------------- | |
import csv | |
import os | |
import sys | |
### Params: | |
### 1 - "Truth Data" input file (.csv) | |
### 2 - "Compare/Test Data" input file (.csv) | |
### 3 - Output Diff File (.csv) | |
### 4 - Columns to compare=number of left columns to check,default=all (use if you don't need to check all cols) | |
useDefaults = False | |
if len(sys.argv) < 3 : | |
print 'Usage: [Input File:"Truth"(.csv)] [Input File:"Test"(.csv)] ' + \ | |
'{Output Diff File (.csv)="diffs.csv"} {ColumnsToCompare=cols}' | |
# useDefaults = True # For IDE testing | |
sys.exit(0) | |
if useDefaults : | |
currentPath = os.path.dirname(__file__) | |
dataPath = os.path.normpath(os.path.join(currentPath, r"Data/")) | |
outputDiffs = os.path.normpath(os.path.join(dataPath, r"Diffs.txt")) | |
truthCsv = os.path.normpath(os.path.join(dataPath, r"True.csv")) | |
testCsv = os.path.normpath(os.path.join(dataPath, r"Test.csv")) | |
leftColumnsToCompare = 5 | |
else : | |
truthCsv = sys.argv[1] | |
testCsv = sys.argv[2] | |
if not (os.path.exists(truthCsv) and os.path.exists(testCsv)) : | |
print 'Input files do not exist: ' | |
print ' [Input File:"Truth"(.csv)] : ' + truthCsv | |
print ' [Input File:"Test"(.csv)] : ' + testCsv | |
sys.exit(0) | |
if len(sys.argv) > 3 : | |
outputDiffs = sys.argv[3] | |
else : | |
if not useDefaults : | |
outputDiffs = "Diffs.csv" | |
diffs = open(outputDiffs, 'w') | |
f1 = open(truthCsv, "r") | |
headingRow1 = f1.readline().strip(); | |
maxNumberOfColumns = len(headingRow1.split(",")) | |
# set number of columns to compare | |
if useDefaults : | |
pass | |
else : | |
if (len(sys.argv) > 4) : | |
leftColumnsToCompare = int(sys.argv[4]) # get from command line | |
else : | |
leftColumnsToCompare = maxNumberOfColumns # use count from header row | |
if (leftColumnsToCompare < 1) or (leftColumnsToCompare > maxNumberOfColumns) : | |
print 'ColumnsToCompare exceeds Max/Min: ' + str(leftColumnsToCompare) + \ | |
', Max=' + str(maxNumberOfColumns) + ', Min=1' | |
sys.exit(0) | |
diffs.write(headingRow1+",Column that Differs from Truth\n") # add a col to output for which col differs | |
f1.close() | |
diffs.close() | |
for index in range(0, leftColumnsToCompare) : | |
f1 = open(truthCsv, "rb") | |
f2 = open(testCsv, "rb") | |
headingRow1 = f1.readline(); | |
headingRow2 = f2.readline(); | |
# Set solution borrowed from here: http://stackoverflow.com/questions/21718235/compare-two-csv-file-python | |
reader1, reader2 = [csv.reader(f, delimiter=',') for f in [f1,f2]] | |
print('Checking Column: ' + str(index)) | |
m1 = {row[index]:row[0:leftColumnsToCompare] for row in reader1} | |
m2 = {row[index]:row[0:leftColumnsToCompare] for row in reader2} | |
# TODO: decide if set intersect or set difference (or other) best suits your individual needs | |
# intersect = set(m1.keys()) & set(m2.keys()) # = new set with elements common to m1 and m2 | |
difference = set(m2.keys()) - set(m1.keys()) # = new set with elements in m2 but not in m1 | |
with open(outputDiffs, 'ab') as f_out: | |
writer = csv.writer(f_out, delimiter=',', dialect='excel' ) | |
for k in difference: | |
print('Found Diff in Column: ' + str(index)) | |
# keep in case we do intersect (and need to tell between/print True & Test) | |
# m1[k].append("TRUE") | |
# m1[k].append(index) | |
# writer.writerow(m1[k]) | |
# m2[k].append("TEST") | |
m2[k].append(index) | |
writer.writerow(m2[k]) | |
f1.close | |
f2.close | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment