csmoore · May 5, 2014 14:16
diff --git a/CsvSetComparer.py b/CsvSetComparer.py
 #-------------------------------------------------------------------------------
 # Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
 # See the license for details but you know the general guidance: this is distributed on an  
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 #-------------------------------------------------------------------------------
 # Name: CsvSetComparer.py
 # Usage: python.exe CsvSetComparer.py [Input File:"Truth"(.csv)] [Input File:"Test"(.csv)]
 #                    {Output Diff File (.csv)="diffs.csv"} {ColumnsToCompare=cols}
 # Description: *Brute force* set comparison of 2 .csv's for entries in a "Test" csv's columns that 
 #              do not exist in a "Truth" csv's column. Outputs the diffs from "Test" into
 #              an output "diff.csv" file. Use [Number of] ColumnsToCompare to compare only the 1st few columns
 # Important: this script assumes the csv's have a 1st/header row (that will get ignored)
 #-------------------------------------------------------------------------------

 import csv
 import os
 import sys

 ### Params:
 ### 1 - "Truth Data" input file (.csv)
 ### 2 - "Compare/Test Data" input file (.csv)
 ### 3 - Output Diff File (.csv)
 ### 4 - Columns to compare=number of left columns to check,default=all (use if you don't need to check all cols)

 useDefaults = False
 
 if len(sys.argv) < 3 :
    print 'Usage: [Input File:"Truth"(.csv)] [Input File:"Test"(.csv)] ' + \
        '{Output Diff File (.csv)="diffs.csv"} {ColumnsToCompare=cols}'
    # useDefaults = True  # For IDE testing
    sys.exit(0)
 
 if useDefaults : 
    currentPath = os.path.dirname(__file__)
    dataPath = os.path.normpath(os.path.join(currentPath, r"Data/"))
    outputDiffs = os.path.normpath(os.path.join(dataPath, r"Diffs.txt"))
    truthCsv = os.path.normpath(os.path.join(dataPath, r"True.csv"))
    testCsv  = os.path.normpath(os.path.join(dataPath, r"Test.csv"))
    leftColumnsToCompare = 5
 else :
    truthCsv = sys.argv[1]
    testCsv  = sys.argv[2]

 if not (os.path.exists(truthCsv) and os.path.exists(testCsv)) :
    print 'Input files do not exist: '
    print '   [Input File:"Truth"(.csv)] : ' + truthCsv
    print '   [Input File:"Test"(.csv)]  : ' + testCsv
    sys.exit(0)
    
 if len(sys.argv) > 3 :
    outputDiffs = sys.argv[3]
 else :
    if not useDefaults : 
        outputDiffs = "Diffs.csv"

 diffs = open(outputDiffs, 'w')

 f1 = open(truthCsv, "r")
 headingRow1 = f1.readline().strip();

 maxNumberOfColumns = len(headingRow1.split(","))

 # set number of columns to compare
 if useDefaults :
    pass
 else : 
    if (len(sys.argv) > 4) :
        leftColumnsToCompare = int(sys.argv[4]) # get from command line
    else : 
        leftColumnsToCompare = maxNumberOfColumns # use count from header row 

 if (leftColumnsToCompare < 1) or (leftColumnsToCompare > maxNumberOfColumns) :
    print 'ColumnsToCompare exceeds Max/Min: ' + str(leftColumnsToCompare) + \
        ', Max=' + str(maxNumberOfColumns) + ', Min=1'
    sys.exit(0)    

 diffs.write(headingRow1+",Column that Differs from Truth\n") # add a col to output for which col differs
 f1.close()

 diffs.close()

 for index in range(0, leftColumnsToCompare) : 

    f1 = open(truthCsv, "rb")
    f2 = open(testCsv, "rb")

    headingRow1 = f1.readline();
    headingRow2 = f2.readline();

    # Set solution borrowed from here: http://stackoverflow.com/questions/21718235/compare-two-csv-file-python
    reader1, reader2 = [csv.reader(f, delimiter=',') for f in [f1,f2]]
        
    print('Checking Column: ' + str(index))
    m1 = {row[index]:row[0:leftColumnsToCompare] for row in reader1}
    m2 = {row[index]:row[0:leftColumnsToCompare] for row in reader2}

    # TODO: decide if set intersect or set difference (or other) best suits your individual needs
    # intersect = set(m1.keys()) & set(m2.keys()) # = new set with elements common to m1 and m2
    difference = set(m2.keys()) - set(m1.keys()) # = new set with elements in m2 but not in m1

    with open(outputDiffs, 'ab') as f_out:
        writer = csv.writer(f_out, delimiter=',', dialect='excel' )
        for k in difference:
            print('Found Diff in Column: ' + str(index))
            # keep in case we do intersect (and need to tell between/print True & Test)
            # m1[k].append("TRUE")
            # m1[k].append(index)
            # writer.writerow(m1[k])

            # m2[k].append("TEST")
            m2[k].append(index)
            writer.writerow(m2[k])

    f1.close
    f2.close
	#-------------------------------------------------------------------------------
	# Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0
	# See the license for details but you know the general guidance: this is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	#-------------------------------------------------------------------------------
	# Name: CsvSetComparer.py
	# Usage: python.exe CsvSetComparer.py [Input File:"Truth"(.csv)] [Input File:"Test"(.csv)]
	# {Output Diff File (.csv)="diffs.csv"} {ColumnsToCompare=cols}
	# Description: Brute force set comparison of 2 .csv's for entries in a "Test" csv's columns that
	# do not exist in a "Truth" csv's column. Outputs the diffs from "Test" into
	# an output "diff.csv" file. Use [Number of] ColumnsToCompare to compare only the 1st few columns
	# Important: this script assumes the csv's have a 1st/header row (that will get ignored)
	#-------------------------------------------------------------------------------

	import csv
	import os
	import sys

	### Params:
	### 1 - "Truth Data" input file (.csv)
	### 2 - "Compare/Test Data" input file (.csv)
	### 3 - Output Diff File (.csv)
	### 4 - Columns to compare=number of left columns to check,default=all (use if you don't need to check all cols)

	useDefaults = False

	if len(sys.argv) < 3 :
	print 'Usage: [Input File:"Truth"(.csv)] [Input File:"Test"(.csv)] ' + \
	'{Output Diff File (.csv)="diffs.csv"} {ColumnsToCompare=cols}'
	# useDefaults = True # For IDE testing
	sys.exit(0)

	if useDefaults :
	currentPath = os.path.dirname(__file__)
	dataPath = os.path.normpath(os.path.join(currentPath, r"Data/"))
	outputDiffs = os.path.normpath(os.path.join(dataPath, r"Diffs.txt"))
	truthCsv = os.path.normpath(os.path.join(dataPath, r"True.csv"))
	testCsv = os.path.normpath(os.path.join(dataPath, r"Test.csv"))
	leftColumnsToCompare = 5
	else :
	truthCsv = sys.argv[1]
	testCsv = sys.argv[2]

	if not (os.path.exists(truthCsv) and os.path.exists(testCsv)) :
	print 'Input files do not exist: '
	print ' [Input File:"Truth"(.csv)] : ' + truthCsv
	print ' [Input File:"Test"(.csv)] : ' + testCsv
	sys.exit(0)

	if len(sys.argv) > 3 :
	outputDiffs = sys.argv[3]
	else :
	if not useDefaults :
	outputDiffs = "Diffs.csv"

	diffs = open(outputDiffs, 'w')

	f1 = open(truthCsv, "r")
	headingRow1 = f1.readline().strip();

	maxNumberOfColumns = len(headingRow1.split(","))

	# set number of columns to compare
	if useDefaults :
	pass
	else :
	if (len(sys.argv) > 4) :
	leftColumnsToCompare = int(sys.argv[4]) # get from command line
	else :
	leftColumnsToCompare = maxNumberOfColumns # use count from header row

	if (leftColumnsToCompare < 1) or (leftColumnsToCompare > maxNumberOfColumns) :
	print 'ColumnsToCompare exceeds Max/Min: ' + str(leftColumnsToCompare) + \
	', Max=' + str(maxNumberOfColumns) + ', Min=1'
	sys.exit(0)

	diffs.write(headingRow1+",Column that Differs from Truth\n") # add a col to output for which col differs
	f1.close()

	diffs.close()

	for index in range(0, leftColumnsToCompare) :

	f1 = open(truthCsv, "rb")
	f2 = open(testCsv, "rb")

	headingRow1 = f1.readline();
	headingRow2 = f2.readline();

	# Set solution borrowed from here: http://stackoverflow.com/questions/21718235/compare-two-csv-file-python
	reader1, reader2 = [csv.reader(f, delimiter=',') for f in [f1,f2]]

	print('Checking Column: ' + str(index))
	m1 = {row[index]:row[0:leftColumnsToCompare] for row in reader1}
	m2 = {row[index]:row[0:leftColumnsToCompare] for row in reader2}

	# TODO: decide if set intersect or set difference (or other) best suits your individual needs
	# intersect = set(m1.keys()) & set(m2.keys()) # = new set with elements common to m1 and m2
	difference = set(m2.keys()) - set(m1.keys()) # = new set with elements in m2 but not in m1

	with open(outputDiffs, 'ab') as f_out:
	writer = csv.writer(f_out, delimiter=',', dialect='excel' )
	for k in difference:
	print('Found Diff in Column: ' + str(index))
	# keep in case we do intersect (and need to tell between/print True & Test)
	# m1[k].append("TRUE")
	# m1[k].append(index)
	# writer.writerow(m1[k])

	# m2[k].append("TEST")
	m2[k].append(index)
	writer.writerow(m2[k])

	f1.close
	f2.close