davidwtbuxton · March 6, 2012 23:10
diff --git a/gistfile1.py b/gistfile1.py
 # http://www.reddit.com/r/learnpython/comments/qkh43/new_to_python_searching_csv_files/
 # http://stackoverflow.com/questions/9564322/loop-through-rows-of-one-csv-file-to-find-corresponding-data-in-another
 # http://stackoverflow.com/questions/9577997/search-through-csv-from-specific-row-down

 import csv


 # Difference constants: main() adds DELTA_HI to a bid to get the upper bound
 # and subtracts DELTA_LO from it to get the lower bound. Note these are
 # floats, so don't expect perfect decimal mathematics.
 DELTA_HI = 0.001
 DELTA_LO = 0.0015


 def _open_csv(path):
    """Open *path* as a text file suitable for csv.reader on Python 2 and 3."""
    try:
        # Python 3: csv wants newline='' so it can deal with line endings
        # itself; the old 'U' mode flag was removed in Python 3.11.
        return open(path, 'r', newline='')
    except TypeError:
        # Python 2: open() takes no newline argument; 'rU' enables universal
        # newlines there.
        return open(path, 'rU')


 def main(filename1, filename2, delta_hi=None, delta_lo=None):
    """Pair each date from file 1 with the first later bid that leaves its band.

    filename1 -- CSV file of two-column rows (ID, date string). The date
        string is compared with the first 16 characters of the datetime
        column of file 2.
    filename2 -- CSV file of six-column rows; the last three columns are
        datetime, bid and ask.
    delta_hi -- distance above the matched bid that counts as exceeded.
        Defaults to the module constant DELTA_HI.
    delta_lo -- distance below the matched bid that counts as exceeded.
        Defaults to the module constant DELTA_LO.

    Returns a list of 2-tuples (first, later). ``first`` is
    ``[datetime, ID] + row`` for the earliest file 2 row matching a date
    from file 1; ``later`` is the earliest subsequent row whose bid is above
    ``matched bid + delta_hi`` or below ``matched bid - delta_lo``. Matches
    that never leave their band are not reported.

    Raises ValueError when a non-blank row has an unexpected number of
    columns or the bid is not a number. Blank lines are ignored.
    """
    if delta_hi is None:
        delta_hi = DELTA_HI
    if delta_lo is None:
        delta_lo = DELTA_LO

    # Mapping of ID to date string (csv.reader yields [] for blank lines).
    with _open_csv(filename1) as source:
        id_to_date = dict(r for r in csv.reader(source, delimiter=',') if r)

    # Invert the mapping, date string to ID. Having date as key makes
    # searching easy.
    source_dts = dict((date, ident) for ident, date in id_to_date.items())

    # Pending first rows, stored as (high, low, rowdata), still waiting for
    # a bid outside their band.
    matches = []

    # Final results: (first row data, row that exceeded the delta).
    deltas = []

    with _open_csv(filename2) as quotes:
        for row in csv.reader(quotes, delimiter=','):
            if not row:
                continue

            # Unpacking all three columns keeps the six-column check.
            dt, bid, _ask = row[3:]
            bid = float(bid)

            # A row matches file 1 when the first 16 characters of its
            # datetime equal a date string that has not been matched yet.
            key = dt[:16]
            if key in source_dts:
                # pop() so the same source entry is never matched again.
                first = [dt, source_dts.pop(key)] + row
                matches.append((bid + delta_hi, bid - delta_lo, first))

            # Compare this row's bid itself with every pending band, and
            # close every band it leaves (not just the first one), so each
            # match is paired with the earliest row that exceeds it.
            still_open = []
            for p_hi, p_lo, p_row in matches:
                if bid > p_hi or bid < p_lo:
                    deltas.append((p_row, row))
                else:
                    still_open.append((p_hi, p_lo, p_row))
            matches = still_open

    return deltas


 if __name__ == "__main__":
    # Command-line entry point: run main() on the two file names given as
    # arguments and print one result pair per line.
    import sys

    if len(sys.argv) < 3:
        sys.exit('usage: %s FILE1 FILE2' % sys.argv[0])
    f1, f2 = sys.argv[1:3]
    for a, b in main(f1, f2):
        # Formatting into one string gives the same output as the Python 2
        # statement `print a, b` while also being valid Python 3.
        print('%s %s' % (a, b))
	# http://www.reddit.com/r/learnpython/comments/qkh43/new_to_python_searching_csv_files/
	# http://stackoverflow.com/questions/9564322/loop-through-rows-of-one-csv-file-to-find-corresponding-data-in-another
	# http://stackoverflow.com/questions/9577997/search-through-csv-from-specific-row-down

	import csv


	# Difference constants: main() adds DELTA_HI to a bid to get the upper bound
	# and subtracts DELTA_LO from it to get the lower bound. Note these are
	# floats, so don't expect perfect decimal mathematics.
	DELTA_HI = 0.001
	DELTA_LO = 0.0015


	def _open_csv(path):
	    """Open *path* as a text file suitable for csv.reader on Python 2 and 3."""
	    try:
	        # Python 3: csv wants newline='' so it can deal with line endings
	        # itself; the old 'U' mode flag was removed in Python 3.11.
	        return open(path, 'r', newline='')
	    except TypeError:
	        # Python 2: open() takes no newline argument; 'rU' enables
	        # universal newlines there.
	        return open(path, 'rU')


	def main(filename1, filename2, delta_hi=None, delta_lo=None):
	    """Pair each date from file 1 with the first later bid that leaves its band.

	    filename1 -- CSV file of two-column rows (ID, date string). The date
	        string is compared with the first 16 characters of the datetime
	        column of file 2.
	    filename2 -- CSV file of six-column rows; the last three columns are
	        datetime, bid and ask.
	    delta_hi -- distance above the matched bid that counts as exceeded.
	        Defaults to the module constant DELTA_HI.
	    delta_lo -- distance below the matched bid that counts as exceeded.
	        Defaults to the module constant DELTA_LO.

	    Returns a list of 2-tuples (first, later). ``first`` is
	    ``[datetime, ID] + row`` for the earliest file 2 row matching a date
	    from file 1; ``later`` is the earliest subsequent row whose bid is
	    above ``matched bid + delta_hi`` or below ``matched bid - delta_lo``.
	    Matches that never leave their band are not reported.

	    Raises ValueError when a non-blank row has an unexpected number of
	    columns or the bid is not a number. Blank lines are ignored.
	    """
	    if delta_hi is None:
	        delta_hi = DELTA_HI
	    if delta_lo is None:
	        delta_lo = DELTA_LO

	    # Mapping of ID to date string (csv.reader yields [] for blank lines).
	    with _open_csv(filename1) as source:
	        id_to_date = dict(r for r in csv.reader(source, delimiter=',') if r)

	    # Invert the mapping, date string to ID. Having date as key makes
	    # searching easy.
	    source_dts = dict((date, ident) for ident, date in id_to_date.items())

	    # Pending first rows, stored as (high, low, rowdata), still waiting
	    # for a bid outside their band.
	    matches = []

	    # Final results: (first row data, row that exceeded the delta).
	    deltas = []

	    with _open_csv(filename2) as quotes:
	        for row in csv.reader(quotes, delimiter=','):
	            if not row:
	                continue

	            # Unpacking all three columns keeps the six-column check.
	            dt, bid, _ask = row[3:]
	            bid = float(bid)

	            # A row matches file 1 when the first 16 characters of its
	            # datetime equal a date string that has not been matched yet.
	            key = dt[:16]
	            if key in source_dts:
	                # pop() so the same source entry is never matched again.
	                first = [dt, source_dts.pop(key)] + row
	                matches.append((bid + delta_hi, bid - delta_lo, first))

	            # Compare this row's bid itself with every pending band, and
	            # close every band it leaves (not just the first one), so
	            # each match is paired with the earliest row that exceeds it.
	            still_open = []
	            for p_hi, p_lo, p_row in matches:
	                if bid > p_hi or bid < p_lo:
	                    deltas.append((p_row, row))
	                else:
	                    still_open.append((p_hi, p_lo, p_row))
	            matches = still_open

	    return deltas


	if __name__ == "__main__":
	    # Command-line entry point: run main() on the two file names given as
	    # arguments and print one result pair per line.
	    import sys

	    if len(sys.argv) < 3:
	        sys.exit('usage: %s FILE1 FILE2' % sys.argv[0])
	    f1, f2 = sys.argv[1:3]
	    for a, b in main(f1, f2):
	        # Formatting into one string gives the same output as the Python 2
	        # statement `print a, b` while also being valid Python 3.
	        print('%s %s' % (a, b))