omnisis · December 12, 2015 08:39
diff --git a/find_leftovers.py b/find_leftovers.py
 #!/usr/bin/python

 #############################################################################
 # This script finds the "leftover records" between files containing arbitrary
 # linesplits of a very large sorted CSV file.  These types of files would exist
 # if for example, one ran the UNIX 'split' program (breaking on line boundaries)
 # on a large sorted CSV file to break it into smaller sorted CSV files but did
 # not bother to ensure that no logical records (assumming records can span text lines) 
 # were split across files.  Generally, it's impossible for a generic utility 
 # like 'split' to make such application-specific decisions at the time the
 # splitting happens.
 #
 # The basic algorithm for finding leftovers is as follows:
 #
 #  - Initialize a string 'answer' to the empty string   
 #  - Iterate over all sorted files in the proper order 
 #      - For each pair (prev,curr) of files in the list of sorted files:
 #          - If the last line of prev and first line of curr are the same record:
 #               - Go backwards some amount in prev to catch all such logical records
 #                 appending this string to some number of similar logical records found
 #                 at the beginning of curr and concatenating it to 'answer'
 #  - return the accumulated string 'answer'
 #
 #  NOTE: This script makes use of the UNIX 'head' and 'tail' programs
 #        so is *NOT* cross-platform!
 #############################################################################

 from subprocess import check_output
 import argparse
 import glob
 import string


 # dictionary of global configuration options that control how this script operates
 config = {'delim': ",", "numfields": 1, "lines_context": 50}

 """
    Returns a list of files (in lexiographically sorted order) matching 
        'prefix*'
 """
 def filesMatchingPrefix(prefix):
    return glob.glob(prefix + '*') # glob returns in sorted order by default

 """
    Extracts 'numfields' records from a CSV line 'line' with delimiter 'delim'
 """
 def extractFieldMatchStr(line, delim=",", numfields=1):
    return delim.join(string.split(line,delim)[0:numfields]).strip()

 """
    Wrapper for $> tail -n [N] [file]
 """
 def tailN(fname, N):
    return check_output(["tail", "-n", str(N), fname])
 """
    Wrapper for $> head -n [N] [file]
 """
 def headN(fname, N):
    return check_output(["head", "-n", str(N), fname])

 """
    Examines 'ctxlines' from the tail of prevFile and head of currFile.  If any of these lines match the 
    last line of the first file, then it is assummed to be a 'match' and 
    accumulated in the returned result.
 """
 def findLeftovers(prevFile, currFile, ctxlines=50):
    # if one of either files is None then there can be no leftovers
    if prevFile is None or currFile is None:
        return []

    # files have content, exp leftover is the match on last line of prev
    prevlineLast=tailN(prevFile,1)
    currlineFirst=headN(currFile,1)
    expMatch=extractFieldMatchStr(prevlineLast, config["delim"], config["numfields"])

    # if the last line of prev and first line of curr don't match up there is no 'overlap'
    if expMatch != extractFieldMatchStr(currlineFirst, config["delim"], config["numfields"])
        return []

    # create a buffer from both files that we will soon filter
    buf = tailN(prevFile,50) + headN(currFile, 50)

    # filter out all lines that aren't a leftover match
    return filter(lambda x:  expMatch == extractFieldMatchStr(x, config["delim"], config["numfields"]), 
        buf.splitlines())

 """
    Given a list of files, iterates thru the list calling findLeftovers() on each file in turn
    passing in the name of the previous file and the current file and appending the results
    to a list that is ultimately returned.
 """
 def allLeftovers(files):
    prevFile=None
    results=[]
    for file in files:
        results.append(findLeftovers(prevFile, file, config["lines_context"]))
        prevFile=file
    # apparently this is a python idiom for flattenning a list of lists 
    return [item for sublist in results for item in sublist]

 """
    Parses cmdline options for this tool.
 """
 def parseCmdLine():
    parser = argparse.ArgumentParser(description='Takes a series of sorted CSV files and finds the "leftovers" between them.',
        usage='%(prog)s [options] PREFIX')
    parser.add_argument('-d', '--delim', required=False, help='delimiter character to use ("," by default)',
                        default=',')
    parser.add_argument('-n', '--numfields', required=False, type=int,
                        help='A string indicating how many fields must be equal in order to have a ' +
                        ' "record match" in consecutive lines of the CSV files (default is just the first field).',
                        default=1)
    parser.add_argument('-l', '--lines-context', required=False, default=50, type=int,
                        help='The number of lines of context required to guarantee a leftover is found.  This is used '+
                        'when looking at the tail/head of files to decide how big the lookback buffer should be. '+
                        'The default is 50 lines')
    parser.add_argument('PREFIX', 
                        help='The prefix pattern to look for in locating sorted CSV files.  It is assummed that all CSV files ' +
                        'will start with this prefix and will sort lexiographically. (e.g. the files [file.part1, file.part2, file.part3] '+
                        'would have a fileprefix of "file.part".')

    args = parser.parse_args()
    return vars(args)


 """
    Main function body
 """
 if __name__ == '__main__':
    # initialize global config from the parsed cmdline options
    config = dict(parseCmdLine())
    print "Using configuration: ", config

    # find all files matching the PREFIX
    splitfiles=filesMatchingPrefix(config['PREFIX'])
    #print "Operating over filelist: ", splitfiles

    allLeftoversList = allLeftovers(splitfiles)
    #print "All leftovers list: ", allLeftoversList

    print '\r\n'.join(allLeftoversList)
diff --git a/testfilea b/testfilea
 a,john,age,32
 a,john,tel,123-123-1234
 a,john,group,7
 a,john,assoc,98
 a,john,weight,250lbs
 a,john,martial_status,single
 a,john,occupation,construction worker
 a,john,favorite color,red
 a,john,last_name,smith
 a,john,car_make,toyota
 a,john,car_model,corolla
 b,mary,age,45
 b,mary,tel,465-232-5566
 b,mary,assoc,87
 b,mary,maritial_status,married
 b,mary,num_children,3
 b,mary,favorite color,blue
 b,mary,weight,135
diff --git a/testfileb b/testfileb
 b,mary,role,supervisor
 b,mary,car_make,ford
 b,mary,car_model,taurus
 b,mary,taxcode,34343
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 c,george,dontcare,xxx
 d,billy,age,34
 d,billy,last_name,johnson
diff --git a/testfilec b/testfilec
 d,billy,favorite band,green day
 d,billy,favorite color,green
 d,billy,favorite ice cream flavor, pistachio
 d,billy,num likes on facebook,2
 d,billy,marital status, divorced
 d,billy,num children,17
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 e,austin,dontcare,dontcare
 f,craig,occupation,coder
diff --git a/testfiled b/testfiled
 f,craig,age,27
 f,craig,favorite band,the ramones
 f,craig,favorite color,orange
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
 g,cindy,dontcare,xxx
diff --git a/testit.sh b/testit.sh
 #!/bin/sh

 ./find_leftovers.py -n2 'testfile*'
	#!/usr/bin/python

	#############################################################################
	# This script finds the "leftover records" between files containing arbitrary
	# linesplits of a very large sorted CSV file. These types of files would exist
	# if for example, one ran the UNIX 'split' program (breaking on line boundaries)
	# on a large sorted CSV file to break it into smaller sorted CSV files but did
	# not bother to ensure that no logical records (assumming records can span text lines)
	# were split across files. Generally, it's impossible for a generic utility
	# like 'split' to make such application-specific decisions at the time the
	# splitting happens.
	#
	# The basic algorithm for finding leftovers is as follows:
	#
	# - Initialize a string 'answer' to the empty string
	# - Iterate over all sorted files in the proper order
	# - For each pair (prev,curr) of files in the list of sorted files:
	# - If the last line of prev and first line of curr are the same record:
	# - Go backwards some amount in prev to catch all such logical records
	# appending this string to some number of similar logical records found
	# at the beginning of curr and concatenating it to 'answer'
	# - return the accumulated string 'answer'
	#
	# NOTE: This script makes use of the UNIX 'head' and 'tail' programs
	# so is NOT cross-platform!
	#############################################################################

	from subprocess import check_output
	import argparse
	import glob
	import string


	# dictionary of global configuration options that control how this script operates
	config = {'delim': ",", "numfields": 1, "lines_context": 50}

	"""
	Returns a list of files (in lexiographically sorted order) matching
	'prefix*'
	"""
	def filesMatchingPrefix(prefix):
	return glob.glob(prefix + '*') # glob returns in sorted order by default

	"""
	Extracts 'numfields' records from a CSV line 'line' with delimiter 'delim'
	"""
	def extractFieldMatchStr(line, delim=",", numfields=1):
	return delim.join(string.split(line,delim)[0:numfields]).strip()

	"""
	Wrapper for $> tail -n [N] [file]
	"""
	def tailN(fname, N):
	return check_output(["tail", "-n", str(N), fname])
	"""
	Wrapper for $> head -n [N] [file]
	"""
	def headN(fname, N):
	return check_output(["head", "-n", str(N), fname])

	"""
	Examines 'ctxlines' from the tail of prevFile and head of currFile. If any of these lines match the
	last line of the first file, then it is assummed to be a 'match' and
	accumulated in the returned result.
	"""
	def findLeftovers(prevFile, currFile, ctxlines=50):
	# if one of either files is None then there can be no leftovers
	if prevFile is None or currFile is None:
	return []

	# files have content, exp leftover is the match on last line of prev
	prevlineLast=tailN(prevFile,1)
	currlineFirst=headN(currFile,1)
	expMatch=extractFieldMatchStr(prevlineLast, config["delim"], config["numfields"])

	# if the last line of prev and first line of curr don't match up there is no 'overlap'
	if expMatch != extractFieldMatchStr(currlineFirst, config["delim"], config["numfields"])
	return []

	# create a buffer from both files that we will soon filter
	buf = tailN(prevFile,50) + headN(currFile, 50)

	# filter out all lines that aren't a leftover match
	return filter(lambda x: expMatch == extractFieldMatchStr(x, config["delim"], config["numfields"]),
	buf.splitlines())

	"""
	Given a list of files, iterates thru the list calling findLeftovers() on each file in turn
	passing in the name of the previous file and the current file and appending the results
	to a list that is ultimately returned.
	"""
	def allLeftovers(files):
	prevFile=None
	results=[]
	for file in files:
	results.append(findLeftovers(prevFile, file, config["lines_context"]))
	prevFile=file
	# apparently this is a python idiom for flattenning a list of lists
	return [item for sublist in results for item in sublist]

	"""
	Parses cmdline options for this tool.
	"""
	def parseCmdLine():
	parser = argparse.ArgumentParser(description='Takes a series of sorted CSV files and finds the "leftovers" between them.',
	usage='%(prog)s [options] PREFIX')
	parser.add_argument('-d', '--delim', required=False, help='delimiter character to use ("," by default)',
	default=',')
	parser.add_argument('-n', '--numfields', required=False, type=int,
	help='A string indicating how many fields must be equal in order to have a ' +
	' "record match" in consecutive lines of the CSV files (default is just the first field).',
	default=1)
	parser.add_argument('-l', '--lines-context', required=False, default=50, type=int,
	help='The number of lines of context required to guarantee a leftover is found. This is used '+
	'when looking at the tail/head of files to decide how big the lookback buffer should be. '+
	'The default is 50 lines')
	parser.add_argument('PREFIX',
	help='The prefix pattern to look for in locating sorted CSV files. It is assummed that all CSV files ' +
	'will start with this prefix and will sort lexiographically. (e.g. the files [file.part1, file.part2, file.part3] '+
	'would have a fileprefix of "file.part".')

	args = parser.parse_args()
	return vars(args)


	"""
	Main function body
	"""
	if __name__ == '__main__':
	# initialize global config from the parsed cmdline options
	config = dict(parseCmdLine())
	print "Using configuration: ", config

	# find all files matching the PREFIX
	splitfiles=filesMatchingPrefix(config['PREFIX'])
	#print "Operating over filelist: ", splitfiles

	allLeftoversList = allLeftovers(splitfiles)
	#print "All leftovers list: ", allLeftoversList

	print '\r\n'.join(allLeftoversList)
	a,john,age,32
	a,john,tel,123-123-1234
	a,john,group,7
	a,john,assoc,98
	a,john,weight,250lbs
	a,john,martial_status,single
	a,john,occupation,construction worker
	a,john,favorite color,red
	a,john,last_name,smith
	a,john,car_make,toyota
	a,john,car_model,corolla
	b,mary,age,45
	b,mary,tel,465-232-5566
	b,mary,assoc,87
	b,mary,maritial_status,married
	b,mary,num_children,3
	b,mary,favorite color,blue
	b,mary,weight,135
	b,mary,role,supervisor
	b,mary,car_make,ford
	b,mary,car_model,taurus
	b,mary,taxcode,34343
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	c,george,dontcare,xxx
	d,billy,age,34
	d,billy,last_name,johnson
	d,billy,favorite band,green day
	d,billy,favorite color,green
	d,billy,favorite ice cream flavor, pistachio
	d,billy,num likes on facebook,2
	d,billy,marital status, divorced
	d,billy,num children,17
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	e,austin,dontcare,dontcare
	f,craig,occupation,coder
	f,craig,age,27
	f,craig,favorite band,the ramones
	f,craig,favorite color,orange
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx
	g,cindy,dontcare,xxx