Last active
December 12, 2015 08:39
-
-
Save omnisis/4745947 to your computer and use it in GitHub Desktop.
Python script for finding all leftovers in a series of N CSV files whose records are in sorted order
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
############################################################################# | |
# This script finds the "leftover records" between files containing arbitrary | |
# linesplits of a very large sorted CSV file. These types of files would exist | |
# if for example, one ran the UNIX 'split' program (breaking on line boundaries) | |
# on a large sorted CSV file to break it into smaller sorted CSV files but did | |
# not bother to ensure that no logical records (assumming records can span text lines) | |
# were split across files. Generally, it's impossible for a generic utility | |
# like 'split' to make such application-specific decisions at the time the | |
# splitting happens. | |
# | |
# The basic algorithm for finding leftovers is as follows: | |
# | |
# - Initialize a string 'answer' to the empty string | |
# - Iterate over all sorted files in the proper order | |
# - For each pair (prev,curr) of files in the list of sorted files: | |
# - If the last line of prev and first line of curr are the same record: | |
# - Go backwards some amount in prev to catch all such logical records | |
# appending this string to some number of similar logical records found | |
# at the beginning of curr and concatenating it to 'answer' | |
# - return the accumulated string 'answer' | |
# | |
# NOTE: This script makes use of the UNIX 'head' and 'tail' programs | |
# so is *NOT* cross-platform! | |
############################################################################# | |
from subprocess import check_output | |
import argparse | |
import glob | |
import string | |
# dictionary of global configuration options that control how this script operates | |
config = {'delim': ",", "numfields": 1, "lines_context": 50} | |
""" | |
Returns a list of files (in lexiographically sorted order) matching | |
'prefix*' | |
""" | |
def filesMatchingPrefix(prefix): | |
return glob.glob(prefix + '*') # glob returns in sorted order by default | |
""" | |
Extracts 'numfields' records from a CSV line 'line' with delimiter 'delim' | |
""" | |
def extractFieldMatchStr(line, delim=",", numfields=1): | |
return delim.join(string.split(line,delim)[0:numfields]).strip() | |
""" | |
Wrapper for $> tail -n [N] [file] | |
""" | |
def tailN(fname, N): | |
return check_output(["tail", "-n", str(N), fname]) | |
""" | |
Wrapper for $> head -n [N] [file] | |
""" | |
def headN(fname, N): | |
return check_output(["head", "-n", str(N), fname]) | |
""" | |
Examines 'ctxlines' from the tail of prevFile and head of currFile. If any of these lines match the | |
last line of the first file, then it is assummed to be a 'match' and | |
accumulated in the returned result. | |
""" | |
def findLeftovers(prevFile, currFile, ctxlines=50): | |
# if one of either files is None then there can be no leftovers | |
if prevFile is None or currFile is None: | |
return [] | |
# files have content, exp leftover is the match on last line of prev | |
prevlineLast=tailN(prevFile,1) | |
currlineFirst=headN(currFile,1) | |
expMatch=extractFieldMatchStr(prevlineLast, config["delim"], config["numfields"]) | |
# if the last line of prev and first line of curr don't match up there is no 'overlap' | |
if expMatch != extractFieldMatchStr(currlineFirst, config["delim"], config["numfields"]) | |
return [] | |
# create a buffer from both files that we will soon filter | |
buf = tailN(prevFile,50) + headN(currFile, 50) | |
# filter out all lines that aren't a leftover match | |
return filter(lambda x: expMatch == extractFieldMatchStr(x, config["delim"], config["numfields"]), | |
buf.splitlines()) | |
""" | |
Given a list of files, iterates thru the list calling findLeftovers() on each file in turn | |
passing in the name of the previous file and the current file and appending the results | |
to a list that is ultimately returned. | |
""" | |
def allLeftovers(files): | |
prevFile=None | |
results=[] | |
for file in files: | |
results.append(findLeftovers(prevFile, file, config["lines_context"])) | |
prevFile=file | |
# apparently this is a python idiom for flattenning a list of lists | |
return [item for sublist in results for item in sublist] | |
""" | |
Parses cmdline options for this tool. | |
""" | |
def parseCmdLine(): | |
parser = argparse.ArgumentParser(description='Takes a series of sorted CSV files and finds the "leftovers" between them.', | |
usage='%(prog)s [options] PREFIX') | |
parser.add_argument('-d', '--delim', required=False, help='delimiter character to use ("," by default)', | |
default=',') | |
parser.add_argument('-n', '--numfields', required=False, type=int, | |
help='A string indicating how many fields must be equal in order to have a ' + | |
' "record match" in consecutive lines of the CSV files (default is just the first field).', | |
default=1) | |
parser.add_argument('-l', '--lines-context', required=False, default=50, type=int, | |
help='The number of lines of context required to guarantee a leftover is found. This is used '+ | |
'when looking at the tail/head of files to decide how big the lookback buffer should be. '+ | |
'The default is 50 lines') | |
parser.add_argument('PREFIX', | |
help='The prefix pattern to look for in locating sorted CSV files. It is assummed that all CSV files ' + | |
'will start with this prefix and will sort lexiographically. (e.g. the files [file.part1, file.part2, file.part3] '+ | |
'would have a fileprefix of "file.part".') | |
args = parser.parse_args() | |
return vars(args) | |
""" | |
Main function body | |
""" | |
if __name__ == '__main__': | |
# initialize global config from the parsed cmdline options | |
config = dict(parseCmdLine()) | |
print "Using configuration: ", config | |
# find all files matching the PREFIX | |
splitfiles=filesMatchingPrefix(config['PREFIX']) | |
#print "Operating over filelist: ", splitfiles | |
allLeftoversList = allLeftovers(splitfiles) | |
#print "All leftovers list: ", allLeftoversList | |
print '\r\n'.join(allLeftoversList) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
a,john,age,32 | |
a,john,tel,123-123-1234 | |
a,john,group,7 | |
a,john,assoc,98 | |
a,john,weight,250lbs | |
a,john,martial_status,single | |
a,john,occupation,construction worker | |
a,john,favorite color,red | |
a,john,last_name,smith | |
a,john,car_make,toyota | |
a,john,car_model,corolla | |
b,mary,age,45 | |
b,mary,tel,465-232-5566 | |
b,mary,assoc,87 | |
b,mary,maritial_status,married | |
b,mary,num_children,3 | |
b,mary,favorite color,blue | |
b,mary,weight,135 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
b,mary,role,supervisor | |
b,mary,car_make,ford | |
b,mary,car_model,taurus | |
b,mary,taxcode,34343 | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
c,george,dontcare,xxx | |
d,billy,age,34 | |
d,billy,last_name,johnson |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
d,billy,favorite band,green day | |
d,billy,favorite color,green | |
d,billy,favorite ice cream flavor, pistachio | |
d,billy,num likes on facebook,2 | |
d,billy,marital status, divorced | |
d,billy,num children,17 | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
e,austin,dontcare,dontcare | |
f,craig,occupation,coder |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
f,craig,age,27 | |
f,craig,favorite band,the ramones | |
f,craig,favorite color,orange | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx | |
g,cindy,dontcare,xxx |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
./find_leftovers.py -n2 'testfile*' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment