Skip to content

Instantly share code, notes, and snippets.

@omnisis
Last active December 12, 2015 08:39
Show Gist options
  • Save omnisis/4745947 to your computer and use it in GitHub Desktop.
Save omnisis/4745947 to your computer and use it in GitHub Desktop.
Python script for finding all leftovers in a series of N CSV files whose records are in sorted order
#!/usr/bin/python
#############################################################################
# This script finds the "leftover records" between files containing arbitrary
# linesplits of a very large sorted CSV file. These types of files would exist
# if for example, one ran the UNIX 'split' program (breaking on line boundaries)
# on a large sorted CSV file to break it into smaller sorted CSV files but did
# not bother to ensure that no logical records (assumming records can span text lines)
# were split across files. Generally, it's impossible for a generic utility
# like 'split' to make such application-specific decisions at the time the
# splitting happens.
#
# The basic algorithm for finding leftovers is as follows:
#
# - Initialize a string 'answer' to the empty string
# - Iterate over all sorted files in the proper order
# - For each pair (prev,curr) of files in the list of sorted files:
# - If the last line of prev and first line of curr are the same record:
# - Go backwards some amount in prev to catch all such logical records
# appending this string to some number of similar logical records found
# at the beginning of curr and concatenating it to 'answer'
# - return the accumulated string 'answer'
#
# NOTE: This script makes use of the UNIX 'head' and 'tail' programs
# so is *NOT* cross-platform!
#############################################################################
from subprocess import check_output
import argparse
import glob
import string
# dictionary of global configuration options that control how this script operates
config = {'delim': ",", "numfields": 1, "lines_context": 50}
"""
Returns a list of files (in lexiographically sorted order) matching
'prefix*'
"""
def filesMatchingPrefix(prefix):
return glob.glob(prefix + '*') # glob returns in sorted order by default
"""
Extracts 'numfields' records from a CSV line 'line' with delimiter 'delim'
"""
def extractFieldMatchStr(line, delim=",", numfields=1):
return delim.join(string.split(line,delim)[0:numfields]).strip()
"""
Wrapper for $> tail -n [N] [file]
"""
def tailN(fname, N):
return check_output(["tail", "-n", str(N), fname])
"""
Wrapper for $> head -n [N] [file]
"""
def headN(fname, N):
return check_output(["head", "-n", str(N), fname])
"""
Examines 'ctxlines' from the tail of prevFile and head of currFile. If any of these lines match the
last line of the first file, then it is assummed to be a 'match' and
accumulated in the returned result.
"""
def findLeftovers(prevFile, currFile, ctxlines=50):
# if one of either files is None then there can be no leftovers
if prevFile is None or currFile is None:
return []
# files have content, exp leftover is the match on last line of prev
prevlineLast=tailN(prevFile,1)
currlineFirst=headN(currFile,1)
expMatch=extractFieldMatchStr(prevlineLast, config["delim"], config["numfields"])
# if the last line of prev and first line of curr don't match up there is no 'overlap'
if expMatch != extractFieldMatchStr(currlineFirst, config["delim"], config["numfields"])
return []
# create a buffer from both files that we will soon filter
buf = tailN(prevFile,50) + headN(currFile, 50)
# filter out all lines that aren't a leftover match
return filter(lambda x: expMatch == extractFieldMatchStr(x, config["delim"], config["numfields"]),
buf.splitlines())
"""
Given a list of files, iterates thru the list calling findLeftovers() on each file in turn
passing in the name of the previous file and the current file and appending the results
to a list that is ultimately returned.
"""
def allLeftovers(files):
prevFile=None
results=[]
for file in files:
results.append(findLeftovers(prevFile, file, config["lines_context"]))
prevFile=file
# apparently this is a python idiom for flattenning a list of lists
return [item for sublist in results for item in sublist]
"""
Parses cmdline options for this tool.
"""
def parseCmdLine():
parser = argparse.ArgumentParser(description='Takes a series of sorted CSV files and finds the "leftovers" between them.',
usage='%(prog)s [options] PREFIX')
parser.add_argument('-d', '--delim', required=False, help='delimiter character to use ("," by default)',
default=',')
parser.add_argument('-n', '--numfields', required=False, type=int,
help='A string indicating how many fields must be equal in order to have a ' +
' "record match" in consecutive lines of the CSV files (default is just the first field).',
default=1)
parser.add_argument('-l', '--lines-context', required=False, default=50, type=int,
help='The number of lines of context required to guarantee a leftover is found. This is used '+
'when looking at the tail/head of files to decide how big the lookback buffer should be. '+
'The default is 50 lines')
parser.add_argument('PREFIX',
help='The prefix pattern to look for in locating sorted CSV files. It is assummed that all CSV files ' +
'will start with this prefix and will sort lexiographically. (e.g. the files [file.part1, file.part2, file.part3] '+
'would have a fileprefix of "file.part".')
args = parser.parse_args()
return vars(args)
"""
Main function body
"""
if __name__ == '__main__':
# initialize global config from the parsed cmdline options
config = dict(parseCmdLine())
print "Using configuration: ", config
# find all files matching the PREFIX
splitfiles=filesMatchingPrefix(config['PREFIX'])
#print "Operating over filelist: ", splitfiles
allLeftoversList = allLeftovers(splitfiles)
#print "All leftovers list: ", allLeftoversList
print '\r\n'.join(allLeftoversList)
a,john,age,32
a,john,tel,123-123-1234
a,john,group,7
a,john,assoc,98
a,john,weight,250lbs
a,john,martial_status,single
a,john,occupation,construction worker
a,john,favorite color,red
a,john,last_name,smith
a,john,car_make,toyota
a,john,car_model,corolla
b,mary,age,45
b,mary,tel,465-232-5566
b,mary,assoc,87
b,mary,maritial_status,married
b,mary,num_children,3
b,mary,favorite color,blue
b,mary,weight,135
b,mary,role,supervisor
b,mary,car_make,ford
b,mary,car_model,taurus
b,mary,taxcode,34343
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
c,george,dontcare,xxx
d,billy,age,34
d,billy,last_name,johnson
d,billy,favorite band,green day
d,billy,favorite color,green
d,billy,favorite ice cream flavor, pistachio
d,billy,num likes on facebook,2
d,billy,marital status, divorced
d,billy,num children,17
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
e,austin,dontcare,dontcare
f,craig,occupation,coder
f,craig,age,27
f,craig,favorite band,the ramones
f,craig,favorite color,orange
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
g,cindy,dontcare,xxx
#!/bin/sh
./find_leftovers.py -n2 'testfile*'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment