Last active
January 1, 2016 00:19
-
-
Save sumanthprabhu/8066184 to your computer and use it in GitHub Desktop.
A python utility script to merge files in a directory. Pass two arguments - 1) the path to the directory containing files to be merged 2) required number of files after merging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Combine all CSV files in a directory into the required number of files.
'''
import csv | |
import sys | |
import os | |
import time | |
def fetch(num, indexer=None):
    '''
    Return file number for a record

    num is compared against each upper bound in indexer, in order; the
    1-based position of the first bound that is >= num is returned.  If no
    bound qualifies, the result is one more than the number of bounds.

    indexer defaults to the ``fetch.indexer`` function attribute, which the
    caller must have assigned beforehand; passing the bounds explicitly
    avoids depending on that shared state.
    '''
    if indexer is None:
        # Backward-compatible default: bounds stored on the function itself.
        indexer = fetch.indexer
    file_number = 0
    for file_number, upper_bound in enumerate(indexer, 1):
        if num <= upper_bound:
            return file_number
    # Past every bound (or no bounds at all): one beyond the last position.
    return file_number + 1
def main(argv):
    '''
    Main function

    Merge every ``.csv`` file found under argv[1] and split the result into
    argv[2] output files.

    The directory is scanned recursively with os.walk.  Each line of each
    ``.csv`` file is appended to a temporary file created in the current
    working directory, after which the source file is deleted.  The merged
    lines are then written, in order, to ``train_set1.csv`` ..
    ``train_setN.csv`` inside argv[1]: every file receives
    ``record_count // N`` lines and the last file also takes the remainder.

    Raises ValueError if argv[2] is not an integer >= 1; this is checked
    before any source file is read or deleted.
    '''
    from itertools import islice

    source_dir = argv[1]
    # Validate up front so a bad count cannot fail after sources are deleted.
    number_of_files = int(argv[2])
    if number_of_files < 1:
        raise ValueError("number of files must be at least 1")

    record_count = 0
    temp_name = "temp" + str(time.time())
    # Binary mode throughout: lines are copied verbatim as bytes.
    with open(temp_name, "ab") as target_file:
        for root, dirs, files in os.walk(source_dir):
            # Sort in place so the merge order is deterministic.
            dirs.sort()
            files.sort()
            for entry in files:
                if not entry.endswith('.csv'):
                    continue
                # Join with root (not the top directory) so files found in
                # subdirectories resolve to their real location.
                src_path = os.path.join(root, entry)
                line = b""
                with open(src_path, 'rb') as srcfile:
                    for line in srcfile:
                        target_file.write(line)
                        record_count += 1
                # A source without a trailing newline would otherwise fuse
                # its last record with the next file's first record.
                if line and not line.endswith(b"\n"):
                    target_file.write(b"\n")
                os.remove(src_path)

    lines_per_file = record_count // number_of_files
    remainder = record_count % number_of_files
    print("Beginning writing process..")
    with open(temp_name, "rb") as merged:
        for file_number in range(1, number_of_files + 1):
            quota = lines_per_file
            if file_number == number_of_files:
                # The last file absorbs whatever does not divide evenly.
                quota += remainder
            path = os.path.join(source_dir, "train_set%s.csv" % file_number)
            with open(path, "ab") as train_piece:
                # islice streams exactly `quota` lines without buffering
                # the whole merged file in memory.
                train_piece.writelines(islice(merged, quota))

    # Only reached on success: on failure the temp file is kept because it
    # holds the only copy of the already-deleted sources.
    os.remove(temp_name)
# Run the merge/split only when executed as a script, handing main() the raw
# command-line arguments (argv[1] = directory, argv[2] = number of files).
if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment