Created
March 14, 2018 15:50
-
-
Save keithrozario/a1ef71e7a36a93aa8c6de8d5ca9554ea to your computer and use it in GitHub Desktop.
Split files between folders
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import os | |
import logging | |
import shutil | |
from pathlib import Path | |
""" Assumes files end with _xx.jsonl | |
where xx is an integer (that is, at least 1 digit).
""" | |
def _file_number(name):
    """Return the integer found between the last '_' and the '.jsonl' suffix.

    Returns None when *name* does not end with '.jsonl' or when the text in
    front of the suffix (after the last underscore, if any) is not an integer.
    Callers use None to skip the file instead of aborting half-way through.
    """
    suffix = ".jsonl"
    if not name.endswith(suffix):
        return None
    # rpartition yields the whole stem when there is no underscore, which
    # keeps bare names such as "12.jsonl" working.
    digits = name[:-len(suffix)].rpartition("_")[2]
    try:
        return int(digits)
    except ValueError:
        return None


def _count_entries(folders):
    """Return the total number of directory entries across *folders*."""
    return sum(len(os.listdir(str(folder))) for folder in folders)


def main(argv=None):
    """Split the numbered JSONL files of a directory between sub-folders.

    A file named ``<prefix>_<n>.jsonl`` is moved into the sub-folder
    ``<dir>/<n % partitions>``. Sub-folders ``0 .. partitions-1`` are created
    when missing. Files whose name carries no usable number are left in place
    and reported with a warning.

    Args:
        argv: Command-line arguments to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: via ``argparse`` when the arguments are invalid.
    """
    # read in directory name
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dir",
                        help="Directory(Folder) with the JSONL files",
                        required=False,
                        type=str,
                        default='output')
    parser.add_argument("-p", "--partitions",
                        help="number of partitions",
                        required=False,
                        type=int,
                        default=4)
    args = parser.parse_args(argv)

    # A partition count below 1 would divide by zero or index an empty list.
    if args.partitions < 1:
        parser.error("--partitions must be at least 1")
    base = Path(args.dir)
    if not base.is_dir():
        parser.error("--dir is not an existing directory: " + args.dir)

    # Logging setup (the log directory must exist before the file handler
    # is created, otherwise basicConfig raises FileNotFoundError).
    os.makedirs('logs', exist_ok=True)
    logging.basicConfig(filename='logs/partition.log',
                        filemode='a',
                        level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)
    if not logger.handlers:  # avoid duplicate console output on repeated calls
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        logger.addHandler(console)

    # Get list of files (taken before the partition folders are created)
    files = sorted(entry for entry in base.iterdir() if entry.is_file())

    # Create the Folders
    folders = []
    for index in range(args.partitions):
        folder = base / str(index)
        if not folder.exists():
            folder.mkdir(parents=True)
            logger.info("INFO: Created New Folder: %s", folder)
        folders.append(folder)

    # Count files in Folders
    start_files = _count_entries(folders)

    # Move File to Folders
    moved = 0
    for file in files:
        number = _file_number(file.name)
        if number is None:
            logger.warning("Skipped (name does not end with _<number>.jsonl): %s",
                           file.name)
            continue
        # Based on file number, determine which folder to move into
        folder = folders[number % args.partitions]
        # str() keeps shutil.move happy on older Python versions and avoids
        # relying on pathlib's private ``_str`` attribute.
        shutil.move(str(file), str(folder / file.name))
        logger.info("Moved:%s to %s", file.name, folder)
        moved += 1

    end_files = 0
    for folder in folders:
        entries = len(os.listdir(str(folder)))
        end_files += entries
        logger.info("Folder: %s has %s files", folder, entries)

    logger.info("Total files to move: %s", moved)
    logger.info("Number of new files in additional directories: %s",
                end_files - start_files)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment