Skip to content

Instantly share code, notes, and snippets.

@keithrozario
Created March 14, 2018 15:50
Show Gist options
  • Save keithrozario/a1ef71e7a36a93aa8c6de8d5ca9554ea to your computer and use it in GitHub Desktop.
Save keithrozario/a1ef71e7a36a93aa8c6de8d5ca9554ea to your computer and use it in GitHub Desktop.
Split files between folders
import argparse
import os
import logging
import shutil
from pathlib import Path
""" Assumes file names end with _xx.jsonl,
where xx is an integer of at least one digit.
"""
logger = logging.getLogger(__name__)


def parse_file_number(file_name):
    """Return the integer between the last '_' and '.jsonl' in *file_name*.

    Returns None when the name contains no '.jsonl' or when the text between
    the last underscore and '.jsonl' is not an integer, so callers can skip
    the file instead of crashing.
    """
    end = file_name.rfind('.jsonl')
    if end == -1:
        # Without this guard the -1 would act as a slice end and silently
        # drop the last character of the name (e.g. "foo_123" -> "12").
        return None
    start = file_name.rfind('_') + 1
    try:
        return int(file_name[start:end])
    except ValueError:
        return None


def partition_files(directory, partitions):
    """Move the numbered JSONL files in *directory* into numbered sub-folders.

    Sub-folders "0" .. "partitions - 1" are created inside *directory* when
    missing. A file whose number is n is moved to folder n % partitions.
    Files whose name does not carry a parsable number are left in place and
    reported with a warning.

    Args:
        directory: folder that contains the files to distribute.
        partitions: number of sub-folders to spread the files over (>= 1).

    Returns:
        The number of files that were moved.

    Raises:
        ValueError: if *partitions* is smaller than 1.
        FileNotFoundError: if *directory* does not exist.
    """
    if partitions < 1:
        raise ValueError("partitions must be a positive integer")

    base = Path(directory)

    # List the files before creating any folder, so that a missing
    # directory is reported rather than silently created.
    files = sorted(entry for entry in base.iterdir() if entry.is_file())

    # Create the folders
    folders = []
    for index in range(partitions):
        folder = base / str(index)
        if not folder.exists():
            folder.mkdir(parents=True)
            logger.info("INFO: Created New Folder: %s", folder)
        folders.append(folder)

    # Count files already present in the folders
    start_files = sum(len(os.listdir(folder)) for folder in folders)

    # Move files to folders
    moved = 0
    for file in files:
        file_number = parse_file_number(file.name)
        if file_number is None:
            logger.warning("Skipped (no _<number>.jsonl in name): %s", file.name)
            continue
        folder = folders[file_number % partitions]
        # str(file) is the public way to get the path text; the private
        # Path._str attribute is not guaranteed to be populated.
        shutil.move(str(file), str(folder / file.name))
        logger.info("Moved:%s to %s", file.name, folder)
        moved += 1

    end_files = 0
    for folder in folders:
        count = len(os.listdir(folder))
        end_files += count
        logger.info("Folder: %s has %s files", folder, count)

    logger.info("Total files to move: %s", len(files))
    logger.info("Number of new files in additional directories: %s",
                end_files - start_files)
    return moved


def main():
    """Parse the command line, set up logging and partition the files."""
    # read in directory name
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dir",
                        help="Directory(Folder) with the JSONL files",
                        required=False,
                        type=str,
                        default='output')
    parser.add_argument("-p", "--partitions",
                        help="number of partitions",
                        required=False,
                        type=int,
                        default=4)
    args = parser.parse_args()
    if args.partitions < 1:
        parser.error("--partitions must be a positive integer")

    # Logging setup; the log folder must exist before basicConfig opens the file
    os.makedirs('logs', exist_ok=True)
    logging.basicConfig(filename='logs/partition.log',
                        filemode='a',
                        level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logger.addHandler(console)

    partition_files(args.dir, args.partitions)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment