Skip to content

Instantly share code, notes, and snippets.

@Guitaricet
Created January 11, 2019 09:06
Show Gist options
  • Save Guitaricet/937264d4958cb62084008cc5d6da0685 to your computer and use it in GitHub Desktop.
Save Guitaricet/937264d4958cb62084008cc5d6da0685 to your computer and use it in GitHub Desktop.
Split a large text file into shards
import os
import logging
import argparse
from pathlib import Path
from tqdm import tqdm
parser = argparse.ArgumentParser(
    description='Split a large text file into shards of a fixed number of lines.')
parser.add_argument('--input-file', required=True,
                    help='path of the text file to split')
parser.add_argument('--shards-directory', required=True,
                    help='directory the shard files are written to (created if missing)')
# argparse applies `type` only to string defaults, so the default itself must
# already be an int (the float literal 1e6 would be passed through unchanged).
parser.add_argument('--shard-size', default=1_000_000, type=int,
                    help='maximum number of lines per shard')

logger = logging.getLogger('shard_large_file')
logger.setLevel(logging.INFO)


def write_shards(lines, shards_directory, shard_size):
    """Write an iterable of text lines into numbered shard files.

    Lines are written unchanged (they are expected to carry their own line
    terminator, as lines read from a text file do), so concatenating the
    shards in order reproduces the input.

    Args:
        lines: iterable of strings, e.g. an open text file.
        shards_directory: directory receiving ``shard_1.txt``,
            ``shard_2.txt``, ...; created if it does not exist.
        shard_size: maximum number of lines per shard; must be >= 1.

    Returns:
        The number of shard files written (0 for empty input).

    Raises:
        ValueError: if ``shard_size`` is smaller than 1.
    """
    if shard_size < 1:
        raise ValueError(f'shard_size must be a positive integer, got {shard_size!r}')

    shards_directory = Path(shards_directory)
    os.makedirs(shards_directory, exist_ok=True)

    shard_count = 0
    out = None
    try:
        for i, line in enumerate(lines):
            # Roll over to a new shard every `shard_size` lines.
            if i % shard_size == 0:
                if out is not None:
                    out.close()
                shard_count += 1
                logger.info(f'Processing shard {shard_count}')
                out = open(shards_directory / f'shard_{shard_count}.txt', 'w')
            # `line` already ends with its newline; adding another one would
            # insert a blank line after every line of the shard.
            out.write(line)
    finally:
        # `out` stays None for empty input; also closes the current shard if
        # reading or writing fails part-way through.
        if out is not None:
            out.close()
    return shard_count


if __name__ == "__main__":
    # Without a configured handler, INFO records are discarded (the logging
    # module's last-resort handler only emits WARNING and above).
    logging.basicConfig(level=logging.INFO)
    args = parser.parse_args()
    logger.info(f'Output shards will be at {args.shards_directory} directory')
    with open(args.input_file) as f:
        write_shards(tqdm(f), args.shards_directory, args.shard_size)
    logger.info('Script finished successfuly')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment