Skip to content

Instantly share code, notes, and snippets.

@TAG-Epic
Created October 16, 2020 01:01
Show Gist options
  • Select an option

  • Save TAG-Epic/9e9513d7b8dd8a727f4a7f5a131570c9 to your computer and use it in GitHub Desktop.

Select an option

Save TAG-Epic/9e9513d7b8dd8a727f4a7f5a131570c9 to your computer and use it in GitHub Desktop.
from pathlib import Path
from tqdm import tqdm, trange
from math import ceil
# Merge every ``*.txt`` file in ``input_dir``, drop duplicate lines, and write
# the result to ``output_dir`` in files of at most ``per_file`` lines each.
input_dir = Path("OneChunk-CRR")
output_dir = Path("output")
per_file = 10000


def unique_lines(lines):
    """Return *lines* de-duplicated in first-seen order, newline-terminated.

    A final line read from a file that lacks a trailing newline is given one,
    so it compares equal to its terminated twin and is not glued onto the
    next line when the chunks are written with ``writelines``.
    """
    terminated = (
        line if line.endswith("\n") else f"{line}\n" for line in lines
    )
    # dict preserves insertion order, so the output is deterministic
    # (unlike ``set``, whose order varies with string hash randomization).
    return list(dict.fromkeys(terminated))


def split_chunks(lines, chunk_size):
    """Split *lines* into consecutive chunks of at most *chunk_size* items.

    Returns a list of ``(filename, chunk)`` pairs. The filename is
    ``"{start}-{end}.txt"`` where ``end`` is the inclusive index of the last
    slot of a full chunk (``start + chunk_size - 1``); the final chunk may
    hold fewer lines than its name suggests.

    Raises:
        ValueError: if *chunk_size* is zero (raised by ``range``).
    """
    return [
        # The slice bound is exclusive, so it must be ``start + chunk_size``;
        # only the filename label uses the inclusive ``- 1`` form.
        (
            f"{start}-{start + chunk_size - 1}.txt",
            lines[start:start + chunk_size],
        )
        for start in range(0, len(lines), chunk_size)
    ]


def main():
    """Read all input files, de-duplicate their lines and write the chunks."""
    collected = []
    # Sorted so the first-seen order (and thus the output) is reproducible.
    for path in tqdm(sorted(input_dir.glob("*.txt"))):
        with path.open() as f:
            # extend() appends in place; rebuilding the list per file is O(n^2).
            collected.extend(f)
    collected = unique_lines(collected)

    # The output directory may not exist on a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)
    for name, chunk in tqdm(split_chunks(collected, per_file)):
        with (output_dir / name).open("w") as f:
            f.writelines(chunk)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment