Skip to content

Instantly share code, notes, and snippets.

@CoffeeVampir3
Last active October 19, 2023 20:04
Show Gist options
  • Save CoffeeVampir3/a161894603d57ab7022b1e4dae6b832d to your computer and use it in GitHub Desktop.
Save CoffeeVampir3/a161894603d57ab7022b1e4dae6b832d to your computer and use it in GitHub Desktop.
Maximal context splitting
import json
import argparse
import os
import re
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
def l(tokenizer, txt):
    """Return how many tokens ``tokenizer`` produces for ``txt``.

    Args:
        tokenizer: Any object exposing ``encode(text)`` that returns a sized
            sequence of tokens.
        txt: The text to measure.

    Returns:
        ``len(tokenizer.encode(txt))``.
    """
    token_ids = tokenizer.encode(txt)
    return len(token_ids)
def process_file(filename, output_directory):
    """Split a text file into token-bounded chunks and write them as JSON Lines.

    The text is first cut into pieces by ``RecursiveCharacterTextSplitter``.
    Consecutive pieces are then greedily concatenated for as long as the
    token count of the current chunk plus the token count of the next piece
    stays below 2048.  Every resulting chunk is written as one
    ``{"text": ...}`` JSON object per line to
    ``<output_directory>/<input file name without extension>.jsonl``.

    Args:
        filename: Path of the text file to read (decoded as UTF-8).
        output_directory: Directory that receives the ``.jsonl`` file; it is
            created if it does not exist.
    """
    max_tokens = 2048  # exclusive upper bound used when merging pieces

    # presumably 'tokenizer' is a local directory holding the tokenizer
    # files -- verify against how the script is deployed.
    tokenizer = AutoTokenizer.from_pretrained('tokenizer')
    # NOTE(review): chunk_overlap=100 lets adjacent pieces share text, and
    # that shared text is repeated when pieces are concatenated below --
    # confirm this duplication is intended.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", "."],
        keep_separator=True,
        chunk_overlap=100,
        chunk_size=2048,
    )

    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, 'r', encoding='utf-8') as source:
        text = source.read()

    chunks = []
    current = ""
    for piece in text_splitter.split_text(text):
        if l(tokenizer, current) + l(tokenizer, piece) < max_tokens:
            current += piece
            continue
        # The next piece does not fit: flush what has been gathered so far.
        # Skip the flush while nothing has been gathered (an oversized first
        # piece), otherwise an empty {"text": ""} record would be emitted.
        if current:
            chunks.append(current)
        current = piece
    # Flush the trailing chunk; an empty input file yields no records.
    if current:
        chunks.append(current)

    os.makedirs(output_directory, exist_ok=True)  # ensure the directory exists
    # splitext drops only the final extension, so "v1.2.txt" maps to
    # "v1.2.jsonl" rather than being truncated at the first dot.
    stem = os.path.splitext(os.path.basename(filename))[0]
    jsonl_filename = os.path.join(output_directory, stem + '.jsonl')
    with open(jsonl_filename, 'w', encoding='utf-8') as sink:
        for chunk in chunks:
            json.dump({"text": chunk}, sink)
            sink.write('\n')
def main():
    """Parse the command line and convert one file into a ``.jsonl`` of chunks.

    The output is placed in ``<output_directory>/<parent>``, where
    ``<parent>`` is the name of the directory that directly contains the
    input file (empty when the path has no directory component).
    """
    parser = argparse.ArgumentParser(description='Process a file and save output to a directory.')
    parser.add_argument('filename', help='the file to process')
    parser.add_argument('output_directory', help='the directory where the output should be saved')
    args = parser.parse_args()

    # Name of the input file's immediate parent directory.  Going through
    # os.path returns "" for a bare filename instead of raising IndexError,
    # and honours the platform's path separator.
    target_dir = os.path.basename(os.path.dirname(args.filename))
    # Fall back to the current directory if both components are empty;
    # os.makedirs("") would otherwise fail.
    final_dir = os.path.join(args.output_directory, target_dir) or os.curdir
    os.makedirs(final_dir, exist_ok=True)
    process_file(args.filename, final_dir)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment