Created
February 28, 2023 16:25
-
-
Save ScotterC/b0ea4f53bd588bf2e9834c7ef4e5bdda to your computer and use it in GitHub Desktop.
Chunk up text using langchain's text splitters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Note: if spacy isn't working you may need to download english model python -m spacy download en | |
import argparse | |
import json | |
from langchain.text_splitter import NLTKTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter, SpacyTextSplitter, TokenTextSplitter | |
def chunkify(text, chunk_size, chunk_overlap, method):
    """Split *text* into overlapping chunks with a langchain text splitter.

    Args:
        text: The full string to split.
        chunk_size: Target size of each chunk. For the tiktoken-backed
            splitters this is measured in tokens.
        chunk_overlap: Amount of content shared between consecutive chunks.
        method: One of 'nltk', 'character', 'recursive', 'spacy', 'token'.

    Returns:
        A dict ``{'chunks': [...]}`` ready for JSON serialization.

    Raises:
        ValueError: If *method* is not a recognized splitter name.
    """
    # Dispatch table replaces the repetitive if/elif chain. Each entry is a
    # zero-argument factory so only the selected splitter is constructed.
    # 'token' uses the plain constructor (it already works in tokens); the
    # others are built via from_tiktoken_encoder so chunk_size is measured
    # in tokens as well.
    factories = {
        'nltk': lambda: NLTKTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap),
        'character': lambda: CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap),
        'recursive': lambda: RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap),
        'spacy': lambda: SpacyTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap),
        'token': lambda: TokenTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap),
    }
    try:
        text_splitter = factories[method]()
    except KeyError:
        # Preserve the original error type/message; suppress the KeyError
        # context since it adds no information for the caller.
        raise ValueError(f"Invalid method: {method}") from None
    return {'chunks': text_splitter.split_text(text)}
if __name__ == '__main__':
    # Command-line entry point: read text from --text or --input-file,
    # chunk it with chunkify(), and write the result as JSON.
    parser = argparse.ArgumentParser(description='Chunkify a long string of text.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--text', type=str, help='the long string of text to be chunked up')
    group.add_argument('--input-file', type=str, help='the file containing the text to be chunked up')
    parser.add_argument('--output-file', type=str, required=True, help='the file to write the chunked output to')
    parser.add_argument('--chunk-size', type=int, default=600, help='the size of each chunk')
    parser.add_argument('--chunk-overlap', type=int, default=100, help='the overlap between each chunk')
    # choices= makes argparse reject bad methods up front with a usage
    # message, instead of failing deep inside chunkify().
    parser.add_argument('--method', type=str, default='nltk',
                        choices=['nltk', 'character', 'recursive', 'spacy', 'token'],
                        help='the method to use for chunking up the text')
    args = parser.parse_args()

    # Get the input text. Test against None (not truthiness) so an explicit
    # empty string passed via --text is honored rather than falling through
    # to open(None). The mutually exclusive group guarantees exactly one of
    # the two options was supplied.
    if args.text is not None:
        text = args.text
    else:
        # Explicit encoding so behavior doesn't depend on the platform's
        # default locale encoding.
        with open(args.input_file, 'r', encoding='utf-8') as f:
            text = f.read()

    # Chunkify the text.
    result = chunkify(text, args.chunk_size, args.chunk_overlap, args.method)

    # Write the output to a file.
    with open(args.output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment