Created
June 8, 2024 15:23
-
-
Save wonderbeyond/257318385ab20c75761c4913bf2ae4ab to your computer and use it in GitHub Desktop.
[Python] Split text into chunks with overlap.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def make_chunks(text, chunk_size: int, overlap: int = 0) -> list:
    """Split text into consecutive chunks that share ``overlap`` items.

    Each chunk starts ``chunk_size - overlap`` positions after the previous
    one, so neighbouring chunks share their last/first ``overlap`` items.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        text: The sequence to split; it only needs to support ``len()`` and
            slicing (for example a ``str``).
        chunk_size: Maximum length of each chunk. Must be positive.
        overlap: Number of items shared between neighbouring chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of slices of ``text``; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A non-positive step would either make range() fail with an opaque
    # message or silently yield no chunks; a negative overlap would leave
    # gaps between chunks. Reject both explicitly.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    chunks = []
    len_ = len(text)
    for i in range(0, len_, step):
        chunks.append(text[i:i + chunk_size])
        # Once a chunk reaches the end of the text, stop: any later start
        # position would only produce a fragment already contained in it.
        if i + chunk_size >= len_:
            break
    return chunks
if __name__ == "__main__":
    # Self-check when run as a script.
    # Each case is (text, chunk_size, overlap, expected chunks).
    cases = [
        ("abcd", 2, 0, ["ab", "cd"]),
        ("abcde", 2, 0, ["ab", "cd", "e"]),
        (
            "The weather is lovely today.",
            8,
            3,
            ["The weat", "eather i", "r is lov", "lovely t", "y today."],
        ),
        ("Are you OK?", 8, 3, ["Are you ", "ou OK?"]),
    ]
    for sample, size, shared, expected in cases:
        assert make_chunks(sample, chunk_size=size, overlap=shared) == expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment