Created
May 12, 2015 09:49
-
-
Save prehensile/a0ce581718ceec7a6dce to your computer and use it in GitHub Desktop.
A Python function to chop up a string into chunks with a maximum length for each chunk. Chops at whitespace for nice separation, rather than just every n characters. Useful for, say, breaking up a long piece of text into multiple tweets.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def chunk_string(str_in, max_chunk_length):
    """Split *str_in* into chunks of at most *max_chunk_length* characters.

    Chunks are cut at runs of whitespace where possible; the whitespace run
    used as a cut point is dropped from the output. When a window contains
    no usable whitespace, the chunk is cut hard at *max_chunk_length*
    characters.

    Args:
        str_in: The text to split.
        max_chunk_length: Maximum number of characters allowed per chunk.

    Returns:
        A list of chunks in order. A string that already fits (including the
        empty string) is returned unchanged as a single-element list.

    Raises:
        ValueError: If the text needs splitting but *max_chunk_length* is
            smaller than 1, since no progress could ever be made.
    """
    total = len(str_in)
    if total <= max_chunk_length:
        return [str_in]
    if max_chunk_length < 1:
        raise ValueError("max_chunk_length must be at least 1")

    # Materialise the matches: with a one-shot iterator, the first run found
    # beyond the current window would be consumed and lost to the next window.
    gaps = [(m.start(), m.end()) for m in re.finditer(r"\s+", str_in)]
    gap_count = len(gaps)
    gap_pos = 0

    chunks = []
    start = 0
    while start < total:
        limit = start + max_chunk_length
        if limit >= total:
            # The remainder fits in one chunk, so there is no need to cut it.
            chunks.append(str_in[start:])
            break

        # Pick the last whitespace run that begins inside (start, limit].
        # A run beginning exactly at `start` is ignored, because cutting
        # there would emit an empty chunk.
        cut = None
        while gap_pos < gap_count and gaps[gap_pos][0] <= limit:
            if gaps[gap_pos][0] > start:
                cut = gaps[gap_pos]
            gap_pos += 1

        if cut is None:
            # No whitespace available: fall back to a hard cut.
            chunks.append(str_in[start:limit])
            start = limit
        else:
            chunks.append(str_in[start:cut[0]])
            # Resume after the whitespace so it is not carried over.
            start = cut[1]
    return chunks
# Simple demonstration: split a paragraph into tweet-sized (140 character)
# chunks and print each chunk followed by its length.
if __name__ == '__main__':
    t = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Cras imperdiet nec erat ac condimentum. Nulla vel rutrum ligula. Sed hendrerit interdum orci a posuere. Vivamus ut velit aliquet, mollis purus eget, iaculis nisl. Proin posuere malesuada ante. Proin auctor orci eros, ac molestie lorem dictum nec. Vestibulum sit amet erat est. Morbi luctus sed elit ac luctus. Proin blandit, enim vitae egestas posuere, neque elit ultricies dui, vel mattis nibh enim ac lorem. Maecenas molestie nisl sit amet velit dictum lobortis. Aliquam erat volutpat."
    chunks = chunk_string(t, 140)
    for chunk in chunks:
        # Single-argument print(...) gives the same output on Python 2 and 3,
        # whereas the bare `print x` statement is a SyntaxError on Python 3.
        print(chunk)
        print(len(chunk))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you.