Last active
March 9, 2021 05:35
-
-
Save vindard/b2eb2af5688f40bb8ab8bbdcd3fb229b to your computer and use it in GitHub Desktop.
Splits a paragraph string into smaller paragraph strings based on the 'max_lines' number passed in
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List | |
def split_para(para_string: str, max_lines: int = 3) -> List[str]:
    """Split a paragraph into consecutive chunks of at most ``max_lines`` lines.

    Lines are delimited by the newline character. Every chunk except possibly
    the last contains exactly ``max_lines`` lines, re-joined with newlines.

    Args:
        para_string: The newline-delimited text to split.
        max_lines: Maximum number of lines per chunk; must be greater than 0.

    Returns:
        The list of chunks in order. If ``max_lines`` is not positive, an
        error message is printed and ``[para_string]`` is returned unchanged.
        An empty input yields ``['']``.
    """
    if max_lines <= 0:
        print("Error: Please pass a 'max_lines' arg greater than 0")
        return [para_string]

    lines = para_string.split('\n')
    total = len(lines)

    # The first chunk is always emitted, even when the input is empty.
    chunks = ['\n'.join(lines[:max_lines])]

    # Walk the remaining lines iteratively; a recursive formulation would hit
    # the interpreter's recursion limit on inputs with many chunks.
    start = max_lines
    while start < total:
        # A trailing newline leaves a single empty final line; it must not
        # become an empty chunk of its own.
        if start == total - 1 and lines[start] == '':
            break
        chunks.append('\n'.join(lines[start:start + max_lines]))
        start += max_lines

    return chunks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from typing import List | |
def split_para_regex(para_string: str, max_lines: int = 3) -> List[str]:
    """Split a paragraph into chunks of ``max_lines`` lines using a regex.

    Full-sized chunks are matched with a regular expression; whatever text
    follows the last full chunk is appended as a final, smaller chunk. Leading
    and trailing newlines are stripped from every chunk that is produced.

    Args:
        para_string: The newline-delimited text to split.
        max_lines: Number of lines per chunk; must be greater than 0.

    Returns:
        The list of chunks in order. If ``max_lines`` is not positive, an
        error message is printed and ``[para_string]`` is returned. If the
        text holds no full-sized chunk, ``[para_string]`` is returned as is
        (not stripped).
    """
    if max_lines <= 0:
        print("Error: Please pass a 'max_lines' arg greater than 0")
        return [para_string]

    # ``max_lines - 1`` newline-terminated lines followed by one more line
    # whose terminating newline is optional.
    chunk_pattern = re.compile(r".*\n" * max_lines + r"?")

    chunks = []
    consumed = 0  # offset just past the last non-empty match
    for match in chunk_pattern.finditer(para_string):
        text = match.group()
        if not text:
            # With max_lines == 1 the pattern can match the empty string.
            continue
        chunks.append(text.strip('\n'))
        consumed = match.end()

    # No full-sized chunk: hand back the original paragraph untouched.
    if not chunks:
        return [para_string]

    # Take the remainder by position. Re-joining the stripped chunks and
    # splitting the input on that string is unreliable: stripped blank lines
    # make it a non-substring (duplicating the whole text), and an empty
    # joined string makes str.split raise ValueError.
    remainder = para_string[consumed:]
    if remainder:
        chunks.append(remainder.strip('\n'))

    return chunks
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment