Last active
June 4, 2022 13:09
-
-
Save xflr6/c0e28a7a8ee4e79b571d24682805ba94 to your computer and use it in GitHub Desktop.
Split a string into chunks by a pattern matching at the start of each item
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Split a string into chunks by a pattern matching at the start of each item. | |
>>> list(itersplit(r'!', 'spam !eggs !ham')) | |
['spam ', '!eggs ', '!ham'] | |
>>> list(itersplit(r'X', 'spam !eggs !ham')) | |
['spam !eggs !ham'] | |
>>> list(itersplit(r'!', '!spam !eggs !ham')) | |
['', '!spam ', '!eggs ', '!ham'] | |
>>> list(itersplit(r'!', '!spam !eggs !ham', no_empty=True)) | |
['!spam ', '!eggs ', '!ham'] | |
>>> re.findall(r'(?s).+?(?=!|\Z)', 'spam !eggs !ham') | |
['spam ', '!eggs ', '!ham'] | |
>>> re.findall(r'(?s).+?(?=X|\Z)', 'spam !eggs !ham') | |
['spam !eggs !ham'] | |
>>> re.findall(r'(?s).+?(?=!|\Z)', '!spam !eggs !ham') | |
['!spam ', '!eggs ', '!ham'] | |
""" | |
from collections.abc import Iterator | |
import itertools | |
import re | |
def itersplit_(start_pattern: str, string: str, *,
               flags: int = 0) -> Iterator[str]:
    """Return an iterator over chunks of ``string`` cut at each pattern match.

    Every match of ``start_pattern`` marks the start of a new chunk. The
    text before the first match is always produced as the first chunk, even
    when it is empty.

    Args:
        start_pattern: Regular expression whose matches start a chunk.
        string: The text to split.
        flags: Flags passed through to ``re.finditer``.

    Returns:
        A lazy iterator over the chunks; joined together they give ``string``.

    >>> list(itersplit_(r'!', 'spam !eggs !ham'))
    ['spam ', '!eggs ', '!ham']
    >>> list(itersplit_(r'!', '!spam !eggs'))
    ['', '!spam ', '!eggs']
    """
    match_starts = (ma.start() for ma in re.finditer(start_pattern, string, flags))
    # Chunk boundaries: start of string, every match start, end of string.
    bounds = itertools.chain([0], match_starts, [len(string)])
    starts, ends = itertools.tee(bounds)
    # Advance the second copy by one so that zip pairs each boundary with
    # its successor.
    next(ends)
    # Built-in zip is already lazy in Python 3 (itertools.izip is gone).
    return (string[s:e] for s, e in zip(starts, ends))
def itersplit(start_pattern: str, string: str, *,
              no_empty: bool = False,
              flags: int = 0) -> Iterator[str]:
    """Yield chunks of ``string`` cut at each match of ``start_pattern``.

    Every match marks the start of a new chunk. If the pattern does not
    match at all, ``string`` is yielded unchanged as the only chunk.

    Args:
        start_pattern: Regular expression whose matches start a chunk.
        string: The text to split.
        no_empty: If true, suppress the leading chunk when it is empty,
            i.e. when the first match is at position 0.
        flags: Flags passed through to ``re.finditer``.

    Yields:
        The chunks in order; joined together they give ``string``.

    >>> list(itersplit(r'!', 'spam !eggs'))
    ['spam ', '!eggs']
    >>> list(itersplit(r'!', '!spam', no_empty=True))
    ['!spam']
    """
    matches = re.finditer(start_pattern, string, flags)
    try:
        first = next(matches)
    except StopIteration:
        # No match at all: the whole string is the single chunk.
        yield string
        return

    pos = first.start()
    if pos or not no_empty:
        yield string[:pos]

    for ma in matches:
        end = ma.start()
        yield string[pos:end]
        pos = end

    # Use ``pos`` for the tail: it is always bound, also when the pattern
    # matched only once and the loop above never ran.
    yield string[pos:]
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Split a string into chunks by a string matching at the start of each item.""" | |
from collections.abc import Iterator | |
import functools | |
def finditer(sep: str, string: str, *,
             no_empty: bool = False) -> Iterator[str]:
    """Yield chunks of ``string`` cut at each occurrence of ``sep``.

    Every occurrence of ``sep`` marks the start of a new chunk. If ``sep``
    does not occur, ``string`` is yielded unchanged as the only chunk.

    Args:
        sep: Literal string whose occurrences start a chunk.
        string: The text to split.
        no_empty: If true, suppress the empty leading chunk that would
            otherwise be produced when ``string`` starts with ``sep``.

    Yields:
        The chunks in order; joined together they give ``string``.

    >>> list(finditer('!', 'spam !eggs !ham'))
    ['spam ', '!eggs ', '!ham']
    >>> list(finditer('X', 'spam !eggs !ham'))
    ['spam !eggs !ham']
    >>> list(finditer('!', '!spam !eggs !ham'))
    ['', '!spam ', '!eggs ', '!ham']
    >>> list(finditer('!', '!spam !eggs !ham', no_empty=True))
    ['!spam ', '!eggs ', '!ham']
    >>> list(finditer('!', '!spam', no_empty=True))
    ['!spam']
    """
    s = len(sep)
    find = functools.partial(string.find, sep)
    i = find()
    if i == 0 and no_empty:
        # Skip the leading separator so the first chunk is not empty.
        i = find(s)
    # Check for "not found" only after the skip above: the skip itself may
    # run off the end, and -1 must never be used as a slice index.
    if i < 0:
        yield string
        return
    yield string[:i]
    n = find(i + s)
    while n > 0:
        yield string[i:n]
        i, n = n, find(n + s)
    yield string[i:]
def finditer_caret(sep: str, string: str, *,
                   no_empty: bool = False) -> Iterator[str]:
    r"""Yield chunks of ``string`` cut where ``sep`` occurs at a line start.

    A chunk starts wherever ``sep`` directly follows a newline character,
    or at the very beginning of ``string``. The newline stays at the end
    of the preceding chunk. If there is no such position, ``string`` is
    yielded unchanged as the only chunk.

    Args:
        sep: Literal string that starts a chunk when found at a line start.
        string: The text to split.
        no_empty: If true, suppress the empty leading chunk that would
            otherwise be produced when ``string`` starts with ``sep``.

    Yields:
        The chunks in order; joined together they give ``string``.

    >>> list(finditer_caret('!', 'spam\n!eggs\n!ham'))
    ['spam\n', '!eggs\n', '!ham']
    >>> list(finditer_caret('X', 'spam\n!eggs\n!ham'))
    ['spam\n!eggs\n!ham']
    >>> list(finditer_caret('!', '!spam\n!eggs\n!ham'))
    ['', '!spam\n', '!eggs\n', '!ham']
    >>> list(finditer_caret('!', '!spam\n!eggs\n!ham', no_empty=True))
    ['!spam\n', '!eggs\n', '!ham']
    >>> list(finditer_caret('!', '\n!spam\n!eggs\n!ham'))
    ['\n', '!spam\n', '!eggs\n', '!ham']
    """
    # Later chunk starts are located by searching for newline + separator.
    find = functools.partial(string.find, '\n' + sep)
    # startswith() only inspects the prefix, whereas find(sep) == 0 would
    # scan the whole string when the separator is absent.
    if string.startswith(sep):
        if not no_empty:
            yield string[:0]
        i = find(len(sep))
    else:
        i = find()
    if i < 0:
        yield string
        return
    # Length of the search needle: the newline plus the separator.
    s = 1 + len(sep)
    # ``i`` indexes the newline; ``i + 1`` is where the next chunk begins.
    yield string[:i + 1]
    n = find(i + s)
    while n > 0:
        yield string[i + 1:n + 1]
        i, n = n, find(n + s)
    yield string[i + 1:]
def itersplit(sep: str, string: str, *,
              no_empty: bool = False) -> Iterator[str]:
    r"""Yield chunks of ``string`` cut at each occurrence of ``sep``.

    If ``sep`` starts with ``'^'``, that character is stripped and the rest
    only starts a chunk at the very beginning of ``string`` or directly
    after a newline character. Otherwise every occurrence of ``sep`` starts
    a chunk. If there is no such position, ``string`` is yielded unchanged
    as the only chunk.

    Args:
        sep: Literal separator, optionally prefixed with ``'^'`` to anchor
            it to line starts.
        string: The text to split.
        no_empty: If true, suppress the empty leading chunk that would
            otherwise be produced when ``string`` starts with the separator.

    Yields:
        The chunks in order; joined together they give ``string``.

    >>> list(itersplit('!', 'spam !eggs !ham'))
    ['spam ', '!eggs ', '!ham']
    >>> list(itersplit('^!', 'spam\n!eggs\n!ham'))
    ['spam\n', '!eggs\n', '!ham']
    >>> list(itersplit('X', 'spam !eggs !ham'))
    ['spam !eggs !ham']
    >>> list(itersplit('^X', 'spam\n!eggs\n!ham'))
    ['spam\n!eggs\n!ham']
    >>> list(itersplit('!', '!spam !eggs !ham'))
    ['', '!spam ', '!eggs ', '!ham']
    >>> list(itersplit('^!', '!spam\n!eggs\n!ham'))
    ['', '!spam\n', '!eggs\n', '!ham']
    >>> list(itersplit('!', '!spam !eggs !ham', no_empty=True))
    ['!spam ', '!eggs ', '!ham']
    >>> list(itersplit('^!', '!spam\n!eggs\n!ham', no_empty=True))
    ['!spam\n', '!eggs\n', '!ham']
    """
    sep, caret = (sep[1:], True) if sep.startswith('^') else (sep, False)
    # ``caret`` is a bool used as 0/1: in caret mode the search needle is
    # one character (the newline) longer, and chunk boundaries sit one
    # position after the index that find() returns.
    s = caret + len(sep)
    if caret:
        find = functools.partial(string.find, '\n' + sep)
        # startswith() only inspects the prefix, whereas find(sep) == 0
        # would scan the whole string when the separator is absent.
        if string.startswith(sep):
            if not no_empty:
                yield string[:0]
            i = find(len(sep))
        else:
            i = find()
    else:
        find = functools.partial(string.find, sep)
        i = find()
        if i == 0 and no_empty:
            # Skip the leading separator so the first chunk is not empty.
            i = find(s)
    if i < 0:
        yield string
        return
    yield string[:i + caret]
    n = find(i + s)
    while n > 0:
        yield string[i + caret:n + caret]
        i, n = n, find(n + s)
    yield string[i + caret:]
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment