Skip to content

Instantly share code, notes, and snippets.

@pombredanne
Created September 22, 2022 10:13
Show Gist options
  • Save pombredanne/131e0cbc5b99e2e0dcd135a15f60c4a4 to your computer and use it in GitHub Desktop.
Save pombredanne/131e0cbc5b99e2e0dcd135a15f60c4a4 to your computer and use it in GitHub Desktop.
Grouping tokens
def group_tokens(tokens):
    """
    Return a list of token groups, where each group is a list of consecutive
    tokens that share the same ``match_ids``.

    ``tokens`` is an iterable of objects with a ``match_ids`` attribute. The
    input order is preserved. A new group is started each time the
    ``match_ids`` of a token differ from the ``match_ids`` of the token right
    before it, so tokens with equal ``match_ids`` that are not adjacent end up
    in separate groups. An empty input yields an empty list.

    For example::

        >>> tok1 = Token(value="tok1", line_num=1, match_ids=[0])
        >>> tok2 = Token(value="tok2", line_num=1, match_ids=[1])
        >>> tok3 = Token(value="tok3", line_num=1, match_ids=[3])
        >>> results = group_tokens([tok1, tok2, tok3])
        >>> expected = [[tok1], [tok2], [tok3]]
        >>> assert results == expected, f"Failed: {results!r}"

    For overlapping matches we have multiple match ids::

        >>> tok1 = Token(value="tok1", line_num=1, match_ids=[0, 1])
        >>> tok2 = Token(value="tok2", line_num=1, match_ids=[0, 1])
        >>> tok3 = Token(value="tok3", line_num=1, match_ids=[1, 2])
        >>> tok4 = Token(value="tok4", line_num=1, match_ids=[1, 2])
        >>> tok5 = Token(value="tok5", line_num=1, match_ids=[3])
        >>> results = group_tokens([tok1, tok2, tok3, tok4, tok5])
        >>> expected = [[tok1, tok2], [tok3, tok4], [tok5]]
        >>> assert results == expected, f"Failed: {results!r}"
    """
    groups = []
    # None means "no group started yet"; once set, a group is never empty
    # because it is always created together with its first token.
    current_group = None
    for tok in tokens:
        starts_new_group = (
            current_group is None
            # Compare against the last token already placed in the group.
            or current_group[-1].match_ids != tok.match_ids
        )
        if starts_new_group:
            current_group = [tok]
            groups.append(current_group)
        else:
            current_group.append(tok)
    return groups
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment