Last active
July 3, 2020 14:31
-
-
Save dardanxhymshiti/4328a75149ff06e70df4a6cc440dc40e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_context(text, list_of_tokens, context_span=20):
    """Return the words surrounding every occurrence of each token in ``text``.

    For each match, a character window is taken that starts ``context_span``
    characters before the first character of the match and ends
    ``context_span`` characters after that same position (so the token itself
    counts against the right-hand span). The window is clamped to the bounds
    of ``text``. On every side where the window was cut short of the text
    boundary, the outermost word is dropped so the snippet does not begin or
    end with a partial word.

    Args:
        text: The string to search.
        list_of_tokens: Iterable of literal strings to look for. Tokens are
            matched verbatim (regex metacharacters have no special meaning)
            and case-sensitively. Empty tokens are ignored.
        context_span: Number of characters to keep on each side of the start
            of a match.

    Returns:
        A list of context snippets, ordered by token and then by position of
        the match in ``text``. A snippet can be an empty string when the
        window is too narrow to hold a whole word after trimming.
    """
    import re

    context = []
    text_length = len(text)
    for token in list_of_tokens:
        if not token:
            # An empty pattern matches at every position of the text, which
            # would flood the result with meaningless snippets.
            continue
        # Tokens are literal strings: escape them so characters such as
        # '.', '+' or '(' are not interpreted as regex syntax.
        for match in re.finditer(re.escape(token), text):
            index = match.start()
            left_index = max(index - context_span, 0)
            right_index = min(index + context_span, text_length)
            words = text[left_index:right_index].strip().split(' ')
            # Trim each side independently, and only when that side of the
            # window was cut inside the text; a side that reaches the text
            # boundary already ends on a complete word.
            if left_index > 0:
                words = words[1:]
            if right_index < text_length:
                words = words[:-1]
            context.append(' '.join(words))
    return context
# Example usage: collect the context around each mention of "Trump".
text = (
    "Twitter said Tuesday that Trump's tweets about mail-in votin did not "
    "violate the company's rules because they don't explicitely discourage "
    "people from voting. But, the company said, the label offers context "
    "surrounding Trump's claims"
)
get_context(text=text, list_of_tokens=['Trump'], context_span=30)
# Value returned by the call above:
# [
#     "Twitter said Tuesday that Trump's tweets about mail-in",
#     "offers context surrounding Trump's claims",
# ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment