Skip to content

Instantly share code, notes, and snippets.

@dardanxhymshiti
Last active July 3, 2020 14:31
Show Gist options
  • Save dardanxhymshiti/4328a75149ff06e70df4a6cc440dc40e to your computer and use it in GitHub Desktop.
Save dardanxhymshiti/4328a75149ff06e70df4a6cc440dc40e to your computer and use it in GitHub Desktop.
def get_context(text, list_of_tokens, context_span=20):
    """Return the words surrounding every occurrence of each token in ``text``.

    For every literal (non-overlapping) occurrence of a token, a window of
    ``context_span`` characters on each side of the position where the match
    *starts* is cut out of ``text``. The window is measured from the start of
    the match, so the token itself counts against the right-hand span.

    The first and/or last space-separated piece of the window is discarded
    whenever the window stops short of the corresponding end of ``text``,
    because that piece is usually a fragment of a word. A side of the window
    that reaches the very start or end of ``text`` is kept untouched.

    Args:
        text: The string to search in.
        list_of_tokens: Iterable of literal strings to look for. Tokens are
            matched verbatim (regex metacharacters have no special meaning);
            empty tokens are ignored.
        context_span: Number of characters taken on each side of the start
            of a match.

    Returns:
        A list of context strings, ordered by token and then by position of
        the occurrence in ``text``.
    """
    import re

    context = []
    text_length = len(text)
    for token in list_of_tokens:
        # An empty pattern matches at every position and would produce one
        # meaningless "context" per character.
        if not token:
            continue
        # Escape the token so that characters such as "+", "." or "(" are
        # searched for literally instead of being interpreted as a regex.
        for match in re.finditer(re.escape(token), text):
            index = match.start()
            left_index = max(index - context_span, 0)
            right_index = min(index + context_span, text_length)
            words = text[left_index:right_index].strip().split(' ')
            # Each side is trimmed independently: only a side that was cut
            # inside the text can start/end with a word fragment. This keeps
            # the last word when the window spans the whole text.
            if left_index > 0:
                words = words[1:]
            if right_index < text_length:
                words = words[:-1]
            context.append(' '.join(words))
    return context
# Usage example: collect the context around every "Trump" in a sample sentence.
text = (
    "Twitter said Tuesday that Trump's tweets about mail-in votin did not "
    "violate the company's rules because they don't explicitely discourage "
    "people from voting. But, the company said, the label offers context "
    "surrounding Trump's claims"
)
get_context(text=text, list_of_tokens=['Trump'], context_span=30)
# The call above evaluates to:
# [ "Twitter said Tuesday that Trump's tweets about mail-in", "offers context surrounding Trump's claims" ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment