Created
November 9, 2023 11:53
-
-
Save joao/9907c3c289d0821cc56505257c75e36f to your computer and use it in GitHub Desktop.
get excerpt from text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get excerpt of article ———————————————————————————————————————————————————————————————— | |
def get_excerpt(new_word: str, article_text: str, padding: int) -> "str | None":
    """Return a quoted, single-line excerpt of the article around a keyword.

    The first case-insensitive occurrence of ``new_word`` in ``article_text``
    is located. The window is widened by ``padding`` characters on each side
    and then extended outwards to the nearest space so no word is cut in half.
    The excerpt is wrapped in double quotes, with ``...`` on whichever side
    does not reach the edge of the article.

    Args:
        new_word: Keyword to look for (matched literally, ignoring case).
        article_text: Full text to search and cut the excerpt from.
        padding: Number of context characters to keep on each side of the
            keyword before snapping to word boundaries. Negative values are
            treated as 0.

    Returns:
        The quoted excerpt with all runs of whitespace (including line
        breaks) collapsed to single spaces, or ``None`` when the keyword
        does not occur in the text.
    """
    import re  # local import keeps the function self-contained

    # Search the ORIGINAL text rather than a lowercased copy: lowercasing can
    # change the string length for some Unicode characters, which would make
    # the match offsets point at the wrong place in the original text.
    match = re.search(re.escape(new_word), article_text, re.IGNORECASE)
    if match is None:
        return None  # Keyword not found in text

    padding = max(padding, 0)
    text_len = len(article_text)

    # Widen to the left, then back up to the previous space (or text start).
    start = max(0, match.start() - padding)
    while start > 0 and article_text[start] != ' ':
        start -= 1

    # Widen to the right, then advance to the next space (or text end).
    end = min(text_len, match.end() + padding)
    while end < text_len and article_text[end] != ' ':
        end += 1

    # Collapse line breaks and repeated whitespace into single spaces; this
    # also trims the boundary spaces the two loops above stopped on.
    body = ' '.join(article_text[start:end].split())

    # Ellipsis only on sides where the excerpt does not reach the text edge;
    # the closing quote is always emitted so the quotes stay balanced.
    prefix = '"...' if start > 0 else '"'
    suffix = '..."' if end < text_len else '"'
    return prefix + body + suffix
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment