This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from docx import Document | |
from docx.shared import Cm, Pt | |
article_1 = """Bayern Munich came out on top in a thrilling German Cup final, beating Bayer Leverkusen 4-2 to secure its 20th title and remain on course for an historic treble. | |
David Alaba's stunning free kick and Serge Gnabry's clinical finish gave Bayern a commanding lead heading into half time and Hans-Dieter Flick's side seemingly already had one hand on the trophy. | |
However, Leverkusen responded well early in the second half and had a golden opportunity to halve the deficit through substitute Kevin Volland.""" | |
article_2 = """(CNN)Liverpool got its Premier League title-winning celebrations back on track with a 2-0 win over Aston Villa, just days after being on the receiving end of a record-equaling defeat. | |
Many had suggested Jurgen Klopp's side was suffering from something of a hangover during Thursday's 4-0 demolition at the hands of Manchester City -- the joint-heaviest defeat by a team already crowned Premier League champion -- but Liverpool re |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def describe_text(text): | |
import re, string | |
description = dict() | |
# remove punctuation marks | |
text_wo_punctuation_marks = re.sub(f'[%s]' % re.escape(string.punctuation), '', text) | |
# tokens of the text without punctuation marks | |
tokens_of_text_wo_punctuation_marks = text_wo_punctuation_marks.split(' ') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_punctuation_marks(text):
    """Return *text* with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``;
    letters, digits, whitespace and non-ASCII characters are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation marks.
    """
    import string

    # A translation table that maps each punctuation character to "delete"
    # does the same job as the former regex character class, without having
    # to escape and build a pattern on every call.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)
# Test | |
text = """Hello, World!""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_text_within_brackets(text):
    """Return the pieces of *text* enclosed in ``()``, ``[]`` or ``{}``.

    Each opening bracket is matched with the nearest closing bracket of the
    same type on the same line (``.`` does not cross newlines). Nested
    brackets of the same type are not balanced: the match ends at the first
    closer.

    Args:
        text: The string to scan.

    Returns:
        A list with the content of every bracket pair, without the brackets,
        in order of appearance. Empty pairs contribute an empty string.
    """
    import re

    # One alternative per bracket type so an opener can only be closed by its
    # own partner. The previous character classes contained "|", which inside
    # [...] is a literal pipe, so text between two pipes was reported as well.
    pattern = r"\((.*?)\)|\[(.*?)\]|\{(.*?)\}"
    # Exactly one of the three groups participates in each match; lastindex
    # is the number of that group.
    return [match.group(match.lastindex) for match in re.finditer(pattern, text)]


# Test
text = '''I was very surprised (and it's pretty hard to surprise me!)... He [Felix] is a gret friends of me...'''
get_text_within_brackets(text)  # -> ["and it's pretty hard to surprise me!", 'Felix']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_consequent_title_words(text): | |
import re | |
pattern_compiled = re.compile(r'([A-Z][^\.!?]*[\.!?])', re.M) | |
list_of_sentences = re.findall(pattern_compiled, text) | |
list_of_sentence_tokens = [sentence.split(' ') for sentence in list_of_sentences] | |
list_of_consequent_tokens = list() | |
for tokens in list_of_sentence_tokens: | |
temp_list_of_title_tokens = list() | |
for index, t in enumerate(tokens): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_context(text, list_of_tokens, context_span=20): | |
import re | |
context = [] | |
for token in list_of_tokens: | |
all_occurences_indices = [m.start() for m in re.finditer(token, text)] | |
for index in all_occurences_indices: | |
left_index = max(index - context_span, 0) | |
right_index = min(index + context_span, len(text)) | |
substring = text[left_index: right_index].strip() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_sentences(text):
    """Split *text* into sentences with a simple regular expression.

    A sentence is a run that starts with an ASCII capital letter and extends
    up to and including the first ``.``, ``!`` or ``?``. Text that does not
    begin with a capital letter, or that has no terminating mark, is not
    returned. Any period ends the match, including one inside a number or
    an abbreviation.

    Args:
        text: The string to scan.

    Returns:
        A list of the matched sentences, in order of appearance.
    """
    import re

    # [A-Z] opens a sentence, [^.!?]* runs (across newlines) to the first
    # terminator, and [.!?] includes that terminator in the result.
    pattern = r'[A-Z][^.!?]*[.!?]'
    return re.findall(pattern, text)
# Test | |
text = """This is the most frequent question we're asked by prospective students. And our response? Absolutely! We've trained people from all walks of life.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_capital_words(text):
    """Return the all-caps words (e.g. acronyms) found in *text*.

    A word qualifies when it is made up solely of two or more ASCII capital
    letters and is delimited by word boundaries on both sides, so single
    capitals such as "A" or "I" and mixed tokens such as "ABC123" are not
    returned.

    Args:
        text: The string to scan.

    Returns:
        A list of the matching words, in order of appearance.
    """
    import re

    pattern = r'\b[A-Z]{2,}\b'
    return re.findall(pattern, text)


# Test
text = """Thank you! Your customer service request has been logged. A specialist will reach out by EOD"""
get_capital_words(text)  # -> ['EOD']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_text_within_quotes(text):
    """Return the pieces of *text* enclosed in straight double quotes.

    Quotes are paired left to right: each ``"`` is matched with the next
    ``"`` on the same line (``.`` does not cross newlines). A trailing
    unpaired quote yields nothing.

    Args:
        text: The string to scan.

    Returns:
        A list with the content of every quoted span, without the quote
        characters, in order of appearance.
    """
    import re

    # Raw string with single-quote delimiters: no backslash escapes needed
    # around the literal double quotes.
    pattern = r'"(.*?)"'
    return re.findall(pattern, text)


# Test
text = """The sign said, "Walk". Then it said, "Don't Walk" then, "Walk" all within thirty seconds"""
get_text_within_quotes(text)  # -> ['Walk', "Don't Walk", 'Walk']
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_numbers_from_text(text):
    """Extract number-like substrings from *text*.

    Recognised forms include plain integers ("36"), comma-grouped thousands
    ("9,585"), decimals ("3.14", ".5"), an optional leading sign and an
    optional exponent ("-2.5e3"). A "-" or "+" directly before a digit is
    taken as a sign, so "4-2" yields "4" and "-2".

    Args:
        text: The string to scan.

    Returns:
        A list of the matched substrings (as strings, not converted to
        numeric types), in order of appearance.
    """
    import re

    # Raw string: "\d" and "\." are not valid escapes in a normal string
    # literal and trigger a warning on modern Python versions.
    # The decimal part is (?:\.\d+)? so that a period is only consumed when
    # digits follow it; a sentence-ending period after a number ("36.") is
    # therefore no longer glued onto the result. The trailing \d* is kept
    # from the original pattern for digits that follow a thousands group.
    pattern = r'[-+]?\.?\d+(?:,\d{3})*(?:\.\d+)?\d*(?:[eE][-+]?\d+)?'
    return re.findall(pattern, text)


# Test
text = """A rise in cases was re[prted acrpss a staggering 36 US states last week. In Florida, officals recorded 9,585 new cases on Saturday."""
get_numbers_from_text(text)  # -> ['36', '9,585']
NewerOlder