Skip to content

Instantly share code, notes, and snippets.

View dardanxhymshiti's full-sized avatar
🤓
That tall guy who codes!

DardanX dardanxhymshiti

🤓
That tall guy who codes!
View GitHub Profile
from docx import Document
from docx.shared import Cm, Pt
# Sample news text (German Cup final report) kept as a module-level string.
# NOTE(review): presumably input for the document-building code that follows;
# its use is not visible in this excerpt -- confirm.
article_1 = """Bayern Munich came out on top in a thrilling German Cup final, beating Bayer Leverkusen 4-2 to secure its 20th title and remain on course for an historic treble.
David Alaba's stunning free kick and Serge Gnabry's clinical finish gave Bayern a commanding lead heading into half time and Hans-Dieter Flick's side seemingly already had one hand on the trophy.
However, Leverkusen responded well early in the second half and had a golden opportunity to halve the deficit through substitute Kevin Volland."""
article_2 = """(CNN)Liverpool got its Premier League title-winning celebrations back on track with a 2-0 win over Aston Villa, just days after being on the receiving end of a record-equaling defeat.
Many had suggested Jurgen Klopp's side was suffering from something of a hangover during Thursday's 4-0 demolition at the hands of Manchester City -- the joint-heaviest defeat by a team already crowned Premier League champion -- but Liverpool re
def describe_text(text):
import re, string
description = dict()
# remove punctuation marks
text_wo_punctuation_marks = re.sub(f'[%s]' % re.escape(string.punctuation), '', text)
# tokens of the text without punctuation marks
tokens_of_text_wo_punctuation_marks = text_wo_punctuation_marks.split(' ')
def remove_punctuation_marks(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters removed are exactly those listed in
    ``string.punctuation``; letters, digits, whitespace and any non-ASCII
    symbols are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the punctuation characters.
    """
    import string

    # A translation table that maps each punctuation character to "delete"
    # strips them in one pass, with no regex escaping to get wrong.
    return text.translate(str.maketrans('', '', string.punctuation))


# Test
text = """Hello, World!"""
def get_text_within_brackets(text):
    """Return the pieces of ``text`` enclosed in brackets.

    An opening ``(``, ``[`` or ``{`` starts a capture that runs (as short
    as possible) to the next ``)``, ``]`` or ``}``. The closing bracket is
    not required to be of the same kind as the opening one, and a capture
    never spans a line break because ``.`` does not match newlines.

    Args:
        text: The string to search.

    Returns:
        A list with the enclosed substrings, in order of appearance
        (brackets themselves excluded). Empty when nothing is found.
    """
    import re

    # Inside a character class "|" is a literal character, not alternation,
    # so it must not be listed here -- otherwise "a|b|c" would yield "b".
    pattern = r"[(\[{](.*?)[)\]}]"
    return re.findall(pattern, text)


# Test
text = '''I was very surprised (and it's pretty hard to surprise me!)... He [Felix] is a gret friends of me...'''
get_text_within_brackets(text)
def get_consequent_title_words(text):
import re
pattern_compiled = re.compile(r'([A-Z][^\.!?]*[\.!?])', re.M)
list_of_sentences = re.findall(pattern_compiled, text)
list_of_sentence_tokens = [sentence.split(' ') for sentence in list_of_sentences]
list_of_consequent_tokens = list()
for tokens in list_of_sentence_tokens:
temp_list_of_title_tokens = list()
for index, t in enumerate(tokens):
def get_context(text, list_of_tokens, context_span=20):
import re
context = []
for token in list_of_tokens:
all_occurences_indices = [m.start() for m in re.finditer(token, text)]
for index in all_occurences_indices:
left_index = max(index - context_span, 0)
right_index = min(index + context_span, len(text))
substring = text[left_index: right_index].strip()
def get_sentences(text):
    """Split ``text`` into sentences with a simple regular expression.

    A sentence is taken to start at an uppercase ASCII letter and to run up
    to and including the first ``.``, ``!`` or ``?`` that follows. Text
    that does not start with an uppercase letter, or that has no
    terminating mark, is not returned. Because every period ends a
    sentence, abbreviations such as "U.S." are split as well.

    Args:
        text: The string to split.

    Returns:
        A list of the sentences found, in order, each including its
        terminating punctuation mark.
    """
    import re

    # The pattern has no ^/$ anchors, so no MULTILINE flag (and no separate
    # compiled copy of the pattern) is needed.
    pattern = r'([A-Z][^\.!?]*[\.!?])'
    return re.findall(pattern, text)


# Test
text = """This is the most frequent question we're asked by prospective students. And our response? Absolutely! We've trained people from all walks of life."""
def get_capital_words(text):
    """Return the all-uppercase words found in ``text``.

    A word qualifies when it consists solely of two or more uppercase
    ASCII letters and is delimited by word boundaries, so a single capital
    such as "A" and mixed tokens such as "Hello" are ignored.

    Args:
        text: The string to search.

    Returns:
        A list of the matching words in order of appearance.
    """
    import re

    pattern = r'(\b[A-Z]{2,}\b)'
    return re.findall(pattern, text)


# Test
text = """Thank you! Your customer service request has been logged. A specialist will reach out by EOD"""
get_capital_words(text)
def get_text_within_quotes(text):
    """Return the pieces of ``text`` enclosed in straight double quotes.

    Quotes are paired left to right and each capture is as short as
    possible. A capture never spans a line break because ``.`` does not
    match newlines, and an unpaired trailing quote yields nothing.

    Args:
        text: The string to search.

    Returns:
        A list of the quoted substrings (quote characters excluded), in
        order of appearance.
    """
    import re

    # Raw string: the regex is readable as written, with no escaped quotes.
    pattern = r'"(.*?)"'
    return re.findall(pattern, text)


# Test
text = """The sign said, "Walk". Then it said, "Don't Walk" then, "Walk" all within thirty seconds"""
get_text_within_quotes(text)
def get_numbers_from_text(text):
    """Extract number-like substrings from ``text``.

    Recognised forms: an optional sign, an optional leading decimal point,
    digits with optional comma-separated groups of three (``9,585``), an
    optional fractional part (``.50``) and an optional exponent (``e-3``).
    The matches are returned as they appear in the text; they are not
    converted to ``int`` or ``float``.

    Args:
        text: The string to search.

    Returns:
        A list of the matched substrings in order of appearance.
    """
    import re

    # Raw string, so "\d" reaches the regex engine instead of being an
    # invalid string escape. The fractional part requires at least one
    # digit after the dot, so the period ending a sentence ("in 2020.") is
    # not captured as part of the number.
    pattern = r'[-+]?\.?\d+(?:,\d{3})*(?:\.\d+)?(?:[eE][-+]?\d+)?'
    return re.findall(pattern, text)


# Test
text = """A rise in cases was re[prted acrpss a staggering 36 US states last week. In Florida, officals recorded 9,585 new cases on Saturday."""
get_numbers_from_text(text)