Skip to content

Instantly share code, notes, and snippets.

View Sirsirious's full-sized avatar

Tiago Duque Sirsirious

  • Brazil
View GitHub Profile
@Sirsirious
Sirsirious / structures.py
Created February 4, 2020 18:50
The tokenize function
def tokenize(raw_input_sentence, join_split_text = True, split_text_char = '\-', punctuation_patterns= DEFAULT_PUNCTUATIONS, split_characters = r'\s|\t|\n|\r', delimiter_token='<SPLIT>'):
working_sentence = raw_input_sentence
#First deal with possible word splits:
if join_split_text:
working_sentence = re.sub('[a-z]+('+split_text_char+'[\n])[a-z]+','', working_sentence)
#Escape punctuation
for punct in punctuation_patterns:
working_sentence = re.sub(punct, " \g<0> ", working_sentence)
#Split at any split_characters
working_sentence = re.sub(split_characters, delimiter_token, working_sentence)
@Sirsirious
Sirsirious / test_tokenizer.py
Last active February 4, 2020 19:17
A simple script that loads the Declaration of Human Rights and tests our tokenization/sentencizing functions.
import pprint
declaration = []
with open('data.txt','r') as file:
declaration+=file.readlines()
pp = pprint.PrettyPrinter(indent=2, compact=True)
for line in declaration:
res = {'Document': line, 'Sentences':[]}
@Sirsirious
Sirsirious / stemming.py
Created February 7, 2020 19:21
The classes for our stemmer algorithm.
class AbstractStemmer:
    """Base stemmer type whose ``stem`` is a do-nothing placeholder."""

    def stem(self, word):
        """Accept ``word`` and do nothing with it; the result is ``None``."""
        return None
class PorterStemmer(AbstractStemmer):
@Sirsirious
Sirsirious / stemming.py
Last active February 7, 2020 19:37
Function to divide a word into groups of vowels or consonants.
class PorterStemmer(AbstractStemmer):
consonants = "bcdfghjklmnpqrstwxz"
special_case = "y"
vowels = "aeiou"
def _divide_into_groups(self, word):
groups = []
preceding = ""
for idx, letter in enumerate(word.lower()):
if preceding == "":
@Sirsirious
Sirsirious / stemming.py
Created February 7, 2020 19:39
A method to determine the class of a group and a method to encode a whole word - Porter Stemmer parts.
def _determine_class(self, group):
if group[0] in self.consonants:
return 'C'
return 'V'
def _encode_word(self, word):
    """Return one ``'C'``/``'V'`` label per letter group of ``word``.

    The groups come from ``self._divide_into_groups`` and each one is
    labelled with ``self._determine_class``, in order.
    """
    return [self._determine_class(chunk) for chunk in self._divide_into_groups(word)]
@Sirsirious
Sirsirious / stemming.py
Created February 7, 2020 19:42
Method to determine the measure m (the number of vowel-consonant sequences) of a word in the Porter Stemmer.
def _det_m(self, word):
    """Compute the value ``m`` for ``word`` from its group labels.

    The labels returned by ``self._encode_word`` are trimmed of a leading
    ``'C'`` and a trailing ``'V'``; ``m`` is half the number of labels
    that remain (integer division). Fewer than two labels gives 0.
    """
    labels = self._encode_word(word)
    if len(labels) < 2:
        return 0
    # With at least two labels the first and last are distinct positions,
    # so both trims can be decided up front and applied as a single slice.
    start = 1 if labels[0] == 'C' else 0
    stop = len(labels) - 1 if labels[-1] == 'V' else len(labels)
    return len(labels[start:stop]) // 2
@Sirsirious
Sirsirious / stemming.py
Created February 10, 2020 14:43
Checks used by the stemmer: whether a stem ends with a given letter, contains a vowel, ends with a double consonant, or ends with a consonant-vowel-consonant (cvc) sequence.
def _chk_LT(self, stem, lt):
for letter in lt:
if stem.endswith(letter):
return True
return False
def _chk_v(self, stem):
for letter in stem:
if letter in self.vowels:
return True
@Sirsirious
Sirsirious / stemming.py
Created February 10, 2020 17:55
First Step of the Stemmer.
def _porter_step_1(self, word):
"""
Deals with plurals and past participles.
"""
stem = word
stepb2 = False
#Step 1a
if stem.endswith('sses'):
stem = stem[:-2]
@Sirsirious
Sirsirious / stemming.py
Created February 10, 2020 18:00
Step 2 of Porter Stemmer.
def _porter_step_2(self, stem):
    """Apply step 2 of the Porter algorithm: map double suffixes to single ones.

    The first suffix in ``pair_tests`` that ``stem`` ends with is selected
    (longer suffixes are listed before the shorter ones they contain, e.g.
    'ization' before 'ation'). The rewrite happens only if the part of the
    word *preceding* that suffix has measure m > 0, as the algorithm
    requires; otherwise the word is returned unchanged and no further
    suffix is tried.

    Returns the rewritten word, or ``stem`` untouched when no rule applies.
    """
    pair_tests = [('ational', 'ate'), ('tional', 'tion'), ('enci', 'ence'), ('anci', 'ance'), ('izer', 'ize'),
                  ('abli', 'able'), ('alli', 'al'), ('entli', 'ent'), ('eli', 'e'), ('ousli', 'ous'), ('ization', 'ize'),
                  ('ation', 'ate'), ('ator', 'ate'), ('alism', 'al'), ('iveness', 'ive'), ('fulness', 'ful'),
                  ('ousness', 'ous'), ('aliti', 'al'), ('iviti', 'ive'), ('biliti', 'ble')]
    for term, subs in pair_tests:
        if stem.endswith(term):
            base = stem[:-len(term)]
            # The measure condition applies to what is left after removing
            # the suffix, not to the whole word (e.g. "rational" must stay
            # as is because "r" has m == 0).
            if self._det_m(base) > 0:
                return base + subs
            return stem
    return stem
@Sirsirious
Sirsirious / stemming.py
Created February 10, 2020 18:03
Step 3 of the Porter Stemmer.
def _porter_step_3(self, stem):
    """Apply step 3 of the Porter algorithm: simplify or drop derivational suffixes.

    The first suffix in ``pair_tests`` that ``stem`` ends with is selected.
    It is replaced only if the part of the word *preceding* the suffix has
    measure m > 0; otherwise the word is returned unchanged and no further
    suffix is tried.

    Returns the rewritten word, or ``stem`` untouched when no rule applies.
    """
    pair_tests = [('icate', 'ic'), ('ative', ''), ('alize', 'al'), ('iciti', 'ic'),
                  ('ical', 'ic'), ('ful', ''), ('ness', '')]
    for term, subs in pair_tests:
        if stem.endswith(term):
            base = stem[:-len(term)]
            # Measure the remaining stem, not the full word, so that short
            # words consisting of little more than the suffix are kept.
            if self._det_m(base) > 0:
                return base + subs
            return stem
    return stem