This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def tokenize(raw_input_sentence, join_split_text = True, split_text_char = '\-', punctuation_patterns= DEFAULT_PUNCTUATIONS, split_characters = r'\s|\t|\n|\r', delimiter_token='<SPLIT>'): | |
| working_sentence = raw_input_sentence | |
| #First deal with possible word splits: | |
| if join_split_text: | |
| working_sentence = re.sub('[a-z]+('+split_text_char+'[\n])[a-z]+','', working_sentence) | |
| #Escape punctuation | |
| for punct in punctuation_patterns: | |
| working_sentence = re.sub(punct, " \g<0> ", working_sentence) | |
| #Split at any split_characters | |
| working_sentence = re.sub(split_characters, delimiter_token, working_sentence) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pprint | |
| declaration = [] | |
| with open('data.txt','r') as file: | |
| declaration+=file.readlines() | |
| pp = pprint.PrettyPrinter(indent=2, compact=True) | |
| for line in declaration: | |
| res = {'Document': line, 'Sentences':[]} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class AbstractStemmer:
    """
    Common base for stemmers.

    The base implementation of stem() does nothing and returns None;
    concrete stemmers such as PorterStemmer derive from this class.
    """

    def stem(self, word):
        """
        Placeholder: accepts a word and returns None.
        """
        return None
| class PorterStemmer(AbstractStemmer): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| class PorterStemmer(AbstractStemmer): | |
# Letters that always count as consonants when a letter group is classified.
# 'v' was missing from this string, which made every group starting with
# "v" fall through to the vowel label.
consonants = "bcdfghjklmnpqrstvwxz"
# 'y' is deliberately in neither set.
# NOTE(review): presumably resolved by context while grouping letters;
# the grouping code is not fully visible here, so confirm.
special_case = "y"
# Letters that always count as vowels (used by the vowel-presence check).
vowels = "aeiou"
| def _divide_into_groups(self, word): | |
| groups = [] | |
| preceding = "" | |
| for idx, letter in enumerate(word.lower()): | |
| if preceding == "": |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _determine_class(self, group): | |
| if group[0] in self.consonants: | |
| return 'C' | |
| return 'V' | |
| def _encode_word(self, word): | |
| encoded = self._divide_into_groups(word) | |
| classified = [self._determine_class(group) for group in encoded] | |
| return classified |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _det_m(self, word): | |
| classes = self._encode_word(word) | |
| if len(classes) < 2: | |
| return 0 | |
| if classes[0] == 'C': | |
| classes = classes[1:] | |
| if classes[-1] == 'V': | |
| classes = classes[:len(classes)-1] | |
| m = len(classes)//2 if (len(classes)/2) >= 1 else 0 | |
| return m |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _chk_LT(self, stem, lt): | |
| for letter in lt: | |
| if stem.endswith(letter): | |
| return True | |
| return False | |
| def _chk_v(self, stem): | |
| for letter in stem: | |
| if letter in self.vowels: | |
| return True |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _porter_step_1(self, word): | |
| """ | |
| Deals with plurals and past participles. | |
| """ | |
| stem = word | |
| stepb2 = False | |
| #Step 1a | |
| if stem.endswith('sses'): | |
| stem = stem[:-2] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _porter_step_2(self, stem): | |
| pair_tests = [('ational','ate'), ('tional','tion'), ('enci','ence'), ('anci','ance'), ('izer', 'ize'), | |
| ('abli','able'), ('alli','al'), ('entli', 'ent'), ('eli', 'e'), ('ousli', 'ous'), ('ization', 'ize'), | |
| ('ation', 'ate'), ('ator', 'ate'), ('alism', 'al'), ('iveness', 'ive'), ('fulness', 'ful'), | |
| ('ousness', 'ous'), ('aliti','al'), ('ivit', 'ive'), ('biliti','ble')] | |
| if self._det_m(stem) > 0: | |
| for term, subs in pair_tests: | |
| if stem.endswith(term): | |
| return stem[:-len(term)]+subs | |
| return stem |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def _porter_step_3(self, stem): | |
| pair_tests = [('icate','ic'),('ative',''),('alize','al'),('iciti','ic'),('ical','ic'),('ful',''),('ness','')] | |
| if self._det_m(stem) > 0: | |
| for term, subs in pair_tests: | |
| if stem.endswith(term): | |
| return stem[:-len(term)]+subs | |
| return stem |