This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class DummySentencizer:
    """Naive sentence splitter driven by a fixed set of boundary characters.

    The input text is split right after every occurrence of one of the
    ``split_characters``; the boundary character stays attached to the
    sentence it ends. The resulting sentences are available as the
    ``sentences`` list and through the iterator protocol.

    The object is its own iterator: it keeps a single cursor, so it can be
    consumed only once (a second ``for`` loop yields nothing).
    """

    def __init__(self, input_text, split_characters=['.', '?', '!', ':'], delimiter_token='<SPLIT>'):
        """Split ``input_text`` into sentences immediately.

        Args:
            input_text: Text to split; coerced with ``str()``.
            split_characters: Iterable of strings that end a sentence.
            delimiter_token: Temporary marker inserted after each boundary
                and then used to split the text. It should not occur in
                the text itself.
        """
        self.sentences = []
        self.raw = str(input_text)
        # Copy so that neither the shared default list nor a caller-owned
        # list is aliased by this instance.
        self._split_characters = list(split_characters)
        self._delimiter_token = delimiter_token
        self._index = 0
        self._sentencize()

    def _sentencize(self):
        """Populate ``self.sentences`` from ``self.raw``."""
        marked = self.raw
        for character in self._split_characters:
            # Keep the boundary character and append the split marker to it.
            marked = marked.replace(character, character + self._delimiter_token)
        # Strip first, then filter: a whitespace-only fragment (e.g. the
        # trailing space after the final period) must not become an empty
        # sentence.
        stripped = (piece.strip() for piece in marked.split(self._delimiter_token))
        self.sentences = [piece for piece in stripped if piece]

    def __iter__(self):
        """Return the object itself; iteration resumes at the current cursor."""
        return self

    def __next__(self):
        """Return the next sentence, or raise ``StopIteration`` when exhausted."""
        if self._index < len(self.sentences):
            result = self.sentences[self._index]
            self._index += 1
            return result
        raise StopIteration
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import string | |
class DummyTokenizer:
    """Naive tokenizer that splits a sentence on boundary strings.

    Every punctuation character is first padded with spaces so that it ends
    up as a token of its own; the text is then split on each of the
    ``token_boundaries``. Boundary strings themselves are discarded, so with
    the default arguments a hyphen (both a punctuation character and a
    boundary) does not appear in the output.

    The object is its own iterator: it keeps a single cursor, so it can be
    consumed only once.
    """

    def __init__(self, sentence, token_boundaries=[' ', '-'],
                 punctuations=string.punctuation, delimiter_token='<SPLIT>'):
        """Tokenize ``sentence`` immediately.

        Args:
            sentence: Text to tokenize; coerced with ``str()``.
            token_boundaries: Iterable of strings that separate tokens.
            punctuations: Iterable of strings to isolate as separate tokens.
            delimiter_token: Temporary marker substituted for each boundary
                and then used to split the text.
        """
        self.tokens = []
        self.raw = str(sentence)
        # Copy so that neither the shared default list nor a caller-owned
        # list is aliased by this instance.
        self._token_boundaries = list(token_boundaries)
        self._delimiter_token = delimiter_token
        self._punctuations = punctuations
        self._index = 0
        # Without this call ``tokens`` would stay empty forever.
        self._tokenize()

    def _tokenize(self):
        """Populate ``self.tokens`` from ``self.raw``."""
        marked = self.raw
        for punctuation in self._punctuations:
            # Surround punctuation with spaces so it is split off on its own.
            marked = marked.replace(punctuation, " " + punctuation + " ")
        for boundary in self._token_boundaries:
            marked = marked.replace(boundary, self._delimiter_token)
        # Strip first, then filter: fragments consisting only of whitespace
        # (newlines, tabs) must not turn into empty tokens.
        stripped = (piece.strip() for piece in marked.split(self._delimiter_token))
        self.tokens = [piece for piece in stripped if piece]

    def __iter__(self):
        """Return the object itself; iteration resumes at the current cursor."""
        return self

    def __next__(self):
        """Return the next token, or raise ``StopIteration`` when exhausted."""
        if self._index < len(self.tokens):
            result = self.tokens[self._index]
            self._index += 1
            return result
        raise StopIteration
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Document:
    """Holds a raw text together with the result of sentencizing it."""

    def __init__(self, document_text):
        """Keep ``document_text`` and run ``sentencize`` over it.

        Args:
            document_text: The text handed to ``sentencize``; stored
                unchanged as ``raw``.
        """
        # Presumably an iteration cursor, as in the sibling classes; the
        # code that uses it is not part of this excerpt.
        self._index = 0
        self.raw = document_text
        self.sentences = sentencize(document_text)
    #[...]
| class Sentence: | |
| def __init__(self, start_position, end_position, raw_document_reference): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #[...] | |
def get(self):
    """Return the text this object represents.

    The literal marker '<SOS>' is returned when ``self.SOS`` is truthy
    (checked first), '<EOS>' when ``self.EOS`` is truthy; otherwise the
    slice ``[start_pos:end_pos]`` of the underlying sentence string.
    """
    # Marker objects short-circuit before any slicing happens.
    if self.SOS:
        return '<SOS>'
    if self.EOS:
        return '<EOS>'
    text = self._sentence_string
    return text[self.start_pos:self.end_pos]
| # Displays the Token value in the terminal if the variable is called | |
| def __repr__(self): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Regular expressions whose matches mark the end of a sentence.
# Raw strings keep the backslashes literal: in a normal string literal
# sequences such as "\." or "\s" are invalid escapes and trigger warnings on
# current Python versions. The pattern text itself is unchanged.
DEFAULT_SENTENCE_BOUNDARIES = [
    r'(?<=[0-9]|[^0-9.])(\.)(?=[^0-9.]|[^0-9.]|[\s]|$)',
    r'\.{2,}',
    r'\!+',
    r'\:+',
    r'\?+',
]
| """ | |
| Breaking it down: | |
| (?<=[0-9]|[^0-9.])(\.)(?=[^0-9.]|[^0-9.]|[\s]|$) -> matches any period that is preceded by a character other than a period and is not followed by a digit or another period. | |
| This keeps the algorithm from splitting sentences at decimal numbers or inside ellipses. | |
| \.{2,} -> captures ellipses (runs of two or more periods). | |
| \!+ -> captures series of exclamation points. | |
| \:+ -> captures series of colons. | |
| \?+ -> captures series of question marks. | |
| """ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| #[...] | |
def sentencize(raw_input_document, sentence_boundaries = DEFAULT_SENTENCE_BOUNDARIES, delimiter_token='<SPLIT>'):
    """Split a text into sentence strings using regex boundary patterns.

    Each pattern in ``sentence_boundaries`` is applied in turn; every match
    is kept and followed by ``delimiter_token``. The marked text is then
    split on that token.

    Args:
        raw_input_document: The text to split (a ``str``).
        sentence_boundaries: Iterable of regular expressions whose matches
            end a sentence.
        delimiter_token: Temporary marker inserted after each boundary match.

    Returns:
        The list of stripped, non-empty sentence strings.
    """
    def _append_delimiter(match):
        # A callable replacement inserts the token verbatim, so backslashes
        # or group references inside it are never interpreted by ``re``.
        return match.group(0) + delimiter_token

    working_document = raw_input_document
    for pattern in sentence_boundaries:
        working_document = re.sub(pattern, _append_delimiter, working_document, flags=re.UNICODE)
    stripped = (piece.strip() for piece in working_document.split(delimiter_token))
    list_of_string_sentences = [piece for piece in stripped if piece]
    return list_of_string_sentences
OlderNewer