Last active
January 10, 2021 23:32
-
-
Save sharvaridhote/2741dbbb44a6f67f5a1df12b99404c5e to your computer and use it in GitHub Desktop.
Custom Sentence Segmentation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def custom_sentence_boundary(doc):
    """Force a sentence start on the token that follows each ']' token.

    Intended as a pipeline component: it receives a document, flags
    sentence starts in place, and returns the same document.

    Args:
        doc: An indexable sequence of tokens exposing ``text`` and a
            writable ``is_sent_start`` attribute (e.g. a spaCy ``Doc``).

    Returns:
        The same ``doc`` object, modified in place.
    """
    last_index = len(doc) - 1
    for i, token in enumerate(doc):
        # A closing bracket that is the very last token has no successor to
        # mark; indexing doc[i + 1] there would raise IndexError.
        if token.text == ']' and i < last_index:
            # `is_sent_start` is the documented attribute; `sent_start` is a
            # deprecated alias for it in spaCy.
            doc[i + 1].is_sent_start = True
    return doc
def sentence_tokenization(text_batches, output_path='sents_list.csv'):
    """Split each text into sentences, save them to CSV, and return them.

    Loads the ``en_core_web_sm`` spaCy model, inserts
    ``custom_sentence_boundary`` ahead of the parser, and collects the text
    of every sentence found across all inputs, in input order.

    Args:
        text_batches: Iterable of text strings to segment. Any iterable
            works (list, generator, ...); it is consumed once.
        output_path: Destination of the CSV file holding the sentences in a
            single ``text`` column. Defaults to ``'sents_list.csv'``.

    Returns:
        A flat list with the text of every sentence.
    """
    nlp = spacy.load('en_core_web_sm')
    # The custom rule is registered before the parser so its sentence starts
    # are already set when the parser runs.
    # NOTE(review): passing the function object is the spaCy v2 calling
    # convention; spaCy v3 expects a registered component name - confirm the
    # installed version.
    nlp.add_pipe(custom_sentence_boundary, before='parser')

    # nlp.pipe streams the texts through the pipeline and yields the docs in
    # input order, so no positional indexing into text_batches is needed.
    sents_list = [
        sent.text
        for doc in nlp.pipe(text_batches)
        for sent in doc.sents
    ]

    pd.DataFrame(sents_list, columns=['text']).to_csv(output_path, index=False)
    return sents_list
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment