sharvaridhote · January 10, 2021 23:32
diff --git a/sentencer.py b/sentencer.py
 def custom_sentence_boundary(doc):
     """Mark the token following every closing bracket as a sentence start.

     Intended to split sentences right after a citation bracket such as
     ``[12]``. For each token whose text is exactly ``']'``, the next
     token's ``sent_start`` attribute is set to ``True``.

     Args:
         doc: Indexable sequence of tokens exposing ``.text`` and a writable
             ``.sent_start`` (presumably a spaCy ``Doc`` - confirm with the
             pipeline that registers this component).

     Returns:
         The same ``doc`` object, modified in place.
     """
     last_index = len(doc) - 1
     for i, token in enumerate(doc):
         # A bracket that closes the document has no following token;
         # indexing doc[i + 1] there would raise IndexError.
         if token.text == ']' and i < last_index:
             doc[i + 1].sent_start = True
     return doc

 def sentence_tokenization(text_batches):
     """Split every text in ``text_batches`` into sentences.

     Loads the ``en_core_web_sm`` spaCy pipeline, inserts
     ``custom_sentence_boundary`` ahead of the parser, runs each text
     through the pipeline and collects the text of every sentence found.
     The collected sentences are also written to ``sents_list.csv``
     (relative path, single ``text`` column, no index column).

     Args:
         text_batches: Iterable of text strings to segment.

     Returns:
         list: Sentence strings, in the order the texts were given.
     """
     nlp = spacy.load('en_core_web_sm')
     # Registered before the parser so the bracket-based sentence starts are
     # already set when the parser runs.
     nlp.add_pipe(custom_sentence_boundary, before='parser')
     sents_list = []
     # Iterate the texts directly; no index lookup is needed, so any
     # iterable of strings works, not only indexable sequences.
     for text in text_batches:
         sents_list.extend(sent.text for sent in nlp(text).sents)
     df = pd.DataFrame(sents_list, columns=['text'])
     df.to_csv('sents_list.csv', index=False)
     return sents_list
	def custom_sentence_boundary(doc):
	    """Mark the token following every closing bracket as a sentence start.

	    Intended to split sentences right after a citation bracket such as
	    ``[12]``. For each token whose text is exactly ``']'``, the next
	    token's ``sent_start`` attribute is set to ``True``.

	    Args:
	        doc: Indexable sequence of tokens exposing ``.text`` and a writable
	            ``.sent_start`` (presumably a spaCy ``Doc`` - confirm with the
	            pipeline that registers this component).

	    Returns:
	        The same ``doc`` object, modified in place.
	    """
	    last_index = len(doc) - 1
	    for i, token in enumerate(doc):
	        # A bracket that closes the document has no following token;
	        # indexing doc[i + 1] there would raise IndexError.
	        if token.text == ']' and i < last_index:
	            doc[i + 1].sent_start = True
	    return doc

	def sentence_tokenization(text_batches):
	    """Split every text in ``text_batches`` into sentences.

	    Loads the ``en_core_web_sm`` spaCy pipeline, inserts
	    ``custom_sentence_boundary`` ahead of the parser, runs each text
	    through the pipeline and collects the text of every sentence found.
	    The collected sentences are also written to ``sents_list.csv``
	    (relative path, single ``text`` column, no index column).

	    Args:
	        text_batches: Iterable of text strings to segment.

	    Returns:
	        list: Sentence strings, in the order the texts were given.
	    """
	    nlp = spacy.load('en_core_web_sm')
	    # Registered before the parser so the bracket-based sentence starts are
	    # already set when the parser runs.
	    nlp.add_pipe(custom_sentence_boundary, before='parser')
	    sents_list = []
	    # Iterate the texts directly; no index lookup is needed, so any
	    # iterable of strings works, not only indexable sequences.
	    for text in text_batches:
	        sents_list.extend(sent.text for sent in nlp(text).sents)
	    df = pd.DataFrame(sents_list, columns=['text'])
	    df.to_csv('sents_list.csv', index=False)
	    return sents_list
No results found