Skip to content

Instantly share code, notes, and snippets.

@Madhivarman
Last active January 17, 2018 16:44
Show Gist options
  • Save Madhivarman/f968077a44f3080cbaf8c44d3c4e85ae to your computer and use it in GitHub Desktop.
"""Sentence segmentation, means, to split a given paragraph of text into sentences, by identifying the sentence boundaries.
In many cases, a full stop is all that is required to identify the end of a sentence, but the task is not all that simple.
This is an open-ended challenge to which there are no perfect solutions. Try to break up the given paragraphs of text into
individual sentences. Even if you don't manage to segment the text perfectly, the more sentences you identify and display
correctly, the more you will score."""
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
class TextSegmentation():
    """Split POS-tagged text into sentences and pretty-print the result."""

    # Sentinel that marks sentence boundaries before splitting.  A control
    # character is used so ordinary words in the input (e.g. the word
    # "stopped", which the original code used as its marker) cannot collide
    # with the boundary marker.
    _BOUNDARY = "\x00"

    def segmentation(self, text, pos_tagging):
        """Split the tagged tokens into sentences.

        Parameters
        ----------
        text : str
            Raw input text (kept for interface compatibility; unused here).
        pos_tagging : list[tuple[str, str]]
            (token, POS-tag) pairs; sentence-ending punctuation is tagged '.'.

        Returns
        -------
        list[str]
            Sentence fragments, split at every '.'-tagged token.
        """
        # Count sentence terminators.  The '.' POS tag covers '.', '!' and
        # '?', so counting by tag handles all three.
        count = sum(1 for token, tag in pos_tagging if tag == '.')
        print("Number of sentences:{}".format(count))
        # Replace every sentence-ending token with the boundary sentinel.
        # The original replaced only the literal "." character, so '!' and
        # '?' (also tagged '.') never produced a split; fixed here by keying
        # on the tag alone.
        complete_sentence = [
            self._BOUNDARY if tag == '.' else token
            for token, tag in pos_tagging
        ]
        print("Complete Sentence:{}".format(complete_sentence))
        # Join with spaces, then split on the sentinel to recover sentences.
        join_sentence = " ".join(complete_sentence)
        final_splitted_sentence = join_sentence.split(self._BOUNDARY)
        print("Splitted Sentence is:{}\n".format(final_splitted_sentence))
        return final_splitted_sentence

    def printoutput(self, result):
        """Print each sentence in *result* with a 1-based index."""
        print("Final result is:")
        print("-------------------------------------------------------------")
        for count, sent in enumerate(result, start=1):
            print(count, sent)
def main(text):
    """Tokenize *text* and return its part-of-speech tagged tokens."""
    # Tokenize into words, then attach a POS label to every token.
    tokens = word_tokenize(text)
    return nltk.pos_tag(tokens)
def isconditionistrue(user_text, pos):
    """Check the challenge's input-size constraints.

    Parameters
    ----------
    user_text : str
        The raw text entered by the user.
    pos : list
        The POS-tagged token list derived from *user_text*.

    Returns
    -------
    str
        "1" when the text has at most 10000 characters and at most 1000
        tokens, "0" otherwise.  The string return is kept for caller
        compatibility (the caller compares against '1').
    """
    # The original measured len() of the *last* loop character (always 1,
    # so the 10000-character limit was never enforced) and raised a
    # NameError on empty input; measure the whole text directly instead.
    if len(user_text) <= 10000 and len(pos) <= 1000:
        return "1"
    return "0"
if __name__ == '__main__':
    # raw_input exists only in Python 2; fall back to input() so the script
    # also runs on Python 3 (where raw_input raises NameError).
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    text = read_line("Enter your text here:\n")
    # POS-tag the text to locate sentence-ending punctuation (tagged '.').
    pos_tagging = main(text)  # returned as a list of (token, tag) pairs
    # Enforce the challenge's size constraints before segmenting.
    if isconditionistrue(text, pos_tagging) == '1':
        obj = TextSegmentation()
        split = obj.segmentation(text, pos_tagging)
        obj.printoutput(split)
    else:
        print("Certain constraints failed")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment