Last active
January 17, 2018 16:44
-
-
Save Madhivarman/f968077a44f3080cbaf8c44d3c4e85ae to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Sentence segmentation, means, to split a given paragraph of text into sentences, by identifying the sentence boundaries. | |
In many cases, a full stop is all that is required to identify the end of a sentence, but the task is not all that simple. | |
This is an open ended challenge to which there are no perfect solutions. Try to break up given paragraphs into text into | |
individual sentences. Even if you don't manage to segment the text perfectly, the more sentences you identify and display | |
correctly, the more you will score.""" | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
class TextSegmentation():
    """Split a POS-tagged paragraph into individual sentences.

    Tokens carrying the Penn Treebank tag '.' (the sentence-final
    punctuation tag, which covers '.', '!' and '?') are treated as
    sentence boundaries.
    """

    def segmentation(self, text, pos_tagging):
        """Return the sentences found in pos_tagging as a list of strings.

        text        -- the original input text (unused by the algorithm;
                       kept for interface compatibility with callers)
        pos_tagging -- list of (token, tag) pairs, e.g. from nltk.pos_tag
        """
        # Count boundary tokens up front so the summary matches the split.
        count = sum(1 for _, tag in pos_tagging if tag == '.')
        print("Number of sentences:{}".format(count))
        # Accumulate tokens and flush at each boundary tag. This replaces
        # the old "stopped" sentinel scheme, which (a) broke whenever the
        # text itself contained the word "stopped", (b) never split
        # sentences ending in '!' or '?' because only the literal '.'
        # character was replaced, and (c) left a trailing empty string
        # and stray spaces in the result.
        sentences = []
        current = []
        for token, tag in pos_tagging:
            if tag == '.':
                if current:
                    sentences.append(" ".join(current))
                    current = []
            else:
                current.append(token)
        # Keep any trailing words that lack closing punctuation.
        if current:
            sentences.append(" ".join(current))
        print("Splitted Sentence is:{}\n".format(sentences))
        return sentences

    def printoutput(self, result):
        """Print each sentence in result, numbered from 1."""
        print("Final result is:")
        print("-------------------------------------------------------------")
        for number, sent in enumerate(result, 1):
            print(number, sent)
def main(text):
    """Tokenize *text* with NLTK and return its part-of-speech tags.

    Returns a list of (token, tag) pairs as produced by nltk.pos_tag.
    """
    return nltk.pos_tag(word_tokenize(text))
def isconditionistrue(user_text, pos):
    """Check the input against the challenge's size constraints.

    user_text -- the raw input text
    pos       -- the list of (token, tag) pairs for that text

    Returns "1" when the text has at most 10000 characters AND at most
    1000 tagged tokens, "0" otherwise.

    The original looped over individual characters and tested
    ``len(w) <= 10000`` on a single character (always 1), and fell
    through returning ``None`` for empty input; the intent was clearly
    the total text length, checked once.
    """
    if len(user_text) <= 10000 and len(pos) <= 1000:
        return "1"
    return "0"
if __name__ == '__main__':
    # Read the paragraph to segment from stdin (Python 2: raw_input).
    text = raw_input("Enter your text here:\n")
    # POS-tag the text so sentence-final punctuation carries the tag '.';
    # returns a list of (token, tag) pairs.
    pos_tagging = main(text)
    #print(pos_tagging)
    # Enforce the challenge's size constraints before segmenting;
    # isconditionistrue returns the string "1" on success.
    if(isconditionistrue(text,pos_tagging) == '1'):
        obj = TextSegmentation()
        split = obj.segmentation(text,pos_tagging)
        obj.printoutput(split)
    else:
        print("Certain constraints failed")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment