import sys

"""
NltkSentTokenize class wrapping nltk sentence tokenization
"""
class NltkSentTokenize(object):
    """
    Initialization function of NltkSentTokenize class
    """
    def __init__(self):
        pass

    """
    Function to get sentence tokens via nltk sent_tokenize
    @sent   <type 'str'>  text that needs to be split into sentences
    @return <type 'list'> list of sentence tokens
    """
    def getSentTokens(self, sent):
        from nltk.tokenize import sent_tokenize
        tokens = sent_tokenize(sent)
        return tokens
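
# A minimal usage sketch (assumes the NLTK 'punkt' model is installed,
# e.g. via nltk.download('punkt'); the sample text is illustrative):
#   print NltkSentTokenize().getSentTokens("It works. Tokenize me.")
#   # -> ['It works.', 'Tokenize me.']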
""" | |
NltkWordTokenize Class for all nltk word tokenize | |
""" | |
class NltkWordTokenize(object): | |
""" | |
Initialization function of NltkWordTokenize Class | |
""" | |
def __init__(self): | |
pass | |
""" | |
Function to get word tokens by nltk word_tokenize | |
@sent <type 'str'> sentence for which word tokenize need to be done | |
@return <type 'list'> list of token words | |
""" | |
def getWordTokens(self, sent): | |
from nltk.tokenize import word_tokenize | |
tokens = word_tokenize(sent) | |
return tokens | |
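
# Usage sketch (also needs the 'punkt' model; the sample input is illustrative):
#   print NltkWordTokenize().getWordTokens("Don't tokenize me.")
#   # -> ['Do', "n't", 'tokenize', 'me', '.']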
""" | |
NltkPosTag Class for all nltk pos tagging | |
""" | |
class NltkPosTag(object): | |
""" | |
Initialization function of NltkWordTokenize Class | |
""" | |
def __init__(self): | |
pass | |
""" | |
Function to get pos tags of words by nltk pos tagging | |
@sent <type 'str'> sentence for which pos tagging need to be done | |
@return <type 'list'> list of pos tagged word | |
""" | |
def getPosTags(self, query_tokens): | |
import nltk | |
pos_tags = nltk.pos_tag(query_tokens) | |
return pos_tags | |
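
# Usage sketch (assumes an NLTK tagger model is installed, e.g.
# nltk.download('averaged_perceptron_tagger'); tags shown are indicative):
#   print NltkPosTag().getPosTags(['dogs', 'bark'])
#   # -> [('dogs', 'NNS'), ('bark', 'VBP')]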
""" | |
NltkStopWords Class for nltk stop words | |
""" | |
class NltkStopWords(object): | |
""" | |
Initialization function of NltkStopWords Class | |
""" | |
def __init__(self): | |
pass | |
""" | |
Function to get stop words | |
@language <type 'str'> stop words of which language | |
@return <type 'list'> list of stop words of the language asked for | |
""" | |
def getStopWords(self, language='english'): | |
from nltk.corpus import stopwords | |
words = stopwords.words(language) | |
stopwords_list = [] | |
for word in words: | |
stopwords_list.append(str(word)) | |
return stopwords_list | |
""" | |
Function to remove stop words | |
@query <type 'str'> tokens of the query from which stopwords needs to be removed | |
@return <type 'str'> return the list of important words removing stop words | |
""" | |
def removeStopWords(self, query_tokens): | |
stop_words = self.getStopWords('english') | |
imp_words = [] | |
for token in query_tokens: | |
if token not in stop_words: | |
imp_words.append(token) | |
return imp_words | |
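
# Usage sketch (needs the NLTK 'stopwords' corpus, e.g. nltk.download('stopwords')):
#   print NltkStopWords().removeStopWords(['this', 'is', 'an', 'important', 'word'])
#   # -> ['important', 'word']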
""" | |
NltkMorphology class for getting the base form of word | |
""" | |
class NltkMorphology(object): | |
""" | |
Function to get the word net lemmatized form of word | |
@word <type 'str'> word which needs to be lemmatize | |
@return <type 'str'> lemmatized form | |
""" | |
def wordNetLemmatiz(self, word, typ='v'): | |
from nltk.stem.wordnet import WordNetLemmatizer | |
return WordNetLemmatizer().lemmatize(word, typ) | |
""" | |
Function to get the lancaster stem form of word | |
@word <type 'str'> word which needs to be stemmed | |
@return <type 'str'> stemmed form | |
""" | |
def lancasterStem(self, word): | |
from nltk.stem.lancaster import LancasterStemmer | |
return LancasterStemmer().stem(word) | |
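
# Usage sketch (lemmatization needs the 'wordnet' corpus; outputs are indicative):
#   m = NltkMorphology()
#   print m.wordNetLemmatize('running', 'v')   # -> 'run'
#   print m.lancasterStem('maximum')           # -> 'maxim'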
""" | |
Function to get the base form of a Word | |
@word <type 'str'> word for which we need to get the base from | |
@return <type 'str'> base form of a word | |
""" | |
def getBaseWord(self, word): | |
import en | |
from nltk.corpus import wordnet as wn | |
try: | |
word_list = word.split(' ') | |
base_forms = [] | |
for prop in word_list: | |
append_flag = 0 | |
if prop != '': | |
tag = NltkPosTag().getPosTags(NltkWordTokenize().getWordTokens(prop)) | |
if tag[0][1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']: | |
#base_forms.append(WordNetLemmatizer().lemmatize(prop,'v')) | |
base_forms.append(self.wordNetLemmatiz(prop,'v')) | |
else: | |
sin_prop = en.noun.singular(prop) | |
if sin_prop != prop: | |
base_forms.append(sin_prop) | |
else: | |
try: | |
#Converting noun to its euivalent verb | |
lem = wn.lemmas(prop)[0] | |
related_forms = lem.derivationally_related_forms() | |
for related_form in related_forms: | |
if related_form.synset().pos() == 'v': | |
base_forms.append(related_form.name()) | |
append_flag = 1 | |
if related_forms == [] or append_flag != 1: | |
base_forms.append(prop) | |
except Exception: | |
base_forms.append(prop) | |
if base_forms != []: | |
return ' '.join(base_forms) | |
else: | |
return word | |
except Exception: | |
import traceback | |
print traceback.format_exc() | |
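
# Usage sketch (depends on NLTK data plus the NodeBox 'en' module; output indicative):
#   print NltkMorphology().getBaseWord('cars running')
#   # -> 'car run'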
class NltkWordNet(object):
    """
    Returns the WordNet synonyms for a given word
    @word   <type 'str'>  word
    @return <type 'list'> list of synonyms
    """
    def callWordNet(self, word):
        from nltk.corpus import wordnet as wn
        syns = wn.synsets(word)
        syn_set = [l.name() for s in syns for l in s.lemmas()]
        synset_list = list(set(syn_set))
        return synset_list
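
# Usage sketch (needs the 'wordnet' corpus; the synonym list below is partial
# and its order is unspecified because of the set()):
#   print NltkWordNet().callWordNet('happy')
#   # -> e.g. ['happy', 'felicitous', 'glad', 'well-chosen']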
""" | |
NltkScore Class for nltk ngram scoring | |
""" | |
class NltkScore(object): | |
""" | |
Initialization function of NltkScore class | |
""" | |
def __init__(self): | |
pass | |
""" | |
Function gets the score for pairs of words | |
""" | |
def createScore(self, corpus_data): | |
scored = [] | |
import nltk.collocations | |
from nltk.probability import FreqDist | |
bgm = nltk.collocations.BigramAssocMeasures() | |
tgm = nltk.collocations.TrigramAssocMeasures() | |
file_word_list = [x.lower() for x in corpus_data.split(' ') if x.isalpha()] | |
freq_dist = FreqDist() | |
for word in file_word_list: | |
freq_dist.inc(word) | |
freq_dist_sorted = freq_dist.keys() | |
for word in freq_dist_sorted: | |
list_freq = [] | |
list_freq.append((tuple([word]), freq_dist[word])) | |
scored.extend(list_freq) | |
finder = nltk.collocations.BigramCollocationFinder.from_words(file_word_list) | |
finder.apply_freq_filter(2) | |
scored.extend(finder.score_ngrams( bgm.likelihood_ratio )) | |
finder = nltk.collocations.TrigramCollocationFinder.from_words(file_word_list) | |
finder.apply_freq_filter(2) | |
scored.extend(finder.score_ngrams(tgm.likelihood_ratio)) | |
return scored | |
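
# Usage sketch (bi/trigrams must occur at least twice to survive apply_freq_filter(2);
# the corpus string here is illustrative):
#   corpus = 'the quick fox jumps the quick fox jumps'
#   print NltkScore().createScore(corpus)
#   # -> unigram counts plus likelihood-ratio scores for the repeated bi/trigrams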
""" | |
NltkNER Class for nltk ngram scoring | |
""" | |
class NltkNER(object): | |
""" | |
Initialization function of NltkScore class | |
""" | |
def __init__(self): | |
pass | |
def getNounEntities(self, query): | |
import nltk | |
sent_tokens = NltkSentTokenize().getSentTokens(query) | |
Ne_list = [] | |
for sent in sent_tokens: | |
query_tokens = NltkWordTokenize().getWordTokens(sent) | |
query_pos_tags = NltkPosTag().getPosTags(query_tokens) | |
sentt = nltk.ne_chunk(query_pos_tags, binary = True) | |
#Ne_list = [] | |
for subtree in sentt.subtrees(filter=lambda t: t.label() == 'NE'): | |
myNE = [] | |
for leave in subtree.leaves(): | |
myNE.append(str(leave[0])) | |
Ne_list.append(' '.join(myNE).replace(" ","_")) | |
return list(set(Ne_list)) | |
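
# Usage sketch (binary NE chunking also needs the 'maxent_ne_chunker' and 'words'
# NLTK data packages; output order is unspecified because of the set()):
#   print NltkNER().getNounEntities('Barack Obama visited Paris.')
#   # -> e.g. ['Barack_Obama', 'Paris']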
class CorrectSyntax(object):
    """
    Nltk function to proper-case the text and strip bracketed fragments
    @sentence <type 'str'> sentence
    @return   <type 'str'> syntactically corrected sentence
    """
    def sentenceCorrectSyntax(self, sentence):
        import nltk
        import re
        words_list = nltk.word_tokenize(sentence)
        tagged_sent = nltk.pos_tag([word.lower() for word in words_list])  # apply POS tagging
        # infer capitalization from POS tags: proper nouns (NNP/NNPS), the pronoun 'i',
        # and words that were title-cased in the input
        normalized_sent = [w.capitalize() if t in ["NNP", "NNPS"] or w in ['i'] or words_list[i].istitle() else w for (i, (w, t)) in enumerate(tagged_sent)]
        normalized_sent[0] = normalized_sent[0].capitalize()  # capitalize first word in sentence
        # use a regular expression to remove the space before punctuation
        syntactic_sentence = re.sub(r" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
        # match parenthesized, bracketed, or braced fragments (one level of nesting)
        expr_syntactic_sentence_remove = re.compile(r"((\([^()]*(\([^()]*\))*[^)]*\))+)|((\[[^\[\]]*(\[[^\[\]]*\])*[^\]]*\])+)|((\{[^{}]*(\{[^{}]*\})*[^\}]*\})+)")
        expr_syntactic_sentence_search = expr_syntactic_sentence_remove.search(syntactic_sentence)
        syntactic_sentence = expr_syntactic_sentence_remove.sub('', syntactic_sentence)
        print '\033[95m' + '\nSyntactic Correct Sentence - \n', '\033[0m', syntactic_sentence
        if expr_syntactic_sentence_search:
            print '\033[95m' + '\nNeglected part of Sentence - \n', '\033[0m', expr_syntactic_sentence_search.group()
        return syntactic_sentence
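
# Usage sketch (prints the corrected sentence and any neglected bracketed part;
# the example is illustrative and the final casing depends on the tagger):
#   print CorrectSyntax().sentenceCorrectSyntax('i met barack obama (in 2015) .')
#   # -> ideally 'I met Barack Obama.'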

if __name__ == '__main__':
    #print NltkSentTokenize().getSentTokens(sys.argv[1])
    #print NltkWordTokenize().getWordTokens(sys.argv[1])
    #print NltkNER().getNounEntities(sys.argv[1])
    #print NltkMorphology().getBaseWord(sys.argv[1])
    print NltkWordNet().callWordNet(sys.argv[1])