import sys


class NltkSentTokenize(object):
    """NltkSentTokenize class: wraps nltk sentence tokenization."""

    def __init__(self):
        pass

    def getSentTokens(self, sent):
        """
        Split text into sentences with nltk's sent_tokenize.
        @sent <type 'str'> text that needs to be split into sentences
        @return <type 'list'> list of sentence tokens
        """
        from nltk.tokenize import sent_tokenize
        tokens = sent_tokenize(sent)
        return tokens
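
# Example usage (a sketch; assumes the NLTK 'punkt' tokenizer data has been
# fetched with nltk.download('punkt')):
#   >>> NltkSentTokenize().getSentTokens("It rains. We stay in.")
#   ['It rains.', 'We stay in.']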
"""
NltkWordTokenize Class for all nltk word tokenize
"""
class NltkWordTokenize(object):
"""
Initialization function of NltkWordTokenize Class
"""
def __init__(self):
pass
"""
Function to get word tokens by nltk word_tokenize
@sent <type 'str'> sentence for which word tokenize need to be done
@return <type 'list'> list of token words
"""
def getWordTokens(self, sent):
from nltk.tokenize import word_tokenize
tokens = word_tokenize(sent)
return tokens
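
# Example usage (a sketch; word_tokenize also relies on the 'punkt' data):
#   >>> NltkWordTokenize().getWordTokens("Don't panic!")
#   ['Do', "n't", 'panic', '!']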
"""
NltkPosTag Class for all nltk pos tagging
"""
class NltkPosTag(object):
"""
Initialization function of NltkWordTokenize Class
"""
def __init__(self):
pass
"""
Function to get pos tags of words by nltk pos tagging
@sent <type 'str'> sentence for which pos tagging need to be done
@return <type 'list'> list of pos tagged word
"""
def getPosTags(self, query_tokens):
import nltk
pos_tags = nltk.pos_tag(query_tokens)
return pos_tags
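
# Example usage (a sketch; assumes the NLTK POS tagger model is installed,
# e.g. 'averaged_perceptron_tagger' on current releases):
#   >>> NltkPosTag().getPosTags(['The', 'dog', 'barks'])
#   [('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')]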
"""
NltkStopWords Class for nltk stop words
"""
class NltkStopWords(object):
"""
Initialization function of NltkStopWords Class
"""
def __init__(self):
pass
"""
Function to get stop words
@language <type 'str'> stop words of which language
@return <type 'list'> list of stop words of the language asked for
"""
def getStopWords(self, language='english'):
from nltk.corpus import stopwords
words = stopwords.words(language)
stopwords_list = []
for word in words:
stopwords_list.append(str(word))
return stopwords_list
"""
Function to remove stop words
@query <type 'str'> tokens of the query from which stopwords needs to be removed
@return <type 'str'> return the list of important words removing stop words
"""
def removeStopWords(self, query_tokens):
stop_words = self.getStopWords('english')
imp_words = []
for token in query_tokens:
if token not in stop_words:
imp_words.append(token)
return imp_words
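
# Example usage (a sketch; assumes the 'stopwords' corpus is installed):
#   >>> NltkStopWords().removeStopWords(['this', 'is', 'a', 'test'])
#   ['test']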
"""
NltkMorphology class for getting the base form of word
"""
class NltkMorphology(object):
"""
Function to get the word net lemmatized form of word
@word <type 'str'> word which needs to be lemmatize
@return <type 'str'> lemmatized form
"""
def wordNetLemmatiz(self, word, typ='v'):
from nltk.stem.wordnet import WordNetLemmatizer
return WordNetLemmatizer().lemmatize(word, typ)
"""
Function to get the lancaster stem form of word
@word <type 'str'> word which needs to be stemmed
@return <type 'str'> stemmed form
"""
def lancasterStem(self, word):
from nltk.stem.lancaster import LancasterStemmer
return LancasterStemmer().stem(word)
"""
Function to get the base form of a Word
@word <type 'str'> word for which we need to get the base from
@return <type 'str'> base form of a word
"""
def getBaseWord(self, word):
import en
from nltk.corpus import wordnet as wn
try:
word_list = word.split(' ')
base_forms = []
for prop in word_list:
append_flag = 0
if prop != '':
tag = NltkPosTag().getPosTags(NltkWordTokenize().getWordTokens(prop))
if tag[0][1] in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
#base_forms.append(WordNetLemmatizer().lemmatize(prop,'v'))
base_forms.append(self.wordNetLemmatiz(prop,'v'))
else:
sin_prop = en.noun.singular(prop)
if sin_prop != prop:
base_forms.append(sin_prop)
else:
try:
#Converting noun to its euivalent verb
lem = wn.lemmas(prop)[0]
related_forms = lem.derivationally_related_forms()
for related_form in related_forms:
if related_form.synset().pos() == 'v':
base_forms.append(related_form.name())
append_flag = 1
if related_forms == [] or append_flag != 1:
base_forms.append(prop)
except Exception:
base_forms.append(prop)
if base_forms != []:
return ' '.join(base_forms)
else:
return word
except Exception:
import traceback
print traceback.format_exc()
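
# Example usage (a sketch; 'en' is the NodeBox Linguistics English module, a
# separate Python 2 dependency, and the WordNet corpus must be installed;
# the exact output depends on the tagger and on WordNet):
#   >>> NltkMorphology().getBaseWord('running cars')
#   'run car'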

class NltkWordNet(object):
    """NltkWordNet class: looks up WordNet synonyms."""

    def callWordNet(self, word):
        """
        Collect the WordNet synonyms of a word.
        @word <type 'str'> word to look up
        @return <type 'list'> de-duplicated list of synonym lemma names
        """
        from nltk.corpus import wordnet as wn
        syns = wn.synsets(word)
        syn_set = [l.name() for s in syns for l in s.lemmas()]
        synset_list = list(set(syn_set))
        return synset_list
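
# Example usage (a sketch; assumes the 'wordnet' corpus is installed; the
# order of the result is unstable because of the set() call):
#   >>> 'little' in NltkWordNet().callWordNet('small')
#   True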
"""
NltkScore Class for nltk ngram scoring
"""
class NltkScore(object):
"""
Initialization function of NltkScore class
"""
def __init__(self):
pass
"""
Function gets the score for pairs of words
"""
def createScore(self, corpus_data):
scored = []
import nltk.collocations
from nltk.probability import FreqDist
bgm = nltk.collocations.BigramAssocMeasures()
tgm = nltk.collocations.TrigramAssocMeasures()
file_word_list = [x.lower() for x in corpus_data.split(' ') if x.isalpha()]
freq_dist = FreqDist()
for word in file_word_list:
freq_dist.inc(word)
freq_dist_sorted = freq_dist.keys()
for word in freq_dist_sorted:
list_freq = []
list_freq.append((tuple([word]), freq_dist[word]))
scored.extend(list_freq)
finder = nltk.collocations.BigramCollocationFinder.from_words(file_word_list)
finder.apply_freq_filter(2)
scored.extend(finder.score_ngrams( bgm.likelihood_ratio ))
finder = nltk.collocations.TrigramCollocationFinder.from_words(file_word_list)
finder.apply_freq_filter(2)
scored.extend(finder.score_ngrams(tgm.likelihood_ratio))
return scored
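
# Example usage (a sketch; unigram entries carry raw counts, while bigram and
# trigram entries carry likelihood-ratio scores):
#   >>> scores = NltkScore().createScore('to be or not to be')
#   >>> dict(scores)[('to',)]
#   2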
"""
NltkNER Class for nltk ngram scoring
"""
class NltkNER(object):
"""
Initialization function of NltkScore class
"""
def __init__(self):
pass
def getNounEntities(self, query):
import nltk
sent_tokens = NltkSentTokenize().getSentTokens(query)
Ne_list = []
for sent in sent_tokens:
query_tokens = NltkWordTokenize().getWordTokens(sent)
query_pos_tags = NltkPosTag().getPosTags(query_tokens)
sentt = nltk.ne_chunk(query_pos_tags, binary = True)
#Ne_list = []
for subtree in sentt.subtrees(filter=lambda t: t.label() == 'NE'):
myNE = []
for leave in subtree.leaves():
myNE.append(str(leave[0]))
Ne_list.append(' '.join(myNE).replace(" ","_"))
return list(set(Ne_list))
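
# Example usage (a sketch; ne_chunk additionally needs the 'maxent_ne_chunker'
# and 'words' data packages, and what gets marked as an entity depends on the
# bundled chunker model):
#   >>> sorted(NltkNER().getNounEntities('Barack Obama visited Paris.'))
#   ['Barack_Obama', 'Paris']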

class CorrectSyntax(object):
    """CorrectSyntax class: true-cases a sentence and strips bracketed text."""

    def sentenceCorrectSyntax(self, sentence):
        """
        Normalize the casing of a sentence and drop any bracketed spans.
        @sentence <type 'str'> input sentence
        @return <type 'str'> syntactically corrected sentence
        """
        import nltk
        import re
        words_list = nltk.word_tokenize(sentence)
        # Apply POS tagging to the lowercased words.
        tagged_sent = nltk.pos_tag([word.lower() for word in words_list])
        # Infer capitalization from the POS tags and the original casing.
        normalized_sent = [w.capitalize() if t in ['NNS'] or w in ['i'] or words_list[i].istitle() else w
                           for (i, (w, t)) in enumerate(tagged_sent)]
        # Capitalize the first word of the sentence.
        normalized_sent[0] = normalized_sent[0].capitalize()
        # Re-attach punctuation that word_tokenize split off.
        syntactic_sentence = re.sub(r" (?=[.,'!?:;])", '', ' '.join(normalized_sent))
        # Drop any (...), [...] or {...} spans, including one level of nesting.
        expr_syntactic_sentence_remove = re.compile(
            r"((\([^()]*(\([^()]*\))*[^)]*\))+)|"
            r"((\[[^\[\]]*(\[[^\[\]]*\])*[^\]]*\])+)|"
            r"((\{[^{}]*(\{[^{}]*\})*[^\}]*\})+)")
        expr_syntactic_sentence_search = expr_syntactic_sentence_remove.search(syntactic_sentence)
        syntactic_sentence = expr_syntactic_sentence_remove.sub('', syntactic_sentence)
        print '\033[95m' + '\nSyntactic Correct Sentence - \n', '\033[0m', syntactic_sentence
        if expr_syntactic_sentence_search:
            print '\033[95m' + '\nNeglected part of Sentence - \n', '\033[0m', expr_syntactic_sentence_search.group()
        return syntactic_sentence
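
# Example usage (a sketch; the method prints the result in colour before
# returning it):
#   >>> CorrectSyntax().sentenceCorrectSyntax('i saw John in Paris.')
#   'I saw John in Paris.'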

if __name__ == '__main__':
    # print NltkSentTokenize().getSentTokens(sys.argv[1])
    # print NltkWordTokenize().getWordTokens(sys.argv[1])
    # print NltkNER().getNounEntities(sys.argv[1])
    # print NltkMorphology().getBaseWord(sys.argv[1])
    print NltkWordNet().callWordNet(sys.argv[1])