# coding=UTF-8
import pickle

import nltk
from nltk.corpus import brown

# TextBlob noun-phrase extractors (FastNPExtractor + ConllExtractor)
from textblob import TextBlob
from textblob.np_extractors import FastNPExtractor
from textblob.np_extractors import ConllExtractor

# spaCy (loaded here but not used below; newer spaCy versions need the full
# model name, e.g. spacy.load('en_core_web_sm'), instead of the 'en' shortcut)
import spacy
nlp = spacy.load('en')

# NLTK data required by the Brown-trained taggers, word_tokenize and ConllExtractor
nltk.download('brown')
nltk.download('conll2000')
nltk.download('punkt')

# Pickled collection of common English words, used to filter generic terms
# from the final result
with open('PivotX_English_Common_Words_Curpos', 'rb') as dbfile:
    PivotX_English_Common_Words_Curpos = pickle.load(dbfile)
# This is a fast and simple noun phrase extractor (based on NLTK)
# Feel free to use it, just keep a link back to this post
# http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
# Created by Shlomi Babluki
# May, 2013

# This is our fast Part of Speech tagger
#############################################################################
brown_train = brown.tagged_sents(categories='news')
regexp_tagger = nltk.RegexpTagger(
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
     (r'(-|:|;)$', ':'),
     (r'\'*$', 'MD'),
     (r'(The|the|A|a|An|an)$', 'AT'),
     (r'.*able$', 'JJ'),
     (r'^[A-Z].*$', 'NNP'),
     (r'.*ness$', 'NN'),
     (r'.*ly$', 'RB'),
     (r'.*s$', 'NNS'),
     (r'.*ing$', 'VBG'),
     (r'.*ed$', 'VBD'),
     (r'.*', 'NN')
     ])
# Each tagger backs off to the next one: bigram -> unigram -> regexp rules
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
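# Illustrative check of the tagger chain (commented out, like the test at the
# bottom of this file); the exact tags depend on the downloaded Brown corpus:
# print(bigram_tagger.tag(nltk.word_tokenize("The band gap of silicon is small")))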
#############################################################################
# This is our semi-CFG; extend it according to your own needs
#############################################################################
cfg = {}
cfg["NNP+NNP"] = "NNP"
cfg["NN+NN"] = "NNI"
cfg["NNI+NN"] = "NNI"
cfg["JJ+JJ"] = "JJ"
cfg["JJ+NN"] = "NNI"
#############################################################################
class NPExtractor(object):

    def __init__(self, sentence):
        self.sentence = sentence

    # Split the sentence into single words/tokens
    def tokenize_sentence(self, sentence):
        tokens = nltk.word_tokenize(sentence)
        return tokens

    # Normalize Brown corpus tags ("NN", "NN-PL", "NNS" > "NN")
    def normalize_tags(self, tagged):
        n_tagged = []
        for t in tagged:
            if t[1] == "NP-TL" or t[1] == "NP":
                n_tagged.append((t[0], "NNP"))
                continue
            if t[1].endswith("-TL"):
                n_tagged.append((t[0], t[1][:-3]))
                continue
            if t[1].endswith("S"):
                n_tagged.append((t[0], t[1][:-1]))
                continue
            n_tagged.append((t[0], t[1]))
        return n_tagged

    # Extract the main topics from the sentence
    def extract(self):
        tokens = self.tokenize_sentence(self.sentence)
        tags = self.normalize_tags(bigram_tagger.tag(tokens))
        # Repeatedly merge adjacent tag pairs that match a rule in cfg
        merge = True
        while merge:
            merge = False
            for x in range(0, len(tags) - 1):
                t1 = tags[x]
                t2 = tags[x + 1]
                key = "%s+%s" % (t1[1], t2[1])
                value = cfg.get(key, '')
                if value:
                    merge = True
                    tags.pop(x)
                    tags.pop(x)
                    match = "%s %s" % (t1[0], t2[0])
                    pos = value
                    tags.insert(x, (match, pos))
                    break
        # Keep the (possibly merged) proper nouns, noun phrases and nouns
        matches = []
        for t in tags:
            if t[1] == "NNP" or t[1] == "NNI" or t[1] == "NN":
                matches.append(t[0])
        return matches
def term_extraction(sentence):
    # Noun phrases from TextBlob's ConllExtractor
    blob_ConllExtractor = TextBlob(sentence, np_extractor=ConllExtractor())
    blob_ConllExtractor_list = list(set(blob_ConllExtractor.noun_phrases))
    # Terms from the NPExtractor above (Brown-trained tagger + semi-CFG)
    np_extractor = NPExtractor(sentence)
    result = np_extractor.extract()
    # Drop ConllExtractor phrases already found by NPExtractor
    for term in result:
        if term.lower() in blob_ConllExtractor_list:
            blob_ConllExtractor_list.remove(term.lower())
    # Merge the output of both extractors
    resultList = list(set(result) | set(blob_ConllExtractor_list))
    # Filter out common English words
    final_result = []
    for i in resultList:
        if i.lower() in PivotX_English_Common_Words_Curpos or i in PivotX_English_Common_Words_Curpos:
            continue
        final_result.append(i)
    final_result_ = list(set(final_result))
    return final_result_
# #Testing
# sentence = """We also saw that apart from thermal excitation, we can also use light in order to excite
# carriers across the band gap. If E g is the band gap of the material, the wavelength of light that
# is required in order to excite carriers is nothing, but hc/ λ. We can do this calculation for
# silicon, where we find that λ is approximately 1000 nanometers and this lies in the IR region.
# As long as we shine light with a wavelength that is less than 1,000 nanometers, which means
# the energy will be higher than the band gap you can always excite carriers from the valence
# band to the conduction band. So, this explains, while silicon is opaque because visible light
# has a wavelength less than 1000 nanometers, the visible range is from 400 to 800 nanometers,
# which means silicon will be able to absorb the visible light and produce electrons and holes.
# Similarly, SiO 2 which is glass has band gap of approximately 10 electron volts.
# """
# resultList = term_extraction(sentence)
# print(f"Predicted o/p: \n{resultList}\n")