Last active
February 13, 2019 08:26
-
-
Save mohdsanadzakirizvi/9f967917079b53e2db997519e1d793bf to your computer and use it in GitHub Desktop.
Intro to Stanford NLP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inspect the first sentence of the annotation.
# `sentence` is a Sentence protobuf produced by CoreNLPClient.annotate
# elsewhere in this file.

# get the dependency parse of the first sentence
print('---')
print('dependency parse of first sentence')
dependency_parse = sentence.basicDependencies
print(dependency_parse)

# get the first token of the first sentence
print('---')
print('first token of first sentence')
token = sentence.token[0]
print(token)

# get the part-of-speech tag
# (the original had a bare `token.pos` expression statement here whose
# value was discarded — a no-op; removed, the print below suffices)
print('---')
print('part of speech tag of token')
print(token.pos)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inspect entity/coref results, then demonstrate TokensRegex and Semgrex.
# `token`, `sentence`, `ann`, `client`, and `text` are defined elsewhere
# in this file by the CoreNLP setup block.

# get the named entity tag
print('---')
print('named entity tag of token')
print(token.ner)

# get an entity mention from the first sentence
print('---')
print('first entity mention in sentence')
print(sentence.mentions[0])

# access the coref chain
print('---')
print('coref chains for the example')
print(ann.corefChain)

# Use tokensregex patterns to find who wrote a sentence.
pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/'
matches = client.tokensregex(text, pattern)
# sentences contains a list with matches for each sentence.
assert len(matches["sentences"]) == 3
# length tells you whether or not there are any matches in this
assert matches["sentences"][1]["length"] == 1
# You can access matches like most regex groups.
# NOTE(fix): the original wrote these two comparisons as bare expression
# statements, silently discarding the result — asserted now, matching the
# assert style of the checks above.
assert matches["sentences"][1]["0"]["text"] == "Chris wrote a simple sentence"
assert matches["sentences"][1]["0"]["1"]["text"] == "Chris"

# Use semgrex patterns to directly find who wrote what.
pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object'
matches = client.semgrex(text, pattern)
# sentences contains a list with matches for each sentence.
assert len(matches["sentences"]) == 3
# length tells you whether or not there are any matches in this
assert matches["sentences"][1]["length"] == 1
# You can access matches like most regex groups.
# NOTE(fix): same defect as above — these three were discarded comparisons.
assert matches["sentences"][1]["0"]["text"] == "wrote"
assert matches["sentences"][1]["0"]["$subject"]["text"] == "Chris"
assert matches["sentences"][1]["0"]["$object"]["text"] == "sentence"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from stanfordnlp.server import CoreNLPClient

# The example text we will annotate.
print('---')
print('input text')
print('')
text = "Chris Manning is a nice person. Chris wrote a simple sentence. He also gives oranges to people."
print(text)

# Spin up the Java CoreNLP server, annotate the text, and keep the first
# sentence around for the inspection steps elsewhere in this file.
print('---')
print('starting up Java Stanford CoreNLP Server...')
annotator_pipeline = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'depparse', 'coref']
with CoreNLPClient(annotators=annotator_pipeline, timeout=30000, memory='16G') as client:
    # submit the request to the server
    ann = client.annotate(text)
    # first sentence of the annotated document
    sentence = ann.sentence[0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def extract_lemma(doc):
    """Return a DataFrame pairing each word's surface text with its lemma.

    `doc` is an annotated document exposing `sentences`, each of which
    exposes `words` with `.text` and `.lemma` attributes.
    """
    # Flatten the document into a single word sequence, then build the
    # two columns in one pass each.
    all_words = [w for sent in doc.sentences for w in sent.words]
    return pd.DataFrame({
        'word': [w.text for w in all_words],
        'lemma': [w.lemma for w in all_words],
    })
# Call the extractor on the annotated document `doc` (produced elsewhere by
# the stanfordnlp pipeline).  The return value is discarded here — presumably
# this runs in a notebook, where the resulting DataFrame is rendered; verify.
extract_lemma(doc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dictionary mapping POS tags (Penn Treebank plus a few tags common in
# Indian-language treebanks) to human-readable explanations.
pos_dict = {
    'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner',
    'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
    'FW': 'foreign word','IN': 'preposition/subordinating conjunction','JJ': 'adjective \'big\'',
    'JJR': 'adjective, comparative \'bigger\'','JJS': 'adjective, superlative \'biggest\'',
    'LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular \'desk\'',
    'NNS': 'noun plural \'desks\'','NNP': 'proper noun, singular \'Harrison\'',
    'NNPS': 'proper noun, plural \'Americans\'','PDT': 'predeterminer \'all the kids\'',
    'POS': 'possessive ending parent\'s','PRP': 'personal pronoun I, he, she',
    'PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,',
    'RBR': 'adverb, comparative better','RBS': 'adverb, superlative best',
    'RP': 'particle give up','TO': 'to go \'to\' the store.','UH': 'interjection errrrrrrrm',
    'VB': 'verb, base form take','VBD': 'verb, past tense took',
    'VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken',
    'VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes',
    'WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose',
    'WRB': 'wh-abverb where, when','QF' : 'quantifier, bahut, thoda, kam (Hindi)','VM' : 'main verb',
    'PSP' : 'postposition, common in indian langs','DEM' : 'demonstrative, common in indian langs'
}

def extract_pos(doc):
    """Return a DataFrame of each word, its POS tag, and the tag's
    human-readable explanation ('NA' for tags not in pos_dict).

    `doc` is an annotated document exposing `sentences`, each of which
    exposes `words` with `.text` and `.pos` attributes.
    """
    parsed_text = {'word': [], 'pos': [], 'exp': []}
    for sent in doc.sentences:
        for wrd in sent.words:
            parsed_text['word'].append(wrd.text)
            parsed_text['pos'].append(wrd.pos)
            # dict.get with a default replaces the original's
            # `if wrd.pos in pos_dict.keys(): ... else: 'NA'` — one lookup
            # instead of a membership test plus an indexed lookup.
            parsed_text['exp'].append(pos_dict.get(wrd.pos, 'NA'))
    # return a dataframe of pos and text
    return pd.DataFrame(parsed_text)
# Call the POS extractor on the annotated document `doc` (produced elsewhere
# by the stanfordnlp pipeline).  The return value is discarded here —
# presumably this runs in a notebook, where the DataFrame is rendered; verify.
extract_pos(doc)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.