Last active
February 13, 2019 08:26
-
-
Save mohdsanadzakirizvi/9f967917079b53e2db997519e1d793bf to your computer and use it in GitHub Desktop.
Intro to Stanford NLP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inspect the first sentence of the annotation.
# `sentence` is a Sentence protobuf produced by CoreNLPClient.annotate
# elsewhere in this file.

# get the dependency parse of the first sentence
print('---')
print('dependency parse of first sentence')
dependency_parse = sentence.basicDependencies
print(dependency_parse)

# get the first token of the first sentence
print('---')
print('first token of first sentence')
token = sentence.token[0]
print(token)

# get the part-of-speech tag
# (the original had a bare `token.pos` expression statement here whose
# value was discarded — a no-op; removed, the print below suffices)
print('---')
print('part of speech tag of token')
print(token.pos)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inspect entity/coref results, then demonstrate TokensRegex and Semgrex.
# `token`, `sentence`, `ann`, `client`, and `text` are defined elsewhere
# in this file by the CoreNLP setup block.

# get the named entity tag
print('---')
print('named entity tag of token')
print(token.ner)

# get an entity mention from the first sentence
print('---')
print('first entity mention in sentence')
print(sentence.mentions[0])

# access the coref chain
print('---')
print('coref chains for the example')
print(ann.corefChain)

# Use tokensregex patterns to find who wrote a sentence.
pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/'
matches = client.tokensregex(text, pattern)
# sentences contains a list with matches for each sentence.
assert len(matches["sentences"]) == 3
# length tells you whether or not there are any matches in this
assert matches["sentences"][1]["length"] == 1
# You can access matches like most regex groups.
# NOTE(fix): the original wrote these two comparisons as bare expression
# statements, silently discarding the result — asserted now, matching the
# assert style of the checks above.
assert matches["sentences"][1]["0"]["text"] == "Chris wrote a simple sentence"
assert matches["sentences"][1]["0"]["1"]["text"] == "Chris"

# Use semgrex patterns to directly find who wrote what.
pattern = '{word:wrote} >nsubj {}=subject >dobj {}=object'
matches = client.semgrex(text, pattern)
# sentences contains a list with matches for each sentence.
assert len(matches["sentences"]) == 3
# length tells you whether or not there are any matches in this
assert matches["sentences"][1]["length"] == 1
# You can access matches like most regex groups.
# NOTE(fix): same defect as above — these three were discarded comparisons.
assert matches["sentences"][1]["0"]["text"] == "wrote"
assert matches["sentences"][1]["0"]["$subject"]["text"] == "Chris"
assert matches["sentences"][1]["0"]["$object"]["text"] == "sentence"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from stanfordnlp.server import CoreNLPClient

# The example text we will annotate.
print('---')
print('input text')
print('')
text = "Chris Manning is a nice person. Chris wrote a simple sentence. He also gives oranges to people."
print(text)

# Spin up the Java CoreNLP server, annotate the text, and keep the first
# sentence around for the inspection steps elsewhere in this file.
print('---')
print('starting up Java Stanford CoreNLP Server...')
annotator_pipeline = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'depparse', 'coref']
with CoreNLPClient(annotators=annotator_pipeline, timeout=30000, memory='16G') as client:
    # submit the request to the server
    ann = client.annotate(text)
    # first sentence of the annotated document
    sentence = ann.sentence[0]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def extract_lemma(doc):
    """Return a DataFrame pairing each word's surface text with its lemma.

    `doc` is an annotated document exposing `sentences`, each of which
    exposes `words` with `.text` and `.lemma` attributes.
    """
    # Flatten the document into a single word sequence, then build the
    # two columns in one pass each.
    all_words = [w for sent in doc.sentences for w in sent.words]
    return pd.DataFrame({
        'word': [w.text for w in all_words],
        'lemma': [w.lemma for w in all_words],
    })
# Call the extractor on the annotated document `doc` (produced elsewhere by
# the stanfordnlp pipeline).  The return value is discarded here — presumably
# this runs in a notebook, where the resulting DataFrame is rendered; verify.
extract_lemma(doc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Dictionary mapping POS tags (Penn Treebank plus a few tags common in
# Indian-language treebanks) to human-readable explanations.
pos_dict = {
    'CC': 'coordinating conjunction','CD': 'cardinal digit','DT': 'determiner',
    'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
    'FW': 'foreign word','IN': 'preposition/subordinating conjunction','JJ': 'adjective \'big\'',
    'JJR': 'adjective, comparative \'bigger\'','JJS': 'adjective, superlative \'biggest\'',
    'LS': 'list marker 1)','MD': 'modal could, will','NN': 'noun, singular \'desk\'',
    'NNS': 'noun plural \'desks\'','NNP': 'proper noun, singular \'Harrison\'',
    'NNPS': 'proper noun, plural \'Americans\'','PDT': 'predeterminer \'all the kids\'',
    'POS': 'possessive ending parent\'s','PRP': 'personal pronoun I, he, she',
    'PRP$': 'possessive pronoun my, his, hers','RB': 'adverb very, silently,',
    'RBR': 'adverb, comparative better','RBS': 'adverb, superlative best',
    'RP': 'particle give up','TO': 'to go \'to\' the store.','UH': 'interjection errrrrrrrm',
    'VB': 'verb, base form take','VBD': 'verb, past tense took',
    'VBG': 'verb, gerund/present participle taking','VBN': 'verb, past participle taken',
    'VBP': 'verb, sing. present, non-3d take','VBZ': 'verb, 3rd person sing. present takes',
    'WDT': 'wh-determiner which','WP': 'wh-pronoun who, what','WP$': 'possessive wh-pronoun whose',
    'WRB': 'wh-abverb where, when','QF' : 'quantifier, bahut, thoda, kam (Hindi)','VM' : 'main verb',
    'PSP' : 'postposition, common in indian langs','DEM' : 'demonstrative, common in indian langs'
}

def extract_pos(doc):
    """Return a DataFrame of each word, its POS tag, and the tag's
    human-readable explanation ('NA' for tags not in pos_dict).

    `doc` is an annotated document exposing `sentences`, each of which
    exposes `words` with `.text` and `.pos` attributes.
    """
    parsed_text = {'word': [], 'pos': [], 'exp': []}
    for sent in doc.sentences:
        for wrd in sent.words:
            parsed_text['word'].append(wrd.text)
            parsed_text['pos'].append(wrd.pos)
            # dict.get with a default replaces the original's
            # `if wrd.pos in pos_dict.keys(): ... else: 'NA'` — one lookup
            # instead of a membership test plus an indexed lookup.
            parsed_text['exp'].append(pos_dict.get(wrd.pos, 'NA'))
    # return a dataframe of pos and text
    return pd.DataFrame(parsed_text)
# Call the POS extractor on the annotated document `doc` (produced elsewhere
# by the stanfordnlp pipeline).  The return value is discarded here —
# presumably this runs in a notebook, where the DataFrame is rendered; verify.
extract_pos(doc)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.