Skip to content

Instantly share code, notes, and snippets.

@philnguyen
Last active December 10, 2015 16:06
Show Gist options
  • Select an option

  • Save philnguyen/1d3a0b477dcb8b0e03b9 to your computer and use it in GitHub Desktop.

Select an option

Save philnguyen/1d3a0b477dcb8b0e03b9 to your computer and use it in GitHub Desktop.
import pyvw
from nltk.corpus import wordnet as wn
valid_labels = {'n.act': 0,
'n.animal': 1,
'n.artifact': 2,
'n.attribute': 3,
'n.body': 4,
'n.cognition': 5,
'n.communication': 6,
'n.event': 7,
'n.feeling': 8,
'n.food': 9,
'n.group': 10,
'n.location': 11,
'n.motive': 12,
'n.natural_object': 13,
'n.other': 14,
'n.person': 15,
'n.phenomenon': 16,
'n.plant': 17,
'n.possession': 18,
'n.process': 19,
'n.quantity': 20,
'n.relation': 21,
'n.shape': 22,
'n.state': 23,
'n.substance': 24,
'n.time': 25,
'v.body': 26,
'v.change': 27,
'v.cognition': 28,
'v.communication': 29,
'v.competition': 30,
'v.consumption': 31,
'v.contact': 32,
'v.creation': 33,
'v.emotion': 34,
'v.motion': 35,
'v.perception': 36,
'v.possession': 37,
'v.social': 38,
'v.stative': 39,
'v.weather': 40}
valid_labels_rev = { v:k for k,v in valid_labels.iteritems() }
class BIO:
# construct a BIO object using a bio type ('O', 'B' or 'I') and a
# optionally a label (that can be used to capture the supersense tag).
# this additionally computes a numeric_label to be used by vw
def __init__(self, bio, label=None):
if bio != 'O' and bio != 'B' and bio != 'I':
raise TypeError
self.bio = bio
self.label = None # the label will only be needed for supersenses
self.numeric_label = 1
if self.bio == 'B':
self.numeric_label = 2
elif self.bio == 'I':
self.numeric_label = 3
# a.can_follow(b) returns true if:
# a is O and b is I or O or
# a is B and b is I or O or
# ...
def can_follow(self, prev):
return (self.bio == 'O' and (prev.bio == 'I' or prev.bio == 'O') ) or \
(self.bio == 'B' and (prev.bio == 'I' or prev.bio == 'O') ) or \
(self.bio == 'I' and (prev.bio == 'B' or prev.bio == 'I') )
# given a label, produce a list of all valid BIO items that can
# come next.
def valid_next(self):
return [b for b in [BIO('B'), BIO('I'), BIO('O')] if b.can_follow(self)]
# produce a human-readable string
def __str__( self): return self.bio #return 'O' if self.bio == 'O' else (self.bio + '-' + self.label)
def __repr__(self): return self.__str__()
# compute equality
def __eq__(self, other):
if not isinstance(other, BIO): return False
return self.bio == other.bio and self.label == other.label
def __ne__(self, other): return not self.__eq__(other)
# convert a numerical prediction back to a BIO label
def numeric_label_to_BIO(num):
if not isinstance(num, int):
raise TypeError
if num == 1:
return BIO('O')
elif num == 2:
return BIO('B')
elif num == 3:
return BIO('I')
# given a previous PREDICTED label (prev), which may be incorrect; and
# the current TRUE label (truth), generate a list of valid reference
# actions. the return type should be [BIO]. for example, if the truth
# is O or B, then regardless of what prev is the correct thing to do
# is [truth]. the most important thing is to handle the case when, for
# instance, truth is I but prev is neither I nor B
def compute_reference(prev, truth):
if truth.bio!='I':
return [truth]
elif truth.bio=='I':
if prev.bio!='O':
return [ truth ] # TODO
elif prev.bio=='O':
return [truth] # TODO
# POS -> wn.POS
def pos_2_wn_pos(pos):
return {"NOUN": wn.NOUN, "VERB": wn.VERB, "ADJ" : wn.ADJ, "ADV" : wn.ADV}.get(pos)
# String POS Int -> String
# Return word's upto-nth hypernym
def nth_hypernym(word, pos, n):
# Synset Int -> Synset
def loop(synset, k):
if k > 0:
hypernyms = synset.hypernyms()
return loop(hypernyms[0], k - 1) if hypernyms else synset
else: return synset
synsets = wn.synsets(word, pos = pos_2_wn_pos(pos))
return loop(synsets[0], n).name() if synsets else word
# String POS -> String
# Return the name of the word's supersense, or the word itself if can't find
def top_hypernym(word, pos):
synsets = wn.synsets(word, pos = pos_2_wn_pos(pos))
if synsets:
hypernyms = synsets[0].root_hypernyms()
return hypernyms[0].name() if hypernyms else word
else: return word
class MWE(pyvw.SearchTask):
def __init__(self, vw, sch, num_actions):
# you must must must initialize the parent class
# this will automatically store self.sch <- sch, self.vw <- vw
pyvw.SearchTask.__init__(self, vw, sch, num_actions)
# for now we will use AUTO_HAMMING_LOSS; in Part II, you should remove this and implement a more task-focused loss
# like one-minus-F-measure.
sch.set_options( sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES )
def _run(self, sentence):
output = []
prev = BIO('O') # store the previous prediction
for n in range(len(sentence)):
# label is a BIO, word is a string and pos is a string
label,word,lemma,pos = sentence[n]
poses = [sentence[i][3] for i in [n-1,n,n+1] if 0 <= i < len(sentence)]
hyps = [top_hypernym(sentence[i][1], sentence[i][3])
for i in [n-1,n,n+1] if 0 <= i < len(sentence)]
with self.make_example(word, lemma, poses, hyps) as ex: # construct the VW example
# first, compute the numeric labels for all valid reference actions
refs = [ bio.numeric_label for bio in compute_reference(prev, label) ]
# next, because some actions are invalid based on the
# previous decision, we need to compute a list of
# valid actions available at this point
valid = [ bio.numeric_label for bio in prev.valid_next() ]
# make a prediction
pred = self.sch.predict(examples = ex,
my_tag = n+1,
oracle = refs,
condition = [(n, 'p'), (n, 'h'), (n-1, 'q')],
allowed = valid)
# map that prediction back to a BIO label
this = numeric_label_to_BIO(pred)
# append it to output
output.append(this)
# update the 'previous' prediction to the current
prev = this
# return the list of predictions as BIO labels
return output
def make_example(self, word, lemma, poses, hypers):
return self.example({
'w': [word],
'l': [lemma],
'p': poses,
'h': hypers
})
def make_data(BIO,filename):
data = []
sentence = []
f = open(filename,'r')
for l in f:
l = l.strip()
# at end of sentence
if l == "":
data.append(sentence)
sentence = []
else:
[offset,word,lemma,pos,mwe,parent,strength,ssense,sid] = l.split('\t')
sentence.append((BIO(mwe),word,lemma,pos))
return data
if __name__ == "__main__":
# input/output files
trainfilename='dimsum16.p3.train.contiguous'
testfilename='dimsum16.p3.test.contiguous'
outfilename='dimsum16.p3.test.contiguous.out'
# read in some examples to be used as training/dev set
train_data = make_data(BIO,trainfilename)
# initialize VW and sequence labeler as learning to search
vw = pyvw.vw(search=3, quiet=True, search_task='hook', ring_size=1024, \
search_rollin='learn', search_rollout='none')
# tell VW to construct your search task object
sequenceLabeler = vw.init_search_task(MWE)
# train!
# we make 5 passes over the training data, training on the first 80%
# examples (we retain the last 20% as development data)
print 'training!'
N = int(0.8 * len(train_data))
for i in xrange(5):
print 'iteration ', i, ' ...'
sequenceLabeler.learn(train_data[0:N])
# now see the predictions on 20% held-out sentences
print 'predicting!'
hamming_loss, total_words = 0,0
for n in range(N, len(train_data)):
truth = [label for label,word,lemma,pos in train_data[n]]
pred = sequenceLabeler.predict( [(BIO('O'),word,lemma,pos) for label,word,lemma,pos in train_data[n]] )
for i,t in enumerate(truth):
if t != pred[i]:
hamming_loss += 1
total_words += 1
# print 'predicted:', '\t'.join(map(str, pred))
# print ' truth:', '\t'.join(map(str, truth))
# print ''
print 'total hamming loss on dev set:', hamming_loss, '/', total_words
# In Part II, you will have to output predictions on the test set.
#test_data = make_data(BIO,testfilename)
#for n in range(N, len(test_data)):
# make predictions for current sentence
#pred = sequenceLabeler.predict( [(BIO('O'),word,lemma,pos) for label,word,lemma,pos in train_data[n]] )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment