# Created May 3, 2017 12:54
# spaCy tagger training desperation
from random import shuffle

from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
from spacy.gold import GoldParse
from spacy.en.tag_map import TAG_MAP
def read_postagged(fn):
    """Read a slash-tagged corpus file with one sentence per line.

    Every whitespace-separated token is expected to look like ``word/TAG``.
    Only the *last* slash separates the tag, so a word that itself contains
    slashes (``1/2/CD``) is kept intact. Lines without any token are skipped.

    Args:
        fn: Path of the tagged text file.

    Returns:
        A pair ``(sentences, alltags)``. ``sentences`` is a list of
        ``(words, tags)`` tuples holding two parallel lists of strings;
        ``alltags`` is the set of every tag that occurs in the file.
    """
    alltags = set()
    returnset = []
    with open(fn) as f:
        # Iterate the file lazily instead of materialising all lines first.
        for line in f:
            ws, ts = [], []
            for wt in line.split():
                # rpartition splits on the last '/'. A token without any
                # slash yields an empty word and the whole token as its tag.
                w, _, t = wt.rpartition('/')
                ws.append(w)
                ts.append(t)
            if ws:
                alltags.update(ts)
                returnset.append((ws, ts))
    return returnset, alltags
# Load both corpora when the script starts. The training file also supplies
# `alltags`, which is used further down to build the tag map; the tag set of
# the test file is discarded.
# NOTE(review): the file names look like WSJ section ranges (0-18 train,
# 22 test) -- confirm against the data preparation step.
trainset, alltags = read_postagged("/tmp/0-18.tagged")
testset , _ = read_postagged("/tmp/22.tagged")
def eval(t):
    """Print and return the token-level tagging accuracy of ``t`` on ``testset``.

    For every test sentence a fresh ``Doc`` is built from the gold words, the
    tagger is run on it, and the predicted ``tag_`` of each token is compared
    with the gold tag.

    NOTE: the name shadows the built-in ``eval``; it is kept unchanged so
    existing callers keep working.

    Args:
        t: Callable that annotates a ``Doc`` in place when called as
            ``t(doc)`` (the spaCy ``Tagger`` trained by this script).

    Returns:
        The accuracy as a percentage in ``[0, 100]``; ``0.0`` when the test
        set holds no tokens (instead of raising ``ZeroDivisionError``).
    """
    right = 0
    wrong = 0
    for ws, gold in testset:
        doc = Doc(vocab, words=ws)
        # Without this call the doc carries no predicted tags at all.
        t(doc)
        pred = [word.tag_ for word in doc]
        for g, p in zip(gold, pred):
            if g == p:
                right += 1
            else:
                wrong += 1
    total = right + wrong
    acc = 100 * right / total if total else 0.0
    print(f"Accuracy: {acc:.2f}")
    return acc
# Map every tag seen in the training data to the coarse POS 'X'; each tag
# gets its own dict object.
tag_map = {}
for tag in alltags:
    tag_map[tag] = {'pos': 'X'}
vocab = Vocab(tag_map=tag_map)

# Look up every word of the training AND test sentences once so that each of
# them has an entry in the vocab before the tagger is created.
for words, _tags in trainset + testset:
    for word in words:
        _ = vocab[word]

tagger = Tagger(vocab)
# Train for 50 epochs, reporting held-out accuracy after each one.
for i in range(50):
    print(f"Epoch {i}:")
    # Present the sentences in a new random order each epoch; the updates
    # below are applied one sentence at a time, so a fixed order would be
    # replayed identically every pass.
    shuffle(trainset)
    for ws, ts in trainset:
        doc = Doc(vocab, words=ws)
        gold = GoldParse(doc, tags=ts)
        tagger.update(doc, gold)
    # Gives the "Epoch" header something to report.
    eval(tagger)
# NOTE(review): spaCy 1.x training examples call
# `tagger.model.end_training()` after the final epoch -- confirm whether
# that is needed for the installed spaCy version.
