Skip to content

Instantly share code, notes, and snippets.

@DataTurks
Created May 27, 2018 10:17
Show Gist options
  • Save DataTurks/f685f6372b59aa12ff928c75d3e38a61 to your computer and use it in GitHub Desktop.
Save DataTurks/f685f6372b59aa12ff928c75d3e38a61 to your computer and use it in GitHub Desktop.
Use a pickled training file to train a spaCy NER model.
def train_spacy(training_pickle_file, n_iter=1, drop=0.2):
    """Train a blank English spaCy NER pipeline from pickled training data.

    The pickle file must contain a mutable sequence (it is shuffled in
    place) of ``(text, annotations)`` pairs, where ``annotations`` is a dict.
    Every item under its ``'entities'`` key contributes its element at
    index 2 as an entity label.

    NOTE(review): the calls used here (``nlp.create_pipe``,
    ``nlp.begin_training``, ``nlp.update(texts, annotations, ...)``) match
    the spaCy v2 training API -- confirm the installed spaCy version.

    Args:
        training_pickle_file: Path to the pickled training data.
        n_iter: Number of passes over the training data (default 1, the
            value that was previously hard-coded).
        drop: Dropout rate passed to ``nlp.update`` (default 0.2, the
            value that was previously hard-coded).

    Returns:
        The trained ``nlp`` object, so that callers can use or save the
        model. (Earlier versions of this function returned ``None``.)
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only pass files that come from a trusted source.
    with open(training_pickle_file, 'rb') as pickle_file:
        train_data = pickle.load(pickle_file)

    nlp = spacy.blank('en')  # create blank Language class

    # Make sure `ner` is bound on both paths; previously it was only
    # assigned when the component had to be created.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the training data. `or ()` keeps
    # examples without an 'entities' entry from raising a TypeError.
    for _, annotations in train_data:
        for ent in annotations.get('entities') or ():
            ner.add_label(ent[2])

    # Disable every other pipe so that only NER is updated during training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}  # reset per iteration; nlp.update accumulates into it
            for text, annotations in train_data:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=drop,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    return nlp
@Nisit007
Copy link

Traceback (most recent call last):

File "", line 1, in
for _, annotations in TRAIN_DATA:

NameError: name 'TRAIN_DATA' is not defined

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment