Created
May 27, 2018 10:35
-
-
Save DataTurks/f6035b1e58497d52bf88517ff7bf64cf to your computer and use it in GitHub Desktop.
Train Spacy NER example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random

import spacy
################### Train Spacy NER.########### | |
def train_spacy():
    """Train a blank English spaCy NER pipe and print a sample prediction.

    Loads training examples through ``convert_dataturks_to_spacy`` (defined
    outside this block), builds a blank ``'en'`` pipeline with an ``ner``
    component, registers every entity label found in the data, runs one
    training pass over the shuffled examples, and finally prints the
    entities predicted for a fixed sample sentence.

    The training data is consumed as ``(text, annotations)`` pairs, where
    ``annotations['entities']`` holds items whose third element is the
    entity label -- presumably ``(start, end, label)`` tuples; confirm
    against the converter. The data is shuffled in place, so it must be a
    mutable sequence.

    Returns:
        None. Losses and the predicted entities are written to stdout.
    """
    train_data = convert_dataturks_to_spacy("dataturks_downloaded.json")
    nlp = spacy.blank('en')  # create blank Language class

    # nlp.create_pipe works for built-ins that are registered with spaCy.
    # Fetch the existing pipe otherwise, so `ner` is always bound below.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the training data. Examples
    # without an 'entities' entry contribute no labels instead of crashing.
    for _, annotations in train_data:
        for ent in annotations.get('entities') or ():
            ner.add_label(ent[2])

    # Disable every other pipe so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(1):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses,
                )
            print(losses)

    # Run the freshly trained pipeline on a sample query.
    doc = nlp("Samsing mobiles below $100")
    print("Entities= " + str(["" + str(ent.text) + "_" + str(ent.label_) for ent in doc.ents]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
What loss function is used here?