Last active
August 29, 2019 20:44
-
-
Save sanusart/464c48aec3e912c842ee11b7689a67bf to your computer and use it in GitHub Desktop.
Create training data #spacy #nlp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English

# English language object; its shared vocab backs the rule-based matcher.
nlp = English()
matcher = Matcher(nlp.vocab)

# Token patterns describing the gadget mentions we want to label.
# pattern1: a token whose lowercase form is "iphone" followed by one whose
# lowercase form is "x" (e.g. "iPhone X").
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# pattern2: "iphone" optionally followed by a digit token (e.g. "iPhone 8",
# or just "iPhone" because of the "?" operator).
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]

# Register both patterns under the "GADGET" key. This is the spaCy v2 call
# form add(key, on_match, *patterns); None means no on_match callback.
matcher.add("GADGET", None, pattern1, pattern2)
# Build NER training examples of the form (text, {"entities": [...]}) from
# the rule-based matches.
# NOTE(review): TEXTS is not defined in this snippet -- presumably an iterable
# of raw strings supplied elsewhere; confirm before running.
TRAINING_DATA = []
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans.
    spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
    # The matcher reports every match, so "iPhone X" yields both "iPhone X"
    # (pattern1) and "iPhone" (pattern2 with its optional digit absent).
    # Entity annotations must not overlap, so keep only the longest
    # non-overlapping spans.
    spans = spacy.util.filter_spans(spans)
    # Get (start character, end character, label) tuples of the matches.
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Store the example as a (doc.text, annotations) tuple.
    TRAINING_DATA.append((doc.text, {"entities": entities}))
# Structure of the resulting training data (illustrative sample).
# Each item is (text, {'entities': [(start_char, end_char, label), ...]});
# the offsets index into the text, e.g. text[20:28] == "iPhone X".
# NOTE: this literal rebinds TRAINING_DATA, replacing any list built earlier.
TRAINING_DATA = [
    ("How to preorder the iPhone X", {'entities': [(20, 28, 'GADGET')]}),
    ("Examples without labels are also needed", {'entities': []}),
    # And many more examples...
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
hghj |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment