Hironsan · November 9, 2022 12:34
diff --git a/create_dataset.py b/create_dataset.py
 import spacy
 from spacy.tokens import DocBin
 from spacy_partial_tagger.tokenizer import CharacterTokenizer

 # Sample sentence to annotate.
 text = "Selegiline - induced postural hypotension in Parkinson's disease: a longitudinal study on the effects of drug withdrawal."

 # Each rule is (entity label, lower-cased tokens to match in sequence);
 # they are expanded below into the dict format passed to add_patterns().
 labelled_tokens = [
     ("Chemical", ["selegiline"]),
     ("Disease", ["hypotension"]),
     ("Disease", ["parkinson", "'s", "disease"]),
 ]
 patterns = [
     {"label": label, "pattern": [{"LOWER": word} for word in words]}
     for label, words in labelled_tokens
 ]

 # Rule-based pipeline: a blank "en" pipeline with an entity ruler added.
 ruler_nlp = spacy.blank("en")
 ruler_nlp.add_pipe("entity_ruler").add_patterns(patterns)

 # Record every match as (start_char, end_char, label) so the annotation
 # can be re-applied to a differently tokenized Doc.
 entities = []
 for ent in ruler_nlp(text).ents:
     entities.append((ent.start_char, ent.end_char, ent.label_))

 # Second blank pipeline whose tokenizer is swapped for CharacterTokenizer;
 # the same text is re-tokenized and the character offsets become spans.
 char_nlp = spacy.blank("en")
 char_nlp.tokenizer = CharacterTokenizer(char_nlp.vocab)
 char_doc = char_nlp.make_doc(text)
 spans = []
 for start, end, label in entities:
     spans.append(char_doc.char_span(start, end, label=label))
 char_doc.ents = spans

 # Serialize the annotated Doc to disk through a DocBin.
 doc_bin = DocBin()
 doc_bin.add(char_doc)
 doc_bin.to_disk("/path/to/data.spacy")
	import spacy
	from spacy.tokens import DocBin
	from spacy_partial_tagger.tokenizer import CharacterTokenizer

	# Sentence used as the single training example.
	text = "Selegiline - induced postural hypotension in Parkinson's disease: a longitudinal study on the effects of drug withdrawal."

	# One named rule per entity; each matches on lower-cased token text.
	selegiline_rule = {"label": "Chemical", "pattern": [{"LOWER": "selegiline"}]}
	hypotension_rule = {"label": "Disease", "pattern": [{"LOWER": "hypotension"}]}
	parkinsons_rule = {
	    "label": "Disease",
	    "pattern": [{"LOWER": "parkinson"}, {"LOWER": "'s"}, {"LOWER": "disease"}],
	}
	patterns = [selegiline_rule, hypotension_rule, parkinsons_rule]

	# Blank "en" pipeline with an entity ruler loaded with the rules above.
	matcher_nlp = spacy.blank("en")
	entity_ruler = matcher_nlp.add_pipe("entity_ruler")
	entity_ruler.add_patterns(patterns)

	# Run the ruler and keep each hit as (start_char, end_char, label).
	matched = matcher_nlp(text)
	entities = [
	    (span.start_char, span.end_char, span.label_) for span in matched.ents
	]

	# Fresh blank pipeline using CharacterTokenizer; the character offsets
	# collected above are converted back into entity spans on the new Doc.
	tagger_nlp = spacy.blank("en")
	tagger_nlp.tokenizer = CharacterTokenizer(tagger_nlp.vocab)
	char_level_doc = tagger_nlp.make_doc(text)
	char_level_doc.ents = [
	    char_level_doc.char_span(begin, stop, label=tag)
	    for begin, stop, tag in entities
	]

	# Pack the annotated Doc into a DocBin and write it out.
	doc_bin = DocBin(docs=[char_level_doc])
	doc_bin.to_disk("/path/to/data.spacy")
No results found