Created
January 6, 2024 23:21
-
-
Save thatbudakguy/9abd74d89cbc1af7cc40a2a9bcad9712 to your computer and use it in GitHub Desktop.
CoNLL-2002 and CoNLL-U generators (spaCy)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Auto-generate CoNLL-2002 (IOB) entities by tagging a text file.""" | |
from pathlib import Path | |
from typing import Optional | |
from typing_extensions import Annotated | |
import spacy | |
from spacy.training import offsets_to_biluo_tags | |
from spacy.training import biluo_to_iob | |
import typer | |
def generate(
    input_path: Path,
    model_name: Annotated[Optional[str], typer.Argument()] = "en_core_web_lg"
) -> None:
    """Tag a text file with spaCy and print its entities as CoNLL-2002 (IOB).

    Every token is written on its own line as ``<token> <tag>``, and each
    sentence is followed by a blank line. The result is echoed to stdout
    with leading and trailing whitespace stripped.

    Args:
        input_path: Path of the text file to tag.
        model_name: Name of the spaCy pipeline handed to ``spacy.load``.

    Raises:
        typer.BadParameter: If ``input_path`` is not an existing file.
    """
    # Validate explicitly: an ``assert`` is stripped when Python runs with -O,
    # which would let a bad path fall through to a less helpful error.
    if not input_path.is_file():
        raise typer.BadParameter(f"{input_path} is not a file")

    # Read as UTF-8 rather than whatever the platform's locale default is.
    input_text = input_path.read_text(encoding="utf-8").strip()

    # load the model and parse the text
    nlp = spacy.load(model_name)
    parsed = nlp(input_text)

    # convert to conll-2002 (IOB) format, one output line per list entry
    # NOTE(review): token texts are written verbatim; if the pipeline emits
    # whitespace-only tokens (e.g. newlines) they end up in the output as-is.
    # Confirm whether those should be filtered.
    lines = []
    for sentence in parsed.sents:
        # Work on a standalone Doc for the sentence; the entity offsets are
        # read from that same Doc so they line up with its tokens.
        sent_doc = sentence.as_doc()
        ents = [
            (ent.start_char, ent.end_char, ent.label_) for ent in sent_doc.ents
        ]
        iob_tags = biluo_to_iob(offsets_to_biluo_tags(sent_doc, ents))
        tokens = [token.text for token in sentence]
        # Internal invariant: exactly one tag per token.
        assert len(tokens) == len(iob_tags)
        lines.extend(f"{token} {tag}" for token, tag in zip(tokens, iob_tags))
        # Empty entry -> blank line separating sentences.
        lines.append("")

    # write to stdout
    typer.echo("\n".join(lines).strip())
if __name__ == "__main__":
    # Script entry point: hand ``generate`` to Typer so it is run as the
    # command-line program.
    typer.run(generate)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Auto-generate CoNLL-U output by tagging a text file.""" | |
from pathlib import Path | |
from typing import Optional | |
from typing_extensions import Annotated | |
import spacy | |
import spacy_conll | |
import typer | |
def generate(
    input_path: Path,
    model_name: Annotated[Optional[str], typer.Argument()] = "en_core_web_lg",
) -> None:
    """Tag a text file with spaCy and print the result as CoNLL-U.

    The ``conll_formatter`` component is appended to the loaded pipeline
    with headers enabled, and the document's ``conll_str`` extension is
    echoed to stdout with leading and trailing whitespace stripped.

    Args:
        input_path: Path of the text file to tag.
        model_name: Name of the spaCy pipeline handed to ``spacy.load``.

    Raises:
        typer.BadParameter: If ``input_path`` is not an existing file.
    """
    # Validate explicitly: an ``assert`` is stripped when Python runs with -O,
    # which would let a bad path fall through to a less helpful error.
    if not input_path.is_file():
        raise typer.BadParameter(f"{input_path} is not a file")

    # Read as UTF-8 rather than whatever the platform's locale default is.
    input_text = input_path.read_text(encoding="utf-8").strip()

    # load the model and add output formatter as the final pipeline step
    nlp = spacy.load(model_name)
    nlp.add_pipe("conll_formatter", last=True, config={"include_headers": True})

    # parse the doc and read the formatted string from its extension attribute
    doc = nlp(input_text)
    output = doc._.conll_str

    # write to stdout
    typer.echo(output.strip())
if __name__ == "__main__":
    # Script entry point: hand ``generate`` to Typer so it is run as the
    # command-line program.
    typer.run(generate)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment