Created
May 23, 2022 08:20
-
-
Save ljvmiranda921/dafdfd24c5a2904d210eb9c455b34941 to your computer and use it in GitHub Desktop.
Spans key weird behaviour
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
from spacy.tokens import DocBin, SpanGroup | |
from wasabi import msg | |
from copy import copy | |
def main():
    """Show how two ways of filling ``doc.spans`` survive a DocBin round trip.

    The same parsed sentence is copied twice. The first copy gets an unnamed
    ``SpanGroup`` assigned under the spans key and then extended with the
    doc's entities; the second copy gets a ``SpanGroup`` constructed with the
    key as its name and the entities passed in up front. Each copy is packed
    into its own ``DocBin``, read back with the pipeline's vocab, and the
    resulting ``spans`` of the first restored doc are printed for comparison.
    """
    sentence = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(sentence)
    key = "sc"

    def report_roundtrip(doc_bin):
        """Read the docs back out of ``doc_bin`` and print the first one's spans."""
        msg.text("Deserializing the DocBin and checking the output...")
        restored = list(doc_bin.get_docs(pipeline.vocab))
        print(restored[0].spans)

    # Approach 1: assign an unnamed SpanGroup under the key, then extend it.
    msg.info("Trying out SpanGroup approach")
    extended_doc = copy(parsed)  # work on a copy so the approaches don't share a doc
    if key not in extended_doc.spans:
        extended_doc.spans[key] = SpanGroup(extended_doc)
    extended_doc.spans[key].extend(list(extended_doc.ents))
    report_roundtrip(DocBin(docs=[extended_doc]))

    # Approach 2: build the SpanGroup with an explicit name and its spans.
    msg.info("Trying out iteration approach")
    named_doc = copy(parsed)  # work on a copy so the approaches don't share a doc
    entity_spans = list(named_doc.ents)
    named_doc.spans[key] = SpanGroup(named_doc, name=key, spans=entity_spans)
    report_roundtrip(DocBin(docs=[named_doc]))
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample output:
