Last active
May 6, 2020 16:39
-
-
Save schaunwheeler/57846d935b17383ac94979dc0f8784ba to your computer and use it in GitHub Desktop.
Example of how to use spaCy to process many texts at once
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from spacy import load as spacy_load | |
# This loads the large English pipeline (en_core_web_lg), which must be
# downloaded separately from the package installation, e.g. with
# `python -m spacy download en_core_web_lg`. Other models are available.
nlp = spacy_load('en_core_web_lg') | |
def doc_to_spans(list_of_texts, join_string=' ||| '):
    """Process many texts with a single ``nlp`` call and split the result.

    The texts are concatenated with ``join_string``, run through the
    module-level ``nlp`` pipeline once, and the resulting document is cut
    back apart at every token whose text equals the stripped separator.

    Args:
        list_of_texts: Iterable of strings to process together.
        join_string: Separator inserted between texts. Its stripped form
            must come out of the tokenizer as a single token (the default
            ``' ||| '`` is assumed to yield the token ``'|||'``), and it
            should not occur inside any of the texts, otherwise that text
            is split as well.

    Returns:
        A list of slices of the combined document, one per separator-
        delimited segment, in input order. An empty input yields ``[]``.

    Raises:
        ValueError: If ``join_string`` is empty or whitespace-only, since
            no token could then be recognised as the separator.
    """
    separator = join_string.strip()
    if not separator:
        raise ValueError('join_string must contain non-whitespace characters')

    # Materialise once so generators are supported and emptiness is testable.
    texts = list(list_of_texts)
    if not texts:
        # No texts means no spans; avoids returning a single empty slice.
        return []

    all_docs = nlp(join_string.join(texts))

    # Token positions of every separator occurrence in the combined doc.
    split_inds = [i for i, token in enumerate(all_docs) if token.text == separator]

    # Each segment starts right after a separator (or at 0 for the first one)
    # and ends at the next separator (or at the end of the doc for the last).
    starts = [0] + [i + 1 for i in split_inds]
    ends = split_inds + [len(all_docs)]
    return [all_docs[start:end] for start, end in zip(starts, ends)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Very nice, thank you!