Created
October 14, 2020 08:34
-
-
Save sevperez/883b2b89118747b65b8f4b5f168ce7cb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the spaCy model a single time at module level; the resulting `nlp`
# object is the pipeline applied to every document below.
nlp = spacy.load("en_core_web_md")
# tokenize documents
def spacy_doc(model, text, lower=True):
    """
    Run `text` through `model`, optionally lowercasing it first.

    - Parameters: model (callable spaCy model), text (string),
      lower (bool, defaults to True).
    - Returns: Whatever `model` returns for the text (a spaCy Document
      object when `model` is a spaCy pipeline). When lower is True the
      text is lowercased *before* it is handed to the model, so the
      model only ever sees the lowercase form.
    """
    prepared_text = text.lower() if lower else text
    return model(prepared_text)
# Process every entry of the "text" column with the shared pipeline
# (lowercased, since spacy_doc defaults to lower=True).
sotu_docs = [spacy_doc(nlp, text) for text in sotu_df["text"]]
# build dictionary
def get_token_texts(doc):
    """
    Collect the text of every token in a document.

    - Parameters: doc (spaCy Document object, or any iterable whose
      items expose a `text` attribute).
    - Returns: A list of strings holding the `text` value of each token
      in doc, in iteration order.
    """
    # Iterate the document directly; copying it into a temporary list
    # first adds nothing.
    return [token.text for token in doc]
def build_dictionary(doc_list):
    """
    Build a token dictionary covering a collection of documents.

    - Parameters: doc_list (list of spaCy Document objects).
    - Returns: A Gensim Dictionary constructed from the token texts of
      each document in doc_list (one list of strings per document).
    """
    tokenized_docs = [get_token_texts(doc) for doc in doc_list]
    return Dictionary(tokenized_docs)
# Dictionary built from the tokens of all processed documents.
sotu_dictionary = build_dictionary(sotu_docs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment