Created
October 14, 2020 08:35
-
-
Save sevperez/e035ec72b98c46623cecc7401709889c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def build_corpus(doc_list, dictionary):
    """
    Convert each document into a Gensim bag-of-words vector.

    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A list with one entry per document, in the same order as
      doc_list. Each entry is the result of dictionary.doc2bow applied to
      the output of get_token_texts for that document, i.e. a list of
      (token_id, token_count) tuples.
    """
    corpus = []
    for doc in doc_list:
        token_texts = get_token_texts(doc)
        corpus.append(dictionary.doc2bow(token_texts))
    return corpus
def build_td_matrix(doc_list, dictionary):
    """
    Build a dense matrix of token counts per document.

    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A 2D NumPy array of shape (len(doc_list), len(dictionary)).
      Each row corresponds to one document (same order as doc_list) and
      each column index is the id of a token in the dictionary. Values are
      the token counts, stored with np.zeros' default float dtype. An
      empty doc_list yields an array of shape (0, len(dictionary)).
    """
    # Build the corpus from the arguments, not from module-level globals,
    # so the function works for any document list / dictionary pair.
    corpus = build_corpus(doc_list, dictionary)

    # Preallocate the full matrix; tokens absent from a document keep the
    # initial zero, and the result is 2D even when there are no documents.
    tdm = np.zeros((len(corpus), len(dictionary)))
    for row_index, bow in enumerate(corpus):
        for token_id, token_count in bow:
            tdm[row_index, token_id] = token_count
    return tdm
def build_term_document_df(doc_list, dictionary):
    """
    Build a labelled DataFrame of token counts per document.

    - Parameters: doc_list (list of spaCy Document objects), dictionary
      (Gensim Dictionary object).
    - Returns: A term-document matrix in the form of a Pandas DataFrame,
      where each row is a document and each column is labelled with a
      token from the dictionary. Values are the token counts for the given
      document / token, stored with the nullable Int64 dtype.
    """
    tdm = build_td_matrix(doc_list, dictionary)

    # Column j of the matrix holds counts for token id j, so order the
    # labels by token id instead of relying on dict insertion order.
    token2id = dictionary.token2id
    cols = sorted(token2id, key=token2id.get)

    # Pass a dtype instance; handing pandas the bare Int64Dtype class
    # relies on implicit instantiation and triggers a warning in newer
    # pandas releases.
    return pd.DataFrame(tdm, columns=cols, dtype=pd.Int64Dtype())
# Build the term-document DataFrame from the module-level sotu_docs and
# sotu_dictionary objects.
sotu_td_df = build_term_document_df(
    doc_list=sotu_docs,
    dictionary=sotu_dictionary,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment