Last active
December 28, 2019 16:48
-
-
Save GeorgeSeif/de4c0da2e9035e64a5f3bed6a5a4c501 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
def get_tf_idf(vectorizer, matrix=None):
    """Build a DataFrame of TF-IDF weights, one row per document.

    Args:
        vectorizer: A fitted vectorizer exposing ``get_feature_names_out()``
            (scikit-learn >= 1.0) or the legacy ``get_feature_names()``.
        matrix: The sparse document-term matrix whose columns line up with
            the vectorizer's feature names. When omitted, the module-level
            ``vectors`` variable is used so that existing one-argument
            calls keep working.

    Returns:
        A ``pandas.DataFrame`` with one column per feature name and one
        row per row of the matrix.
    """
    if matrix is None:
        # Backward compatibility: older callers set a global ``vectors``
        # before calling this function with only the vectorizer.
        matrix = vectors

    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for older installations.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = names_getter()

    # ``toarray`` yields a plain ndarray, avoiding the np.matrix -> list
    # round trip while producing the same values.
    return pd.DataFrame(matrix.toarray(), columns=feature_names)
# Three short sample documents used as the corpus.
doc_1 = (
    "TF-IDF uses statistics to measure how important a word is to "
    "a particular document"
)
doc_2 = (
    "The TF-IDF is perfectly balanced, considering both local and global "
    "levels of statistics for the target word."
)
doc_3 = (
    "Words that occur more frequently in a document are weighted higher, "
    "but only if they're more rare within the whole document."
)
documents_list = [doc_1, doc_2, doc_3]

# Fit the vectorizer on the corpus. ``vectors`` is kept as a module-level
# name because get_tf_idf reads it when called with only the vectorizer.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(documents_list)

# Print the TF-IDF data for all words across all documents.
tfidf_data = get_tf_idf(vectorizer)
print(tfidf_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment