Created
April 23, 2021 16:10
-
-
Save ivopbernardo/0bf0304e2bd8255ecbcfdb32695d5557 to your computer and use it in GitHub Desktop.
Python Text Representation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import sklearn vectorizers and pandas | |
import pandas as pd | |
from sklearn.feature_extraction.text import ( | |
CountVectorizer, | |
TfidfVectorizer | |
) | |
# Example sentences used to fit every vectorizer below.
# The third sentence repeats several words ("went", "to", "the", "store"),
# which is what makes binary and count representations differ.
sentence_list = [
    'I went to the grocery store',
    'I went to the movie theater',
    'I went to the grocery store and then went to the bike store',
]
# Also wrap the sentences in a pandas DataFrame, since that is the
# most common format one will probably use. The frame has a single
# column named 'text' with one sentence per row.
sentence_list_df = pd.DataFrame(sentence_list, columns=['text'])
# Binary Vectorizer: each cell is 1 if the token occurs in the
# sentence and 0 otherwise (binary=True clips all non-zero counts to 1).
cvec = CountVectorizer(
    tokenizer=str.split,
    # The default token_pattern is ignored once a tokenizer is supplied;
    # passing None avoids scikit-learn's "will not be used" warning.
    token_pattern=None,
    binary=True,
)

# Learn the vocabulary and build the sparse document-term matrix.
sparse_cvec = cvec.fit_transform(sentence_list_df.text)
# Count Vectorizer: each cell holds the number of times the token
# occurs in the sentence. binary must be False here; with binary=True
# this would be identical to the binary vectorizer and repeated words
# (e.g. "store" twice in the third sentence) would be reported as 1.
cvec_count = CountVectorizer(
    tokenizer=str.split,
    # The default token_pattern is ignored once a tokenizer is supplied;
    # passing None avoids scikit-learn's "will not be used" warning.
    token_pattern=None,
    binary=False,
)

# Learn the vocabulary and build the sparse document-term count matrix.
sparse_cvec_count = cvec_count.fit_transform(sentence_list_df.text)
# TF-IDF Vectorizer: weights each token by its inverse document frequency.
# NOTE(review): binary=True makes the term-frequency part 0/1, so repeated
# words inside a sentence do not increase their weight. Kept as in the
# original; confirm this is intended rather than copied from the binary
# vectorizer.
tfidf = TfidfVectorizer(
    tokenizer=str.split,
    # The default token_pattern is ignored once a tokenizer is supplied;
    # passing None avoids scikit-learn's "will not be used" warning.
    token_pattern=None,
    binary=True,
)

# Learn vocabulary and idf weights, then build the sparse TF-IDF matrix.
sparse_tfidf = tfidf.fit_transform(sentence_list_df.text)
# Vectorizer with Limit: a count vectorizer whose vocabulary is
# restricted with min_df=2, i.e. tokens that appear in fewer than
# two sentences are dropped.
cvec_limit = CountVectorizer(
    tokenizer=str.split,
    # The default token_pattern is ignored once a tokenizer is supplied;
    # passing None avoids scikit-learn's "will not be used" warning.
    token_pattern=None,
    binary=False,
    min_df=2,
)

# Learn the reduced vocabulary and build the sparse count matrix.
sparse_cvec_limit = cvec_limit.fit_transform(sentence_list_df.text)
# Don't forget that any of the sparse matrices above can be
# converted to a dense matrix with .todense()
# - Exemplifying with sparse_cvec_limit
sparse_cvec_limit.todense()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment