Created
May 16, 2020 10:18
-
-
Save fclesio/ae0857a4418a5972a2b6c16297a9a575 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Example script: build a small DataFrame of German job descriptions, compute
# TF-IDF features for the 'text' column, and load the resulting matrix into a
# new DataFrame that also carries one extra constant-valued column.

# Generate DF
df = pd.DataFrame({
    'jobId': [1, 2, 3, 4, 5],
    'serviceId': [99, 88, 77, 66, 55],
    'text': [
        'Ich hätte gerne ein Bild an meiner Wand.',
        'Ich will ein Bild auf meinem Auto.',
        'Ich brauche ein Bild auf meinem Auto.',
        'Ich brauche einen Rasenmäher für meinen Garten.',
        'Ich brauche einen Maler, der mein Haus streicht.',
    ],
})

# Show DF
print(df)
# jobId serviceId text
#0 1 99 Ich hätte gerne ein Bild an meiner Wand.
#1 2 88 Ich will ein Bild auf meinem Auto.
#2 3 77 Ich brauche ein Bild auf meinem Auto.
#3 4 66 Ich brauche einen Rasenmäher für meinen Garten.
#4 5 55 Ich brauche einen Maler, der mein Haus streicht.

# Vectorizer to convert a collection of raw documents to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()

# Learn vocabulary and idf, return term-document matrix.
# The column values are cast to a unicode string array before vectorizing.
tfidf = vectorizer.fit_transform(df['text'].values.astype('U'))

# Check TF-IDF sparse matrix
# NOTE: a bare expression is only displayed in an interactive session
# (notebook/REPL); when run as a script this line prints nothing.
tfidf
# <5x23 sparse matrix of type '<class 'numpy.float64'>'
# with 37 stored elements in Compressed Sparse Row format>

# Now we can convert that to array
# NOTE: as above, the result is only shown in an interactive session.
tfidf.toarray()
# array([[0.40409121, 0. , 0. , 0.27062459, 0. ,
# 0. , 0.27062459, 0. , 0. , 0. ,
# 0.40409121, 0. , 0.40409121, 0.19255163, 0. ,
# 0. , 0. , 0. , 0.40409121, 0. ,
# 0. , 0.40409121, 0. ],
# [0. , 0.39957751, 0.39957751, 0.33168543, 0. ,
# 0. , 0.33168543, 0. , 0. , 0. ,
# 0. , 0. , 0. , 0.23599692, 0. ,
# 0. , 0.39957751, 0. , 0. , 0. ,
# 0. , 0. , 0.49526603],
# [0. , 0.42969627, 0.42969627, 0.35668672, 0.35668672,
# 0. , 0.35668672, 0. , 0. , 0. ,
# 0. , 0. , 0. , 0.25378554, 0. ,
# 0. , 0.42969627, 0. , 0. , 0. ,
# 0. , 0. , 0. ],
# [0. , 0. , 0. , 0. , 0.29017996,
# 0. , 0. , 0.34957636, 0.43329089, 0.43329089,
# 0. , 0. , 0. , 0.20646543, 0. ,
# 0. , 0. , 0.43329089, 0. , 0.43329089,
# 0. , 0. , 0. ],
# [0. , 0. , 0. , 0. , 0.26626038,
# 0.39757465, 0. , 0.32076072, 0. , 0. ,
# 0. , 0.39757465, 0. , 0.18944645, 0.39757465,
# 0.39757465, 0. , 0. , 0. , 0. ,
# 0.39757465, 0. , 0. ]])

# Generate a pandas DF from the dense TF-IDF array and include a new column.
# The same string value is assigned to every row of 'new_column'.
df_tf_idf = pd.DataFrame(tfidf.toarray())
df_tf_idf['new_column'] = 'brasil_patria_educadora'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment