fclesio · May 16, 2020 10:18
diff --git a/include-new-column-in-tfidf-matrix.py b/include-new-column-in-tfidf-matrix.py
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer

 # Build the demo frame row by row: (jobId, serviceId, text), where each
 # text is a short German free-text request.
 df = pd.DataFrame(
     [
         (1, 99, 'Ich hätte gerne ein Bild an meiner Wand.'),
         (2, 88, 'Ich will ein Bild auf meinem Auto.'),
         (3, 77, 'Ich brauche ein Bild auf meinem Auto.'),
         (4, 66, 'Ich brauche einen Rasenmäher für meinen Garten.'),
         (5, 55, 'Ich brauche einen Maler, der mein Haus streicht.'),
     ],
     columns=['jobId', 'serviceId', 'text'],
 )

 # Display the frame; the recorded output follows.
 print(df)

 #   jobId  serviceId                                              text
 #0      1         99          Ich hätte gerne ein Bild an meiner Wand.
 #1      2         88                Ich will ein Bild auf meinem Auto.
 #2      3         77             Ich brauche ein Bild auf meinem Auto.
 #3      4         66   Ich brauche einen Rasenmäher für meinen Garten.
 #4      5         55  Ich brauche einen Maler, der mein Haus streicht.

 # TF-IDF vectorizer with default settings: turns a collection of raw
 # documents into a matrix of TF-IDF features.
 vectorizer = TfidfVectorizer()

 # Fit the vocabulary and idf weights on the 'text' column (cast to NumPy
 # unicode strings first) and get back the term-document matrix.
 tfidf = vectorizer.fit_transform(df['text'].values.astype('U'))

 # Inspect the TF-IDF sparse matrix. As a bare expression this only echoes
 # a value in an interactive session (REPL / notebook cell).
 tfidf
 # <5x23 sparse matrix of type '<class 'numpy.float64'>'
 # 	with 37 stored elements in Compressed Sparse Row format>

 # Dense view of the same matrix (again, only echoed interactively).
 tfidf.toarray()
 # array([[0.40409121, 0.        , 0.        , 0.27062459, 0.        ,
 #         0.        , 0.27062459, 0.        , 0.        , 0.        ,
 #         0.40409121, 0.        , 0.40409121, 0.19255163, 0.        ,
 #         0.        , 0.        , 0.        , 0.40409121, 0.        ,
 #         0.        , 0.40409121, 0.        ],
 #        [0.        , 0.39957751, 0.39957751, 0.33168543, 0.        ,
 #         0.        , 0.33168543, 0.        , 0.        , 0.        ,
 #         0.        , 0.        , 0.        , 0.23599692, 0.        ,
 #         0.        , 0.39957751, 0.        , 0.        , 0.        ,
 #         0.        , 0.        , 0.49526603],
 #        [0.        , 0.42969627, 0.42969627, 0.35668672, 0.35668672,
 #         0.        , 0.35668672, 0.        , 0.        , 0.        ,
 #         0.        , 0.        , 0.        , 0.25378554, 0.        ,
 #         0.        , 0.42969627, 0.        , 0.        , 0.        ,
 #         0.        , 0.        , 0.        ],
 #        [0.        , 0.        , 0.        , 0.        , 0.29017996,
 #         0.        , 0.        , 0.34957636, 0.43329089, 0.43329089,
 #         0.        , 0.        , 0.        , 0.20646543, 0.        ,
 #         0.        , 0.        , 0.43329089, 0.        , 0.43329089,
 #         0.        , 0.        , 0.        ],
 #        [0.        , 0.        , 0.        , 0.        , 0.26626038,
 #         0.39757465, 0.        , 0.32076072, 0.        , 0.        ,
 #         0.        , 0.39757465, 0.        , 0.18944645, 0.39757465,
 #         0.39757465, 0.        , 0.        , 0.        , 0.        ,
 #         0.39757465, 0.        , 0.        ]])


 # Wrap the dense matrix in a pandas DataFrame (default integer column
 # labels) and append one constant-valued column named 'new_column'.
 df_tf_idf = pd.DataFrame(tfidf.toarray()).assign(
     new_column='brasil_patria_educadora',
 )
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Build the demo frame row by row: (jobId, serviceId, text), where each
	# text is a short German free-text request.
	df = pd.DataFrame(
		[
			(1, 99, 'Ich hätte gerne ein Bild an meiner Wand.'),
			(2, 88, 'Ich will ein Bild auf meinem Auto.'),
			(3, 77, 'Ich brauche ein Bild auf meinem Auto.'),
			(4, 66, 'Ich brauche einen Rasenmäher für meinen Garten.'),
			(5, 55, 'Ich brauche einen Maler, der mein Haus streicht.'),
		],
		columns=['jobId', 'serviceId', 'text'],
	)

	# Display the frame; the recorded output follows.
	print(df)

	# jobId serviceId text
	#0 1 99 Ich hätte gerne ein Bild an meiner Wand.
	#1 2 88 Ich will ein Bild auf meinem Auto.
	#2 3 77 Ich brauche ein Bild auf meinem Auto.
	#3 4 66 Ich brauche einen Rasenmäher für meinen Garten.
	#4 5 55 Ich brauche einen Maler, der mein Haus streicht.

	# TF-IDF vectorizer with default settings: turns a collection of raw
	# documents into a matrix of TF-IDF features.
	vectorizer = TfidfVectorizer()

	# Fit the vocabulary and idf weights on the 'text' column (cast to NumPy
	# unicode strings first) and get back the term-document matrix.
	tfidf = vectorizer.fit_transform(df['text'].values.astype('U'))

	# Inspect the TF-IDF sparse matrix. As a bare expression this only echoes
	# a value in an interactive session (REPL / notebook cell).
	tfidf
	# <5x23 sparse matrix of type '<class 'numpy.float64'>'
	# with 37 stored elements in Compressed Sparse Row format>

	# Dense view of the same matrix (again, only echoed interactively).
	tfidf.toarray()
	# array([[0.40409121, 0. , 0. , 0.27062459, 0. ,
	# 0. , 0.27062459, 0. , 0. , 0. ,
	# 0.40409121, 0. , 0.40409121, 0.19255163, 0. ,
	# 0. , 0. , 0. , 0.40409121, 0. ,
	# 0. , 0.40409121, 0. ],
	# [0. , 0.39957751, 0.39957751, 0.33168543, 0. ,
	# 0. , 0.33168543, 0. , 0. , 0. ,
	# 0. , 0. , 0. , 0.23599692, 0. ,
	# 0. , 0.39957751, 0. , 0. , 0. ,
	# 0. , 0. , 0.49526603],
	# [0. , 0.42969627, 0.42969627, 0.35668672, 0.35668672,
	# 0. , 0.35668672, 0. , 0. , 0. ,
	# 0. , 0. , 0. , 0.25378554, 0. ,
	# 0. , 0.42969627, 0. , 0. , 0. ,
	# 0. , 0. , 0. ],
	# [0. , 0. , 0. , 0. , 0.29017996,
	# 0. , 0. , 0.34957636, 0.43329089, 0.43329089,
	# 0. , 0. , 0. , 0.20646543, 0. ,
	# 0. , 0. , 0.43329089, 0. , 0.43329089,
	# 0. , 0. , 0. ],
	# [0. , 0. , 0. , 0. , 0.26626038,
	# 0.39757465, 0. , 0.32076072, 0. , 0. ,
	# 0. , 0.39757465, 0. , 0.18944645, 0.39757465,
	# 0.39757465, 0. , 0. , 0. , 0. ,
	# 0.39757465, 0. , 0. ]])


	# Wrap the dense matrix in a pandas DataFrame (default integer column
	# labels) and append one constant-valued column named 'new_column'.
	df_tf_idf = pd.DataFrame(tfidf.toarray()).assign(
		new_column='brasil_patria_educadora',
	)