chelseatroy · March 29, 2025 20:49
diff --git a/tfidf_vectorization_with_pandas.py b/tfidf_vectorization_with_pandas.py
 # TF-IDF-encode the text column of a CSV, keep the vectors in the DataFrame,
 # and fit a Multinomial Naive Bayes classifier on them.
 import itertools

 import numpy as np
 import pandas as pd
 from nltk import word_tokenize
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
 from sklearn.naive_bayes import MultinomialNB

 df = pd.read_csv('my_data_with_text.csv')
 df.columns  # expected columns: id, text, category

 texts = np.array(df['text'])  # text contents in dataframe to array for processing

 # Concatenate all the texts and tokenize the whole corpus; the total token
 # count is used as the cap on the number of TF-IDF features.
 # word_tokenize takes one string, so the documents are joined with spaces
 # rather than being flattened into individual characters.
 vocab_length = len(word_tokenize(' '.join(texts)))

 vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=vocab_length)  # make Tfidf Vectorizer
 tfidf_encodings = vectorizer.fit_transform(texts)  # encode the text

 # Vectorized texts to dense format: one array per row, stored in a single column.
 df['tfidf'] = list(tfidf_encodings.toarray())

 # Get the vectors back out of the dataframe (as a 2-D array) for use in something else.
 vectors_for_training = np.array(df['tfidf'].tolist())

 # train_test_split returns the pieces grouped per input array:
 # (X_train, X_test, y_train, y_test) -- not (X_train, y_train, X_test, y_test).
 X_train, X_test, y_train, y_test = train_test_split(
     vectors_for_training, df['category'].tolist()
 )

 nb_classifier = MultinomialNB()
 nb_classifier.fit(X_train, y_train)
 nb_predictions = nb_classifier.predict(df.tfidf.tolist())  # predictions for every row (train and test)

 # DO NOT DO:
 df.to_csv('with_encoding.csv')  # Stores the first and last 3 items in each vector as a string like "[0.0, 0.0, 0.0...0.0, 0.0, 0.0]"
	# TF-IDF-encode the text column of a CSV, keep the vectors in the DataFrame,
	# and fit a Multinomial Naive Bayes classifier on them.
	import itertools

	import numpy as np
	import pandas as pd
	from nltk import word_tokenize
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB

	df = pd.read_csv('my_data_with_text.csv')
	df.columns  # expected columns: id, text, category

	texts = np.array(df['text'])  # text contents in dataframe to array for processing

	# Concatenate all the texts and tokenize the whole corpus; the total token
	# count is used as the cap on the number of TF-IDF features.
	# word_tokenize takes one string, so the documents are joined with spaces
	# rather than being flattened into individual characters.
	vocab_length = len(word_tokenize(' '.join(texts)))

	vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=vocab_length)  # make Tfidf Vectorizer
	tfidf_encodings = vectorizer.fit_transform(texts)  # encode the text

	# Vectorized texts to dense format: one array per row, stored in a single column.
	df['tfidf'] = list(tfidf_encodings.toarray())

	# Get the vectors back out of the dataframe (as a 2-D array) for use in something else.
	vectors_for_training = np.array(df['tfidf'].tolist())

	# train_test_split returns the pieces grouped per input array:
	# (X_train, X_test, y_train, y_test) -- not (X_train, y_train, X_test, y_test).
	X_train, X_test, y_train, y_test = train_test_split(
	    vectors_for_training, df['category'].tolist()
	)

	nb_classifier = MultinomialNB()
	nb_classifier.fit(X_train, y_train)
	nb_predictions = nb_classifier.predict(df.tfidf.tolist())  # predictions for every row (train and test)

	# DO NOT DO:
	df.to_csv('with_encoding.csv')  # Stores the first and last 3 items in each vector as a string like "[0.0, 0.0, 0.0...0.0, 0.0, 0.0]"