Last active
March 29, 2025 20:49
-
-
Save chelseatroy/e9b37107a65d155a0a1289a91a8b67d2 to your computer and use it in GitHub Desktop.
Tf-Idf Vectorization with Pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools

import numpy as np
import pandas as pd
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Tf-idf vectorization of a text column held in a pandas DataFrame, followed
# by a Naive Bayes classifier trained on the resulting vectors.

# Load the labelled corpus.
df = pd.read_csv('my_data_with_text.csv')
df.columns  # id, text, category (only displays anything in a notebook/REPL)

# Text contents of the dataframe as an array for processing.
texts = np.array(df['text'])

# Size of the corpus vocabulary: tokenize each document separately, chain the
# token lists together and count the distinct tokens. (Chaining the raw
# strings would iterate over single characters, and word_tokenize expects one
# string, not a list.)
# NOTE(review): word_tokenize needs NLTK's tokenizer data to be downloaded
# beforehand -- confirm the environment provides it.
vocab_length = len(
    set(itertools.chain.from_iterable(word_tokenize(text) for text in texts))
)

# Tf-idf vectorizer over uni-, bi- and trigrams, capped at vocab_length
# features.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=vocab_length)
tfidf_encodings = vectorizer.fit_transform(texts)  # encode the text

# Vectorized texts to dense list format for storage in the dataframe
# (one 1-D array per row).
df['tfidf'] = list(tfidf_encodings.toarray())

# Get the vectors back out of the dataframe as a single 2-D array for use in
# something else.
vectors_for_training = np.array(df['tfidf'].tolist())

# train_test_split returns the splits grouped per input array:
# (X_train, X_test, y_train, y_test) -- unpack in exactly that order.
X_train, X_test, y_train, y_test = train_test_split(
    vectors_for_training, df['category'].tolist()
)

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predictions for every row of the dataframe. This includes the rows the
# classifier was trained on; use X_test / y_test for an honest evaluation.
nb_predictions = nb_classifier.predict(vectors_for_training)

# DO NOT DO:
# Writing the dataframe to CSV stores only the first and last 3 items of each
# vector, as a string like "[0.0, 0.0, 0.0...0.0, 0.0, 0.0]", so the
# encodings cannot be recovered from the file.
df.to_csv('with_encoding.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment