Last active
June 25, 2021 16:29
-
-
Save kyoto-cheng/e66db047bfa8bacbe7d371a7d6d7bd9a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
# Split the dataset: 20% of the rows are held out as the test partition.
# `df` is not defined in this snippet; it must already exist and expose the
# "Category" and "Questions" columns read below.
# NOTE(review): no random_state is passed, so the split differs between runs.
df_train, df_test = model_selection.train_test_split(df, test_size=0.2)

# Target labels for each partition, taken from the "Category" column.
y_train = df_train["Category"].values
y_test = df_test["Category"].values

# Count-based vectorizer for modeling: unigrams and bigrams, at most 300 features.
vectorizer = feature_extraction.text.CountVectorizer(max_features=300, ngram_range=(1, 2))

# Tf-idf vectorizer with the same settings. It is only constructed here;
# nothing in this snippet fits it.
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(max_features=300, ngram_range=(1, 2))

# Learn the vocabulary from the training questions only (never from the test
# partition) and build the training document-term matrix in a single pass.
corpus = df_train["Questions"]
X_train = vectorizer.fit_transform(corpus)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment