Skip to content

Instantly share code, notes, and snippets.

@kyoto-cheng
Last active June 25, 2021 16:29
Show Gist options
  • Save kyoto-cheng/e66db047bfa8bacbe7d371a7d6d7bd9a to your computer and use it in GitHub Desktop.
Save kyoto-cheng/e66db047bfa8bacbe7d371a7d6d7bd9a to your computer and use it in GitHub Desktop.
import re
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# split (and everything derived from it) differs from run to run.
# NOTE(review): `df` is defined outside this section; presumably a pandas
# DataFrame with "Questions" and "Category" columns -- confirm.
df_train, df_test = model_selection.train_test_split(df, test_size=0.2)

# Target labels for each side of the split, taken from the "Category" column.
y_train = df_train["Category"].values
y_test = df_test["Category"].values

# Bag-of-words counts over unigrams and bigrams, vocabulary capped at 300 terms.
vectorizer = CountVectorizer(max_features=300, ngram_range=(1, 2))

# TF-IDF vectorizer with the same settings. It is only constructed here; it is
# not fitted anywhere in this section.
tfidf_vectorizer = TfidfVectorizer(max_features=300, ngram_range=(1, 2))

# Learn the vocabulary from the training questions only and vectorize them in
# a single pass (equivalent to fit() followed by transform() on the same data,
# without tokenizing the corpus twice).
corpus = df_train["Questions"]
X_train = vectorizer.fit_transform(corpus)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment