Text Classification using sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Tags currently used for labelling the sentences
my_tags = ['Social','Economic','Political','Health','Environmental','Ministry','Aim/Objective/Goal','Rural Development','Scheme','Proposed','Brief','Organization','Remedy','Defence','Foreign Relations','Science & Technology','Location','Facts','Space Science','Cultural','Future','International']

# Read the file containing our data
df = pd.read_csv("out1.csv")

# Use only the rows for which tags are available, then split into training and test sets
X = df['Text'][0:65]
y = df['Class'][0:65]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build a pipeline: create count vectors, apply the TF-IDF transform, then fit the classifier
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

#print('accuracy %s' % accuracy_score(y_pred, y_test))
#print(classification_report(y_test, y_pred, target_names=my_tags))
print(X_test, y_pred, y_test)
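The commented-out lines above hint at evaluation. As a minimal usage sketch, assuming the pipeline has been fit as shown, the held-out split can be scored and new sentences classified directly; the example sentences below are hypothetical placeholders, not rows from out1.csv.

# Sketch: score the held-out split and classify new text with the fitted pipeline
print('accuracy %s' % accuracy_score(y_test, y_pred))  # overall test accuracy
print(confusion_matrix(y_test, y_pred))                # per-class error breakdown

# Pipeline.predict() accepts any iterable of raw strings, so unseen
# sentences can be tagged without separate vectorization steps
new_sentences = ["The ministry announced a new rural development scheme.",
                 "The satellite launch marks progress in space science."]
print(nb.predict(new_sentences))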