Text Classification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Tags currently used for labelling the sentences
my_tags = ['Social','Economic','Political','Health','Environmental','Ministry','Aim/Objective/Goal','Rural Development','Scheme','Proposed','Brief','Organization','Remedy','Defence','Foreign Relations','Science & Technology','Location','Facts','Space Science','Cultural','Future','International']

# Read the file that holds our data
df = pd.read_csv("out1.csv")

# Use only the rows for which tags are available and split them into training and testing sets
X = df['Text'][0:65]
y = df['Class'][0:65]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build a pipeline: first create the count vectors, then apply the tf-idf transform, and finally fit the classifier
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
               ])
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

#print('accuracy %s' % accuracy_score(y_test, y_pred))
#print(classification_report(y_test, y_pred, target_names=my_tags))  # target_names must match the classes actually present
print(X_test, y_pred, y_test)
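
A minimal evaluation sketch (assuming the nb pipeline and the train/test split defined above): classification_report expects its label names to line up with the classes that actually occur in the evaluation split, so here the label list is derived from the data rather than taken from my_tags.

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # already imported above

# Classes actually present in this split (y_test is a Series, y_pred a numpy array)
labels = sorted(set(y_test) | set(y_pred))

print('accuracy %.3f' % accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred, labels=labels))
print(classification_report(y_test, y_pred, labels=labels, target_names=labels))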