Last active
November 28, 2020 11:05
-
-
Save debonx/76d9631e3fd28354278cf2e1692ca45d to your computer and use it in GitHub Desktop.
Naive Bayes classifier in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train a Multinomial Naive Bayes classifier to tell two newsgroup
# categories apart and print its accuracy on the held-out test split.

# Import datasets and libraries.
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Category selection: compare different sets of emails to evaluate how hard
# it is to distinguish between them.
the_categories = ['comp.sys.ibm.pc.hardware', 'rec.sport.hockey']

# The same random_state is used for both subsets so runs are reproducible.
train_emails = fetch_20newsgroups(
    categories=the_categories,
    subset='train',
    shuffle=True,
    random_state=108,
)
test_emails = fetch_20newsgroups(
    categories=the_categories,
    subset='test',
    shuffle=True,
    random_state=108,
)

# Build the vocabulary from the TRAINING emails only. Fitting on
# test + train would leak information about the test set into the model
# (the vocabulary size feeds into Naive Bayes smoothing), which makes the
# reported score an unreliable estimate of performance on unseen data.
counter = CountVectorizer()
counter.fit(train_emails.data)

# Turn both splits into word-count matrices over the training vocabulary.
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

# Init the classifier and fit it with the training counts and labels.
classifier = MultinomialNB()
classifier.fit(train_counts, train_emails.target)

# Calculate the score on the test dataset and report it.
the_score = classifier.score(test_counts, test_emails.target)
print(the_score)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment