-
-
Save SouravJohar/bcbbad0d0b7e881cd0dca3481e32381f to your computer and use it in GitHub Desktop.
import cPickle as c | |
import os | |
from sklearn import * | |
from collections import Counter | |
def load(clf_file):
    """Load and return the pickled classifier stored in ``clf_file``.

    The file is opened in binary mode because pickle data is a byte
    stream; reading it through a text codec fails on bytes the codec
    cannot map (e.g. ``UnicodeDecodeError: 'charmap' codec can't decode
    byte 0x81``).

    NOTE: unpickling can execute arbitrary code. Only load model files
    that come from a trusted source.
    """
    with open(clf_file, "rb") as fp:
        return c.load(fp)
def make_dict(direc="emails/", vocab_size=3000):
    """Build the vocabulary of the most frequent purely alphabetic words.

    Every file in ``direc`` is read and split on single spaces; tokens
    that are not purely alphabetic are discarded before counting.

    Args:
        direc: Directory holding the e-mail files.
        vocab_size: Maximum number of vocabulary entries to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, as
        produced by ``Counter.most_common``.
    """
    emails = [os.path.join(direc, name) for name in os.listdir(direc)]
    words = []
    # Countdown of files still to process, printed as a progress indicator.
    remaining = len(emails)
    for email in emails:
        # The context manager closes each handle as soon as it is read,
        # so a large corpus cannot exhaust file descriptors.
        with open(email) as f:
            words += f.read().split(" ")
        print(remaining)
        remaining -= 1
    # Map every non-alphabetic token to "" and then drop that single
    # bucket, so it can never appear in the vocabulary.
    dictionary = Counter(w if w.isalpha() else "" for w in words)
    del dictionary[""]
    return dictionary.most_common(vocab_size)
# Interactive spam check: load the trained model, rebuild the vocabulary,
# then classify each line typed at the prompt until the user enters "exit".
clf = load("text-classifier.mdl")
d = make_dict()
while True:
    inp = raw_input(">").split()
    # A blank line yields no tokens; prompt again instead of raising
    # IndexError on inp[0].
    if not inp:
        continue
    if inp[0] == "exit":
        break
    # One count per vocabulary word, in vocabulary order, so each feature
    # column lines up with the corresponding entry of `d`.
    features = [inp.count(entry[0]) for entry in d]
    res = clf.predict([features])
    print(["Not Spam", "Spam!"][res[0]])
import os | |
from collections import Counter | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.model_selection import train_test_split as tts | |
from sklearn.metrics import accuracy_score | |
import cPickle as c | |
def save(clf, name):
    """Pickle ``clf`` into the file ``name`` and print a confirmation.

    Args:
        clf: Object to serialise (the trained classifier).
        name: Path of the output file; it is opened in binary write mode.
    """
    with open(name, 'wb') as model_file:
        c.dump(clf, model_file)
    print("saved")
def make_dict(direc="emails/", vocab_size=3000):
    """Build the vocabulary of the most frequent purely alphabetic words.

    Every file in ``direc`` is read and split on single spaces; tokens
    that are not purely alphabetic are discarded before counting.

    Args:
        direc: Directory holding the e-mail files.
        vocab_size: Maximum number of vocabulary entries to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, as
        produced by ``Counter.most_common``.
    """
    emails = [os.path.join(direc, name) for name in os.listdir(direc)]
    words = []
    # Countdown of files still to process, printed as a progress indicator.
    remaining = len(emails)
    for email in emails:
        # The context manager closes each handle as soon as it is read,
        # so a large corpus cannot exhaust file descriptors.
        with open(email) as f:
            words += f.read().split(" ")
        print(remaining)
        remaining -= 1
    # Map every non-alphabetic token to "" and then drop that single
    # bucket, so it can never appear in the vocabulary.
    dictionary = Counter(w if w.isalpha() else "" for w in words)
    del dictionary[""]
    return dictionary.most_common(vocab_size)
def make_dataset(dictionary, direc="emails/"):
    """Turn every labelled e-mail in ``direc`` into a word-count vector.

    Args:
        dictionary: Sequence of ``(word, count)`` pairs. Only the word is
            used; its position fixes the feature column.
        direc: Directory holding the e-mail files.

    Returns:
        ``(feature_set, labels)``: two lists of equal length. Each feature
        row holds, per vocabulary word, how often it occurs in the e-mail
        (split on single spaces). The label is 1 when the file name
        contains "spam" and 0 when it contains "ham".
    """
    vocabulary = [entry[0] for entry in dictionary]
    files = os.listdir(direc)
    feature_set = []
    labels = []
    # Countdown of files still to process, printed as a progress indicator.
    remaining = len(files)
    for name in files:
        # Decide the label from the file name alone, so the directory part
        # of the path cannot influence it. Exactly one label is chosen per
        # file ("spam" wins if both markers occur), and files with neither
        # marker are skipped, keeping feature_set and labels the same length.
        if "spam" in name:
            label = 1
        elif "ham" in name:
            label = 0
        else:
            label = None
        if label is not None:
            with open(os.path.join(direc, name)) as f:
                # Count all tokens once instead of rescanning the e-mail
                # for every vocabulary word.
                counts = Counter(f.read().split(' '))
            feature_set.append([counts[word] for word in vocabulary])
            labels.append(label)
        print(remaining)
        remaining -= 1
    return feature_set, labels
# Training pipeline: build the vocabulary, vectorise the e-mails, hold out
# 20% of the samples for evaluation, fit a multinomial naive Bayes model,
# print its accuracy on the held-out part and persist the fitted model.
d = make_dict()
features, labels = make_dataset(d)
x_train, x_test, y_train, y_test = tts(features, labels, test_size=0.2)
clf = MultinomialNB().fit(x_train, y_train)
preds = clf.predict(x_test)
print(accuracy_score(y_test, preds))
save(clf, "text-classifier.mdl")
I am getting this error:
  File "G:\ML proj\Email spam classification\detector.py", line 42, in <module>
    clf = load("text_classifier.mdl")
  File "G:\ML proj\Email spam classification\detector.py", line 19, in load
    clf = c.load(fp)
  File "C:\Users\Kalaivani\anaconda3\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 52: character maps to <undefined>
How can I fix this error?
Check this out: https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character
I am getting this error:
  File "G:\ML proj\Email spam classification\detector.py", line 42, in <module>
    clf = load("text_classifier.mdl")
  File "G:\ML proj\Email spam classification\detector.py", line 19, in load
    clf = c.load(fp)
  File "C:\Users\Kalaivani\anaconda3\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 52: character maps to <undefined>
How can I fix this error?
Check this: https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character
I am getting this error:
  File "G:\ML proj\Email spam classification\detector.py", line 42, in <module>
    clf = load("text_classifier.mdl")
  File "G:\ML proj\Email spam classification\detector.py", line 19, in load
    clf = c.load(fp)
  File "C:\Users\Kalaivani\anaconda3\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 52: character maps to <undefined>
How can I fix this error?