Skip to content

Instantly share code, notes, and snippets.

@SouravJohar
Last active June 21, 2023 12:12
Show Gist options
  • Save SouravJohar/bcbbad0d0b7e881cd0dca3481e32381f to your computer and use it in GitHub Desktop.
Save SouravJohar/bcbbad0d0b7e881cd0dca3481e32381f to your computer and use it in GitHub Desktop.
Code for building a spam filtering bot.
import cPickle as c
import os
from sklearn import *
from collections import Counter
def load(clf_file):
    """Load and return the pickled classifier stored in ``clf_file``.

    clf_file -- path of the file that holds the pickled object.

    The file is opened in binary mode because pickle data is a byte stream.
    Reading it in text mode lets the platform decode/translate the bytes,
    which is what produces "'charmap' codec can't decode byte ..." errors.

    NOTE(security): unpickling can execute arbitrary code, so only load model
    files that come from a trusted source.
    """
    with open(clf_file, 'rb') as fp:
        return c.load(fp)
def make_dict():
    """Build the vocabulary from every file in the ``emails/`` directory.

    Each file is split on single spaces, and tokens that are not purely
    alphabetic (``str.isalpha()`` is false, which includes the empty string)
    are discarded.  A countdown of the files still to be read is printed as
    progress output.

    Returns a list of ``(word, count)`` tuples for the 3000 most frequent
    remaining words, most frequent first.
    """
    direc = "emails/"
    emails = [direc + name for name in os.listdir(direc)]

    dictionary = Counter()
    # Named "remaining" rather than "c" so it cannot be confused with the
    # module-level pickle alias.
    remaining = len(emails)
    for email in emails:
        # "with" closes each file as soon as it has been read instead of
        # leaking one open handle per email.
        with open(email) as f:
            blob = f.read()
        # Splitting on " " (not on all whitespace) is kept on purpose: it
        # decides which tokens count as words, and therefore the feature
        # layout that a trained model presumably relies on.
        dictionary.update(word for word in blob.split(" ") if word.isalpha())
        print(remaining)
        remaining -= 1
    return dictionary.most_common(3000)
# Interactive spam detector: load the trained model, rebuild the vocabulary,
# then classify each line typed at the ">" prompt until the user types "exit".
clf = load("text-classifier.mdl")
d = make_dict()

while True:
    inp = raw_input(">").split()
    # A blank line splits into an empty list; indexing inp[0] on it would
    # raise IndexError, so simply prompt again.
    if not inp:
        continue
    if inp[0] == "exit":
        break
    # One count per vocabulary entry, in the order returned by make_dict()
    # (presumably the same order the model was trained with).
    features = [inp.count(word[0]) for word in d]
    res = clf.predict([features])
    # The predicted label is used as an index: 0 -> "Not Spam", 1 -> "Spam!".
    print(["Not Spam", "Spam!"][res[0]])
import os
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score
import cPickle as c
def save(clf, name):
    """Pickle ``clf`` into the file ``name`` and print "saved".

    clf  -- the object to serialise (the trained classifier in this script).
    name -- path of the output file; it is opened in binary write mode, so
            any existing content is replaced.
    """
    with open(name, 'wb') as sink:
        c.dump(clf, sink)
    print("saved")
def make_dict():
    """Build the vocabulary from every file in the ``emails/`` directory.

    Each file is split on single spaces, and tokens that are not purely
    alphabetic (``str.isalpha()`` is false, which includes the empty string)
    are discarded.  A countdown of the files still to be read is printed as
    progress output.

    Returns a list of ``(word, count)`` tuples for the 3000 most frequent
    remaining words, most frequent first.
    """
    direc = "emails/"
    emails = [direc + name for name in os.listdir(direc)]

    dictionary = Counter()
    # Named "remaining" rather than "c" so it cannot be confused with the
    # module-level pickle alias.
    remaining = len(emails)
    for email in emails:
        # "with" closes each file as soon as it has been read instead of
        # leaking one open handle per email.
        with open(email) as f:
            blob = f.read()
        # Splitting on " " (not on all whitespace) is kept on purpose: it
        # decides which tokens count as words, and make_dataset() tokenises
        # the emails the same way.
        dictionary.update(word for word in blob.split(" ") if word.isalpha())
        print(remaining)
        remaining -= 1
    return dictionary.most_common(3000)
def make_dataset(dictionary):
    """Turn every labelled file in ``emails/`` into a word-count vector.

    dictionary -- sequence of entries whose first item is a word, e.g. the
                  ``(word, count)`` tuples returned by make_dict().

    Returns ``(feature_set, labels)``.  ``feature_set[i]`` lists, for each
    dictionary entry in order, how many times that word occurs in the i-th
    labelled email (split on single spaces).  ``labels[i]`` is 0 when the
    file path contains "ham" and 1 when it contains "spam".

    Files whose path contains neither marker are skipped, and a path that
    contains both is labelled once (as ham, the first check), so that
    feature_set and labels always have the same length.  A countdown of the
    files still to be processed is printed as progress output.
    """
    direc = "emails/"
    emails = [direc + name for name in os.listdir(direc)]
    feature_set = []
    labels = []
    remaining = len(emails)
    for email in emails:
        # Decide the label first: a feature row is only added when exactly
        # one label goes with it, otherwise rows and labels drift apart.
        if "ham" in email:
            label = 0
        elif "spam" in email:
            label = 1
        else:
            label = None

        if label is not None:
            # "with" closes the file promptly instead of leaking the handle.
            with open(email) as f:
                # Count every token once, rather than rescanning the whole
                # word list for each dictionary entry.
                counts = Counter(f.read().split(' '))
            # Counter returns 0 for words that do not occur in this email.
            feature_set.append([counts[entry[0]] for entry in dictionary])
            labels.append(label)

        print(remaining)
        remaining -= 1
    return feature_set, labels
# Training script: build the vocabulary, then turn each email into a
# word-count vector over that vocabulary.
d = make_dict()
features, labels = make_dataset(d)

# Hold back 20% of the samples so accuracy is measured on unseen emails.
x_train, x_test, y_train, y_test = tts(features, labels, test_size=0.2)

# fit() returns the estimator itself, so construction and training chain.
clf = MultinomialNB().fit(x_train, y_train)

preds = clf.predict(x_test)
print(accuracy_score(y_test, preds))

# Persist the trained model under the file name the detector script loads.
save(clf, "text-classifier.mdl")
@Utkarsha-Kumbhar
Copy link

I am getting this error:

File "G:\ML proj\Email spam classification\detector.py", line 42, in <module>
clf = load("text_classifier.mdl")

File "G:\ML proj\Email spam classification\detector.py", line 19, in load
clf = c.load(fp)

File "C:\Users\Kalaivani\anaconda3\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 52: character maps to <undefined>

How can I fix this error?

Check this: https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment