Last active
June 21, 2023 12:12
-
-
Save SouravJohar/bcbbad0d0b7e881cd0dca3481e32381f to your computer and use it in GitHub Desktop.
Code for building a spam filtering bot.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cPickle as c | |
import os | |
from sklearn import * | |
from collections import Counter | |
def load(clf_file):
    """Load a pickled classifier from disk.

    Args:
        clf_file: Path to the pickle file holding the classifier.

    Returns:
        The object stored in the pickle file.
    """
    # NOTE: unpickling can execute arbitrary code embedded in the file, so
    # only load model files that come from a trusted source.
    # Pickle data is binary; reading it in text mode can corrupt the stream
    # (newline translation) or fail to decode, hence "rb".
    with open(clf_file, "rb") as fp:
        return c.load(fp)
def make_dict(direc="emails/", vocab_size=3000):
    """Build the vocabulary of the most frequent purely alphabetic words.

    Every file in ``direc`` is read and split on single spaces; tokens for
    which ``isalpha()`` is false are ignored. A countdown of the files left
    to process is printed as progress output.

    Args:
        direc: Directory holding the e-mail files. Defaults to "emails/".
        vocab_size: Maximum number of entries returned. Defaults to 3000.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    paths = [os.path.join(direc, name) for name in os.listdir(direc)]
    counts = Counter()
    remaining = len(paths)
    for path in paths:
        # The context manager closes each file right away instead of leaking
        # one open handle per e-mail.
        with open(path) as f:
            blob = f.read()
        # Splitting on a single space (not arbitrary whitespace) is kept on
        # purpose: tokens with embedded newlines fail isalpha() and are
        # dropped, exactly as before.
        counts.update(word for word in blob.split(" ") if word.isalpha())
        print(remaining)
        remaining -= 1
    return counts.most_common(vocab_size)
# Interactive prediction loop: load the trained model, rebuild the vocabulary
# and classify each line typed by the user until "exit" is entered.
clf = load("text-classifier.mdl")
# NOTE(review): the vocabulary is rebuilt from the e-mail directory here; it
# presumably has to match the one the model was trained with -- confirm.
d = make_dict()

while True:
    inp = raw_input(">").split()
    # A blank line produces an empty token list; prompt again instead of
    # crashing with IndexError on inp[0].
    if not inp:
        continue
    if inp[0] == "exit":
        break
    # One count per vocabulary entry, in vocabulary order.
    features = [inp.count(word[0]) for word in d]
    res = clf.predict([features])
    print(["Not Spam", "Spam!"][res[0]])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from collections import Counter | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.model_selection import train_test_split as tts | |
from sklearn.metrics import accuracy_score | |
import cPickle as c | |
def save(clf, name):
    """Pickle ``clf`` into the file ``name`` and print a confirmation.

    Args:
        clf: The object to serialize.
        name: Destination path; the file is written in binary mode.
    """
    fp = open(name, 'wb')
    try:
        c.dump(clf, fp)
    finally:
        # Always release the handle, even if pickling fails.
        fp.close()
    print("saved")
def make_dict(direc="emails/", vocab_size=3000):
    """Build the vocabulary of the most frequent purely alphabetic words.

    Every file in ``direc`` is read and split on single spaces; tokens for
    which ``isalpha()`` is false are ignored. A countdown of the files left
    to process is printed as progress output.

    Args:
        direc: Directory holding the e-mail files. Defaults to "emails/".
        vocab_size: Maximum number of entries returned. Defaults to 3000.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    paths = [os.path.join(direc, name) for name in os.listdir(direc)]
    counts = Counter()
    remaining = len(paths)
    for path in paths:
        # The context manager closes each file right away instead of leaking
        # one open handle per e-mail.
        with open(path) as f:
            blob = f.read()
        # Splitting on a single space (not arbitrary whitespace) is kept on
        # purpose: tokens with embedded newlines fail isalpha() and are
        # dropped, exactly as before.
        counts.update(word for word in blob.split(" ") if word.isalpha())
        print(remaining)
        remaining -= 1
    return counts.most_common(vocab_size)
def make_dataset(dictionary, direc="emails/"):
    """Turn every labelled e-mail in ``direc`` into a word-count vector.

    The label is taken from the file name: names containing "spam" get
    label 1, names containing "ham" get label 0. Files whose name contains
    neither marker are skipped so that features and labels always stay
    aligned. A countdown of the files left to process is printed.

    Args:
        dictionary: Sequence of ``(word, count)`` entries; only the word of
            each entry is used, and its position fixes the feature index.
        direc: Directory holding the e-mail files. Defaults to "emails/".

    Returns:
        A ``(feature_set, labels)`` tuple of equally long lists, where
        ``feature_set[i]`` holds one count per dictionary entry for the
        e-mail labelled ``labels[i]``.
    """
    feature_set = []
    labels = []
    names = os.listdir(direc)
    remaining = len(names)
    for name in names:
        # Exactly one label per file. Two independent checks could append
        # zero or two labels and silently misalign features and labels.
        # NOTE(review): a name containing both markers is treated as spam --
        # confirm this matches the data set's naming scheme.
        if "spam" in name:
            label = 1
        elif "ham" in name:
            label = 0
        else:
            label = None

        if label is not None:
            with open(os.path.join(direc, name)) as f:
                # Count every token once instead of rescanning the word list
                # for each dictionary entry.
                counts = Counter(f.read().split(' '))
            feature_set.append([counts[entry[0]] for entry in dictionary])
            labels.append(label)

        print(remaining)
        remaining -= 1
    return feature_set, labels
# Training script: build the vocabulary and the count vectors, hold out 20%
# of the e-mails for evaluation, fit a multinomial naive Bayes model, print
# its accuracy on the held-out part and persist it.
d = make_dict()
features, labels = make_dataset(d)
x_train, x_test, y_train, y_test = tts(features, labels, test_size=0.2)

# fit() returns the estimator itself, so construction and training chain.
clf = MultinomialNB().fit(x_train, y_train)

preds = clf.predict(x_test)
print(accuracy_score(y_test, preds))

save(clf, "text-classifier.mdl")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If you get a `UnicodeDecodeError: 'charmap' codec can't decode byte X in position Y` error, check this: https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character