Skip to content

Instantly share code, notes, and snippets.

@accessnash
Created June 28, 2020 14:49
Show Gist options
  • Select an option

  • Save accessnash/e0ac74aa5dfc31f39cdea09e5b059c07 to your computer and use it in GitHub Desktop.

Select an option

Save accessnash/e0ac74aa5dfc31f39cdea09e5b059c07 to your computer and use it in GitHub Desktop.
Email spam filter using NLP
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 28 11:19:47 2020
@author: User
"""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
# Load the labelled e-mail corpus. The rest of the script reads its 'text'
# column (message body) and 'spam' column (1 = spam, 0 = ham).
spamfilter_df = pd.read_csv('emails.csv')
# A bare expression is discarded when the file runs as a script, so the
# summary statistics are printed explicitly.
print(spamfilter_df.describe())

# Split the frame by class so the class balance can be reported.
ham = spamfilter_df[spamfilter_df['spam'] == 0]
spam = spamfilter_df[spamfilter_df['spam'] == 1]
total_emails = len(spamfilter_df)
print('Spam Percentage =', (len(spam)/total_emails)*100, '%')
print('Ham Percentage =', (len(ham)/total_emails)*100, '%')

# Bar chart of the class counts. The column is passed by keyword because
# recent seaborn releases accept `x` as keyword-only (a positional Series
# would be interpreted as `data`). The caption is set as the axes title
# instead of a `label=` that no legend ever displayed.
class_balance_ax = sns.countplot(x='spam', data=spamfilter_df)
class_balance_ax.set_title('Count Spam vs Ham')
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: learn the vocabulary from every e-mail and turn each
# message into a row of token counts.
vectorizer = CountVectorizer()
spamham_countvectorizer = vectorizer.fit_transform(spamfilter_df['text'])

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement and returns the learned vocabulary as an array.
print(vectorizer.get_feature_names_out())

# Densify only the first few rows for the preview. Converting the whole
# matrix (the author observed a shape of (5728, 37303)) would allocate well
# over a gigabyte just to print a truncated array.
print(spamham_countvectorizer[:5].toarray())

# (number of e-mails, number of unique tokens); printed because a bare
# expression produces no output in a script.
print(spamham_countvectorizer.shape)
from sklearn.naive_bayes import MultinomialNB

# Target vector: 1 for spam, 0 for ham.
label = spamfilter_df['spam'].values

# Fit a multinomial Naive Bayes model on the complete corpus. This model is
# used only for the quick sanity check below.
nb_classifier = MultinomialNB()
nb_classifier.fit(spamham_countvectorizer, label)

# Two hand-written messages, encoded with the already-fitted vocabulary
# (transform, not fit_transform, so the columns line up with the training
# matrix).
testing_sample = ['Wizard of Stock trading tips!!!', 'Hi Avi, Please let me know if we could catch up next weekend']
testing_sample_countvectorizer = vectorizer.transform(testing_sample)
test_predict = nb_classifier.predict(testing_sample_countvectorizer)

# Show the outcome; previously the predictions were computed and then
# silently dropped.
for sample_text, predicted_label in zip(testing_sample, test_predict):
    print('spam' if predicted_label == 1 else 'ham', '<-', sample_text)
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Feature matrix and target used for the held-out evaluation.
# NOTE(review): the vocabulary behind X was fitted on the full corpus before
# this split, so tokens that only occur in test e-mails already have columns.
# Fitting the vectorizer on the training texts only (e.g. via a Pipeline)
# would give a cleaner estimate.
X = spamham_countvectorizer
y = label

# 80/20 split. A fixed seed makes the reported metrics reproducible, and
# stratifying keeps the spam/ham ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Re-fit a fresh classifier on the training partition only (MultinomialNB is
# already imported earlier in the script).
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Confusion matrix on the training data. Each heatmap gets its own figure so
# the plots no longer pile up on one shared axes, and fmt='d' prints the
# counts as integers instead of the default '.2g' (e.g. 3.5e+03).
y_predict_train = nb_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
_, train_ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=train_ax)
train_ax.set_title('Confusion matrix (train)')

# Confusion matrix and per-class metrics on the held-out data.
y_predict_test = nb_classifier.predict(X_test)
cm2 = confusion_matrix(y_test, y_predict_test)
_, test_ax = plt.subplots()
sns.heatmap(cm2, annot=True, fmt='d', ax=test_ax)
test_ax.set_title('Confusion matrix (test)')
print(classification_report(y_test, y_predict_test))

# Render all figures when the file is run as a plain script.
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment