Skip to content

Instantly share code, notes, and snippets.

@shravankumar147
Last active August 4, 2017 06:51
Show Gist options
  • Save shravankumar147/16a325ebb3d590f97ca00ceae5365b11 to your computer and use it in GitHub Desktop.
Save shravankumar147/16a325ebb3d590f97ca00ceae5365b11 to your computer and use it in GitHub Desktop.
Spam Filtering
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Alt](https://1335865630.rsc.cdn77.org/images/Spam-filter.jpg)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd \n",
"import numpy as np \n",
"import matplotlib.pyplot as plt \n",
"\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model.logistic import LogisticRegression\n",
"from sklearn.model_selection import train_test_split, cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sample instances: \n",
" 0 1\n",
"0 ham Go until jurong point, crazy.. Available only ...\n",
"1 ham Ok lar... Joking wif u oni...\n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
"3 ham U dun say so early hor... U c already then say...\n",
"4 ham Nah I don't think he goes to usf, he lives aro...\n"
]
}
],
"source": [
"# read data file using pandas csv reader\n",
"df = pd.read_csv('../dataset/SMSSpamCollection', delimiter='\\t', header=None)\n",
"\n",
"# print out the first 5 SMS\n",
"print(\"sample instances: \")\n",
"print(df.head())\n",
"\n",
"# print 'Number of spam messages:', df[df[0] == 'spam'][0].count()\n",
"# print 'Number of ham messages:', df[df[0] == 'ham'][0].count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Alt](http://www.bloter.net/wp-content/uploads/2016/09/td-idf-graphic-765x255.png)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# pre processing the data\n",
"X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1],df[0])\n",
"vectorizer = TfidfVectorizer()\n",
"X_train = vectorizer.fit_transform(X_train_raw)\n",
"X_test = vectorizer.transform(X_test_raw)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy on test data:\n",
"Accuracy: 0.9698492462311558\n",
"Cross Validation Accuracy:\n",
"Accuracy: 0.91 (+/- 0.02)\n"
]
}
],
"source": [
"# model creation and training\n",
"classifier = LogisticRegression()\n",
"classifier.fit(X_train, y_train)\n",
"predictions = classifier.predict(X_test)\n",
"\n",
"print(\"Accuracy on test data:\")\n",
"\n",
"score =classifier.score(X_test, y_test)\n",
"print(\"Accuracy: {}\".format(score))\n",
"\n",
"print(\"Cross Validation Accuracy:\")\n",
"scores = cross_val_score(classifier, X_test, y_test, cv=5)\n",
"print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Alt](https://s-media-cache-ak0.pinimg.com/originals/d5/78/1e/d5781ee5271df81e4a42807312794e61.jpg)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment