shravankumar147 · August 4, 2017 06:51
diff --git a/spam_filter.ipynb b/spam_filter.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Spam filter illustration](https://1335865630.rsc.cdn77.org/images/Spam-filter.jpg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "# Import from the public package: the private `sklearn.linear_model.logistic`\n",
    "# module path was deprecated in scikit-learn 0.22 and removed in 0.24.\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split, cross_val_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sample instances: \n",
      "      0                                                  1\n",
      "0   ham  Go until jurong point, crazy.. Available only ...\n",
      "1   ham                      Ok lar... Joking wif u oni...\n",
      "2  spam  Free entry in 2 a wkly comp to win FA Cup fina...\n",
      "3   ham  U dun say so early hor... U c already then say...\n",
      "4   ham  Nah I don't think he goes to usf, he lives aro...\n"
     ]
    }
   ],
   "source": [
    "# Load the tab-separated SMS Spam Collection. The file has no header row,\n",
    "# so pandas labels the columns 0 (ham/spam label) and 1 (message text).\n",
    "df = pd.read_csv(\n",
    "    '../dataset/SMSSpamCollection',\n",
    "    delimiter='\\t',\n",
    "    header=None,\n",
    ")\n",
    "\n",
    "# Preview the first five messages\n",
    "print(\"sample instances: \")\n",
    "print(df.head())\n",
    "\n",
    "# Class counts, kept disabled as before (ported to Python 3 print syntax):\n",
    "# print('Number of spam messages:', df[df[0] == 'spam'][0].count())\n",
    "# print('Number of ham messages:', df[df[0] == 'ham'][0].count())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![TF-IDF weighting diagram](http://www.bloter.net/wp-content/uploads/2016/09/td-idf-graphic-765x255.png)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Split the raw messages (column 1) and their labels (column 0) into\n",
    "# train and test sets. A fixed random_state makes the split, and every\n",
    "# score computed from it, reproducible across kernel restarts.\n",
    "X_train_raw, X_test_raw, y_train, y_test = train_test_split(\n",
    "    df[1], df[0], random_state=42\n",
    ")\n",
    "\n",
    "# Learn the TF-IDF vocabulary and IDF weights from the training messages\n",
    "# only, then apply that same fitted transform to the test messages.\n",
    "vectorizer = TfidfVectorizer()\n",
    "X_train = vectorizer.fit_transform(X_train_raw)\n",
    "X_test = vectorizer.transform(X_test_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy on test data:\n",
      "Accuracy: 0.9698492462311558\n",
      "Cross Validation Accuracy:\n",
      "Accuracy: 0.91 (+/- 0.02)\n"
     ]
    }
   ],
   "source": [
    "# Fit a logistic-regression classifier on the TF-IDF training features\n",
    "classifier = LogisticRegression()\n",
    "classifier.fit(X_train, y_train)\n",
    "predictions = classifier.predict(X_test)\n",
    "\n",
    "# Held-out accuracy: fraction of test messages labelled correctly\n",
    "print(\"Accuracy on test data:\")\n",
    "score = classifier.score(X_test, y_test)\n",
    "print(\"Accuracy: {}\".format(score))\n",
    "\n",
    "# 5-fold cross-validation on the training split. The test split stays\n",
    "# reserved for the single held-out score above. cross_val_score fits a\n",
    "# fresh clone of the estimator per fold, so `classifier` is left untouched.\n",
    "print(\"Cross Validation Accuracy:\")\n",
    "scores = cross_val_score(classifier, X_train, y_train, cv=5)\n",
    "# Report the mean fold accuracy with a +/- two-standard-deviation spread\n",
    "print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![Alt](https://s-media-cache-ak0.pinimg.com/originals/d5/78/1e/d5781ee5271df81e4a42807312794e61.jpg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"![Spam filter illustration](https://1335865630.rsc.cdn77.org/images/Spam-filter.jpg)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"\n",
	"from sklearn.feature_extraction.text import TfidfVectorizer\n",
	"# Import from the public package: the private `sklearn.linear_model.logistic`\n",
	"# module path was deprecated in scikit-learn 0.22 and removed in 0.24.\n",
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.model_selection import train_test_split, cross_val_score"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"sample instances: \n",
	" 0 1\n",
	"0 ham Go until jurong point, crazy.. Available only ...\n",
	"1 ham Ok lar... Joking wif u oni...\n",
	"2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
	"3 ham U dun say so early hor... U c already then say...\n",
	"4 ham Nah I don't think he goes to usf, he lives aro...\n"
	]
	}
	],
	"source": [
	"# Load the tab-separated SMS Spam Collection. The file has no header row,\n",
	"# so pandas labels the columns 0 (ham/spam label) and 1 (message text).\n",
	"df = pd.read_csv(\n",
	"    '../dataset/SMSSpamCollection',\n",
	"    delimiter='\\t',\n",
	"    header=None,\n",
	")\n",
	"\n",
	"# Preview the first five messages\n",
	"print(\"sample instances: \")\n",
	"print(df.head())\n",
	"\n",
	"# Class counts, kept disabled as before (ported to Python 3 print syntax):\n",
	"# print('Number of spam messages:', df[df[0] == 'spam'][0].count())\n",
	"# print('Number of ham messages:', df[df[0] == 'ham'][0].count())"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"![TF-IDF weighting diagram](http://www.bloter.net/wp-content/uploads/2016/09/td-idf-graphic-765x255.png)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Split the raw messages (column 1) and their labels (column 0) into\n",
	"# train and test sets. A fixed random_state makes the split, and every\n",
	"# score computed from it, reproducible across kernel restarts.\n",
	"X_train_raw, X_test_raw, y_train, y_test = train_test_split(\n",
	"    df[1], df[0], random_state=42\n",
	")\n",
	"\n",
	"# Learn the TF-IDF vocabulary and IDF weights from the training messages\n",
	"# only, then apply that same fitted transform to the test messages.\n",
	"vectorizer = TfidfVectorizer()\n",
	"X_train = vectorizer.fit_transform(X_train_raw)\n",
	"X_test = vectorizer.transform(X_test_raw)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Accuracy on test data:\n",
	"Accuracy: 0.9698492462311558\n",
	"Cross Validation Accuracy:\n",
	"Accuracy: 0.91 (+/- 0.02)\n"
	]
	}
	],
	"source": [
	"# Fit a logistic-regression classifier on the TF-IDF training features\n",
	"classifier = LogisticRegression()\n",
	"classifier.fit(X_train, y_train)\n",
	"predictions = classifier.predict(X_test)\n",
	"\n",
	"# Held-out accuracy: fraction of test messages labelled correctly\n",
	"print(\"Accuracy on test data:\")\n",
	"score = classifier.score(X_test, y_test)\n",
	"print(\"Accuracy: {}\".format(score))\n",
	"\n",
	"# 5-fold cross-validation on the training split. The test split stays\n",
	"# reserved for the single held-out score above. cross_val_score fits a\n",
	"# fresh clone of the estimator per fold, so `classifier` is left untouched.\n",
	"print(\"Cross Validation Accuracy:\")\n",
	"scores = cross_val_score(classifier, X_train, y_train, cv=5)\n",
	"# Report the mean fold accuracy with a +/- two-standard-deviation spread\n",
	"print(\"Accuracy: %0.2f (+/- %0.2f)\" % (scores.mean(), scores.std() * 2))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"![Alt](https://s-media-cache-ak0.pinimg.com/originals/d5/78/1e/d5781ee5271df81e4a42807312794e61.jpg)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}