Skip to content

Instantly share code, notes, and snippets.

@varunon9
Created January 16, 2018 18:18
Show Gist options
  • Save varunon9/0d48ff2c67043ca2ec7d7c70084940aa to your computer and use it in GitHub Desktop.
Save varunon9/0d48ff2c67043ca2ec7d7c70084940aa to your computer and use it in GitHub Desktop.
Implementing the Naive Bayes classification algorithm in PHP to classify a given text as ham or spam. To see the complete project, visit: https://github.com/varunon9/naive-bayes-classifier
<?php
/**
 * Holds the two labels a document can be filed under.
 *
 * The values are exposed as public static properties and read as
 * Category::$HAM / Category::$SPAM.
 */
class Category
{
    /** Label string for the "ham" category. */
    public static $HAM = 'ham';

    /** Label string for the "spam" category. */
    public static $SPAM = 'spam';
}
?>
<?php
/**
* This file makes a connection to the database using the following parameters.
*/
// Connection parameters for the MySQL server that holds the training data.
// NOTE(review): the credentials are hard-coded in source; consider loading
// them from configuration that is kept out of version control.
$servername = "localhost";
$username = "root";
$password = "password123";
$dbname = "naiveBayes";

// Create connection
$conn = mysqli_connect($servername, $username, $password, $dbname);

// Check the connection BEFORE touching it: when connecting fails there is no
// usable handle, so continuing would only produce follow-up errors in every
// later mysqli call. Fail fast with the driver's own message instead.
if (!$conn) {
    throw new RuntimeException("Failed to connect to MySQL: " . mysqli_connect_error());
}

// All text exchanged with the server is treated as UTF-8.
if (!mysqli_set_charset($conn, "utf8")) {
    throw new RuntimeException("Failed to set MySQL charset: " . mysqli_error($conn));
}
?>
<?php
/**
* mysql> create database naiveBayes;
* mysql> use naiveBayes;
* mysql> create table trainingSet (S_NO integer primary key auto_increment, document text, category varchar(255));
* mysql> create table wordFrequency (S_NO integer primary key auto_increment, word varchar(255), count integer, category varchar(255));
*/
require_once('NaiveBayesClassifier.php');

$classifier = new NaiveBayesClassifier();
$spam = Category::$SPAM;
$ham = Category::$HAM;

// Labelled sample documents as (text, category) pairs; they are fed to the
// classifier in the order listed.
$trainingData = array(
    array(
        'Have a pleasurable stay! Get up to 30% off + Flat 20% Cashback on Oyo Room' .
        ' bookings done via Paytm',
        $spam,
    ),
    array(
        'Lets Talk Fashion! Get flat 40% Cashback on Backpacks, Watches, Perfumes,' .
        ' Sunglasses & more',
        $spam,
    ),
    array('Opportunity with Product firm for Fullstack | Backend | Frontend- Bangalore', $ham),
    array('Javascript Developer, Fullstack Developer in Bangalore- Urgent Requirement', $ham),
);

foreach ($trainingData as $sample) {
    $classifier->train($sample[0], $sample[1]);
}

// Unlabelled texts: print the category chosen for each one, back to back.
$unseenTexts = array(
    'Scan Paytm QR Code to Pay & Win 100% Cashback',
    'Re: Applying for Fullstack Developer',
);

foreach ($unseenTexts as $text) {
    $category = $classifier->classify($text);
    echo $category;
}
?>
<?php
/**
* @author Varun Kumar <[email protected]>
*/
require_once('Category.php');
/**
 * Naive Bayes text classifier that labels a document as ham or spam.
 *
 * Training data is kept in two MySQL tables reached through db_connect.php:
 *  - trainingSet   : one row per trained document with its category
 *  - wordFrequency : one row per (word, category) pair with an occurrence count
 *
 * Every public call opens its own connection (by requiring db_connect.php)
 * and closes it again before returning or re-throwing.
 */
class NaiveBayesClassifier {

    public function __construct() {
    }

    /**
     * Classifies a text (document) as ham or spam.
     *
     * @param string $sentence text to classify
     * @return string Category::$HAM or Category::$SPAM
     * @throws Exception when the database cannot be reached or a query fails
     */
    public function classify($sentence) {
        // extract keywords from the input, then let the model pick a category
        return $this->decide($this->tokenize($sentence));
    }

    /**
     * Adds one labelled document to the training data.
     *
     * The document is stored in trainingSet, and for every keyword occurrence
     * in it the matching (word, category) counter in wordFrequency is created
     * or incremented.
     *
     * @param string $sentence text/document provided as training data
     * @param string $category Category::$HAM or Category::$SPAM
     * @throws Exception on an unknown category, connection failure or query failure
     */
    public function train($sentence, $category) {
        $spam = Category::$SPAM;
        $ham = Category::$HAM;

        if ($category !== $spam && $category !== $ham) {
            // double quotes so the valid labels are actually interpolated
            throw new Exception("Unknown category. Valid categories are: $ham, $spam");
        }

        $conn = $this->connect();
        try {
            $this->runStatement(
                $conn,
                'INSERT INTO trainingSet (document, category) VALUES (?, ?)',
                array($sentence, $category)
            );

            foreach ($this->tokenize($sentence) as $word) {
                $pair = array($word, $category);
                $existing = (int) $this->fetchScalar(
                    $conn,
                    'SELECT COUNT(*) FROM wordFrequency WHERE word = ? AND category = ?',
                    $pair
                );

                if ($existing === 0) {
                    $this->runStatement(
                        $conn,
                        'INSERT INTO wordFrequency (word, category, `count`) VALUES (?, ?, 1)',
                        $pair
                    );
                } else {
                    // restrict the update to this category; otherwise training a
                    // word as spam would also inflate its ham counter (and vice versa)
                    $this->runStatement(
                        $conn,
                        'UPDATE wordFrequency SET `count` = `count` + 1 WHERE word = ? AND category = ?',
                        $pair
                    );
                }
            }
        } catch (Exception $e) {
            $conn->close();
            throw $e;
        }

        $conn->close();
    }

    /**
     * Turns a paragraph of text into an array of lowercase keywords.
     *
     * Keeps only ASCII letters and digits, drops tokens shorter than three
     * characters and drops stop words.
     *
     * @param string $sentence raw text
     * @return array list of keyword strings, in order of appearance (duplicates kept)
     */
    private function tokenize($sentence) {
        $stopWords = array('about', 'and', 'are', 'com', 'for', 'from', 'how',
            'that', 'the', 'this', 'was', 'what', 'when', 'where', 'who', 'will', 'with', 'und', 'www');

        // collapse tabs/newlines into plain spaces first; otherwise the stripping
        // step below would delete them and glue neighbouring words together
        $sentence = preg_replace('/\s+/', ' ', (string) $sentence);
        // remove every character that is not a letter, a digit or a space
        $sentence = preg_replace('/[^a-zA-Z 0-9]+/', '', $sentence);
        $sentence = strtolower($sentence);

        $keywordsArray = array();
        foreach (preg_split('/ +/', $sentence, -1, PREG_SPLIT_NO_EMPTY) as $token) {
            // skip tokens of length less than 3
            if (strlen($token) < 3) {
                continue;
            }
            if (in_array($token, $stopWords, true)) {
                continue;
            }
            $keywordsArray[] = $token;
        }

        return $keywordsArray;
    }

    /**
     * Picks the category (spam/ham) for an array of keywords using a
     * Naive Bayes classifier with Laplace smoothing.
     *
     *   p(spam | text) ~ p(spam) * p(word1 | spam) * ... * p(wordn | spam)
     *   p(ham  | text) ~ p(ham)  * p(word1 | ham)  * ... * p(wordn | ham)
     *
     *   p(spam)        = spam documents / all documents
     *   p(word | spam) = (c + 1) / (N + V)
     *       c = occurrences of the word in spam
     *       N = occurrences of all words in spam
     *       V = vocabulary size (number of different words)
     *
     * Logarithms are summed instead of multiplying probabilities. Ties go to ham.
     *
     * Reference:
     * http://stackoverflow.com/questions/9996327/using-a-naive-bayes-classifier-to-classify-tweets-some-problems
     * https://github.com/ttezel/bayes/blob/master/lib/naive_bayes.js
     *
     * @param array $keywordsArray keywords produced by tokenize()
     * @return string Category::$HAM or Category::$SPAM
     * @throws Exception when the database cannot be reached or a query fails
     */
    private function decide ($keywordsArray) {
        $conn = $this->connect();
        try {
            $category = $this->score($conn, $keywordsArray);
        } catch (Exception $e) {
            $conn->close();
            throw $e;
        }

        $conn->close();
        return $category;
    }

    /**
     * Computes the log-scores of both categories on an open connection and
     * returns the winning category.
     */
    private function score($conn, $keywordsArray) {
        $spam = Category::$SPAM;
        $ham = Category::$HAM;

        $docSql = 'SELECT COUNT(*) FROM trainingSet WHERE category = ?';
        $spamDocs = (int) $this->fetchScalar($conn, $docSql, array($spam));
        $hamDocs = (int) $this->fetchScalar($conn, $docSql, array($ham));
        $totalDocs = (int) $this->fetchScalar($conn, 'SELECT COUNT(*) FROM trainingSet');

        // nothing trained yet: there are no priors to compute, so fall back to
        // the default category instead of dividing by zero
        if ($totalDocs === 0) {
            return $ham;
        }

        // log(0) evaluates to -INF, so a category without any training
        // documents can never win against one that has some
        $spamScore = log($spamDocs / $totalDocs);
        $hamScore = log($hamDocs / $totalDocs);

        // V: number of different words, regardless of category
        $vocabulary = (int) $this->fetchScalar($conn, 'SELECT COUNT(DISTINCT word) FROM wordFrequency');

        // with an empty vocabulary there are no likelihoods; decide on priors only
        if ($vocabulary > 0) {
            // N: occurrences of all words per category
            $totalSql = 'SELECT COALESCE(SUM(`count`), 0) FROM wordFrequency WHERE category = ?';
            $spamWords = (int) $this->fetchScalar($conn, $totalSql, array($spam));
            $hamWords = (int) $this->fetchScalar($conn, $totalSql, array($ham));

            // c: occurrences of one word in one category (0 when never seen)
            $wordSql = 'SELECT COALESCE(SUM(`count`), 0) FROM wordFrequency WHERE word = ? AND category = ?';

            foreach ($keywordsArray as $word) {
                $inSpam = (int) $this->fetchScalar($conn, $wordSql, array($word, $spam));
                $inHam = (int) $this->fetchScalar($conn, $wordSql, array($word, $ham));

                $spamScore += log(($inSpam + 1) / ($spamWords + $vocabulary));
                $hamScore += log(($inHam + 1) / ($hamWords + $vocabulary));
            }
        }

        return ($hamScore >= $spamScore) ? $ham : $spam;
    }

    /**
     * Opens a database connection by requiring db_connect.php, which defines
     * $conn in the requiring scope.
     */
    private function connect() {
        require 'db_connect.php';

        if (!isset($conn) || !$conn) {
            throw new Exception('Failed to connect to MySQL: ' . mysqli_connect_error());
        }

        return $conn;
    }

    /**
     * Prepares and executes a statement, binding every parameter as a string.
     * Returns the executed statement; the caller is responsible for closing it.
     */
    private function execute($conn, $sql, array $params = array()) {
        $stmt = mysqli_prepare($conn, $sql);
        if ($stmt === false) {
            throw new Exception('Failed to prepare query: ' . mysqli_error($conn));
        }

        if (count($params) > 0) {
            // bind_param takes its values by reference, so hand it references
            $bindArgs = array(str_repeat('s', count($params)));
            foreach (array_keys($params) as $key) {
                $bindArgs[] = &$params[$key];
            }
            call_user_func_array(array($stmt, 'bind_param'), $bindArgs);
        }

        if (!mysqli_stmt_execute($stmt)) {
            $error = mysqli_stmt_error($stmt);
            mysqli_stmt_close($stmt);
            throw new Exception('Failed to execute query: ' . $error);
        }

        return $stmt;
    }

    /**
     * Executes a statement that produces no result set (INSERT/UPDATE).
     */
    private function runStatement($conn, $sql, array $params = array()) {
        mysqli_stmt_close($this->execute($conn, $sql, $params));
    }

    /**
     * Executes a query and returns the first column of its first row,
     * or null when the query yields no row.
     */
    private function fetchScalar($conn, $sql, array $params = array()) {
        $stmt = $this->execute($conn, $sql, $params);

        $value = null;
        mysqli_stmt_bind_result($stmt, $value);
        $hasRow = mysqli_stmt_fetch($stmt);
        mysqli_stmt_close($stmt);

        return ($hasRow === true) ? $value : null;
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment