Last active
August 29, 2015 14:10
-
-
Save emsifa/6f82052c5004a8d34712 to your computer and use it in GitHub Desktop.
Naive Bayes Text Classifier
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
require("NaiveBayesText.php");

$classifier = new NaiveBayesText();

// Demo word resolver: it only drops a handful of ignored words.
// A better resolver would also do tokenizing, stemming, and so on.
$classifier->wordResolver(function($word) {
    $ignored_words = array("dia", "orang", "itu", "sangat", "kalian", "tidak", "dll", "dsb");

    // Returning null tells the classifier to skip this word.
    if (in_array($word, $ignored_words)) {
        return null;
    }

    return $word;
});

// The more training sentences, the more accurate the result.
// Labels are registered in this order: "negative" first, then "positive".
$training_sentences = array(
    "negative" => array(
        "orang itu sangat jelek",
        "mereka semua malas",
        "asu luh",
        "kalian bodoh",
        "asu, merusak pemandangan aja",
    ),
    "positive" => array(
        "orang itu sangat baik",
        "dia hebat",
        "mereka orang yang baik",
        "belajarlah yang benar",
        "mereka sangat baik",
    ),
);

foreach ($training_sentences as $label => $sentences) {
    foreach ($sentences as $sentence) {
        $classifier->addTraining($label, $sentence);
    }
}

$sample_sentence = "ah asu lah, nggak guna banget";

// Expected result: array('negative' => 1, 'positive' => 0)
print_r($classifier->classify($sample_sentence));

// Expected result: true
print_r($classifier->is("negative", $sample_sentence));

// Expected result: true (isNegative() is dispatched through __call to is("negative", ...))
print_r($classifier->isNegative($sample_sentence));
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Small text classifier in the spirit of Naive Bayes.
 *
 * Training documents are stored verbatim per label. For every word of the
 * document being classified, the fraction of a label's training documents that
 * contain the word is multiplied into that label's score; words a label has
 * never seen are skipped for that label (no smoothing is applied). The scores
 * are then weighted by the label's share of all training documents and
 * normalized so they sum to 1.
 *
 * Word matching is case-sensitive; install a word resolver if case folding,
 * stop-word removal or stemming is wanted.
 */
class NaiveBayesText
{
    /** @var array Map of lower-cased label => list of raw training documents. */
    protected $training_sets = array();

    /** @var callable|null Optional callback that maps a raw word to a normalized word (or null/"" to ignore it). */
    protected $word_resolver = null;

    /** @var array Map of resolved word => per-label statistics, as returned by getWordProbability(). */
    protected $probability_cache = array();

    /**
     * Installs the callback used to normalize or filter words.
     *
     * The callback receives each raw word and returns the word to use, or
     * null / an empty string to ignore it.
     *
     * @param callable $resolver_callable
     * @throws InvalidArgumentException When the argument is not callable.
     */
    public function wordResolver($resolver_callable)
    {
        if (!is_callable($resolver_callable)) {
            throw new InvalidArgumentException("Word Resolver must be callable");
        }

        $this->word_resolver = $resolver_callable;

        // Cached statistics were computed with the previous resolver.
        $this->probability_cache = array();
    }

    /**
     * Adds one training document under the given label.
     *
     * @param string $classify Label name; stored lower-cased.
     * @param string $document Training text.
     */
    public function addTraining($classify, $document)
    {
        $classify = strtolower($classify);

        if (!array_key_exists($classify, $this->training_sets)) {
            $this->training_sets[$classify] = array();
        }

        $this->training_sets[$classify][] = $document;

        // Every cached entry holds document counts that are now out of date
        // (and may lack a newly introduced label altogether).
        $this->probability_cache = array();
    }

    /**
     * Returns, for every label, how many of its training documents contain the word.
     *
     * @param string $word Raw word; it is passed through the resolver and tokenizer first.
     * @return array|null Map of label => array('count_match' => int, 'count_datasets' => int),
     *                    or null when the word resolves to an empty string.
     */
    public function getWordProbability($word)
    {
        $word = $this->resolveWord($word);

        // Compare against '' explicitly: empty() would also discard the token "0".
        if ($word === '') {
            return null;
        }

        if (array_key_exists($word, $this->probability_cache)) {
            return $this->probability_cache[$word];
        }

        $result = array();
        foreach ($this->training_sets as $classify => $datasets) {
            $count_match = 0;
            foreach ($datasets as $training) {
                if ($this->documentContainsWord($training, $word)) {
                    $count_match += 1;
                }
            }

            $result[$classify] = array(
                'count_match' => $count_match,
                'count_datasets' => count($datasets),
            );
        }

        $this->probability_cache[$word] = $result;

        return $result;
    }

    /**
     * Returns the list of known labels, in the order they were first trained.
     *
     * @return array
     */
    public function getClassifiers()
    {
        return array_keys($this->training_sets);
    }

    /**
     * Scores a document against every label.
     *
     * @param string $document Text to classify.
     * @return array Map of label => normalized score. Scores sum to 1 unless no
     *               word matched any label, in which case every score is 0.
     */
    public function classify($document)
    {
        $words = $this->parseWords($document);
        $classifiers = $this->getClassifiers();

        $products = array();
        $matched = array();
        $count_datasets = array();
        $total_datasets = 0;

        foreach ($classifiers as $classify) {
            $count_datasets[$classify] = count($this->training_sets[$classify]);
            $total_datasets += $count_datasets[$classify];
            $products[$classify] = doubleval(1);
            $matched[$classify] = false;
        }

        foreach ($words as $word) {
            $word_probability = $this->getWordProbability($word);
            if (!$word_probability) {
                continue;
            }

            foreach ($classifiers as $classify) {
                $probability = $word_probability[$classify]['count_match']
                    / $word_probability[$classify]['count_datasets'];

                // Words this label has never seen are skipped instead of
                // zeroing the whole product.
                if ($probability > 0) {
                    $products[$classify] *= $probability;
                    $matched[$classify] = true;
                }
            }
        }

        $scores = array();
        foreach ($classifiers as $classify) {
            // A label with no matching word at all keeps a score of 0;
            // otherwise weight the product by the label's prior.
            $scores[$classify] = $matched[$classify]
                ? $products[$classify] * ($count_datasets[$classify] / $total_datasets)
                : doubleval(0);
        }

        return $this->normalizeResult($scores);
    }

    /**
     * Scales the scores so they sum to 1; zero scores (or an all-zero set) stay 0.
     *
     * @param array $scores Map of label => raw score.
     * @return array Map of label => normalized score.
     */
    protected function normalizeResult(array $scores)
    {
        $sum = array_sum($scores);

        foreach ($scores as $i => $score) {
            if ($sum == 0 || $score == 0) {
                $scores[$i] = 0;
            } else {
                $scores[$i] = $score / $sum;
            }
        }

        return $scores;
    }

    /**
     * Tells whether the document is classified as the given label.
     *
     * The label wins when no other label has a strictly higher score, so ties
     * (including the case where nothing matched and all scores are 0) return true.
     *
     * @param string $classify Label name (case-insensitive, like addTraining()).
     * @param string $document Text to classify.
     * @return bool
     * @throws Exception When the label has never been trained.
     */
    public function is($classify, $document)
    {
        // Labels are stored lower-cased by addTraining(); look them up the same way.
        $classify = strtolower($classify);

        if (!array_key_exists($classify, $this->training_sets)) {
            throw new Exception("Undefined classify {$classify}");
        }

        $classify_result = $this->classify($document);
        $classify_score = $classify_result[$classify];
        unset($classify_result[$classify]);

        foreach ($classify_result as $score) {
            if ($score > $classify_score) {
                return FALSE;
            }
        }

        return TRUE;
    }

    /**
     * Splits a text into raw words on any run of whitespace.
     *
     * @param string $text
     * @return array List of non-empty raw words.
     */
    protected function parseWords($text)
    {
        // Split on spaces, tabs and newlines alike, and drop the empty pieces
        // that consecutive separators would otherwise produce.
        $words = preg_split('/\s+/', (string) $text, -1, PREG_SPLIT_NO_EMPTY);

        return ($words === false) ? array() : $words;
    }

    /**
     * Strips leading and trailing non-alphanumeric characters from a word.
     *
     * @param mixed $word
     * @return string The cleaned word, or "" when the input is not a string.
     */
    protected function basicTokenize($word)
    {
        if (!is_string($word)) {
            return "";
        }

        return trim(preg_replace("/^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$/", "", $word));
    }

    /**
     * Runs a raw word through the optional resolver and then the basic tokenizer.
     *
     * @param string $word
     * @return string Normalized word, or "" when the word should be ignored.
     */
    protected function resolveWord($word)
    {
        if (!$this->word_resolver) {
            return $this->basicTokenize($word);
        }

        return $this->basicTokenize(call_user_func($this->word_resolver, $word));
    }

    /**
     * Tells whether a training document contains the given resolved word.
     *
     * @param string $document Raw training text.
     * @param string $word Already-resolved, non-empty word.
     * @return bool
     */
    protected function documentContainsWord($document, $word)
    {
        foreach ($this->parseWords($document) as $candidate) {
            // Strict comparison: loose == treats numeric strings such as
            // "1e3" and "1000" as equal.
            if ($this->resolveWord($candidate) === $word) {
                return true;
            }
        }

        return false;
    }

    /**
     * Supports dynamic isLabel($document) calls, e.g. isNegative($text),
     * which are forwarded to is('label', $document).
     *
     * @param string $method
     * @param array $args
     * @return bool
     * @throws InvalidArgumentException When the document argument is missing.
     * @throws Exception When the method is not of the form isXxx, or the label is unknown.
     */
    public function __call($method, $args)
    {
        if (!preg_match("/^is[A-Z]/", $method)) {
            throw new Exception("Call to undefined method {$method}");
        }

        if (!array_key_exists(0, $args)) {
            throw new InvalidArgumentException("{$method}() expects the document to classify as its first argument");
        }

        $classify = strtolower(preg_replace("/^is/", "", $method));

        return $this->is($classify, $args[0]);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment