Skip to content

Instantly share code, notes, and snippets.

@snellingio
Last active May 23, 2020 16:30
Show Gist options
  • Select an option

  • Save snellingio/167dc0c0f1aae52835c8ac81c139b4f7 to your computer and use it in GitHub Desktop.

Select an option

Save snellingio/167dc0c0f1aae52835c8ac81c139b4f7 to your computer and use it in GitHub Desktop.
Form Validation with Decision Trees
<?php
/**
 * Binary decision-tree classifier trained by greedy entropy reduction.
 *
 * Every training item is an associative array of attribute => value plus one
 * attribute holding the category label. Numeric attribute values are split
 * with ">=" and all other values with a loose "==".
 *
 * Category labels are used as array keys while counting, so they should be
 * strings or integers; PHP's array-key normalisation applies to them (for
 * example boolean true is counted, and returned, as integer 1).
 */
class DecisionTree
{
    /**
     * Root node of the trained tree.
     *
     * A leaf is ['category' => mixed]; a branch carries 'attribute',
     * 'predicate', 'predicateName', 'pivot', 'match', 'notMatch',
     * 'matchedCount' and 'notMatchedCount'.
     *
     * @var array
     */
    protected $root;

    /**
     * Trains the tree.
     *
     * @param array $options Recognised keys (all optional):
     *                       - trainingSet       list of items, default []
     *                       - ignoredAttributes attribute names never used for splitting, default []
     *                       - categoryAttr      name of the label attribute, default 'category'
     *                       - minItemsCount     sets of this size or smaller become leaves, default 1
     *                       - entropyThreshold  sets at or below this entropy become leaves, default 0.01
     *                       - maxTreeDepth      maximum number of nested splits, default 256
     */
    public function __construct($options)
    {
        $optionsArray = [
            'trainingSet' => $options['trainingSet'] ?? [],
            'ignoredAttributes' => $options['ignoredAttributes'] ?? [],
            'categoryAttr' => $options['categoryAttr'] ?? 'category',
            'minItemsCount' => $options['minItemsCount'] ?? 1,
            'entropyThreshold' => $options['entropyThreshold'] ?? 0.01,
            'maxTreeDepth' => $options['maxTreeDepth'] ?? 256,
        ];
        $this->root = $this->buildTree($optionsArray);
    }

    /**
     * Classifies a single item.
     *
     * @param array $item attribute => value map; attributes the tree tests but
     *                    the item lacks are treated as null
     *
     * @return mixed the predicted category, or null when the tree was trained
     *               on an empty set
     */
    public function predict($item)
    {
        return $this->predictTree($this->root, $item);
    }

    /**
     * Recursively builds the (sub)tree for $options['trainingSet'].
     *
     * @param array $options fully populated option array (see the constructor)
     *
     * @return array a leaf or branch node
     */
    protected function buildTree($options)
    {
        // Loose comparisons are deliberate: items may mix booleans, numbers
        // and strings (e.g. a truthy string is expected to match pivot true).
        // The closures are static so the stored tree does not hold a
        // reference back to this object.
        $predicates = [
            '==' => static function ($a, $b) {
                return $a == $b;
            },
            '>=' => static function ($a, $b) {
                return $a >= $b;
            },
        ];
        $trainingSet = $options['trainingSet'];
        $minItemsCount = $options['minItemsCount'];
        $categoryAttr = $options['categoryAttr'];
        $entropyThreshold = $options['entropyThreshold'];
        $maxTreeDepth = $options['maxTreeDepth'];
        $ignoredAttributes = $options['ignoredAttributes'];
        $total = count($trainingSet);

        if (($maxTreeDepth == 0) || ($total <= $minItemsCount)) {
            return [
                'category' => $this->mostFrequentValue($trainingSet, $categoryAttr),
            ];
        }

        $initialEntropy = $this->entropy($trainingSet, $categoryAttr);
        if ($initialEntropy <= $entropyThreshold) {
            return [
                'category' => $this->mostFrequentValue($trainingSet, $categoryAttr),
            ];
        }

        // Lookup set keyed by attribute name. Array keys are normalised the
        // same way on both sides, so an item key 0 matches an ignored "0"
        // without relying on loose comparison between unrelated names.
        $ignored = [];
        foreach ($ignoredAttributes as $ignoredName) {
            $ignored[(string) $ignoredName] = true;
        }

        $alreadyChecked = [];
        $bestSplit = null;
        $bestGain = 0;

        // Walk the items last-to-first (ties between equally good splits are
        // resolved in favour of the first candidate seen). array_reverse()
        // also makes this independent of the training set's own keys.
        foreach (array_reverse($trainingSet) as $item) {
            foreach ($item as $attr => $value) {
                if ((string) $attr === (string) $categoryAttr || isset($ignored[$attr])) {
                    continue;
                }

                $pivot = $value;
                $predicateName = is_numeric($pivot) ? '>=' : '==';

                // Evaluate each (attribute, predicate, pivot) combination once.
                $attrPredPivot = $attr . $predicateName . $pivot;
                if (isset($alreadyChecked[$attrPredPivot])) {
                    continue;
                }
                $alreadyChecked[$attrPredPivot] = true;

                $predicate = $predicates[$predicateName];
                $currSplit = $this->split($trainingSet, $attr, $predicate, $pivot);
                $matchCount = count($currSplit['match']);
                $notMatchCount = count($currSplit['notMatch']);

                // A split that leaves one side empty separates nothing; skip
                // it so floating-point noise in the entropy sums can never
                // make it look like a (tiny) gain.
                if ($matchCount === 0 || $notMatchCount === 0) {
                    continue;
                }

                // Size-weighted average entropy of the two halves.
                $newEntropy = 0;
                $newEntropy += $this->entropy($currSplit['match'], $categoryAttr) * $matchCount;
                $newEntropy += $this->entropy($currSplit['notMatch'], $categoryAttr) * $notMatchCount;
                $newEntropy /= $total;

                $currGain = $initialEntropy - $newEntropy;
                if ($currGain > $bestGain) {
                    $bestGain = $currGain;
                    $bestSplit = $currSplit;
                    $bestSplit['predicateName'] = $predicateName;
                    $bestSplit['predicate'] = $predicate;
                    $bestSplit['attribute'] = $attr;
                    $bestSplit['pivot'] = $pivot;
                    $bestSplit['gain'] = $currGain;
                }
            }
        }

        // No candidate reduced the entropy (e.g. items identical in every
        // attribute but labelled differently): stop with a majority leaf.
        if ($bestSplit === null) {
            return [
                'category' => $this->mostFrequentValue($trainingSet, $categoryAttr),
            ];
        }

        $options['maxTreeDepth'] = $maxTreeDepth - 1;
        $options['trainingSet'] = $bestSplit['match'];
        $matchSubTree = $this->buildTree($options);
        $options['trainingSet'] = $bestSplit['notMatch'];
        $notMatchSubTree = $this->buildTree($options);

        return [
            'attribute' => $bestSplit['attribute'],
            'predicate' => $bestSplit['predicate'],
            'predicateName' => $bestSplit['predicateName'],
            'pivot' => $bestSplit['pivot'],
            'match' => $matchSubTree,
            'notMatch' => $notMatchSubTree,
            'matchedCount' => count($bestSplit['match']),
            'notMatchedCount' => count($bestSplit['notMatch']),
        ];
    }

    /**
     * Counts how often each value of $attr occurs in $items.
     *
     * @param array  $items list of items
     * @param string $attr  attribute to count
     *
     * @return array value => occurrence count, keyed in last-to-first order of
     *               first appearance; items lacking $attr are counted under ''
     */
    protected function countUniqueValues($items, $attr)
    {
        $counter = [];
        foreach (array_reverse($items) as $item) {
            $key = $item[$attr] ?? '';
            $counter[$key] = ($counter[$key] ?? 0) + 1;
        }

        return $counter;
    }

    /**
     * Shannon entropy (natural logarithm) of the $attr values in $items.
     *
     * @param array  $items list of items
     * @param string $attr  attribute whose distribution is measured
     *
     * @return float 0.0 for an empty or single-valued set
     */
    protected function entropy($items, $attr)
    {
        $counter = $this->countUniqueValues($items, $attr);
        $total = count($items);
        $entropy = 0.0;
        foreach ($counter as $occurrences) {
            $p = $occurrences / $total;
            $entropy -= $p * log($p);
        }

        return $entropy;
    }

    /**
     * Returns the most common value of $attr in $items.
     *
     * @param array  $items list of items
     * @param string $attr  attribute to inspect
     *
     * @return mixed the most frequent value (first one counted wins a tie),
     *               or null when $items is empty
     */
    protected function mostFrequentValue($items, $attr)
    {
        $counter = $this->countUniqueValues($items, $attr);
        $mostFrequentCount = 0;
        $mostFrequentValue = null;
        foreach ($counter as $key => $value) {
            if ($value > $mostFrequentCount) {
                $mostFrequentCount = $value;
                $mostFrequentValue = $key;
            }
        }

        return $mostFrequentValue;
    }

    /**
     * Walks $tree from the given node down to a leaf for $item.
     *
     * @param array $tree node to start from
     * @param array $item attribute => value map
     *
     * @return mixed the category stored in the reached leaf
     */
    protected function predictTree($tree, $item)
    {
        // array_key_exists() rather than isset(): a leaf whose category is
        // null must still be recognised as a leaf.
        while (!array_key_exists('category', $tree)) {
            $value = $item[$tree['attribute']] ?? null;
            $predicate = $tree['predicate'];
            $tree = $predicate($value, $tree['pivot']) ? $tree['match'] : $tree['notMatch'];
        }

        return $tree['category'];
    }

    /**
     * Partitions $items by whether $predicate($item[$attr], $pivot) holds.
     *
     * @param array    $items     list of items
     * @param string   $attr      attribute to test
     * @param callable $predicate function (mixed $value, mixed $pivot): bool
     * @param mixed    $pivot     value compared against
     *
     * @return array ['match' => array, 'notMatch' => array]; both lists hold
     *               the items in last-to-first order of $items
     */
    protected function split($items, $attr, $predicate, $pivot)
    {
        $match = [];
        $notMatch = [];
        foreach (array_reverse($items) as $item) {
            // Items lacking the attribute are compared as null.
            if ($predicate($item[$attr] ?? null, $pivot)) {
                $match[] = $item;
            } else {
                $notMatch[] = $item;
            }
        }

        return [
            'match' => $match,
            'notMatch' => $notMatch,
        ];
    }
}
<?php
// Sign-in form demo: the submitted email/password are reduced to two features
// (is the email syntactically valid, how long is the password) and a decision
// tree trained on four hand-written examples picks the outcome message.
require_once 'DecisionTree.php';

// A crafted request (email[]=x) delivers arrays; treat anything that is not a
// plain string as "not submitted" so strlen()/filter_var() only see strings.
$email = $_POST['email'] ?? '';
$password = $_POST['password'] ?? '';
if (!is_string($email)) {
    $email = '';
}
if (!is_string($password)) {
    $password = '';
}

// Compare against '' explicitly: the input "0" is falsy but was submitted.
if ($email !== '' || $password !== '') {
    // Each category encodes "<valid flag>, <message>".
    $trainingSet = [
        [
            'emailValidation' => true,
            'passwordLength' => 6,
            'valid' => 'false, Password too short',
        ],
        [
            'emailValidation' => false,
            'passwordLength' => 7,
            'valid' => 'false, Email not valid',
        ],
        [
            'emailValidation' => false,
            'passwordLength' => 6,
            'valid' => 'false, Email not valid & password too short',
        ],
        [
            'emailValidation' => true,
            'passwordLength' => 7,
            'valid' => 'true, Welcome!',
        ],
    ];
    $tree = new DecisionTree(
        [
            'trainingSet' => $trainingSet,
            'categoryAttr' => 'valid',
        ]
    );
    $form = [
        // filter_var() returns the address or false; normalise to the
        // boolean the training data uses.
        'emailValidation' => filter_var($email, FILTER_VALIDATE_EMAIL) !== false,
        // Byte length, matching the integer thresholds in the training set.
        'passwordLength' => strlen($password),
    ];
    // Limit 2 keeps any commas inside the message intact.
    $prediction = array_map('trim', explode(',', (string) $tree->predict($form), 2));
    // (bool) 'false' is true in PHP, so compare with the literal flag instead.
    $valid = $prediction[0] === 'true';
    $message = $prediction[1] ?? '';
}
?>
<div class="container">
<?php if (isset($prediction)): ?>
<?=htmlspecialchars($message, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')?>
<br><br>
<?php endif?>
<form action="index.php" method="post">
<label>Email address</label>
<br>
<input type="text" placeholder="Email" autofocus="" name="email">
<br><br>
<label>Password</label>
<br>
<input type="password" placeholder="Password" name="password">
<br><br>
<button type="submit">Sign in</button>
</form>
</div>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment