Skip to content

Instantly share code, notes, and snippets.

@kaja47
Created September 10, 2012 03:51
Show Gist options
  • Select an option

  • Save kaja47/3688772 to your computer and use it in GitHub Desktop.

Select an option

Save kaja47/3688772 to your computer and use it in GitHub Desktop.
PHP spellcheck
<?php
// PHP implementation of http://norvig.com/spell-correct.html in 49 lines of code
// based on http://soundofemotion.com/spellcorrect.txt which is much longer
/**
 * Split raw text into a list of lowercase alphabetic words.
 *
 * Every run of characters outside a-z is treated as a separator, so the
 * text is lowercased first; otherwise capital letters would be consumed as
 * separators (e.g. "The" would yield "he").
 *
 * @param string $text Raw input text.
 * @return array List of non-empty words consisting only of the letters a-z.
 */
function words($text) {
    // -1 is the documented "no limit" value for preg_split.
    $tokens = preg_split("~[^a-z]+~", strtolower((string) $text), -1, PREG_SPLIT_NO_EMPTY);

    // preg_split returns false on a PCRE failure; hand callers an empty list instead.
    return $tokens === false ? array() : $tokens;
}
/**
 * Build the frequency model used by the corrector.
 *
 * @param array $features List of words (strings).
 * @return array Map of word => number of occurrences in $features.
 */
function train($features) {
    $frequencies = array_count_values($features);

    return $frequencies;
}
/**
 * Generate every string that is one simple edit away from $word.
 *
 * The edits are produced in this order: deletions, transpositions of
 * adjacent characters, alterations (one character replaced by a letter a-z)
 * and insertions (a letter a-z inserted at every position). Duplicates are
 * removed with array_unique(), so the returned keys are the positions of the
 * first occurrences and may contain gaps.
 *
 * @param string $word Word to mutate.
 * @return array Unique edited strings.
 */
function edits1($word) {
    $letters = str_split("abcdefghijklmnopqrstuvwxyz");
    $length = strlen($word);
    $results = array();

    // deletion: drop the character at position $i
    for ($i = 0; $i < $length; ++$i) {
        $results[] = substr($word, 0, $i) . substr($word, $i + 1);
    }

    // transposition: swap the characters at positions $i and $i + 1
    for ($i = 0; $i < $length - 1; ++$i) {
        $head = substr($word, 0, $i);
        $tail = substr($word, $i + 2);
        $results[] = $head . $word[$i + 1] . $word[$i] . $tail;
    }

    // alteration: overwrite the character at position $i with each letter
    foreach ($letters as $letter) {
        for ($i = 0; $i < $length; ++$i) {
            $results[] = substr($word, 0, $i) . $letter . substr($word, $i + 1);
        }
    }

    // insertion: put each letter in front of position $i (including the end)
    foreach ($letters as $letter) {
        for ($i = 0; $i <= $length; ++$i) {
            $results[] = substr($word, 0, $i) . $letter . substr($word, $i);
        }
    }

    return array_unique($results);
}
/**
 * Collect the known words that are two edits away from $word.
 *
 * Applies edits1() to $word, then again to each result, and keeps only the
 * strings that are set as keys in $nwords.
 *
 * @param string $word   Word to mutate.
 * @param array  $nwords Frequency model (word => count).
 * @return array Unique known words; keys are first-occurrence positions.
 */
function known_edits2($word, $nwords) {
    $matches = array();

    foreach (edits1($word) as $firstEdit) {
        foreach (edits1($firstEdit) as $secondEdit) {
            if (!isset($nwords[$secondEdit])) {
                continue;
            }
            $matches[] = $secondEdit;
        }
    }

    return array_unique($matches);
}
/**
 * Keep only the entries of $words that occur as keys in $nwords.
 *
 * @param array $words  List of candidate words.
 * @param array $nwords Frequency model (word => count).
 * @return array The surviving words, still keyed by their index in $words.
 */
function known($words, $nwords) {
    // word => index, so the candidates can be matched against the model's keys
    $indexByWord = array_flip($words);
    $present = array_intersect_key($indexByWord, $nwords);

    // back to index => word
    return array_flip($present);
}
/**
 * Pick the set of candidate corrections for $word.
 *
 * Preference order: the word itself when it is in the model, then known
 * words one edit away, then known words two edits away. When none of these
 * produce a result the word itself is returned as the only candidate.
 *
 * @param string $word   Word to correct.
 * @param array  $nwords Frequency model (word => count).
 * @return array Non-empty array of candidate words.
 */
function candicates($word, $nwords) {
    $exact = known(array($word), $nwords);
    if ($exact) {
        return array($word);
    }

    $oneEditAway = known(edits1($word), $nwords);
    if ($oneEditAway) {
        return $oneEditAway;
    }

    $twoEditsAway = known_edits2($word, $nwords);
    if ($twoEditsAway) {
        return $twoEditsAway;
    }

    return array($word);
}
/**
 * Return the most frequent candidate correction for $word.
 *
 * Each candidate from candicates() is weighted by its count in $nwords
 * (1 when it is not in the model). The first candidate with the highest
 * weight wins, so ties are resolved deterministically.
 *
 * A single linear scan is used instead of a by-reference foreach followed
 * by a full sort: this leaves no dangling reference behind, does not
 * overwrite the $word parameter, and does not depend on sort stability.
 *
 * @param string $word   Word to correct.
 * @param array  $nwords Frequency model (word => count).
 * @return string|null Best candidate, or null if there are no candidates.
 */
function correct($word, $nwords) {
    $best = null;
    $bestWeight = null;

    foreach (candicates($word, $nwords) as $candidate) {
        $weight = isset($nwords[$candidate]) ? $nwords[$candidate] : 1;

        // Strict ">" keeps the earliest candidate among equally weighted ones.
        if ($best === null || $weight > $bestWeight) {
            $best = $candidate;
            $bestWeight = $weight;
        }
    }

    return $best;
}
// Train the model on the corpus file and print the correction for a sample word.
$corpus = file_get_contents("big.txt");
if ($corpus === false) {
    // Without this check a missing corpus silently produces an empty model
    // and every word is echoed back uncorrected.
    throw new RuntimeException("Unable to read training corpus: big.txt");
}
$nwords = train(words($corpus));
echo correct("thay", $nwords), "\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment