Created
September 10, 2012 03:51
-
-
Save kaja47/3688772 to your computer and use it in GitHub Desktop.
PHP spellcheck
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| // PHP implementation of http://norvig.com/spell-correct.html in 49 lines of code | |
| // based on http://soundofemotion.com/spellcorrect.txt which is much longer | |
/**
 * Splits text into lowercase words made of the ASCII letters a-z.
 *
 * The text is lowercased first so that capitalised words are kept whole
 * ("The" becomes "the" rather than the fragment "he"); every run of
 * characters outside a-z then acts as a separator.
 *
 * @param string $text Raw input text.
 * @return array List of non-empty lowercase words in order of appearance.
 */
function words($text) {
    // -1 means "no limit"; passing null for the limit is deprecated since PHP 8.1.
    return preg_split("~[^a-z]+~", strtolower($text), -1, PREG_SPLIT_NO_EMPTY);
}
/**
 * Builds a frequency table from a list of words.
 *
 * @param array $features List of words (strings).
 * @return array Map of word => number of times it occurs in $features.
 */
function train($features) {
    $frequencies = array_count_values($features);

    return $frequencies;
}
/**
 * Generates every string that is one edit away from a word.
 *
 * The edits are, in generation order: deleting one character, swapping two
 * adjacent characters, replacing one character with a letter a-z, and
 * inserting a letter a-z at any position. Duplicates are removed with
 * array_unique(), so the keys of the result are not necessarily consecutive.
 *
 * @param string $word Word to mutate.
 * @return array Distinct edited strings.
 */
function edits1($word) {
    $letters = str_split("abcdefghijklmnopqrstuvwxyz");
    $length = strlen($word);
    $variants = array();

    // deletion: drop the character at position $i
    for ($i = 0; $i < $length; $i++) {
        $variants[] = substr($word, 0, $i) . substr($word, $i + 1);
    }

    // transposition: swap the characters at positions $i and $i + 1
    for ($i = 0; $i < $length - 1; $i++) {
        $variants[] = substr($word, 0, $i) . $word[$i + 1] . $word[$i] . substr($word, $i + 2);
    }

    // alteration: overwrite the character at position $i with each letter
    foreach ($letters as $letter) {
        for ($i = 0; $i < $length; $i++) {
            $variants[] = substr($word, 0, $i) . $letter . substr($word, $i + 1);
        }
    }

    // insertion: put each letter before position $i (including the very end)
    foreach ($letters as $letter) {
        for ($i = 0; $i <= $length; $i++) {
            $variants[] = substr($word, 0, $i) . $letter . substr($word, $i);
        }
    }

    return array_unique($variants);
}
/**
 * Collects the known words that are two edits away from a word.
 *
 * Every result of edits1($word) is run through edits1() again, and only
 * the strings that are keys of $nwords are kept.
 *
 * @param string $word   Word to mutate.
 * @param array  $nwords Table keyed by known words.
 * @return array Distinct known words reachable in two edits.
 */
function known_edits2($word, $nwords) {
    $found = array();

    foreach (edits1($word) as $firstEdit) {
        foreach (edits1($firstEdit) as $secondEdit) {
            if (!isset($nwords[$secondEdit])) {
                continue;
            }
            $found[] = $secondEdit;
        }
    }

    return array_unique($found);
}
/**
 * Filters a list of words down to those that are keys of $nwords.
 *
 * The list is flipped so the words become keys, intersected by key with
 * $nwords, then flipped back. A word listed more than once therefore
 * appears only once in the result.
 *
 * @param array $words  List of candidate words.
 * @param array $nwords Table keyed by known words.
 * @return array The words from $words that are keys of $nwords.
 */
function known($words, $nwords) {
    $positionsByWord = array_flip($words);
    $knownPositions = array_intersect_key($positionsByWord, $nwords);

    return array_flip($knownPositions);
}
/**
 * Picks the set of correction candidates for a word.
 *
 * The first non-empty set wins, in this order: the word itself when it is
 * known, known words one edit away, known words two edits away, and
 * finally the unchanged word as a fallback.
 *
 * @param string $word   Word to correct.
 * @param array  $nwords Table keyed by known words.
 * @return array Non-empty list of candidate words.
 */
function candicates($word, $nwords) {
    $unchanged = array($word);
    if (known($unchanged, $nwords)) {
        return $unchanged;
    }

    // Each wider search runs only when the narrower one found nothing.
    $oneEditAway = known(edits1($word), $nwords);
    if ($oneEditAway) {
        return $oneEditAway;
    }

    $twoEditsAway = known_edits2($word, $nwords);
    if ($twoEditsAway) {
        return $twoEditsAway;
    }

    return $unchanged;
}
/**
 * Returns the most probable spelling correction for a word.
 *
 * Each candidate produced by candicates() is weighted by its count in
 * $nwords (1 when the candidate is not in the table), and the candidate
 * with the highest weight wins. On a tie the earliest candidate is kept.
 *
 * @param string $word   Word to correct.
 * @param array  $nwords Map of word => occurrence count.
 * @return string The best candidate.
 */
function correct($word, $nwords) {
    $best = $word;
    $bestWeight = null;

    // A single pass with a strict ">" keeps the first of equally weighted
    // candidates, so the result does not depend on sort stability, and no
    // by-reference loop variable is left behind.
    foreach (candicates($word, $nwords) as $candidate) {
        $weight = isset($nwords[$candidate]) ? $nwords[$candidate] : 1;
        if ($bestWeight === null || $weight > $bestWeight) {
            $best = $candidate;
            $bestWeight = $weight;
        }
    }

    return $best;
}
// Train the frequency table on the corpus and print the best correction
// for a sample misspelling.
$corpus = file_get_contents("big.txt");
if ($corpus === false) {
    // file_get_contents() returns false when the file cannot be read; without
    // this check the table would be empty and the input echoed back unchanged.
    file_put_contents("php://stderr", "Unable to read big.txt\n");
    exit(1);
}
$nwords = train(words($corpus));
echo correct("thay", $nwords), "\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment