Created
October 21, 2015 08:40
-
-
Save hyper0x/78b3df6eac18c36871a9 to your computer and use it in GitHub Desktop.
Peter Norvig’s Spelling Corrector in Go by Yi.Wang
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io/ioutil" | |
"regexp" | |
"strings" | |
"time" | |
) | |
func train(training_data string) map[string]int { | |
NWORDS := make(map[string]int) | |
pattern := regexp.MustCompile("[a-z]+") | |
if content, err := ioutil.ReadFile(training_data); err == nil { | |
for _, w := range pattern.FindAllString(strings.ToLower(string(content)), -1) { | |
NWORDS[w]++; | |
} | |
} else { | |
panic("Failed loading training data. Get it from http://norvig.com/big.txt.") | |
} | |
return NWORDS | |
} | |
func edits1(word string, ch chan string) { | |
const alphabet = "abcdefghijklmnopqrstuvwxyz" | |
type Pair struct{a, b string} | |
var splits []Pair | |
for i := 0; i < len(word) + 1; i++ { | |
splits = append(splits, Pair{word[:i], word[i:]}) } | |
for _, s := range splits { | |
if len(s.b) > 0 { ch <- s.a + s.b[1:] } | |
if len(s.b) > 1 { ch <- s.a + string(s.b[1]) + string(s.b[0]) + s.b[2:] } | |
for _, c := range alphabet { if len(s.b) > 0 { ch <- s.a + string(c) + s.b[1:] }} | |
for _, c := range alphabet { ch <- s.a + string(c) + s.b } | |
} | |
} | |
func edits2(word string, ch chan string) { | |
ch1 := make(chan string, 1024*1024) | |
go func() { edits1(word, ch1); ch1 <- "" }() | |
for e1 := range ch1 { | |
if e1 == "" { break } | |
edits1(e1, ch) | |
} | |
} | |
func best(word string, edits func(string, chan string), model map[string]int) string { | |
ch := make(chan string, 1024*1024) | |
go func() { edits(word, ch); ch <- "" }() | |
maxFreq := 0 | |
correction := "" | |
for word := range ch { | |
if word == "" { break } | |
if freq, present := model[word]; present && freq > maxFreq { | |
maxFreq, correction = freq, word | |
} | |
} | |
return correction | |
} | |
func correct(word string, model map[string]int) string { | |
if _, present := model[word]; present { return word } | |
if correction := best(word, edits1, model); correction != "" { return correction } | |
if correction := best(word, edits2, model); correction != "" { return correction } | |
return word | |
} | |
func main() { | |
model := train("big.txt") | |
startTime := time.Nanoseconds() | |
for i := 0; i < 1; i++ { fmt.Println(correct("korrecter", model)) } | |
fmt.Printf("Time : %v\n", float64(time.Nanoseconds() - startTime) / float64(1e9)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The article illustrate this issue at http://norvig.com/spell-correct.html .