Created
January 13, 2014 01:50
-
-
Save fj/8393399 to your computer and use it in GitHub Desktop.
SymSpell fast-spellcheck algorithm from faroo.com. Original here: http://blog.faroo.com/2012/06/24/1000x-faster-spelling-correction-source-code-released/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// SymSpell: 1000x faster through Symmetric Delete spelling correction algorithm | |
// | |
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup | |
// for a given Damerau-Levenshtein distance. It is three orders of magnitude faster and language independent. | |
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts. | |
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term. | |
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters! | |
// | |
// Copyright (C) 2012 Wolf Garbe, FAROO Limited | |
// Version: 1.6 | |
// Author: Wolf Garbe <[email protected]> | |
// Maintainer: Wolf Garbe <[email protected]> | |
// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/ | |
// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/ | |
// | |
// License: | |
// This program is free software; you can redistribute it and/or modify | |
// it under the terms of the GNU Lesser General Public License, | |
// version 3.0 (LGPL-3.0) as published by the Free Software Foundation. | |
// http://www.opensource.org/licenses/LGPL-3.0 | |
// | |
// Usage: single word + Enter: Display spelling suggestions | |
// Enter without input: Terminate the program | |
using System; | |
using System.Linq; | |
using System.Text.RegularExpressions; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Diagnostics; | |
static class SymSpell | |
{ | |
private static int editDistanceMax=2; | |
private static int verbose = 0; | |
//0: top suggestion | |
//1: all suggestions of smallest edit distance | |
//2: all suggestions <= editDistanceMax (slower, no early termination) | |
private class dictionaryItem | |
{ | |
public string term = ""; | |
public List<editItem> suggestions = new List<editItem>(); | |
public int count = 0; | |
public override bool Equals(object obj) | |
{ | |
return Equals(term, ((dictionaryItem)obj).term); | |
} | |
public override int GetHashCode() | |
{ | |
return term.GetHashCode(); | |
} | |
} | |
private class editItem | |
{ | |
public string term = ""; | |
public int distance = 0; | |
public override bool Equals(object obj) | |
{ | |
return Equals(term, ((editItem)obj).term); | |
} | |
public override int GetHashCode() | |
{ | |
return term.GetHashCode(); | |
} | |
} | |
private class suggestItem | |
{ | |
public string term = ""; | |
public int distance = 0; | |
public int count = 0; | |
public override bool Equals(object obj) | |
{ | |
return Equals(term, ((suggestItem)obj).term); | |
} | |
public override int GetHashCode() | |
{ | |
return term.GetHashCode(); | |
} | |
} | |
private static Dictionary<string, dictionaryItem> dictionary = new Dictionary<string, dictionaryItem>(); | |
//create a non-unique wordlist from sample text | |
//language independent (e.g. works with Chinese characters) | |
private static IEnumerable<string> parseWords(string text) | |
{ | |
return Regex.Matches(text.ToLower(), @"[\w-[\d_]]+") | |
.Cast<Match>() | |
.Select(m => m.Value); | |
} | |
//for every word there all deletes with an edit distance of 1..editDistanceMax created and added to the dictionary | |
//every delete entry has a suggestions list, which points to the original term(s) it was created from | |
//The dictionary may be dynamically updated (word frequency and new words) at any time by calling createDictionaryEntry | |
private static bool CreateDictionaryEntry(string key, string language) | |
{ | |
bool result = false; | |
dictionaryItem value; | |
if (dictionary.TryGetValue(language+key, out value)) | |
{ | |
//already exists: | |
//1. word appears several times | |
//2. word1==deletes(word2) | |
value.count++; | |
} | |
else | |
{ | |
value = new dictionaryItem(); | |
value.count++; | |
dictionary.Add(language+key, value); | |
} | |
//edits/suggestions are created only once, no matter how often word occurs | |
//edits/suggestions are created only as soon as the word occurs in the corpus, | |
//even if the same term existed before in the dictionary as an edit from another word | |
if (string.IsNullOrEmpty(value.term)) | |
{ | |
result = true; | |
value.term = key; | |
//create deletes | |
foreach (editItem delete in Edits(key, 0, true)) | |
{ | |
editItem suggestion = new editItem(); | |
suggestion.term = key; | |
suggestion.distance = delete.distance; | |
dictionaryItem value2; | |
if (dictionary.TryGetValue(language+delete.term, out value2)) | |
{ | |
//already exists: | |
//1. word1==deletes(word2) | |
//2. deletes(word1)==deletes(word2) | |
if (!value2.suggestions.Contains(suggestion)) AddLowestDistance(value2.suggestions, suggestion); | |
} | |
else | |
{ | |
value2 = new dictionaryItem(); | |
value2.suggestions.Add(suggestion); | |
dictionary.Add(language+delete.term, value2); | |
} | |
} | |
} | |
return result; | |
} | |
//create a frequency disctionary from a corpus | |
private static void CreateDictionary(string corpus, string language) | |
{ | |
if (!File.Exists(corpus)) | |
{ | |
Console.Error.WriteLine("File not found: " + corpus); | |
return; | |
} | |
Console.Write("Creating dictionary ..."); | |
long wordCount = 0; | |
foreach (string key in parseWords(File.ReadAllText(corpus))) | |
{ | |
if (CreateDictionaryEntry(key, language)) wordCount++; | |
} | |
Console.WriteLine("\rDictionary created: " + wordCount.ToString("N0") + " words, " + dictionary.Count.ToString("N0") + " entries, for edit distance=" + editDistanceMax.ToString()); | |
} | |
//save some time and space | |
private static void AddLowestDistance(List<editItem> suggestions, editItem suggestion) | |
{ | |
//remove all existing suggestions of higher distance, if verbose<2 | |
if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > suggestion.distance)) suggestions.Clear(); | |
//do not add suggestion of higher distance than existing, if verbose<2 | |
if ((verbose == 2) || (suggestions.Count == 0) || (suggestions[0].distance >= suggestion.distance)) suggestions.Add(suggestion); | |
} | |
//inexpensive and language independent: only deletes, no transposes + replaces + inserts | |
//replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters) | |
private static List<editItem> Edits(string word, int editDistance, bool recursion) | |
{ | |
editDistance++; | |
List<editItem> deletes = new List<editItem>(); | |
if (word.Length > 1) | |
{ | |
for (int i = 0; i < word.Length; i++) | |
{ | |
editItem delete = new editItem(); | |
delete.term=word.Remove(i, 1); | |
delete.distance=editDistance; | |
if (!deletes.Contains(delete)) | |
{ | |
deletes.Add(delete); | |
//recursion, if maximum edit distance not yet reached | |
if (recursion && (editDistance < editDistanceMax)) | |
{ | |
foreach (editItem edit1 in Edits(delete.term, editDistance,recursion)) | |
{ | |
if (!deletes.Contains(edit1)) deletes.Add(edit1); | |
} | |
} | |
} | |
} | |
} | |
return deletes; | |
} | |
private static int TrueDistance(editItem dictionaryOriginal, editItem inputDelete, string inputOriginal) | |
{ | |
//We allow simultaneous edits (deletes) of editDistanceMax on on both the dictionary and the input term. | |
//For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax. | |
//For inserts and deletes the resulting edit distance might exceed editDistanceMax. | |
//To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides. | |
//Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMaxe=1) | |
//Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2. | |
if (dictionaryOriginal.term == inputOriginal) return 0; else | |
if (dictionaryOriginal.distance == 0) return inputDelete.distance; | |
else if (inputDelete.distance == 0) return dictionaryOriginal.distance; | |
else return DamerauLevenshteinDistance(dictionaryOriginal.term, inputOriginal);//adjust distance, if both distances>0 | |
} | |
private static List<suggestItem> Lookup(string input, string language, int editDistanceMax) | |
{ | |
List<editItem> candidates = new List<editItem>(); | |
//add original term | |
editItem item = new editItem(); | |
item.term = input; | |
item.distance = 0; | |
candidates.Add(item); | |
List<suggestItem> suggestions = new List<suggestItem>(); | |
dictionaryItem value; | |
while (candidates.Count>0) | |
{ | |
editItem candidate = candidates[0]; | |
candidates.RemoveAt(0); | |
//save some time | |
//early termination | |
//suggestion distance=candidate.distance... candidate.distance+editDistanceMax | |
//if canddate distance is already higher than suggestion distance, than there are no better suggestions to be expected | |
if ((verbose < 2)&&(suggestions.Count > 0)&&(candidate.distance > suggestions[0].distance)) goto sort; | |
if (candidate.distance > editDistanceMax) goto sort; | |
if (dictionary.TryGetValue(language+candidate.term, out value)) | |
{ | |
if (!string.IsNullOrEmpty(value.term)) | |
{ | |
//correct term | |
suggestItem si = new suggestItem(); | |
si.term = value.term; | |
si.count = value.count; | |
si.distance = candidate.distance; | |
if (!suggestions.Contains(si)) | |
{ | |
suggestions.Add(si); | |
//early termination | |
if ((verbose < 2) && (candidate.distance == 0)) goto sort; | |
} | |
} | |
//edit term (with suggestions to correct term) | |
dictionaryItem value2; | |
foreach (editItem suggestion in value.suggestions) | |
{ | |
//save some time | |
//skipping double items early | |
if (suggestions.Find(x => x.term == suggestion.term) == null) | |
{ | |
int distance = TrueDistance(suggestion, candidate, input); | |
//save some time. | |
//remove all existing suggestions of higher distance, if verbose<2 | |
if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance)) suggestions.Clear(); | |
//do not process higher distances than those already found, if verbose<2 | |
if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance)) continue; | |
if (distance <= editDistanceMax) | |
{ | |
if (dictionary.TryGetValue(language+suggestion.term, out value2)) | |
{ | |
suggestItem si = new suggestItem(); | |
si.term = value2.term; | |
si.count = value2.count; | |
si.distance = distance; | |
suggestions.Add(si); | |
} | |
} | |
} | |
} | |
}//end foreach | |
//add edits | |
if (candidate.distance < editDistanceMax) | |
{ | |
foreach (editItem delete in Edits(candidate.term, candidate.distance,false)) | |
{ | |
if (!candidates.Contains(delete)) candidates.Add(delete); | |
} | |
} | |
}//end while | |
sort: suggestions = suggestions.OrderBy(c => c.distance).ThenByDescending(c => c.count).ToList(); | |
if ((verbose == 0)&&(suggestions.Count>1)) return suggestions.GetRange(0, 1); else return suggestions; | |
} | |
private static void Correct(string input, string language) | |
{ | |
List<suggestItem> suggestions = null; | |
/* | |
//Benchmark: 1000 x Lookup | |
Stopwatch stopWatch = new Stopwatch(); | |
stopWatch.Start(); | |
for (int i = 0; i < 1000; i++) | |
{ | |
suggestions = Lookup(input,language,editDistanceMax); | |
} | |
stopWatch.Stop(); | |
Console.WriteLine(stopWatch.ElapsedMilliseconds.ToString()); | |
*/ | |
//check in dictionary for existence and frequency; sort by edit distance, then by word frequency | |
suggestions = Lookup(input, language, editDistanceMax); | |
//display term and frequency | |
foreach (var suggestion in suggestions) | |
{ | |
Console.WriteLine( suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString()); | |
} | |
if (verbose == 2) Console.WriteLine(suggestions.Count.ToString() + " suggestions"); | |
} | |
private static void ReadFromStdIn() | |
{ | |
string word; | |
while (!string.IsNullOrEmpty(word = (Console.ReadLine() ?? "").Trim())) | |
{ | |
Correct(word,"en"); | |
} | |
} | |
public static void Main(string[] args) | |
{ | |
//e.g. http://norvig.com/big.txt , or any other large text corpus | |
CreateDictionary("big.txt","en"); | |
ReadFromStdIn(); | |
} | |
// Damerau–Levenshtein distance algorithm and code | |
// from http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance | |
public static Int32 DamerauLevenshteinDistance(String source, String target) | |
{ | |
Int32 m = source.Length; | |
Int32 n = target.Length; | |
Int32[,] H = new Int32[m + 2, n + 2]; | |
Int32 INF = m + n; | |
H[0, 0] = INF; | |
for (Int32 i = 0; i <= m; i++) { H[i + 1, 1] = i; H[i + 1, 0] = INF; } | |
for (Int32 j = 0; j <= n; j++) { H[1, j + 1] = j; H[0, j + 1] = INF; } | |
SortedDictionary<Char, Int32> sd = new SortedDictionary<Char, Int32>(); | |
foreach (Char Letter in (source + target)) | |
{ | |
if (!sd.ContainsKey(Letter)) | |
sd.Add(Letter, 0); | |
} | |
for (Int32 i = 1; i <= m; i++) | |
{ | |
Int32 DB = 0; | |
for (Int32 j = 1; j <= n; j++) | |
{ | |
Int32 i1 = sd[target[j - 1]]; | |
Int32 j1 = DB; | |
if (source[i - 1] == target[j - 1]) | |
{ | |
H[i + 1, j + 1] = H[i, j]; | |
DB = j; | |
} | |
else | |
{ | |
H[i + 1, j + 1] = Math.Min(H[i, j], Math.Min(H[i + 1, j], H[i, j + 1])) + 1; | |
} | |
H[i + 1, j + 1] = Math.Min(H[i + 1, j + 1], H[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1)); | |
} | |
sd[ source[ i - 1 ]] = i; | |
} | |
return H[m + 1, n + 1]; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment