Skip to content

Instantly share code, notes, and snippets.

@fj
Created January 13, 2014 01:50
Show Gist options
  • Save fj/8393399 to your computer and use it in GitHub Desktop.
Save fj/8393399 to your computer and use it in GitHub Desktop.
SymSpell fast-spellcheck algorithm from faroo.com. Original here: http://blog.faroo.com/2012/06/24/1000x-faster-spelling-correction-source-code-released/
// SymSpell: 1000x faster through Symmetric Delete spelling correction algorithm
//
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup
// for a given Damerau-Levenshtein distance. It is three orders of magnitude faster and language independent.
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
//
// Copyright (C) 2012 Wolf Garbe, FAROO Limited
// Version: 1.6
// Author: Wolf Garbe <[email protected]>
// Maintainer: Wolf Garbe <[email protected]>
// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
//
// License:
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License,
// version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
// http://www.opensource.org/licenses/LGPL-3.0
//
// Usage: single word + Enter:  Display spelling suggestions
//        Enter without input:  Terminate the program
 
using System;
using System.Linq;
using System.Text.RegularExpressions;
using System.Collections.Generic;
using System.IO;
using System.Diagnostics;
 
static class SymSpell
{
    private static int editDistanceMax=2;
    private static int verbose = 0;
    //0: top suggestion
    //1: all suggestions of smallest edit distance
    //2: all suggestions <= editDistanceMax (slower, no early termination)
 
    private class dictionaryItem
    {
        public string term = "";
        public List<editItem> suggestions = new List<editItem>();
        public int count = 0;
 
        public override bool Equals(object obj)
        {
            return Equals(term, ((dictionaryItem)obj).term);
        }
      
        public override int GetHashCode()
        {
            return term.GetHashCode();
        }      
    }
 
    private class editItem
    {
        public string term = "";
        public int distance = 0;
 
        public override bool Equals(object obj)
        {
            return Equals(term, ((editItem)obj).term);
        }
      
        public override int GetHashCode()
        {
            return term.GetHashCode();
        }      
    }
 
    private class suggestItem
    {
        public string term = "";
        public int distance = 0;
        public int count = 0;
 
        public override bool Equals(object obj)
        {
            return Equals(term, ((suggestItem)obj).term);
        }
      
        public override int GetHashCode()
        {
            return term.GetHashCode();
        }      
    }
 
    private static Dictionary<string, dictionaryItem> dictionary = new Dictionary<string, dictionaryItem>();
 
    //create a non-unique wordlist from sample text
    //language independent (e.g. works with Chinese characters)
    private static IEnumerable<string> parseWords(string text)
    {
        return Regex.Matches(text.ToLower(), @"[\w-[\d_]]+")
                    .Cast<Match>()
                    .Select(m => m.Value);
    }
 
    //for every word there all deletes with an edit distance of 1..editDistanceMax created and added to the dictionary
    //every delete entry has a suggestions list, which points to the original term(s) it was created from
    //The dictionary may be dynamically updated (word frequency and new words) at any time by calling createDictionaryEntry
    private static bool CreateDictionaryEntry(string key, string language)
    {
        bool result = false;
        dictionaryItem value;
        if (dictionary.TryGetValue(language+key, out value))
        {
            //already exists:
            //1. word appears several times
            //2. word1==deletes(word2)
            value.count++;
        }
        else
        {
            value = new dictionaryItem();
            value.count++;
            dictionary.Add(language+key, value);
        }
 
        //edits/suggestions are created only once, no matter how often word occurs
        //edits/suggestions are created only as soon as the word occurs in the corpus,
        //even if the same term existed before in the dictionary as an edit from another word
        if (string.IsNullOrEmpty(value.term))
        {
            result = true;
            value.term = key;
 
            //create deletes
            foreach (editItem delete in Edits(key, 0, true))
            {
                editItem suggestion = new editItem();
                suggestion.term = key;
                suggestion.distance = delete.distance;
 
                dictionaryItem value2;
                if (dictionary.TryGetValue(language+delete.term, out value2))
                {
                    //already exists:
                    //1. word1==deletes(word2)
                    //2. deletes(word1)==deletes(word2)
                    if (!value2.suggestions.Contains(suggestion)) AddLowestDistance(value2.suggestions, suggestion);
                }
                else
                {
                    value2 = new dictionaryItem();
                    value2.suggestions.Add(suggestion);
                    dictionary.Add(language+delete.term, value2);
                }
            }
        }
        return result;
    }
 
    //create a frequency disctionary from a corpus
    private static void CreateDictionary(string corpus, string language)
    {
        if (!File.Exists(corpus))
        {
            Console.Error.WriteLine("File not found: " + corpus);
            return;
        }
 
        Console.Write("Creating dictionary ...");
        long wordCount = 0;
        foreach (string key in parseWords(File.ReadAllText(corpus)))
        {
            if (CreateDictionaryEntry(key, language)) wordCount++;
        }
        Console.WriteLine("\rDictionary created: " + wordCount.ToString("N0") + " words, " + dictionary.Count.ToString("N0") + " entries, for edit distance=" + editDistanceMax.ToString());
    }
 
    //save some time and space
    private static void AddLowestDistance(List<editItem> suggestions, editItem suggestion)
    {
        //remove all existing suggestions of higher distance, if verbose<2
        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > suggestion.distance)) suggestions.Clear();
        //do not add suggestion of higher distance than existing, if verbose<2
        if ((verbose == 2) || (suggestions.Count == 0) || (suggestions[0].distance >= suggestion.distance)) suggestions.Add(suggestion);
    }
 
    //inexpensive and language independent: only deletes, no transposes + replaces + inserts
    //replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters)
    private static List<editItem> Edits(string word, int editDistance, bool recursion)
    {
        editDistance++;
        List<editItem> deletes = new List<editItem>();
        if (word.Length > 1)
        {
            for (int i = 0; i < word.Length; i++)
            {
                editItem delete = new editItem();
                delete.term=word.Remove(i, 1);
                delete.distance=editDistance;
                if (!deletes.Contains(delete))
                {
                    deletes.Add(delete);
                    //recursion, if maximum edit distance not yet reached
                    if (recursion && (editDistance < editDistanceMax))
                    {
                        foreach (editItem edit1 in Edits(delete.term, editDistance,recursion))
                        {
                            if (!deletes.Contains(edit1)) deletes.Add(edit1);
                        }
                    }                  
                }
            }
        }
 
        return deletes;
    }
 
    private static int TrueDistance(editItem dictionaryOriginal, editItem inputDelete, string inputOriginal)
    {
        //We allow simultaneous edits (deletes) of editDistanceMax on on both the dictionary and the input term.
        //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
        //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
        //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
        //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMaxe=1)
        //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
 
        if (dictionaryOriginal.term == inputOriginal) return 0; else
        if (dictionaryOriginal.distance == 0) return inputDelete.distance;
        else if (inputDelete.distance == 0) return dictionaryOriginal.distance;
        else return DamerauLevenshteinDistance(dictionaryOriginal.term, inputOriginal);//adjust distance, if both distances>0
    }
 
    private static List<suggestItem> Lookup(string input, string language, int editDistanceMax)
    {
        List<editItem> candidates = new List<editItem>();
 
        //add original term
        editItem item = new editItem();
        item.term = input;
        item.distance = 0;
        candidates.Add(item);
  
        List<suggestItem> suggestions = new List<suggestItem>();
        dictionaryItem value;
 
        while (candidates.Count>0)
        {
            editItem candidate = candidates[0];
            candidates.RemoveAt(0);
 
            //save some time
            //early termination
            //suggestion distance=candidate.distance... candidate.distance+editDistanceMax               
            //if canddate distance is already higher than suggestion distance, than there are no better suggestions to be expected
            if ((verbose < 2)&&(suggestions.Count > 0)&&(candidate.distance > suggestions[0].distance)) goto sort;
            if (candidate.distance > editDistanceMax) goto sort; 
 
            if (dictionary.TryGetValue(language+candidate.term, out value))
            {
                if (!string.IsNullOrEmpty(value.term))
                {
                    //correct term
                    suggestItem si = new suggestItem();
                    si.term = value.term;
                    si.count = value.count;
                    si.distance = candidate.distance;
 
                    if (!suggestions.Contains(si))
                    {
                        suggestions.Add(si);
                        //early termination
                        if ((verbose < 2) && (candidate.distance == 0)) goto sort;    
                    }
                }
 
                //edit term (with suggestions to correct term)
                dictionaryItem value2;
                foreach (editItem suggestion in value.suggestions)
                {
                    //save some time
                    //skipping double items early
                    if (suggestions.Find(x => x.term == suggestion.term) == null)
                    {
                        int distance = TrueDistance(suggestion, candidate, input);
                      
                        //save some time.
                        //remove all existing suggestions of higher distance, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (suggestions[0].distance > distance)) suggestions.Clear();
                        //do not process higher distances than those already found, if verbose<2
                        if ((verbose < 2) && (suggestions.Count > 0) && (distance > suggestions[0].distance)) continue;
 
                        if (distance <= editDistanceMax)
                        {
                            if (dictionary.TryGetValue(language+suggestion.term, out value2))
                            {
                                suggestItem si = new suggestItem();
                                si.term = value2.term;
                                si.count = value2.count;
                                si.distance = distance;
 
                                suggestions.Add(si);
                            }
                        }
                    }
                }
            }//end foreach
 
            //add edits
            if (candidate.distance < editDistanceMax)
            {
                foreach (editItem delete in Edits(candidate.term, candidate.distance,false))
                {
                    if (!candidates.Contains(delete)) candidates.Add(delete);
                }
            }
        }//end while
 
        sort: suggestions = suggestions.OrderBy(c => c.distance).ThenByDescending(c => c.count).ToList();
        if ((verbose == 0)&&(suggestions.Count>1))  return suggestions.GetRange(0, 1); else return suggestions;
    }
 
    private static void Correct(string input, string language)
    {
        List<suggestItem> suggestions = null;
     
        /*
        //Benchmark: 1000 x Lookup
        Stopwatch stopWatch = new Stopwatch();
        stopWatch.Start();
        for (int i = 0; i < 1000; i++)
        {
            suggestions = Lookup(input,language,editDistanceMax);
        }
        stopWatch.Stop();
        Console.WriteLine(stopWatch.ElapsedMilliseconds.ToString());
        */
         
        //check in dictionary for existence and frequency; sort by edit distance, then by word frequency
        suggestions = Lookup(input, language, editDistanceMax);
 
        //display term and frequency
        foreach (var suggestion in suggestions)
        {
            Console.WriteLine( suggestion.term + " " + suggestion.distance.ToString() + " " + suggestion.count.ToString());
        }
        if (verbose == 2) Console.WriteLine(suggestions.Count.ToString() + " suggestions");
    }
 
    private static void ReadFromStdIn()
    {
        string word;
        while (!string.IsNullOrEmpty(word = (Console.ReadLine() ?? "").Trim()))
        {
            Correct(word,"en");
        }
    }
 
    public static void Main(string[] args)
    {
        //e.g. http://norvig.com/big.txt , or any other large text corpus
        CreateDictionary("big.txt","en");
        ReadFromStdIn();
    }
 
    // Damerau–Levenshtein distance algorithm and code
    // from http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    public static Int32 DamerauLevenshteinDistance(String source, String target)
    {
        Int32 m = source.Length;
        Int32 n = target.Length;
        Int32[,] H = new Int32[m + 2, n + 2];
 
        Int32 INF = m + n;
        H[0, 0] = INF;
        for (Int32 i = 0; i <= m; i++) { H[i + 1, 1] = i; H[i + 1, 0] = INF; }
        for (Int32 j = 0; j <= n; j++) { H[1, j + 1] = j; H[0, j + 1] = INF; }
 
        SortedDictionary<Char, Int32> sd = new SortedDictionary<Char, Int32>();
        foreach (Char Letter in (source + target))
        {
            if (!sd.ContainsKey(Letter))
                sd.Add(Letter, 0);
        }
 
        for (Int32 i = 1; i <= m; i++)
        {
            Int32 DB = 0;
            for (Int32 j = 1; j <= n; j++)
            {
                Int32 i1 = sd[target[j - 1]];
                Int32 j1 = DB;
 
                if (source[i - 1] == target[j - 1])
                {
                    H[i + 1, j + 1] = H[i, j];
                    DB = j;
                }
                else
                {
                    H[i + 1, j + 1] = Math.Min(H[i, j], Math.Min(H[i + 1, j], H[i, j + 1])) + 1;
                }
 
                H[i + 1, j + 1] = Math.Min(H[i + 1, j + 1], H[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
            }
 
            sd[ source[ i - 1 ]] = i;
        }
        return H[m + 1, n + 1];
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment