Skip to content

Instantly share code, notes, and snippets.

@alexandrnikitin
Created January 16, 2015 16:05
Show Gist options
  • Save alexandrnikitin/d1ea82fb193f5c931981 to your computer and use it in GitHub Desktop.
Save alexandrnikitin/d1ea82fb193f5c931981 to your computer and use it in GitHub Desktop.
Aho-Corasick for language detection
using System;
using System.Collections.Generic;
namespace LanguageDetection
{
public class AhoCorasickTree
{
private readonly AhoCorasickTreeNode _rootNode;
public AhoCorasickTree(IEnumerable<Tuple<string, int>> mapings)
{
if (mapings == null) throw new ArgumentNullException("keywords");
_rootNode = new AhoCorasickTreeNode();
foreach (var maping in mapings)
{
AddPatternToTree(maping.Item1, maping.Item2);
}
SetFailureNodes();
}
public int Contains(string text)
{
var foundId = 0;
var maxHeight = 0;
var currentNode = _rootNode;
var length = text.Length;
for (var i = 0; i < length; i++)
{
while (true)
{
var transition = currentNode.GetTransition(text[i]);
if (transition == null)
{
currentNode = currentNode.Failure;
if (currentNode == _rootNode)
{
break;
}
}
else
{
if (transition.Id > 0 && transition.Height > maxHeight)
{
foundId = transition.Id;
}
currentNode = transition;
break;
}
}
}
return foundId;
}
private void SetFailureNodes()
{
var nodes = FailToRootNode();
FailUsingBFS(nodes);
_rootNode.Failure = _rootNode;
}
private void AddPatternToTree(string pattern, int id)
{
var node = _rootNode;
foreach (var c in pattern)
{
node = node.GetTransition(c)
?? node.AddTransition(c);
}
node.Id = id;
node.Height = pattern.Length;
}
private List<AhoCorasickTreeNode> FailToRootNode()
{
var nodes = new List<AhoCorasickTreeNode>();
foreach (var node in _rootNode.Transitions)
{
node.Failure = _rootNode;
nodes.AddRange(node.Transitions);
}
return nodes;
}
private void FailUsingBFS(List<AhoCorasickTreeNode> nodes)
{
while (nodes.Count != 0)
{
var newNodes = new List<AhoCorasickTreeNode>();
foreach (var node in nodes)
{
var failure = node.ParentFailure;
var value = node.Value;
while (failure != null && !failure.ContainsTransition(value))
{
failure = failure.Failure;
}
if (failure == null)
{
node.Failure = _rootNode;
}
else
{
node.Failure = failure.GetTransition(value);
if (!node.IsFinished)
{
node.IsFinished = node.Failure.IsFinished;
}
}
newNodes.AddRange(node.Transitions);
}
nodes = newNodes;
}
}
}
}
using System;
using System.Linq;
namespace LanguageDetection
{
public class AhoCorasickTreeNode
{
public AhoCorasickTreeNode Failure;
public bool IsFinished;
public int Id;
public int Height;
public char Value;
private readonly AhoCorasickTreeNode _parent;
private int[] _buckets;
private int _count;
private Transition[] _transitions;
public AhoCorasickTreeNode()
: this(null, ' ')
{
}
private AhoCorasickTreeNode(AhoCorasickTreeNode parent, char value)
{
Value = value;
_parent = parent;
_buckets = new int[0];
_transitions = new Transition[0];
}
public AhoCorasickTreeNode ParentFailure
{
get { return _parent == null ? null : _parent.Failure; }
}
public AhoCorasickTreeNode[] Transitions
{
get { return _transitions.Select(x => x.Value).ToArray(); }
}
public AhoCorasickTreeNode AddTransition(char key)
{
Resize(_count + 1);
var value = new AhoCorasickTreeNode(this, key);
var targetBucket = key % _count;
for (var i = _buckets[targetBucket]; i >= 0; i = _transitions[i].Next)
{
if (_transitions[i].Key == key)
{
_transitions[i].Value = value;
return value;
}
}
var index = _count - 1;
_transitions[index].Next = _buckets[targetBucket];
_transitions[index].Key = key;
_transitions[index].Value = value;
_buckets[targetBucket] = index;
return value;
}
public bool ContainsTransition(char c)
{
return GetTransition(c) != null;
}
public AhoCorasickTreeNode GetTransition(char key)
{
if (_count == 0)
{
return null;
}
var bucketIndex = key % _count;
for (var i = _buckets[bucketIndex]; i >= 0; i = _transitions[i].Next)
{
if (_transitions[i].Key == key)
{
return _transitions[i].Value;
}
}
return null;
}
private void Resize(int newSize)
{
var newBuckets = new int[newSize];
for (var i = 0; i < newSize; i++)
{
newBuckets[i] = -1;
}
var newTransitions = new Transition[newSize];
Array.Copy(_transitions, 0, newTransitions, 0, _count);
// rebalancing buckets
for (var i = 0; i < _count; i++)
{
var bucket = newTransitions[i].Key % newSize;
newTransitions[i].Next = newBuckets[bucket];
newBuckets[bucket] = i;
}
_buckets = newBuckets;
_transitions = newTransitions;
_count = _buckets.Length;
}
}
}
namespace LanguageDetection
{
internal struct Transition
{
public char Key;
public int Next;
public AhoCorasickTreeNode Value;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment