Created
January 16, 2015 16:05
-
-
Save alexandrnikitin/d1ea82fb193f5c931981 to your computer and use it in GitHub Desktop.
Aho-Corasick for language detection
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
namespace LanguageDetection | |
{ | |
public class AhoCorasickTree | |
{ | |
private readonly AhoCorasickTreeNode _rootNode; | |
public AhoCorasickTree(IEnumerable<Tuple<string, int>> mapings) | |
{ | |
if (mapings == null) throw new ArgumentNullException("keywords"); | |
_rootNode = new AhoCorasickTreeNode(); | |
foreach (var maping in mapings) | |
{ | |
AddPatternToTree(maping.Item1, maping.Item2); | |
} | |
SetFailureNodes(); | |
} | |
public int Contains(string text) | |
{ | |
var foundId = 0; | |
var maxHeight = 0; | |
var currentNode = _rootNode; | |
var length = text.Length; | |
for (var i = 0; i < length; i++) | |
{ | |
while (true) | |
{ | |
var transition = currentNode.GetTransition(text[i]); | |
if (transition == null) | |
{ | |
currentNode = currentNode.Failure; | |
if (currentNode == _rootNode) | |
{ | |
break; | |
} | |
} | |
else | |
{ | |
if (transition.Id > 0 && transition.Height > maxHeight) | |
{ | |
foundId = transition.Id; | |
} | |
currentNode = transition; | |
break; | |
} | |
} | |
} | |
return foundId; | |
} | |
private void SetFailureNodes() | |
{ | |
var nodes = FailToRootNode(); | |
FailUsingBFS(nodes); | |
_rootNode.Failure = _rootNode; | |
} | |
private void AddPatternToTree(string pattern, int id) | |
{ | |
var node = _rootNode; | |
foreach (var c in pattern) | |
{ | |
node = node.GetTransition(c) | |
?? node.AddTransition(c); | |
} | |
node.Id = id; | |
node.Height = pattern.Length; | |
} | |
private List<AhoCorasickTreeNode> FailToRootNode() | |
{ | |
var nodes = new List<AhoCorasickTreeNode>(); | |
foreach (var node in _rootNode.Transitions) | |
{ | |
node.Failure = _rootNode; | |
nodes.AddRange(node.Transitions); | |
} | |
return nodes; | |
} | |
private void FailUsingBFS(List<AhoCorasickTreeNode> nodes) | |
{ | |
while (nodes.Count != 0) | |
{ | |
var newNodes = new List<AhoCorasickTreeNode>(); | |
foreach (var node in nodes) | |
{ | |
var failure = node.ParentFailure; | |
var value = node.Value; | |
while (failure != null && !failure.ContainsTransition(value)) | |
{ | |
failure = failure.Failure; | |
} | |
if (failure == null) | |
{ | |
node.Failure = _rootNode; | |
} | |
else | |
{ | |
node.Failure = failure.GetTransition(value); | |
if (!node.IsFinished) | |
{ | |
node.IsFinished = node.Failure.IsFinished; | |
} | |
} | |
newNodes.AddRange(node.Transitions); | |
} | |
nodes = newNodes; | |
} | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Linq; | |
namespace LanguageDetection | |
{ | |
public class AhoCorasickTreeNode | |
{ | |
public AhoCorasickTreeNode Failure; | |
public bool IsFinished; | |
public int Id; | |
public int Height; | |
public char Value; | |
private readonly AhoCorasickTreeNode _parent; | |
private int[] _buckets; | |
private int _count; | |
private Transition[] _transitions; | |
public AhoCorasickTreeNode() | |
: this(null, ' ') | |
{ | |
} | |
private AhoCorasickTreeNode(AhoCorasickTreeNode parent, char value) | |
{ | |
Value = value; | |
_parent = parent; | |
_buckets = new int[0]; | |
_transitions = new Transition[0]; | |
} | |
public AhoCorasickTreeNode ParentFailure | |
{ | |
get { return _parent == null ? null : _parent.Failure; } | |
} | |
public AhoCorasickTreeNode[] Transitions | |
{ | |
get { return _transitions.Select(x => x.Value).ToArray(); } | |
} | |
public AhoCorasickTreeNode AddTransition(char key) | |
{ | |
Resize(_count + 1); | |
var value = new AhoCorasickTreeNode(this, key); | |
var targetBucket = key % _count; | |
for (var i = _buckets[targetBucket]; i >= 0; i = _transitions[i].Next) | |
{ | |
if (_transitions[i].Key == key) | |
{ | |
_transitions[i].Value = value; | |
return value; | |
} | |
} | |
var index = _count - 1; | |
_transitions[index].Next = _buckets[targetBucket]; | |
_transitions[index].Key = key; | |
_transitions[index].Value = value; | |
_buckets[targetBucket] = index; | |
return value; | |
} | |
public bool ContainsTransition(char c) | |
{ | |
return GetTransition(c) != null; | |
} | |
public AhoCorasickTreeNode GetTransition(char key) | |
{ | |
if (_count == 0) | |
{ | |
return null; | |
} | |
var bucketIndex = key % _count; | |
for (var i = _buckets[bucketIndex]; i >= 0; i = _transitions[i].Next) | |
{ | |
if (_transitions[i].Key == key) | |
{ | |
return _transitions[i].Value; | |
} | |
} | |
return null; | |
} | |
private void Resize(int newSize) | |
{ | |
var newBuckets = new int[newSize]; | |
for (var i = 0; i < newSize; i++) | |
{ | |
newBuckets[i] = -1; | |
} | |
var newTransitions = new Transition[newSize]; | |
Array.Copy(_transitions, 0, newTransitions, 0, _count); | |
// rebalancing buckets | |
for (var i = 0; i < _count; i++) | |
{ | |
var bucket = newTransitions[i].Key % newSize; | |
newTransitions[i].Next = newBuckets[bucket]; | |
newBuckets[bucket] = i; | |
} | |
_buckets = newBuckets; | |
_transitions = newTransitions; | |
_count = _buckets.Length; | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace LanguageDetection | |
{ | |
internal struct Transition | |
{ | |
public char Key; | |
public int Next; | |
public AhoCorasickTreeNode Value; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment