Created
April 14, 2017 07:44
-
-
Save alexandrnikitin/e4176d6b472b39155a7e0e5d68264e65 to your computer and use it in GitHub Desktop.
Aho-Corasick C# implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Collections.Generic; | |
using System.Linq; | |
namespace AhoCorasickTree | |
{ | |
public class AhoCorasickTree | |
{ | |
internal AhoCorasickTreeNode Root { get; set; } | |
public AhoCorasickTree(IEnumerable<string> keywords) | |
{ | |
Root = new AhoCorasickTreeNode(); | |
if (keywords != null) | |
{ | |
foreach (var p in keywords) | |
{ | |
AddPatternToTree(p); | |
} | |
SetFailureNodes(); | |
} | |
} | |
public bool Contains(string text) | |
{ | |
return Contains(text, false); | |
} | |
public bool ContainsThatStart(string text) | |
{ | |
return Contains(text, true); | |
} | |
private bool Contains(string text, bool onlyStarts) | |
{ | |
var pointer = Root; | |
foreach (var c in text) | |
{ | |
var transition = GetTransition(c, ref pointer); | |
if (transition != null) | |
pointer = transition; | |
else if (onlyStarts) | |
return false; | |
if (pointer.Results.Any()) | |
return true; | |
} | |
return false; | |
} | |
public IEnumerable<string> FindAll(string text) | |
{ | |
var pointer = Root; | |
foreach (var c in text) | |
{ | |
var transition = GetTransition(c, ref pointer); | |
if (transition != null) | |
pointer = transition; | |
foreach (var result in pointer.Results) | |
yield return result; | |
} | |
} | |
private AhoCorasickTreeNode GetTransition(char c, ref AhoCorasickTreeNode pointer) | |
{ | |
AhoCorasickTreeNode transition = null; | |
while (transition == null) | |
{ | |
transition = pointer.GetTransition(c); | |
if (pointer == Root) | |
break; | |
if (transition == null) | |
pointer = pointer.Failure; | |
} | |
return transition; | |
} | |
private void SetFailureNodes() | |
{ | |
var nodes = FailToRootNode(); | |
FailUsingBFS(nodes); | |
Root.Failure = Root; | |
} | |
private void AddPatternToTree(string pattern) | |
{ | |
var node = Root; | |
foreach (var c in pattern) | |
{ | |
node = node.GetTransition(c) | |
?? node.AddTransition(c); | |
} | |
node.AddResult(pattern); | |
} | |
private List<AhoCorasickTreeNode> FailToRootNode() | |
{ | |
var nodes = new List<AhoCorasickTreeNode>(); | |
foreach (var node in Root.Transitions) | |
{ | |
node.Failure = Root; | |
nodes.AddRange(node.Transitions); | |
} | |
return nodes; | |
} | |
private void FailUsingBFS(List<AhoCorasickTreeNode> nodes) | |
{ | |
while (nodes.Count != 0) | |
{ | |
var newNodes = new List<AhoCorasickTreeNode>(); | |
foreach (var node in nodes) | |
{ | |
var failure = node.ParentFailure; | |
var value = node.Value; | |
while (failure != null && !failure.ContainsTransition(value)) | |
{ | |
failure = failure.Failure; | |
} | |
if (failure == null) | |
{ | |
node.Failure = Root; | |
} | |
else | |
{ | |
node.Failure = failure.GetTransition(value); | |
node.AddResults(node.Failure.Results); | |
} | |
newNodes.AddRange(node.Transitions); | |
} | |
nodes = newNodes; | |
} | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Collections.Generic; | |
using System.Diagnostics; | |
namespace AhoCorasickTree | |
{ | |
[DebuggerDisplay("Value = {Value}, TransitionCount = {_transitionsDictionary.Count}")] | |
internal class AhoCorasickTreeNode | |
{ | |
public char Value { get; private set; } | |
public AhoCorasickTreeNode Failure { get; set; } | |
private readonly List<string> _results; | |
private readonly Dictionary<char, AhoCorasickTreeNode> _transitionsDictionary; | |
private readonly AhoCorasickTreeNode _parent; | |
public IEnumerable<string> Results { get { return _results; } } | |
public AhoCorasickTreeNode ParentFailure { get { return _parent == null ? null : _parent.Failure; } } | |
public IEnumerable<AhoCorasickTreeNode> Transitions { get { return _transitionsDictionary.Values; } } | |
public AhoCorasickTreeNode() : this(null, ' ') | |
{ | |
} | |
private AhoCorasickTreeNode(AhoCorasickTreeNode parent, char value) | |
{ | |
Value = value; | |
_parent = parent; | |
_results = new List<string>(); | |
_transitionsDictionary = new Dictionary<char, AhoCorasickTreeNode>(); | |
} | |
public void AddResult(string result) | |
{ | |
if (!_results.Contains(result)) | |
{ | |
_results.Add(result); | |
} | |
} | |
public void AddResults(IEnumerable<string> results) | |
{ | |
foreach (var result in results) | |
{ | |
AddResult(result); | |
} | |
} | |
public AhoCorasickTreeNode AddTransition(char c) | |
{ | |
var node = new AhoCorasickTreeNode(this, c); | |
_transitionsDictionary.Add(node.Value, node); | |
return node; | |
} | |
public AhoCorasickTreeNode GetTransition(char c) | |
{ | |
return _transitionsDictionary.ContainsKey(c) | |
? _transitionsDictionary[c] | |
: null; | |
} | |
public bool ContainsTransition(char c) | |
{ | |
return _transitionsDictionary.ContainsKey(c); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace AhoCorasickTree.SandboxApp | |
{ | |
class Program | |
{ | |
static void Main() | |
{ | |
var tree = new AhoCorasickTree(new[] {"googlebot", "bingbot", "duckduckbot"}); | |
tree.Contains("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Damm💓