Created
July 18, 2021 12:31
-
-
Save Validark/d493cfd1b3425c2e3073f5ccd08fbeb9 to your computer and use it in GitHub Desktop.
A clean implementation of the aho-corasick algorithm taking full advantage of Lua's __index metamethod.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- We can re-use metatables where possible | |
local lookupCache = { | |
__index = function(self, i) | |
local v = { __index = i } | |
self[i] = v | |
return v | |
end | |
} | |
local function use_aho_corasick(automaton, str) | |
local found = {} | |
local state = automaton | |
for i = 1, string.len(str) do | |
state = state[string.lower(string.sub(str, i, i))] or automaton | |
for _, capture in ipairs(state.captures) do | |
table.insert(found, { i - capture + 1, i }) | |
end | |
end | |
return found | |
end | |
local automaton_mt = { __call = use_aho_corasick } | |
-- given a dictionary Array<string> | |
-- output a DFA with metatable magic | |
local function aho_corasick(dictionary) | |
-- captures: Array<numbers> representing the length of matched strings | |
local automaton = { captures = {} } | |
-- construct a basic Trie for the dictionary | |
for _, word in ipairs(dictionary) do | |
local cur = automaton | |
for char in string.gmatch(word, ".") do | |
local next = cur[char] | |
if next == nil then | |
next = {} | |
cur[char] = next | |
end | |
cur = next | |
end | |
cur.captures = cur.captures or {} | |
table.insert(cur.captures, string.len(word)) | |
end | |
-- Link each node to the longest suffix in its path | |
-- Do a BFS, linking each level with its runs | |
-- (BFS allows us to use our links as we go!) | |
local pointers = setmetatable({}, lookupCache) | |
local queue = { { {}, automaton } } | |
for _, data in ipairs(queue) do | |
for char, node in pairs(data[2]) do | |
if char ~= "captures" then | |
local state = data[1][char] or automaton | |
if #state.captures > 0 then | |
if node.captures == nil then node.captures = {} end | |
table.move(state.captures, 1, #state.captures, #node.captures + 1, node.captures) | |
end | |
table.insert(queue, { state, setmetatable(node, pointers[state]) }) | |
end | |
end | |
end | |
return setmetatable(automaton, automaton_mt) | |
end | |
do -- a little demo | |
local automata = aho_corasick { "try", "to", "find", "all", "these", "words" } | |
local input = "Let's see if you can find any matching words in this sentence" | |
local captures = automata(input) | |
for _, capture in ipairs(captures) do | |
print(capture[1], string.sub(input, capture[1], capture[2])) | |
end | |
end | |
return aho_corasick |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment