Last active
April 18, 2023 04:32
-
-
Save randrews/26d4a5cd2a59b95bb376 to your computer and use it in GitHub Desktop.
Split a telegram string into component words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function load_dictionary(filename) | |
local dictionary = {} | |
for line in io.lines(filename) do | |
if line:match("^%l%l+$") then | |
insert(dictionary, line .. "$") | |
end | |
end | |
-- We're cutting all single-letter words, but | |
-- two of them are real words | |
insert(dictionary, "a$") | |
insert(dictionary, "i$") | |
return dictionary | |
end | |
function insert(dictionary, word) | |
local first = word:sub(1,1) | |
dictionary[first] = dictionary[first] or {} | |
if first ~= "$" then | |
insert(dictionary[first], word:sub(2)) | |
end | |
end | |
local dictionary = load_dictionary("/usr/share/dict/words") | |
-------------------------------------------------- | |
function find_words(telegram, dictionary) | |
local words = {} -- list of tuples, {start, word} | |
for start = 1, #telegram do | |
local current_index = start | |
local current_letter = telegram:sub(start, start) | |
local current_node = dictionary | |
while current_node[ current_letter ] do | |
current_node = current_node[ current_letter ] | |
if current_node['$'] then | |
table.insert(words, {start, telegram:sub(start, current_index)}) | |
end | |
current_index = current_index + 1 | |
current_letter = telegram:sub(current_index, current_index) | |
end | |
end | |
return words | |
end | |
function print_words(words) | |
for _,w in ipairs(words) do | |
for n=1, w[1]-1 do io.write('-') end | |
print(w[2]) | |
end | |
end | |
function cull_words(words, telegram_length) | |
-- Map from start / end position, to word count | |
local by_start = setmetatable( {}, {__index=function() return 0 end} ) | |
local by_end = setmetatable( {}, {__index=function() return 0 end} ) | |
-- Initialize these counts | |
for _, w in ipairs(words) do | |
local l = w[1] | |
local r = w[1] + #(w[2]) - 1 | |
by_start[l] = by_start[l] + 1 | |
by_end[r] = by_end[r] + 1 | |
end | |
-- Loop until we run out of things to remove | |
while true do | |
local new_words = {} | |
for i, w in ipairs(words) do | |
local left = w[1] | |
local right = w[1] + #(w[2]) - 1 | |
local left_neighbor = (left == 1) or (by_end[left-1] > 0) | |
local right_neighbor = (right == telegram_length) or (by_start[right+1] > 0) | |
if left_neighbor and right_neighbor then | |
table.insert(new_words, w) | |
else | |
by_start[left] = by_start[left] - 1 | |
by_end[right] = by_end[right] - 1 | |
end | |
end | |
if #new_words == #words then break end | |
words = new_words | |
end | |
return words | |
end | |
function build_word_tree(words) | |
local function find_by_start(start) | |
local arr = {} | |
for _,w in ipairs(words) do | |
if w[1] == start then table.insert(arr, w[2]) end | |
end | |
return arr | |
end | |
local function tree_for_start(start) | |
local tree = {} | |
for _, w in ipairs(find_by_start(start)) do | |
tree[w] = tree_for_start( start + #w ) | |
end | |
return tree | |
end | |
return tree_for_start(1) | |
end | |
function print_tree(word_tree) | |
local strings = {} -- The paths through the tree | |
local function dft(path, current_node) | |
for word, children in pairs(current_node) do | |
local new_path = path and path .. " " .. word or word | |
if not next(children) then | |
table.insert(strings, new_path) | |
else | |
dft(new_path, children) | |
end | |
end | |
end | |
dft(nil, word_tree) | |
for _, s in ipairs(strings) do | |
print(s) | |
end | |
end | |
-------------------------------------------------- | |
function parse_telegram(telegram) | |
local words = find_words(telegram, dictionary) | |
local culled = cull_words(words, #telegram) | |
local tree = build_word_tree(culled) | |
print_tree(tree) | |
end | |
-------------------------------------------------- | |
parse_telegram("turkeytrotstowaterwhereistaskforcethirtyfourtheworldwonders") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment