Skip to content

Instantly share code, notes, and snippets.

@cpu
Last active May 19, 2022 10:32
Show Gist options
  • Save cpu/af15929758bebc81fb2282a44dfe7daf to your computer and use it in GitHub Desktop.
Save cpu/af15929758bebc81fb2282a44dfe7daf to your computer and use it in GitHub Desktop.
LPC BK-Tree data structure and True Damerau-Levenshtein Distance implementation for command suggestions/spellchecking.
/*
* LWO LPC BK-Tree.
* Paradox@DUNE
* MIT Licensed.
*
* A general BK Tree data-structure[0] implemented in LPC as a light-weight
* object[1]. Modelled after agatan/bktree[2], but in LPC.
*
* Assumes the presence of a `damerau_levenshtein` simul efun for providing the
* default distance metric and LDMud 3.6.5+ features (LWOs, union types,
* structs).
*
* Example BK-Tree Usage:
* #define BKTREE "path/to/this/file/bktree.c"
* lwobject tree = new_lwobject(BKTREE, ({ "apple", "orange", "peach" }));
* <int|string>** results = tree.Search("peacn", 2);
* // results = ({ ({ "peach", 1 }) });
*
* See `command_suggest_test.c and 'bktree_test.c' for complete usage examples.
*
* Changelog:
* - 2022-05-15 Initial code.
* - 2022-05-17 Cleanup, BatchAdd func.
*
* [0]: https://en.wikipedia.org/wiki/BK-tree
* [1]: https://github.com/ldmud/ldmud/blob/3.6.5/doc/LPC/lwobjects
* [2]: https://github.com/agatan/bktree
*/
#pragma strict_types, rtt_checks, pedantic, range_check, lightweight
/*
* Public API
*/
public void create(mixed* initial, closure distance_func);
public void Add(mixed val);
public void BatchAdd(mixed* words, int chunkSize, closure done);
public <int|string>** Search(mixed needle, int tolerance);
public string String();
/*
* Internal Types
*/
private struct node {
mixed element;
mapping children;
};
/*
* Internal API
*/
private int distance(mixed a, mixed b);
private string printNode(struct node n, int distance, int level);
/*
* Internal state
*/
private closure distance_metric;
private struct node root;
/*
* Public API Impl.
*/
public void create(mixed* initial, closure distance_func) {
initial ||= ({ });
distance_func ||= #'damerau_levenshtein;
if(structp(root) || closurep(distance_metric)) {
raise_error("create called twice");
}
distance_metric = distance_func;
foreach(mixed m : initial) {
Add(m);
}
}
public void Add(mixed val) {
if(!structp(root)) {
root = (<node> element: val, children: ([ ]));
return;
}
struct node cur = root;
int d = distance(cur.element, val);
while(d in cur.children) {
if(d == 0) return;
cur = cur.children[d];
d = distance(cur.element, val);
}
cur.children[d] = (<node> element: val, children: ([ ]));
}
private void addBatch(mixed *values, int chunkSize, int started, int added, closure done) {
if(sizeof(values) == 0) {
int elapsed = time() - started;
closurep(done) && funcall(done, added, elapsed);
return;
}
string* chunk = sizeof(values) >= chunkSize ? values[0 .. chunkSize - 1] : values;
string* rest = sizeof(values) > chunkSize ? values[chunkSize + 1 ..] : ({ });
foreach(mixed value : chunk) {
Add(value);
added++;
}
call_out(#'addBatch, 1, rest, chunkSize, started, added, done);
}
public void BatchAdd(mixed* values, int chunkSize, closure done)
{
if(sizeof(values) == 0) { return; }
if(find_call_out(#'addBatch) != -1) {
raise_error("a batch add is already running.\n");
}
call_out(#'addBatch, 1, values, chunkSize, time(), 0, done);
}
public <int|string>** Search(mixed needle, int tolerance) {
<int|string>** results = ({ });
if(!structp(root)) { return results; }
struct node* candidates = ({ root });
while(sizeof(candidates) != 0) {
struct node c = candidates[<1];
candidates = sizeof(candidates) > 1 ? candidates[..<2] : ({ });
int d = distance(c.element, needle);
if(d <= tolerance) {
results += ({ ({ c.element, d }) });
}
int low = d - tolerance, high = d + tolerance;
foreach(int cd, struct node child : c.children) {
if(low <= cd && cd <= high) {
candidates += ({ child });
}
}
}
return sort_array(results, (: $1[1] > $2[1] :));
}
public string String() {
if(!structp(root)) { return ""; }
return printNode(root, 0, 0);
}
/*
* Internal API Impl.
*/
private int distance(mixed a, mixed b) {
return funcall(distance_metric, a, b);
}
private string printNode(struct node n, int distance, int level) {
string result = sprintf("%s %Q [%d] \n",
" " * level, n.element, distance);
int* distances = sort_array(m_indices(n.children), #'>);
foreach(int d : distances) {
result += printNode(n.children[d], d, level+1);
}
return result;
}
/*
* LWO LPC BK-Tree Test.
* Paradox@DUNE
* MIT Licensed.
*
* Tests of a LPC BK Tree data-structure[0] using hamming-distance[1] and
* integers.
*
* See 'bktree.c' for more information. Adapted from agatan/bktree[2].
*
* Changelog:
* - 2022-05-15 Initial code.
*
* [0]: https://en.wikipedia.org/wiki/BK-tree
* [1]: https://en.wikipedia.org/wiki/Hamming_distance
* [2]: https://github.com/agatan/bktree/blob/master/bktree_test.go
*/
#pragma strong_types, rtt_checks, pedantic, range_check, no_clone, no_inherit
#include <functionlist.h>
#if !defined(BKTREE)
#define BKTREE __PATH__(0)"bktree"
#endif
private int hamming_distance(int a, int b) {
int dist = 0;
for(int val = a ^ b; val > 0; ++dist)
{
val = val & (val - 1);
}
return dist;
}
public int main() {
string* test_funcs = filter(
functionlist(this_object(), RETURN_FUNCTION_NAME|TYPE_MOD_PRIVATE),
(: sizeof($1) >= 5 && $1[0..4] == "test_" :));
debug_message(sprintf("%s:%s - tests: %Q\n",
__FILE__, __FUNCTION__, implode(test_funcs, ", ")));
string* failed = ({ });
string* passed = ({ });
foreach(string test_name : test_funcs) {
closure func = symbol_function(test_name, this_object());
if(!funcall(func)) {
failed += ({ test_name });
} else {
passed += ({ test_name });
}
}
debug_message(sprintf("%s:%s - %d tests passed. %d failed\n",
__FILE__, __FUNCTION__, sizeof(passed), sizeof(failed)));
return sizeof(failed);
}
protected int test_empty_search() {
lwobject tree = new_lwobject(BKTREE, ({ }), #'hamming_distance);
int** results = tree.Search(0, 0);
if(sizeof(results) != 0) {
debug_message(sprintf("expected empty search to return no results, got %d\n",
sizeof(results)));
return 0;
}
return 1;
}
private lwobject int_tree(int size) {
lwobject tree = new_lwobject(BKTREE, ({ }), #'hamming_distance);
for(int i = 0; i < size; i++) {
tree->Add(i);
}
return tree;
}
protected int test_exact_match() {
int numElements = 20;
lwobject tree = int_tree(numElements);
for(int i = 0; i < numElements; i++) {
int** results = tree.Search(i, 0);
if(sizeof(results) != 1) {
debug_message(sprintf("expected exact search to return 1 match, got %d: %Q\n",
sizeof(results), results));
return 0;
}
int *match = results[0];
if(sizeof(match) != 2) {
debug_message(sprintf("expected exact match to be 2 elements, got %d\n",
sizeof(match)));
return 0;
}
int result = match[0];
if(result != i) {
debug_message(sprintf("expected exact match to be value %d, got %d\n",
i, result));
return 0;
}
int distance = match[1];
if(distance != 0) {
debug_message(sprintf("expected exact match to be distance %d, got %d\n",
i, distance));
return 0;
}
}
return 1;
}
protected int test_fuzzy_match() {
int numElements = 20;
lwobject tree = int_tree(numElements);
for(int i = 0; i < numElements; i++) {
int** results = tree.Search(i, 2);
foreach(int* match: results) {
if(sizeof(match) != 2) {
debug_message(sprintf("expected fuzzy match to be 2 elements, got %d\n",
sizeof(match)));
return 0;
}
int distance = match[1];
if(distance > 2) {
debug_message(sprintf("expected fuzzy match distance to <= 2, got %d\n",
distance));
return 0;
}
int result = match[0];
int calc_distance = hamming_distance(result, i);
if(calc_distance > 2) {
debug_message(sprintf(
"expected fuzzy match element distance between %d and %d to be <= 2, got %d\n",
result, i, distance));
return 0;
}
if(calc_distance != distance) {
debug_message(sprintf(
"calculated fuzzy match element distance between %d and %d to be %d, found %d",
result, i, calc_distance, distance));
return 0;
}
}
}
return 1;
}
/*
* True Damerau-Levenshtein Distance.
* Paradox@Dune
* MIT Licensed.
*
* Implementation of a simulated efun for efficient calculation of the
* Damerau-Levenshtein[0] edit distance metric between two strings. Includes
* support for adjacent transpositions[1]. Adapted from Go
* lmas/Damerau-Levenshtein package[2] to LPC.
*
* Pre-allocates data for efficient usage, automatically doubling internal
* buffer when capacity is reached.
*
* Example usage:
* * add this code to your simul efun object.
* * `eval damerau_levenshtein("hello", "bonjour")`
* * // result: 6
* * `eval return damerau_levenshtein("potato", "patate");
* * // result: 2
*
* Changelog:
* - 2022-05-14 Initial code.
*
* [0]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
* [1]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions
* [2]: https://github.com/lmas/Damerau-Levenshtein
*/
#pragma strong_types, save_types
/*
* External API
*/
// Return the true Damerau–Levenshtein distance between a and b.
// Worst case complexity O(sizeof(a) * sizeof(b)).
public int damerau_levenshtein(string a, string b);
/*
* Internal Types
*/
// TDL encapsulates state required to compute a Damerau–Levenshtein distance.
// We allocate this up-front to make computations faster.
private struct TDL {
int maxSize;
int** matrix;
mapping da;
};
/*
* Internal API
*/
private struct TDL NewTDL(int maxSize);
private void Grow(struct TDL t, int newMax);
private int DLDistance(struct TDL t, string a, string b);
/*
* Internal State
*/
// defaultTDL is a TDL instance that is used for simul efun calls so callers
// don't need to worry about the TDL struct. The size provided to NewTDL
// determines how large the strings we can compare are before we need to double
// the initial buffer.
private struct TDL defaultTDL = NewTDL(100);
/*
* Public API Impl
*/
// Return the true Damerau–Levenshtein distance between a and b.
// Worst case complexity O(sizeof(a) * sizeof(b)).
//
// See:
// https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions
public int damerau_levenshtein(string a, string b) {
return DLDistance(defaultTDL, a, b);
}
/*
* Internal API Impl
*/
// NewTDL is an internal constructor for the TDL struct. It uses Grow to ensure
// the TDL instance can accommodate maxSize comparisons without further
// allocation.
private struct TDL NewTDL(int maxSize) {
struct TDL res = (<TDL> maxSize: maxSize, da: ([ :1 ]));
Grow(res, maxSize);
return res;
}
// Grow increases the size of a TDL struct so it can accommodate comparisons of
// size newMax without further allocation.
private void Grow(struct TDL t, int newMax) {
// double each time.
int s = 2 * sizeof(t.matrix) + newMax;
t.matrix = allocate(({ s, s }));
t.maxSize = s;
}
// DLDistance computes a true Damerau–Levenshtein distance between a and b using
// the pre-allocated state in the TDL struct.
private int DLDistance(struct TDL t, string a, string b) {
int lenA = sizeof(a), lenB = sizeof(b);
if(lenA < 1) {
return lenB;
} else if(lenB < 1) {
return lenA;
} else if(lenA >= t.maxSize-1) {
Grow(t, lenA);
} else if(lenB >= t.maxSize-1) {
Grow(t, lenB);
}
t.matrix[0][0] = lenA + lenB + 1;
for(int i = 0; i <= lenA; i++) {
t.matrix[i+1][1] = i;
t.matrix[i+1][0] = t.matrix[0][0];
}
for(int j = 0; j <= lenB; j++) {
t.matrix[1][j+1] = j;
t.matrix[0][j+1] = t.matrix[0][0];
}
string comb = a + b;
for(int r = 0; r < sizeof(comb); r++) {
t.da[comb[r]] = 0;
}
for(int i = 1; i <= lenA; i++) {
int db = 0;
for(int j = 1; j <= lenB; j++) {
int i1 = t.da[b[j-1]];
int j1 = db;
int cost = 1;
if(a[i-1] == b[j-1]) {
cost = 0;
db = j;
}
// By "conventional wisdom", the costs for the ins/del/trans operations
// are always +1
t.matrix[i+1][j+1] = min(
t.matrix[i][j]+cost, // substitution
t.matrix[i+1][j]+1, // insertion
t.matrix[i][j+1]+1, // deletion
t.matrix[i1][j1]+(i-i1-1)+1+(j-j1-1) // transposition
);
}
t.da[a[i-1]] = i;
}
return t.matrix[lenA+1][lenB+1];
}
/*
* True Damerau-Levenshtein Distance Test.
* Paradox@Dune
* MIT Licensed.
*
* Tests of a simulated efun for efficient calculation of the
* Damerau-Levenshtein[0] edit distance metric between two strings.
*
* Changelog:
* - 2022-05-14 Initial code.
*
* [0]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
*/
#pragma strong_types, rtt_checks, pedantic, range_check, no_clone, no_inherit
// TestCase holds two strings (a, b) to compare, and an expected distance (d).
private struct TestCase {
string a, b;
int d;
};
// See also:
// https://github.com/lmas/Damerau-Levenshtein/blob/ad5c73249ec5bf822627e3f060e8839d78e786a6/damerau-levenshtein_test.go#L56-L87
private struct TestCase* testcases = ({
(<TestCase> "", "", 0), // empty string
(<TestCase> "azerty", "", 6), // non empty against empty string
(<TestCase> "", "qwerty", 6), // empty against non empty string
(<TestCase> "azer", "azerty", 2), // adding to end
(<TestCase> "erty", "azerty", 2), // adding to start
(<TestCase> "zert", "azerty", 2), // adding to both ends
(<TestCase> "azty", "azerty", 2), // adding to middle
(<TestCase> "zt", "azerty", 4), // adding to middle and both ends
(<TestCase> "azer", "azerty", 2), // removing from end
(<TestCase> "erty", "azerty", 2), // removing from start
(<TestCase> "zert", "azerty", 2), // removing from both ends
(<TestCase> "azty", "azerty", 2), // removing from middle
(<TestCase> "zt", "azerty", 4), // removing from middle and both ends
(<TestCase> "azErtY", "azerty", 2), // substitution
(<TestCase> "azrety", "azerty", 1), // permutation
// Common examples.
(<TestCase> "moral", "carry", 4),
(<TestCase> "across", "is", 5),
(<TestCase> "beak", "water", 4),
(<TestCase> "teh", "the", 1),
(<TestCase> "tets", "test", 1),
(<TestCase> "fuor", "four", 1),
(<TestCase> "kitten", "sitting", 3),
(<TestCase> "Saturday", "Sunday", 3),
(<TestCase> "rossettacode", "raisethysword", 8),
});
public int main() {
foreach(struct TestCase tc : testcases) {
int actual;
if((actual = damerau_levenshtein(tc.a, tc.b)) != tc.d) {
raise_error(sprintf(
"expected distance %d between %s and %s, got %d",
tc.d, tc.a, tc.b, actual));
}
}
return 1;
}
The MIT License (MIT)
Copyright (c) 2022 Daniel McCarney
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
/*
* LWO LPC BK-Tree Command Suggestion Test.
* Paradox@DUNE
* MIT Licensed.
*
* Using a LPC BK Tree data-structure[0] and an LPC Damerau-Levenshtein[1]
* distance sefun to make a misspelled word suggestion demo.
*
* See 'bktree.c' for more information.
*
* Changelog:
* - 2022-05-16 Initial code.
* - 2022-05-17 Adapted using BatchAdd.
*
* [0]: https://en.wikipedia.org/wiki/BK-tree
* [1]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
*/
#pragma strong_types, rtt_checks, pedantic, range_check, no_clone, no_inherit
#if !defined(BKTREE)
#define BKTREE __PATH__(0)"bktree"
#endif
#if !defined(WORDLIST_JSON)
#define WORDLIST_JSON __PATH__(0)"wordlist.json"
#endif
#define CHUNK_SIZE 35
int ready = 0;
lwobject tree = 0;
public void create() {
tree = new_lwobject(BKTREE, ({ }));
string wordlist_json = read_file(WORDLIST_JSON) || "";
string *dictionary = json_parse(wordlist_json) || ({ });
tree->BatchAdd(dictionary, CHUNK_SIZE, (:
debug_message(sprintf("%s:%s:%d built BK tree of %d words in %d seconds\n",
__FILE__, __FUNCTION__, __LINE__, $1, $2));
ready = 1;
return;
:));
}
public void debug_tree() {
debug_message(sprintf("tree:\n%s\n", tree->String()));
}
public int main(string word, int tolerance) {
word ||= "blork";
tolerance ||= 2;
if(!ready) {
return notify_fail("Still building BK-Tree. Try again soon...\n"), 0;
}
// Search the tree for the word the user entered, with some tolerance for
// edits.
<string|int>** results = tree.Search(word, tolerance);
// Print the suggestions.
string* suggestions = map(results,
(: sprintf("%s [distance %d]", $1[0], $1[1] ) :));
debug_message(sprintf("You entered %s. Have you considered: %s?\n",
word, implode(suggestions, ", ")));
return 1;
}
[
"aba",
"ach",
"adab",
"akarso",
"al-lat",
"alam",
"ampoliros",
"amtal",
"aql",
"arrakeen",
"arrakis",
"assassins",
"auliya",
"aumas",
"ayat",
"bakka",
"baklawa",
"baliset",
"bandalong",
"baradye",
"baraka",
"bashar",
"battle",
"bedwine",
"bela",
"bene",
"bg",
"bhotani",
"bi-la",
"bindu",
"bled",
"bodal",
"bourka",
"burhan",
"burseg",
"butlerian",
"caid",
"caladan",
"canto",
"carryall",
"catchpocket",
"chakobsa",
"chaumas",
"chaumurky",
"cheops",
"cherem",
"choam",
"chusuk",
"cielago",
"cone",
"coriolis",
"corrin",
"cousines",
"crushers",
"crysknife",
"cutteray",
"dar",
"dark",
"death",
"demibrothers",
"derch",
"dew",
"dew2",
"dictum",
"distrans",
"domel",
"doorseal",
"drum",
"dump",
"dune",
"dust",
"ecaz",
"ego",
"el-sayal",
"elacca",
"erg",
"fai",
"fanmetal",
"faufreluches",
"fedaykin",
"filmbook",
"filt-plug",
"fiqh",
"fire",
"first",
"free",
"fremen",
"fremkit",
"frigate",
"galach",
"gammu",
"gamont",
"gare",
"gathering",
"geyrat",
"ghafla",
"ghanima",
"giedi",
"ginaz",
"giudichar",
"glowglobe",
"gom",
"graben",
"great",
"great2",
"great3",
"gridex",
"grumman",
"guild",
"hagal",
"haiiiii",
"hajj",
"hajra",
"hal",
"harj",
"harmonthep",
"harvester",
"heighliner",
"hiereg",
"high",
"holtzman",
"hookman",
"house",
"houses",
"houses2",
"hunter",
"ibad",
"ibn",
"ichwan",
"ijaz",
"ikhut-eigh",
"ilm",
"imperial",
"inkvine",
"istislah",
"ix",
"jihad",
"jihad2",
"jubba",
"judge",
"kanly",
"karama",
"khala",
"kindjal",
"kiswa",
"kitab",
"krimskell",
"kull",
"kulon",
"kwisatz",
"la",
"lasgun",
"lashkar",
"legion",
"liban",
"lighter",
"lisan",
"literjon",
"little",
"mahai",
"mahdi",
"maker",
"maker2",
"mantene",
"masheikh",
"mating",
"matres",
"maula",
"maula2",
"melange",
"mentat",
"metaglass",
"mihna",
"minimic",
"mish",
"misr",
"missionaria",
"monitor",
"mu",
"muad'dib",
"mudir",
"mushtamal",
"musky",
"na",
"naib",
"nezhoni",
"no-ships",
"noukkers",
"oil",
"opafire",
"orange",
"ornithopter",
"out-freyn",
"palm",
"pan",
"panoplia",
"paracompass",
"pentashield",
"pic",
"plasteel",
"pleniscenta",
"poling",
"poritrin",
"portyguls",
"powindah",
"prana",
"pre-spice",
"prick",
"proces",
"proctor",
"prudence",
"pundi",
"pyons",
"pyretic",
"qanat",
"qirtaiba",
"quis",
"quizara",
"rachag",
"ramadhan",
"razzia",
"recaths",
"repkit",
"residual",
"reverend",
"richese",
"rimwall",
"ruh",
"sadus",
"salusa",
"sandcrawler",
"sandmaster",
"sandrider",
"sandsnork",
"sandtide",
"sandwalker",
"sandworm",
"sapho",
"sardaukar",
"sarfa",
"sayyadina",
"schlag",
"second",
"selamlik",
"semuta",
"servok",
"shadout",
"shah",
"shai-hulud",
"shaitan",
"shari-a",
"shield",
"shield2",
"shigawire",
"sietch",
"sihaya",
"sink",
"sinkchart",
"sirat",
"slip-tip",
"snooper",
"solari",
"solido",
"sondagi",
"soo",
"soostones",
"spacing",
"spice",
"spice2",
"spice3",
"spotter",
"stillsuit",
"stilltent",
"stunner",
"subakh",
"subakh2",
"suspension",
"suspensor",
"t-p",
"tahaddi",
"tahaddi2",
"taqwa",
"tau",
"test",
"thumper",
"tidal",
"tleilax",
"training",
"troop",
"truthsayer",
"truthtrance",
"tupile",
"ulema",
"umma",
"uroshnor",
"usul",
"varota",
"verite",
"voice",
"wali",
"wallach",
"war",
"water",
"water2",
"water3",
"watercounters",
"waterman",
"watertube",
"way",
"weather",
"weirding",
"windtrap",
"ya",
"ya2",
"yali",
"zensunni"
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment