Forked from markpapadakis/FastEditDistanceCandidates.cpp
Created
June 21, 2020 09:41
-
-
Save yhafri/362d9394d75533d4f6743dd5f278c506 to your computer and use it in GitHub Desktop.
From input string, identify all strings within a specific edit distance from it - based on deletions only.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Compile list of tokens based on deletions, and respecting max edit distance | |
// Original implementation would concatenate left/right string segments using memcpy() etc, but that didn't seem right | |
// The idea is, to instead partition S into 1+ segments and build hash from them, no need for memory copies etc. | |
// This is almost x8 times faster compared to that - but it's still in the 10s of microseconds range. | |
// | |
// ( for a 31 characters long string, with max edit distance = 2, it takes 24 microseconds, including the cost for computing the FNV rolling hash ) | |
#include <switch.h> | |
#include <switch_print.h> | |
#include <ansifmt.h> | |
static void Div(strwlen8_t S, uint8_t midsCnt, const uint8_t *const mids, const uint8_t ed, const uint8_t maxEditDistance) | |
{ | |
uint8_t idx{0}, localMids[4]; | |
char repr[64], *r{repr}; | |
const auto SL = S.len; | |
uint64_t h; | |
// copy and sort, fast | |
switch (midsCnt) | |
{ | |
case 0: | |
break; | |
case 1: | |
localMids[0] = mids[0]; | |
break; | |
case 2: | |
if (mids[0] <= mids[1]) | |
{ | |
localMids[0] = mids[0]; | |
localMids[1] = mids[1]; | |
} | |
else | |
{ | |
localMids[0] = mids[1]; | |
localMids[1] = mids[0]; | |
} | |
break; | |
case 3: | |
if (mids[0] <= mids[1]) | |
{ | |
if (mids[0] <= mids[2]) | |
{ | |
localMids[0] = mids[0]; | |
localMids[1] = Min(mids[1], mids[2]); | |
localMids[2] = Max(mids[1], mids[2]); | |
} | |
else | |
{ | |
localMids[0] = mids[2]; | |
localMids[1] = Min(mids[1], mids[0]); | |
localMids[2] = Max(mids[1], mids[0]); | |
} | |
} | |
else | |
{ | |
localMids[0] = mids[1]; | |
localMids[1] = Min(mids[2], mids[0]); | |
localMids[2] = Max(mids[2], mids[0]); | |
} | |
break; | |
default: | |
Abort(); | |
} | |
h = BeginFNVHash64(); | |
for (uint8_t i = 0; i != midsCnt; ++i) | |
{ | |
const auto sep = localMids[i]; | |
const auto s = S.Substr(idx, sep - idx); | |
h = FNVHash64(h, (uint8_t *)s.p, s.len); | |
idx = sep + 1; | |
} | |
if (idx < SL) | |
{ | |
const auto s = S.SuffixFrom(idx); | |
h = FNVHash64(h, (uint8_t *)s.p, s.len); | |
} | |
Consider(h); | |
if (ed == maxEditDistance) | |
return; | |
if (!midsCnt) | |
{ | |
if (SL > 1) | |
{ | |
const auto upto = SL - 1; | |
Div(S.SuffixFrom(1), 0, localMids, ed + 1, maxEditDistance); | |
for (uint8_t i = 1; i != upto; ++i) | |
{ | |
localMids[0] = i; | |
Div(S, 1, localMids, ed + 1, maxEditDistance); | |
} | |
Div(S.Prefix(upto), 0, localMids, ed + 1, maxEditDistance); | |
} | |
} | |
else | |
{ | |
uint8_t i{0}, idx{0}; | |
// Optimization: | |
// Drop first if they are in sequence | |
for (i = 0; i != midsCnt && localMids[i] == i; ++i) | |
continue; | |
if (i) | |
{ | |
midsCnt-=i; | |
switch (i) | |
{ | |
case 3: | |
localMids[2] = --localMids[3]; | |
case 2: | |
localMids[1] = --localMids[2]; | |
case 1: | |
localMids[0] = --localMids[1]; | |
break; | |
default: | |
Abort(); | |
} | |
S.StripPrefix(i); | |
i = 0; | |
} | |
for (; i < midsCnt; ++i) | |
{ | |
const auto sep = localMids[i]; | |
while (idx != sep) | |
{ | |
localMids[midsCnt] = idx; | |
Div(S, midsCnt + 1, localMids, ed + 1, maxEditDistance); | |
++idx; | |
} | |
idx = sep + 1; | |
} | |
} | |
} | |
static void Div(strwlen8_t S, const uint8_t maxEditDistance) | |
{ | |
// We want to exclude the input query from consideration, so we 'll just do that here | |
uint8_t localMids[4]; | |
const auto before = Timings::Microseconds::Tick(); | |
const auto SL = S.len; | |
if (SL > 1) | |
{ | |
const auto upto = SL - 1; | |
Div(S.SuffixFrom(1), 0, localMids, 1, maxEditDistance); | |
for (uint8_t i = 1; i != upto; ++i) | |
{ | |
localMids[0] = i; | |
Div(S, 1, localMids, 1, maxEditDistance); | |
} | |
Div(S.Prefix(upto), 0, localMids, 1, maxEditDistance); | |
} | |
SLog(duration_repr(Timings::Microseconds::Since(before)), "\n"); | |
} | |
int main(int argc, char *argv[]) | |
{ | |
const strwlen8_t in(argv[1]); | |
Div(in, Min<uint8_t>(2, in.len / 5 + 1)); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment