Skip to content

Instantly share code, notes, and snippets.

@jrade
Created May 25, 2024 04:19
Show Gist options
  • Save jrade/8fd24da00d665934f07134fe83aa4337 to your computer and use it in GitHub Desktop.
Save jrade/8fd24da00d665934f07134fe83aa4337 to your computer and use it in GitHub Desktop.
Fast and memory efficient C++ implementations of the following string distances: LCS (Longest common subsequence), Levenshtein, Damerau-Levenshtein
// Copyright 2024 Johan Rade ([email protected])
// Distributed under the MIT license (https://opensource.org/licenses/MIT)
inline size_t lcsDistance(const std::string& s, const std::string& t)
{
size_t m = s.size() + 1;
size_t n = t.size() + 1;
const char* p = s.data();
const char* q = t.data();
if (m > n) {
std::swap(m, n);
std::swap(p, q);
}
static thread_local std::vector<size_t> buf;
buf.resize(std::max(2 * m, buf.size()));
size_t* prevDist = buf.data();
size_t* dist = buf.data() + m;
for (size_t i = 0; i != m; ++i)
dist[i] = i;
for (size_t j = 1; j != n; ++j) {
std::swap(prevDist, dist);
dist[0] = j;
for (size_t i = 1; i != m; ++i) {
const size_t cost = (p[i - 1] == q[j - 1]) ? 0 : 2;
dist[i] = std::min(prevDist[i - 1] + cost, std::min(prevDist[i], dist[i - 1]) + 1);
}
}
return dist[m - 1];
}
inline size_t levenshteinDistance(const std::string& s, const std::string& t)
{
size_t m = s.size() + 1;
size_t n = t.size() + 1;
const char* p = s.data();
const char* q = t.data();
if (m > n) {
std::swap(m, n);
std::swap(p, q);
}
static thread_local std::vector<size_t> buf;
buf.resize(std::max(2 * m, buf.size()));
size_t* prevDist = buf.data();
size_t* dist = buf.data() + m;
for (size_t i = 0; i != m; ++i)
dist[i] = i;
for (size_t j = 1; j != n; ++j) {
std::swap(prevDist, dist);
dist[0] = j;
for (size_t i = 1; i != m; ++i) {
const size_t cost = (p[i - 1] == q[j - 1]) ? 0 : 1;
dist[i] = std::min(prevDist[i - 1] + cost, std::min(prevDist[i], dist[i - 1]) + 1);
}
}
return dist[m - 1];
}
inline size_t damerauLevenshteinDistance(const std::string& s, const std::string& t)
{
size_t m = s.size() + 1;
size_t n = t.size() + 1;
const char* p = s.data();
const char* q = t.data();
if (m > n) {
std::swap(m, n);
std::swap(p, q);
}
if (m == 1)
return n - 1;
static thread_local std::vector<size_t> buf;
buf.resize(std::max(3 * m, buf.size()));
size_t* prevPrevDist = buf.data();
size_t* prevDist = buf.data() + m;
size_t* dist = buf.data() + 2 * m;
// j = 0
for (size_t i = 0; i != m; ++i)
dist[i] = i;
// j = 1
std::swap(dist, prevDist);
dist[0] = 1;
for (size_t i = 1; i != m; ++i) {
const size_t substCost = (p[i - 1] == q[0]) ? 0 : 1;
dist[i] = std::min(prevDist[i - 1] + substCost, std::min(prevDist[i], dist[i - 1]) + 1);
}
for (size_t j = 2; j != n; ++j) {
std::swap(prevDist, prevPrevDist);
std::swap(dist, prevDist);
dist[0] = j;
{
const size_t substCost = (p[0] == q[j - 1]) ? 0 : 1;
dist[1] = std::min(prevDist[0] + substCost, std::min(prevDist[1], dist[0]) + 1);
}
for (size_t i = 2; i != m; ++i) {
const size_t substCost = (p[i - 1] == q[j - 1]) ? 0 : 1;
const size_t transCost = (p[i - 1] == q[j - 2] && p[i - 2] == q[j - 1]) ? 1 : 2;
dist[i] = std::min(
std::min(prevDist[i - 1] + substCost, prevPrevDist[i - 2] + transCost),
std::min(prevDist[i], dist[i - 1]) + 1);
}
}
return dist[m - 1];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment