Created
October 26, 2024 07:54
-
-
Save cacharle/4b0b4244eff7361ab56147668cbb9100 to your computer and use it in GitHub Desktop.
Levenshtein distance benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _POSIX_C_SOURCE 200809L | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <assert.h> | |
#include <string.h> | |
#include <time.h> | |
int min(int a, int b, int c) | |
{ | |
if (a < b && a < c) | |
return a; | |
else if (b < a && b < c) | |
return b; | |
else | |
return c; | |
} | |
int lev_naive(char *s1, char *s2) | |
{ | |
if (*s1 == '\0') | |
return strlen(s2); | |
if (*s2 == '\0') | |
return strlen(s1); | |
if (*s1 == *s2) | |
return lev_naive(s1 + 1, s2 + 1); | |
return 1 + min( | |
lev_naive(s1, s2 + 1), | |
lev_naive(s1 + 1, s2), | |
lev_naive(s1 + 1, s2 + 1) | |
); | |
} | |
int lev(char *s1, char *s2) | |
{ | |
static int diffs[128][128] = {{0}}; | |
memset(diffs, 0, sizeof diffs); | |
int s1_len, i; | |
for (i = 1; s1[i-1] != '\0'; i++) | |
diffs[i][0] = i; | |
s1_len = i; | |
int s2_len, j; | |
for (j = 1; s2[j-1] != '\0'; j++) | |
diffs[0][j] = j; | |
s2_len = j; | |
for (int j = 1; s2[j-1] != '\0'; j++) | |
{ | |
for (int i = 1; s1[i-1] != '\0'; i++) | |
{ | |
int substitution_cost = s1[i] == s2[j] ? 0 : 1; | |
diffs[i][j] = min( | |
diffs[i-1][j] + 1, | |
diffs[i][j-1] + 1, | |
diffs[i-1][j-1] + substitution_cost | |
); | |
} | |
} | |
return diffs[s1_len - 1][s2_len - 1]; | |
} | |
int main(void) | |
{ | |
struct timespec start, end; | |
FILE *f = fopen("/usr/share/dict/words", "r"); | |
assert(f != NULL); | |
char *target = "house"; | |
// int dist = lev(target, "housf"); | |
// printf("%s: %d\n", target, dist); | |
// return 0; | |
char *line = NULL; | |
size_t line_size = 0; | |
ssize_t read_size; | |
clock_gettime(CLOCK_MONOTONIC, &start); | |
while ((read_size = getline(&line, &line_size, f)) != -1) | |
{ | |
line[read_size - 1] = '\0'; | |
// int dist = lev_naive(target, line); | |
int dist = lev(target, line); | |
// printf("%s: %d\n", line, dist); | |
} | |
clock_gettime(CLOCK_MONOTONIC, &end); | |
free(line); | |
fclose(f); | |
time_t delta_seconds = end.tv_sec - start.tv_sec; | |
long long int delta_ns = end.tv_nsec - start.tv_nsec; | |
if (delta_ns < 0) | |
{ | |
delta_seconds -= 1; | |
delta_ns += 1000000000; | |
} | |
printf("took: %zus %lldms\n", delta_seconds, delta_ns / 1'000'000); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
~/git/tried master* | |
❯ gcc -std=c23 -O3 t.c && ./a.out # <-- cache distance matrix | |
took: 0s 107ms | |
~/git/tried master* | |
❯ gcc -std=c23 -O3 t.c && ./a.out # <-- naive | |
took: 8s 8ms | |
~/git/tried master* 8s | |
❯ wc -l /usr/share/dict/words | |
123985 /usr/share/dict/words |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment