Created
May 6, 2012 12:24
-
-
Save sahib/2622023 to your computer and use it in GitHub Desktop.
Better implementation to get the lv-distance of two utf8-strings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <glib.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <locale.h> | |
/////////////////////////////// | |
// Old version | |
gsize levenshtein_plain_strcmp(const gchar * s, const gchar * t) | |
{ | |
int n = (s) ? strlen(s)+1 : 0; | |
int m = (t) ? strlen(t)+1 : 0; | |
// Nothing to compute really.. | |
if (n==0) return m; | |
if (m==0) return n; | |
// String matrix | |
int d[n][m]; | |
int i,j; | |
// Init first row|column to 0...n|m | |
for (i=0; i<n; i++) d[i][0] = i; | |
for (j=0; j<m; j++) d[0][j] = j; | |
for (i=1; i<n; i++) | |
{ | |
// Current char in string s | |
char cats = s[i-1]; | |
for (j=1; j<m; j++) | |
{ | |
// Do -1 only once | |
int jm1 = j-1, | |
im1 = i-1; | |
// a = above cell, b = left cell, c = left above celli | |
int a = d[im1][j] + 1, | |
b = d[i][jm1] + 1, | |
c = d[im1][jm1] + (t[jm1] != cats); | |
// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j] | |
d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c); | |
} | |
} | |
// The result is stored in the very right down cell | |
return d[n-1][m-1]; | |
} | |
/////////////////////////////// | |
// Utf8 aware levenshtein | |
gsize levenshtein_strcmp(const gchar * s, const gchar * t) | |
{ | |
int n = (s) ? g_utf8_strlen(s,-1)+1 : 0; | |
int m = (t) ? g_utf8_strlen(t,-1)+1 : 0; | |
// NOTE: Be sure to call g_utf8_validate(), might fail otherwise | |
// It's advisable to call g_utf8_normalize() too. | |
// Nothing to compute really.. | |
if (n < 2) return m; | |
if (m < 2) return n; | |
// String matrix | |
int d[n][m]; | |
int i,j; | |
// Init first row|column to 0...n|m | |
for (i=0; i<n; i++) d[i][0] = i; | |
for (j=0; j<m; j++) d[0][j] = j; | |
for (i=1; i<n; i++) | |
{ | |
// Current char in string s | |
gunichar cats = g_utf8_get_char(g_utf8_offset_to_pointer(s,i-1)); | |
for (j=1; j<m; j++) | |
{ | |
// Do -1 only once | |
int jm1 = j-1, | |
im1 = i-1; | |
gunichar tats = g_utf8_get_char(g_utf8_offset_to_pointer(t,jm1)); | |
// a = above cell, b = left cell, c = left above celli | |
int a = d[im1][j] + 1, | |
b = d[i][jm1] + 1, | |
c = d[im1][jm1] + (tats != cats); | |
// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j] | |
d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c); | |
} | |
} | |
// The result is stored in the very right down cell | |
return d[n-1][m-1]; | |
} | |
/////////////////////////////// | |
// A utf8 aware levenshtein that normalizes ambigious codepoints | |
// and validates input-data | |
gsize levenshtein_safe_strcmp(const gchar * s, const gchar * t) | |
{ | |
gsize rc = 0; | |
if(g_utf8_validate(s,-1,NULL) == FALSE || | |
g_utf8_validate(t,-1,NULL) == FALSE) | |
return rc; | |
gchar * s_norm = g_utf8_normalize(s,-1,G_NORMALIZE_ALL_COMPOSE); | |
gchar * t_norm = g_utf8_normalize(t,-1,G_NORMALIZE_ALL_COMPOSE); | |
rc = levenshtein_strcmp(s_norm,t_norm); | |
g_free(s_norm); | |
g_free(t_norm); | |
return rc; | |
} | |
/////////////////////////////// | |
int main(int argc, char const *argv[]) | |
{ | |
if(argc < 3) | |
return EXIT_FAILURE; | |
setlocale(LC_ALL,""); | |
printf("Unicode lv_dist: %lu\n",levenshtein_strcmp(argv[1],argv[2])); | |
printf("Safeutf lv_dist: %lu\n",levenshtein_safe_strcmp(argv[1],argv[2])); | |
printf("'Plain' lv_dist: %lu\n",levenshtein_plain_strcmp(argv[1],argv[2])); | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, can I use this under the terms of LGPL-2.1-or-later?