Created
May 6, 2012 10:46
-
-
Save sahib/2621593 to your computer and use it in GitHub Desktop.
Silly implementation for utf8 levenshtein (works by converting utf-8 to ucs4)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <glib.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <stdio.h> | |
/////////////////////////////// | |
gsize gunicode_strlen(const gunichar * s) | |
{ | |
gsize cnt = 0; | |
while(*s++) | |
++cnt; | |
return cnt; | |
} | |
/////////////////////////////// | |
// 'New' version, takes gunichar strings, but completely the same otherwise | |
gsize levenshtein_unistrcmp(const gunichar * s, const gunichar * t) | |
{ | |
int n = (s) ? gunicode_strlen(s)+1 : 0; | |
int m = (t) ? gunicode_strlen(t)+1 : 0; | |
// Nothing to compute really.. | |
if (n==0) return m; | |
if (m==0) return n; | |
// String matrix | |
int d[n][m]; | |
int i,j; | |
// Init first row|column to 0...n|m | |
for (i=0; i<n; i++) d[i][0] = i; | |
for (j=0; j<m; j++) d[0][j] = j; | |
for (i=1; i<n; i++) | |
{ | |
// Current char in string s | |
gunichar cats = s[i-1]; | |
for (j=1; j<m; j++) | |
{ | |
// Do -1 only once | |
int jm1 = j-1, | |
im1 = i-1; | |
// a = above cell, b = left cell, c = left above celli | |
int a = d[im1][j] + 1, | |
b = d[i][jm1] + 1, | |
c = d[im1][jm1] + (t[jm1] != cats); | |
// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j] | |
d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c); | |
} | |
} | |
// The result is stored in the very right down cell | |
return d[n-1][m-1]; | |
} | |
/////////////////////////////// | |
// Old "plain text" levenshtein | |
gsize levenshtein_strcmp(const gchar * s, const gchar * t) | |
{ | |
int n = (s) ? strlen(s)+1 : 0; | |
int m = (t) ? strlen(t)+1 : 0; | |
// Nothing to compute really.. | |
if (n==0) return m; | |
if (m==0) return n; | |
// String matrix | |
int d[n][m]; | |
int i,j; | |
// Init first row|column to 0...n|m | |
for (i=0; i<n; i++) d[i][0] = i; | |
for (j=0; j<m; j++) d[0][j] = j; | |
for (i=1; i<n; i++) | |
{ | |
// Current char in string s | |
char cats = s[i-1]; | |
for (j=1; j<m; j++) | |
{ | |
// Do -1 only once | |
int jm1 = j-1, | |
im1 = i-1; | |
// a = above cell, b = left cell, c = left above celli | |
int a = d[im1][j] + 1, | |
b = d[i][jm1] + 1, | |
c = d[im1][jm1] + (t[jm1] != cats); | |
// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j] | |
d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c); | |
} | |
} | |
// The result is stored in the very right down cell | |
return d[n-1][m-1]; | |
} | |
/////////////////////////////// | |
static gunichar * utf8_to_ucs4(const gchar * utf8) | |
{ | |
GError * conv_error = NULL; | |
glong in = 0, out = 0; | |
gunichar * rc = g_utf8_to_ucs4(utf8,-1,&in,&out,&conv_error); \ | |
if(conv_error != NULL) | |
{ | |
g_print("Error while converting '%s': %s",utf8,conv_error->message); | |
g_error_free(conv_error); | |
return NULL; | |
} | |
return rc; | |
} | |
/////////////////////////////// | |
// A wrapper, so we do not need to convert gchar * to gunichar * ourself. | |
gsize levenshtein_uf8strcmp(const gchar * s, const gchar * t) | |
{ | |
gsize rc = 0; | |
if(s != NULL && t != NULL) | |
{ | |
gunichar * s_ucs4 = utf8_to_ucs4(s); | |
gunichar * t_ucs4 = utf8_to_ucs4(t); | |
if(s_ucs4 != NULL && t_ucs4 != NULL) | |
{ | |
rc = levenshtein_unistrcmp(s_ucs4,t_ucs4); | |
} | |
g_free(s_ucs4); | |
g_free(t_ucs4); | |
} | |
return rc; | |
} | |
/////////////////////////////// | |
int main(int argc, char const *argv[]) | |
{ | |
if(argc < 3) | |
return EXIT_FAILURE; | |
printf("Unicode lv_dist: %lu\n",levenshtein_uf8strcmp(argv[1],argv[2])); | |
printf("'Plain' lv_dist: %lu\n",levenshtein_strcmp(argv[1],argv[2])); | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment