Skip to content

Instantly share code, notes, and snippets.

@sahib
Created May 6, 2012 10:46
Show Gist options
  • Save sahib/2621593 to your computer and use it in GitHub Desktop.
Save sahib/2621593 to your computer and use it in GitHub Desktop.
Silly implementation for utf8 levenshtein (works by converting utf-8 to ucs4)
#include <glib.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
///////////////////////////////
gsize gunicode_strlen(const gunichar * s)
{
gsize cnt = 0;
while(*s++)
++cnt;
return cnt;
}
///////////////////////////////
// 'New' version, takes gunichar strings, but completely the same otherwise
gsize levenshtein_unistrcmp(const gunichar * s, const gunichar * t)
{
int n = (s) ? gunicode_strlen(s)+1 : 0;
int m = (t) ? gunicode_strlen(t)+1 : 0;
// Nothing to compute really..
if (n==0) return m;
if (m==0) return n;
// String matrix
int d[n][m];
int i,j;
// Init first row|column to 0...n|m
for (i=0; i<n; i++) d[i][0] = i;
for (j=0; j<m; j++) d[0][j] = j;
for (i=1; i<n; i++)
{
// Current char in string s
gunichar cats = s[i-1];
for (j=1; j<m; j++)
{
// Do -1 only once
int jm1 = j-1,
im1 = i-1;
// a = above cell, b = left cell, c = left above celli
int a = d[im1][j] + 1,
b = d[i][jm1] + 1,
c = d[im1][jm1] + (t[jm1] != cats);
// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j]
d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c);
}
}
// The result is stored in the very right down cell
return d[n-1][m-1];
}
///////////////////////////////
// Old "plain text" levenshtein
gsize levenshtein_strcmp(const gchar * s, const gchar * t)
{
int n = (s) ? strlen(s)+1 : 0;
int m = (t) ? strlen(t)+1 : 0;
// Nothing to compute really..
if (n==0) return m;
if (m==0) return n;
// String matrix
int d[n][m];
int i,j;
// Init first row|column to 0...n|m
for (i=0; i<n; i++) d[i][0] = i;
for (j=0; j<m; j++) d[0][j] = j;
for (i=1; i<n; i++)
{
// Current char in string s
char cats = s[i-1];
for (j=1; j<m; j++)
{
// Do -1 only once
int jm1 = j-1,
im1 = i-1;
// a = above cell, b = left cell, c = left above celli
int a = d[im1][j] + 1,
b = d[i][jm1] + 1,
c = d[im1][jm1] + (t[jm1] != cats);
// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j]
d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c);
}
}
// The result is stored in the very right down cell
return d[n-1][m-1];
}
///////////////////////////////
static gunichar * utf8_to_ucs4(const gchar * utf8)
{
GError * conv_error = NULL;
glong in = 0, out = 0;
gunichar * rc = g_utf8_to_ucs4(utf8,-1,&in,&out,&conv_error); \
if(conv_error != NULL)
{
g_print("Error while converting '%s': %s",utf8,conv_error->message);
g_error_free(conv_error);
return NULL;
}
return rc;
}
///////////////////////////////
// A wrapper, so we do not need to convert gchar * to gunichar * ourself.
gsize levenshtein_uf8strcmp(const gchar * s, const gchar * t)
{
gsize rc = 0;
if(s != NULL && t != NULL)
{
gunichar * s_ucs4 = utf8_to_ucs4(s);
gunichar * t_ucs4 = utf8_to_ucs4(t);
if(s_ucs4 != NULL && t_ucs4 != NULL)
{
rc = levenshtein_unistrcmp(s_ucs4,t_ucs4);
}
g_free(s_ucs4);
g_free(t_ucs4);
}
return rc;
}
///////////////////////////////
int main(int argc, char const *argv[])
{
if(argc < 3)
return EXIT_FAILURE;
printf("Unicode lv_dist: %lu\n",levenshtein_uf8strcmp(argv[1],argv[2]));
printf("'Plain' lv_dist: %lu\n",levenshtein_strcmp(argv[1],argv[2]));
return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment