sahib · May 6, 2012 12:24 · andyholmes · Sep 29, 2021
diff --git a/levenshtein_utf8.c b/levenshtein_utf8.c
 #include <glib.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <locale.h>

 ///////////////////////////////

 // Old version
 gsize levenshtein_plain_strcmp(const gchar * s, const gchar * t)
 {
    int n = (s) ? strlen(s)+1 : 0;
    int m = (t) ? strlen(t)+1 : 0;

    // Nothing to compute really..
    if (n==0) return m;
    if (m==0) return n;

    // String matrix
    int d[n][m];
    int i,j;

    // Init first row|column to 0...n|m
    for (i=0; i<n; i++) d[i][0] = i;
    for (j=0; j<m; j++) d[0][j] = j;

    for (i=1; i<n; i++)
    {
        // Current char in string s
        char cats = s[i-1];

        for (j=1; j<m; j++)
        {
            // Do -1 only once
            int jm1 = j-1,
                im1 = i-1;

            // a = above cell, b = left cell, c = left above celli
            int a = d[im1][j] + 1,
                b = d[i][jm1] + 1,
                c = d[im1][jm1] + (t[jm1] != cats);

            // Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j]
     	    d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c);
        }
    }

    // The result is stored in the very right down cell
    return d[n-1][m-1];
 }

 ///////////////////////////////

 // Utf8 aware levenshtein
 gsize levenshtein_strcmp(const gchar * s, const gchar * t)
 {
    int n = (s) ? g_utf8_strlen(s,-1)+1 : 0;
    int m = (t) ? g_utf8_strlen(t,-1)+1 : 0;

    // NOTE: Be sure to call g_utf8_validate(), might fail otherwise
    //       It's advisable to call g_utf8_normalize() too.

    // Nothing to compute really..
    if (n < 2) return m;
    if (m < 2) return n;

    // String matrix
    int d[n][m];
    int i,j;

    // Init first row|column to 0...n|m
    for (i=0; i<n; i++) d[i][0] = i;
    for (j=0; j<m; j++) d[0][j] = j;

    for (i=1; i<n; i++)
    {
        // Current char in string s
        gunichar cats = g_utf8_get_char(g_utf8_offset_to_pointer(s,i-1));

        for (j=1; j<m; j++)
        {
            // Do -1 only once
            int jm1 = j-1,
                im1 = i-1;

            gunichar tats = g_utf8_get_char(g_utf8_offset_to_pointer(t,jm1));

            // a = above cell, b = left cell, c = left above celli
            int a = d[im1][j] + 1,
                b = d[i][jm1] + 1,
                c = d[im1][jm1] + (tats != cats);

            // Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j]
     	    d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c);
        }
    }

    // The result is stored in the very right down cell
    return d[n-1][m-1];
 }

 ///////////////////////////////

 // A utf8 aware levenshtein that normalizes ambigious codepoints
 // and validates input-data
 gsize levenshtein_safe_strcmp(const gchar * s, const gchar * t)
 {
    gsize rc = 0;
    if(g_utf8_validate(s,-1,NULL) == FALSE ||
       g_utf8_validate(t,-1,NULL) == FALSE)
     return rc;

    gchar * s_norm = g_utf8_normalize(s,-1,G_NORMALIZE_ALL_COMPOSE);
    gchar * t_norm = g_utf8_normalize(t,-1,G_NORMALIZE_ALL_COMPOSE);

    rc = levenshtein_strcmp(s_norm,t_norm);

    g_free(s_norm);
    g_free(t_norm);

    return rc;
 }

 ///////////////////////////////

 int main(int argc, char const *argv[])
 {
    if(argc < 3)
        return EXIT_FAILURE;

    setlocale(LC_ALL,"");

    printf("Unicode lv_dist: %lu\n",levenshtein_strcmp(argv[1],argv[2]));
    printf("Safeutf lv_dist: %lu\n",levenshtein_safe_strcmp(argv[1],argv[2]));
    printf("'Plain' lv_dist: %lu\n",levenshtein_plain_strcmp(argv[1],argv[2]));
    return EXIT_SUCCESS;
 }
	#include <glib.h>
	#include <stdlib.h>
	#include <string.h>
	#include <stdio.h>
	#include <locale.h>

	///////////////////////////////

	// Old version
	gsize levenshtein_plain_strcmp(const gchar * s, const gchar * t)
	{
	int n = (s) ? strlen(s)+1 : 0;
	int m = (t) ? strlen(t)+1 : 0;

	// Nothing to compute really..
	if (n==0) return m;
	if (m==0) return n;

	// String matrix
	int d[n][m];
	int i,j;

	// Init first row\|column to 0...n\|m
	for (i=0; i<n; i++) d[i][0] = i;
	for (j=0; j<m; j++) d[0][j] = j;

	for (i=1; i<n; i++)
	{
	// Current char in string s
	char cats = s[i-1];

	for (j=1; j<m; j++)
	{
	// Do -1 only once
	int jm1 = j-1,
	im1 = i-1;

	// a = above cell, b = left cell, c = left above celli
	int a = d[im1][j] + 1,
	b = d[i][jm1] + 1,
	c = d[im1][jm1] + (t[jm1] != cats);

	// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j]
	d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c);
	}
	}

	// The result is stored in the very right down cell
	return d[n-1][m-1];
	}

	///////////////////////////////

	// Utf8 aware levenshtein
	gsize levenshtein_strcmp(const gchar * s, const gchar * t)
	{
	int n = (s) ? g_utf8_strlen(s,-1)+1 : 0;
	int m = (t) ? g_utf8_strlen(t,-1)+1 : 0;

	// NOTE: Be sure to call g_utf8_validate(), might fail otherwise
	// It's advisable to call g_utf8_normalize() too.

	// Nothing to compute really..
	if (n < 2) return m;
	if (m < 2) return n;

	// String matrix
	int d[n][m];
	int i,j;

	// Init first row\|column to 0...n\|m
	for (i=0; i<n; i++) d[i][0] = i;
	for (j=0; j<m; j++) d[0][j] = j;

	for (i=1; i<n; i++)
	{
	// Current char in string s
	gunichar cats = g_utf8_get_char(g_utf8_offset_to_pointer(s,i-1));

	for (j=1; j<m; j++)
	{
	// Do -1 only once
	int jm1 = j-1,
	im1 = i-1;

	gunichar tats = g_utf8_get_char(g_utf8_offset_to_pointer(t,jm1));

	// a = above cell, b = left cell, c = left above celli
	int a = d[im1][j] + 1,
	b = d[i][jm1] + 1,
	c = d[im1][jm1] + (tats != cats);

	// Now compute the minimum of a,b,c and set MIN(a,b,c) to cell d[i][j]
	d[i][j] = (a < b) ? MIN(a,c) : MIN(b,c);
	}
	}

	// The result is stored in the very right down cell
	return d[n-1][m-1];
	}

	///////////////////////////////

	// A utf8 aware levenshtein that normalizes ambigious codepoints
	// and validates input-data
	gsize levenshtein_safe_strcmp(const gchar * s, const gchar * t)
	{
	gsize rc = 0;
	if(g_utf8_validate(s,-1,NULL) == FALSE \|\|
	g_utf8_validate(t,-1,NULL) == FALSE)
	return rc;

	gchar * s_norm = g_utf8_normalize(s,-1,G_NORMALIZE_ALL_COMPOSE);
	gchar * t_norm = g_utf8_normalize(t,-1,G_NORMALIZE_ALL_COMPOSE);

	rc = levenshtein_strcmp(s_norm,t_norm);

	g_free(s_norm);
	g_free(t_norm);

	return rc;
	}

	///////////////////////////////

	int main(int argc, char const *argv[])
	{
	if(argc < 3)
	return EXIT_FAILURE;

	setlocale(LC_ALL,"");

	printf("Unicode lv_dist: %lu\n",levenshtein_strcmp(argv[1],argv[2]));
	printf("Safeutf lv_dist: %lu\n",levenshtein_safe_strcmp(argv[1],argv[2]));
	printf("'Plain' lv_dist: %lu\n",levenshtein_plain_strcmp(argv[1],argv[2]));
	return EXIT_SUCCESS;
	}