Last active
October 14, 2015 09:21
-
-
Save djvanderlaan/9bd1ff9830c3a2326b8f to your computer and use it in GitHub Desktop.
Jaro similarity score C++ implementation with R-interface
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstring> | |
#include <cstdlib> | |
#include <R.h> | |
#include <Rdefines.h> | |
double jaro(const char* a, const char* b, double range = 0.5, double match_weight = 1.0, | |
double trans_weight = 1.0) { | |
unsigned int la = strlen(a); | |
unsigned int lb = strlen(b); | |
// create a buffers; these store for each character in a and b which have | |
// been matched to characters in the other string | |
char* match_a = (char*)calloc(la, sizeof(char)); | |
char* match_b = (char*)calloc(lb, sizeof(char)); | |
// determine search range (only characters within search range can be | |
// matched) | |
unsigned int search_range = la > lb ? range*la - 1.0 : range*lb - 1.0; | |
if (search_range < 0) search_range = 0; | |
// determine the number of matching characters; a character can only be | |
// matched once, so aa and a has one match; the second a in the first string | |
// can not be match to the a in the second string as this a is already | |
// matched to the first a in the first string | |
unsigned int nmatch = 0; | |
for (unsigned int i = 0; i < la; ++i) { | |
unsigned int jmin = (i > search_range) ? i - search_range : 0; | |
unsigned int jmax = ((i+search_range+1) < lb) ? i + search_range + 1: lb; | |
for (unsigned int j = jmin; j < jmax; ++j) { | |
if ((match_b[j] != 1) && (a[i] == b[j])) { | |
nmatch++; | |
match_a[i] = 1; | |
match_b[j] = 1; | |
break; | |
} | |
} | |
} | |
if (nmatch == 0) { | |
free(match_a); | |
free(match_b); | |
return 0.0; | |
} | |
// determine the number of transpositions: count the number of matched | |
// characters (previous step; there are marked in match_a) that do not align; | |
// these are the number of half transpositions: AB BA has two half | |
// transpositions (A does not align with the A in the second string and the B | |
// does not align with the B in the second string) and one transposition. | |
unsigned int ntransp = 0; | |
unsigned int k = 0; | |
for (unsigned int i = 0; i < la; ++i) { | |
if (match_a[i] == 1) { | |
for (unsigned int j = k; j < lb; ++j) { | |
if (match_b[j] == 1) { | |
k = j + 1; | |
if (a[i] != b[j]) ++ntransp; | |
break; | |
} | |
} | |
} | |
} | |
ntransp /= 2; | |
// cleanup and calculate final score | |
free(match_a); | |
free(match_b); | |
double tot_weight = 2.0*match_weight + trans_weight; | |
return (match_weight*(double)nmatch/la + match_weight*(double)nmatch/lb + | |
trans_weight*(double)(nmatch - ntransp)/nmatch)/tot_weight; | |
} | |
extern "C" { | |
SEXP r_jaro(SEXP a, SEXP b) { | |
if (LENGTH(a) != LENGTH(b)) return R_NilValue; | |
SEXP res = PROTECT(allocVector(REALSXP, LENGTH(a))); | |
double* p = REAL(res); | |
for (int i = 0; i < LENGTH(a); ++i) { | |
SEXP str_a = STRING_ELT(a, i); | |
SEXP str_b = STRING_ELT(b, i); | |
(*p) = jaro(CHAR(str_a), CHAR(str_b)); | |
} | |
UNPROTECT(1); | |
return res; | |
} | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build jaro.so by running | |
# R CMD SHLIB jaro.cpp | |
dyn.load("jaro.so") | |
.Call("jw", 'shackleford', 'shackelford') | |
.Call("jw", 'jones', 'johnson') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment