Last active
November 16, 2017 23:35
-
-
Save djaquels/7dcf9844c04004ed9642d9f1215a51e7 to your computer and use it in GitHub Desktop.
Ejemplo de similitud de cadenas de texto. Aproximación con el coeficiente de Sørensen-Dice.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Implementation of the Sørensen-Dice coefficient for measuring string similarity. */
/**
 * Splits a string into its bigrams: every pair of adjacent characters,
 * in order of appearance.
 *
 * @param {string} cadena - The text to split.
 * @returns {string[]} The two-character substrings of `cadena`; an empty
 *   array when `cadena` has fewer than two characters.
 */
function bi_grams(cadena){
  var grams = [];
  // The last bigram starts at the second-to-last character.
  for (var i = 0; i + 1 < cadena.length; i++) {
    grams.push(String(cadena[i]) + cadena[i + 1]);
  }
  return grams;
}
/**
 * Returns the size of the intersection of two bigram lists, treating each
 * list as a multiset: a bigram that occurs k times in one list and m times
 * in the other contributes min(k, m) matches.
 *
 * Counting this way keeps the result symmetric in its arguments and never
 * larger than the shorter list, so repeated bigrams cannot inflate the score.
 *
 * @param {string[]} bgram1 - Bigrams of the first string.
 * @param {string[]} bgram2 - Bigrams of the second string.
 * @returns {number} The number of bigrams the two lists have in common.
 */
function cardCadenas(bgram1,bgram2){
  // Prototype-less object used as a plain dictionary: bigram -> occurrences
  // in bgram2 that have not been matched yet.
  var remaining = Object.create(null);
  var i;
  for (i = 0; i < bgram2.length; i++) {
    remaining[bgram2[i]] = (remaining[bgram2[i]] || 0) + 1;
  }

  var matches = 0;
  for (i = 0; i < bgram1.length; i++) {
    // Consume one occurrence per match so duplicates are not counted twice.
    if (remaining[bgram1[i]] > 0) {
      remaining[bgram1[i]]--;
      matches++;
    }
  }
  return matches;
}
/**
 * Computes a similarity index between two strings as
 * 2 * |shared bigrams| / (|bigrams of s1| + |bigrams of s2|)
 * (the Sørensen-Dice coefficient over bigrams).
 *
 * Both strings are upper-cased first, so the comparison ignores case.
 *
 * @param {string} s1 - First string.
 * @param {string} s2 - Second string.
 * @returns {number} A value from 0 (no bigrams in common) to 1 (same
 *   bigrams). When neither string is long enough to produce a bigram,
 *   returns 1 if the upper-cased strings are equal and 0 otherwise.
 */
function indiceDeSimilitud(s1,s2){
  var aux1 = s1.toUpperCase();
  var aux2 = s2.toUpperCase();
  var bgram1 = bi_grams(aux1);
  var bgram2 = bi_grams(aux2);
  var total = bgram1.length + bgram2.length;
  // Strings shorter than two characters yield no bigrams; without this
  // guard the formula would evaluate 0/0 and return NaN.
  if (total === 0) {
    return aux1 === aux2 ? 1 : 0;
  }
  return (2 * cardCadenas(bgram1, bgram2)) / total;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment