Last active
November 8, 2023 19:28
-
-
Save the-mikedavis/db39019721fab873980df3560345a93b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(jaro). | |
-export([distance/2]). | |
%% This is an Erlang translation of Elixir's `String.jaro_distance/2'. | |
%% Upstream: https://github.com/elixir-lang/elixir/blob/v1.15.7/lib/elixir/lib/string.ex#L2784-L2879. | |
%% Jaro Distance is relatively simple - only transpositions of grapheme | |
%% clusters are allowed which is much simpler than other edit distance | |
%% measurements like Damerau–Levenshtein (insertion, deletion, substitution | |
%% _and_ transposition). | |
%% Transpositions are pairs of characters that exist in both input strings but | |
%% have different positions in the strings. | |
-spec distance(BinA, BinB) -> Distance when | |
BinA :: binary(), | |
BinB :: binary(), | |
%% [+0.0, 1.0] | |
Distance :: float(). | |
distance(Bin, Bin) -> | |
1.0; | |
distance(_Bin, <<"">>) -> | |
+0.0; | |
distance(<<"">>, _Bin) -> | |
+0.0; | |
distance(Bin1, Bin2) when is_binary(Bin1) andalso is_binary(Bin2) -> | |
{Graphemes1, Len1} = graphemes_and_length(Bin1), | |
{Graphemes2, Len2} = graphemes_and_length(Bin2), | |
case match(Graphemes1, Len1, Graphemes2, Len2) of | |
{0, _Trans} -> | |
+0.0; | |
{Common, Transpositions} -> | |
(Common / Len1 + | |
Common / Len2 + | |
(Common - Transpositions) / Common) / 3 | |
end. | |
match(Graphemes1, Len1, Graphemes2, Len2) -> | |
%% Iterate through the shorter grapheme cluster list. | |
case Len1 < Len2 of | |
true -> | |
match(Graphemes1, Graphemes2, Len1 div 2 - 1); | |
false -> | |
match(Graphemes2, Graphemes1, Len2 div 2 - 1) | |
end. | |
match(Graphemes1, Graphemes2, Limit) -> | |
match(Graphemes1, Graphemes2, {0, Limit}, {0, 0, -1}, 0). | |
match([Grapheme | Rest], Graphemes, Range, State, Idx) -> | |
{Graphemes1, State1} = submatch(Grapheme, Graphemes, Range, State, Idx), | |
case Range of | |
{Limit, Limit} -> | |
match(Rest, tl(Graphemes1), Range, State1, Idx + 1); | |
{Pre, Limit} -> | |
match(Rest, Graphemes1, {Pre + 1, Limit}, State1, Idx + 1) | |
end; | |
match([], _, _, {Common, Transpositions, _}, _) -> | |
{Common, Transpositions}. | |
submatch(Grapheme, Graphemes, {Pre, _} = Range, State, Idx) -> | |
case detect(Grapheme, Graphemes, Range) of | |
undefined -> | |
{Graphemes, State}; | |
{SubIdx, Graphemes1} -> | |
{Graphemes1, proceed(State, Idx - Pre + SubIdx)} | |
end. | |
detect(Grapheme, Graphemes, {Pre, Limit}) -> | |
detect(Grapheme, Graphemes, Pre + 1 + Limit, 0, []). | |
detect(_Grapheme, _Graphemes, 0, _Idx, _Acc) -> | |
undefined; | |
detect(_Grapheme, [], _Limit, _Idx, _Acc) -> | |
undefined; | |
detect(Grapheme, [Grapheme | Rest], _Limit, Idx, Acc) -> | |
{Idx, lists:reverse(Acc, ['=:=' | Rest])}; | |
detect(Grapheme, [Other | Rest], Limit, Idx, Acc) -> | |
detect(Grapheme, Rest, Limit - 1, Idx + 1, [Other | Acc]). | |
proceed({Common, Transpositions, Former}, Current) -> | |
case Current < Former of | |
true -> | |
{Common + 1, Transpositions + 1, Current}; | |
false -> | |
{Common + 1, Transpositions, Current} | |
end. | |
-spec graphemes_and_length(Bin) -> {Graphemes, Length} when | |
Bin :: binary(), | |
Graphemes :: [binary()], | |
Length :: non_neg_integer(). | |
%% @doc Separates the grapheme clusters of the binary and counts grapheme | |
%% clusters. | |
%% | |
%% @private | |
%% Upstream: https://github.com/elixir-lang/elixir/blob/v1.15.7/lib/elixir/lib/string.ex#L2950-L2964 | |
graphemes_and_length(Bin) -> | |
graphemes_and_length(Bin, [], 0). | |
graphemes_and_length(Bin, Acc, Length) -> | |
case unicode_util:gc(Bin) of | |
[Gc | Rest] -> | |
graphemes_and_length(Rest, [Gc | Acc], Length + 1); | |
[] -> | |
{lists:reverse(Acc), Length}; | |
{error, <<Byte, Rest/bitstring>>} -> | |
graphemes_and_length(Rest, [<<Byte>> | Acc], Length + 1) | |
end. | |
-ifdef(TEST). | |
-include_lib("eunit/include/eunit.hrl"). | |
edge_cases_test() -> | |
?assertEqual(1.0, jaro:distance(<<"foo">>, <<"foo">>)), | |
?assertEqual(+0.0, jaro:distance(<<"foo">>, <<"">>)), | |
?assertEqual(+0.0, jaro:distance(<<"">>, <<"foo">>)). | |
nothing_in_common_test() -> | |
?assertEqual(+0.0, jaro:distance(<<"foo">>, <<"bar">>)). | |
wikipedia_example_test() -> | |
Trunc = fun(Float, Precision) -> | |
Scalar = math:pow(10, Precision), | |
trunc(Float * Scalar) / Scalar | |
end, | |
%% See <https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity>. | |
%% FAREMVIEL and FARMVILLE have a distance of 0.88 (truncated). | |
?assertEqual( | |
0.88, | |
Trunc(jaro:distance(<<"faremveil">>, <<"farmville">>), 2)). | |
unicode_test() -> | |
%% The pirate flag emoji is a combination of the black flag and skull emoji | |
%% graphemes joined by the zero-width joiner code-point. It's a single | |
%% grapheme <em>cluster</em> though, and we should be segmenting the input | |
%% binaries on grapheme clusters, so the code-points in common should not | |
%% be considered a transposition: they shoud not increase the distance | |
%% score. | |
%% So it should be the same score as if we replaced the pirate flag with | |
%% some other single grapheme cluster and the black flag and skull with | |
%% two different grapheme clusters. | |
?assertEqual( | |
jaro:distance(<<"🏴☠️"/utf8>>, <<"🏴☠️"/utf8>>), | |
jaro:distance(<<"a">>, <<"bc">>)). | |
invalid_unicode_test() -> | |
%% Without the `/utf8' bit string modifier, the following is not valid | |
%% unicode and `unicode_util:gc/1' will correctly recognize the invalid | |
%% byte sequences. | |
%% The invalid byte will be the same between both code-point lists. The | |
%% skull emoji will be mistaken as two grapheme clusters while the black | |
%% flag and zero-width joiner will be considered one cluster each, so I've | |
%% represented this below as the flag flag as "a", zero-width joiner as | |
%% space, and skull as "bc". | |
?assertEqual( | |
jaro:distance(<<"🏴☠️">>, <<"🏴☠️">>), | |
jaro:distance(<<"a bc">>, <<"abc">>)). | |
-endif. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment