Created
June 22, 2017 17:11
-
-
Save WolfDan/7cef2973d1fcae8f8e84c00b0a959fc8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule Search.Searcher do | |
# Score consts | |
@sequential_bonus 15 # bonus for adjacent matches | |
@separator_bonus 30 # bonus if match occurs after a separator | |
@camel_bonus 30 # bonus if match is uppercase and prev is lower | |
@first_letter_bonus 15 # bonus if the first letter is matched | |
@leading_letter_penalty -5 # penalty applied for every letter in str before the first match | |
@max_leading_letter_penalty -15 # maximum penalty for leading letters | |
@unmatched_letter_penalty -1 # penalty for every letter that doesn't matter | |
def search(pat, str) do | |
_search(pat |> String.codepoints , | |
str |> String.codepoints , 0, str |> String.codepoints, 1, nil) | |
end | |
defp _search(_, _, score, _, rec_count, _) when rec_count >= 10 do | |
{:false, score, []} | |
end | |
defp _search([], _, score, _, _, _) do # pattern empty | |
{:false, score, []} | |
end | |
defp _search(_, [], score, _, _, _) do # or str empty | |
{:false, score, []} | |
end | |
defp _search(pat, str, score, str_begin, rec_count, src_matches) do | |
state = %{ | |
rec_match: false, #rec_match | |
best_rec_matches: [], # best_rec_matches | |
best_rec_score: 0, # best_rec_score | |
first_match: true, # first_match | |
matches: [], # matches | |
src_matches: src_matches # src_matches | |
} | |
case _while(state, pat, str, str_begin, rec_count) do | |
{:ok, state, pat} -> | |
_calculate_score(state, str_begin, pat, score) | |
false -> | |
{:false, score, []} | |
end | |
end | |
defp _calculate_score(state, str_begin, pat, score) do | |
if pat == [] do | |
100 | |
|> _calculate_penalty(Enum.at(state[:matches], 0)) | |
|> _cal_unmatched(str_begin, state) | |
|> _iter_matches(state[:matches], str_begin) | |
|> _calculate_return(state, pat) | |
else | |
_calculate_return({:ok, score}, state, pat) | |
end | |
end | |
defp _calculate_return({_, score}, state, pat) do | |
cond do | |
state[:rec_match] && (pat != [] || state[:best_rec_score] > score) | |
-> | |
{true, state[:best_rec_score], state[:best_rec_matches]} | |
pat === [] -> | |
{true, score, state[:matches]} | |
true -> | |
{false, score, state[:matches]} | |
end | |
end | |
defp _calculate_penalty(out_score, match_score) do | |
penalty = | |
if match_score == nil do | |
0 | |
else | |
@leading_letter_penalty * match_score | |
end | |
if penalty < @max_leading_letter_penalty do | |
out_score + @max_leading_letter_penalty | |
else | |
out_score + penalty | |
end | |
end | |
defp _cal_unmatched(out_score, str_begin, state) do | |
out_score + @unmatched_letter_penalty * | |
(String.replace_suffix(Enum.join(str_begin, ""), "", "") | |
|> String.length |> Kernel.-(length(state[:matches]))) | |
end | |
defp _iter_matches(out_score, matches, str_begin) do | |
matches | |
|> Enum.with_index | |
|> Enum.map_reduce(out_score, fn({item, count}, score) -> | |
{item, score | |
|> _iter_sequential(matches, count, item) | |
|> _iter_bonuses(str_begin, item)} | |
end) | |
end | |
defp _iter_sequential(score, matches, count, item) do | |
if count > 0 && item == (Enum.at(matches, count - 1) + 1) do | |
score + @sequential_bonus | |
else | |
score | |
end | |
end | |
defp _iter_bonuses(score, str_begin, item) do | |
if item > 0 do | |
neighbor = Enum.at(str_begin, item - 1) | |
score | |
|> _camel_case(neighbor, Enum.at(str_begin, item)) | |
|> _separator(neighbor) | |
else | |
score + @first_letter_bonus | |
end | |
end | |
defp _camel_case(score, neighbor, curr) do | |
if neighbor != " " && neighbor == String.downcase(neighbor) && | |
curr != " " && curr === String.upcase(curr) | |
do | |
score + @camel_bonus | |
else | |
score | |
end | |
end | |
defp _separator(score, neighbor) do | |
if neighbor === "_" || neighbor === " " do | |
score + @separator_bonus | |
else | |
score | |
end | |
end | |
defp _while(state, [], _, _, _) do | |
{:ok, | |
state |> Map.put(:matches, Enum.reverse(state[:matches])), | |
[]} | |
end | |
defp _while(state, pat, [], _, _) do | |
{:ok, | |
state |> Map.put(:matches, Enum.reverse(state[:matches])), | |
pat} | |
end | |
defp _while(state, pat,[_ | str_t] = str, | |
str_begin, rec_count) do | |
#can't call String.downcase on a def when so here use cond | |
case _check_while(state, pat, str, str_begin, rec_count) do | |
{pat, state, rec_count} -> | |
_while(state, pat, str_t, str_begin, rec_count) | |
false -> | |
false | |
end | |
end | |
defp _check_while(state,[pat_h | pat_t] = pat,[str_h | str_t] = str, | |
str_begin, rec_count) do | |
if String.downcase(pat_h) === String.downcase(str_h) do | |
if length(state[:matches]) >= 255 do | |
false | |
else | |
rec_count = rec_count + 1 | |
state = | |
state | |
|> check_first_match | |
|> check_recursive(pat, str_t, str_begin, rec_count) | |
|> update_matches(str, str_begin) | |
{pat_t, state, rec_count} | |
end | |
else | |
{pat, state, rec_count} | |
end | |
end | |
defp check_first_match(state) do | |
if state[:first_match] && state[:src_matches] != nil do | |
state | |
|> Map.put(:matches, state[:src_matches]) | |
|> Map.put(:first_match, false) | |
else | |
state | |
end | |
end | |
defp check_recursive(state, pat, str, str_begin, rec_count) do | |
case _search(pat, str, 0, str_begin, rec_count, state[:matches]) do | |
{true, score, matches} -> | |
recursive_matches state, score, matches | |
_ -> | |
state | |
end | |
end | |
defp recursive_matches(state, score, matches) do | |
state = | |
if state[:rec_match] === false || | |
score > state[:best_rec_score] do | |
state | |
|> Map.put(:best_rec_matches, matches) | |
|> Map.put(:best_rec_score, score) | |
else | |
state | |
end | |
state | |
|> Map.put(:rec_match, true) | |
end | |
defp update_matches(state, str, str_begin) do | |
state | |
|> Map.put(:matches, | |
[String.replace_suffix(Enum.join(str_begin, ""), Enum.join(str, ""), "") | |
|> String.length | state[:matches]]) | |
end | |
end |
Hey @tajmone !
Sure! feel free to include it on your repo, regarding the license I don't know too much about them so I'd guess just keep it MIT
Just a warning, this code is really old and I did it when I just get started with the language, I'm sure it can be improved, so once you add it in your repo I'll probably add a pr later on improving the code :D
Thanks a lot @WolfDan! I've added it to the proejct:
https://github.com/tajmone/fuzzy-search/tree/master/fts_fuzzy_match/0.2.0/elixir
Following your directives, I've taken liberty to add a LICENSE
file to its folder:
MIT License
Copyright (c) 2017 @WolfDan -- https://github.com/WolfDan
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
I hope it's all OK.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @WolfDan,
I wanted to ask you permission to include your Elixir port of fts_fuzzy_match in a new repository I've created around the algorithm:
The project aims to collect all the existing implementations and ports of Forrest Smith's original algorithms (v0.1.0 and v0.2.0) and to add useful educational assets to study and improve them.
Since you didn't specify a license in the source file of this Gist, I'm asking you permission to include it in my project, and whether I should consider it under the same public domain terms of the original algorithm by Forrest, or if you rather prefer to share it under a different license.
My project uses the CC0 Universal license, but third party assets can be included with any license.
Thanks.