Last active
July 25, 2023 09:17
-
-
Save CDillinger/2aa02128f840bdca90340ce08ee71bc2 to your computer and use it in GitHub Desktop.
C# Implementation of Fuzzy Match
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// LICENSE | |
// | |
// This software is dual-licensed to the public domain and under the following | |
// license: you are granted a perpetual, irrevocable license to copy, modify, | |
// publish, and distribute this file as you see fit. | |
using System; | |
using System.Collections.Generic; | |
public static class FuzzyMatcher | |
{ | |
/// <summary> | |
/// Does a fuzzy search for a pattern within a string. | |
/// </summary> | |
/// <param name="stringToSearch">The string to search for the pattern in.</param> | |
/// <param name="pattern">The pattern to search for in the string.</param> | |
/// <returns>true if each character in pattern is found sequentially within stringToSearch; otherwise, false.</returns> | |
public static bool FuzzyMatch(string stringToSearch, string pattern) | |
{ | |
var patternIdx = 0; | |
var strIdx = 0; | |
var patternLength = pattern.Length; | |
var strLength = stringToSearch.Length; | |
while (patternIdx != patternLength && strIdx != strLength) | |
{ | |
if (char.ToLower(pattern[patternIdx]) == char.ToLower(stringToSearch[strIdx])) | |
++patternIdx; | |
++strIdx; | |
} | |
return patternLength != 0 && strLength != 0 && patternIdx == patternLength; | |
} | |
/// <summary> | |
/// Does a fuzzy search for a pattern within a string, and gives the search a score on how well it matched. | |
/// </summary> | |
/// <param name="stringToSearch">The string to search for the pattern in.</param> | |
/// <param name="pattern">The pattern to search for in the string.</param> | |
/// <param name="outScore">The score which this search received, if a match was found.</param> | |
/// <returns>true if each character in pattern is found sequentially within stringToSearch; otherwise, false.</returns> | |
public static bool FuzzyMatch(string stringToSearch, string pattern, out int outScore) | |
{ | |
// Score consts | |
const int adjacencyBonus = 5; // bonus for adjacent matches | |
const int separatorBonus = 10; // bonus if match occurs after a separator | |
const int camelBonus = 10; // bonus if match is uppercase and prev is lower | |
const int leadingLetterPenalty = -3; // penalty applied for every letter in stringToSearch before the first match | |
const int maxLeadingLetterPenalty = -9; // maximum penalty for leading letters | |
const int unmatchedLetterPenalty = -1; // penalty for every letter that doesn't matter | |
// Loop variables | |
var score = 0; | |
var patternIdx = 0; | |
var patternLength = pattern.Length; | |
var strIdx = 0; | |
var strLength = stringToSearch.Length; | |
var prevMatched = false; | |
var prevLower = false; | |
var prevSeparator = true; // true if first letter match gets separator bonus | |
// Use "best" matched letter if multiple string letters match the pattern | |
char? bestLetter = null; | |
char? bestLower = null; | |
int? bestLetterIdx = null; | |
var bestLetterScore = 0; | |
var matchedIndices = new List<int>(); | |
// Loop over strings | |
while (strIdx != strLength) | |
{ | |
var patternChar = patternIdx != patternLength ? pattern[patternIdx] as char? : null; | |
var strChar = stringToSearch[strIdx]; | |
var patternLower = patternChar != null ? char.ToLower((char)patternChar) as char? : null; | |
var strLower = char.ToLower(strChar); | |
var strUpper = char.ToUpper(strChar); | |
var nextMatch = patternChar != null && patternLower == strLower; | |
var rematch = bestLetter != null && bestLower == strLower; | |
var advanced = nextMatch && bestLetter != null; | |
var patternRepeat = bestLetter != null && patternChar != null && bestLower == patternLower; | |
if (advanced || patternRepeat) | |
{ | |
score += bestLetterScore; | |
matchedIndices.Add((int)bestLetterIdx); | |
bestLetter = null; | |
bestLower = null; | |
bestLetterIdx = null; | |
bestLetterScore = 0; | |
} | |
if (nextMatch || rematch) | |
{ | |
var newScore = 0; | |
// Apply penalty for each letter before the first pattern match | |
// Note: Math.Max because penalties are negative values. So max is smallest penalty. | |
if (patternIdx == 0) | |
{ | |
var penalty = Math.Max(strIdx * leadingLetterPenalty, maxLeadingLetterPenalty); | |
score += penalty; | |
} | |
// Apply bonus for consecutive bonuses | |
if (prevMatched) | |
newScore += adjacencyBonus; | |
// Apply bonus for matches after a separator | |
if (prevSeparator) | |
newScore += separatorBonus; | |
// Apply bonus across camel case boundaries. Includes "clever" isLetter check. | |
if (prevLower && strChar == strUpper && strLower != strUpper) | |
newScore += camelBonus; | |
// Update pattern index IF the next pattern letter was matched | |
if (nextMatch) | |
++patternIdx; | |
// Update best letter in stringToSearch which may be for a "next" letter or a "rematch" | |
if (newScore >= bestLetterScore) | |
{ | |
// Apply penalty for now skipped letter | |
if (bestLetter != null) | |
score += unmatchedLetterPenalty; | |
bestLetter = strChar; | |
bestLower = char.ToLower((char)bestLetter); | |
bestLetterIdx = strIdx; | |
bestLetterScore = newScore; | |
} | |
prevMatched = true; | |
} | |
else | |
{ | |
score += unmatchedLetterPenalty; | |
prevMatched = false; | |
} | |
// Includes "clever" isLetter check. | |
prevLower = strChar == strLower && strLower != strUpper; | |
prevSeparator = strChar == '_' || strChar == ' '; | |
++strIdx; | |
} | |
// Apply score for last match | |
if (bestLetter != null) | |
{ | |
score += bestLetterScore; | |
matchedIndices.Add((int)bestLetterIdx); | |
} | |
outScore = score; | |
return patternIdx == patternLength; | |
} | |
} |
Hey CDillinger,
I found your code really helpful. Btw, there was an update on this article (https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/)
Update — 2/18/2017
It would be great also if you could apply the recursive match.
Thanks!
It would be great also if you could apply the recursive match.
I agree, it would be great. And I'll be happy to include the updated algorithm in in the fuzzy-search repository.
I made the recursive version but it doesn't work.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Tests
{
public static class Fuzzy
{
const int SEQUENTIAL_BONUS = 15; // bonus for adjacent matches
const int SEPARATOR_BONUS = 30; // bonus if match occurs after a separator
const int CAMEL_BONUS = 30; // bonus if match is uppercase and prev is lower
const int FIRST_LETTER_BONUS = 15; // bonus if the first letter is matched
const int LEADING_LETTER_PENALTY = -5; // penalty applied for every letter in str before the first match
const int MAX_LEADING_LETTER_PENALTY = -15; // maximum penalty for leading letters
const int UNMATCHED_LETTER_PENALTY = -1; // penalty for every letter that doesn't matter
public static bool FuzzyMatchRecursive(string pattern, string target, out int outScore)
{
int recursionCount = 0;
int recursionLimit = 10;
List<int> matches = new List<int>();
int maxMatches = 256;
return FuzzyMatchRecursive(
pattern, // string to Search
target, // target string
0, // patternCurIndex
0, // strCurrIndex
null, // srcMatces
matches, // matches
maxMatches, // maxMatches
0, // nextMatch
recursionCount, // recursionCount
recursionLimit, // recursionLimit
out outScore);
}
public static bool FuzzyMatchRecursive(
string pattern,
string str,
int patternCurIndex,
int strCurrIndex,
List<int> srcMatces,
List<int> matches,
int maxMatches,
int nextMatch,
int recursionCount,
int recursionLimit,
out int outScore)
{
outScore = 0;
// Return if recursion limit is reached.
if (++recursionCount >= recursionLimit)
return false;
// Return if we reached ends of strings.
if (patternCurIndex == pattern.Length || strCurrIndex == str.Length)
return false;
// Recursion params
bool recursiveMatch = false;
List<int> bestRecursiveMatches = new List<int>();
int bestRecursiveScore = 0;
// Loop through pattern and str looking for a match.
bool firstMatch = true;
while (patternCurIndex < pattern.Length && strCurrIndex < str.Length)
{
// Match found.
var patternChar = char.ToLower(pattern[patternCurIndex]);
var strChar = char.ToLower(str[strCurrIndex]);
if (patternChar == strChar)
{
if (nextMatch > maxMatches)
return false;
if (firstMatch && srcMatces != null)
{
matches = srcMatces;
firstMatch = false;
}
List<int> recursiveMatches = new List<int>();
var ismatched = FuzzyMatchRecursive(
pattern,
str,
patternCurIndex,
strCurrIndex + 1,
matches,
recursiveMatches,
maxMatches,
nextMatch,
recursionCount,
recursionLimit,
out int recursiveScore);
if (ismatched)
{
// Pick best recursive score.
if (!recursiveMatch || recursiveScore > bestRecursiveScore)
{
bestRecursiveMatches = recursiveMatches;
bestRecursiveScore = recursiveScore;
}
recursiveMatch = true;
}
nextMatch++;
matches.Add(strCurrIndex);
++patternCurIndex;
}
++strCurrIndex;
}
var matched = patternCurIndex == pattern.Length;
if (matched)
{
outScore = 100;
// Apply leading letter penalty
int penalty = LEADING_LETTER_PENALTY * matches[0];
penalty = penalty < MAX_LEADING_LETTER_PENALTY ? MAX_LEADING_LETTER_PENALTY : penalty;
outScore += penalty;
//Apply unmatched penalty
int unmatched = str.Length - nextMatch;
outScore += UNMATCHED_LETTER_PENALTY * unmatched;
for (int i = 0; i < nextMatch; i++)
{
int currIdx = matches[i];
if (i > 0)
{
int prevIdx = matches[i - 1];
if (currIdx == prevIdx + 1)
{
outScore += SEQUENTIAL_BONUS;
}
}
// Check for bonuses based on neighbor character value.
if (currIdx > 0)
{
// Camel case
char neighbor = str[currIdx - 1];
char curr = str[currIdx];
if (neighbor == char.ToLower(neighbor) && curr == char.ToUpper(curr))
{
outScore += CAMEL_BONUS;
}
bool isNeighbourSeparator = neighbor == '_' || neighbor == ' ';
if (isNeighbourSeparator)
{
outScore += SEPARATOR_BONUS;
}
}
else
{
// First letter
outScore += FIRST_LETTER_BONUS;
}
}
// Return best result
if (recursiveMatch && (!matched || bestRecursiveScore > outScore))
{
// Recursive score is better than "this"
matches = bestRecursiveMatches;
outScore = bestRecursiveScore;
return true;
}
else if (matched)
{
// "this" score is better than recursive
return true;
}
else
{
return false;
}
}
return false;
}
public static string FuzzyMatchRecursive(string pattern, string[] targets)
{
if (!string.IsNullOrEmpty(pattern) && targets?.Length > 0)
{
Dictionary<string, int> distances = new Dictionary<string, int>();
foreach (var item in targets)
{
var result = FuzzyMatchRecursive(pattern, item, out int score);
distances.Add(item, score);
}
return distances.Aggregate((l, r) => l.Value > r.Value ? l : r).Key;
}
return pattern;
}
public static void Test()
{
string[] targets = new[] {
"Internal Hire or Backfill",
"OTE Neutral",
"Funded by Business Leader Budget",
"Funded by Merit Budget"
};
string pattern = "Intern Conversion";
string pattern2 = "Internal Hire";
var result = FuzzyMatchRecursive(pattern, targets);
var result2 = FuzzyMatchRecursive(pattern2, targets);
if (result != "Internal Hire or Backfill")
throw new Exception("No match found!");
if (result2 != "Internal Hire or Backfill")
throw new Exception("No match found!");
}
}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @CDillinger,
just wanted to let you know that I've added your C# port to a new project built around Forrest Smith's FTS Fuzzy Match: