Last active
July 28, 2024 11:18
-
-
Save amsul/1154ffa36d8c90cca321948b3c7583f5 to your computer and use it in GitHub Desktop.
A stupidly simple fuzzy search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// @flow | |
import * as _ from 'lodash' | |
/////////////////// | |
// FILTER & SORT // | |
/////////////////// | |
type FilterAndSortType = ({ | |
data: Array<*>, | |
getTargetText?: ({ item: * }) => string, | |
limit?: number, | |
searchText: string, | |
sortIteratees?: Array<(match: Object) => number>, | |
}) => Array<*> | |
export const filterAndSort: FilterAndSortType = ({ | |
data, | |
getTargetText = ({ item }) => item, | |
limit = data.length, | |
searchText, | |
sortIteratees = [() => 0], | |
}) => { | |
if (!searchText) { | |
if (limit !== data.length) { | |
return data.slice(0, limit) | |
} | |
return data | |
} | |
return ( | |
_.chain(data) | |
// Get matches for each item | |
.map(item => | |
getMatch({ | |
item, | |
searchText, | |
targetText: getTargetText({ item }), | |
}) | |
) | |
// Filter out non-matches | |
.filter(match => { | |
console.debug( | |
'Match of %o in %o: isExact=%o isFuzzy=%o coefficient=%o', | |
match.searchText, | |
match.targetText, | |
match.isExact, | |
match.isFuzzy, | |
match.coefficient | |
) | |
return match.isExact || match.isFuzzy | |
// TODO: If it's a fuzzy match and only one character matches | |
// when there are multiple words in the search text, ignore it | |
}) | |
// Sort by the coefficient, then by fuzzy matches, and then exact matches | |
.sortBy(match => match.coefficient * -1) | |
.sortBy(match => (match.isFuzzy ? -1 : 1)) | |
.sortBy(match => (match.isExact ? -1 : 1)) | |
// Finally, sort by the number of exact matches in the pairs of matches | |
.sortBy(match => | |
match.pairs.reduce((accumulator, pair) => { | |
if (typeof pair !== 'string') { | |
return accumulator - (pair.isExact ? 2 : 1) | |
} | |
return accumulator | |
}, 0) | |
) | |
// Sort by all custom sorters | |
.sortBy(...sortIteratees) | |
// Slice down to the limit | |
.slice(0, limit) | |
// Pull out the original item and then return the final value | |
.map(match => match.item) | |
.value() | |
) | |
} | |
/////////// | |
// SPLIT // | |
/////////// | |
type SplitByMatchesType = ({ | |
searchText: string, | |
targetText: string, | |
}) => Array<string | [string, string, string]> | |
export const splitByMatches: SplitByMatchesType = ({ | |
searchText, | |
targetText, | |
}) => { | |
if (!searchText) { | |
return [targetText] | |
} | |
const match = getMatch({ item: targetText, searchText, targetText }) | |
const matchedWords = match.pairs.map(pair => { | |
if (typeof pair === 'string') { | |
return pair | |
} | |
const indexOfMatchText = pair.targetWord.indexOf(pair.matchText) | |
const preMatchSliceEnd = indexOfMatchText > 0 ? indexOfMatchText : 0 | |
const postMatchSliceStart = preMatchSliceEnd + pair.matchText.length | |
return [ | |
pair.targetWord.slice(0, preMatchSliceEnd), | |
pair.matchText, | |
pair.targetWord.slice(postMatchSliceStart), | |
] | |
}) | |
return reduceConsecutiveStrings({ values: matchedWords }) | |
} | |
type ReduceConsecutiveStringsType = ({ values: Array<*> }) => Array<*> | |
const reduceConsecutiveStrings: ReduceConsecutiveStringsType = ({ | |
values, | |
}) => | |
values.reduce((accumulator, value, index) => { | |
if (typeof value === 'string' && typeof _.last(accumulator) === 'string') { | |
accumulator[accumulator.length - 1] = _.last(accumulator) + value | |
} else { | |
accumulator.push(value) | |
} | |
return accumulator | |
}, []) | |
//////////////////////// | |
// WORDS & SEPARATORS // | |
//////////////////////// | |
const SEPARATOR_CHARACTERS = [',', '.', '-', ' ', '|', '/'] | |
const REGEXP_SEPARATORS = new RegExp(`[\\${SEPARATOR_CHARACTERS.join('\\')}]+`) | |
const REGEXP_SEPARATORS_CAPTURING = new RegExp( | |
`([\\${SEPARATOR_CHARACTERS.join('\\')}])` | |
) | |
const splitWords = ({ text, isCapturingSeparators }) => { | |
const regExp = isCapturingSeparators | |
? REGEXP_SEPARATORS_CAPTURING | |
: REGEXP_SEPARATORS | |
return text.split(regExp).filter(value => value) | |
} | |
const hasAnySeparator = ({ word }) => | |
SEPARATOR_CHARACTERS.some(character => word.includes(character)) | |
/////////// | |
// MATCH // | |
/////////// | |
type SearchWordMatchType = { | |
index: number, | |
coefficient: number, | |
matchText: string, | |
searchWord: string, | |
targetWord: string, | |
} | |
type SearchMatchType = { | |
item: any, | |
pairs: Array<string | SearchWordMatchType>, | |
searchText: string, | |
searchWords: string[], | |
targetText: string, | |
targetWords: string[], | |
coefficient: number, | |
isExact: boolean, | |
isFuzzy: boolean, | |
} | |
const getMatch = ({ item, searchText, targetText }): SearchMatchType => { | |
const searchWords = splitWords({ | |
text: searchText, | |
isCapturingSeparators: false, | |
}) | |
const targetWords = splitWords({ | |
text: targetText, | |
isCapturingSeparators: true, | |
}) | |
const targetWordsWithoutSeparators = splitWords({ | |
text: targetText, | |
isCapturingSeparators: false, | |
}) | |
const pairs = (targetWords.slice(0): any) | |
searchWords.forEach(searchWord => { | |
const match = getBestWordMatch({ pairs, searchWord, targetWords }) | |
if (match) { | |
pairs[match.index] = match | |
} | |
}) | |
return { | |
item, | |
pairs, | |
searchText, | |
searchWords, | |
targetText, | |
targetWords, | |
coefficient: getJaccardCoefficient({ | |
matchText: pairs | |
.map(pair => (typeof pair === 'string' ? '' : pair.matchText)) | |
.join(''), | |
searchWord: searchWords.join(''), | |
targetWord: targetWordsWithoutSeparators.join(''), | |
}), | |
isExact: normalize({ string: targetText }).startsWith( | |
normalize({ string: searchText }) | |
), | |
isFuzzy: targetWords.some(targetWord => | |
searchWords.some(searchWord => | |
normalize({ string: targetWord }).includes( | |
normalize({ string: searchWord }) | |
) | |
) | |
), | |
} | |
} | |
const getBestWordMatch = ({ | |
pairs, | |
searchWord, | |
targetWords, | |
}): ?SearchWordMatchType => | |
_.chain(targetWords) | |
.map((targetWord, index) => | |
getWordMatch({ pairs, index, searchWord, targetWord }) | |
) | |
.filter(match => match.coefficient) | |
.sortBy(match => match.coefficient * -1) | |
.sortBy(match => (match.isExact ? -1 : 1)) | |
.first() | |
.value() | |
const getWordMatch = ({ | |
pairs, | |
index, | |
searchWord, | |
targetWord, | |
}): SearchWordMatchType => { | |
const matchText = | |
typeof pairs[index] === 'string' | |
? getWordMatchText({ searchWord, targetWord }) | |
: '' | |
return { | |
index, | |
coefficient: getJaccardCoefficient({ matchText, searchWord, targetWord }), | |
isExact: normalize({ string: targetWord }).startsWith( | |
normalize({ string: searchWord }) | |
), | |
matchText, | |
searchWord, | |
targetWord, | |
} | |
} | |
const getWordMatchText = ({ searchWord, targetWord }) => { | |
if (SEPARATOR_CHARACTERS.includes(targetWord)) { | |
return '' | |
} | |
if ( | |
hasAnySeparator({ word: targetWord }) || | |
hasAnySeparator({ word: searchWord }) | |
) { | |
throw new Error( | |
'Neither the `targetWord` nor the `searchWord` cannot contain separator characters' | |
) | |
} | |
const searchWordNormalized = normalize({ string: searchWord }) | |
const targetWordNormalized = normalize({ string: targetWord }) | |
let matchText = '' | |
let searchIndex = 0 | |
let targetIndex = 0 | |
let searchCharacter = null | |
let targetCharacter = null | |
while ( | |
(searchCharacter = searchWordNormalized[searchIndex]) && | |
(targetCharacter = targetWordNormalized[targetIndex]) | |
) { | |
if (searchCharacter === targetCharacter) { | |
matchText += targetWord[targetIndex] | |
searchIndex += 1 | |
} else if (matchText.length) { | |
return matchText | |
} | |
targetIndex += 1 | |
} | |
return matchText | |
} | |
const getJaccardCoefficient = ({ matchText, searchWord, targetWord }) => { | |
if (!matchText) { | |
return 0 | |
} | |
const intersection = matchText.length | |
const union = targetWord.length + searchWord.length - intersection | |
return intersection / union | |
} | |
///////////// | |
// HELPERS // | |
///////////// | |
// https://stackoverflow.com/questions/990904/remove-accents-diacritics-in-a-string-in-javascript/37511463#37511463 | |
export const normalize = ({ string }: { string: string }): string => | |
string | |
.normalize('NFD') | |
.replace(/[\u0300-\u036f]/g, '') | |
.replace(/['"“”’‘.,]/g, '') | |
.replace(/-/g, ' ') | |
.toLowerCase() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment