amsul · July 28, 2024 11:18
diff --git a/searchUtil.js b/searchUtil.js
 // @flow

 import * as _ from 'lodash'

 ///////////////////
 // FILTER & SORT //
 ///////////////////

 type FilterAndSortType = ({
  data: Array<*>,
  getTargetText?: ({ item: * }) => string,
  limit?: number,
  searchText: string,
  sortIteratees?: Array<(match: Object) => number>,
 }) => Array<*>

 export const filterAndSort: FilterAndSortType = ({
  data,
  getTargetText = ({ item }) => item,
  limit = data.length,
  searchText,
  sortIteratees = [() => 0],
 }) => {
  if (!searchText) {
    if (limit !== data.length) {
      return data.slice(0, limit)
    }
    return data
  }

  return (
    _.chain(data)

      // Get matches for each item
      .map(item =>
        getMatch({
          item,
          searchText,
          targetText: getTargetText({ item }),
        })
      )

      // Filter out non-matches
      .filter(match => {
        console.debug(
          'Match of %o in %o: isExact=%o isFuzzy=%o coefficient=%o',
          match.searchText,
          match.targetText,
          match.isExact,
          match.isFuzzy,
          match.coefficient
        )
        return match.isExact || match.isFuzzy
        // TODO: If it's a fuzzy match and only one character matches
        //       when there are multiple words in the search text, ignore it
      })

      // Sort by the coefficient, then by fuzzy matches, and then exact matches
      .sortBy(match => match.coefficient * -1)
      .sortBy(match => (match.isFuzzy ? -1 : 1))
      .sortBy(match => (match.isExact ? -1 : 1))

      // Finally, sort by the number of exact matches in the pairs of matches
      .sortBy(match =>
        match.pairs.reduce((accumulator, pair) => {
          if (typeof pair !== 'string') {
            return accumulator - (pair.isExact ? 2 : 1)
          }
          return accumulator
        }, 0)
      )

      // Sort by all custom sorters
      .sortBy(...sortIteratees)

      // Slice down to the limit
      .slice(0, limit)

      // Pull out the original item and then return the final value
      .map(match => match.item)
      .value()
  )
 }

 ///////////
 // SPLIT //
 ///////////

 type SplitByMatchesType = ({
  searchText: string,
  targetText: string,
 }) => Array<string | [string, string, string]>

 export const splitByMatches: SplitByMatchesType = ({
  searchText,
  targetText,
 }) => {
  if (!searchText) {
    return [targetText]
  }

  const match = getMatch({ item: targetText, searchText, targetText })
  const matchedWords = match.pairs.map(pair => {
    if (typeof pair === 'string') {
      return pair
    }
    const indexOfMatchText = pair.targetWord.indexOf(pair.matchText)
    const preMatchSliceEnd = indexOfMatchText > 0 ? indexOfMatchText : 0
    const postMatchSliceStart = preMatchSliceEnd + pair.matchText.length
    return [
      pair.targetWord.slice(0, preMatchSliceEnd),
      pair.matchText,
      pair.targetWord.slice(postMatchSliceStart),
    ]
  })

  return reduceConsecutiveStrings({ values: matchedWords })
 }

 type ReduceConsecutiveStringsType = ({ values: Array<*> }) => Array<*>

 const reduceConsecutiveStrings: ReduceConsecutiveStringsType = ({
  values,
 }) =>
  values.reduce((accumulator, value, index) => {
    if (typeof value === 'string' && typeof _.last(accumulator) === 'string') {
      accumulator[accumulator.length - 1] = _.last(accumulator) + value
    } else {
      accumulator.push(value)
    }
    return accumulator
  }, [])

 ////////////////////////
 // WORDS & SEPARATORS //
 ////////////////////////

 const SEPARATOR_CHARACTERS = [',', '.', '-', ' ', '|', '/']
 const REGEXP_SEPARATORS = new RegExp(`[\\${SEPARATOR_CHARACTERS.join('\\')}]+`)
 const REGEXP_SEPARATORS_CAPTURING = new RegExp(
  `([\\${SEPARATOR_CHARACTERS.join('\\')}])`
 )

 const splitWords = ({ text, isCapturingSeparators }) => {
  const regExp = isCapturingSeparators
    ? REGEXP_SEPARATORS_CAPTURING
    : REGEXP_SEPARATORS
  return text.split(regExp).filter(value => value)
 }

 const hasAnySeparator = ({ word }) =>
  SEPARATOR_CHARACTERS.some(character => word.includes(character))

 ///////////
 // MATCH //
 ///////////

 type SearchWordMatchType = {
  index: number,
  coefficient: number,
  matchText: string,
  searchWord: string,
  targetWord: string,
 }

 type SearchMatchType = {
  item: any,
  pairs: Array<string | SearchWordMatchType>,
  searchText: string,
  searchWords: string[],
  targetText: string,
  targetWords: string[],
  coefficient: number,
  isExact: boolean,
  isFuzzy: boolean,
 }

 const getMatch = ({ item, searchText, targetText }): SearchMatchType => {
  const searchWords = splitWords({
    text: searchText,
    isCapturingSeparators: false,
  })
  const targetWords = splitWords({
    text: targetText,
    isCapturingSeparators: true,
  })
  const targetWordsWithoutSeparators = splitWords({
    text: targetText,
    isCapturingSeparators: false,
  })

  const pairs = (targetWords.slice(0): any)
  searchWords.forEach(searchWord => {
    const match = getBestWordMatch({ pairs, searchWord, targetWords })
    if (match) {
      pairs[match.index] = match
    }
  })

  return {
    item,
    pairs,
    searchText,
    searchWords,
    targetText,
    targetWords,
    coefficient: getJaccardCoefficient({
      matchText: pairs
        .map(pair => (typeof pair === 'string' ? '' : pair.matchText))
        .join(''),
      searchWord: searchWords.join(''),
      targetWord: targetWordsWithoutSeparators.join(''),
    }),
    isExact: normalize({ string: targetText }).startsWith(
      normalize({ string: searchText })
    ),
    isFuzzy: targetWords.some(targetWord =>
      searchWords.some(searchWord =>
        normalize({ string: targetWord }).includes(
          normalize({ string: searchWord })
        )
      )
    ),
  }
 }

 const getBestWordMatch = ({
  pairs,
  searchWord,
  targetWords,
 }): ?SearchWordMatchType =>
  _.chain(targetWords)
    .map((targetWord, index) =>
      getWordMatch({ pairs, index, searchWord, targetWord })
    )
    .filter(match => match.coefficient)
    .sortBy(match => match.coefficient * -1)
    .sortBy(match => (match.isExact ? -1 : 1))
    .first()
    .value()

 const getWordMatch = ({
  pairs,
  index,
  searchWord,
  targetWord,
 }): SearchWordMatchType => {
  const matchText =
    typeof pairs[index] === 'string'
      ? getWordMatchText({ searchWord, targetWord })
      : ''
  return {
    index,
    coefficient: getJaccardCoefficient({ matchText, searchWord, targetWord }),
    isExact: normalize({ string: targetWord }).startsWith(
      normalize({ string: searchWord })
    ),
    matchText,
    searchWord,
    targetWord,
  }
 }

 const getWordMatchText = ({ searchWord, targetWord }) => {
  if (SEPARATOR_CHARACTERS.includes(targetWord)) {
    return ''
  }

  if (
    hasAnySeparator({ word: targetWord }) ||
    hasAnySeparator({ word: searchWord })
  ) {
    throw new Error(
      'Neither the `targetWord` nor the `searchWord` cannot contain separator characters'
    )
  }

  const searchWordNormalized = normalize({ string: searchWord })
  const targetWordNormalized = normalize({ string: targetWord })

  let matchText = ''
  let searchIndex = 0
  let targetIndex = 0
  let searchCharacter = null
  let targetCharacter = null

  while (
    (searchCharacter = searchWordNormalized[searchIndex]) &&
    (targetCharacter = targetWordNormalized[targetIndex])
  ) {
    if (searchCharacter === targetCharacter) {
      matchText += targetWord[targetIndex]
      searchIndex += 1
    } else if (matchText.length) {
      return matchText
    }
    targetIndex += 1
  }

  return matchText
 }

 const getJaccardCoefficient = ({ matchText, searchWord, targetWord }) => {
  if (!matchText) {
    return 0
  }
  const intersection = matchText.length
  const union = targetWord.length + searchWord.length - intersection
  return intersection / union
 }

 /////////////
 // HELPERS //
 /////////////

 // https://stackoverflow.com/questions/990904/remove-accents-diacritics-in-a-string-in-javascript/37511463#37511463
 export const normalize = ({ string }: { string: string }): string =>
  string
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/['"“”’‘.,]/g, '')
    .replace(/-/g, ' ')
    .toLowerCase()
	// @flow

	import * as _ from 'lodash'

	///////////////////
	// FILTER & SORT //
	///////////////////

	type FilterAndSortType = ({
	data: Array<*>,
	getTargetText?: ({ item: * }) => string,
	limit?: number,
	searchText: string,
	sortIteratees?: Array<(match: Object) => number>,
	}) => Array<*>

	export const filterAndSort: FilterAndSortType = ({
	data,
	getTargetText = ({ item }) => item,
	limit = data.length,
	searchText,
	sortIteratees = [() => 0],
	}) => {
	if (!searchText) {
	if (limit !== data.length) {
	return data.slice(0, limit)
	}
	return data
	}

	return (
	_.chain(data)

	// Get matches for each item
	.map(item =>
	getMatch({
	item,
	searchText,
	targetText: getTargetText({ item }),
	})
	)

	// Filter out non-matches
	.filter(match => {
	console.debug(
	'Match of %o in %o: isExact=%o isFuzzy=%o coefficient=%o',
	match.searchText,
	match.targetText,
	match.isExact,
	match.isFuzzy,
	match.coefficient
	)
	return match.isExact \|\| match.isFuzzy
	// TODO: If it's a fuzzy match and only one character matches
	// when there are multiple words in the search text, ignore it
	})

	// Sort by the coefficient, then by fuzzy matches, and then exact matches
	.sortBy(match => match.coefficient * -1)
	.sortBy(match => (match.isFuzzy ? -1 : 1))
	.sortBy(match => (match.isExact ? -1 : 1))

	// Finally, sort by the number of exact matches in the pairs of matches
	.sortBy(match =>
	match.pairs.reduce((accumulator, pair) => {
	if (typeof pair !== 'string') {
	return accumulator - (pair.isExact ? 2 : 1)
	}
	return accumulator
	}, 0)
	)

	// Sort by all custom sorters
	.sortBy(...sortIteratees)

	// Slice down to the limit
	.slice(0, limit)

	// Pull out the original item and then return the final value
	.map(match => match.item)
	.value()
	)
	}

	///////////
	// SPLIT //
	///////////

	type SplitByMatchesType = ({
	searchText: string,
	targetText: string,
	}) => Array<string \| [string, string, string]>

	export const splitByMatches: SplitByMatchesType = ({
	searchText,
	targetText,
	}) => {
	if (!searchText) {
	return [targetText]
	}

	const match = getMatch({ item: targetText, searchText, targetText })
	const matchedWords = match.pairs.map(pair => {
	if (typeof pair === 'string') {
	return pair
	}
	const indexOfMatchText = pair.targetWord.indexOf(pair.matchText)
	const preMatchSliceEnd = indexOfMatchText > 0 ? indexOfMatchText : 0
	const postMatchSliceStart = preMatchSliceEnd + pair.matchText.length
	return [
	pair.targetWord.slice(0, preMatchSliceEnd),
	pair.matchText,
	pair.targetWord.slice(postMatchSliceStart),
	]
	})

	return reduceConsecutiveStrings({ values: matchedWords })
	}

	type ReduceConsecutiveStringsType = ({ values: Array<> }) => Array<>

	const reduceConsecutiveStrings: ReduceConsecutiveStringsType = ({
	values,
	}) =>
	values.reduce((accumulator, value, index) => {
	if (typeof value === 'string' && typeof _.last(accumulator) === 'string') {
	accumulator[accumulator.length - 1] = _.last(accumulator) + value
	} else {
	accumulator.push(value)
	}
	return accumulator
	}, [])

	////////////////////////
	// WORDS & SEPARATORS //
	////////////////////////

	const SEPARATOR_CHARACTERS = [',', '.', '-', ' ', '\|', '/']
	const REGEXP_SEPARATORS = new RegExp(`[\\${SEPARATOR_CHARACTERS.join('\\')}]+`)
	const REGEXP_SEPARATORS_CAPTURING = new RegExp(
	`([\\${SEPARATOR_CHARACTERS.join('\\')}])`
	)

	const splitWords = ({ text, isCapturingSeparators }) => {
	const regExp = isCapturingSeparators
	? REGEXP_SEPARATORS_CAPTURING
	: REGEXP_SEPARATORS
	return text.split(regExp).filter(value => value)
	}

	const hasAnySeparator = ({ word }) =>
	SEPARATOR_CHARACTERS.some(character => word.includes(character))

	///////////
	// MATCH //
	///////////

	type SearchWordMatchType = {
	index: number,
	coefficient: number,
	matchText: string,
	searchWord: string,
	targetWord: string,
	}

	type SearchMatchType = {
	item: any,
	pairs: Array<string \| SearchWordMatchType>,
	searchText: string,
	searchWords: string[],
	targetText: string,
	targetWords: string[],
	coefficient: number,
	isExact: boolean,
	isFuzzy: boolean,
	}

	const getMatch = ({ item, searchText, targetText }): SearchMatchType => {
	const searchWords = splitWords({
	text: searchText,
	isCapturingSeparators: false,
	})
	const targetWords = splitWords({
	text: targetText,
	isCapturingSeparators: true,
	})
	const targetWordsWithoutSeparators = splitWords({
	text: targetText,
	isCapturingSeparators: false,
	})

	const pairs = (targetWords.slice(0): any)
	searchWords.forEach(searchWord => {
	const match = getBestWordMatch({ pairs, searchWord, targetWords })
	if (match) {
	pairs[match.index] = match
	}
	})

	return {
	item,
	pairs,
	searchText,
	searchWords,
	targetText,
	targetWords,
	coefficient: getJaccardCoefficient({
	matchText: pairs
	.map(pair => (typeof pair === 'string' ? '' : pair.matchText))
	.join(''),
	searchWord: searchWords.join(''),
	targetWord: targetWordsWithoutSeparators.join(''),
	}),
	isExact: normalize({ string: targetText }).startsWith(
	normalize({ string: searchText })
	),
	isFuzzy: targetWords.some(targetWord =>
	searchWords.some(searchWord =>
	normalize({ string: targetWord }).includes(
	normalize({ string: searchWord })
	)
	)
	),
	}
	}

	const getBestWordMatch = ({
	pairs,
	searchWord,
	targetWords,
	}): ?SearchWordMatchType =>
	_.chain(targetWords)
	.map((targetWord, index) =>
	getWordMatch({ pairs, index, searchWord, targetWord })
	)
	.filter(match => match.coefficient)
	.sortBy(match => match.coefficient * -1)
	.sortBy(match => (match.isExact ? -1 : 1))
	.first()
	.value()

	const getWordMatch = ({
	pairs,
	index,
	searchWord,
	targetWord,
	}): SearchWordMatchType => {
	const matchText =
	typeof pairs[index] === 'string'
	? getWordMatchText({ searchWord, targetWord })
	: ''
	return {
	index,
	coefficient: getJaccardCoefficient({ matchText, searchWord, targetWord }),
	isExact: normalize({ string: targetWord }).startsWith(
	normalize({ string: searchWord })
	),
	matchText,
	searchWord,
	targetWord,
	}
	}

	const getWordMatchText = ({ searchWord, targetWord }) => {
	if (SEPARATOR_CHARACTERS.includes(targetWord)) {
	return ''
	}

	if (
	hasAnySeparator({ word: targetWord }) \|\|
	hasAnySeparator({ word: searchWord })
	) {
	throw new Error(
	'Neither the `targetWord` nor the `searchWord` cannot contain separator characters'
	)
	}

	const searchWordNormalized = normalize({ string: searchWord })
	const targetWordNormalized = normalize({ string: targetWord })

	let matchText = ''
	let searchIndex = 0
	let targetIndex = 0
	let searchCharacter = null
	let targetCharacter = null

	while (
	(searchCharacter = searchWordNormalized[searchIndex]) &&
	(targetCharacter = targetWordNormalized[targetIndex])
	) {
	if (searchCharacter === targetCharacter) {
	matchText += targetWord[targetIndex]
	searchIndex += 1
	} else if (matchText.length) {
	return matchText
	}
	targetIndex += 1
	}

	return matchText
	}

	const getJaccardCoefficient = ({ matchText, searchWord, targetWord }) => {
	if (!matchText) {
	return 0
	}
	const intersection = matchText.length
	const union = targetWord.length + searchWord.length - intersection
	return intersection / union
	}

	/////////////
	// HELPERS //
	/////////////

	// https://stackoverflow.com/questions/990904/remove-accents-diacritics-in-a-string-in-javascript/37511463#37511463
	export const normalize = ({ string }: { string: string }): string =>
	string
	.normalize('NFD')
	.replace(/[\u0300-\u036f]/g, '')
	.replace(/['"“”’‘.,]/g, '')
	.replace(/-/g, ' ')
	.toLowerCase()