Skip to content

Instantly share code, notes, and snippets.

@ArtemAvramenko
Last active October 23, 2024 10:06
Show Gist options
  • Save ArtemAvramenko/4cb410ea021f8f16febad2d041c4f935 to your computer and use it in GitHub Desktop.
Save ArtemAvramenko/4cb410ea021f8f16febad2d041c4f935 to your computer and use it in GitHub Desktop.
A script for a heuristic search for a similar name
const expect = actual => {
const toBe = expected => {
let [a, b] = [actual, expected].map(JSON.stringify);
if (a !== b) {
console.error(a + ' should be ' + b);
}
};
return { toBe, toEqual: toBe };
};
class FullTextSearch {
static getWords(s) {
return s
.normalize('NFC')
.toUpperCase()
.replace(/(?:\s|[&().,:;/\\<=>[\]{|}~\-\u2010\u2012\u00AD\u2013\u2014\u2043\u2212\uFE58\uFE63\uFF0D])+/g, ' ')
.split(/ /g)
.filter(x => !!x);
}
// artcles from Germanic/Romance languages, whose match is less important than the rest of the words
static isArticle = (() => {
const res = { };
'a|as|das|de|der|des|die|du|ein|eine|el|la|las|le|les|lo|los|o|os|the|um|uma|umas|un|una|unas|une|uno|unos|uns'
.match(/[^|]+/g)
.forEach(s => res[s.toUpperCase()] = true);
return res
})();
static getScore(oldWords, newWordCount, newWordPositions) {
const oldWordCount = oldWords.length;
let oldLeftPos = 0;
let score = 0;
// look at every word in an existing string
for (let oldWord of oldWords) {
const newLeftPos = newWordPositions[oldWord];
// found a matching word
if (newLeftPos != null) {
// find the distance between the word positions
const newRightPos = newWordCount - newLeftPos - 1;
const oldRightPos = oldWordCount - oldLeftPos - 1;
const posDiff = Math.min(
Math.abs(newLeftPos - oldLeftPos),
Math.abs(newRightPos - oldRightPos));
// the closer the words are spaced, the higher the score
score += 1000 - posDiff;
// rank stoplist word much lower
if (this.isArticle[oldWord]) {
score /= 5;
}
}
oldLeftPos++;
}
return score;
}
static searchNearestString(oldStrings, newString) {
// check arguments
let res = null;
if (!newString || !oldStrings) {
return res;
}
// search the exact match
for (let oldString of oldStrings) {
if (oldString === newString) {
return oldString;
}
}
// split new string to words
const newWordPositions = {};
let newWordCount = 0;
for (const word of this.getWords(newString)) {
newWordPositions[word] = newWordCount++;
}
// find the most relevant string
let resScore = 0;
let resWordDiff = 0;
for (let oldString of oldStrings) {
const oldWords = this.getWords(oldString);
const score = this.getScore(oldWords, newWordCount, newWordPositions);
if (score >= resScore) {
const wordDiff = Math.abs(oldWords.length - newWordCount);
if (score > resScore || wordDiff < resWordDiff) {
res = oldString;
resScore = score;
resWordDiff = wordDiff;
}
}
}
return res;
}
}
console.clear();
expect(FullTextSearch.searchNearestString(
['tmetric - desktop - design', 'tmetric - web'],
'tmetric - extensions'))
.toBe('tmetric - web');
expect(FullTextSearch.searchNearestString(
['web - tmetric', 'tmetric - web'],
'tmetric - extensions'))
.toBe('tmetric - web');
expect(FullTextSearch.searchNearestString(
['tmetric - web', 'web - tmetric'],
'tmetric - extensions'))
.toBe('tmetric - web');
expect(FullTextSearch.searchNearestString(
['Spring Projects - CSV Import - Accounts Payable', 'Acme - Clients - John Doe'],
'Spring Projects - CSV Import - Accounts Receivable, John Doe'))
.toBe('Spring Projects - CSV Import - Accounts Payable');
expect(FullTextSearch.searchNearestString(
['Design - Ideas', 'Design - Form Responses'],
'Design - Mark Moe'))
.toBe('Design - Form Responses');
expect(FullTextSearch.searchNearestString(
['The Marketplace', 'TMetric'],
'The TMetric'))
.toBe('TMetric');
expect(FullTextSearch.searchNearestString(
['TMetric', 'The TMetric'],
'The TMetric'))
.toBe('The TMetric');
expect(FullTextSearch.searchNearestString(
['TMetric', 'The TMetric'],
'TMetric'))
.toBe('TMetric');
expect(FullTextSearch.searchNearestString(
['Design: TMetric', 'TMetric Extensions'],
'TMetric Design'))
.toBe('Design: TMetric');
expect(FullTextSearch.searchNearestString([], undefined)).toBe(null);
expect(FullTextSearch.searchNearestString(undefined, 'x')).toBe(null);
expect(FullTextSearch.searchNearestString(['a'], 'x')).toBe(null);
expect(FullTextSearch.getWords('TMetric - Design')).toEqual(['TMETRIC', 'DESIGN']);
expect(FullTextSearch.getWords('TMetric: Version 1')).toEqual(['TMETRIC', 'VERSION', '1']);
expect(FullTextSearch.getWords(' TMetric(v1) ')).toEqual(['TMETRIC', 'V1']);
expect(FullTextSearch.getWords('Проєкт Straße', '')).toEqual(['ПРОЄКТ', 'STRASSE']);
expect(FullTextSearch.getWords('TMetric - Design')).toEqual(['TMETRIC', 'DESIGN']);
expect(FullTextSearch.getWords('TMetric - Design')).toEqual(['TMETRIC', 'DESIGN']);
console.log('tests completed');
// Design - Ideas
// Design - Form Responses
// Design - Ashley Webster
// Spring Projects - CSV Import - Accounts Payable
// Spring Projects - Orders
// Spring Projects - CSV Import - Accounts Receivable, Elizabeth Shore
// Circa - Clients - Elizabeth Shore
// Circa - Clients - Stan Edwards
// Circa - Task Management - Backlog
// Circa - Task Management - Stage 2
// Circa - Task Management - Action Items
// Spring Projects - CSV Import - Accounts Payable
// Circa - Clients - Elizabeth Shore
// Spring Projects - CSV Import - Accounts Receivable, Elizabeth Shore
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment