Last active
October 23, 2024 10:06
-
-
Save ArtemAvramenko/4cb410ea021f8f16febad2d041c4f935 to your computer and use it in GitHub Desktop.
A script for a heuristic search for a similar name
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const expect = actual => { | |
const toBe = expected => { | |
let [a, b] = [actual, expected].map(JSON.stringify); | |
if (a !== b) { | |
console.error(a + ' should be ' + b); | |
} | |
}; | |
return { toBe, toEqual: toBe }; | |
}; | |
class FullTextSearch { | |
static getWords(s) { | |
return s | |
.normalize('NFC') | |
.toUpperCase() | |
.replace(/(?:\s|[&().,:;/\\<=>[\]{|}~\-\u2010\u2012\u00AD\u2013\u2014\u2043\u2212\uFE58\uFE63\uFF0D])+/g, ' ') | |
.split(/ /g) | |
.filter(x => !!x); | |
} | |
// artcles from Germanic/Romance languages, whose match is less important than the rest of the words | |
static isArticle = (() => { | |
const res = { }; | |
'a|as|das|de|der|des|die|du|ein|eine|el|la|las|le|les|lo|los|o|os|the|um|uma|umas|un|una|unas|une|uno|unos|uns' | |
.match(/[^|]+/g) | |
.forEach(s => res[s.toUpperCase()] = true); | |
return res | |
})(); | |
static getScore(oldWords, newWordCount, newWordPositions) { | |
const oldWordCount = oldWords.length; | |
let oldLeftPos = 0; | |
let score = 0; | |
// look at every word in an existing string | |
for (let oldWord of oldWords) { | |
const newLeftPos = newWordPositions[oldWord]; | |
// found a matching word | |
if (newLeftPos != null) { | |
// find the distance between the word positions | |
const newRightPos = newWordCount - newLeftPos - 1; | |
const oldRightPos = oldWordCount - oldLeftPos - 1; | |
const posDiff = Math.min( | |
Math.abs(newLeftPos - oldLeftPos), | |
Math.abs(newRightPos - oldRightPos)); | |
// the closer the words are spaced, the higher the score | |
score += 1000 - posDiff; | |
// rank stoplist word much lower | |
if (this.isArticle[oldWord]) { | |
score /= 5; | |
} | |
} | |
oldLeftPos++; | |
} | |
return score; | |
} | |
static searchNearestString(oldStrings, newString) { | |
// check arguments | |
let res = null; | |
if (!newString || !oldStrings) { | |
return res; | |
} | |
// search the exact match | |
for (let oldString of oldStrings) { | |
if (oldString === newString) { | |
return oldString; | |
} | |
} | |
// split new string to words | |
const newWordPositions = {}; | |
let newWordCount = 0; | |
for (const word of this.getWords(newString)) { | |
newWordPositions[word] = newWordCount++; | |
} | |
// find the most relevant string | |
let resScore = 0; | |
let resWordDiff = 0; | |
for (let oldString of oldStrings) { | |
const oldWords = this.getWords(oldString); | |
const score = this.getScore(oldWords, newWordCount, newWordPositions); | |
if (score >= resScore) { | |
const wordDiff = Math.abs(oldWords.length - newWordCount); | |
if (score > resScore || wordDiff < resWordDiff) { | |
res = oldString; | |
resScore = score; | |
resWordDiff = wordDiff; | |
} | |
} | |
} | |
return res; | |
} | |
} | |
console.clear(); | |
expect(FullTextSearch.searchNearestString( | |
['tmetric - desktop - design', 'tmetric - web'], | |
'tmetric - extensions')) | |
.toBe('tmetric - web'); | |
expect(FullTextSearch.searchNearestString( | |
['web - tmetric', 'tmetric - web'], | |
'tmetric - extensions')) | |
.toBe('tmetric - web'); | |
expect(FullTextSearch.searchNearestString( | |
['tmetric - web', 'web - tmetric'], | |
'tmetric - extensions')) | |
.toBe('tmetric - web'); | |
expect(FullTextSearch.searchNearestString( | |
['Spring Projects - CSV Import - Accounts Payable', 'Acme - Clients - John Doe'], | |
'Spring Projects - CSV Import - Accounts Receivable, John Doe')) | |
.toBe('Spring Projects - CSV Import - Accounts Payable'); | |
expect(FullTextSearch.searchNearestString( | |
['Design - Ideas', 'Design - Form Responses'], | |
'Design - Mark Moe')) | |
.toBe('Design - Form Responses'); | |
expect(FullTextSearch.searchNearestString( | |
['The Marketplace', 'TMetric'], | |
'The TMetric')) | |
.toBe('TMetric'); | |
expect(FullTextSearch.searchNearestString( | |
['TMetric', 'The TMetric'], | |
'The TMetric')) | |
.toBe('The TMetric'); | |
expect(FullTextSearch.searchNearestString( | |
['TMetric', 'The TMetric'], | |
'TMetric')) | |
.toBe('TMetric'); | |
expect(FullTextSearch.searchNearestString( | |
['Design: TMetric', 'TMetric Extensions'], | |
'TMetric Design')) | |
.toBe('Design: TMetric'); | |
expect(FullTextSearch.searchNearestString([], undefined)).toBe(null); | |
expect(FullTextSearch.searchNearestString(undefined, 'x')).toBe(null); | |
expect(FullTextSearch.searchNearestString(['a'], 'x')).toBe(null); | |
expect(FullTextSearch.getWords('TMetric - Design')).toEqual(['TMETRIC', 'DESIGN']); | |
expect(FullTextSearch.getWords('TMetric: Version 1')).toEqual(['TMETRIC', 'VERSION', '1']); | |
expect(FullTextSearch.getWords(' TMetric(v1) ')).toEqual(['TMETRIC', 'V1']); | |
expect(FullTextSearch.getWords('Проєкт Straße', '')).toEqual(['ПРОЄКТ', 'STRASSE']); | |
expect(FullTextSearch.getWords('TMetric - Design')).toEqual(['TMETRIC', 'DESIGN']); | |
expect(FullTextSearch.getWords('TMetric - Design')).toEqual(['TMETRIC', 'DESIGN']); | |
console.log('tests completed'); | |
// Design - Ideas | |
// Design - Form Responses | |
// Design - Ashley Webster | |
// Spring Projects - CSV Import - Accounts Payable | |
// Spring Projects - Orders | |
// Spring Projects - CSV Import - Accounts Receivable, Elizabeth Shore | |
// Circa - Clients - Elizabeth Shore | |
// Circa - Clients - Stan Edwards | |
// Circa - Task Management - Backlog | |
// Circa - Task Management - Stage 2 | |
// Circa - Task Management - Action Items | |
// Spring Projects - CSV Import - Accounts Payable | |
// Circa - Clients - Elizabeth Shore | |
// Spring Projects - CSV Import - Accounts Receivable, Elizabeth Shore |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment