Created
February 28, 2019 21:15
-
-
Save SergProduction/96134c0770880380d65a147b0225416f to your computer and use it in GitHub Desktop.
tf-idf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
https://ru.stackoverflow.com/questions/664746/%D0%9A%D0%BE%D1%81%D0%B8%D0%BD%D1%83%D1%81%D0%BD%D0%BE%D0%B5-%D1%81%D1%85%D0%BE%D0%B4%D1%81%D1%82%D0%B2%D0%BE-%D0%BA%D0%BE%D1%81%D0%B8%D0%BD%D1%83%D1%81%D0%BD%D0%B0%D1%8F-%D0%BC%D0%B5%D1%80%D0%B0-%D0%B4%D0%BB%D1%8F-%D1%81%D1%82%D1%80%D0%BE%D0%BA
type Docs = Array<string>
type Words = Array<string>
type MapWordCount = {[string]: number}
type MapWordTF = {[string]: number}
type DocsMapWordTF = Array<MapWordTF>
type MapWordIDF = {[string]: number}
type DocsMapWordIDF = Array<MapWordIDF>
type DocsMapWordTFIDF = Array<MapWordIDF>
*/
/**
 * Splits a document into words.
 *
 * Every '.' and ',' is removed, the remainder is split on single spaces, and
 * the empty fragments produced by consecutive spaces are dropped.
 *
 * @param {string} doc - Document text.
 * @returns {Array<string>} The words, in their original order.
 */
const splitDocToWords = (doc) => doc.replace(/\.|,/g, '').split(' ').filter(Boolean)
/**
 * Counts how many times each word occurs in a list of words.
 *
 * Counting happens in a Map, so a word that shares its name with a member of
 * Object.prototype ('constructor', 'toString', '__proto__', ...) starts from
 * zero like any other word, and the whole pass is a single linear scan.
 *
 * @param {Array<string>} words - Words of one document.
 * @returns {{[word: string]: number}} Plain object mapping each distinct word
 *   to its number of occurrences.
 */
const conuntWords = (words) => {
  const counts = new Map()
  for (const word of words) {
    counts.set(word, (counts.get(word) || 0) + 1)
  }
  // Object.fromEntries defines own data properties, so even a '__proto__'
  // key ends up as a regular counted entry.
  return Object.fromEntries(counts)
}
/**
 * Converts per-word occurrence counts into term frequencies.
 *
 * The term frequency of a word is its count divided by the total number of
 * word occurrences in the document (the sum of all counts), so the returned
 * values add up to 1 for a non-empty document.
 *
 * @param {{[word: string]: number}} words - Occurrence count per word.
 * @returns {{[word: string]: number}} Term frequency per word; an empty
 *   object when `words` is empty.
 */
const tf = (words) => {
  const counts = Object.entries(words)
  // Normalise by the number of word occurrences, not by the dictionary size.
  const totalWords = counts.reduce((sum, [, count]) => sum + count, 0)
  return Object.fromEntries(
    counts.map(([word, count]) => [word, count / totalWords])
  )
}
/**
 * Builds the term-frequency map of every document.
 *
 * Each document goes through splitDocToWords, conuntWords and tf, in that
 * order. Single-document example:
 *   tf(conuntWords(splitDocToWords('а и б сидели на и')))
 *
 * @param {Array<string>} docs - Document texts.
 * @returns {Array<{[word: string]: number}>} One term-frequency map per
 *   document, in the same order as `docs`.
 */
const docs_tf = (docs) => docs.map((doc) => tf(conuntWords(splitDocToWords(doc))))
/**
 * Computes the smoothed inverse document frequency of a word.
 *
 * Returns ln(N / (1 + df)), where N is `docs.length` and df is the number of
 * documents that have `word` as an own key. Because of the +1, a word that is
 * present in every document gets a slightly negative value.
 *
 * @param {string} word - Word to score.
 * @param {Array<{[word: string]: number}>} docs - Per-document word maps.
 * @returns {number} The inverse document frequency.
 */
const idf = (word, docs) => {
  // Own-property test: inherited members such as 'constructor' must not make
  // a word look as if it were present in every document.
  const containsWord = (doc) => Object.prototype.hasOwnProperty.call(doc, word)
  // The fold is seeded with 1 so the denominator can never be zero.
  const smoothedDocCount = docs.reduce(
    (sum, doc) => (containsWord(doc) ? sum + 1 : sum),
    1
  )
  return Math.log(docs.length / smoothedDocCount)
}
/**
 * Builds, for every document, a map from each of its words to that word's
 * inverse document frequency over the whole collection.
 *
 * @param {Array<{[word: string]: number}>} docsTF - Term-frequency map of
 *   every document; it also serves as the collection passed to idf.
 * @returns {Array<{[word: string]: number}>} One word -> idf map per
 *   document, in the same order and with the same keys as `docsTF`.
 */
const docs_idf = (docsTF) => docsTF.map((docTF) =>
  Object.fromEntries(
    // idf is measured against the full collection, i.e. docsTF itself.
    Object.keys(docTF).map((word) => [word, idf(word, docsTF)])
  )
)
/**
 * Multiplies each word's term frequency by its inverse document frequency.
 *
 * Only the keys of `tf` are used; a word that has no entry in `idf` yields
 * NaN.
 *
 * @param {{[word: string]: number}} tf - Term frequency per word.
 * @param {{[word: string]: number}} idf - Inverse document frequency per word.
 * @returns {{[word: string]: number}} tf-idf weight per word of `tf`.
 */
const merge_tf_idf = (tf, idf) => Object.fromEntries(
  Object.entries(tf).map(([word, frequency]) => [word, frequency * idf[word]])
)
/**
 * Computes the tf-idf weights of every word in every document.
 *
 * Term frequencies come from docs_tf, inverse document frequencies from
 * docs_idf, and the two are combined document by document with merge_tf_idf.
 *
 * @param {Array<string>} docs - Document texts.
 * @returns {Array<{[word: string]: number}>} One word -> tf-idf map per
 *   document, in the same order as `docs`.
 */
const build_tf_idf = (docs) => {
  const docsTF = docs_tf(docs)
  const docsIDF = docs_idf(docsTF)
  // docsTF and docsIDF are index-aligned: entry i of both describes docs[i].
  return docsTF.map((docTF, i) => merge_tf_idf(docTF, docsIDF[i]))
}
// ---------- tf-idf ---------- ^^
// ----------
// Sample documents (Russian text).
const str1 = 'Я люблю тортики больше, чем яблоки'
const str2 = 'Я уважаю апельсины больше, чем торты'
const str3 = 'Яблочные сады раскинулись над дорогой'
const str4 = 'Ехал Грека через реку'
// NOTE(review): presumably the strings to be checked against the sample
// documents above; nothing in this part of the file uses them yet.
const check_str1 = 'Тортики делают из муки, апельсины и воды'
const check_str2 = 'Торты исчезли там, где появился я'
const check_str3 = 'Ехал тортик через реку'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment