Created
November 12, 2023 23:33
-
-
Save siliconjungle/f1b55d77b128fb48d1472814ca3f3caa to your computer and use it in GitHub Desktop.
Simple bag-of-words implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Split text into lowercase word tokens.
 *
 * Any run of characters that are not letters, combining marks, digits or
 * underscores acts as a separator. Empty pieces (produced when the text
 * starts or ends with a separator) are discarded.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Lowercased tokens in order of appearance.
 */
const tokenize = (text) =>
  text
    .toLowerCase()
    // Unicode property classes rather than \W: \W only knows ASCII word
    // characters and would cut "café" down to "caf".
    .split(/[^\p{L}\p{M}\p{N}_]+/u)
    .filter((token) => token.length > 0)
/**
 * Collect the distinct tokens that occur across a list of sentences.
 *
 * @param {string[]} sentences - Sentences to scan; each is passed to tokenize.
 * @returns {string[]} Unique tokens, ordered by first occurrence.
 */
const buildVocabulary = (sentences) => {
  const vocabulary = new Set()
  sentences.forEach((sentence) => {
    tokenize(sentence).forEach((token) => {
      vocabulary.add(token)
    })
  })
  // A Set iterates in insertion order, so the array keeps first-seen order.
  return [...vocabulary]
}
/**
 * Encode a sentence as a bag-of-words count vector.
 *
 * @param {string} sentence - Sentence to encode; it is split with tokenize.
 * @param {string[]} vocabulary - Known tokens. Entry i of the result counts
 *   how often vocabulary[i] occurs in the sentence.
 * @returns {number[]} Counts, with the same length as vocabulary.
 * @throws {Error} If the sentence contains a token that is not in vocabulary.
 */
const vectorize = (sentence, vocabulary) => {
  // Index the vocabulary once so each token lookup is a Map hit instead of
  // an indexOf scan over the whole vocabulary.
  const indexByToken = new Map()
  vocabulary.forEach((token, index) => {
    // Keep the first position of a repeated entry, matching indexOf.
    if (!indexByToken.has(token)) {
      indexByToken.set(token, index)
    }
  })

  const vector = new Array(vocabulary.length).fill(0)
  for (const token of tokenize(sentence)) {
    const index = indexByToken.get(token)
    if (index === undefined) {
      throw new Error(`Token '${token}' not found in vocabulary`)
    }
    vector[index] += 1
  }
  return vector
}
// Demo: build a vocabulary from three sample sentences, then print each
// sentence's count vector against that vocabulary.
const sentences = [
  'Sunny, warm and beautiful day',
  'Snowy and cold',
  'Cold and windy day'
]

const vocabulary = buildVocabulary(sentences)
console.log('Vocabulary:', vocabulary)

sentences.forEach((sentence) => {
  const vector = vectorize(sentence, vocabulary)
  console.log(`Vector for '${sentence}':`, vector)
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment