siliconjungle · November 12, 2023 23:33
diff --git a/bag-of-words.js b/bag-of-words.js
 /**
  * Split text into lowercase word tokens.
  *
  * A token is a maximal run of letters, combining marks, digits or
  * underscores. The separator pattern is Unicode-aware so that non-ASCII
  * words such as "café" stay in one piece; an ASCII-only word class would
  * cut them at every accented character. Combining marks are kept because
  * lowercasing can produce them (e.g. a capital dotted I).
  *
  * @param {string} text - Text to tokenize.
  * @returns {string[]} Lowercased tokens in order of appearance; empty
  *   when the text contains no word characters.
  */
 const tokenize = (text) =>
   text
     .toLowerCase()
     .split(/[^\p{L}\p{M}\p{N}_]+/u)
     .filter(token => token.length > 0)

 /**
  * Collect the distinct tokens that occur in a list of sentences.
  *
  * @param {string[]} sentences - Sentences to scan; each one is split
  *   with tokenize.
  * @returns {string[]} Unique tokens, ordered by first appearance.
  */
 const buildVocabulary = (sentences) => {
   // Flatten every sentence into one token stream; the Set drops repeats
   // while keeping insertion order.
   const allTokens = sentences.flatMap(sentence => tokenize(sentence))

   return [...new Set(allTokens)]
 }

 /**
  * Convert a sentence into a bag-of-words count vector.
  *
  * @param {string} sentence - Sentence to encode; it is split with tokenize.
  * @param {string[]} vocabulary - Known tokens; position i of the result
  *   counts the occurrences of vocabulary[i] in the sentence.
  * @returns {number[]} Array of vocabulary.length counts.
  * @throws {Error} When the sentence contains a token that is not in the
  *   vocabulary.
  */
 const vectorize = (sentence, vocabulary) => {
   // Index the vocabulary once so each token costs a single Map lookup
   // instead of a linear indexOf scan. Only the first position of a
   // repeated entry is recorded, which is the index indexOf would return.
   const indexByToken = new Map()
   vocabulary.forEach((word, position) => {
     if (!indexByToken.has(word)) {
       indexByToken.set(word, position)
     }
   })

   const vector = new Array(vocabulary.length).fill(0)

   for (const token of tokenize(sentence)) {
     const index = indexByToken.get(token)

     if (index === undefined) {
       throw new Error(`Token '${token}' not found in vocabulary`)
     }

     vector[index] += 1
   }

   return vector
 }

 // Example corpus used to demonstrate the bag-of-words encoding.
 const sentences = [
   'Sunny, warm and beautiful day',
   'Snowy and cold',
   'Cold and windy day',
 ]

 // Build the vocabulary shared by all sentences and print it.
 const vocabulary = buildVocabulary(sentences)
 console.log('Vocabulary:', vocabulary)

 // Print the count vector of each sentence against that vocabulary.
 for (const sentence of sentences) {
   console.log(`Vector for '${sentence}':`, vectorize(sentence, vocabulary))
 }
	/**
	 * Split text into lowercase word tokens.
	 *
	 * A token is a maximal run of letters, combining marks, digits or
	 * underscores. The separator pattern is Unicode-aware so that non-ASCII
	 * words such as "café" stay in one piece; an ASCII-only word class would
	 * cut them at every accented character. Combining marks are kept because
	 * lowercasing can produce them (e.g. a capital dotted I).
	 *
	 * @param {string} text - Text to tokenize.
	 * @returns {string[]} Lowercased tokens in order of appearance; empty
	 *   when the text contains no word characters.
	 */
	const tokenize = (text) =>
		text
			.toLowerCase()
			.split(/[^\p{L}\p{M}\p{N}_]+/u)
			.filter(token => token.length > 0)

	/**
	 * Collect the distinct tokens that occur in a list of sentences.
	 *
	 * @param {string[]} sentences - Sentences to scan; each one is split
	 *   with tokenize.
	 * @returns {string[]} Unique tokens, ordered by first appearance.
	 */
	const buildVocabulary = (sentences) => {
		// Flatten every sentence into one token stream; the Set drops repeats
		// while keeping insertion order.
		const allTokens = sentences.flatMap(sentence => tokenize(sentence))

		return [...new Set(allTokens)]
	}

	/**
	 * Convert a sentence into a bag-of-words count vector.
	 *
	 * @param {string} sentence - Sentence to encode; it is split with tokenize.
	 * @param {string[]} vocabulary - Known tokens; position i of the result
	 *   counts the occurrences of vocabulary[i] in the sentence.
	 * @returns {number[]} Array of vocabulary.length counts.
	 * @throws {Error} When the sentence contains a token that is not in the
	 *   vocabulary.
	 */
	const vectorize = (sentence, vocabulary) => {
		// Index the vocabulary once so each token costs a single Map lookup
		// instead of a linear indexOf scan. Only the first position of a
		// repeated entry is recorded, which is the index indexOf would return.
		const indexByToken = new Map()
		vocabulary.forEach((word, position) => {
			if (!indexByToken.has(word)) {
				indexByToken.set(word, position)
			}
		})

		const vector = new Array(vocabulary.length).fill(0)

		for (const token of tokenize(sentence)) {
			const index = indexByToken.get(token)

			if (index === undefined) {
				throw new Error(`Token '${token}' not found in vocabulary`)
			}

			vector[index] += 1
		}

		return vector
	}

	// Example corpus used to demonstrate the bag-of-words encoding.
	const sentences = [
		'Sunny, warm and beautiful day',
		'Snowy and cold',
		'Cold and windy day',
	]

	// Build the vocabulary shared by all sentences and print it.
	const vocabulary = buildVocabulary(sentences)
	console.log('Vocabulary:', vocabulary)

	// Print the count vector of each sentence against that vocabulary.
	for (const sentence of sentences) {
		console.log(`Vector for '${sentence}':`, vectorize(sentence, vocabulary))
	}