A linear breakdown of the main code flow of the Top2Vec model.
# Set up and train the Doc2Vec model.
self.model = Doc2Vec()

# Get the vectors and indexes from the Doc2Vec model.
self.word_vectors = self.model.wv.get_normed_vectors()
self.word_indexes = self.model.wv.key_to_index
self.vocab = list(self.model.wv.key_to_index.keys())
self.document_vectors = self.model.dv.get_normed_vectors()
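"""
For context, a minimal sketch of the Doc2Vec setup elided above. The
hyperparameters shown are illustrative assumptions, not Top2Vec's exact
defaults; gensim's Doc2Vec trains on TaggedDocument objects and exposes
the normed word and document vectors used below:

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    corpus = [TaggedDocument(words=doc.split(), tags=[i])
              for i, doc in enumerate(documents)]
    model = Doc2Vec(documents=corpus,
                    vector_size=300,  # matches the n x 300 shapes below
                    dm=0,             # PV-DBOW mode (assumed)
                    dbow_words=1,     # also train word vectors (assumed)
                    min_count=50,     # assumed vocabulary cutoff
                    epochs=40)        # assumed
"""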
# Use UMAP to create 5D embeddings of the document vectors.
umap_model = umap.UMAP().fit(self.document_vectors)

# Get dense areas of the documents with HDBSCAN.
cluster = hdbscan.HDBSCAN().fit(umap_model.embedding_)
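"""
A sketch of the two calls above with explicit parameters. The values are
assumptions in the spirit of Top2Vec's defaults, not copied from its source:

    import umap
    import hdbscan

    umap_model = umap.UMAP(n_neighbors=15,
                           n_components=5,   # the "5D" in the comment above
                           metric='cosine').fit(document_vectors)

    cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                              metric='euclidean',
                              cluster_selection_method='eom'
                              ).fit(umap_model.embedding_)

    # cluster.labels_ assigns each document a cluster id, with -1 for noise.
"""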
# We use the lower-dimensional embeddings to find dense areas,
# then create topic vectors by taking the centroid of the
# document vectors in each dense cluster in the original dimension.
self.topic_vectors = self._create_topic_vectors(cluster.labels_)
| """ | |
| Create a normalized array of the mean of the array of document vectors. | |
| 1. Each inside array of document vectors corresponds to a dense cluster | |
| as determined by HDBSCAN. | |
| 2. The mean of each nested array is a single vector which is appended | |
| to the output array. | |
| 3. This array is vstacked which stacks the vectors vertically. | |
| 4. The L2 norm on this stacked array of `n` vectors will give us a new | |
| array of the same shape, but each nested array is now a unit vector, | |
| meaning the values for each vecto | |
| """ | |
# Deduplicate topics.

# Find topic words and scores.
self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
| """ | |
| An explanation of `_find_topic_words_and_scores`: | |
| 1. `self.word_vectors` contains a numpy array of all the word vectors. | |
| Shape: n x 300 | |
| 2. `topic_vectors` contains a numpy array of all topic vectors. Shape: m x 300. | |
| 3. An inner product of topic vectors and word vectors gives us a new array | |
| of shape m x n. In this, the `i`th row has n elements with the `j`th row | |
| element indicating how similar the `i`th topic is to the `j`th word. | |
| 4. top_words is created by argsorting the above inner product along axis 1 to get | |
| the sort order indices with the lowest values starting from the first column. | |
| A .flip on this gives us descending order. So each row `i` contains, in desc order, | |
| the indices to the most similar words for the `i`th topic vector. | |
| 5. top_scores is created similarly except instead of argsort to return indices, | |
| sort is used to return the values themselves. So each row `i` contains, in | |
| desc order, the scores of the most similar words for the `i`th topic vector. | |
| 6. topic_words is created by taking the first 50 (axis 1) elements from top_words | |
| and finding the corresponding string representation from .vocab. topic_scores is | |
| just the first 50 elements from top_scores. | |
| """ | |
self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors, self.document_vectors)
| """ | |
| This method does the same thing as `_find_topic_words_and_scores` above, but instead | |
| of relating words to topics, it relates topics to documents. So that each document | |
| gets `num_topics` topics that are closest to it. The function is longer than the | |
| previous one because the number of documents is unbounded, where as | |
| vocabulary is usually bounded to a certain maximum number since there can only be a | |
| finite number of meaningful words in any language. | |
| If there are `n` documents, `doc_top` is an array with `n` elements, with each | |
| element indicating the topic index. | |
| doc_dist is the distance from the topic for each document. | |
| """ | |
# Determine the number of documents that are clustered around each topic.
# This is a pandas Series with the document counts for each topic.
self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)

# Reorder the topic_words, topic_vectors, and topic_word_scores based on topic_sizes.
self._reorder_topics(hierarchy=False)
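"""
A hypothetical sketch of these last two steps, simplified to the
non-hierarchical case:

    import numpy as np
    import pandas as pd

    # Count how many documents landed on each topic, largest first.
    topic_sizes = pd.Series(doc_top).value_counts()

    # Reorder the topic arrays so topic 0 is the largest topic, and
    # remap doc_top to the new ordering.
    order = topic_sizes.index.values
    topic_vectors = topic_vectors[order]
    topic_words = topic_words[order]
    topic_word_scores = topic_word_scores[order]
    old_to_new = {old: new for new, old in enumerate(order)}
    doc_top = np.array([old_to_new[t] for t in doc_top])
"""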
# Done!