Skip to content

Instantly share code, notes, and snippets.

@abhijeet-talaulikar
Last active July 28, 2023 23:22
Show Gist options
  • Save abhijeet-talaulikar/47565f818b021920fc02b752d173de52 to your computer and use it in GitHub Desktop.
Save abhijeet-talaulikar/47565f818b021920fc02b752d173de52 to your computer and use it in GitHub Desktop.
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
# Clustering stage: Hierarchical DBSCAN groups the reduced embeddings.
# prediction_data=True asks HDBSCAN to keep the extra structures it needs to
# assign cluster membership to points that were not part of the fit.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

# Dimensionality-reduction stage: project the embeddings down to 5 components
# before clustering. random_state is pinned so repeated runs produce the same
# projection (UMAP is stochastic otherwise).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

# Build the topic model from the two custom stages above.
# NOTE(review): `finbert` (the embedding model) and `headline_texts` (the
# documents) are not defined in this snippet -- presumably they come from an
# earlier cell; confirm before running this on its own.
topic_model = BERTopic(
    verbose=True,
    embedding_model=finbert,
    hdbscan_model=hdbscan_model,
    nr_topics=10,
    umap_model=umap_model,
)

# Fit the model and get one topic id per input document; the second return
# value is not used here.
headline_topics, _ = topic_model.fit_transform(headline_texts)

# View topics.
freq = topic_model.get_topic_info()

# BERTopic reports documents HDBSCAN could not cluster under topic id -1.
# That row is an outlier bucket rather than a topic, so leave it out of the
# count instead of using len(freq).
num_topics = int((freq["Topic"] != -1).sum())
print(f"Number of topics: {num_topics}")

# Bare expression: displays the topic table when run as a notebook cell.
freq
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment