Fine-Tuning Parameters of BERTopic Using Pseudo Grid-Search and Mini-Batch
# conda activate bert-github
#%%
import pandas as pd
import numpy as np
import os
import glob
import pathlib
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import string
import random
from sentence_transformers import SentenceTransformer, losses
from transformers import AutoModel, AutoTokenizer, pipelines
import torch
from topictuner import TopicModelTuner as TMT
from bertopic import BERTopic, representation, vectorizers
from umap import UMAP
import umap.plot
# from cuml.manifold import UMAP # use cuML to speed up UMAP through GPU acceleration (https://docs.rapids.ai/install#rapids-release-selector)
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from scipy.special import softmax
# Following [NBSL: A Supervised Classification Model of Pull Request in Github], I use a topic-model approach to classify the knowledge domain of PRs
# Concretely, I use [BERTopic: Neural topic modeling with a class-based TF-IDF procedure]
"""
This document is for fine-tuning parameters:
1) Over-arching Pipeline:
# The pipeline is as follows:
# Embeddings (e.g., SBERT) -> Dimensionality Reduction (e.g., UMAP) -> Clustering (e.g., HDBSCAN) -> Tokenization -> Weighting (e.g., cTF-IDF) & Topic Representation (e.g., KeyBERTInspired)
(see: https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html)
1-1) Embeddings:
# For the transformer model, I use [Collab-uniba/github-issues-preprocessed-mpnet-st-e10]
# This model fine-tunes the MPNet sentence transformer on GitHub issue data from the NLBSE22 dataset,
# so it recognizes domain-specific meanings (e.g., operating systems) in the GitHub context well.
# In fact, even SentenceBERT without fine-tuning outperformed most non-transformer models.
(see: DupHunter: Detecting Duplicate Pull Requests in Fork-Based Development)
1-2) UMAP:
# Following the developer's recommendation, I set n_neighbors to 200 because I have a large dataset
(see: https://maartengr.github.io/BERTopic/faq.html#i-have-too-many-topics-how-do-i-decrease-them)
# For n_components, I keep the default (5) to balance the curse of dimensionality (too high n) against loss of information (too low n)
(see: https://maartengr.github.io/BERTopic/getting_started/dim_reduction/dim_reduction.html)
# Also set unique=True to avoid duplicate points after cleaning the texts
(see: https://github.com/lmcinnes/umap/issues/771)
# Fix the random seed for reproducibility
# The rest are default settings
1-3) HDBSCAN:
# Use TopicTuner to fine-tune the parameters; I use a random-search + grid-search combo
(see: https://github.com/drob-xx/TopicTuner)
# Why? Although clustering on a large sample helps extract the right domains
# by discarding "bad" documents and keeping "good" documents,
# it can also discard a large share of "good" documents,
# so the parameters need to be optimized so as not to throw away "good" documents
# Fine-tuning uses 231836 randomly selected samples (2% of the data)
# In my sample, >= 100K documents are sufficient to extract appropriate topics
# Other users also report that >= 100K documents are sufficient to extract topics
(see: https://github.com/MaartenGr/BERTopic/issues/836)
(see: https://github.com/MaartenGr/BERTopic/issues/1613)
# The other params are default settings
1-4) Tokenizer:
# Use sklearn's CountVectorizer
# Lowering min_df increases processing time but does NOT reduce clustering quality,
# so I recommend min_df <= 10 with >= 100K documents if you have enough computing power and time
(see: https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer)
# Use stop words to reduce duplicate/uninformative terms
# I do not use 2-grams because, in the GitHub context, a 2-gram rarely carries a different meaning than its two 1-grams (it would mostly add duplicates)
1-5) c-TF-IDF & Representation:
# mostly default settings
# additionally, use reduce_frequent_words=True for c-TF-IDF and MMR to reduce duplicate/over-frequent terms
(see: https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#diversify-topic-representation)
"""
# %% [Create function]
# Preprocess Text
def merge_texts(df, columns: list):
    # Drop rows with a missing title or body, then merge the two text columns into one 'text' column
    df = df.dropna(subset = columns).reset_index(drop = True)
    df['text'] = df[columns[0]].astype(str) + ' ' + df[columns[1]].astype(str)
    df = df.drop(columns = columns)
    return df
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
def replace_markdown_with_text(text):
    # Identify markdown links and replace them with the anchor text
    text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\1', text)
    # Identify markdown bold text and replace it with plain text
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    # Identify markdown italic text and replace it with plain text
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    return text
def remove_hashes(text):
    # Regular expression to detect commit hashes (hexadecimal words of 7 characters or more)
    hash_pattern = r'\b[a-fA-F0-9]{7,}\b'
    # Replace hashes with [HASH] tokens
    text = re.sub(hash_pattern, '[HASH]', text)
    return text
def preprocess_text(text):
    # Guard against missing input
    if text is None:
        return ''
    # Apply BeautifulSoup to strip HTML tags
    text = BeautifulSoup(text, 'lxml').get_text()
    # Replace markdown constructs with their plain text
    text = replace_markdown_with_text(text)
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove emojis & emoticons
    text = remove_emoji(text)
    # Replace hashes with [HASH]
    text = remove_hashes(text)
    # Replace @users with [USER]
    text = re.sub(r'(?<!\S)@[\w]+(?!@)', '[USER]', text)
    # Replace punctuation with ' '
    PUNCT_TO_REMOVE = string.punctuation
    text = text.translate(str.maketrans(PUNCT_TO_REMOVE, ' '*len(PUNCT_TO_REMOVE)))
    # Remove numbers that are not part of a word
    text = re.sub(r'\s\d+', '', text)
    # Remove extra whitespace again
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase the text
    text = text.lower()
    return text.strip()
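# Illustrative example (the sample string below is made up): running a typical PR title/body fragment
# through preprocess_text should yield roughly the following:
# >>> preprocess_text("**Fix** [docs](https://example.com) for @octocat in commit a1b2c3d4e5")
# 'fix docs for user in commit hash'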
# %%
if __name__ == '__main__':
    ################
    # Import Dataset
    ################
    print('Import Dataset')
    # Basic configs
    path = r'G:/Data/GitHub_Collab/github-trending/'
    # Get repos with at least one PR between 2015-2023
    data_dir = pathlib.Path(path + 'PR_event')
    # test sample: read the first 59 parquet files
    counter = 1
    df = pd.DataFrame()
    for file in data_dir.glob('*.parquet.gzip'):
        if counter < 60:
            df = pd.concat([df, pd.read_parquet(file, columns = ['PRID', 'RepoID', 'PRTitle', 'PRBody'])],
                           ignore_index = True)
            counter += 1
        else:
            break
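    # A leaner variant of the read loop above (sketch, commented out): collecting the first 59 files
    # and concatenating once avoids repeatedly re-copying the growing DataFrame inside the loop.
    # files = sorted(data_dir.glob('*.parquet.gzip'))[:59]
    # df = pd.concat((pd.read_parquet(f, columns = ['PRID', 'RepoID', 'PRTitle', 'PRBody']) for f in files),
    #                ignore_index = True)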
    # drop duplicates based on PR IDs
    df = df.drop_duplicates(subset = ['PRID']).reset_index(drop = True)
    # Get Repos in cleaned sample
    base_path = r'F:\Projects\Automation\data\trending_sample\sample_panel'
    file = 'github_trending_sample_panel.parquet.gzip'
    df_base = pd.read_parquet(f'{base_path}\\{file}',
                              columns = ['RepoID', 'RepoLanguage'])
    df_base = df_base.drop_duplicates().reset_index(drop = True)
    # Keep repos only in the cleaned sample
    df = pd.merge(df, df_base, on = ['RepoID'])
    del df_base
# %%
    ###################
    # Pre-Process Texts
    ###################
    # Great source: https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing
    # https://ieeexplore.ieee.org/abstract/document/9785808
    print('Pre-Processing Texts')
    # Prepare input sentences for fine-tuning
    df = merge_texts(df, columns = ['PRTitle', 'PRBody'])
    # convert to dictionary
    df = df.to_dict('list')
    # apply the cleaning function to every document
    progress_bar = tqdm(total=len(df['text']))
    for i, text in enumerate(df['text']):
        df['text'][i] = preprocess_text(text)
        progress_bar.update(1) # Update progress bar
    # df['text'] = df['text'].progress_apply(preprocess_text)
    # Close the progress bar
    progress_bar.close()
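    # A more concise variant of the loop above (sketch, extending the commented-out progress_apply hint):
    # keep df as a DataFrame and register tqdm's pandas integration before converting to a dictionary.
    # The behavior should match the explicit loop.
    # tqdm.pandas()
    # df['text'] = df['text'].progress_apply(preprocess_text)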
# %% Model Config (https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html)
    ########################
    # Fine-Tuning Parameters
    ########################
    print('Setting a Model')
    # Step 0: cuda config
    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Step 1: Load the SentenceTransformer model with the specified architecture
    embedding_model = SentenceTransformer('Collab-uniba/github-issues-preprocessed-mpnet-st-e10',
                                          device=device)
    # Step 2: Dimensionality reduction (mostly defaults, which work well with high-dimensional embeddings)
    dim_model = UMAP(n_neighbors=200, n_components=5, min_dist=0.0,
                     metric='cosine', unique = True, random_state=1) # random_state fixed for reproducibility
    # Step 3: Clustering
    cluster_model = HDBSCAN(min_cluster_size=365, min_samples=73,
                            metric='euclidean', cluster_selection_method='eom',
                            prediction_data=True, gen_min_span_tree=True)
    # Step 4: Tokenization of topics (https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer)
    vectorizer_model = CountVectorizer(stop_words="english", min_df=5) # optional: ngram_range=(1, 2), min_df=10
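    # Optional check (sketch, commented out): fitting the vectorizer on a random slice of the cleaned
    # texts gives a quick feel for how min_df affects vocabulary size before committing to a value.
    # The 100000-document slice below is an arbitrary illustration, not a tuned choice.
    # _sample = random.sample(df['text'], 100000)
    # for _min_df in (5, 10, 25):
    #     _vocab = CountVectorizer(stop_words="english", min_df=_min_df).fit(_sample)
    #     print(_min_df, len(_vocab.get_feature_names_out()))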
    # Step 5-1: c-TF-IDF Weighting
    ctfidf_model = vectorizers.ClassTfidfTransformer(reduce_frequent_words=True) # optional: reduce_frequent_words=True to further reduce frequent words in every topic
    # Step 5-2: Representations (optional)
    # rep_model1 = representation.KeyBERTInspired()
    rep_chain = [representation.KeyBERTInspired(nr_samples = 1000),
                 representation.MaximalMarginalRelevance(diversity=.5)]
# %% fine-tuning parameters (reference: https://github.com/drob-xx/TopicTuner)
    ## Basic configs (N = 231836)
    batch_size = int(np.ceil(11591800 / 50)) # 2% of the ~11.6M documents
    random.seed(1)
    batch_texts = random.sample(df['text'], batch_size)
    batch_embeddings = embedding_model.encode(batch_texts, show_progress_bar = True)
    tmt_path = r'F:\Projects\Automation\codes\Python\Similarity_Detection\tm_model_params'
    tmt = TMT(docs = batch_texts,
              embedding_model = embedding_model,
              reducer_model = dim_model,
              verbose=2) # verbose turns tqdm on
    tmt.embeddings = batch_embeddings # set the pre-computed embeddings after creating the instance
    tmt.reduce()
    ## Store and evaluate random-search results
    searches_result = tmt.randomSearch([*range(301,500)], # min_cluster_size candidates between 301 and 499
                                       [.1, .25, .5, .75, 1], # min_samples as a fraction of min_cluster_size
                                       40) # number of random draws
    tmt.summarizeResults(searches_result).sort_values(by = ['number_uncategorized'])
    ## Further explore the range between 350 and 400
    searches_result2 = tmt.randomSearch([*range(350,401)], # values between 350 and 400
                                        [.1, .25, .5, .75, 1], 30)
    tmt.summarizeResults(searches_result2).sort_values(by = ['number_uncategorized'])
    ## Lastly, pseudo grid search over the narrow cluster-size range that worked best above (361-370)
    searches_result3 = tmt.pseudoGridSearch([*range(361,371)], [x/100 for x in range(10,101,10)])
    ## Evaluate results: 270 / 27 shows best perf - 113373 "good" and 118463 "bad"
    tmt.summarizeResults(searches_result3).sort_values(by = ['number_uncategorized'])
    ## Save tmt model and results
    tmt.save(f'{tmt_path}\\tmt_temp_300-500')
    searches_result.to_csv(f'{tmt_path}\\random_search_300-500.csv', index = False)
    searches_result2.to_csv(f'{tmt_path}\\random_search_round2_300-500.csv', index = False)
    searches_result3.to_csv(f'{tmt_path}\\pseudo_grid_search_300-500.csv', index = False)
    ## Visualization
    tmt.visualizeSearch(searches_result3).write_html(f"{tmt_path}\\2d_search_plot_300-500.html")
    tmt.createVizReduction('TSNE')
    tmt.visualizeEmbeddings(365, 73).write_html(f"{tmt_path}\\2d_topic_representation_300-500.html")
# %% Test Model
    #########################
    # Test Model (Mini Batch)
    #########################
    # Before running this, re-set the parameters above based on the fine-tuning results
    print('Test a model')
    ## Basic configs (N = 231836)
    batch_size = int(np.ceil(11591800 / 50))
    random.seed(42) # test the model with a different sample
    batch_texts = random.sample(df['text'], batch_size)
    batch_embeddings = embedding_model.encode(batch_texts, show_progress_bar = True)
    # Train a base BERTopic model using a batch
    topic_model = BERTopic(
        # models
        embedding_model = embedding_model,
        umap_model = dim_model,
        hdbscan_model = cluster_model,
        vectorizer_model = vectorizer_model,
        ctfidf_model = ctfidf_model,
        representation_model = rep_chain, # optional
        # parameters
        nr_topics = "auto", # automatic topic reduction
        calculate_probabilities = True, # the base model needs this to calculate the topic distribution per document
        verbose = True
    ).fit(batch_texts, embeddings = batch_embeddings)
    # get resulting topic info
    topic_model.get_topic_info()
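    # Optional follow-ups on the mini-batch model (a sketch, commented out; these calls exist in recent
    # BERTopic releases, so check them against your installed version): per-document assignments, a quick
    # bar-chart view of the top words per topic, and persisting the model for later runs.
    # doc_info = topic_model.get_document_info(batch_texts)
    # topic_model.visualize_barchart().write_html(f"{tmt_path}\\test_topic_barchart.html")
    # topic_model.save(f"{tmt_path}\\bertopic_test_model", serialization="safetensors", save_ctfidf=True)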