Fine-Tuning Parameters of BERTopic Using Pseudo Grid-Search and Mini-Batch
# conda activate bert-github
#%%
import pandas as pd
import numpy as np
import os
import glob
import pathlib
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import string
import random
from sentence_transformers import SentenceTransformer, losses
from transformers import AutoModel, AutoTokenizer, pipelines
import torch
from topictuner import TopicModelTuner as TMT
from bertopic import BERTopic, representation, vectorizers
from umap import UMAP
import umap.plot
# from cuml.manifold import UMAP # use cuML to speed up UMAP through GPU acceleration (https://docs.rapids.ai/install#rapids-release-selector)
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from scipy.special import softmax
# Following [NBSL: A Supervised Classification Model of Pull Request in Github], I use a topic-model approach to classify the knowledge domain of PRs
# Concretely, I use [BERTopic: Neural topic modeling with a class-based TF-IDF procedure]
"""
This document is for fine-tuning parameters:
1) Over-arching Pipeline:
# The pipeline is as follows:
# Embeddings (e.g., SBERT) -> Dimensionality Reduction (e.g., UMAP) -> Clustering (e.g., HDBSCAN) -> Tokenization -> Weighting (e.g., cTF-IDF) & Topic Representation (e.g., KeyBERTInspired)
(see: https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html)
1-1) Embeddings:
# For the transformer model, I use [Collab-uniba/github-issues-preprocessed-mpnet-st-e10]
# This model fine-tunes the MPNet sentence transformer on GitHub issue data from the NLBSE22 dataset,
# so it recognizes domain-specific meanings (e.g., operating systems) in the GitHub context well.
# In fact, even SentenceBERT without fine-tuning outperformed most non-transformer models.
(see: DupHunter: Detecting Duplicate Pull Requests in Fork-Based Development)
1-2) UMAP:
# Following the developer's recommendation, I set n_neighbors to 200 because I have a large dataset
(see: https://maartengr.github.io/BERTopic/faq.html#i-have-too-many-topics-how-do-i-decrease-them)
# For n_components, I keep the default (5) to balance the curse of dimensionality (too high n) against loss of information (too low n)
(see: https://maartengr.github.io/BERTopic/getting_started/dim_reduction/dim_reduction.html)
# Also set unique=True to avoid duplicate points after cleaning the texts
(see: https://github.com/lmcinnes/umap/issues/771)
# Fix the random seed for reproducibility
# The rest are default settings
1-3) HDBSCAN:
# Use TopicTuner to fine-tune the parameters; I use a random-search + grid-search combo
(see: https://github.com/drob-xx/TopicTuner)
# Why? Although clustering on a large sample helps extract the right domains
# by discarding "bad" documents and keeping "good" documents,
# it can also discard a large share of "good" documents,
# so the parameters need to be optimized so as not to throw away "good" documents
# Fine-tuning uses 231836 randomly selected samples (2% of the data)
# In my sample, >= 100K documents are sufficient to extract appropriate topics
# Other users also report that >= 100K documents are sufficient to extract topics
(see: https://github.com/MaartenGr/BERTopic/issues/836)
(see: https://github.com/MaartenGr/BERTopic/issues/1613)
# The other params are default settings
1-4) Tokenizer:
# Use sklearn's CountVectorizer
# Lowering min_df increases processing time but does NOT reduce clustering quality,
# so I recommend min_df <= 10 with >= 100K documents if you have enough computing power and time
(see: https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer)
# Use stop words to reduce duplicate/uninformative terms
# I do not use 2-grams because, in the GitHub context, a 2-gram rarely carries a different meaning than its two 1-grams (it would mostly add duplicates)
1-5) c-TF-IDF & Representation:
# mostly default settings
# additionally, use reduce_frequent_words=True for c-TF-IDF and MMR to reduce duplicate/over-frequent terms
(see: https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#diversify-topic-representation)
"""
# %% [Create function]
# Preprocess Text
def merge_texts(df, columns: list):
    # Drop rows with a missing title or body, then merge the two text columns into one 'text' column
    df = df.dropna(subset = columns).reset_index(drop = True)
    df['text'] = df[columns[0]].astype(str) + ' ' + df[columns[1]].astype(str)
    df = df.drop(columns = columns)
    return df
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
def replace_markdown_with_text(text):
    # Identify markdown links and replace them with the anchor text
    text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\1', text)
    # Identify markdown bold text and replace it with plain text
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    # Identify markdown italic text and replace it with plain text
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    return text
def remove_hashes(text):
    # Regular expression to detect commit hashes (hexadecimal words of 7 characters or more)
    hash_pattern = r'\b[a-fA-F0-9]{7,}\b'
    # Replace hashes with [HASH] tokens
    text = re.sub(hash_pattern, '[HASH]', text)
    return text
def preprocess_text(text):
    # Guard against missing input
    if text is None:
        return ''
    # Apply BeautifulSoup to strip HTML tags
    text = BeautifulSoup(text, 'lxml').get_text()
    # Replace markdown constructs with their plain text
    text = replace_markdown_with_text(text)
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove emojis & emoticons
    text = remove_emoji(text)
    # Replace hashes with [HASH]
    text = remove_hashes(text)
    # Replace @users with [USER]
    text = re.sub(r'(?<!\S)@[\w]+(?!@)', '[USER]', text)
    # Replace punctuation with ' '
    PUNCT_TO_REMOVE = string.punctuation
    text = text.translate(str.maketrans(PUNCT_TO_REMOVE, ' '*len(PUNCT_TO_REMOVE)))
    # Remove numbers that are not part of a word
    text = re.sub(r'\s\d+', '', text)
    # Remove extra whitespace again
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase the text
    text = text.lower()
    return text.strip()
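# Illustrative example (the sample string below is made up): running a typical PR title/body fragment
# through preprocess_text should yield roughly the following:
# >>> preprocess_text("**Fix** [docs](https://example.com) for @octocat in commit a1b2c3d4e5")
# 'fix docs for user in commit hash'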
# %%
if __name__ == '__main__':
    ################
    # Import Dataset
    ################
    print('Import Dataset')
    # Basic configs
    path = r'G:/Data/GitHub_Collab/github-trending/'
    # Get repos with at least one PR between 2015-2023
    data_dir = pathlib.Path(path + 'PR_event')
    # test sample: read the first 59 parquet files
    counter = 1
    df = pd.DataFrame()
    for file in data_dir.glob('*.parquet.gzip'):
        if counter < 60:
            df = pd.concat([df, pd.read_parquet(file, columns = ['PRID', 'RepoID', 'PRTitle', 'PRBody'])],
                           ignore_index = True)
            counter += 1
        else:
            break
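    # A leaner variant of the read loop above (sketch, commented out): collecting the first 59 files
    # and concatenating once avoids repeatedly re-copying the growing DataFrame inside the loop.
    # files = sorted(data_dir.glob('*.parquet.gzip'))[:59]
    # df = pd.concat((pd.read_parquet(f, columns = ['PRID', 'RepoID', 'PRTitle', 'PRBody']) for f in files),
    #                ignore_index = True)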
    # drop duplicates based on PR IDs
    df = df.drop_duplicates(subset = ['PRID']).reset_index(drop = True)
    # Get Repos in cleaned sample
    base_path = r'F:\Projects\Automation\data\trending_sample\sample_panel'
    file = 'github_trending_sample_panel.parquet.gzip'
    df_base = pd.read_parquet(f'{base_path}\\{file}',
                              columns = ['RepoID', 'RepoLanguage'])
    df_base = df_base.drop_duplicates().reset_index(drop = True)
    # Keep repos only in the cleaned sample
    df = pd.merge(df, df_base, on = ['RepoID'])
    del df_base
# %%
    ###################
    # Pre-Process Texts
    ###################
    # Great source: https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing
    # https://ieeexplore.ieee.org/abstract/document/9785808
    print('Pre-Processing Texts')
    # Prepare input sentences for fine-tuning
    df = merge_texts(df, columns = ['PRTitle', 'PRBody'])
    # convert to dictionary
    df = df.to_dict('list')
    # apply the cleaning function to every document
    progress_bar = tqdm(total=len(df['text']))
    for i, text in enumerate(df['text']):
        df['text'][i] = preprocess_text(text)
        progress_bar.update(1) # Update progress bar
    # df['text'] = df['text'].progress_apply(preprocess_text)
    # Close the progress bar
    progress_bar.close()
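    # A more concise variant of the loop above (sketch, extending the commented-out progress_apply hint):
    # keep df as a DataFrame and register tqdm's pandas integration before converting to a dictionary.
    # The behavior should match the explicit loop.
    # tqdm.pandas()
    # df['text'] = df['text'].progress_apply(preprocess_text)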
# %% Model Config (https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html)
    ########################
    # Fine-Tuning Parameters
    ########################
    print('Setting a Model')
    # Step 0: cuda config
    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Step 1: Load the SentenceTransformer model with the specified architecture
    embedding_model = SentenceTransformer('Collab-uniba/github-issues-preprocessed-mpnet-st-e10',
                                          device=device)
    # Step 2: Dimensionality reduction (mostly defaults, which work well with high-dimensional embeddings)
    dim_model = UMAP(n_neighbors=200, n_components=5, min_dist=0.0,
                     metric='cosine', unique = True, random_state=1) # random_state fixed for reproducibility
    # Step 3: Clustering
    cluster_model = HDBSCAN(min_cluster_size=365, min_samples=73,
                            metric='euclidean', cluster_selection_method='eom',
                            prediction_data=True, gen_min_span_tree=True)
    # Step 4: Tokenization of topics (https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer)
    vectorizer_model = CountVectorizer(stop_words="english", min_df=5) # optional: ngram_range=(1, 2), min_df=10
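    # Optional check (sketch, commented out): fitting the vectorizer on a random slice of the cleaned
    # texts gives a quick feel for how min_df affects vocabulary size before committing to a value.
    # The 100000-document slice below is an arbitrary illustration, not a tuned choice.
    # _sample = random.sample(df['text'], 100000)
    # for _min_df in (5, 10, 25):
    #     _vocab = CountVectorizer(stop_words="english", min_df=_min_df).fit(_sample)
    #     print(_min_df, len(_vocab.get_feature_names_out()))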
    # Step 5-1: c-TF-IDF Weighting
    ctfidf_model = vectorizers.ClassTfidfTransformer(reduce_frequent_words=True) # optional: reduce_frequent_words=True to further reduce frequent words in every topic
    # Step 5-2: Representations (optional)
    # rep_model1 = representation.KeyBERTInspired()
    rep_chain = [representation.KeyBERTInspired(nr_samples = 1000),
                 representation.MaximalMarginalRelevance(diversity=.5)]
# %% fine-tuning parameters (reference: https://github.com/drob-xx/TopicTuner)
    ## Basic configs (N = 231836)
    batch_size = int(np.ceil(11591800 / 50)) # 2% of the ~11.6M documents
    random.seed(1)
    batch_texts = random.sample(df['text'], batch_size)
    batch_embeddings = embedding_model.encode(batch_texts, show_progress_bar = True)
    tmt_path = r'F:\Projects\Automation\codes\Python\Similarity_Detection\tm_model_params'
    tmt = TMT(docs = batch_texts,
              embedding_model = embedding_model,
              reducer_model = dim_model,
              verbose=2) # verbose turns tqdm on
    tmt.embeddings = batch_embeddings # set the pre-computed embeddings after creating the instance
    tmt.reduce()
    ## Store and evaluate random-search results
    searches_result = tmt.randomSearch([*range(301,500)], # min_cluster_size candidates between 301 and 499
                                       [.1, .25, .5, .75, 1], # min_samples as a fraction of min_cluster_size
                                       40) # number of random draws
    tmt.summarizeResults(searches_result).sort_values(by = ['number_uncategorized'])
    ## Further explore the range between 350 and 400
    searches_result2 = tmt.randomSearch([*range(350,401)], # values between 350 and 400
                                        [.1, .25, .5, .75, 1], 30)
    tmt.summarizeResults(searches_result2).sort_values(by = ['number_uncategorized'])
    ## Lastly, pseudo grid search over the narrow cluster-size range that worked best above (361-370)
    searches_result3 = tmt.pseudoGridSearch([*range(361,371)], [x/100 for x in range(10,101,10)])
    ## Evaluate results: 270 / 27 shows best perf - 113373 "good" and 118463 "bad"
    tmt.summarizeResults(searches_result3).sort_values(by = ['number_uncategorized'])
    ## Save tmt model and results
    tmt.save(f'{tmt_path}\\tmt_temp_300-500')
    searches_result.to_csv(f'{tmt_path}\\random_search_300-500.csv', index = False)
    searches_result2.to_csv(f'{tmt_path}\\random_search_round2_300-500.csv', index = False)
    searches_result3.to_csv(f'{tmt_path}\\pseudo_grid_search_300-500.csv', index = False)
    ## Visualization
    tmt.visualizeSearch(searches_result3).write_html(f"{tmt_path}\\2d_search_plot_300-500.html")
    tmt.createVizReduction('TSNE')
    tmt.visualizeEmbeddings(365, 73).write_html(f"{tmt_path}\\2d_topic_representation_300-500.html")
# %% Test Model
    #########################
    # Test Model (Mini Batch)
    #########################
    # Before running this, re-set the parameters above based on the fine-tuning results
    print('Test a model')
    ## Basic configs (N = 231836)
    batch_size = int(np.ceil(11591800 / 50))
    random.seed(42) # test the model with a different sample
    batch_texts = random.sample(df['text'], batch_size)
    batch_embeddings = embedding_model.encode(batch_texts, show_progress_bar = True)
    # Train a base BERTopic model using a batch
    topic_model = BERTopic(
        # models
        embedding_model = embedding_model,
        umap_model = dim_model,
        hdbscan_model = cluster_model,
        vectorizer_model = vectorizer_model,
        ctfidf_model = ctfidf_model,
        representation_model = rep_chain, # optional
        # parameters
        nr_topics = "auto", # automatic topic reduction
        calculate_probabilities = True, # the base model needs this to calculate the topic distribution per document
        verbose = True
    ).fit(batch_texts, embeddings = batch_embeddings)
    # get resulting topic info
    topic_model.get_topic_info()
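    # Optional follow-ups on the mini-batch model (a sketch, commented out; these calls exist in recent
    # BERTopic releases, so check them against your installed version): per-document assignments, a quick
    # bar-chart view of the top words per topic, and persisting the model for later runs.
    # doc_info = topic_model.get_document_info(batch_texts)
    # topic_model.visualize_barchart().write_html(f"{tmt_path}\\test_topic_barchart.html")
    # topic_model.save(f"{tmt_path}\\bertopic_test_model", serialization="safetensors", save_ctfidf=True)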