Fine-Tuning Parameters of BERTopic Using Pseudo Grid-Search and Mini-Batch

# conda activate bert-github
#%%
import pandas as pd
import numpy as np
import os
import glob
import pathlib
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import string
import random
from sentence_transformers import SentenceTransformer, losses
from transformers import AutoModel, AutoTokenizer, pipelines
import torch
from topictuner import TopicModelTuner as TMT
from bertopic import BERTopic, representation, vectorizers
from umap import UMAP
import umap.plot
# from cuml.manifold import UMAP # use cuML to speed up UMAP through GPU acceleration (https://docs.rapids.ai/install#rapids-release-selector)
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from scipy.special import softmax
# Following [NBSL: A Supervised Classification Model of Pull Request in Github], I will use a topic-model approach to classify the knowledge domain of PRs.
# For the approach, I will use [BERTopic: Neural topic modeling with a class-based TF-IDF procedure].
| """ | |
| This document is for fine-tuning parameters: | |
| 1) Over-arching Pipeline: | |
| # The pipeline is as follows: | |
| # Embeddings (e.g., SBERT) -> Dimensionality Reduction (e.g., UMAP) -> Clustering (e.g., HDBSCAN) -> Tokenization -> Weighting (e.g., cTF-IDF) & Topic Representation (e.g., KeyBERTInspired) | |
| (see: https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html) | |
| 1-1) Embeddings: | |
| # For a transformer model, I will use [Collab-uniba/github-issues-preprocessed-mpnet-st-e10] | |
| # This model fine-tuned MPNET sentence transformer using GitHub issue data from NLBSE22 dataset | |
| # This model well recognize meanings (e.g., operating systems) in GitHub domain. | |
| # In fact, SentenceBert without fine-tuning out-performed most non-transformer models. | |
| (see, DupHunter: detecting duplicate pull requests in fork-based development) | |
| 1-2) UMAP: | |
| # Following the developer's recommendation I set n_neighbors to 200, as I have a large dataset | |
| (see: https://maartengr.github.io/BERTopic/faq.html#i-have-too-many-topics-how-do-i-decrease-them) | |
| # For n_components, I use a default (5) to overcome the curse of dimensionality reduction (too high n) and loss of into (too small n) | |
| (see: https://maartengr.github.io/BERTopic/getting_started/dim_reduction/dim_reduction.html) | |
| # Also use unique to avoid duplicates after cleaning texts | |
| (see: https://github.com/lmcinnes/umap/issues/771) | |
| # randome seed for repoducibility | |
| # the rests are default setting | |
| 1-3) HDBSCAN: | |
| # use TopicTuner to fine-tune parameters: # I use random-search + grid-search combo | |
| (see: https://github.com/drob-xx/TopicTuner) | |
| # Why? although large samples helps extract right domains | |
| # by discarding "bad" documents and extracting "good" documents, | |
| # it also discard a large percentage of "good" documents | |
| # so we need to optimize params to so as not to discard "good" documents | |
| # Fine-tuning with randomly selected 231836 samples (2%) | |
| # In my sample, >= 100K documents are sufficient to extract appropriate topics | |
| # other users also observe >=100K documents are considered sufficient to extract topics | |
| (see: https://github.com/MaartenGr/BERTopic/issues/836) | |
| (see: https://github.com/MaartenGr/BERTopic/issues/1613) | |
| # the other params are default setting | |
| 1-4) Tokenizer: | |
| # use sklearn countvectorizer | |
| # lowering the value of min_df increases processing time, but DO NOT reduce the quality of clustering | |
| # so I would recommend setting min_df <= 10 with >= 100K documents if you have enough computing power&time, | |
| (see: https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer) | |
| # use stop words to reduce duplicates | |
| # I do not use n_gram of 2 as 2-grams do not have different meaning than 1-gram + 1-gram in GitHub context (therby leading to duplicates) | |
| 1-5) c-TF-IDF & Representation: | |
| # use default setting | |
| # additionally, use reduce_frequent_words=True for ctfidf and MMR to reduce duplicates | |
| (see: https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#diversify-topic-representation) | |
| """ | |
# %% [Create functions]
# Preprocess text
def merge_texts(df, columns: list):
    df = df.dropna(subset=columns).reset_index(drop=True)
    df['text'] = df[columns[0]].astype(str) + ' ' + df[columns[1]].astype(str)
    df = df.drop(columns=columns)
    return df

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def replace_markdown_with_text(text):
    # Replace markdown links with their anchor text
    text = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', r'\1', text)
    # Replace markdown bold text with plain text
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    # Replace markdown italic text with plain text
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    return text

def remove_hashes(text):
    # Regular expression to detect hashes (hexadecimal strings of 7 characters or more)
    hash_pattern = r'\b[a-fA-F0-9]{7,}\b'
    # Replace hashes with [HASH] tokens
    text = re.sub(hash_pattern, '[HASH]', text)
    return text

def preprocess_text(text):
    # Use BeautifulSoup to strip HTML tags
    if text is not None:
        text = BeautifulSoup(text, 'lxml').get_text()
    # Remove markdown formatting
    text = replace_markdown_with_text(text)
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove emojis & emoticons
    text = remove_emoji(text)
    # Replace hashes with [HASH]
    text = remove_hashes(text)
    # Replace @users with [USER]
    text = re.sub(r'(?<!\S)@[\w]+(?!@)', '[USER]', text)
    # Replace punctuation with spaces
    PUNCT_TO_REMOVE = string.punctuation
    text = text.translate(str.maketrans(PUNCT_TO_REMOVE, ' ' * len(PUNCT_TO_REMOVE)))
    # Remove standalone numbers (digits that are not part of a word)
    text = re.sub(r'\s\d+', '', text)
    # Remove extra whitespace again
    text = re.sub(r'\s+', ' ', text).strip()
    # Lowercase the text
    text = text.lower()
    return text.strip()
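
# A quick, hypothetical usage example (not in the original gist) illustrating what
# preprocess_text does to a typical PR title/body string; the input and expected
# output below are illustrative only.
# >>> preprocess_text('Fixes **crash** in [parser](https://example.com/parser) '
# ...                 'reported by @octocat, see commit deadbeef1234.')
# 'fixes crash in parser reported by user see commit hash'
# Note that the [HASH] and [USER] placeholders lose their brackets in the later
# punctuation-removal step, so they surface as plain "hash"/"user" tokens.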
# %%
if __name__ == '__main__':
    ################
    # Import Dataset
    ################
    print('Import Dataset')
    # Basic configs
    path = r'G:/Data/GitHub_Collab/github-trending/'
    # Get repos with at least one PR between 2015 and 2023
    data_dir = pathlib.Path(path + 'PR_event')
    # Test sample
    counter = 1
    df = pd.DataFrame()
    for file in data_dir.glob('*.parquet.gzip'):
        if counter < 60:
            df = pd.concat([df, pd.read_parquet(file, columns=['PRID', 'RepoID', 'PRTitle', 'PRBody'])],
                           ignore_index=True)
            counter += 1
        else:
            break
    # Drop duplicates based on PR IDs
    df = df.drop_duplicates(subset=['PRID']).reset_index(drop=True)
    # Get repos in the cleaned sample
    base_path = r'F:\Projects\Automation\data\trending_sample\sample_panel'
    file = 'github_trending_sample_panel.parquet.gzip'
    df_base = pd.read_parquet(f'{base_path}\\{file}',
                              columns=['RepoID', 'RepoLanguage'])
    df_base = df_base.drop_duplicates().reset_index(drop=True)
    # Keep only repos that appear in the cleaned sample
    df = pd.merge(df, df_base, on=['RepoID'])
    del df_base
    # %%
    ###################
    # Pre-Process Texts
    ###################
    # Great source: https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing
    # https://ieeexplore.ieee.org/abstract/document/9785808
    print('Pre-Processing Texts')
    # Prepare input sentences for fine-tuning
    df = merge_texts(df, columns=['PRTitle', 'PRBody'])
    # Convert to dictionary
    df = df.to_dict('list')
    # Apply the preprocessing function
    progress_bar = tqdm(total=len(df['text']))
    for i, text in enumerate(df['text']):
        df['text'][i] = preprocess_text(text)
        progress_bar.update(1)  # update the progress bar
    # df['text'] = df['text'].progress_apply(preprocess_text)
    # Close the progress bar
    progress_bar.close()
    # %% Model Config (https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html)
    ########################
    # Fine-Tuning Parameters
    ########################
    print('Setting a Model')
    # Step 0: CUDA config
    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Step 1: Load the SentenceTransformer model with the specified architecture
    embedding_model = SentenceTransformer('Collab-uniba/github-issues-preprocessed-mpnet-st-e10',
                                          device=device)
    # Step 2: Dimensionality reduction: the default setting works well with high-dimensional embeddings
    dim_model = UMAP(n_neighbors=200, n_components=5, min_dist=0.0,
                     metric='cosine', unique=True, random_state=1)  # fix random_state for reproducibility
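    # Hedged alternative (see the commented cuML import at the top of the file): RAPIDS cuML's
    # UMAP could be swapped in for GPU acceleration. This is only a sketch and assumes a working
    # RAPIDS install; cuML's UMAP may not accept every umap-learn argument (e.g., `unique`),
    # so that option is dropped here.
    # from cuml.manifold import UMAP as cuUMAP
    # dim_model = cuUMAP(n_neighbors=200, n_components=5, min_dist=0.0,
    #                    metric='cosine', random_state=1)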
    # Step 3: Clustering
    cluster_model = HDBSCAN(min_cluster_size=365, min_samples=73,
                            metric='euclidean', cluster_selection_method='eom',
                            prediction_data=True, gen_min_span_tree=True)
    # Step 4: Tokenization of topics (https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer)
    vectorizer_model = CountVectorizer(stop_words="english", min_df=5)  # optional: ngram_range=(1, 2), min_df=10
    # Step 5-1: c-TF-IDF weighting
    ctfidf_model = vectorizers.ClassTfidfTransformer(reduce_frequent_words=True)  # optional: reduce_frequent_words=True further reduces frequent words in every topic
    # Step 5-2: Representations (optional)
    # rep_model1 = representation.KeyBERTInspired()
    rep_chain = [representation.KeyBERTInspired(nr_samples=1000),
                 representation.MaximalMarginalRelevance(diversity=.5)]
    # %% Fine-tuning parameters (reference: https://github.com/drob-xx/TopicTuner)
    ## Basic configs (N = 231836, i.e. ~2% of the 11,591,800 documents)
    batch_size = int(np.ceil(11591800 / 50))
    random.seed(1)
    batch_texts = random.sample(df['text'], batch_size)
    batch_embeddings = embedding_model.encode(batch_texts, show_progress_bar=True)
    tmt_path = r'F:\Projects\Automation\codes\Python\Similarity_Detection\tm_model_params'
    tmt = TMT(docs=batch_texts,
              embedding_model=embedding_model,
              reducer_model=dim_model,
              verbose=2)  # verbose turns tqdm on
    tmt.embeddings = batch_embeddings  # set embeddings after creating an instance
    tmt.reduce()
    ## Store and evaluate random-search results
    searches_result = tmt.randomSearch([*range(301, 500)],  # cluster sizes between 301 and 499
                                       [.1, .25, .5, .75, 1],  # min_samples as a fraction of cluster size
                                       40)  # number of random searches
    tmt.summarizeResults(searches_result).sort_values(by=['number_uncategorized'])
    ## Further explore the range between 350 and 400
    searches_result2 = tmt.randomSearch([*range(350, 401)],  # cluster sizes between 350 and 400
                                        [.1, .25, .5, .75, 1], 30)
    tmt.summarizeResults(searches_result2).sort_values(by=['number_uncategorized'])
    ## Lastly, pseudo grid search over the narrow range of cluster sizes (361-370) that worked best above
    searches_result3 = tmt.pseudoGridSearch([*range(361, 371)], [x / 100 for x in range(10, 101, 10)])
    ## Evaluate results: 270 / 27 shows the best performance - 113373 "good" and 118463 "bad" documents
    tmt.summarizeResults(searches_result3).sort_values(by=['number_uncategorized'])
    ## Save the TMT model and results
    tmt.save(f'{tmt_path}\\tmt_temp_300-500')
    searches_result.to_csv(f'{tmt_path}\\random_search_300-500.csv', index=False)
    searches_result2.to_csv(f'{tmt_path}\\random_search_round2_300-500.csv', index=False)
    searches_result3.to_csv(f'{tmt_path}\\pseudo_grid_search_300-500.csv', index=False)
    ## Visualization
    tmt.visualizeSearch(searches_result3).write_html(f"{tmt_path}\\2d_search_plot_300-500.html")
    tmt.createVizReduction('TSNE')
    tmt.visualizeEmbeddings(365, 73).write_html(f"{tmt_path}\\2d_topic_representation_300-500.html")
    # %% Test Model
    #########################
    # Test Model (Mini Batch)
    #########################
    # Before running this, reset the parameters above based on the fine-tuning results
    print('Test a model')
    ## Basic configs (N = 231836)
    batch_size = int(np.ceil(11591800 / 50))
    random.seed(42)  # test the model with a different sample
    batch_texts = random.sample(df['text'], batch_size)
    batch_embeddings = embedding_model.encode(batch_texts, show_progress_bar=True)
    # Train a base BERTopic model using a batch
    topic_model = BERTopic(
        # models
        embedding_model=embedding_model,
        umap_model=dim_model,
        hdbscan_model=cluster_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=rep_chain,  # optional
        # parameters
        nr_topics="auto",  # automatic topic reduction
        calculate_probabilities=True,  # the base model needs this to calculate the topic distribution per document
        verbose=True
    ).fit(batch_texts, embeddings=batch_embeddings)
    # Get the resulting topic info
    topic_model.get_topic_info()
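    # %% -----------------------------------------------------------------------
    # Hedged sketch (not part of the original gist): once the test model looks
    # reasonable, the remaining documents could be assigned topics in mini-batches
    # with BERTopic.transform(). The batch size and loop below are illustrative
    # assumptions only, left commented out so the script does not encode the full
    # corpus as a side effect.
    # all_topics = []
    # chunk = 100_000  # assumed mini-batch size
    # for start in range(0, len(df['text']), chunk):
    #     chunk_texts = df['text'][start:start + chunk]
    #     chunk_emb = embedding_model.encode(chunk_texts, show_progress_bar=True)
    #     topics, probs = topic_model.transform(chunk_texts, embeddings=chunk_emb)
    #     all_topics.extend(topics)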
  