Skip to content

Instantly share code, notes, and snippets.

@ChakshuGautam
ChakshuGautam / chunking_by_embedding.py
Last active July 3, 2024 08:07
BERT Embedding based chunking of texts
# --------------------------------------------------------------------- #
# Chunking Mechanism #
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
def calculate_embedding_difference(embeddings):
    """Return the cosine distance between each pair of consecutive embeddings.

    Element ``i`` of the result is ``1 - cos(embeddings[i], embeddings[i + 1])``,
    so the result has ``len(embeddings) - 1`` entries (and is empty when fewer
    than two embeddings are supplied).

    Args:
        embeddings: A sequence (list, 2-D array, ...) of array-like embeddings.
            Each item is flattened to a single vector before comparison, so
            items shaped ``(dim,)`` and ``(1, dim)`` are both accepted.

    Returns:
        A list of NumPy float scalars, one per adjacent pair.

    Raises:
        ValueError: If the embeddings do not all have the same number of
            elements, or if any value is NaN or infinite.
    """
    if len(embeddings) < 2:
        return []

    # Flatten every item to one row; this keeps the per-item reshape(1, -1)
    # semantics while letting the whole computation run as one NumPy pass
    # instead of one library call per pair.
    matrix = np.stack(
        [np.asarray(item, dtype=float).reshape(-1) for item in embeddings]
    )
    if not np.isfinite(matrix).all():
        raise ValueError("embeddings contain NaN or infinite values")

    norms = np.linalg.norm(matrix, axis=1)
    # A zero vector has no direction: leave it unscaled so its similarity with
    # anything is 0 (distance 1) rather than dividing by zero and yielding NaN.
    norms[norms == 0] = 1.0
    unit_rows = matrix / norms[:, np.newaxis]

    # Row-wise dot product of each vector with its successor.
    similarities = np.einsum("ij,ij->i", unit_rows[:-1], unit_rows[1:])
    return list(1 - similarities)