do-me · January 29, 2025 12:56
diff --git a/unique_embeddings.py b/unique_embeddings.py
 from FlagEmbedding import BGEM3FlagModel
 model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

 # Embed every distinct text only once, then copy the vectors back to all rows.
 # Assumes gdf is a (geo)pandas dataframe whose "texts" column holds the
 # strings to run inference on.

 # Step 1: Get the list of texts to encode
 gdf_list = gdf["texts"].to_list()

 # Step 2: Deduplicate the texts, keeping first-occurrence order.
 # dict.fromkeys (unlike set) preserves insertion order, so unique_texts is
 # the same on every run instead of depending on string hash randomization.
 # No indices need to be tracked: the mapping in Step 4 restores row order.
 unique_texts = list(dict.fromkeys(gdf_list))

 # Step 3: Perform inference only on the unique texts
 unique_embeddings = model.encode(unique_texts, batch_size=12, max_length=2048)['dense_vecs'] # e.g. with bge-m3

 # Step 4: Create a mapping from unique texts to embeddings
 text_to_embedding = dict(zip(unique_texts, unique_embeddings))

 # Step 5: Rebuild the embeddings in the original row order using the mapping
 # (duplicate texts share the same embedding entry)
 embeddings = [text_to_embedding[text] for text in gdf_list]

 # Step 6: 'embeddings' now has one entry per element of gdf_list, in order; assign it to a pandas column
 gdf["embeddings"] = embeddings
	from FlagEmbedding import BGEM3FlagModel
	model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

	# Embed every distinct text only once, then copy the vectors back to all rows.
	# Assumes gdf is a (geo)pandas dataframe whose "texts" column holds the
	# strings to run inference on.

	# Step 1: Get the list of texts to encode
	gdf_list = gdf["texts"].to_list()

	# Step 2: Deduplicate the texts, keeping first-occurrence order.
	# dict.fromkeys (unlike set) preserves insertion order, so unique_texts is
	# the same on every run instead of depending on string hash randomization.
	# No indices need to be tracked: the mapping in Step 4 restores row order.
	unique_texts = list(dict.fromkeys(gdf_list))

	# Step 3: Perform inference only on the unique texts
	unique_embeddings = model.encode(unique_texts, batch_size=12, max_length=2048)['dense_vecs'] # e.g. with bge-m3

	# Step 4: Create a mapping from unique texts to embeddings
	text_to_embedding = dict(zip(unique_texts, unique_embeddings))

	# Step 5: Rebuild the embeddings in the original row order using the mapping
	# (duplicate texts share the same embedding entry)
	embeddings = [text_to_embedding[text] for text in gdf_list]

	# Step 6: 'embeddings' now has one entry per element of gdf_list, in order; assign it to a pandas column
	gdf["embeddings"] = embeddings