Fine-tune an embedding model for Ollama
pip install torch torchvision torchaudio scikit-learn
pip install nomic transformers sentence-transformers einops
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random
# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed()
# Load the base model (its custom architecture code requires trust_remote_code)
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True
)
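# Note: nomic-embed-text-v1.5 was trained with task prefixes such as
# "search_document: " and "search_query: "; the toy texts below omit them for
# brevity, but adding the appropriate prefix may improve embedding quality.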
# Example data: (anchor, positive, negative) triplets
triplets = [
    ("Audi Q7 2023", "Audi A6 2022", "BMW X5 2023"),
    ("Audi Q7 2023", "Audi A6 2022", "Skoda Octavia"),
    ("Audi Q7 2023", "Audi A5 2022", "Renault Logan"),
    ("Mercedes S-Class", "Mercedes E-Class", "Audi A8"),
    ("Mercedes S-Class", "Mercedes E-Class", "Audi A5"),
    ("BMW 5 Series", "BMW 3 Series", "Lada Vesta"),
    ("Toyota Camry", "Toyota Corolla", "Lada Vesta"),
    ("Lada Vesta", "Lada Granta", "Renault Logan"),
    ("Skoda Octavia", "Volkswagen Passat", "Toyota Camry")
]
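# Triplet objective: pull each anchor toward its positive and push it away from
# its negative in embedding space. Nine hand-written triplets are only enough
# for a toy demo; a real fine-tune would need substantially more data, ideally
# with harder negatives.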
# Dataset wrapping the raw triplets
class TripletDataset(Dataset):
    def __init__(self, triplets):
        self.triplets = triplets

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        return self.triplets[idx]

dataset = TripletDataset(triplets)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=6)
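# Note on batching: because each dataset item is a 3-tuple, torch's default
# collate_fn transposes the batch, so each yielded batch is
# [list_of_anchors, list_of_positives, list_of_negatives] rather than a list
# of per-item tuples. The training loop below relies on this layout.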
# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.train()
# Loss function and optimizer
triplet_loss = nn.TripletMarginLoss(margin=1.5, p=2)
optimizer = AdamW(model.parameters(), lr=2e-5)
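# For reference, PyTorch's TripletMarginLoss computes
#   loss = max(d(anchor, positive) - d(anchor, negative) + margin, 0)
# with d the p-norm distance. Since the embeddings are L2-normalized below,
# d lies in [0, 2], so margin=1.5 is a fairly aggressive choice; smaller
# margins (e.g. 0.5-1.0) are also common and may train more smoothly.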
# Evaluation helper: prints a cosine-similarity matrix over the test texts
def test_model(model, test_texts):
    model.eval()
    with torch.no_grad():
        embeddings = model.encode(test_texts)
    sim_matrix = cosine_similarity(embeddings)
    print("\n🔍 Similarity matrix:")
    for i, row in enumerate(sim_matrix):
        print(f"{test_texts[i][:15]:<15} " + " ".join([f"{x:.2f}" for x in row]))
    model.train()
    return sim_matrix
# Test data
test_texts = ["Audi Q7", "Audi A6", "BMW X5", "Mercedes S-Class", "Toyota Camry",
              "Toyota Corolla", "Lada Granta", "Skoda Octavia"]

print("🧪 Test BEFORE training:")
test_model(model, test_texts)
# Training
print("\n🔁 Starting training...")
for epoch in range(10):
    total_loss = 0
    for batch in train_dataloader:
        # Default collate transposes the batch of 3-tuples (see note above):
        # batch == [anchors, positives, negatives], each a list of strings
        anchors, positives, negatives = batch

        # Concatenate all texts for a single batched forward pass
        all_texts = list(anchors) + list(positives) + list(negatives)

        # Get embeddings while keeping the computation graph
        # (model.encode() would return detached results, breaking backprop)
        features = model.tokenize(all_texts)
        features = {k: v.to(device) for k, v in features.items()}
        output = model(features)
        all_embeddings = output['sentence_embedding']
        all_embeddings = torch.nn.functional.normalize(all_embeddings, p=2, dim=1)

        # Split the embeddings back into the three roles
        batch_size = len(anchors)
        anchor_emb = all_embeddings[:batch_size]
        positive_emb = all_embeddings[batch_size:2*batch_size]
        negative_emb = all_embeddings[2*batch_size:]

        # Compute the loss
        loss = triplet_loss(anchor_emb, positive_emb, negative_emb)
        total_loss += loss.item()

        # Update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} | Average loss: {total_loss/len(train_dataloader):.4f}")
print("\n🧪 Тест ПОСЛЕ обучения:") | |
test_model(model, test_texts) | |
# Save the fine-tuned model
output_path = "./fine_tuned_nomic_embed1"
model.save(output_path)
print(f"\n✅ Model saved to {output_path}")