Fine-tune an embedding model for Ollama
pip install torch torchvision torchaudio scikit-learn
pip install sentence-transformers transformers nomic einops
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random
# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed()
# Load the base model
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True
)
# Example data: (anchor, positive, negative) triplets
triplets = [
    ("Audi Q7 2023", "Audi A6 2022", "BMW X5 2023"),
    ("Audi Q7 2023", "Audi A6 2022", "Skoda Octavia"),
    ("Audi Q7 2023", "Audi A5 2022", "Renault Logan"),
    ("Mercedes S-Class", "Mercedes E-Class", "Audi A8"),
    ("Mercedes S-Class", "Mercedes E-Class", "Audi A5"),
    ("BMW 5 Series", "BMW 3 Series", "Lada Vesta"),
    ("Toyota Camry", "Toyota Corolla", "Lada Vesta"),
    ("Lada Vesta", "Lada Granta", "Renault Logan"),
    ("Skoda Octavia", "Volkswagen Passat", "Toyota Camry")
]
# Dataset wrapping the triplets
class TripletDataset(Dataset):
    def __init__(self, triplets):
        self.triplets = triplets

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        return self.triplets[idx]

dataset = TripletDataset(triplets)
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=6)
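
# Note: with the default collate_fn, each batch yielded by this DataLoader is
# not a list of per-sample triplets but three groups -- all anchors, all
# positives, all negatives -- because default collation transposes a batch of
# (a, p, n) tuples. The training loop below unpacks it accordingly.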
# Move the model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.train()
# Loss function and optimizer
triplet_loss = nn.TripletMarginLoss(margin=1.5, p=2)
optimizer = AdamW(model.parameters(), lr=2e-5)
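
# For reference, TripletMarginLoss with p=2 computes, per triplet:
#   loss = max(||a - p||_2 - ||a - n||_2 + margin, 0)
# i.e. it pushes the anchor at least `margin` closer to the positive than
# to the negative in Euclidean distance.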
# Evaluation helper: prints the pairwise cosine-similarity matrix
def test_model(model, test_texts):
    model.eval()
    with torch.no_grad():
        embeddings = model.encode(test_texts)
    sim_matrix = cosine_similarity(embeddings)
    print("\n🔍 Similarity matrix:")
    for i, row in enumerate(sim_matrix):
        print(f"{test_texts[i][:15]:<15} " + " ".join([f"{x:.2f}" for x in row]))
    model.train()
    return sim_matrix
# Test data
test_texts = ["Audi Q7", "Audi A6", "BMW X5", "Mercedes S-Class", "Toyota Camry", "Toyota Corolla", "Lada Granta", "Skoda Octavia"]
print("🧪 Test BEFORE training:")
test_model(model, test_texts)
# Training
print("\n🔁 Starting training...")
for epoch in range(10):
    total_loss = 0
    for batch in train_dataloader:
        # Default collation transposes the batch of (a, p, n) tuples into
        # three groups, so unpack them directly
        anchors, positives, negatives = batch
        anchors, positives, negatives = list(anchors), list(positives), list(negatives)
        # Concatenate all texts for a single batched forward pass
        all_texts = anchors + positives + negatives
        # Tokenize and call the model directly so the computation graph is
        # preserved (model.encode() would detach gradients)
        features = model.tokenize(all_texts)
        features = {k: v.to(device) for k, v in features.items()}
        output = model(features)
        all_embeddings = output['sentence_embedding']
        all_embeddings = torch.nn.functional.normalize(all_embeddings, p=2, dim=1)
        # Split the results back into anchors / positives / negatives
        batch_size = len(anchors)
        anchor_emb = all_embeddings[:batch_size]
        positive_emb = all_embeddings[batch_size:2*batch_size]
        negative_emb = all_embeddings[2*batch_size:]
        # Compute the loss
        loss = triplet_loss(anchor_emb, positive_emb, negative_emb)
        total_loss += loss.item()
        # Update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} | Average loss: {total_loss/len(train_dataloader):.4f}")
print("\n🧪 Тест ПОСЛЕ обучения:")
test_model(model, test_texts)
# Save the fine-tuned model
output_path = "./fine_tuned_nomic_embed1"
model.save(output_path)
print(f"\n✅ Model saved to {output_path}")