Skip to content

Instantly share code, notes, and snippets.

View tomaarsen's full-sized avatar

Tom Aarsen tomaarsen

View GitHub Profile
@tomaarsen
tomaarsen / train_script.py
Created July 9, 2025 13:00
Boilerplate to train a reranker model using Sentence Transformers
import logging
import traceback
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import (
CrossEncoder,
CrossEncoderModelCardData,
@tomaarsen
tomaarsen / train_script.py
Created August 28, 2025 14:30
MS MARCO Contrastive and/or Distillation sample training script
import argparse
import logging
import traceback
from collections import defaultdict
from collections.abc import Iterable
from enum import Enum, auto
import torch
from datasets import load_dataset
from torch import Tensor
@tomaarsen
tomaarsen / update_e5_nl.py
Created September 24, 2025 16:08
Script to update all E5-NL models to be nicely integrated with Sentence Transformers
import re
from huggingface_hub import get_collection, ModelCard
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Normalize
collection = get_collection(collection_slug="clips/e5-nl-68be9d3760240ce5c7d9f831")
ST_SNIPPET_PATTERN = r"""\
from sentence_transformers import SentenceTransformer
model = SentenceTransformer\((?:'|")([a-zA-Z0-9_\/\.-]+?)(?:'|")\)