Skip to content

Instantly share code, notes, and snippets.

@tomaarsen
Created March 25, 2024 09:34
Show Gist options
  • Save tomaarsen/2f6d9691dc4d24687998b7462ece7646 to your computer and use it in GitHub Desktop.
Save tomaarsen/2f6d9691dc4d24687998b7462ece7646 to your computer and use it in GitHub Desktop.
from functools import partial
import datasets
from sentence_transformers import (
SentenceTransformer,
evaluation,
)
from torch.nn import functional as F
# Load the "test" split of the "mteb/stsbenchmark-sts" dataset; the evaluation
# code below reads its "sentence1", "sentence2" and "score" columns.
stsb = datasets.load_dataset("mteb/stsbenchmark-sts", split="test")
def decorator(fn, dim):
    """Wrap an encode callable so its embeddings are truncated and re-normalized.

    Args:
        fn: The encode callable to wrap. It is invoked with the wrapper's own
            arguments plus ``convert_to_tensor=True`` and its result must
            expose ``.cpu()`` returning a tensor.
        dim: Number of leading entries of the last axis to keep.

    Returns:
        A callable that forwards its arguments to ``fn``, moves the result to
        the CPU, keeps the first ``dim`` entries of the last axis and
        L2-normalizes along that same axis.
    """
    from functools import wraps

    @wraps(fn)
    def wrapper(*args, **kwargs):
        # Force a tensor result so the slicing/normalization below applies,
        # regardless of what the caller asked for.
        kwargs["convert_to_tensor"] = True
        output = fn(*args, **kwargs).cpu()
        # Nomic v1.5 needs layer normalization:
        # output = F.layer_norm(output, (output.shape[1],))
        output = output[..., :dim]
        # Normalize along the last axis (the one that was just truncated) so
        # that both batched (2-D) and single-input (1-D) results are handled.
        return F.normalize(output, p=2, dim=-1)

    return wrapper
model = SentenceTransformer("tomaarsen/mpnet-base-nli")
# Keep a handle on the unpatched encode so every iteration wraps the original
# rather than stacking truncations on top of each other.
old_encode = model.encode

# The evaluator depends only on the dataset, not on the truncation dimension,
# so it is built once instead of once per loop iteration.
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    stsb["sentence1"],
    stsb["sentence2"],
    # Gold scores are divided by 5 (presumably mapping a 0-5 scale to 0-1).
    [score / 5 for score in stsb["score"]],
    main_similarity=evaluation.SimilarityFunction.COSINE,
    name="sts-test",
)

# Evaluate the same model with embeddings truncated to ever fewer dimensions.
for dim in [768, 512, 256, 128, 64, 32, 16, 8, 4]:
    model.encode = decorator(old_encode, dim=dim)
    results = evaluator(model)
    print(dim, results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment