Skip to content

Instantly share code, notes, and snippets.

View tomaarsen's full-sized avatar

Tom Aarsen tomaarsen

View GitHub Profile
@tomaarsen
tomaarsen / export_locally.py
Created October 15, 2024 12:30
Export Sentence Transformer models to ONNX (+ optimization, quantization) & OpenVINO
# requires sentence_transformers>=3.2.0
from sentence_transformers import SentenceTransformer, export_optimized_onnx_model, export_dynamic_quantized_onnx_model
# The model to export to ONNX (+ optimization, quantization) and OpenVINO
model_id = "mixedbread-ai/mxbai-embed-large-v1"
# Where to save the exported models locally
output_dir = model_id.replace("/", "-")
onnx_model = SentenceTransformer(model_id, backend="onnx", model_kwargs={"export": True})
onnx_model.save_pretrained(output_dir)
@tomaarsen
tomaarsen / snowflake_arctic_trust_remote_code.ipynb
Created April 24, 2024 15:16
Snowflake_Arctic_trust_remote_code.ipynb
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
from functools import partial
import datasets
from sentence_transformers import (
SentenceTransformer,
evaluation,
)
from torch.nn import functional as F
stsb = datasets.load_dataset("mteb/stsbenchmark-sts", split="test")
@tomaarsen
tomaarsen / demo.py
Created December 7, 2023 20:54
Attention Sinks in `transformers` showcase
from transformers import AutoTokenizer, SinkCache, LlamaForCausalLM, TextStreamer
import torch
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = LlamaForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16
)
inputs = tokenizer(["Vaswani et al. (2017) introduced the Transformers"], return_tensors="pt").to(model.device)
cache = SinkCache(window_length=256, num_sink_tokens=4)
@tomaarsen
tomaarsen / handler.py
Created September 20, 2023 08:56
SpanMarker handler.py for Inference Endpoints
from typing import Any, Dict, List
from span_marker import SpanMarkerModel
class EndpointHandler:
def __init__(self, model_id: str) -> None:
self.model = SpanMarkerModel.from_pretrained(model_id)
# Try to place it on CUDA, do nothing if it fails
self.model.try_cuda()
@tomaarsen
tomaarsen / train_span_marker_keyphrase.py
Created August 9, 2023 16:46
Keyphrase extraction model with SpanMarker
from datasets import load_dataset, concatenate_datasets
from transformers import TrainingArguments
from span_marker import SpanMarkerModel, Trainer
def main() -> None:
# Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
dataset = load_dataset("midas/inspec", "extraction")
dataset = dataset.rename_columns({"document": "tokens", "doc_bio_tags": "ner_tags"})
# Map string labels to integer labels instead
@tomaarsen
tomaarsen / cookiecutter.sh
Created April 21, 2023 12:00
Check out a working, updated RAFT Submission repo
# Clone the full original repo into a local directory
git clone https://huggingface.co/datasets/ought/raft-submission local-raft-submission
cd local-raft-submission
# Fetch the PR and place it in a branch called pr/3
git fetch origin pr/3:pr/3
# Check out the new branch
git checkout pr/3
@tomaarsen
tomaarsen / demo.py
Created April 20, 2023 12:33
Logging losses for SetFit
import functools
from dataclasses import dataclass
from typing import Callable
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from torch import nn
import wandb
from setfit import SetFitModel, SetFitTrainer, sample_dataset
@tomaarsen
tomaarsen / tnp_example.py
Last active September 8, 2019 09:40
Example usage of the Python wrapper for The Noun Project API
# See the Documentation for more information: https://cubiedev.github.io/TheNounProjectAPI
# Install the module using "pip install TheNounProjectAPI".
# This sample works for versions >= 1.0.5
from TheNounProjectAPI import API
if __name__ == "__main__":
# API Key and Secret from https://api.thenounproject.com/getting_started.html#creating-an-api-key
key = "<my api key>"
secret = "<my api secret>"