Skip to content

Instantly share code, notes, and snippets.

View tomaarsen's full-sized avatar

Tom Aarsen tomaarsen

View GitHub Profile
@tomaarsen
tomaarsen / tnp_example.py
Last active September 8, 2019 09:40
Example usage of the Python wrapper for The Noun Project API
# See the Documentation for more information: https://cubiedev.github.io/TheNounProjectAPI
# Install the module with "pip install TheNounProjectAPI".
# This sample works with TheNounProjectAPI version 1.0.5 or newer.
from TheNounProjectAPI import API
if __name__ == "__main__":
# API Key and Secret from https://api.thenounproject.com/getting_started.html#creating-an-api-key
key = "<my api key>"
secret = "<my api secret>"
@tomaarsen
tomaarsen / demo.py
Created April 20, 2023 12:33
Logging losses for SetFit
import functools
from dataclasses import dataclass
from typing import Callable
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from torch import nn
import wandb
from setfit import SetFitModel, SetFitTrainer, sample_dataset
@tomaarsen
tomaarsen / cookiecutter.sh
Created April 21, 2023 12:00
Checkout a working, updated RAFT Submission repo
# Clone the full original repo into a local directory
git clone https://huggingface.co/datasets/ought/raft-submission local-raft-submission
cd local-raft-submission
# Fetch the PR and place it in a branch called pr/3
git fetch origin pr/3:pr/3
# Checkout the new branch
git checkout pr/3
@tomaarsen
tomaarsen / train_span_marker_keyphrase.py
Created August 9, 2023 16:46
Keyphrase extraction model with SpanMarker
from datasets import load_dataset, concatenate_datasets
from transformers import TrainingArguments
from span_marker import SpanMarkerModel, Trainer
def main() -> None:
# Load the dataset, ensure "tokens" and "ner_tags" columns, and get a list of labels
dataset = load_dataset("midas/inspec", "extraction")
dataset = dataset.rename_columns({"document": "tokens", "doc_bio_tags": "ner_tags"})
# Map string labels to integer labels instead
@tomaarsen
tomaarsen / handler.py
Created September 20, 2023 08:56
SpanMarker handler.py for Inference Endpoints
from typing import Any, Dict, List
from span_marker import SpanMarkerModel
class EndpointHandler:
def __init__(self, model_id: str) -> None:
self.model = SpanMarkerModel.from_pretrained(model_id)
# Try to move the model to CUDA; do nothing if that fails
self.model.try_cuda()
@tomaarsen
tomaarsen / demo.py
Created December 7, 2023 20:54
Attention Sinks in `transformers` showcase
from transformers import AutoTokenizer, SinkCache, LlamaForCausalLM, TextStreamer
import torch
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = LlamaForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16
)
inputs = tokenizer(["Vaswani et al. (2017) introduced the Transformers"], return_tensors="pt").to(model.device)
cache = SinkCache(window_length=256, num_sink_tokens=4)
from functools import partial
import datasets
from sentence_transformers import (
SentenceTransformer,
evaluation,
)
from torch.nn import functional as F
stsb = datasets.load_dataset("mteb/stsbenchmark-sts", split="test")
@tomaarsen
tomaarsen / snowflake_arctic_trust_remote_code.ipynb
Created April 24, 2024 15:16
Snowflake_Arctic_trust_remote_code.ipynb
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@tomaarsen
tomaarsen / export_locally.py
Created October 15, 2024 12:30
Export Sentence Transformer models to ONNX (+ optimization, quantization) & OpenVINO
# requires sentence_transformers>=3.2.0
from sentence_transformers import SentenceTransformer, export_optimized_onnx_model, export_dynamic_quantized_onnx_model
# The model to export to ONNX (plus optimized and quantized variants) and to OpenVINO
model_id = "mixedbread-ai/mxbai-embed-large-v1"
# Where to save the exported models locally
output_dir = model_id.replace("/", "-")
onnx_model = SentenceTransformer(model_id, backend="onnx", model_kwargs={"export": True})
onnx_model.save_pretrained(output_dir)