Alvaro Bartolome (alvarobartt)

# pip install "distilabel[vllm]>=1.1.1"
# pip install flash-attn --no-build-isolation
# huggingface-cli login
import time
from distilabel.llms import vLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns, LoadHubDataset
from distilabel.steps.tasks import PrometheusEval
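The preview cuts off at the imports; the sketch below shows how these pieces are typically wired together in a distilabel pipeline. The dataset, column mappings, rubric, and run arguments are assumptions for illustration, not the original gist's values.

with Pipeline(name="prometheus-eval") as pipeline:
    load_dataset = LoadHubDataset(
        name="load_dataset",
        repo_id="HuggingFaceH4/instruction-dataset",  # assumed dataset, replace as needed
        split="test",
        output_mappings={"prompt": "instruction", "completion": "generation"},
    )
    task = PrometheusEval(
        name="prometheus_eval",
        llm=vLLM(model="prometheus-eval/prometheus-7b-v2.0"),
        mode="absolute",
        rubric="factual-validity",
    )
    keep_columns = KeepColumns(
        name="keep_columns",
        columns=["instruction", "generation", "feedback", "result"],
    )
    load_dataset >> task >> keep_columns

if __name__ == "__main__":
    start = time.time()
    distiset = pipeline.run(use_cache=False)
    print(f"Took {time.time() - start:.2f}s")
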
import time
from typing import Any, Dict, Literal
from distilabel.llms import vLLM
from distilabel.llms.typing import ChatType
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks.prometheus_eval import PrometheusEval
_CUSTOM_RUBRICS = {
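    # NOTE: illustrative placeholder entry; the gist's actual custom rubrics are truncated in this preview.
    # PrometheusEval expects each rubric as a name mapped to a criteria string following the
    # "[question]\nScore 1: ...\n...\nScore 5: ..." layout used by the default rubrics.
    "custom-quality": (
        "[Does the response fully and accurately answer the given instruction?]\n"
        "Score 1: The response ignores or misunderstands the instruction.\n"
        "Score 2: The response addresses the instruction only partially.\n"
        "Score 3: The response answers the instruction with noticeable omissions.\n"
        "Score 4: The response answers the instruction accurately with minor flaws.\n"
        "Score 5: The response is complete, accurate, and well grounded."
    ),
}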
@alvarobartt
alvarobartt / kv_cache_computation.py
Last active March 27, 2025 17:04
KV Cache Size Computation
from transformers import AutoConfig

if __name__ == "__main__":
    config = AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", token="hf_...")
    tokens_in_cache = 1024  # this is the only arg that will change over time (as more requests are sent)
    precision_in_bytes = 2  # float16 or bfloat16
    # KV cache size = 2 (keys and values) * layers * KV heads * head dim * tokens * bytes per value
    cache_size_bytes = (
        2
        * config.num_hidden_layers
        * config.num_key_value_heads
        * (config.hidden_size // config.num_attention_heads)
        * tokens_in_cache
        * precision_in_bytes
    )
@alvarobartt
alvarobartt / torch_text_generation_mps.py
Last active August 20, 2024 07:09
Simple script using `torch` for text generation with a `transformers` model, one token at a time, on MPS.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Define the model name
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
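The preview stops right after loading the model; below is a minimal sketch of the remaining steps, assuming greedy decoding and a placeholder prompt (the original gist's prompt and generation settings are not shown here).

model = model.to("mps")
model.eval()

prompt = "What is gravity?"  # placeholder prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("mps")

# Generate one token at a time with greedy decoding
for _ in range(64):
    with torch.no_grad():
        logits = model(input_ids).logits
    next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
    input_ids = torch.cat([input_ids, next_token], dim=-1)
    if next_token.item() == tokenizer.eos_token_id:
        break

print(tokenizer.decode(input_ids[0], skip_special_tokens=True))
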
@alvarobartt
alvarobartt / diffusers_flux_lora_inference.py
Last active August 31, 2024 08:42
Run a FLUX LoRA with `diffusers` using the `alvarobartt/ghibli-characters-flux-lora` adapter weights
import torch
from diffusers import DiffusionPipeline
model_id = "black-forest-labs/FLUX.1-dev"
adapter_id = "alvarobartt/ghibli-characters-flux-lora"
pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipeline.load_lora_weights(adapter_id)
pipeline.to("cuda")
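The preview ends before any image is generated; a minimal sketch of the inference call follows, assuming a generic Ghibli-style prompt (the LoRA's actual trigger phrase may differ) and typical FLUX.1-dev settings.

prompt = "Ghibli style, a portrait of a young adventurer in a lush forest"  # placeholder prompt
image = pipeline(
    prompt=prompt,
    height=1024,
    width=1024,
    num_inference_steps=30,
    guidance_scale=3.5,
    generator=torch.Generator(device="cuda").manual_seed(42),
).images[0]
image.save("ghibli_character.png")
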
@alvarobartt
alvarobartt / duckdb_query_to_datasets.py
Created September 12, 2024 15:49
DuckDB SQL query to datasets.Dataset
import duckdb
from datasets import Dataset
# Create DuckDB connection
con = duckdb.connect()
con.execute("INSTALL httpfs;")
con.execute("LOAD httpfs;")
# Query the dataset
query = """
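-- Placeholder query: the gist's actual SQL is truncated in this preview.
SELECT *
FROM 'hf://datasets/HuggingFaceH4/instruction-dataset/data/*.parquet'
LIMIT 1000
"""

# Materialize the query result and convert it into a datasets.Dataset
# (the hf:// path assumes DuckDB's httpfs extension, loaded above, with Hugging Face support)
df = con.execute(query).df()
dataset = Dataset.from_pandas(df)
print(dataset)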
@alvarobartt
alvarobartt / required_vram.py
Last active January 31, 2025 15:50
Calculates the required VRAM for different precisions based on the number of parameters of a model (pulled from the Hugging Face Hub Safetensors metadata). This Gist is inspired by https://gist.github.com/philschmid/d188034c759811a7183e7949e1fa0aa4.
from huggingface_hub import get_safetensors_metadata
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
precision = "F8"
dtype_bytes = {"F32": 4, "F16": 2, "BF16": 2, "F8": 1, "INT8": 1, "INT4": 0.5}
metadata = get_safetensors_metadata(model_id)
memory = ((sum(metadata.parameter_count.values()) * dtype_bytes[precision]) / (1024**3)) * 1.18  # 1.18 adds ~18% overhead on top of the raw weights
print(f"{model_id=} requires {memory=}GB")
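As a rough sanity check: Mistral-7B-Instruct-v0.1 has about 7.24B parameters, so the F8 estimate above works out to roughly 7.24e9 * 1 / 1024**3 * 1.18 ≈ 8 GB of VRAM.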
@alvarobartt
alvarobartt / estimate-deepseek-vram.py
Created January 31, 2025 15:52
Calculates the required VRAM for DeepSeek R1 (pulled from the Hugging Face Hub Safetensors metadata)
from huggingface_hub import get_safetensors_metadata

model_id = "deepseek-ai/DeepSeek-R1"
# "BF16" covers the checkpoint's non-quantized tensors; "F8" covers the FP8-quantized weights
dtype_bytes = {"F32": 4, "BF16": 2, "F16": 2, "F8": 1}
metadata = get_safetensors_metadata(model_id)
# Sum per dtype (e.g. "F8_E4M3" -> "F8"), convert to GiB, and add ~18% inference overhead
memory = (
    sum(count * dtype_bytes[key.split("_")[0]] for key, count in metadata.parameter_count.items())
    / (1024**3)
    * 1.18
)
print(f"{model_id=} requires {memory=}GB")