fxmarty

@fxmarty
fxmarty / benchmark_quanto.py
Created July 17, 2024 14:53
benchmark quanto
import torch
import torch.nn as nn
import time
import numpy as np
from optimum.quanto import Calibration, freeze, qint4, qint8, quantize, qfloat8, qfloat8_e4m3fn
from torch.profiler import ProfilerActivity, profile
# GEMM M dimensions swept during the benchmark
M_SHAPES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
# fixed hidden size (N) of the benchmarked linear layer
N_SHAPE = 4096
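
The preview cuts off here. A minimal sketch of how the rest of such a benchmark could continue with the optimum.quanto calls imported above, assuming a single weight-only quantized linear layer is timed per M shape (the layer shape, warmup/iteration counts, and the qint8 choice are illustrative assumptions, not taken from the gist):

def benchmark_linear(m, n, weights_dtype, device="cuda"):
    # Wrap the layer in a container so quantize() can swap it for its quantized counterpart
    model = nn.Sequential(nn.Linear(n, n, bias=False)).to(device, torch.float16)
    quantize(model, weights=weights_dtype)  # assumption: weight-only quantization
    freeze(model)                           # materialize the quantized weights
    inp = torch.rand(m, n, dtype=torch.float16, device=device)
    for _ in range(5):                      # warmup
        model(inp)
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(20):
        model(inp)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / 20

for m in M_SHAPES:
    print(f"M={m}: {benchmark_linear(m, N_SHAPE, qint8) * 1e3:.3f} ms")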
@fxmarty
fxmarty / transformers_compile.py
Created July 25, 2024 14:47
transformers_compile.py
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch
from transformers.cache_utils import StaticCache
import logging
import time
#model_id = "fxmarty/tiny-llama-fast-tokenizer"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
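
The preview ends here. A plausible continuation, given the filename and the StaticCache import, is the documented transformers pattern of compiling the model's forward with a static KV cache; the dtype, prompt, and generation settings below are illustrative assumptions, not taken from the gist:

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

# Static-shaped KV cache so torch.compile does not re-specialize on every new token
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

inputs = tokenizer("What is quantization?", return_tensors="pt").to("cuda")

# First call triggers compilation (slow); later calls reuse the compiled graph
for i in range(3):
    start = time.perf_counter()
    output = model.generate(**inputs, do_sample=False, max_new_tokens=32)
    torch.cuda.synchronize()
    print(f"run {i}: {time.perf_counter() - start:.2f} s")

print(tokenizer.decode(output[0], skip_special_tokens=True))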