This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports for the quantization micro-benchmark: stdlib first, then
# third-party (torch / numpy / optimum-quanto).
import time

import numpy as np
import torch
import torch.nn as nn
from optimum.quanto import (
    Calibration,
    freeze,
    qint4,
    qint8,
    quantize,
    qfloat8,
    qfloat8_e4m3fn,
)
from torch.profiler import ProfilerActivity, profile
# Benchmark sweep for the M (batch/sequence) dimension: powers of two
# from 1 up to 8192.
M_SHAPES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
# Fixed size for the other matrix dimension(s) — presumably N (and K) of
# the weight matrices being benchmarked; confirm against the benchmark loop.
N_SHAPE = 4096
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports for the text-generation timing script: stdlib first, then
# third-party (torch / transformers, including the static KV-cache helper).
import logging
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from transformers.cache_utils import StaticCache
# Checkpoint under test. The tiny checkpoint below is a lightweight
# alternative for quick smoke runs without downloading the 8B weights.
#model_id = "fxmarty/tiny-llama-fast-tokenizer"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
| tokenizer = AutoTokenizer.from_pretrained( |
OlderNewer