TensorRT supports two approaches to preparing a model for quantization: calibration or training.
First, we need to add TRT pytorch_quantization.nn layers to the model, or replace its regular nn layers with them. These quantization layers gather the statistics required for quantization.
Once the model is modified we can use the following approaches to gather statistics before quantization:
- Calibrate the pre-trained model
- Train (1 epoch) the pre-trained model
Resulting model should be exported to ONNX
ONNX model can be converted to int8 TRT engine
# Start an interactive container named "py2301" from the NVIDIA PyTorch 23.01 image,
# with all GPUs visible and the host's ~/workspace mounted at /root/workspace.
# The container shares the host IPC namespace, has no memlock limit and a stack ulimit of 67108864.
docker run -ti -v ~/workspace:/root/workspace \
--gpus all --name py2301 \
--ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
nvcr.io/nvidia/pytorch:23.01-py3
Install additional packages
# Upgrade pip, then install the extra Python packages imported later in this guide:
# pycuda (device buffers and streams for the TensorRT run) and torchinfo (model summary).
pip3 install -U pip
pip3 install pycuda torchinfo
Get test image
# Download the sample image used for the sanity-check predictions and save it as cat.jpg.
# -L follows redirects; -f makes curl exit with an error on an HTTP failure instead of
# silently saving the server's error page as cat.jpg.
curl -fL -o cat.jpg https://i.ibb.co/tXK0D91/Screen-Shot-2023-02-07-at-12-11-08-PM.jpg
# Fetch the torchvision sources; only the reference classification scripts are needed.
cd ~/workspace
git clone https://github.com/pytorch/vision.git torchvision
cd torchvision/references/classification
# Make the reference scripts importable (e.g. `from train import evaluate`).
# Prepend to PYTHONPATH instead of overwriting it, and quote the value so a path
# containing spaces survives.
export PYTHONPATH="$PWD${PYTHONPATH:+:$PYTHONPATH}"
cd ~/workspace
# Create the datasets directory (-p keeps this step re-runnable) and work inside it
cd ~/workspace && mkdir -p datasets && cd datasets

# Option 1: download the archives directly from image-net.org
# NOTE(review): --no-check-certificate disables TLS certificate verification; drop the flag
# if the server certificate validates in your environment.
# Train dataset - 138 GB
wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_train.tar --no-check-certificate
# Validation dataset - 6.3 GB
wget https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar --no-check-certificate

# Option 2: alternatively, get the datasets via torrents from https://academictorrents.com
# NOTE(review): the relative Downloads/ path in the mv commands assumes transmission-cli saved
# the archives under ./Downloads - confirm the download directory for your setup.
# Train
wget https://academictorrents.com/download/a306397ccf9c2ead27155983c254227c0fd938e2.torrent
transmission-cli a306397ccf9c2ead27155983c254227c0fd938e2.torrent
mv Downloads/ILSVRC2012_img_train.tar .
# Validation
wget https://academictorrents.com/download/5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5.torrent
transmission-cli 5d6d0df7ed81efd49ca99ea4737e0ae5e3a5f2e5.torrent
mv Downloads/ILSVRC2012_img_val.tar .
# To extract and prepare Train Dataset
mkdir -p imagenet/train
cd imagenet/train
tar -xvf ../../ILSVRC2012_img_train.tar
# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category
# Lets extract them to corresponding folders:
#  - `IFS= read -r` keeps every path exactly as find printed it (no backslash/whitespace mangling)
#  - the && chain deletes an archive only after it has been extracted successfully
find . -name "*.tar" | while IFS= read -r NAME ; do mkdir -p "${NAME%.tar}" && tar -xvf "${NAME}" -C "${NAME%.tar}" && rm -f "${NAME}"; done
cd ~/workspace/datasets
# To extract and prepare Validation Dataset
mkdir -p imagenet/val
cd imagenet/val
tar -xvf ../../ILSVRC2012_img_val.tar
# Get the valprep.sh script from soumith's imagenetloader.torch repository and run it.
# This script creates all class directories and moves images into corresponding directories.
# NOTE(review): this pipes a remotely fetched script straight into bash; consider downloading
# and inspecting it before running.
wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash
cd ~/workspace
from pathlib import Path
import os
import torch
from torchvision import datasets
from torchvision import transforms
# Settings shared by the training and validation loaders.
dataset_dir = os.path.join(Path.home(), "workspace", "datasets", "imagenet")
batch_size = 64
kwargs = dict(num_workers=4, pin_memory=True)
# Per-channel normalization applied to both training and validation images.
norm = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Training: random flip + random resized crop to 224, then tensor conversion and normalization.
train_dir = os.path.join(dataset_dir, "train")
train_trans = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    norm,
]
train_transform = transforms.Compose(train_trans)
train_data = datasets.ImageFolder(train_dir, transform=train_transform)

# Keep a seeded 10% subset of the training set to speed up the process;
# the remaining 90% is discarded.
split_rng = torch.Generator().manual_seed(42)
train_data_small, _ = torch.utils.data.random_split(
    train_data, [0.1, 0.9], generator=split_rng
)
train_data_loader = torch.utils.data.DataLoader(
    train_data_small,
    shuffle=True,
    batch_size=batch_size,
    **kwargs,
)

# Validation: deterministic resize to 232 and center crop to 224.
val_dir = os.path.join(dataset_dir, "val")
val_trans = [
    transforms.Resize(232),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    norm,
]
val_transform = transforms.Compose(val_trans)
val_data = datasets.ImageFolder(val_dir, transform=val_transform)
val_data_loader = torch.utils.data.DataLoader(
    val_data,
    shuffle=True,
    batch_size=batch_size,
    **kwargs,
)
Prepare a special ResNet50 model in which the normal nn layers are automatically replaced with TRT pytorch_quantization.nn layers (e.g. QuantConv2d).
import torch
from torchvision.io import read_image
from torchvision.models import resnet50, ResNet50_Weights
from torchinfo import summary
from tqdm import tqdm
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import calib
from pytorch_quantization.tensor_quant import QuantDescriptor
# Set default QuantDescriptor to use histogram based calibration for activation.
# This has to happen before the model is constructed so the new layers pick it up.
quant_desc_input = QuantDescriptor(calib_method='histogram')
quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)

# Now new models will automatically have QuantConv2d layers instead of regular Conv2d
from pytorch_quantization import quant_modules
quant_modules.initialize()

# Build a single-image batch from the test picture using the preprocessing
# transforms that ship with the pretrained weights.
img = read_image("cat.jpg")
weights = ResNet50_Weights.DEFAULT
preprocess = weights.transforms()
batch = preprocess(img).unsqueeze(0)

model = resnet50(weights=weights)
model = model.eval()
summary(model, batch.shape)  # make sure the model consist of QuantConv2d layers

# Sanity check: classify the test image on the GPU.
model = model.cuda()
batch = batch.cuda()
# Forward pass only - no_grad avoids building an autograd graph that is never used.
with torch.no_grad():
    prediction = model(batch).squeeze(0).softmax(0)
class_id = prediction.argmax().item()
score = prediction[class_id].item()
category_name = weights.meta["categories"][class_id]
print(f"{category_name}: {100 * score:.1f}%")
# `train` is the torchvision reference classification script, importable through the
# PYTHONPATH exported after cloning the torchvision repository.
from train import evaluate
criterion = torch.nn.CrossEntropyLoss()
# Evaluate the model on the validation loader before any calibration is done;
# gradients are not needed for evaluation.
with torch.no_grad():
    evaluate(model, criterion, val_data_loader, device="cuda", print_freq=20)
Switch the model to calibration mode and feed data into it
def collect_stats(model, data_loader, num_batches):
    """Feed data to the network and collect calibration statistics.

    Every ``TensorQuantizer`` that has a calibrator is switched to calibration
    mode (quantization off, calibration on) and every quantizer without one is
    disabled. Exactly ``num_batches`` batches from ``data_loader`` are then run
    through the model on the GPU, after which the quantizers are switched back
    (quantization on, calibration off / re-enabled).

    Args:
        model: Module whose ``named_modules()`` are scanned for quantizers.
        data_loader: Iterable yielding ``(data, target)`` pairs; targets are ignored.
        num_batches: Number of batches to feed to the model.
    """
    # Enable calibrators
    for _, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()

    progress_bar = tqdm(data_loader, total=num_batches, desc='Calibrate')
    for i, (data, _) in enumerate(progress_bar):
        # Check the limit BEFORE the forward pass so that exactly num_batches
        # batches contribute statistics (checking afterwards fed one extra batch).
        if i >= num_batches:
            break
        model(data.cuda())

    # Disable calibrators
    for _, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()
def compute_amax(model, **kwargs):
    """Load the calibration result (amax) into every calibrated quantizer of ``model``.

    Quantizers without a calibrator are skipped. Quantizers backed by a
    ``MaxCalibrator`` load their amax without extra arguments; all other
    calibrators receive ``kwargs`` (e.g. ``method``, ``percentile``).
    """
    for _, quantizer in model.named_modules():
        if not isinstance(quantizer, quant_nn.TensorQuantizer):
            continue
        calibrator = quantizer._calibrator
        if calibrator is None:
            continue
        # MaxCalibrator is loaded without any options; the rest get the caller's options.
        load_options = {} if isinstance(calibrator, calib.MaxCalibrator) else kwargs
        quantizer.load_calib_amax(**load_options)
# Move the model to the GPU and collect calibration statistics from the reduced
# training loader with num_batches=20; no gradients are needed for this.
model.cuda()
with torch.no_grad():
    collect_stats(model, train_data_loader, num_batches=20)
We can try different calibrations and see which one works the best
# Load amax with each calibration method in turn and evaluate the model after each one.
# Each entry is (printed label, method name, extra options for compute_amax).
# The amax values left in the model afterwards are those of the last entry ("mse").
calibration_runs = (
    ("percentile 99.99", "percentile", {"percentile": 99.99}),
    ("percentile 99.9", "percentile", {"percentile": 99.9}),
    ("entropy", "entropy", {}),
    ("mse", "mse", {}),
)
for label, method, extra_options in calibration_runs:
    with torch.no_grad():
        print(f"{label} calibration")
        compute_amax(model, method=method, **extra_options)
        evaluate(model, criterion, val_data_loader, device="cuda", print_freq=20)
Save weights
# Save calibrated model state dictionary.
# NOTE(review): the saved state reflects whichever compute_amax call ran last - confirm
# that this is the calibration method you want to keep.
torch.save(model.state_dict(), "quant_resnet50-calibrated.pth")
We can fine-tune the calibrated model to improve accuracy further.
import torch
from torchvision.models import resnet50
# Rebuild the network and restore the calibrated state saved earlier.
# NOTE(review): this presumably relies on quant_modules.initialize() still being active in
# the session so that resnet50() is built with quantized layers matching the checkpoint -
# confirm when running this step on its own.
model = resnet50()
model.load_state_dict(torch.load("quant_resnet50-calibrated.pth"))
model=model.cuda()
# `train` is the torchvision reference classification script (see the PYTHONPATH setup).
from train import train_one_epoch
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
# NOTE(review): lr_scheduler is created but neither stepped nor passed on in the code shown here.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
# Minimal stand-in for the options object that train_one_epoch receives as its last argument.
class args:
    print_freq=10
    clip_grad_norm=None
    model_ema_steps=32
    lr_warmup_epochs=0
# Fine-tune on the reduced training loader (the 0 is presumably the epoch index - confirm).
train_one_epoch(model, criterion, optimizer, train_data_loader, "cuda", 0, args)
# Save the model
torch.save(model.state_dict(), "quant_resnet50-finetuned.pth")
from pytorch_quantization import nn as quant_nn
# NOTE(review): presumably switches the quantizers to an ONNX-exportable fake-quant
# implementation - confirm against the pytorch_quantization documentation.
quant_nn.TensorQuantizer.use_fb_fake_quant = True
# Random single-image input on the GPU used to trace the model for export.
dummy_input = torch.randn(1, 3, 224, 224, device="cuda")
input_names = ["input0"]
output_names = ["output0"]
# Mark dimension 0 of both the input and the output as a dynamic "batch" axis.
dynamic_axes = {"input0": {0: "batch"}, "output0": {0: "batch"}}
# Sets the model to inference mode - train(False)
model = model.eval()
# Forward pass on the dummy input before exporting; y is not used afterwards in the code shown.
y = model(dummy_input)
# Write the traced model to quant_resnet50.onnx using the names and dynamic axes defined above.
torch.onnx.export(
    model,
    dummy_input,
    "quant_resnet50.onnx",
    verbose=True,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
)
# Build an INT8 TensorRT engine from the exported ONNX model and save it as quant_resnet50.trt.
# The min/opt/max shapes declare the batch-size range for the input named "input0":
# 1 (min), 8 (opt) and 16 (max) images of 3x224x224.
trtexec \
--int8 \
--verbose \
--onnx=quant_resnet50.onnx \
--saveEngine=quant_resnet50.trt \
--minShapes=input0:1x3x224x224 \
--optShapes=input0:8x3x224x224 \
--maxShapes=input0:16x3x224x224
==== Run TRT Engine using python API ====
import numpy as np
from torchvision.io import read_image
from torchvision.models import ResNet50_Weights
# Load the test image and preprocess it with the transforms of the default ResNet50 weights,
# producing a single-image NumPy array with a leading batch axis.
img = read_image("cat.jpg")
preprocess = ResNet50_Weights.DEFAULT.transforms()
single_image = preprocess(img).unsqueeze(0).numpy()
# Replicate the image 8 times along the batch axis. This count has to match the
# BATCH_SIZE used for the engine input shape and the device buffers.
batch = np.concatenate([single_image] * 8)
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import sys
import numpy as np
# Deserialize the engine built by trtexec.
trt_logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(trt_logger)
fpath="quant_resnet50.trt"
with open(fpath, "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
# deserialize_cuda_engine returns None on failure; fail here with a clear message
# instead of an AttributeError on the next line.
if engine is None:
    raise RuntimeError(f"Failed to deserialize TensorRT engine from {fpath!r}")
context = engine.create_execution_context()

# Fix the dynamic batch dimension for this run.
BATCH_SIZE = 8
context.set_input_shape("input0", (BATCH_SIZE, 3, 224, 224))

print("Engine Info:")
# Use the name-based tensor API (same API family as set_input_shape above).
# Shapes are read from the execution context so the dynamic batch dimension is already
# resolved; the shape includes the batch dimension, so nothing is prepended to it.
for i in range(engine.num_io_tensors):
    binding = engine.get_tensor_name(i)
    tensor_shape = context.get_tensor_shape(binding)
    shape = list(tensor_shape)
    dtype = trt.nptype(engine.get_tensor_dtype(binding))
    volume = trt.volume(tensor_shape)
    if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
        desc = "input"
    else:
        desc = "output"
    print(f"{i} type: {desc}\n binding: {binding} \n data: {np.dtype(dtype).name}\n shape: {shape} => {volume} \n")
# Element type of the host output buffer.
USE_FP16 = False
if USE_FP16:
    target_dtype = np.float16
else:
    target_dtype = np.float32
# Host array that receives the results: 1000 values per image in the batch.
output = np.empty((BATCH_SIZE, 1000), dtype=target_dtype)

# allocate device memory sized after the host input batch and the host output array
d_input = cuda.mem_alloc(batch.nbytes)
d_output = cuda.mem_alloc(output.nbytes)
# Device addresses handed to the execution call: input first, then output.
bindings = [int(device_ptr) for device_ptr in (d_input, d_output)]
stream = cuda.Stream()
def predict(batch): # result gets copied into output
    """Run one inference on ``batch`` and leave the result in the module-level ``output``.

    Relies on the module-level ``d_input``, ``d_output``, ``bindings``, ``stream``
    and ``context``. Returns ``None``.
    NOTE(review): assumes ``batch`` is a host array whose byte size matches the
    ``d_input`` allocation - confirm at the call site.
    """
    # transfer input data to device
    cuda.memcpy_htod_async(d_input, batch, stream)
    # execute model
    context.execute_async_v2(bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # wait until all work queued on the stream has finished
    stream.synchronize()
# Run once and report the top class id for every image in the batch.
predict(batch)
best_ids = np.argmax(output, axis=-1)
print("Best class ids:", best_ids)

# Warmup
for _ in range(100):
    predict(batch)

# Measure Latency
import time

TT = []
for _ in range(100):
    # perf_counter is a monotonic, high-resolution clock meant for timing short intervals;
    # time.time() can jump when the system clock is adjusted.
    t0 = time.perf_counter()
    predict(batch)
    t1 = time.perf_counter()
    # Milliseconds per image: the batch time divided by BATCH_SIZE.
    TT.append((t1 - t0) * 1000 / BATCH_SIZE)

print("AVG time (ms):",np.mean(TT))
print("P50 time (ms):",np.percentile(TT, 50))
print("P95 time (ms):",np.percentile(TT, 95))