fxmarty · July 17, 2024 14:53
diff --git a/benchmark_quanto.py b/benchmark_quanto.py
 import torch
 import torch.nn as nn
 import time
 import numpy as np

 from optimum.quanto import Calibration, freeze, qint4, qint8, quantize, qfloat8, qfloat8_e4m3fn
 from torch.profiler import ProfilerActivity, profile

 # Row counts (the M dimension of the input) swept by the benchmark:
 # the powers of two from 1 up to 8192.
 M_SHAPES = [2**exponent for exponent in range(14)]
 # Output (N) and input (K) feature sizes used for the linear layer.
 N_SHAPE = 4096
 K_SHAPE = 4096

 class MyModel(nn.Module):
    """Minimal module holding one bias-free linear layer (K_SHAPE -> N_SHAPE)."""

    def __init__(self):
        super().__init__()
        # The only layer; the script reaches its weight through ``model.lin1``.
        self.lin1 = nn.Linear(in_features=K_SHAPE, out_features=N_SHAPE, bias=False)

    def forward(self, inp):
        """Apply ``lin1`` to ``inp`` and return the result."""
        projected = self.lin1(inp)
        return projected

 def keyword_to_itype(k):
    """Translate a dtype keyword into the quantization type imported from ``optimum.quanto``.

    ``"none"`` maps to ``None``. A keyword that is not in the table raises
    ``KeyError``.
    """
    itype_by_keyword = {
        "none": None,
        "int4": qint4,
        "int8": qint8,
        "float8": qfloat8,
        "float8_e4m3fn": qfloat8_e4m3fn,
    }
    return itype_by_keyword[k]

 # Benchmark configuration.
 seed = 42
 weights = "float8_e4m3fn"
 activations = "none"
 batch_size = 10  # not referenced in the visible part of this script
 samples = 10  # not referenced in the visible part of this script

 # Seed BEFORE the model is built: the layer's weights are randomly
 # initialised at construction time, so seeding afterwards would leave them
 # different on every run.
 torch.manual_seed(seed)
 device = torch.device("cuda")

 # Half-precision model in eval mode, moved to the GPU.
 model = MyModel().to(torch.float16)
 model = model.eval()
 model = model.to(device)

 # Copy of the weight taken before quantization; used later as the operand
 # of the native (non-quantized) linear call.
 original_weight = model.lin1.weight.data.clone()


 print("Float model")
 # From here on ``weights`` / ``activations`` hold the looked-up type objects
 # (or None) instead of the keyword strings.
 weights = keyword_to_itype(weights)
 activations = keyword_to_itype(activations)

 print("------ QUANTIZING")
 quantize(model, weights=weights, activations=activations)

 print("------ FREEZING")
 freeze(model)
 print(f"Quantized model (w: {weights}, a: {activations})")

 print("--------- INFERENCE")
 def run_linear_marlin(inp, weight):
    """Call the ``quanto_ext.fp8_marlin`` op directly on ``inp`` and a quantized weight.

    Args:
        inp: Activation tensor. All leading dimensions are flattened into a
            single row dimension before the op is called.
        weight: Object exposing ``_data``, ``_scale`` and ``_workspace``;
            these are forwarded to the op unchanged.

    Returns:
        The op's output, reshaped to the leading dimensions of the original
        ``inp`` followed by ``weight._scale.shape[1]``.
    """
    workspace = weight._workspace
    scale = weight._scale
    size_n = scale.shape[1]

    # Remember the leading dimensions before ``inp`` is rebound to its 2-D
    # view; reading them afterwards would always yield a 2-D output shape.
    leading_shape = tuple(inp.shape[:-1])

    inp = inp.view(-1, inp.shape[-1])
    out = torch.ops.quanto_ext.fp8_marlin(
        inp,
        b_q_weight=weight._data,
        b_scales=scale,
        workspace=workspace,
        num_bits=8,
        size_m=inp.shape[0],
        size_n=size_n,
        size_k=inp.shape[1],
    )
    return out.reshape(leading_shape + (size_n,))

 def run_native_linear(inp, weight):
    """Baseline path: apply ``torch.nn.functional.linear`` to ``inp`` and ``weight`` without a bias."""
    baseline_out = torch.nn.functional.linear(inp, weight, bias=None)
    return baseline_out

 # Number of timed repetitions per input shape.
 n_runs = 50
 # Latency samples in milliseconds, one independent list per benchmarked path.
 tps_quanto_model, tps_ops_call, tps_native = [], [], []

 def benchmark_func(func, kwargs):
    """Time one call of ``func(**kwargs)`` using a pair of CUDA events.

    Args:
        func: Callable to time.
        kwargs: Mapping of keyword arguments passed to ``func``.

    Returns:
        The time elapsed between the start and end events, in milliseconds.
    """
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Let previously queued GPU work finish so it is not counted in this call.
    torch.cuda.synchronize()
    start_event.record()
    func(**kwargs)
    end_event.record()

    # Wait on the end event itself; this keeps the function independent of
    # any module-level ``device`` variable.
    end_event.synchronize()

    return start_event.elapsed_time(end_event)

 n = N_SHAPE
 k = K_SHAPE

 # CSV report: one header line, then one line per value of m.
 result = "m,n_out,k_in,mean_quanto_model_ms,mean_ops_call_ms,mean_native_ms\n"
 with torch.no_grad():
    for m in M_SHAPES:
        inp = torch.rand(m, K_SHAPE, dtype=torch.float16).to(device)

        # Untimed first call of every path (warm-up) before measuring.
        model(inp)
        run_linear_marlin(inp, model.lin1.weight)
        run_native_linear(inp, original_weight)

        # Fresh sample lists for every shape: sharing one list across shapes
        # would make each reported mean include the timings of all earlier m.
        tps_quanto_model = []
        tps_ops_call = []
        tps_native = []

        for _ in range(n_runs):
            tps_quanto_model.append(benchmark_func(model, kwargs={"inp": inp}))
            tps_ops_call.append(
                benchmark_func(run_linear_marlin, kwargs={"inp": inp, "weight": model.lin1.weight})
            )
            tps_native.append(
                benchmark_func(run_native_linear, kwargs={"inp": inp, "weight": original_weight})
            )

        # Means are computed once per shape, after all samples are collected.
        mean_quanto_model = np.mean(tps_quanto_model)
        mean_ops_call = np.mean(tps_ops_call)
        mean_native = np.mean(tps_native)

        result += f"{m},{n},{k},{mean_quanto_model:.4f},{mean_ops_call:.4f},{mean_native:.4f}\n"

 print(result)
	import torch
	import torch.nn as nn
	import time
	import numpy as np

	from optimum.quanto import Calibration, freeze, qint4, qint8, quantize, qfloat8, qfloat8_e4m3fn
	from torch.profiler import ProfilerActivity, profile

	# Row counts (the M dimension of the input) swept by the benchmark:
	# the powers of two from 1 up to 8192.
	M_SHAPES = [2**exponent for exponent in range(14)]
	# Output (N) and input (K) feature sizes used for the linear layer.
	N_SHAPE = 4096
	K_SHAPE = 4096

	class MyModel(nn.Module):
	    """Minimal module holding one bias-free linear layer (K_SHAPE -> N_SHAPE)."""

	    def __init__(self):
	        super().__init__()
	        # The only layer; the script reaches its weight through ``model.lin1``.
	        self.lin1 = nn.Linear(K_SHAPE, N_SHAPE, bias=False)

	    def forward(self, inp):
	        """Apply ``lin1`` to ``inp`` and return the result."""
	        return self.lin1(inp)

	def keyword_to_itype(k):
	    """Translate a dtype keyword into the quantization type imported from ``optimum.quanto``.

	    ``"none"`` maps to ``None``. A keyword that is not in the table raises
	    ``KeyError``.
	    """
	    return {"none": None, "int4": qint4, "int8": qint8, "float8": qfloat8, "float8_e4m3fn": qfloat8_e4m3fn}[k]

	# Benchmark configuration.
	seed = 42
	weights = "float8_e4m3fn"
	activations = "none"
	batch_size = 10  # not referenced in the visible part of this script
	samples = 10  # not referenced in the visible part of this script

	# Seed BEFORE the model is built: the layer's weights are randomly
	# initialised at construction time, so seeding afterwards would leave them
	# different on every run.
	torch.manual_seed(seed)
	device = torch.device("cuda")

	# Half-precision model in eval mode, moved to the GPU.
	model = MyModel().to(torch.float16)
	model = model.eval()
	model = model.to(device)

	# Copy of the weight taken before quantization; used later as the operand
	# of the native (non-quantized) linear call.
	original_weight = model.lin1.weight.data.clone()


	print("Float model")
	# From here on ``weights`` / ``activations`` hold the looked-up type objects
	# (or None) instead of the keyword strings.
	weights = keyword_to_itype(weights)
	activations = keyword_to_itype(activations)

	print("------ QUANTIZING")
	quantize(model, weights=weights, activations=activations)

	print("------ FREEZING")
	freeze(model)
	print(f"Quantized model (w: {weights}, a: {activations})")

	print("--------- INFERENCE")
	def run_linear_marlin(inp, weight):
	    """Call the ``quanto_ext.fp8_marlin`` op directly on ``inp`` and a quantized weight.

	    Args:
	        inp: Activation tensor. All leading dimensions are flattened into a
	            single row dimension before the op is called.
	        weight: Object exposing ``_data``, ``_scale`` and ``_workspace``;
	            these are forwarded to the op unchanged.

	    Returns:
	        The op's output, reshaped to the leading dimensions of the original
	        ``inp`` followed by ``weight._scale.shape[1]``.
	    """
	    workspace = weight._workspace
	    scale = weight._scale
	    size_n = scale.shape[1]

	    # Remember the leading dimensions before ``inp`` is rebound to its 2-D
	    # view; reading them afterwards would always yield a 2-D output shape.
	    leading_shape = tuple(inp.shape[:-1])

	    inp = inp.view(-1, inp.shape[-1])
	    out = torch.ops.quanto_ext.fp8_marlin(
	        inp,
	        b_q_weight=weight._data,
	        b_scales=scale,
	        workspace=workspace,
	        num_bits=8,
	        size_m=inp.shape[0],
	        size_n=size_n,
	        size_k=inp.shape[1],
	    )
	    return out.reshape(leading_shape + (size_n,))

	def run_native_linear(inp, weight):
	    """Baseline path: apply ``torch.nn.functional.linear`` to ``inp`` and ``weight`` without a bias."""
	    return torch.nn.functional.linear(inp, weight)

	# Number of timed repetitions per input shape.
	n_runs = 50
	# Latency samples in milliseconds, one independent list per benchmarked path.
	tps_quanto_model, tps_ops_call, tps_native = [], [], []

	def benchmark_func(func, kwargs):
	    """Time one call of ``func(**kwargs)`` using a pair of CUDA events.

	    Args:
	        func: Callable to time.
	        kwargs: Mapping of keyword arguments passed to ``func``.

	    Returns:
	        The time elapsed between the start and end events, in milliseconds.
	    """
	    start_event = torch.cuda.Event(enable_timing=True)
	    end_event = torch.cuda.Event(enable_timing=True)

	    # Let previously queued GPU work finish so it is not counted in this call.
	    torch.cuda.synchronize()
	    start_event.record()
	    func(**kwargs)
	    end_event.record()

	    # Wait on the end event itself; this keeps the function independent of
	    # any module-level ``device`` variable.
	    end_event.synchronize()

	    return start_event.elapsed_time(end_event)

	n = N_SHAPE
	k = K_SHAPE

	# CSV report: one header line, then one line per value of m.
	result = "m,n_out,k_in,mean_quanto_model_ms,mean_ops_call_ms,mean_native_ms\n"
	with torch.no_grad():
	    for m in M_SHAPES:
	        inp = torch.rand(m, K_SHAPE, dtype=torch.float16).to(device)

	        # Untimed first call of every path (warm-up) before measuring.
	        model(inp)
	        run_linear_marlin(inp, model.lin1.weight)
	        run_native_linear(inp, original_weight)

	        # Fresh sample lists for every shape: sharing one list across shapes
	        # would make each reported mean include the timings of all earlier m.
	        tps_quanto_model = []
	        tps_ops_call = []
	        tps_native = []

	        for _ in range(n_runs):
	            tps_quanto_model.append(benchmark_func(model, kwargs={"inp": inp}))
	            tps_ops_call.append(
	                benchmark_func(run_linear_marlin, kwargs={"inp": inp, "weight": model.lin1.weight})
	            )
	            tps_native.append(
	                benchmark_func(run_native_linear, kwargs={"inp": inp, "weight": original_weight})
	            )

	        # Means are computed once per shape, after all samples are collected.
	        mean_quanto_model = np.mean(tps_quanto_model)
	        mean_ops_call = np.mean(tps_ops_call)
	        mean_native = np.mean(tps_native)

	        result += f"{m},{n},{k},{mean_quanto_model:.4f},{mean_ops_call:.4f},{mean_native:.4f}\n"

	print(result)