layernorm_vs_fused
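A small benchmark comparing PyTorch's built-in nn.LayerNorm against NVIDIA Apex's FusedLayerNorm on a CUDA tensor of shape (64, 16, 224, 224): each implementation is warmed up for 50 iterations and then timed over 1000 forward passes. Running it requires a CUDA-capable GPU and an Apex installation.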
import time

import torch
import torch.nn as nn
from apex.normalization import FusedLayerNorm

torch.backends.cudnn.benchmark = True

# Create data
x = torch.randn(64, 16, 224, 224, device='cuda')

# upstream layernorm
norm = nn.LayerNorm(x.size()[1:]).cuda()

# warmup iterations
for _ in range(50):
    _ = norm(x)

nb_iters = 1000
torch.cuda.synchronize()
t0 = time.time()
for _ in range(nb_iters):
    _ = norm(x)
torch.cuda.synchronize()
t1 = time.time()
print('upstream layernorm {:.3f}'.format(t1 - t0))

# apex fused layernorm
fused_norm = FusedLayerNorm(x.size()[1:]).cuda()

# warmup iterations
for _ in range(50):
    _ = fused_norm(x)

nb_iters = 1000
torch.cuda.synchronize()
t0 = time.time()
for _ in range(nb_iters):
    _ = fused_norm(x)
torch.cuda.synchronize()
t1 = time.time()
print('apex fused layernorm {:.3f}'.format(t1 - t0))
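
Not part of the original gist, but a quick sanity check one might append to confirm that the two implementations agree numerically (it assumes the x, norm, and fused_norm objects defined above):

# Sanity check (hypothetical addition): the fused kernel should match the
# upstream nn.LayerNorm output within floating-point tolerance.
with torch.no_grad():
    out_ref = norm(x)
    out_fused = fused_norm(x)
print('max abs diff: {:.3e}'.format((out_ref - out_fused).abs().max().item()))
print('allclose:', torch.allclose(out_ref, out_fused, atol=1e-5, rtol=1e-5))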