Last active
January 10, 2018 15:26
-
-
Save jojonki/fadf18da5f86d803f09493bc7e6a818d to your computer and use it in GitHub Desktop.
Performance on Titan V
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# see original code
# https://discuss.pytorch.org/t/solved-titan-v-on-pytorch-0-3-0-cuda-9-0-cudnn-7-0-is-much-slower-than-1080-ti/11320/10?u=jef
"""Benchmark FP32 vs FP16 training-step time for several torchvision models on a CUDA GPU.

For each architecture this times `num_runs` full training steps
(forward + loss + backward + optimizer step) at batch size 16 and
prints the average step time in milliseconds, once in FP32 and once
in FP16 (model and input cast with .half()).

Requires a CUDA-capable GPU; written for the PyTorch 0.3-era API
(torch.autograd.Variable).
"""
import torch
from torchvision.models import vgg16, densenet121, resnet152
from time import time
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim
from torch.autograd import Variable
import torchvision.models as models

# Let cuDNN pick the fastest convolution algorithms for the fixed input shape.
torch.backends.cudnn.benchmark = True

model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

print('cuda version=', torch.version.cuda)
print('cudnn version=', torch.backends.cudnn.version())


def _time_training_steps(arch, half, num_runs=100, batch_size=16):
    """Time `num_runs` training steps of torchvision model `arch` on CUDA.

    Args:
        arch: torchvision model name, looked up in models.__dict__.
        half: if True, benchmark in FP16 (model and inputs cast via .half()).
        num_runs: number of timed iterations (one extra warm-up run is added).
        batch_size: images per step (3x224x224 random input).

    Returns:
        List of per-step durations in seconds, length `num_runs`.
    """
    model = models.__dict__[arch]().cuda()
    if half:
        model = model.half()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 0.001,
                                momentum=0.9,
                                weight_decay=1e-5)
    durations = []
    for i in range(num_runs + 1):
        x = torch.rand(batch_size, 3, 224, 224)
        x_var = torch.autograd.Variable(x).cuda()
        if half:
            x_var = x_var.half()
        # Dummy class-1 labels; target stays LongTensor even in FP16.
        target = Variable(torch.LongTensor(batch_size).fill_(1).cuda())
        # Synchronize so the timer brackets only this step's GPU work.
        torch.cuda.synchronize()
        t1 = time()
        # Reset gradients each step; the original omitted this, so gradients
        # accumulated across all iterations and each step did extra stale work.
        optimizer.zero_grad()
        out = model(x_var)
        err = criterion(out, target)
        err.backward()
        optimizer.step()
        torch.cuda.synchronize()
        t2 = time()
        # treat the initial run as warm up and don't count
        if i > 0:
            durations.append(t2 - t1)
    return durations


for arch in ['densenet121', 'vgg16', 'resnet152']:
    durations = _time_training_steps(arch, half=False)
    print('{} FP 32 avg over {} runs: {} ms'.format(arch, len(durations), sum(durations) / len(durations) * 1000))

    durations = _time_training_steps(arch, half=True)
    print('{} FP 16 avg over {} runs: {} ms'.format(arch, len(durations), sum(durations) / len(durations) * 1000))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| cuda version= 9.1.85 | |
| cudnn version= 7005 | |
| densenet121 FP 32 avg over 100 runs: 85.31552791595459 ms | |
| densenet121 FP 16 avg over 100 runs: 66.39776229858398 ms | |
| vgg16 FP 32 avg over 100 runs: 108.35402250289917 ms | |
| vgg16 FP 16 avg over 100 runs: 67.54538059234619 ms | |
| resnet152 FP 32 avg over 100 runs: 168.9338254928589 ms | |
| resnet152 FP 16 avg over 100 runs: 107.77381420135498 ms |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment