# In-Place Memory Pinning in PyTorch
# See https://github.com/pytorch/pytorch/issues/32167#issuecomment-753551842
#
# Copyright 2023 Carlos Alberto da Costa Filho <[email protected]>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
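#
# The script benchmarks four ways of getting a large CPU tensor onto the GPU:
#   1. Paged: a regular (pageable) CPU tensor copied as-is.
#   2. Pinned out-of-place: Tensor.pin_memory(), which copies the data into a
#      freshly allocated page-locked buffer before transferring.
#   3. Pinned in-place: cudaHostRegister on the tensor's existing storage, so
#      the memory is page-locked without an extra host-side copy.
#   4. Shared: Tensor.share_memory_() followed by cudaHostRegister, to show
#      that in-place registration also works on shared-memory-backed storage.
# Every step is timed and wrapped in an NVTX range so it can be inspected in
# an Nsight Systems trace (see the nsys command at the bottom of the file).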
import gc
from time import perf_counter_ns

import psutil
import torch
from torch.cuda import cudart, empty_cache, nvtx, synchronize

ARRAY_MEMORY_GiB = 3
GiB_to_B = 1024 * 1024 * 1024
BITS_PER_BYTE = 8
DTYPE = torch.float32
n = (ARRAY_MEMORY_GiB * GiB_to_B * BITS_PER_BYTE) // torch.finfo(DTYPE).bits
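
# Sanity check: 3 GiB of float32 (4 bytes per element) is
# 3 * 1024**3 / 4 = 805_306_368 elements.
assert n * torch.finfo(DTYPE).bits // BITS_PER_BYTE == ARRAY_MEMORY_GiB * GiB_to_B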

class PerfBlock:
    """Context manager used to instrument each step of the benchmark.

    On entry it runs the garbage collector, prints the free/total host RAM and
    GPU memory, opens an NVTX range named after the block, and starts a timer;
    on exit it closes the range and prints the block's runtime in ms.
    """

    def __init__(self, name: str) -> None:
        self.name = name

    def __enter__(self):
        gc.collect()
        ram = psutil.virtual_memory()
        free_ram, total_ram = ram.available, ram.total
        free_gpu, total_gpu = cudart().cudaMemGetInfo(0)
        print(f"Free RAM: {free_ram / GiB_to_B:.1f} GiB / {total_ram / GiB_to_B:.1f} GiB")
        print(f"Free GPU: {free_gpu / GiB_to_B:.1f} GiB / {total_gpu / GiB_to_B:.1f} GiB")
        print(self.name)
        nvtx.range_push(self.name)
        self.tic = perf_counter_ns()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.toc = perf_counter_ns()
        nvtx.range_pop()
        print(f"Runtime: {1e-6 * (self.toc - self.tic):.1f} ms")
        print()

free, total = cudart().cudaMemGetInfo(0)
assert free >= ARRAY_MEMORY_GiB * GiB_to_B, "Not enough free GPU memory for the test array"

cudart().cudaProfilerStart()
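
# Baseline: a regular pageable CPU tensor. Copies from pageable host memory
# are staged by the CUDA driver and generally cannot run asynchronously with
# respect to the host, so non_blocking=True buys little here.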
# Paged
print("PAGED")
with PerfBlock("Paged CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
    assert not x.is_pinned()
with PerfBlock("Paged CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Paged CPU to GPU sync"):
    synchronize()
with PerfBlock("Paged CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Paged CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
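
# Tensor.pin_memory() returns a copy of the tensor in a newly allocated
# page-locked buffer, so it pays extra host RAM plus a host-side copy before
# the (now asynchronous) transfer can start.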
print("PINNED Out-of-place") | |
with PerfBlock("Pinned oop CPU create"): | |
x = torch.ones(n, device="cpu", dtype=DTYPE) | |
assert not x.is_pinned() | |
with PerfBlock("Pinned oop CPU pin"): | |
x = x.pin_memory() | |
assert x.is_pinned() | |
with PerfBlock("Pinned oop CPU to GPU call"): | |
x = x.to("cuda", non_blocking=True) | |
with PerfBlock("Pinned oop CPU to GPU sync"): | |
synchronize() | |
with PerfBlock("Pinned oop CPU to GPU math"): | |
x.mul_(2) | |
y = x.mean() | |
synchronize() | |
with PerfBlock("Pinned oop CPU to GPU del"): | |
del x | |
del y | |
empty_cache() | |
synchronize() | |
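
# In-place alternative: cudaHostRegister page-locks the tensor's existing
# allocation without copying it. Registration is not free, and it should be
# undone with cudaHostUnregister before the host memory is freed.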
# Pinned
print("PINNED")
with PerfBlock("Pinned CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
with PerfBlock("Pinned CPU pin"):
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
    ret = cudart().cudaHostRegister(x.data_ptr(), x.nbytes, 0)
    assert ret == cudart().cudaError.success
    assert x.is_pinned()
    xcpu = x
with PerfBlock("Pinned CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Pinned CPU to GPU sync"):
    synchronize()
with PerfBlock("Pinned CPU to GPU unpin"):
    ret = cudart().cudaHostUnregister(xcpu.data_ptr())
    del xcpu
    assert ret == cudart().cudaError.success
with PerfBlock("Pinned CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Pinned CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
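
# Same in-place registration, but on storage that has first been moved to
# shared memory with share_memory_() (as used by torch.multiprocessing), to
# show that cudaHostRegister also works on a shared-memory-backed allocation.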
# Shared
print("SHARED")
with PerfBlock("Shared CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
with PerfBlock("Shared CPU share"):
    x.share_memory_()
    assert x.is_shared()
with PerfBlock("Shared CPU pin"):
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
    ret = cudart().cudaHostRegister(x.data_ptr(), x.nbytes, 0)
    assert ret == cudart().cudaError.success
    assert x.is_pinned()
    xcpu = x
with PerfBlock("Shared CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Shared CPU to GPU sync"):
    synchronize()
with PerfBlock("Shared CPU to GPU unpin"):
    ret = cudart().cudaHostUnregister(xcpu.data_ptr())
    del xcpu
    assert ret == cudart().cudaError.success
with PerfBlock("Shared CPU to GPU math"): | |
x.mul_(2) | |
y = x.mean() | |
synchronize() | |
with PerfBlock("Shared CPU to GPU del"): | |
del x | |
del y | |
empty_cache() | |
synchronize() | |
cudart().cudaProfilerStop() | |
# nsys profile --trace cuda,nvtx --capture-range cudaProfilerApi --output inplace_pin --force-overwrite true python inplace_pin.py |
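# --capture-range cudaProfilerApi restricts profiling to the region between
# cudaProfilerStart() and cudaProfilerStop() above, and --trace cuda,nvtx
# records both the CUDA API/memcpy activity and the NVTX ranges emitted by
# PerfBlock.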