# In-Place Memory Pinning in PyTorch
# See https://github.com/pytorch/pytorch/issues/32167#issuecomment-753551842
#
# Copyright 2023 Carlos Alberto da Costa Filho <[email protected]>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the “Software”), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
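#
# The script benchmarks four ways of getting a large CPU tensor onto the GPU:
#   1. Paged: a regular (pageable) CPU tensor copied as-is.
#   2. Pinned out-of-place: Tensor.pin_memory(), which copies the data into a
#      freshly allocated page-locked buffer before transferring.
#   3. Pinned in-place: cudaHostRegister on the tensor's existing storage, so
#      the memory is page-locked without an extra host-side copy.
#   4. Shared: Tensor.share_memory_() followed by cudaHostRegister, to show
#      that in-place registration also works on shared-memory-backed storage.
# Every step is timed and wrapped in an NVTX range so it can be inspected in
# an Nsight Systems trace (see the nsys command at the bottom of the file).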
import gc
from time import perf_counter_ns

import psutil
import torch
from torch.cuda import cudart, empty_cache, nvtx, synchronize

ARRAY_MEMORY_GiB = 3
GiB_to_B = 1024 * 1024 * 1024
BITS_PER_BYTE = 8
DTYPE = torch.float32
n = (ARRAY_MEMORY_GiB * GiB_to_B * BITS_PER_BYTE) // torch.finfo(DTYPE).bits
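
# Sanity check: 3 GiB of float32 (4 bytes per element) is
# 3 * 1024**3 / 4 = 805_306_368 elements.
assert n * torch.finfo(DTYPE).bits // BITS_PER_BYTE == ARRAY_MEMORY_GiB * GiB_to_B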

class PerfBlock:
    """Context manager used to instrument each step of the benchmark.

    On entry it runs the garbage collector, prints the free/total host RAM and
    GPU memory, opens an NVTX range named after the block, and starts a timer;
    on exit it closes the range and prints the block's runtime in ms.
    """

    def __init__(self, name: str) -> None:
        self.name = name

    def __enter__(self):
        gc.collect()
        ram = psutil.virtual_memory()
        free_ram, total_ram = ram.available, ram.total
        free_gpu, total_gpu = cudart().cudaMemGetInfo(0)
        print(f"Free RAM: {free_ram / GiB_to_B:.1f} GiB / {total_ram / GiB_to_B:.1f} GiB")
        print(f"Free GPU: {free_gpu / GiB_to_B:.1f} GiB / {total_gpu / GiB_to_B:.1f} GiB")
        print(self.name)
        nvtx.range_push(self.name)
        self.tic = perf_counter_ns()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.toc = perf_counter_ns()
        nvtx.range_pop()
        print(f"Runtime: {1e-6 * (self.toc - self.tic):.1f} ms")
        print()

free, total = cudart().cudaMemGetInfo(0)
assert free >= ARRAY_MEMORY_GiB * GiB_to_B, "Not enough free GPU memory for the test array"

cudart().cudaProfilerStart()
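
# Baseline: a regular pageable CPU tensor. Copies from pageable host memory
# are staged by the CUDA driver and generally cannot run asynchronously with
# respect to the host, so non_blocking=True buys little here.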
# Paged
print("PAGED")
with PerfBlock("Paged CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
    assert not x.is_pinned()
with PerfBlock("Paged CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Paged CPU to GPU sync"):
    synchronize()
with PerfBlock("Paged CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Paged CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
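
# Tensor.pin_memory() returns a copy of the tensor in a newly allocated
# page-locked buffer, so it pays extra host RAM plus a host-side copy before
# the (now asynchronous) transfer can start.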
print("PINNED Out-of-place") | |
with PerfBlock("Pinned oop CPU create"): | |
x = torch.ones(n, device="cpu", dtype=DTYPE) | |
assert not x.is_pinned() | |
with PerfBlock("Pinned oop CPU pin"): | |
x = x.pin_memory() | |
assert x.is_pinned() | |
with PerfBlock("Pinned oop CPU to GPU call"): | |
x = x.to("cuda", non_blocking=True) | |
with PerfBlock("Pinned oop CPU to GPU sync"): | |
synchronize() | |
with PerfBlock("Pinned oop CPU to GPU math"): | |
x.mul_(2) | |
y = x.mean() | |
synchronize() | |
with PerfBlock("Pinned oop CPU to GPU del"): | |
del x | |
del y | |
empty_cache() | |
synchronize() | |
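
# In-place alternative: cudaHostRegister page-locks the tensor's existing
# allocation without copying it. Registration is not free, and it should be
# undone with cudaHostUnregister before the host memory is freed.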
# Pinned
print("PINNED")
with PerfBlock("Pinned CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
with PerfBlock("Pinned CPU pin"):
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
    ret = cudart().cudaHostRegister(x.data_ptr(), x.nbytes, 0)
    assert ret == cudart().cudaError.success
    assert x.is_pinned()
    xcpu = x
with PerfBlock("Pinned CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Pinned CPU to GPU sync"):
    synchronize()
with PerfBlock("Pinned CPU to GPU unpin"):
    ret = cudart().cudaHostUnregister(xcpu.data_ptr())
    del xcpu
    assert ret == cudart().cudaError.success
with PerfBlock("Pinned CPU to GPU math"):
    x.mul_(2)
    y = x.mean()
    synchronize()
with PerfBlock("Pinned CPU to GPU del"):
    del x
    del y
    empty_cache()
    synchronize()
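
# Same in-place registration, but on storage that has first been moved to
# shared memory with share_memory_() (as used by torch.multiprocessing), to
# show that cudaHostRegister also works on a shared-memory-backed allocation.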
# Shared
print("SHARED")
with PerfBlock("Shared CPU create"):
    x = torch.ones(n, device="cpu", dtype=DTYPE)
with PerfBlock("Shared CPU share"):
    x.share_memory_()
    assert x.is_shared()
with PerfBlock("Shared CPU pin"):
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
    ret = cudart().cudaHostRegister(x.data_ptr(), x.nbytes, 0)
    assert ret == cudart().cudaError.success
    assert x.is_pinned()
    xcpu = x
with PerfBlock("Shared CPU to GPU call"):
    x = x.to("cuda", non_blocking=True)
with PerfBlock("Shared CPU to GPU sync"):
    synchronize()
with PerfBlock("Shared CPU to GPU unpin"):
    ret = cudart().cudaHostUnregister(xcpu.data_ptr())
    del xcpu
    assert ret == cudart().cudaError.success
with PerfBlock("Shared CPU to GPU math"): | |
x.mul_(2) | |
y = x.mean() | |
synchronize() | |
with PerfBlock("Shared CPU to GPU del"): | |
del x | |
del y | |
empty_cache() | |
synchronize() | |
cudart().cudaProfilerStop() | |
# nsys profile --trace cuda,nvtx --capture-range cudaProfilerApi --output inplace_pin --force-overwrite true python inplace_pin.py |
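# --capture-range cudaProfilerApi restricts profiling to the region between
# cudaProfilerStart() and cudaProfilerStop() above, and --trace cuda,nvtx
# records both the CUDA API/memcpy activity and the NVTX ranges emitted by
# PerfBlock.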