tmux new [-s name] [cmd]   (:new)    - new session
tmux ls                    (:ls)     - list sessions
tmux switch [-t name]      (:switch) - switch to an existing session
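For example, to create a detached session named train, list sessions, and switch to it from inside another session (the name "train" is just illustrative):

tmux new -s train -d
tmux ls
tmux switch -t train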
import gc
import torch

## MEM utils ##

def mem_report():
    '''Report the memory usage of tensor storage in PyTorch.
    Both CPU and GPU tensors are reported.'''
    def _mem_report(tensors, mem_type):
        # numel * element_size approximates the underlying storage size
        total = sum(t.numel() * t.element_size() for t in tensors)
        print('%s: %d tensors, %.2f MB' % (mem_type, len(tensors), total / 1024 ** 2))
    # every live tensor the garbage collector can see
    tensors = [obj for obj in gc.get_objects() if torch.is_tensor(obj)]
    _mem_report([t for t in tensors if t.is_cuda], 'GPU')
    _mem_report([t for t in tensors if not t.is_cuda], 'CPU')
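# A quick usage sketch (hypothetical; just gives the report something to count):
if __name__ == '__main__':
    x = torch.randn(1024, 1024)                        # CPU tensor
    if torch.cuda.is_available():
        y = torch.randn(256, 256, device='cuda')      # GPU tensor, if available
    mem_report()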
from graphviz import Digraph
import torch
import torchvision.models as models

def make_dot(var):
    '''Return a graphviz Digraph of the autograd graph that produced var.'''
    dot, seen = Digraph(node_attr=dict(shape='box')), set()
    def add(fn):  # walk the grad_fn graph, one node per backward function
        if fn is None or fn in seen:
            return
        seen.add(fn)
        dot.node(str(id(fn)), type(fn).__name__)
        for nxt, _ in fn.next_functions:
            if nxt is not None:
                dot.edge(str(id(nxt)), str(id(fn)))
                add(nxt)
    add(var.grad_fn)
    return dot
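# A usage sketch in the spirit of the imports above (the model choice is illustrative):
model = models.resnet18()
x = torch.randn(1, 3, 224, 224)
make_dot(model(x)).render('resnet18_graph', format='pdf')  # writes resnet18_graph.pdf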
#!/usr/bin/env python

"""
Demo of how to pass GPU memory managed by pycuda to mpi4py.

Notes
-----
This code can be used to perform peer-to-peer communication of data via
NVIDIA's GPUDirect technology if mpi4py has been built against a
CUDA-enabled MPI implementation.
"""
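# A minimal sketch of the pattern described above -- not the original demo's
# code. It assumes mpi4py was built against a CUDA-aware MPI and that this
# PyCUDA version exposes __cuda_array_interface__ on GPUArray, so device
# buffers can be handed directly to Send/Recv. Run with: mpiexec -n 2 python demo.py
from mpi4py import MPI
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
import pycuda.gpuarray as gpuarray

comm = MPI.COMM_WORLD
if comm.Get_rank() == 0:
    x_gpu = gpuarray.to_gpu(np.arange(8, dtype=np.float64))
    comm.Send(x_gpu, dest=1, tag=0)   # device buffer handed directly to MPI
else:
    x_gpu = gpuarray.empty(8, np.float64)
    comm.Recv(x_gpu, source=0, tag=0)
    print('received on GPU:', x_gpu.get())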
"""A simple script to test the biLSTM type that pytorch uses. | |
The gradients are computed only w.r.t the output of one single direction, | |
so gradient of the reverse direction in layer 1 should be zero if type1. | |
In my tests, it's type2 | |
""" | |
import torch | |
from torch import nn |
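# A sketch of the experiment described in the docstring (sizes are arbitrary):
# backprop from only the forward-direction slice of the top layer's output,
# then inspect a reverse-direction weight gradient in the first layer.
lstm = nn.LSTM(input_size=4, hidden_size=5, num_layers=2, bidirectional=True)
out, _ = lstm(torch.randn(3, 1, 4))   # out: (seq_len, batch, 2 * hidden_size)
out[..., :5].sum().backward()         # gradient w.r.t. the forward half only
# Nonzero here means layer 2 consumed both directions of layer 1 (type 2):
print(lstm.weight_ih_l0_reverse.grad.abs().sum())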
Latency Comparison Numbers
--------------------------
L1 cache reference/hit                        1.5 ns          4 cycles
Floating-point add/mult/FMA operation         1.5 ns          4 cycles
L2 cache reference/hit                          5 ns    12 ~ 17 cycles
Branch mispredict                               6 ns    15 ~ 20 cycles
L3 cache hit (unshared cache line)             16 ns         42 cycles
L3 cache hit (shared line in another core)     25 ns         65 cycles
Mutex lock/unlock                              25 ns
L3 cache hit (modified in another core)        29 ns         75 cycles
# This isn't supposed to run as a bash script; I named it ".sh" for syntax highlighting.
# https://developer.nvidia.com/nsight-systems
# https://docs.nvidia.com/nsight-systems/profiling/index.html
# My preferred nsys (the command-line executable used to create profiles) commands
#
# In your script, write
#   torch.cuda.nvtx.range_push("region name")
#   ...
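#
# A sketch of the push/pop pairing and a basic profiling invocation
# (the region name, output name, and script name are illustrative):
#
#   torch.cuda.nvtx.range_push("forward")   # opens an NVTX range visible in nsys
#   out = model(x)
#   torch.cuda.nvtx.range_pop()             # closes the most recently opened range
#
# then profile with:
#   nsys profile -t cuda,nvtx -o my_report python train.py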