tmux new [-s name] [cmd](:new) - new session
tmux ls(:ls) - list sessions
tmux switch [-t name](:switch) - switches to an existing session
| # This isn't supposed to run as a bash script; I named it with ".sh" for syntax highlighting. | |
| # https://developer.nvidia.com/nsight-systems | |
| # https://docs.nvidia.com/nsight-systems/profiling/index.html | |
| # My preferred nsys (command line executable used to create profiles) commands | |
| # | |
| # In your script, write | |
| # torch.cuda.nvtx.range_push("region name") | |
| # ... |
| import pycuda.driver as cuda | |
| import pycuda.autoinit | |
| from pycuda.compiler import SourceModule | |
| import numpy as np | |
| def compute_xcorr_cpu(d): | |
| dc = d.astype('float32').view('complex64') | |
| dc = dc.transpose((0,2,3,1)).copy() | |
| xcorr_cpu = np.einsum('...i,...j', dc, np.conj(dc)).view('float32').astype('int32').sum(axis=-4) | |
| return xcorr_cpu |
def logsigsoftmax(logits):
    """
    Computes the log of sigsoftmax from the paper - https://arxiv.org/pdf/1805.10829.pdf

    sigsoftmax(z)_i = exp(z_i) * sigmoid(z_i) / sum_j(exp(z_j) * sigmoid(z_j)),
    with the normalisation taken over dimension 1 of ``logits``.

    The computation stays in log space throughout: ``log(sigmoid(z))`` is
    evaluated with ``logsigmoid`` and the normaliser with ``logsumexp``.
    Taking ``log(sigmoid(z))`` directly underflows to ``-inf`` for strongly
    negative logits and can turn a whole row into ``nan``.

    Args:
        logits: tensor with at least two dimensions; dimension 1 is the
            class dimension being normalised.

    Returns:
        Tensor of the same shape as ``logits`` holding log-probabilities.
    """
    # log(exp(z) * sigmoid(z)) == z + logsigmoid(z)
    log_unnormalized = logits + torch.nn.functional.logsigmoid(logits)
    log_normalizer = torch.logsumexp(log_unnormalized, dim=1, keepdim=True)
    return log_unnormalized - log_normalizer
| import gc | |
| import torch | |
| ## MEM utils ## | |
| def mem_report(): | |
| '''Report the memory usage of the tensor.storage in pytorch | |
| Both on CPUs and GPUs are reported''' | |
| def _mem_report(tensors, mem_type): |
| from graphviz import Digraph | |
| from torch.autograd import Variable | |
| import torch | |
| def make_dot(var, params=None): | |
| if params is not None: | |
| assert isinstance(params.values()[0], Variable) | |
| param_map = {id(v): k for k, v in params.items()} |
| Latency Comparison Numbers | |
| -------------------------- | |
| L1 cache reference/hit 1.5 ns 4 cycles | |
| Floating-point add/mult/FMA operation 1.5 ns 4 cycles | |
| L2 cache reference/hit 5 ns 12 ~ 17 cycles | |
| Branch mispredict 6 ns 15 ~ 20 cycles | |
| L3 cache hit (unshared cache line) 16 ns 42 cycles | |
| L3 cache hit (shared line in another core) 25 ns 65 cycles | |
| Mutex lock/unlock 25 ns | |
| L3 cache hit (modified in another core) 29 ns 75 cycles |