Zhen Zhang (zarzen)

@zarzen
zarzen / nsight.sh
Created July 18, 2023 01:45 — forked from mcarilli/nsight.sh
Favorite nsight systems profiling commands for Pytorch scripts
# This isn't supposed to run as a bash script; I named it with ".sh" for syntax highlighting.
# https://developer.nvidia.com/nsight-systems
# https://docs.nvidia.com/nsight-systems/profiling/index.html
# My preferred nsys (command line executable used to create profiles) commands
#
# In your script, write
# torch.cuda.nvtx.range_push("region name")
# ...
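Below is a minimal sketch (not part of the forked gist, whose preview is truncated above) of how those NVTX ranges are typically placed around a training step, with an illustrative nsys launch line in the trailing comment; the function name, flag choices, and file names are assumptions.
# Sketch only: illustrative placement of NVTX ranges for nsys profiling.
import torch

def train_step(model, x, target, optimizer, loss_fn):
    torch.cuda.nvtx.range_push("forward")
    out = model(x)
    loss = loss_fn(out, target)
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_push("backward")
    loss.backward()
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_push("optimizer step")
    optimizer.step()
    optimizer.zero_grad()
    torch.cuda.nvtx.range_pop()

# Example launch (illustrative): trace CUDA and NVTX activity and write a report file:
#   nsys profile -t cuda,nvtx -o report python train.py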
@zarzen
zarzen / tmux.config
Last active November 12, 2024 19:58
tmux config
# set default-terminal colors to display default terminal colors
set -g default-terminal "xterm-256color"
# set-option -ga terminal-overrides ',xterm-256color:Tc'
# Undercurl
set-option -g default-terminal "tmux-256color"
@zarzen
zarzen / init.lua
Created June 30, 2023 05:56
nvim config
-- Install packer
local install_path = vim.fn.stdpath 'data' .. '/site/pack/packer/start/packer.nvim'
local is_bootstrap = false
if vim.fn.empty(vim.fn.glob(install_path)) > 0 then
is_bootstrap = true
vim.fn.system { 'git', 'clone', '--depth', '1', 'https://github.com/wbthomason/packer.nvim', install_path }
vim.cmd [[packadd packer.nvim]]
end
require('packer').startup(function(use)
@zarzen
zarzen / bench_all_gather_coalescing.py
Last active October 21, 2022 04:20
all-gather with and without coalescing manager
"""
call to _all_gather_base with c10d._coalescing_manager
Test command:
mpirun -np $1 -N ${ndev_per_node} --hostfile ${HOST_FILE} \
--mca plm_rsh_no_tree_spawn 1 \
-mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 \
--mca pml ^cm \
-bind-to none \
--tag-output \
-x LD_LIBRARY_PATH=$LD_LIBRARY_PATH \
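Since the preview above cuts off at the launch command, here is a hedged sketch of the kind of benchmark the description refers to: repeated _all_gather_base calls timed with CUDA events. Tensor sizes, iteration counts, and the NCCL process-group setup are assumptions, and the coalesced variant is only noted in a comment because _coalescing_manager's exact signature differs across PyTorch releases.
# Sketch only: time uncoalesced _all_gather_base calls with CUDA events.
import torch
import torch.distributed as dist

def bench_all_gather(numel=1024 * 1024, n_tensors=8, iters=20):
    rank = dist.get_rank()
    world = dist.get_world_size()
    device = torch.device("cuda", rank % torch.cuda.device_count())
    torch.cuda.set_device(device)
    inputs = [torch.randn(numel, device=device) for _ in range(n_tensors)]
    outputs = [torch.empty(numel * world, device=device) for _ in range(n_tensors)]

    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        # Without coalescing: one collective per tensor.
        # (The gist's coalesced variant wraps this inner loop in
        #  torch.distributed.distributed_c10d._coalescing_manager, whose exact
        #  signature varies across PyTorch releases.)
        for inp, out in zip(inputs, outputs):
            dist._all_gather_base(out, inp)
    end.record()
    torch.cuda.synchronize()
    if rank == 0:
        print(f"avg per iter: {start.elapsed_time(end) / iters:.3f} ms")

if __name__ == "__main__":
    dist.init_process_group("nccl")  # assumes MASTER_ADDR/MASTER_PORT etc. are set by the launcher
    bench_all_gather()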
diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index e934b69c..207faa39 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -8,7 +8,7 @@
#include "ATen/cuda/CUDAContext.h"
#include "ATen/cuda/detail/IndexUtils.cuh"
//#include "ATen/Type.h"
-#include <THC/THCGeneral.h>
+// #include <THC/THCGeneral.h>
@zarzen
zarzen / ds-pt1.11.patch
Last active November 16, 2021 22:54
Ds-pt1.11.patch
diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu
index 0448a45..ff87993 100644
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@@ -464,7 +464,7 @@ void fused_lamb_cuda(at::Tensor& p,
lamb_coeff.data<scalar_t>());
}));
}
- THCudaCheck(cudaGetLastError());
+ AT_CUDA_CHECK(cudaGetLastError());
@zarzen
zarzen / README.md
Last active November 8, 2021 19:25
deepspeed_loss_test

Usage

python3 test_diff_stages.py
@zarzen
zarzen / checkpointing_true_bug.log
Last active July 27, 2021 19:58
DeepSpeed Debug logs
21: M9 P[5, 6] avail 3.1e+08, max_avail 5.0e+07, queue_sz 5.8e+02, n_inflight 5.1e+03, inflight [9]
-gather param for module 3: {'id': 0, 'status': 'AVAILABLE', 'numel': 78151680, 'persist': False, 'active_sub_modules': {3}}
[2021-07-07 21:16:52,635] [INFO] [stage3.py:42:print_rank_0] wait_for_fetch current submodule id 9
[2021-07-07 21:16:52,635] [INFO] [stage3.py:42:print_rank_0] module id 9 handle is None
22: M23 P[] avail 3.1e+08, max_avail 5.0e+07, queue_sz 5.8e+02, n_inflight 7.8e+07, inflight [0, 23, 2, 1, 3]
[2021-07-07 21:16:52,636] [INFO] [stage3.py:42:print_rank_0] wait_for_fetch current submodule id 23
[2021-07-07 21:16:52,636] [INFO] [stage3.py:42:print_rank_0] module id 23 handle is None
-gather param for module 24: {'id': 151, 'status': 'NOT_AVAILABLE', 'numel': 6553600, 'persist': False, 'active_sub_modules': {24}}
-gather param for module 24: {'id': 152, 'status': 'AVAILABLE', 'numel': 2560, 'persist': True, 'active_sub_modules': {24}}
[2021-07-07 21:16:52,636] [INFO] [utils.py:629:info_rank_
@zarzen
zarzen / model_config.json
Created June 19, 2021 00:59
bert 5.1B model config
{
"train_batch_size": 512,
"train_micro_batch_size_per_gpu": 8,
"steps_per_print": 100,
"prescale_gradients": false,
"bert_token_file": "bert-large-uncased",
"bert_model_config": {
"vocab_size_or_config_json_file": 32003,
"hidden_size": 2560,
"num_hidden_layers": 64,
@zarzen
zarzen / strip_latex.py
Created May 7, 2021 16:36
strip latex code for grammarly check
import re
import argparse
def get_args():
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--file')
args = arg_parser.parse_args()
return args
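The preview ends after argument parsing; a hedged sketch of the stripping step the title describes (reusing the re import and get_args shown above) might look like the following, with illustrative regexes rather than the gist's actual patterns.
# Sketch only: strip LaTeX markup so the remaining prose can be pasted into Grammarly.
def strip_latex(text):
    text = re.sub(r'%.*', '', text)                              # comments
    text = re.sub(r'\$[^$]*\$', 'MATH', text)                    # inline math
    text = re.sub(r'\\(cite|ref|label)\{[^}]*\}', '', text)      # refs and labels
    text = re.sub(r'\\[a-zA-Z]+\*?(\[[^\]]*\])?\{?', '', text)   # other commands
    return text

def main():
    args = get_args()
    with open(args.file) as f:
        print(strip_latex(f.read()))

if __name__ == '__main__':
    main()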