# This isn't supposed to run as a bash script; I named it with ".sh" for syntax highlighting.
# https://developer.nvidia.com/nsight-systems
# https://docs.nvidia.com/nsight-systems/profiling/index.html
# My preferred nsys (command line executable used to create profiles) commands
#
# In your script, write
# torch.cuda.nvtx.range_push("region name")
# ...
# torch.cuda.nvtx.range_pop()
# around suspected hotspot regions for easy identification on the timeline.
#
# Dummy/warmup iterations prior to the region you want to profile are highly
# recommended to get caching allocator/cuda context initialization out of the way
# (a commented sketch of this pattern follows).
# Copy-paste the desired command and run it for your app. It will produce a .qdrep file.
# Run the "nsight-sys" GUI executable and File->Open the .qdrep file.
# If you're creating the profile locally on your desktop, you may not need nsys at all: you can do
# the whole workflow (create and view the profile) through the GUI. But if your job runs remotely on
# a cluster node, I prefer to create .qdrep profiles with nsys remotely, copy them back to my desktop,
# then open them in nsight-sys (see the example just below).
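#
# For example (the hostname and path are hypothetical placeholders):
#   # on the cluster node:
#   nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s none -o nsight_report -f true -x true python script.py args...
#   # back on your desktop:
#   scp user@cluster:/path/to/nsight_report.qdrep .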
# Typical use (collects the GPU timeline, plus Cuda and OS calls on the CPU timeline, but no CPU stack traces):
nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s none -o nsight_report -f true -x true python script.py args...
# Adds CPU backtraces that will show when you mouse over a long call or small orange tick (sample) on the CPU timeline:
nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s cpu -o nsight_report -f true --cudabacktrace=true --cudabacktrace-threshold=10000 --osrt-threshold=10000 -x true python script.py args...
# Focused profiling: profiles only a target region.
# (Your app must call torch.cuda.cudart().cudaProfilerStart()/Stop() at the start/end of the target
# region; a commented sketch follows the command.)
nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s cpu -o nsight_report -f true --capture-range=cudaProfilerApi --stop-on-range-end=true --cudabacktrace=true --cudabacktrace-threshold=10000 --osrt-threshold=10000 -x true python script.py args...
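#
# A minimal sketch of the start/stop calls (the iteration bounds are hypothetical;
# pick whatever delimits your target region):
#
#   for i, batch in enumerate(loader):
#       if i == 10:
#           torch.cuda.cudart().cudaProfilerStart()
#       train_step(batch)
#       if i == 20:
#           torch.cuda.cudart().cudaProfilerStop()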
# If your app creates child processes, nsys WILL profile those as well. They will show up as separate
# processes with separate timelines when you open the profile in nsight-sys.
# Breakdown of options:
nsys profile
-w true                          # Don't suppress the app's console output.
-t cuda,nvtx,osrt,cudnn,cublas   # Instrument, and show timeline bubbles for, cuda api calls, nvtx ranges,
                                 # os runtime functions, cudnn library calls, and cublas library calls.
                                 # These options do not require -s cpu, nor do they silently enable -s cpu.
-s cpu                           # Sample the cpu stack periodically. Stack samples show up as little tickmarks on the cpu timeline.
                                 # Last time I checked they were orange, but still easy to miss.
                                 # Mouse over them to show the backtrace at that point.
                                 # -s cpu can increase cpu overhead substantially (I've seen 2X or more), so be aware of that distortion.
                                 # -s none disables cpu sampling. Without cpu sampling, the profiling overhead is reduced.
                                 # Use -s none if you want the timeline to better represent a production job (api calls and kernels will
                                 # still appear on the profile, but profiling them doesn't distort the timeline nearly as much).
-o nsight_report                 # Output file name.
-f true                          # Overwrite any existing output file.
--capture-range=cudaProfilerApi  # Only start profiling when the app calls cudaProfilerStart...
--stop-on-range-end=true         # ...and end profiling when the app calls cudaProfilerStop.
--cudabacktrace=true             # Collect a cpu stack sample for cuda api calls whose runtime exceeds some threshold.
                                 # When you mouse over a long-running api call on the timeline, a backtrace will
                                 # appear, and you can identify which of your functions invoked it.
                                 # I really like this feature.
                                 # Requires -s cpu.
--cudabacktrace-threshold=10000  # Threshold (in nanoseconds) that determines how long a cuda api call
                                 # must run to trigger a backtrace. 10 microseconds is a reasonable value
                                 # (most kernel launches should take less than 10 microseconds), but you
                                 # should retune it if you see a particular api call you'd like to investigate.
                                 # Requires --cudabacktrace=true and -s cpu.
--osrt-threshold=10000           # Threshold (in nanoseconds) that determines how long an os runtime call (e.g. sleep)
                                 # must run to trigger a backtrace.
                                 # Backtrace collection for os runtime calls that exceed this threshold should
                                 # occur by default if -s cpu is enabled.
-x true                          # Quit the profiler when the app exits.
python script.py args...
Hello. Thanks for sharing. I am currently training GPT on a single node with 4 GPUs. Here is my command:
nsys profile -t cuda,nvtx,cudnn,cublas -c nvtx -o nsys_GPT2_trace_complete --force-overwrite true torchrun --nnodes=1 --nproc-per-node=4 train.py config/train_gpt2.py
However, after the training finishes, no .nsys-rep file is generated (even though the output indicates “generated:”). Does anyone know why this might be happening?
My original goal is to profile the computation and communication operations in the training process, either on a single GPU or across all GPUs.
Thanks in advance

This is a great condensed summary of the nsys profile docs! I would also note the option:
--pytorch=autograd-shapes-nvtx
which automatically marks PyTorch operations in the code with NVTX ranges :)
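For example (a sketch assuming an nsys version recent enough to support the --pytorch switch), added to the typical command from the gist:
nsys profile -w true -t cuda,nvtx,osrt,cudnn,cublas -s none -o nsight_report -f true -x true --pytorch=autograd-shapes-nvtx python script.py args...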
Sharing my simpler versions