Skip to content

Instantly share code, notes, and snippets.

@cfstras
Last active March 24, 2026 09:44
Show Gist options
  • Select an option

  • Save cfstras/850722b9c4a52bd87171f670d65f6763 to your computer and use it in GitHub Desktop.

Select an option

Save cfstras/850722b9c4a52bd87171f670d65f6763 to your computer and use it in GitHub Desktop.
Installing GPU drivers on a DGX B200

(Use this when the driver preinstalled on the system is outdated.)

Note: the better solution would be to use the NVIDIA gpu-operator. The two approaches are mutually exclusive -- either install the driver via the gpu-operator with driver: {enabled: true} in the Helm values, or install it manually on your nodes as shown below:

# Installs the NVIDIA CUDA apt keyring, CUDA toolkit, and GPU kernel driver on
# Ubuntu, then wires GPUs into containerd/k3s. Run the commands top to bottom.
# This worked on a VM with a RTX 50xx GPU attached (in proxmox, with PCIe Passthrough)

# Derive the Ubuntu release tag for the repo URL (e.g. "2404" from
# DISTRIB_RELEASE=24.04 in /etc/lsb-release, dot stripped) and fetch the
# matching CUDA keyring package.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$(grep DISTRIB_RELEASE /etc/lsb-release | cut -d'=' -f2 | tr -d '.')/x86_64/cuda-keyring_1.1-1_all.deb

# Register NVIDIA's apt signing key + repository, then install the CUDA
# toolkit (provides nvcc used further below).
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update
sudo apt install cuda-toolkit
#sudo apt install nvidia-gds

# Kernel driver: headless/open-kernel-module variant, driver branch 570,
# plus the matching userland utilities (nvidia-smi etc.). Reboot to load it.
sudo apt install nvidia-headless-570-open nvidia-utils-570
reboot
nvidia-smi  # should work now

### if you're on a B200, install infiniband stuff:  --- Don't install this on a non-NVlink system!
sudo apt install nvidia-fabricmanager nvlsm infiniband-diags # the latter only on 22.04 necessary?
# Ensure the ib_umad kernel module is loaded on every boot; the grep guard
# keeps the entry from being appended more than once.
if ! grep ib_umad /etc/modules-load.d/modules.conf; then echo ib_umad | sudo tee -a /etc/modules-load.d/modules.conf; fi
sudo modprobe ib_umad
sudo systemctl enable --now nvidia-fabricmanager
nvidia-smi topo -m  # View GPU interconnect topology -- should work now
### (end infiniband)

# Container runtime integration so containers can be granted GPU access.
sudo apt install -y nvidia-container-runtime nvidia-container-toolkit

#  Example cuda test file  (you can find the file below)
# Compile and run the sanity-check program; expected output is shown below.
/usr/local/cuda/bin/nvcc cuda_test.cu -o cuda_test
./cuda_test
# Success! Found 1 CUDA devices.

# Activate GPUs in k3s
# NOTE(review): restarting k3s presumably makes it re-detect the nvidia
# runtime and regenerate its embedded containerd config -- verify on your setup.
sudo systemctl restart k3s
sudo grep nvidia /var/lib/rancher/k3s/agent/etc/containerd/config.toml # check, should not return empty
sudo nvidia-ctk runtime configure --runtime=containerd # (not necessary)?
# The only difference is that k3s doesn't set "privileged_without_host_devices=false" -- only relevant for privileged containers

# Optional: Test in containerd (not really needed)
sudo ctr images pull docker.io/nvidia/cuda:12.9.1-base-ubuntu24.04
sudo ctr run --rm --gpus 0 -t docker.io/nvidia/cuda:12.9.1-base-ubuntu24.04 cuda-12.9.1-base-ubuntu24.04 nvidia-smi
#include <iostream>
#include <cuda_runtime.h>
int main() {
int deviceCount = 0;
// This is the core function that attempts to initialize the driver
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) {
// If the call fails, print the error message
std::cerr << "CUDA error: " << cudaGetErrorString(error_id) << std::endl;
std::cerr << "CUDA driver initialization failed!" << std::endl;
return 1; // Return an error code
}
// If the call succeeds, print the number of devices found
if (deviceCount == 0) {
std::cout << "No CUDA devices found." << std::endl;
} else {
std::cout << "Success! Found " << deviceCount << " CUDA devices." << std::endl;
}
return 0; // Return success
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment