sudo apt -y install unzip linux-headers-$(uname -r)
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt -y update && sudo apt -y install cuda
sudo reboot
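- After the reboot, the cuda meta-package should have pulled in and loaded the NVIDIA driver; a quick sanity check
nvidia-smi
lsmod | grep -i nvidia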
- Update and source ~/.zshrc (the /usr/local/cuda symlink points at the installed version, e.g. cuda-12.3)
export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export CUDA_HOME=/usr/local/cuda
nvcc --version
git clone https://github.com/nvidia/cuda-samples ~/cuda-samples
cd ~/cuda-samples && make
cd ~/cuda-samples/Samples/1_Utilities/deviceQuery && make && ./deviceQuery
cd ~/cuda-samples/Samples/1_Utilities/bandwidthTest && make && ./bandwidthTest
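- Both samples end with "Result = PASS" on success; to check non-interactively (paths assume the default build locations above)
~/cuda-samples/Samples/1_Utilities/deviceQuery/deviceQuery | grep "Result = PASS"
~/cuda-samples/Samples/1_Utilities/bandwidthTest/bandwidthTest | grep "Result = PASS"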
sudo apt -y install cudnn-cuda-12
sudo apt -y install libcudnn9-samples
sudo cp -r /usr/src/cudnn_samples_v9 ~/cudnn_samples_v9 && sudo chown -R $USER: ~/cudnn_samples_v9
cd ~/cudnn_samples_v9/mnistCUDNN && make clean && make && ./mnistCUDNN
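- mnistCUDNN should finish with "Test passed!"; to list which cuDNN packages were actually installed
dpkg -l | grep -i cudnn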
conda env create -f bigd.yml
conda activate bigd
conda env update --file bigd.yml --prune
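- bigd.yml itself is not reproduced here; as an illustration only, a minimal environment file along these lines would satisfy the imports tested below (package names and pins are assumptions, adjust as needed)
name: bigd
channels:
  - conda-forge
dependencies:
  - python=3.11
  - pip
  - pip:
      - tensorflow[and-cuda]
      - torch
      - torchvision
      - tensorrt
      - nvidia-ml-py3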
import tensorrt as rt; print(rt.__version__); assert rt.Builder(rt.Logger())
import tensorflow as tf; print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
import torch; import torchvision; print(torch.cuda.is_available())
import nvidia_smi
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
print(f'mem: {mem_res.used / (1024**3):.3f} (GiB)') # usage in GiB
print(f'mem: {100 * (mem_res.used / mem_res.total):.3f}%') # percentage usage
conda deactivate
conda env remove -n bigd
mkdir -p ~/.local/bin
pushd ~/.local/bin
wget -O ngccli_cat_linux.zip https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip -o ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5
echo "export PATH=\"\$PATH:$(pwd)/ngc-cli\"" >> ~/.zprofile && source ~/.zprofile
export PATH=$(pwd)/ngc-cli:$PATH
popd
ngc config set
which ngc
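- The CLI should now resolve from ~/.local/bin/ngc-cli; print its version to confirm
ngc --version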
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt -y update && sudo apt -y install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
sudo groupadd docker && sudo usermod -aG docker $USER && newgrp docker
sudo setfacl -m user:$USER:rw /var/run/docker.sock
docker run hello-world
sudo systemctl enable docker.service
sudo systemctl enable containerd.service
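- Confirm both services are enabled and the daemon is reachable without sudo
systemctl is-enabled docker.service containerd.service
docker info > /dev/null && echo "docker OK"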
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt -y update && sudo apt -y install nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
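- nvidia-ctk adds the nvidia runtime entry to /etc/docker/daemon.json; inspect it before editing further
sudo cat /etc/docker/daemon.json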
- Modify /etc/docker/daemon.json
{
    "default-runtime": "nvidia",
    "default-shm-size": "2G",
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}
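- Optionally validate the JSON before restarting, since a parse error will stop the daemon from starting
sudo python3 -m json.tool /etc/docker/daemon.json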
- Restart
sudo systemctl restart docker
- Test
docker run --rm --gpus all ubuntu nvidia-smi
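- With the daemon.json above in place, docker info should also report nvidia as the default runtime
docker info | grep -i "default runtime"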
- Get the PCI bus ID of the integrated Intel GPU (vendor 8086, display class 0300); lspci prints it in hex, e.g. 00:02.0, which becomes the decimal BusID "PCI:0:2:0" below
lspci -d 8086::0300
- Create conf file
sudo touch /etc/X11/xorg.conf
- Add content
sudo vi /etc/X11/xorg.conf
Section "Device"
Identifier "Device0"
BusID "PCI:0:2:0"
Driver "modesetting"
EndSection
Section "Screen"
Identifier "Screen0"
Device "Device0"
EndSection
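- This hands the display to the Intel iGPU via the modesetting driver, keeping the NVIDIA GPU free for compute; after the display manager restarts, nvidia-smi should no longer list Xorg or the desktop shell among the GPU's processes
nvidia-smi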
- Create udev rules file
sudo touch /lib/udev/rules.d/80-nvidia-pm.rules
- Add content
sudo vi /lib/udev/rules.d/80-nvidia-pm.rules
# Remove NVIDIA USB xHCI Host Controller devices, if present
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x0c0330", ATTR{remove}="1"
# Remove NVIDIA USB Type-C UCSI devices, if present
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x0c8000", ATTR{remove}="1"
# Enable runtime PM for NVIDIA VGA controller devices
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x030000", TEST=="power/control", ATTR{power/control}="auto"
# Enable runtime PM for NVIDIA Audio controller devices
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x040300", TEST=="power/control", ATTR{power/control}="auto"
sudo cp /lib/udev/rules.d/40-vm-hotadd.rules /etc/udev/rules.d
sudo sed -i '/SUBSYSTEM=="memory", ACTION=="add"/d' /etc/udev/rules.d/40-vm-hotadd.rules
sudo reboot
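- After the reboot, the udev rules above should leave runtime power management enabled on the NVIDIA devices; a quick sysfs check (device paths vary per system)
for d in /sys/bus/pci/devices/*; do
  [ "$(cat $d/vendor)" = "0x10de" ] && echo "$d: $(cat $d/power/control) / $(cat $d/power/runtime_status)"
done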