sudo apt -y update && sudo apt -y install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt -y update && sudo apt -y install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
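# Optional check: confirm the packages came from the download.docker.com repository
apt-cache policy docker-ce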
sudo usermod -aG docker $USER
newgrp docker  # applies the new group in the current shell only; log out and back in for all sessions
sudo systemctl enable docker.service
sudo systemctl enable containerd.service
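# Optional check: both services should report "enabled"
systemctl is-enabled docker containerd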
# Optional: a world-writable socket is a security trade-off; the docker group membership above normally suffices
sudo chmod 666 /var/run/docker.sock
sudo systemctl restart docker
docker run hello-world
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
# Install
sudo apt -y update && sudo apt -y install nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
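# Optional check: the nvidia runtime should now appear in the daemon's runtime list
docker info | grep -i runtimes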
# Rootless
nvidia-ctk runtime configure --runtime=docker --config=$HOME/.config/docker/daemon.json
systemctl --user restart docker
sudo nvidia-ctk config --set nvidia-container-cli.no-cgroups --in-place
# To revert the no-cgroups setting later (rootful Docker with cgroup support):
sudo sed -i 's/no-cgroups = true/no-cgroups = false/g' /etc/nvidia-container-runtime/config.toml
sudo systemctl restart docker
# Modify /etc/docker/daemon.json
{
  "default-runtime": "nvidia",
  "default-shm-size": "2G",
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime",
      "runtimeArgs": []
    }
  },
  "exec-opts": ["native.cgroupdriver=cgroupfs"]
}
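# Validate the edit and restart so the daemon picks up the new default runtime
# (python3 -m json.tool is just a quick syntax check; any JSON validator works):
python3 -m json.tool /etc/docker/daemon.json
sudo systemctl restart docker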
docker run --rm --gpus all ubuntu nvidia-smi
# Remove the legacy NVIDIA repository key, then install the new cuda-keyring package
sudo apt-key del 7fa2af80
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt -y update && sudo apt -y install cuda-toolkit
rm cuda-keyring_1.1-1_all.deb
sudo reboot
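# If nvcc isn't found after the reboot, the toolkit's bin directory may not be on PATH.
# A common fix, assuming the default /usr/local/cuda symlink and zsh (used later in this guide):
echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.zshrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.zshrc
source ~/.zshrc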
nvcc --version
git clone https://github.com/nvidia/cuda-samples ~/cuda-samples
sudo apt -y install cmake build-essential
cd ~/cuda-samples && mkdir build && cd build && cmake ..
make -j$(nproc)
cd ~/cuda-samples/build/Samples/1_Utilities/deviceQuery && ./deviceQuery
cd ~/cuda-samples/build/Samples/1_Utilities/bandwidthTest && ./bandwidthTest
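# Both samples should finish with "Result = PASS"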
cd ~ && rm -rf cuda-samples
wget -P ~/ https://gist.githubusercontent.com/Melonangie/93c1e3d3a5bac3f135f90e71a48bfb4e/raw/b556c4fe0b5e68d5a43b88221e8885cc10c7b0aa/bigd.yml
conda env create -f bigd.yml
conda activate bigd
pip list
jupyter lab
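- Run each of the following checks in a separate notebook cell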
import tensorrt as rt; print(rt.__version__); assert rt.Builder(rt.Logger())
---
import tensorflow as tf; print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
---
import torch; import torchvision; print(torch.cuda.is_available())
---
import nvidia_smi
nvidia_smi.nvmlInit()
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
print(f'mem: {mem_res.used / (1024**3):.3f} GiB') # usage in GiB
print(f'mem: {100 * (mem_res.used / mem_res.total):.3f}%') # percentage usage
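nvidia_smi.nvmlShutdown()  # clean up the NVML session opened by nvmlInit()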
- Tear down: deactivate the bigd test environment and switch to the smaller smalld one
conda deactivate
wget -P ~/ https://gist.githubusercontent.com/Melonangie/d88151b8739bcee4bc06240cfc34df9f/raw/05670c847ccae1d21df96c5e8be16cf5be195723/smalld.yml
conda env create -f smalld.yml
conda activate smalld
pip list
jupyter lab
echo 'conda activate smalld' >> ~/.zshrc
- Get the iGPU PCI ID (vendor 8086 = Intel, class 0300 = display controller)
lspci -d 8086::0300
- Create conf file
sudo touch /etc/X11/xorg.conf
- Add content
sudo vi /etc/X11/xorg.conf
Section "Device"
Identifier "Device0"
BusID "PCI:0:2:0"
Driver "modesetting"
EndSection
Section "Screen"
Identifier "Screen0"
Device "Device0"
EndSection
- Create udev rules file
sudo touch /lib/udev/rules.d/80-nvidia-pm.rules
- Add content
sudo vi /lib/udev/rules.d/80-nvidia-pm.rules
# Remove NVIDIA USB xHCI Host Controller devices, if present
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x0c0330", ATTR{remove}="1"
# Remove NVIDIA USB Type-C UCSI devices, if present
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x0c8000", ATTR{remove}="1"
# Enable runtime PM for NVIDIA VGA controller devices
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x030000", TEST=="power/control", ATTR{power/control}="auto"
# Enable runtime PM for NVIDIA Audio controller devices
ACTION=="add", SUBSYSTEM=="pci", ATTR{vendor}=="0x10de", ATTR{class}=="0x040300", TEST=="power/control", ATTR{power/control}="auto"
sudo cp /lib/udev/rules.d/40-vm-hotadd.rules /etc/udev/rules.d
sudo sed -i '/SUBSYSTEM=="memory", ACTION=="add"/d' /etc/udev/rules.d/40-vm-hotadd.rules
sudo reboot
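# After rebooting, verify runtime PM is active for the dGPU; it should read "suspended" when idle
# (the PCI address below is an example; substitute yours from lspci -d 10de:):
cat /sys/bus/pci/devices/0000:01:00.0/power/runtime_status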
Post System Configurations