Last active
October 8, 2025 16:24
-
-
Save lucaspar/def9198eabc110345c746806e0b90f38 to your computer and use it in GitHub Desktop.
Install NVIDIA drivers and container toolkit on RHEL 9.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # This script installs NVIDIA drivers and container toolkit on RHEL 9. | |
| # It should be safe to run it multiple times: steps are skipped when not needed. | |
| # This is an interactive script: it asks the user for confirmation before rebooting the system or restarting services. | |
| # Source: | |
| # https://gist.github.com/lucaspar/def9198eabc110345c746806e0b90f38/ | |
# Strict mode: abort on failing commands, unset variables and failing pipeline stages.
set -euo pipefail

# File name of this script; it tags every log line.
# Parameter expansion replaces `basename "$0"` so that a "$0" starting with a dash
# cannot be mistaken for an option, and no extra process is spawned.
readonly script_name="${0##*/}"
readonly prefix="${script_name} |"

# Bash maintains HOSTNAME itself; the external command is only a fallback.
hostname="${HOSTNAME:-$(hostname)}"
readonly hostname

# Distribution tag used when building the NVIDIA repository URL.
readonly distro="rhel9"
# Print an informational message on stdout.
# The line starts with the script prefix and the current time (HH:MM:SS) in blue.
# Globals:   prefix (read)
# Arguments: the message; all words are joined and backslash escapes are expanded
function say {
    # %b expands backslash escapes in its argument the same way `echo -e` does.
    printf '%b\n' "\e[34m${prefix} $(date +'%H:%M:%S')\e[0m: $*"
}
# Print an error message on stderr.
# The line starts with the script prefix and the current time (HH:MM:SS) in red.
# Globals:   prefix (read)
# Arguments: the message; all words are joined and backslash escapes are expanded
function say_error {
    # %b expands backslash escapes in its argument the same way `echo -e` does.
    printf '%b\n' "\e[31m${prefix} $(date +'%H:%M:%S')\e[0m: $*" >&2
}
# Abort unless the host runs Red Hat Enterprise Linux 9.
# Arguments: $1 - os-release file to inspect (optional, defaults to /etc/os-release).
#                 The file is sourced, so only pass a trusted file.
# Outputs:   an error message through say_error when a check fails
# Exits:     with status 1 when the distribution or the major version does not match
function pre-checks {
    local os_release="${1:-/etc/os-release}"
    local version_id

    if ! grep -qF -- "Red Hat Enterprise Linux" "${os_release}"; then
        say_error "This script is only meant to run on Red Hat Enterprise Linux"
        exit 1
    fi

    # os-release is a list of shell-compatible assignments. Sourcing it in a subshell
    # yields VERSION_ID whether or not the value is quoted (splitting on '"' returned
    # the whole line for an unquoted value) without touching this shell's variables.
    version_id="$(. "${os_release}" 2>/dev/null && printf '%s' "${VERSION_ID:-}")" || version_id=""

    # Accept "9" or "9.<minor>" only; a bare "^9" prefix would also accept e.g. "90".
    if [[ ! "${version_id}" =~ ^9(\.|$) ]]; then
        say_error "This script is only meant to run on RHEL 9"
        exit 1
    fi
}
# Install the EPEL repository (and enable the CodeReady Builder repository it
# depends on) unless the epel-release package is already installed.
# Outputs: progress messages through say; dnf / subscription-manager output
function enable-epel-rhel9 {
    local arch

    if rpm -q epel-release &>/dev/null; then
        say "EPEL is already installed; skipping..."
        return
    fi

    # `uname -m` is what `arch` prints; `uname -i` is not reliable on every system.
    arch="$(uname -m)"

    # Refresh the metadata as root: an unprivileged `dnf makecache` only fills a
    # per-user cache that the `sudo dnf` calls below never read.
    sudo dnf makecache
    sudo subscription-manager repos --enable "codeready-builder-for-rhel-9-${arch}-rpms"
    sudo dnf install -y "https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm"
}
| function enable-open-kernel-modules { | |
| # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#dnf-module-enablement | |
| if sudo dnf module list -q nvidia-driver:open-dkms | grep -q "\[d\]"; then | |
| say "Open kernel modules are already enabled; skipping..." | |
| return | |
| fi | |
| say "Enabling open kernel modules..." | |
| sudo dnf module enable -y nvidia-driver:open-dkms | |
| } | |
# Succeed only when an NVIDIA device shows up in lspci, nvidia-smi runs
# successfully, and an nvidia kernel module is listed by lsmod.
# Returns: 0 when all three checks pass, otherwise the status of the first failing check
function check-nvidia-installed {
    # a bare `return` hands back the status of the check that just failed
    lspci | grep -i nvidia &>/dev/null || return
    nvidia-smi &>/dev/null || return
    lsmod | grep -i nvidia &>/dev/null
}
# Succeed only when both nvcc and nvidia-smi can be resolved as commands.
# Returns: 0 when both are found, non-zero as soon as one is missing
function check-cuda-installed {
    local tool
    for tool in nvcc nvidia-smi; do
        command -v "${tool}" &>/dev/null || return
    done
}
# Succeed when `nvidia-container-toolkit --version` runs successfully.
# All output of the probe is discarded.
# Returns: the exit status of the probe
function check-container-toolkit-installed {
    nvidia-container-toolkit --version >/dev/null 2>&1
}
# Install the packages needed to build the driver for the running kernel,
# then make sure EPEL is available.
# Outputs: a progress message through say; dnf output
function preparation {
    # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#preparation
    # kernel-devel and kernel-headers are pinned to the release of the running kernel
    local -a packages=(
        "kernel-devel-$(uname -r)"
        "kernel-headers-$(uname -r)"
        gcc
        make
        dkms
        acpid
        libglvnd-glx
        libglvnd-opengl
        libglvnd-devel
        pkgconfig
    )

    say "Installing prerequisite packages..."
    sudo dnf install -y "${packages[@]}"
    enable-epel-rhel9
}
# Register NVIDIA's CUDA network repository with dnf.
# Globals: distro (read) - distribution tag used in the repository path
# Outputs: a progress message through say; dnf output
function enable-repos {
    # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#network-repository-installation
    local arch

    arch="$(uname -m)"
    # NVIDIA publishes the 64-bit Arm server repository under "sbsa", not "aarch64".
    if [[ "${arch}" == "aarch64" ]]; then
        arch="sbsa"
    fi

    say "Enabling NVIDIA repository..."
    # Fetch the repo definition over HTTPS: it names the package source and signing
    # key, so it must not be retrievable or modifiable in plaintext.
    sudo dnf config-manager --add-repo "https://developer.download.nvidia.com/compute/cuda/repos/${distro}/${arch}/cuda-${distro}.repo"
}
# Install the NVIDIA driver from the nvidia-driver dnf module.
# Outputs: a progress message through say; dnf output
function install-driver {
    # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#desktop-only-system
    # module stream to install; a specific version stream, e.g. "570", may be used instead
    local stream="open-dkms"

    say "Installing NVIDIA drivers..."
    sudo dnf module install -y "nvidia-driver:${stream}"
}
# Install the NVIDIA driver when it is not yet working, then offer to reboot.
# Does nothing (besides a message) when check-nvidia-installed succeeds.
# Inputs:  one line from stdin answering the reboot prompt; only "y" or "Y" reboots
# Outputs: progress messages through say; the list of logged-in users; the prompt
function install-nvidia-driver {
    local answer=""

    if check-nvidia-installed; then
        say "NVIDIA drivers are already installed; skipping..."
        return
    fi

    # these steps roughly translate to the sections in the official documentation for RHEL 9:
    # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/index.html#red-hat-enterprise-linux
    preparation
    enable-repos
    enable-open-kernel-modules
    install-driver

    say "A reboot is required to load the NVIDIA drivers."
    say "These are the users connected to the system:"
    who
    echo -ne "\n\t\e[31mDo you want to reboot now? [y/N]\e[0m "
    # `read` returns non-zero at end-of-file (stdin closed or not a terminal). Under
    # `set -e` that would abort the script here; an empty answer simply means "no".
    read -r answer || true
    echo -e "\n"

    if [[ "${answer}" =~ ^[Yy]$ ]]; then
        say "Run this script again after the reboot to continue the installation."
        sleep 2
        say "Rebooting in 10 seconds... Ctrl+C to cancel."
        sleep 10
        sudo reboot
    fi
}
# Add NVIDIA's container toolkit repository and install the toolkit,
# unless check-container-toolkit-installed reports it is already present.
# Outputs: progress messages through say; dnf output
function install-container-toolkit {
    local repo_url="https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo"

    if ! check-container-toolkit-installed; then
        say "Installing container toolkit..."
        sudo dnf config-manager --add-repo "${repo_url}"
        sudo dnf install -y nvidia-container-toolkit
        return
    fi

    say "Container toolkit is already installed; skipping..."
}
# Tell whether the Docker daemon configuration mentions "nvidia".
# Arguments: $1 - daemon configuration file (optional, defaults to /etc/docker/daemon.json)
# Returns:   0 when the file is readable and contains "nvidia", non-zero otherwise
function check_if_docker_configured {
    local docker_daemon_json="${1:-/etc/docker/daemon.json}"

    # A missing file just means Docker has not been configured yet; answer "no"
    # quietly instead of letting grep print a "No such file" error.
    [[ -r "${docker_daemon_json}" ]] || return 1

    grep -q -- "nvidia" "${docker_daemon_json}"
}
# Configure Docker to use the NVIDIA runtime, then offer to restart Docker.
# Does nothing (besides a message) when check_if_docker_configured succeeds.
# Inputs:  one line from stdin answering the restart prompt; only "y" or "Y" restarts
# Outputs: progress messages through say; the container list; the prompt
function configure-container-toolkit {
    local answer=""

    if check_if_docker_configured; then
        say "Docker is already configured to use NVIDIA runtime; skipping..."
        return
    fi

    say "Configuring container toolkit..."
    sudo nvidia-ctk runtime configure --runtime=docker

    say "These are the active containers:"
    # The listing only helps the user decide about the restart. If it fails (daemon
    # stopped, no permission on the socket, ...) report it and keep going instead of
    # letting `set -e` / `pipefail` abort before the prompt.
    docker ps --all --format "table {{.Names}},{{.Image}},{{.Status}}" | column -t -s , ||
        say_error "Could not list the containers; continuing..."

    echo -ne "\n\t\e[31mDo you want to restart docker now? [y/N]\e[0m "
    # `read` returns non-zero at end-of-file; an empty answer simply means "no".
    read -r answer || true
    echo -e "\n"

    if [[ "${answer}" =~ ^[Yy]$ ]]; then
        sudo systemctl restart docker
    fi
}
# Print a summary of the GPU driver, the container toolkit and Docker's NVIDIA
# runtime, followed by a command for testing GPU access from a container.
# Outputs: messages through say / say_error; output of nvidia-smi,
#          nvidia-container-toolkit and docker
function show-nvidia-info {
    say "NVIDIA driver info:"
    # nvidia-smi fails while the driver is not loaded, e.g. right after an install
    # whose reboot was declined. This is a report only, so explain and continue
    # rather than letting `set -e` / `pipefail` end the script here.
    nvidia-smi --query-gpu=gpu_name,driver_version,temperature.gpu,power.draw,memory.used,memory.total --format=csv | column -t -s , ||
        say_error "Could not query the NVIDIA driver. If it was just installed, reboot and run this script again."

    say "Container toolkit info:"
    nvidia-container-toolkit --version

    say "Docker runtime info:"
    docker info | grep --color=always -C 2 nvidia || say "No NVIDIA runtime detected in Docker. Maybe try restarting the service?"

    say "To try the GPU access from a Docker container, run:"
    say "\t\e[34m docker run --gpus all --rm nvidia/cuda:13.0.1-base-ubuntu24.04 nvidia-smi\n\e[0m"
}
# Remove the nvidia-driver package and reset its dnf module.
# Not called anywhere in the visible part of this file.
function _uninstall-drivers {
    local module="nvidia-driver"

    sudo dnf remove -y "${module}"
    sudo dnf module reset -y "${module}"
}
# Entry point: verify the platform, obtain sudo credentials up front, then run
# every installation step in order and print the final report.
# Globals: script_name, hostname (read)
function main {
    local step

    say "Starting ${script_name} on \e[32m${hostname}\e[0m..."
    pre-checks
    # ask for the password once, before any step needs it
    sudo -v -p "[sudo] I need root access to install packages: "

    for step in \
        install-nvidia-driver \
        install-container-toolkit \
        configure-container-toolkit \
        show-nvidia-info; do
        "${step}"
    done

    say "Finished ${script_name} on ${hostname}!"
}
| main "$@" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment