# 1. Install the NVIDIA driver matching your CUDA version
# sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
# sudo sh -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list'
# apt update && apt install cuda-drivers=<cuda-version>
# apt install nvidia-cuda-toolkit=<toolkit-version>
# NVIDIA System Management Interface
# NVML SDK: http://developer.nvidia.com/nvidia-management-library-nvml/
# Python bindings: http://pypi.python.org/pypi/nvidia-ml-py/
nvidia-smi
# nvidia-smi -a
# Mon Nov 23 17:32:25 2020
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
# |-------------------------------+----------------------+----------------------+
# | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
# | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
# |                               |                      |               MIG M. |
# |===============================+======================+======================|
# |   0  Tesla T4            Off  | 00000000:1B:00.0 Off |                    0 |
# | N/A   33C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
# |                               |                      |                  N/A |
# +-------------------------------+----------------------+----------------------+
# |   1  Tesla T4            Off  | 00000000:1C:00.0 Off |                    0 |
# | N/A   33C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
# |                               |                      |                  N/A |
# +-------------------------------+----------------------+----------------------+
# |   2  Tesla T4            Off  | 00000000:1D:00.0 Off |                    0 |
# | N/A   31C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
# |                               |                      |                  N/A |
# +-------------------------------+----------------------+----------------------+
# |   3  Tesla T4            Off  | 00000000:1E:00.0 Off |                    0 |
# | N/A   32C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
# |                               |                      |                  N/A |
# +-------------------------------+----------------------+----------------------+
#
# +-----------------------------------------------------------------------------+
# | Processes:                                                                  |
# |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
# |        ID   ID                                                   Usage      |
# |=============================================================================|
# |  No running processes found                                                 |
# +-----------------------------------------------------------------------------+
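# (Optional) a minimal scripted check, assuming the driver's standard query
# flags; prints index/name/driver/memory per GPU in CSV for provisioning scripts.
nvidia-smi --query-gpu=index,name,driver_version,memory.total --format=csv,noheader
# 0, Tesla T4, 450.80.02, 15109 MiB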
# 2.1 Install the Docker engine
sudo apt-get update
sudo apt-get install -y curl
curl https://get.docker.com | sh
docker -v
# Docker version 19.03.13, build 4484c46d9d
sudo systemctl start docker && sudo systemctl enable docker
sudo systemctl status docker
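# (Optional) sanity check that the engine can pull and run a container end to end.
sudo docker run --rm hello-world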
# 2.2 Install NVIDIA Docker
# 2.2.1 Ubuntu
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
# OK
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
# deb https://nvidia.github.io/libnvidia-container/stable/ubuntu18.04/$(ARCH) /
# deb https://nvidia.github.io/libnvidia-container/experimental/ubuntu18.04/$(ARCH) /
# deb https://nvidia.github.io/nvidia-container-runtime/stable/ubuntu18.04/$(ARCH) /
# deb https://nvidia.github.io/nvidia-container-runtime/experimental/ubuntu18.04/$(ARCH) /
# deb https://nvidia.github.io/nvidia-docker/ubuntu18.04/$(ARCH) /
sudo apt-get update
sudo apt-get install -y nvidia-docker2=2.5.0-1
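# (Optional) list the package versions available before pinning one:
apt-cache madison nvidia-docker2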
# 2.2.2 CentOS 7+
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo
sudo yum upgrade -y
sudo yum install -y nvidia-docker2
# TEST
docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
docker run --rm --gpus all nvidia/digits:6.0 --help
cat <<EOF | sudo tee /etc/docker/daemon.json
{
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}
EOF
# "default-runtime": "nvidia", # => kubernetes control-plane pods are not working
sudo systemctl restart docker
sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
# Unable to find image 'nvidia/cuda:11.0-base' locally
# 11.0-base: Pulling from nvidia/cuda
# docker: Error response from daemon: could not select device driver "" with capabilities: [[gpu]].
sudo docker ps -a
# [NONE]
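# If the "could not select device driver" error above appears, the daemon usually
# has not picked up the nvidia runtime yet; a troubleshooting sketch, not needed
# on a healthy node: verify daemon.json parses, restart, and check the runtimes.
python3 -m json.tool /etc/docker/daemon.json
sudo systemctl restart docker
sudo docker info | grep -i runtime
# Runtimes: nvidia runc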
# Kubernetes prerequisites
lsmod | grep br_netfilter
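# If the module is not listed, load it and persist it across reboots (standard
# kubeadm prerequisite; the file name below is a common convention).
sudo modprobe br_netfilter
echo br_netfilter | sudo tee /etc/modules-load.d/k8s.conf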
sudo swapoff -a
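# swapoff only lasts until reboot; to persist it, comment out the swap entry in
# /etc/fstab (a minimal sketch; review your fstab before applying).
sudo sed -i '/ swap / s/^\(.*\)$/#\1/' /etc/fstab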
# Allow kubelet to run with swap enabled (the drop-in file below is created by
# the kubeadm package, so run this after installing kubelet).
sudo sed -i '9s/^/Environment="KUBELET_EXTRA_ARGS=--fail-swap-on=false"\n/' /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.ipv4.ip_forward = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
EOF
sudo sysctl --system
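# Confirm the settings took effect (the bridge keys require br_netfilter to be loaded).
sysctl net.ipv4.ip_forward net.bridge.bridge-nf-call-iptables net.bridge.bridge-nf-call-ip6tables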
sudo apt-get install -y apt-transport-https
curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
# OK
cat <<EOF | sudo tee /etc/apt/sources.list.d/kubernetes.list
deb https://apt.kubernetes.io/ kubernetes-xenial main
EOF
sudo apt-get update
sudo apt-get install -y kubelet=1.18.2-00 kubeadm=1.18.2-00 kubectl=1.18.2-00 kubernetes-cni
sudo apt-mark hold kubelet kubeadm kubectl
# kubelet set on hold.
# kubeadm set on hold.
# kubectl set on hold.
# Create kubernetes control-plane
sudo kubeadm init --apiserver-advertise-address=$IP --pod-network-cidr=192.168.0.0/16 --kubernetes-version=1.18.2 --v=5
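# kubeadm init prints a "kubeadm join ..." command for worker nodes; if it was
# lost, regenerate it (token lifetime permitting):
sudo kubeadm token create --print-join-command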
# Copy the kubernetes config file to its default path
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
curl https://docs.projectcalico.org/manifests/calico.yaml -O
kubectl apply -f calico.yaml
kubectl get po -A -o wide
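# (Optional) block until the node reports Ready once the Calico pods are up.
kubectl wait --for=condition=Ready node --all --timeout=300s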
# For a single-node cluster, allow pods to schedule on the control-plane node:
# kubectl taint nodes --all node-role.kubernetes.io/master-
# node/linux untainted
# The standalone Docker test below can be skipped; see the kubernetes pod creation further down.
sudo docker pull nvidia/k8s-device-plugin:v0.7.0
# sudo docker tag nvidia/k8s-device-plugin:v0.7.0 nvidia/k8s-device-plugin:devel
# Running the plugin container without the nvidia default runtime fails like this:
# 2020/10/06 05:06:27 Loading NVML
# 2020/10/06 05:06:27 Failed to initialize NVML: could not load NVML library.
# 2020/10/06 05:06:27 If this is a GPU node, did you set the docker default runtime to `nvidia`?
# 2020/10/06 05:06:27 You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
# 2020/10/06 05:06:27 You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
# 2020/10/06 05:06:27 If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
sudo docker run \
    -it \
    --security-opt=no-new-privileges \
    --cap-drop=ALL \
    --network=none \
    -v /var/lib/kubelet/device-plugins:/var/lib/kubelet/device-plugins \
    nvidia/k8s-device-plugin:devel
# The official kubernetes docs install v1.0.0-beta4 of the device plugin, but since that is a beta, install the latest release instead.
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.7.0/nvidia-device-plugin.yml
sudo journalctl -u kubelet -fx
kubectl describe node $MY_GPU_NODE
kubectl logs -n kube-system $DEVICE_PLUGIN_POD
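# Confirm the plugin advertised GPUs to the scheduler; each node should report
# a non-empty nvidia.com/gpu count.
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}'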
# Create sample pod manifest
cat > $HOME/sample-gpu-pod.yaml << EOF
apiVersion: v1
kind: Pod
metadata:
  name: sample-gpu-pod
spec:
  containers:
    - name: cuda-container
      image: nvidia/cuda:11.0-base
      args: ["sh", "-c", "nvidia-smi && tail -f /dev/null"]
      resources:
        limits:
          nvidia.com/gpu: 1 # requesting 1 GPU
    - name: digits-container
      image: nvidia/digits:6.0
      resources:
        limits:
          nvidia.com/gpu: 1 # requesting 1 GPU
EOF
# create pod
kubectl apply -f $HOME/sample-gpu-pod.yaml
# get logs
kubectl logs -f sample-gpu-pod -c cuda-container
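# Run nvidia-smi inside the running container, then clean up the sample pod.
kubectl exec -it sample-gpu-pod -c cuda-container -- nvidia-smi
kubectl delete -f $HOME/sample-gpu-pod.yaml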