Skip to content

Instantly share code, notes, and snippets.

@markrexwinkel
Created October 27, 2020 06:45
Show Gist options
  • Select an option

  • Save markrexwinkel/3607c7073f6f94f2f05d51ef48c04e32 to your computer and use it in GitHub Desktop.

Select an option

Save markrexwinkel/3607c7073f6f94f2f05d51ef48c04e32 to your computer and use it in GitHub Desktop.
k3s docker container gpu support
[plugins.opt]
path = "{{ .NodeConfig.Containerd.Opt }}"
[plugins.cri]
stream_server_address = "127.0.0.1"
stream_server_port = "10010"
{{- if .IsRunningInUserNS }}
disable_cgroup = true
disable_apparmor = true
restrict_oom_score_adj = true
{{end}}
{{- if .NodeConfig.AgentConfig.PauseImage }}
sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}
{{- if not .NodeConfig.NoFlannel }}
[plugins.cri.cni]
bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}
[plugins.cri.containerd.runtimes.runc]
# ---- changed from 'io.containerd.runc.v2' for GPU support
runtime_type = "io.containerd.runtime.v1.linux"
# ---- added for GPU support
[plugins.linux]
runtime = "nvidia-container-runtime"
{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors."{{$k}}"]
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{end}}
{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins.cri.registry.configs."{{$k}}".auth]
{{ if $v.Auth.Username }}username = "{{ $v.Auth.Username }}"{{end}}
{{ if $v.Auth.Password }}password = "{{ $v.Auth.Password }}"{{end}}
{{ if $v.Auth.Auth }}auth = "{{ $v.Auth.Auth }}"{{end}}
{{ if $v.Auth.IdentityToken }}identitytoken = "{{ $v.Auth.IdentityToken }}"{{end}}
{{end}}
{{ if $v.TLS }}
[plugins.cri.registry.configs."{{$k}}".tls]
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{end}}
{{end}}
{{end}}
apiVersion: v1
kind: Pod
metadata:
name: cuda-vector-add
spec:
restartPolicy: OnFailure
containers:
- name: cuda-vector-add
image: "k8s.gcr.io/cuda-vector-add:v0.1"
resources:
limits:
nvidia.com/gpu: 1
FROM ubuntu:18.04 as base
RUN apt-get update -y && apt-get install -y ca-certificates
ADD build/out/data.tar.gz /image
RUN mkdir -p /image/etc/ssl/certs /image/run /image/var/run /image/tmp /image/lib/modules /image/lib/firmware && \
cp /etc/ssl/certs/ca-certificates.crt /image/etc/ssl/certs/ca-certificates.crt
RUN cd image/bin && \
rm -f k3s && \
ln -s k3s-server k3s
FROM ubuntu:18.04
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
RUN apt-get update -y && apt-get -y install gnupg2 curl
RUN apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
RUN sh -c 'echo "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list'
RUN curl -s -L https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
RUN curl -s -L https://nvidia.github.io/nvidia-container-runtime/ubuntu18.04/nvidia-container-runtime.list | tee /etc/apt/sources.list.d/nvidia-container-runtime.list
RUN apt-get update -y
RUN apt-get -y install cuda-drivers
RUN apt-get -y install nvidia-container-runtime
COPY --from=base /image /
RUN mkdir -p /etc && \
echo 'hosts: files dns' > /etc/nsswitch.conf
RUN chmod 1777 /tmp
RUN mkdir -p /var/lib/rancher/k3s/agent/etc/containerd/
COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
RUN mkdir -p /var/lib/rancher/k3s/server/manifests
COPY gpu.yaml /var/lib/rancher/k3s/server/manifests/gpu.yaml
VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log
ENV PATH="$PATH:/bin/aux"
ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
spec:
selector:
matchLabels:
name: nvidia-device-plugin-ds
template:
metadata:
# Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
# reserves resources for critical add-on pods so that they can be rescheduled after
# a failure. This annotation works in tandem with the toleration below.
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: nvidia-device-plugin-ds
spec:
tolerations:
# Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
# This, along with the annotation above marks this pod as a critical add-on.
- key: CriticalAddonsOnly
operator: Exists
containers:
- env:
- name: DP_DISABLE_HEALTHCHECKS
value: xids
image: nvidia/k8s-device-plugin:1.11
name: nvidia-device-plugin-ctr
securityContext:
allowPrivilegeEscalation: true
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
@markrexwinkel
Copy link
Copy Markdown
Author

Steps I took to get gpu working on a k3s container:

  • Installed the cuda drivers on the host
  • Installed nvidia docker on the host
  • Created a new k3s Docker image:
    • Changed the base image to ubuntu:18.04
    • Installed the cuda drivers and nvidia container runtime in the container
    • Copied the deamonset for gpu in the /var/lib/rancher/k3s/server/manifests directory
    • Copied a modified config.toml.tmpl in the /var/lib/rancher/k3s/agent/etc/containerd/ directory. This is needed to configure containerd to use the nvidia container runtime
  • Run the new image with the docker run --gpus all flag
  • Deployed a sample pod to test the gpu.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment