markrexwinkel · October 27, 2020 06:45 · markrexwinkel · Oct 27, 2020
diff --git a/config.toml.tmpl b/config.toml.tmpl
 # containerd config.toml template (Go text/template syntax).
 # Configures the CRI plugin, drives runc through nvidia-container-runtime,
 # and emits optional private-registry mirror, auth and TLS sections.
 [plugins.opt]
  path = "{{ .NodeConfig.Containerd.Opt }}"

 [plugins.cri]
  stream_server_address = "127.0.0.1"
  stream_server_port = "10010"

 {{- if .IsRunningInUserNS }}
  disable_cgroup = true
  disable_apparmor = true
  restrict_oom_score_adj = true
 {{end}}

 {{- if .NodeConfig.AgentConfig.PauseImage }}
  sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
 {{end}}

 {{- if not .NodeConfig.NoFlannel }}
 [plugins.cri.cni]
  bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
  conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
 {{end}}

 [plugins.cri.containerd.runtimes.runc]
  # ---- changed from 'io.containerd.runc.v2' for GPU support
  runtime_type = "io.containerd.runtime.v1.linux"

 # ---- added for GPU support
 [plugins.linux]
  runtime = "nvidia-container-runtime"

 {{ if .PrivateRegistryConfig }}
 {{ if .PrivateRegistryConfig.Mirrors }}
 [plugins.cri.registry.mirrors]{{end}}
 {{range $k, $v := .PrivateRegistryConfig.Mirrors }}
 [plugins.cri.registry.mirrors."{{$k}}"]
  endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
 {{end}}

 {{range $k, $v := .PrivateRegistryConfig.Configs }}
 {{ if $v.Auth }}
 # Credentials are emitted with printf "%q" so that quotes or backslashes
 # inside them are escaped instead of producing an unparsable file.
 [plugins.cri.registry.configs."{{$k}}".auth]
  {{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
  {{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
  {{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
  {{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
 {{end}}
 {{ if $v.TLS }}
 [plugins.cri.registry.configs."{{$k}}".tls]
  {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
  {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
  {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
 {{end}}
 {{end}}
 {{end}}
diff --git a/cuda-vector-add.yaml b/cuda-vector-add.yaml
 # Smoke-test Pod: runs the cuda-vector-add image with a limit of one nvidia.com/gpu.
 kind: Pod
 apiVersion: v1
 metadata:
  name: cuda-vector-add
 spec:
  containers:
    - name: cuda-vector-add
      image: k8s.gcr.io/cuda-vector-add:v0.1
      resources:
        limits:
          # Extended resource name used for GPUs.
          nvidia.com/gpu: 1
  # Restart the container only when it exits with a failure.
  restartPolicy: OnFailure
diff --git a/Dockerfile b/Dockerfile
 # Stage "base": unpack the k3s data tarball and assemble the root filesystem under /image.
 FROM ubuntu:18.04 AS base
 RUN apt-get update -y && apt-get install -y ca-certificates
 ADD build/out/data.tar.gz /image
 RUN mkdir -p /image/etc/ssl/certs /image/run /image/var/run /image/tmp /image/lib/modules /image/lib/firmware && \
     cp /etc/ssl/certs/ca-certificates.crt /image/etc/ssl/certs/ca-certificates.crt
 # Absolute path so this step does not depend on the stage's working directory.
 RUN cd /image/bin && \
     rm -f k3s && \
     ln -s k3s-server k3s

 # Final stage: Ubuntu with the NVIDIA driver and container runtime, plus the filesystem built in "base".
 FROM ubuntu:18.04
 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
 RUN apt-get update -y && apt-get -y install gnupg2 curl
 # The CUDA repository key and package index are fetched over HTTPS rather than plain HTTP.
 # NOTE(review): confirm 7fa2af80 is still the key NVIDIA signs this repository with.
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
 RUN sh -c 'echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list'
 # -f makes curl exit non-zero on HTTP errors; -o writes the list file directly so a
 # failed download fails the build instead of leaving an empty sources file behind.
 RUN curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
 RUN curl -fsSL -o /etc/apt/sources.list.d/nvidia-container-runtime.list https://nvidia.github.io/nvidia-container-runtime/ubuntu18.04/nvidia-container-runtime.list
 # Refresh the index and install in the same layer so a cached, stale index is never
 # reused for the install; the index is removed afterwards to keep the layer small.
 RUN apt-get update -y && \
     apt-get -y install cuda-drivers nvidia-container-runtime && \
     rm -rf /var/lib/apt/lists/*
 COPY --from=base /image /
 RUN mkdir -p /etc && \
     echo 'hosts: files dns' > /etc/nsswitch.conf
 RUN chmod 1777 /tmp
 # Custom containerd template and the device-plugin manifest are placed under the k3s data directory.
 RUN mkdir -p /var/lib/rancher/k3s/agent/etc/containerd/
 COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
 RUN mkdir -p /var/lib/rancher/k3s/server/manifests
 COPY gpu.yaml /var/lib/rancher/k3s/server/manifests/gpu.yaml
 VOLUME /var/lib/kubelet
 VOLUME /var/lib/rancher/k3s
 VOLUME /var/lib/cni
 VOLUME /var/log
 ENV PATH="$PATH:/bin/aux"
 ENTRYPOINT ["/bin/k3s"]
 CMD ["agent"]
diff --git a/gpu.yaml b/gpu.yaml
 # DaemonSet that runs the NVIDIA device plugin container in kube-system and
 # mounts the kubelet device-plugins directory from the host.
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
 spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      # The critical-pod annotation is deprecated in favour of priorityClassName
      # (set in the pod spec below); it is kept only for older clusters.
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: nvidia-device-plugin-ds
    spec:
      # Mark this pod as a critical add-on through the built-in priority class,
      # the supported replacement for the annotation above.
      priorityClassName: system-node-critical
      tolerations:
      # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
      - key: CriticalAddonsOnly
        operator: Exists
      containers:
      - env:
        - name: DP_DISABLE_HEALTHCHECKS
          value: xids
        image: nvidia/k8s-device-plugin:1.11
        name: nvidia-device-plugin-ctr
        securityContext:
          allowPrivilegeEscalation: true
          capabilities:
            drop: ["ALL"]
        volumeMounts:
          - name: device-plugin
            mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
	# containerd config.toml template (Go text/template syntax).
	# Configures the CRI plugin, drives runc through nvidia-container-runtime,
	# and emits optional private-registry mirror, auth and TLS sections.
	[plugins.opt]
	path = "{{ .NodeConfig.Containerd.Opt }}"

	[plugins.cri]
	stream_server_address = "127.0.0.1"
	stream_server_port = "10010"

	{{- if .IsRunningInUserNS }}
	disable_cgroup = true
	disable_apparmor = true
	restrict_oom_score_adj = true
	{{end}}

	{{- if .NodeConfig.AgentConfig.PauseImage }}
	sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
	{{end}}

	{{- if not .NodeConfig.NoFlannel }}
	[plugins.cri.cni]
	bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
	conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
	{{end}}

	[plugins.cri.containerd.runtimes.runc]
	# ---- changed from 'io.containerd.runc.v2' for GPU support
	runtime_type = "io.containerd.runtime.v1.linux"

	# ---- added for GPU support
	[plugins.linux]
	runtime = "nvidia-container-runtime"

	{{ if .PrivateRegistryConfig }}
	{{ if .PrivateRegistryConfig.Mirrors }}
	[plugins.cri.registry.mirrors]{{end}}
	{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
	[plugins.cri.registry.mirrors."{{$k}}"]
	endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
	{{end}}

	{{range $k, $v := .PrivateRegistryConfig.Configs }}
	{{ if $v.Auth }}
	# Credentials are emitted with printf "%q" so that quotes or backslashes
	# inside them are escaped instead of producing an unparsable file.
	[plugins.cri.registry.configs."{{$k}}".auth]
	{{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
	{{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
	{{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
	{{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
	{{end}}
	{{ if $v.TLS }}
	[plugins.cri.registry.configs."{{$k}}".tls]
	{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
	{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
	{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
	{{end}}
	{{end}}
	{{end}}
	# Smoke-test Pod: runs the cuda-vector-add image with a limit of one nvidia.com/gpu.
	# Nesting restored; the flattened form left metadata and spec without children.
	apiVersion: v1
	kind: Pod
	metadata:
	  name: cuda-vector-add
	spec:
	  restartPolicy: OnFailure
	  containers:
	    - name: cuda-vector-add
	      image: "k8s.gcr.io/cuda-vector-add:v0.1"
	      resources:
	        limits:
	          nvidia.com/gpu: 1
	# Stage "base": unpack the k3s data tarball and assemble the root filesystem under /image.
	FROM ubuntu:18.04 AS base
	RUN apt-get update -y && apt-get install -y ca-certificates
	ADD build/out/data.tar.gz /image
	RUN mkdir -p /image/etc/ssl/certs /image/run /image/var/run /image/tmp /image/lib/modules /image/lib/firmware && \
	    cp /etc/ssl/certs/ca-certificates.crt /image/etc/ssl/certs/ca-certificates.crt
	# Absolute path so this step does not depend on the stage's working directory.
	RUN cd /image/bin && \
	    rm -f k3s && \
	    ln -s k3s-server k3s

	# Final stage: Ubuntu with the NVIDIA driver and container runtime, plus the filesystem built in "base".
	FROM ubuntu:18.04
	# Pipes are written unescaped: a backslash-escaped pipe would be passed to the
	# command as a literal argument instead of connecting the two processes.
	RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
	RUN apt-get update -y && apt-get -y install gnupg2 curl
	# The CUDA repository key and package index are fetched over HTTPS rather than plain HTTP.
	# NOTE(review): confirm 7fa2af80 is still the key NVIDIA signs this repository with.
	RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
	RUN sh -c 'echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list'
	# -f makes curl exit non-zero on HTTP errors; -o writes the list file directly so a
	# failed download fails the build instead of leaving an empty sources file behind.
	RUN curl -fsSL https://nvidia.github.io/nvidia-container-runtime/gpgkey | apt-key add -
	RUN curl -fsSL -o /etc/apt/sources.list.d/nvidia-container-runtime.list https://nvidia.github.io/nvidia-container-runtime/ubuntu18.04/nvidia-container-runtime.list
	# Refresh the index and install in the same layer so a cached, stale index is never
	# reused for the install; the index is removed afterwards to keep the layer small.
	RUN apt-get update -y && \
	    apt-get -y install cuda-drivers nvidia-container-runtime && \
	    rm -rf /var/lib/apt/lists/*
	COPY --from=base /image /
	RUN mkdir -p /etc && \
	    echo 'hosts: files dns' > /etc/nsswitch.conf
	RUN chmod 1777 /tmp
	# Custom containerd template and the device-plugin manifest are placed under the k3s data directory.
	RUN mkdir -p /var/lib/rancher/k3s/agent/etc/containerd/
	COPY config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
	RUN mkdir -p /var/lib/rancher/k3s/server/manifests
	COPY gpu.yaml /var/lib/rancher/k3s/server/manifests/gpu.yaml
	VOLUME /var/lib/kubelet
	VOLUME /var/lib/rancher/k3s
	VOLUME /var/lib/cni
	VOLUME /var/log
	ENV PATH="$PATH:/bin/aux"
	ENTRYPOINT ["/bin/k3s"]
	CMD ["agent"]
	# DaemonSet that runs the NVIDIA device plugin container in kube-system and
	# mounts the kubelet device-plugins directory from the host.
	# Nesting restored; the flattened form collapsed every key to one level.
	apiVersion: apps/v1
	kind: DaemonSet
	metadata:
	  name: nvidia-device-plugin-daemonset
	  namespace: kube-system
	spec:
	  selector:
	    matchLabels:
	      name: nvidia-device-plugin-ds
	  template:
	    metadata:
	      # The critical-pod annotation is deprecated in favour of priorityClassName
	      # (set in the pod spec below); it is kept only for older clusters.
	      annotations:
	        scheduler.alpha.kubernetes.io/critical-pod: ""
	      labels:
	        name: nvidia-device-plugin-ds
	    spec:
	      # Mark this pod as a critical add-on through the built-in priority class,
	      # the supported replacement for the annotation above.
	      priorityClassName: system-node-critical
	      tolerations:
	      # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
	      - key: CriticalAddonsOnly
	        operator: Exists
	      containers:
	      - env:
	        - name: DP_DISABLE_HEALTHCHECKS
	          value: xids
	        image: nvidia/k8s-device-plugin:1.11
	        name: nvidia-device-plugin-ctr
	        securityContext:
	          allowPrivilegeEscalation: true
	          capabilities:
	            drop: ["ALL"]
	        volumeMounts:
	          - name: device-plugin
	            mountPath: /var/lib/kubelet/device-plugins
	      volumes:
	        - name: device-plugin
	          hostPath:
	            path: /var/lib/kubelet/device-plugins