gbrayut · April 30, 2024 02:02
diff --git a/00-setup.sh b/00-setup.sh
 # Add GPU node pool with automatic driver installation. Manual driver installation is required before 1.27: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers
 # If you used the installation DaemonSet to manually install GPU drivers on or before January 25, 2023, you might need to re-apply the DaemonSet to get a version that ignores nodes that use automatic driver installation.

 # Create the "nvidia-l4-cos" node pool on cluster "gke-iowa": Container-Optimized OS
 # (COS_CONTAINERD) g2-standard-24 nodes, each with two NVIDIA L4 GPUs and the default
 # GPU driver version. Autoscaling is enabled but pinned to min=max=1 node in us-central1-a.
 # G2 machine types: https://cloud.google.com/compute/docs/accelerator-optimized-machines#g2-vms
 gcloud beta container node-pools create nvidia-l4-cos \
     --project=gregbray-vpc \
     --cluster=gke-iowa \
     --region=us-central1 \
     --node-locations=us-central1-a \
     --machine-type=g2-standard-24 \
     --accelerator=type=nvidia-l4,count=2,gpu-driver-version=default \
     --image-type=COS_CONTAINERD \
     --disk-type=pd-balanced \
     --disk-size=100 \
     --max-pods-per-node=110 \
     --num-nodes=1 \
     --enable-autoscaling --min-nodes=1 --max-nodes=1
  
diff --git a/01-nvidia-smi-test.yaml b/01-nvidia-smi-test.yaml
 # Diagnostic Deployment: runs a series of nvidia-smi topology / peer-to-peer queries
 # on a node with NVIDIA L4 GPUs, then idles so the output can be collected.
 # After applying, save the logs to a file: kubectl logs deploy/nvidia-smi-p2p
 apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: nvidia-smi-p2p
 spec:
   replicas: 1
   strategy:
     # Recreate: the old pod is removed before its replacement is created.
     # NOTE(review): presumably needed because the pod claims both GPUs of the
     # single node, so a second pod could not schedule during a rolling update.
     type: Recreate
   selector:
     matchLabels:
       app: nvidia-smi-p2p
   template:
     metadata:
       labels:
         app: nvidia-smi-p2p
     spec:
       # No key with operator Exists: tolerate every taint on the target node.
       tolerations:
       - operator: "Exists"
       nodeSelector:
         cloud.google.com/gke-accelerator: "nvidia-l4"  # or use gke-nodepool: 'my-gpu-nodepool'
       # Delete immediately; the container only sleeps once the queries are done.
       terminationGracePeriodSeconds: 0
       containers:
       - name: cuda-simple
         image: nvidia/cuda:11.0.3-base-ubi7
         # bash -x echoes each command (the "+ ..." lines in the logs); -c runs the
         # script below. The final sleep keeps the container running for 24 hours.
         command:
         - bash
         - -xc
         - |
           /usr/local/nvidia/bin/nvidia-smi topo -p2p r;
           /usr/local/nvidia/bin/nvidia-smi topo -p2p w;
           /usr/local/nvidia/bin/nvidia-smi topo -p2p n;
           /usr/local/nvidia/bin/nvidia-smi topo -p2p a;
           /usr/local/nvidia/bin/nvidia-smi topo -p2p p;
           /usr/local/nvidia/bin/nvidia-smi topo -m;
           /usr/local/nvidia/bin/nvidia-smi -L;
           cat /proc/driver/nvidia/version;
           sleep 86400
         resources:
           limits:
             # Two GPUs, so the GPU0 <-> GPU1 relationship can be reported.
             nvidia.com/gpu: 2
diff --git a/02-gke-127.log b/02-gke-127.log
 $ k describe node gke-gke-iowa-nvidia-l4-cos-a85c9850-zk7g 
 ...
 System Info:
  Machine ID:                 6e6bf0d35afce92bec9d47f425f92ec0
  System UUID:                6e6bf0d3-5afc-e92b-ec9d-47f425f92ec0
  Boot ID:                    7952ea2e-c403-4017-aefc-99734e485af9
  Kernel Version:             5.15.146+
  OS Image:                   Container-Optimized OS from Google
  Operating System:           linux
  Architecture:               amd64
  Container Runtime Version:  containerd://1.7.10
  Kubelet Version:            v1.27.12-gke.1115000
  Kube-Proxy Version:         v1.27.12-gke.1115000
 ...


 $ k logs deploy/nvidia-smi-p2p 
 + /usr/local/nvidia/bin/nvidia-smi topo -p2p r
 	GPU0	GPU1	
 GPU0	X	OK	
 GPU1	OK	X	

 Legend:

  X    = Self
  OK   = Status Ok
  CNS  = Chipset not supported
  GNS  = GPU not supported
  TNS  = Topology not supported
  NS   = Not supported
  U    = Unknown
 + /usr/local/nvidia/bin/nvidia-smi topo -p2p w
 	GPU0	GPU1	
 GPU0	X	OK	
 GPU1	OK	X	

 Legend:

  X    = Self
  OK   = Status Ok
  CNS  = Chipset not supported
  GNS  = GPU not supported
  TNS  = Topology not supported
  NS   = Not supported
  U    = Unknown
 + /usr/local/nvidia/bin/nvidia-smi topo -p2p n
 	GPU0	GPU1	
 GPU0	X	NS	
 GPU1	NS	X	

 Legend:

  X    = Self
  OK   = Status Ok
  CNS  = Chipset not supported
  GNS  = GPU not supported
  TNS  = Topology not supported
  NS   = Not supported
  U    = Unknown
 + /usr/local/nvidia/bin/nvidia-smi topo -p2p a
 	GPU0	GPU1	
 GPU0	X	NS	
 GPU1	NS	X	

 Legend:

  X    = Self
  OK   = Status Ok
  CNS  = Chipset not supported
  GNS  = GPU not supported
  TNS  = Topology not supported
  NS   = Not supported
  U    = Unknown
 + /usr/local/nvidia/bin/nvidia-smi topo -p2p p
 	GPU0	GPU1	
 GPU0	X	OK	
 GPU1	OK	X	

 Legend:

  X    = Self
  OK   = Status Ok
  CNS  = Chipset not supported
  GNS  = GPU not supported
  TNS  = Topology not supported
  NS   = Not supported
  U    = Unknown
 + /usr/local/nvidia/bin/nvidia-smi topo -m
 	GPU0	GPU1	CPU Affinity	NUMA Affinity	GPU NUMA ID
 GPU0	 X 	PHB	0-23	0		N/A
 GPU1	PHB	 X 	0-23	0		N/A

 Legend:

  X    = Self
  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
  PIX  = Connection traversing at most a single PCIe bridge
  NV#  = Connection traversing a bonded set of # NVLinks
 + /usr/local/nvidia/bin/nvidia-smi -L
 GPU 0: NVIDIA L4 (UUID: GPU-17fdd1ab-1f5d-c54e-fde2-f270b0515d8f)
 GPU 1: NVIDIA L4 (UUID: GPU-626f9d6d-645e-abd7-bc74-7a5cb0a0dc0b)
 + cat /proc/driver/nvidia/version
 NVRM version: NVIDIA UNIX Open Kernel Module for x86_64  535.161.07  Release Build  (builder@6c81ef5cd5ba)  Fri Mar  8 01:47:43 PM UTC 2024
 GCC version:  Selected multilib: .;@m64
 + sleep 86400
	# Add GPU node pool with automatic driver installation. Manual driver installation is required before 1.27: https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers
	# If you used the installation DaemonSet to manually install GPU drivers on or before January 25, 2023, you might need to re-apply the DaemonSet to get a version that ignores nodes that use automatic driver installation.

	# Create the "nvidia-l4-cos" node pool on cluster "gke-iowa": Container-Optimized OS
	# (COS_CONTAINERD) g2-standard-24 nodes, each with two NVIDIA L4 GPUs and the default
	# GPU driver version. Autoscaling is enabled but pinned to min=max=1 node in us-central1-a.
	# G2 machine types: https://cloud.google.com/compute/docs/accelerator-optimized-machines#g2-vms
	gcloud beta container node-pools create nvidia-l4-cos \
	    --project=gregbray-vpc \
	    --cluster=gke-iowa \
	    --region=us-central1 \
	    --node-locations=us-central1-a \
	    --machine-type=g2-standard-24 \
	    --accelerator=type=nvidia-l4,count=2,gpu-driver-version=default \
	    --image-type=COS_CONTAINERD \
	    --disk-type=pd-balanced \
	    --disk-size=100 \
	    --max-pods-per-node=110 \
	    --num-nodes=1 \
	    --enable-autoscaling --min-nodes=1 --max-nodes=1
	# Diagnostic Deployment: runs a series of nvidia-smi topology / peer-to-peer queries
	# on a node with NVIDIA L4 GPUs, then idles so the output can be collected.
	# After applying, save the logs to a file: kubectl logs deploy/nvidia-smi-p2p
	apiVersion: apps/v1
	kind: Deployment
	metadata:
	  name: nvidia-smi-p2p
	spec:
	  replicas: 1
	  strategy:
	    # Recreate: the old pod is removed before its replacement is created.
	    type: Recreate
	  selector:
	    matchLabels:
	      app: nvidia-smi-p2p
	  template:
	    metadata:
	      labels:
	        app: nvidia-smi-p2p
	    spec:
	      # No key with operator Exists: tolerate every taint on the target node.
	      tolerations:
	      - operator: "Exists"
	      nodeSelector:
	        cloud.google.com/gke-accelerator: "nvidia-l4"  # or use gke-nodepool: 'my-gpu-nodepool'
	      # Delete immediately; the container only sleeps once the queries are done.
	      terminationGracePeriodSeconds: 0
	      containers:
	      - name: cuda-simple
	        image: nvidia/cuda:11.0.3-base-ubi7
	        # bash -x echoes each command (the "+ ..." lines in the logs); -c runs the
	        # script below. The final sleep keeps the container running for 24 hours.
	        command:
	        - bash
	        - -xc
	        - |
	          /usr/local/nvidia/bin/nvidia-smi topo -p2p r;
	          /usr/local/nvidia/bin/nvidia-smi topo -p2p w;
	          /usr/local/nvidia/bin/nvidia-smi topo -p2p n;
	          /usr/local/nvidia/bin/nvidia-smi topo -p2p a;
	          /usr/local/nvidia/bin/nvidia-smi topo -p2p p;
	          /usr/local/nvidia/bin/nvidia-smi topo -m;
	          /usr/local/nvidia/bin/nvidia-smi -L;
	          cat /proc/driver/nvidia/version;
	          sleep 86400
	        resources:
	          limits:
	            # Two GPUs, so the GPU0 <-> GPU1 relationship can be reported.
	            nvidia.com/gpu: 2
	$ k describe node gke-gke-iowa-nvidia-l4-cos-a85c9850-zk7g
	...
	System Info:
	Machine ID: 6e6bf0d35afce92bec9d47f425f92ec0
	System UUID: 6e6bf0d3-5afc-e92b-ec9d-47f425f92ec0
	Boot ID: 7952ea2e-c403-4017-aefc-99734e485af9
	Kernel Version: 5.15.146+
	OS Image: Container-Optimized OS from Google
	Operating System: linux
	Architecture: amd64
	Container Runtime Version: containerd://1.7.10
	Kubelet Version: v1.27.12-gke.1115000
	Kube-Proxy Version: v1.27.12-gke.1115000
	...


	$ k logs deploy/nvidia-smi-p2p
	+ /usr/local/nvidia/bin/nvidia-smi topo -p2p r
	GPU0 GPU1
	GPU0 X OK
	GPU1 OK X

	Legend:

	X = Self
	OK = Status Ok
	CNS = Chipset not supported
	GNS = GPU not supported
	TNS = Topology not supported
	NS = Not supported
	U = Unknown
	+ /usr/local/nvidia/bin/nvidia-smi topo -p2p w
	GPU0 GPU1
	GPU0 X OK
	GPU1 OK X

	Legend:

	X = Self
	OK = Status Ok
	CNS = Chipset not supported
	GNS = GPU not supported
	TNS = Topology not supported
	NS = Not supported
	U = Unknown
	+ /usr/local/nvidia/bin/nvidia-smi topo -p2p n
	GPU0 GPU1
	GPU0 X NS
	GPU1 NS X

	Legend:

	X = Self
	OK = Status Ok
	CNS = Chipset not supported
	GNS = GPU not supported
	TNS = Topology not supported
	NS = Not supported
	U = Unknown
	+ /usr/local/nvidia/bin/nvidia-smi topo -p2p a
	GPU0 GPU1
	GPU0 X NS
	GPU1 NS X

	Legend:

	X = Self
	OK = Status Ok
	CNS = Chipset not supported
	GNS = GPU not supported
	TNS = Topology not supported
	NS = Not supported
	U = Unknown
	+ /usr/local/nvidia/bin/nvidia-smi topo -p2p p
	GPU0 GPU1
	GPU0 X OK
	GPU1 OK X

	Legend:

	X = Self
	OK = Status Ok
	CNS = Chipset not supported
	GNS = GPU not supported
	TNS = Topology not supported
	NS = Not supported
	U = Unknown
	+ /usr/local/nvidia/bin/nvidia-smi topo -m
	GPU0 GPU1 CPU Affinity NUMA Affinity GPU NUMA ID
	GPU0 X PHB 0-23 0 N/A
	GPU1 PHB X 0-23 0 N/A

	Legend:

	X = Self
	SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
	NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
	PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
	PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
	PIX = Connection traversing at most a single PCIe bridge
	NV# = Connection traversing a bonded set of # NVLinks
	+ /usr/local/nvidia/bin/nvidia-smi -L
	GPU 0: NVIDIA L4 (UUID: GPU-17fdd1ab-1f5d-c54e-fde2-f270b0515d8f)
	GPU 1: NVIDIA L4 (UUID: GPU-626f9d6d-645e-abd7-bc74-7a5cb0a0dc0b)
	+ cat /proc/driver/nvidia/version
	NVRM version: NVIDIA UNIX Open Kernel Module for x86_64 535.161.07 Release Build (builder@6c81ef5cd5ba) Fri Mar 8 01:47:43 PM UTC 2024
	GCC version: Selected multilib: .;@m64
	+ sleep 86400