Notes from running kubetest2 ec2 with nvidia
#!/bin/bash
# All the notes you need!!!
# https://github.com/awslabs/amazon-eks-ami/releases/tag/v20240227
# AMI Name: amazon-eks-gpu-node-1.29-v20240227
# https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html
# https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
# https://hub.docker.com/r/nvidia/cuda/tags
# https://github.com/NVIDIA/k8s-device-plugin/releases
# run this on the box you drive the tests from (build from a local checkout)
(cd $HOME/go/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2 && go install .)
# or install straight from the upstream repos
go install sigs.k8s.io/kubetest2@latest && \
go install sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo@latest && \
go install sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2@latest
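# go install drops the binaries into $(go env GOPATH)/bin (or $GOBIN);
# make sure that directory is on PATH if it is not already:
export PATH="$PATH:$(go env GOPATH)/bin"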
AMI_ID=$(aws ssm get-parameters --names \
/aws/service/eks/optimized-ami/1.30/amazon-linux-2-gpu/recommended/image_id \
--query 'Parameters[0].[Value]' --output text)
VERSION=$(curl -Ls https://dl.k8s.io/ci/fast/latest-fast.txt)
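# sanity check the resolved values before bringing the cluster up
echo "AMI_ID=${AMI_ID} VERSION=${VERSION}"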
kubetest2 ec2 \
--stage https://dl.k8s.io/ci/fast/ \
--version $VERSION \
--instance-type=g4dn.xlarge \
--device-plugin-nvidia true \
--worker-image="$AMI_ID" \
--worker-user-data-file $(go env GOPATH)/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2/config/al2.sh \
--region us-east-1 \
--up
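# tear-down later is the same deployer invoked with --down instead of --up
# (a sketch; run it from the same machine/run directory so it can find the instances):
# kubetest2 ec2 --region us-east-1 --down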
# post startup once you have KUBECONFIG pointed to the cluster created above
kubectl get nodes
kubectl get nodes -o yaml
kubectl get pods --all-namespaces
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"
# Test pod
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-smi
spec:
  restartPolicy: OnFailure
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.3.2-runtime-ubuntu22.04
      args:
        - "nvidia-smi"
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
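# the pod runs nvidia-smi once and exits; check it completed and look at the output
kubectl get pod nvidia-smi
kubectl logs nvidia-smi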
# deploy kubeflow stuff
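# (assumes the kubeflow/manifests repo is already cloned at the path below;
# the grep/echo line appends "namespace: default" to the ClusterRoleBinding's
# ServiceAccount subject, since the operator is installed into the default namespace here)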
cd $HOME/go/src/github.com/kubeflow/manifests
grep "namespace: default" apps/training-operator/upstream/base/rbac/cluster-role-binding.yaml || echo " namespace: default" >> apps/training-operator/upstream/base/rbac/cluster-role-binding.yaml
kustomize build apps/training-operator/upstream/base | kubectl -n default apply -f -
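# verify the training operator came up and its CRDs were registered
kubectl -n default get pods
kubectl get crds | grep kubeflow.org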
# try an example
curl https://raw.githubusercontent.com/aws/aws-k8s-tester/main/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml | sed 's/v2beta1/v1/' | sed 's/mpiImplementation: OpenMPI//' | kubectl apply -f -
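# the sed calls downgrade the manifest from the MPIJob v2beta1 API (and drop its
# v2beta1-only mpiImplementation field) to the v1 API served by the training operator;
# then watch the job and its launcher/worker pods
kubectl get mpijobs
kubectl get pods -w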
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test-pod
spec:
  containers:
    - name: gpu-test
      image: tensorflow/tensorflow:latest-gpu
      command:
        - /usr/local/bin/python
        - -c
        - |
          import tensorflow as tf
          import time
          print("TensorFlow version:", tf.__version__)
          print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
          # Simple matrix multiplication test
          with tf.device('/GPU:0'):
              a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
              b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
              c = tf.matmul(a, b)
              print("Matrix multiplication result:", c.numpy())
          # Performance test
          n = 5000
          start_time = time.time()
          with tf.device('/GPU:0'):
              matrix1 = tf.random.normal((n, n))
              matrix2 = tf.random.normal((n, n))
              result = tf.matmul(matrix1, matrix2)
          end_time = time.time()
          print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f} seconds")
      resources:
        limits:
          nvidia.com/gpu: "1"
  restartPolicy: OnFailure
EOF
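# follow the TensorFlow test output, then remove the pod: the next manifest
# reuses the same pod name, so the old pod has to go before re-applying
kubectl logs -f gpu-test-pod
kubectl delete pod gpu-test-pod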
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test-pod
spec:
  containers:
    - name: gpu-test
      image: cupy/cupy:v13.3.0
      command:
        - python3
        - -c
        - |
          import cupy as cp
          import numpy as np
          import time

          def test_vector_addition():
              # Set the number of elements to test
              num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]
              for num_elements in num_elements_list:
                  # Create random input vectors on the CPU
                  h_A = np.random.rand(num_elements).astype(np.float32)
                  h_B = np.random.rand(num_elements).astype(np.float32)
                  # Transfer the input vectors to the GPU
                  d_A = cp.asarray(h_A)
                  d_B = cp.asarray(h_B)
                  # Perform vector addition on the GPU
                  start_gpu = time.time()
                  d_C = d_A + d_B
                  gpu_time = time.time() - start_gpu
                  # Transfer the result back to the CPU
                  h_C = cp.asnumpy(d_C)
                  # Compute the expected result on the CPU
                  start_cpu = time.time()
                  h_C_expected = h_A + h_B
                  cpu_time = time.time() - start_cpu
                  # Verify the result
                  if np.allclose(h_C_expected, h_C, atol=1e-5):
                      print(f"Test PASSED for {num_elements} elements.")
                      print(f"GPU time: {gpu_time:.6f} seconds")
                      print(f"CPU time: {cpu_time:.6f} seconds")
                      print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
                  else:
                      print(f"Test FAILED for {num_elements} elements.")
                  # Print the first few elements for verification
                  print("First few elements of A:", h_A[:5])
                  print("First few elements of B:", h_B[:5])
                  print("First few elements of C:", h_C[:5])

          def main():
              test_vector_addition()

          if __name__ == "__main__":
              main()
      resources:
        limits:
          nvidia.com/gpu: "1"
  restartPolicy: OnFailure
EOF
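# follow the CuPy vector-addition test output and clean up the test pods when done
kubectl logs -f gpu-test-pod
kubectl delete pod gpu-test-pod nvidia-smi --ignore-not-found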