Last active
September 19, 2024 02:47
-
-
Save dims/6d3ffbdcdaf2b894e8eb8e7b5c5c3d14 to your computer and use it in GitHub Desktop.
Notes from running kubetest2 ec2 with NVIDIA GPUs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# All the notes you need for running kubetest2 ec2 with NVIDIA GPUs.
#
# References:
#   https://github.com/awslabs/amazon-eks-ami/releases/tag/v20240227
#   AMI Name: amazon-eks-gpu-node-1.29-v20240227
#   https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html
#   https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
#   https://hub.docker.com/r/nvidia/cuda/tags
#   https://github.com/NVIDIA/k8s-device-plugin/releases

# On the machine you will launch the cluster from: build kubetest2-ec2 from a
# local checkout. The subshell keeps the caller's working directory unchanged,
# and the path is quoted so a $HOME containing spaces does not split.
(cd "$HOME/go/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2" && go install .)

# ...or install the tools straight from their upstream repos.
go install sigs.k8s.io/kubetest2@latest && \
  go install sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo@latest && \
  go install sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2@latest
# AMI IDs are region-specific, so the SSM lookup and the cluster must use the
# same region; keep it in one place.
REGION=us-east-1

# Look up the recommended EKS-optimized Amazon Linux 2 GPU AMI for
# Kubernetes 1.30 from its SSM parameter.
AMI_ID=$(aws ssm get-parameters --region "$REGION" --names \
  /aws/service/eks/optimized-ami/1.30/amazon-linux-2-gpu/recommended/image_id \
  --query 'Parameters[0].[Value]' --output text)

# Version marker published under the "fast" CI stage.
# -f makes curl fail on an HTTP error instead of capturing the error page.
VERSION=$(curl -fsSL https://dl.k8s.io/ci/fast/latest-fast.txt)

# Bring the cluster up on a GPU instance type with the NVIDIA device plugin.
kubetest2 ec2 \
  --stage https://dl.k8s.io/ci/fast/ \
  --version "$VERSION" \
  --instance-type=g4dn.xlarge \
  --device-plugin-nvidia true \
  --worker-image="$AMI_ID" \
  --worker-user-data-file "$(go env GOPATH)/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2/config/al2.sh" \
  --region "$REGION" \
  --up
# After startup, with KUBECONFIG pointing at the cluster created above:
# list the nodes, dump their full objects, and list every pod.
kubectl get nodes
kubectl get nodes --output=yaml
kubectl get pods -A
# Per-node allocatable GPU count; the dot in "nvidia.com" is escaped so it is
# read as part of the resource name rather than as a path separator.
kubectl get nodes --output='custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
# Test pod: runs nvidia-smi in a CUDA runtime image with one GPU requested.
# The manifest contains no shell expansions, so the heredoc is fed to kubectl
# directly with a quoted delimiter.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-smi
spec:
  restartPolicy: OnFailure
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.3.2-runtime-ubuntu22.04
      args:
        - "nvidia-smi"
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
# Deploy the Kubeflow training operator into the default namespace.
CRB_FILE=apps/training-operator/upstream/base/rbac/cluster-role-binding.yaml

# Everything in the braces depends on the working directory, so it only runs
# when the cd into the manifests checkout succeeds.
cd "$HOME/go/src/github.com/kubeflow/manifests" && {
  # Append the namespace line only if it is not already there, so re-running
  # these notes does not add it twice.
  # NOTE(review): the appended line's leading whitespace has to line up with
  # the last entry of that YAML file -- confirm against the checkout.
  grep -q "namespace: default" "$CRB_FILE" || echo " namespace: default" >> "$CRB_FILE"
  kustomize build apps/training-operator/upstream/base | kubectl -n default apply -f -
}

# Try an example: fetch the MPIJob manifest, rewrite v2beta1 to v1, strip the
# "mpiImplementation: OpenMPI" text, and apply it.
# -f stops an HTTP error page from being piped into kubectl.
curl -fsSL https://raw.githubusercontent.com/aws/aws-k8s-tester/main/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml \
  | sed -e 's/v2beta1/v1/' -e 's/mpiImplementation: OpenMPI//' \
  | kubectl apply -f -
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TensorFlow GPU check: prints the visible GPU count, runs a small matmul on
# /GPU:0, then times a 5000x5000 matmul.
# The delimiter is quoted so the shell never expands anything inside the
# embedded Python.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test-pod
spec:
  containers:
    - name: gpu-test
      image: tensorflow/tensorflow:latest-gpu
      command:
        - /usr/local/bin/python
        - -c
        - |
          import time

          import tensorflow as tf

          print("TensorFlow version:", tf.__version__)
          # tf.config.list_physical_devices replaces the older
          # tf.config.experimental alias.
          print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

          # Simple matrix multiplication test
          with tf.device('/GPU:0'):
              a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
              b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
              c = tf.matmul(a, b)
          print("Matrix multiplication result:", c.numpy())

          # Performance test
          n = 5000
          # perf_counter is monotonic and high resolution, unlike time.time().
          start_time = time.perf_counter()
          with tf.device('/GPU:0'):
              matrix1 = tf.random.normal((n, n))
              matrix2 = tf.random.normal((n, n))
              result = tf.matmul(matrix1, matrix2)
              # GPU work is dispatched asynchronously; pulling a scalar back
              # to the host makes the matmul finish before the clock stops.
              tf.reduce_sum(result).numpy()
          end_time = time.perf_counter()
          print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f} seconds")
      resources:
        limits:
          nvidia.com/gpu: "1"
  restartPolicy: OnFailure
EOF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CuPy GPU check: adds random vectors of increasing size on the GPU, verifies
# the result against NumPy, and reports timings.
# The delimiter is quoted so the shell never expands anything inside the
# embedded Python.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test-pod
spec:
  containers:
    - name: gpu-test
      image: cupy/cupy:v13.3.0
      command:
        - python3
        - -c
        - |
          import time

          import cupy as cp
          import numpy as np


          def test_vector_addition():
              """Add vectors on the GPU and compare against the CPU result."""
              # Set the number of elements to test
              num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]
              for num_elements in num_elements_list:
                  # Create random input vectors on the CPU
                  h_A = np.random.rand(num_elements).astype(np.float32)
                  h_B = np.random.rand(num_elements).astype(np.float32)

                  # Transfer the input vectors to the GPU
                  d_A = cp.asarray(h_A)
                  d_B = cp.asarray(h_B)

                  # Warm-up run so one-time kernel compilation is not charged
                  # to the timed addition below.
                  _ = d_A + d_B
                  cp.cuda.Stream.null.synchronize()

                  # Perform vector addition on the GPU. Kernels launch
                  # asynchronously, so wait on the stream before stopping the
                  # clock; otherwise only the launch is measured.
                  start_gpu = time.perf_counter()
                  d_C = d_A + d_B
                  cp.cuda.Stream.null.synchronize()
                  gpu_time = time.perf_counter() - start_gpu

                  # Transfer the result back to the CPU
                  h_C = cp.asnumpy(d_C)

                  # Compute the expected result on the CPU
                  start_cpu = time.perf_counter()
                  h_C_expected = h_A + h_B
                  cpu_time = time.perf_counter() - start_cpu

                  # Verify the result
                  if np.allclose(h_C_expected, h_C, atol=1e-5):
                      print(f"Test PASSED for {num_elements} elements.")
                      print(f"GPU time: {gpu_time:.6f} seconds")
                      print(f"CPU time: {cpu_time:.6f} seconds")
                      # Skip the ratio when the GPU interval rounds to zero
                      # instead of raising ZeroDivisionError.
                      if gpu_time > 0:
                          print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
                  else:
                      print(f"Test FAILED for {num_elements} elements.")

                  # Print the first few elements for verification
                  print("First few elements of A:", h_A[:5])
                  print("First few elements of B:", h_B[:5])
                  print("First few elements of C:", h_C[:5])


          def main():
              test_vector_addition()


          if __name__ == "__main__":
              main()
      resources:
        limits:
          nvidia.com/gpu: "1"
  restartPolicy: OnFailure
EOF
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment