Notes from running kubetest2 ec2 with nvidia
#!/bin/bash
# All the notes you need!!!
# https://github.com/awslabs/amazon-eks-ami/releases/tag/v20240227
# AMI Name: amazon-eks-gpu-node-1.29-v20240227
# https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html
# https://docs.aws.amazon.com/dlami/latest/devguide/gpu.html
# https://hub.docker.com/r/nvidia/cuda/tags
# https://github.com/NVIDIA/k8s-device-plugin/releases
# run this on the box you drive the tests from (build from a local checkout)
(cd $HOME/go/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2 && go install .)
# or install straight from the upstream repos
go install sigs.k8s.io/kubetest2@latest && \
go install sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo@latest && \
go install sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2@latest
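# go install drops the binaries into $(go env GOPATH)/bin (or $GOBIN);
# make sure that directory is on PATH if it is not already:
export PATH="$PATH:$(go env GOPATH)/bin"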
AMI_ID=$(aws ssm get-parameters --names \
/aws/service/eks/optimized-ami/1.30/amazon-linux-2-gpu/recommended/image_id \
--query 'Parameters[0].[Value]' --output text)
VERSION=$(curl -Ls https://dl.k8s.io/ci/fast/latest-fast.txt)
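# sanity check the resolved values before bringing the cluster up
echo "AMI_ID=${AMI_ID} VERSION=${VERSION}"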
kubetest2 ec2 \
--stage https://dl.k8s.io/ci/fast/ \
--version $VERSION \
--instance-type=g4dn.xlarge \
--device-plugin-nvidia true \
--worker-image="$AMI_ID" \
--worker-user-data-file $(go env GOPATH)/src/sigs.k8s.io/provider-aws-test-infra/kubetest2-ec2/config/al2.sh \
--region us-east-1 \
--up
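# tear-down later is the same deployer invoked with --down instead of --up
# (a sketch; run it from the same machine/run directory so it can find the instances):
# kubetest2 ec2 --region us-east-1 --down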
# post startup once you have KUBECONFIG pointed to the cluster created above
kubectl get nodes
kubectl get nodes -o yaml
kubectl get pods --all-namespaces
kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu"
# Test pod
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: nvidia-smi
spec:
  restartPolicy: OnFailure
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.3.2-runtime-ubuntu22.04
      args:
        - "nvidia-smi"
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
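# the pod runs nvidia-smi once and exits; check it completed and look at the output
kubectl get pod nvidia-smi
kubectl logs nvidia-smi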
# deploy kubeflow stuff
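# (assumes the kubeflow/manifests repo is already cloned at the path below;
# the grep/echo line appends "namespace: default" to the ClusterRoleBinding's
# ServiceAccount subject, since the operator is installed into the default namespace here)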
cd $HOME/go/src/github.com/kubeflow/manifests
grep "namespace: default" apps/training-operator/upstream/base/rbac/cluster-role-binding.yaml || echo " namespace: default" >> apps/training-operator/upstream/base/rbac/cluster-role-binding.yaml
kustomize build apps/training-operator/upstream/base | kubectl -n default apply -f -
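# verify the training operator came up and its CRDs were registered
kubectl -n default get pods
kubectl get crds | grep kubeflow.org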
# try an example
curl https://raw.githubusercontent.com/aws/aws-k8s-tester/main/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml | sed 's/v2beta1/v1/' | sed 's/mpiImplementation: OpenMPI//' | kubectl apply -f -
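# the sed calls downgrade the manifest from the MPIJob v2beta1 API (and drop its
# v2beta1-only mpiImplementation field) to the v1 API served by the training operator;
# then watch the job and its launcher/worker pods
kubectl get mpijobs
kubectl get pods -w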
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test-pod
spec:
  containers:
    - name: gpu-test
      image: tensorflow/tensorflow:latest-gpu
      command:
        - /usr/local/bin/python
        - -c
        - |
          import tensorflow as tf
          import time
          print("TensorFlow version:", tf.__version__)
          print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
          # Simple matrix multiplication test
          with tf.device('/GPU:0'):
              a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
              b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
              c = tf.matmul(a, b)
              print("Matrix multiplication result:", c.numpy())
          # Performance test
          n = 5000
          start_time = time.time()
          with tf.device('/GPU:0'):
              matrix1 = tf.random.normal((n, n))
              matrix2 = tf.random.normal((n, n))
              result = tf.matmul(matrix1, matrix2)
          end_time = time.time()
          print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f} seconds")
      resources:
        limits:
          nvidia.com/gpu: "1"
  restartPolicy: OnFailure
EOF
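# follow the TensorFlow test output, then remove the pod: the next manifest
# reuses the same pod name, so the old pod has to go before re-applying
kubectl logs -f gpu-test-pod
kubectl delete pod gpu-test-pod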
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test-pod
spec:
  containers:
    - name: gpu-test
      image: cupy/cupy:v13.3.0
      command:
        - python3
        - -c
        - |
          import cupy as cp
          import numpy as np
          import time

          def test_vector_addition():
              # Set the number of elements to test
              num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]
              for num_elements in num_elements_list:
                  # Create random input vectors on the CPU
                  h_A = np.random.rand(num_elements).astype(np.float32)
                  h_B = np.random.rand(num_elements).astype(np.float32)
                  # Transfer the input vectors to the GPU
                  d_A = cp.asarray(h_A)
                  d_B = cp.asarray(h_B)
                  # Perform vector addition on the GPU
                  start_gpu = time.time()
                  d_C = d_A + d_B
                  gpu_time = time.time() - start_gpu
                  # Transfer the result back to the CPU
                  h_C = cp.asnumpy(d_C)
                  # Compute the expected result on the CPU
                  start_cpu = time.time()
                  h_C_expected = h_A + h_B
                  cpu_time = time.time() - start_cpu
                  # Verify the result
                  if np.allclose(h_C_expected, h_C, atol=1e-5):
                      print(f"Test PASSED for {num_elements} elements.")
                      print(f"GPU time: {gpu_time:.6f} seconds")
                      print(f"CPU time: {cpu_time:.6f} seconds")
                      print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
                  else:
                      print(f"Test FAILED for {num_elements} elements.")
                  # Print the first few elements for verification
                  print("First few elements of A:", h_A[:5])
                  print("First few elements of B:", h_B[:5])
                  print("First few elements of C:", h_C[:5])

          def main():
              test_vector_addition()

          if __name__ == "__main__":
              main()
      resources:
        limits:
          nvidia.com/gpu: "1"
  restartPolicy: OnFailure
EOF
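# follow the CuPy vector-addition test output and clean up the test pods when done
kubectl logs -f gpu-test-pod
kubectl delete pod gpu-test-pod nvidia-smi --ignore-not-found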