Last active
August 14, 2024 04:55
-
-
Save Curt-Park/bb20f76ba2b052b03b2e1ea9834517a6 to your computer and use it in GitHub Desktop.
GPU Sharing Containers in a single pod
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Template for a GPU-sharing Job: two containers in one pod target the SAME
# physical GPU by setting NVIDIA_VISIBLE_DEVICES to the same GPU UUID.
# The {UNIQUE_ID}, {GPU_UUID}, and {NODE_NAME} placeholders are substituted
# with sed by the trigger job before `kubectl create`. Placeholders are
# quoted so the template is valid YAML even before substitution
# (an unquoted `{GPU_UUID}` would parse as a flow mapping, not a string).
apiVersion: batch/v1
kind: Job
metadata:
  name: "gpu-sharing-{UNIQUE_ID}-job"
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: cuda-container-0
          image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
          command:
            - sh
            - -c
            - |
              export | grep NVIDIA_VISIBLE_DEVICES; # just for logging.
              python3 -c "import torch; print('GPU Count: ', torch.cuda.device_count())";
          env:
            # Must be set via `env`; exporting it inside `command` does not
            # take effect (the runtime reads it at container start).
            - name: NVIDIA_VISIBLE_DEVICES
              value: "{GPU_UUID}"
        - name: cuda-container-1
          image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
          command:
            - sh
            - -c
            - |
              export | grep NVIDIA_VISIBLE_DEVICES; # just for logging.
              python3 -c "import torch; print('GPU Count: ', torch.cuda.device_count())";
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: "{GPU_UUID}"
      # Pin onto the node that already holds the shared physical GPU.
      nodeSelector:
        kubernetes.io/hostname: "{NODE_NAME}"
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# This Job holds one physical GPU, reads its UUID from
# NVIDIA_VISIBLE_DEVICES, renders the GPU-sharing Job template with that
# UUID, and creates the sharing Job on the same node.
apiVersion: batch/v1
kind: Job
metadata:
  name: gpu-sharing-job-trigger-job
spec:
  template:
    spec:
      # Needs RBAC permission to create/get Jobs (Role below).
      serviceAccountName: gpu-sharing-sa
      restartPolicy: Never
      containers:
        - name: gpu-sharing-job-trigger
          image: nvidia/cuda:12.1.0-base-ubuntu18.04
          command:
            - sh
            - -c
            - |
              # Setup: install kubectl and helper tools.
              apt-get update && apt-get install -y curl wget openssl;
              curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl";
              chmod +x ./kubectl && mv ./kubectl /usr/local/bin/kubectl;
              # Get the UUID of the GPU scheduled to this pod.
              GPU_UUID=$NVIDIA_VISIBLE_DEVICES;
              echo "Scheduled GPUs: ${GPU_UUID}";
              # Fetch the Job template and fill in the placeholders.
              wget https://gist.githubusercontent.com/Curt-Park/bb20f76ba2b052b03b2e1ea9834517a6/raw/47c03e1d371c4ddc821d115be02cf3563b8b2c5b/gpu_sharing_job_template.yaml;
              UNIQUE_ID=$(openssl rand -hex 12);
              sed -e s/{NODE_NAME}/${NODE_NAME}/ \
                  -e s/{GPU_UUID}/${GPU_UUID}/ \
                  -e s/{UNIQUE_ID}/${UNIQUE_ID}/ \
                  gpu_sharing_job_template.yaml > gpu_sharing_job.yaml;
              cat gpu_sharing_job.yaml;
              # Create the job.
              kubectl create -f gpu_sharing_job.yaml;
              # Wait until the triggered job terminates. Also watch the
              # "Failed" condition so a failed job does not poll forever.
              JOB_NAME=gpu-sharing-$UNIQUE_ID-job;
              while true; do
                complete=$(kubectl get job $JOB_NAME -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}');
                failed=$(kubectl get job $JOB_NAME -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}');
                if [ "$complete" = "True" ]; then
                  echo "Job $JOB_NAME completed";
                  break;
                fi;
                if [ "$failed" = "True" ]; then
                  echo "Job $JOB_NAME failed";
                  exit 1;
                fi;
                echo "Waiting for the job $JOB_NAME to complete...";
                sleep 1;
              done;
          env:
            # The node name pins the sharing job onto this same node.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          resources:
            # Requesting 1 physical GPU to share.
            requests:
              nvidia.com/gpu: 1
            limits:
              nvidia.com/gpu: 1
      nodeSelector:
        nodeType: gpu
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-sharing-sa
---
# Minimal RBAC: the trigger only creates jobs and polls their status.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: gpu-sharing-role
rules:
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["get", "create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: gpu-sharing-rolebinding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: gpu-sharing-role
subjects:
  # NOTE(review): ServiceAccount subjects normally carry an explicit
  # `namespace:` field — confirm this binds in the intended namespace.
  - kind: ServiceAccount
    name: gpu-sharing-sa
NOTE: running `export NVIDIA_VISIBLE_DEVICES=...` inside `containers.command` does not work; the variable must be set through the container's `env` field, as done in the manifests above.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The triggered job log: