pytorch_job_sample
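A sample Kubeflow PyTorchJob manifest for an elastic 4-node x 8-GPU run on an H100 InfiniBand zone: the workers rendezvous over the c10d backend, request RDMA shared HCA devices alongside the GPUs, mount a memory-backed /dev/shm, and mount an NFS share for data.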
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: pytorch-rlyx-trainer
  namespace: p-ksj
spec:
  elasticPolicy:
    rdzvId: pytorch-job-rlyx-trainer
    rdzvBackend: c10d
    minReplicas: 4
    maxReplicas: 4
    nProcPerNode: 8
  runPolicy:
    cleanPodPolicy: None
  pytorchReplicaSpecs:
    Worker:
      replicas: 4
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "false"
        spec:
          imagePullSecrets:
            - name: ro-base-nvidia
          nodeSelector:
            mlx.navercorp.com/zone: private-h100-ib-ai-devgru-0 # private-h100-ib-ddn-1 # Set this to a zone that has InfiniBand.
          containers:
            - name: pytorch
              image: reg.navercorp.com/base-nvidia/nvidia/pytorch:24.07-py3
              imagePullPolicy: Always
              securityContext: # This securityContext is required in order to use InfiniBand.
                runAsUser: 0
                capabilities:
                  add: ["IPC_LOCK"]
                  # - SYS_RESOURCE
                # privileged: true
              command: ["bash", "-c"]
              args:
                - |
                  echo "print all environment variables"
                  ulimit -l unlimited  # raise the memlock limit for RDMA (needs IPC_LOCK)
                  echo "PET_NNODES: ${PET_NNODES} PET_NPROC_PER_NODE: ${PET_NPROC_PER_NODE} PET_RDZV_ID: ${PET_RDZV_ID} PET_RDZV_BACKEND: ${PET_RDZV_BACKEND} PET_RDZV_ENDPOINT: ${PET_RDZV_ENDPOINT}"
                  curl -LsSf https://astral.sh/uv/install.sh | sh
                  source $HOME/.local/bin/env
                  # cp -r path/to/RLYX /workspace/
                  # cd /workspace/RLYX
                  # ray start --address='{}:5001' --block
                  echo "Bash ready!"; tail -f /dev/null
              env:
                - name: NCCL_DEBUG
                  value: INFO
                - name: TORCH_DISTRIBUTED_DEBUG
                  value: INFO
              resources:
                limits:
                  memory: "1Ti"
                  cpu: 32
                  nvidia.com/gpu: 8
                  rdma/hca_shared_devices_a: 1
                requests:
                  memory: "256Gi"
                  cpu: 32
                  nvidia.com/gpu: 8
                  rdma/hca_shared_devices_a: 1
              # shared memory
              volumeMounts:
                - mountPath: /dev/shm
                  name: shared-memory
                - mountPath: /mnt/fr20tb
                  name: fr20tb
          volumes:
            - emptyDir:
                medium: Memory
              name: shared-memory
            - name: fr20tb
              nfs:
                server: 10.157.140.75
                path: /data1/nfs-share/user/fr20tb
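The container above only installs uv and then idles (tail -f /dev/null), so training has to be started by hand. Below is a minimal sketch of one way to do that, assuming the manifest is saved as pytorch_job_sample.yaml, that the Kubeflow training operator names the pods pytorch-rlyx-trainer-worker-0 through worker-3 as usual, and that train.py is a placeholder for the actual entry script; the PET_* variables are the rendezvous settings the operator injects and the startup script echoes.

# Submit the job and wait for the four worker pods (namespace and names come from the manifest).
kubectl apply -f pytorch_job_sample.yaml
kubectl -n p-ksj get pods -w

# On each worker (shown here for worker 0), launch the elastic run with the
# rendezvous settings the operator injected; repeat for workers 1-3.
# The single quotes keep ${PET_*} from being expanded by the local shell.
kubectl -n p-ksj exec -it pytorch-rlyx-trainer-worker-0 -- bash -c '
  torchrun --nnodes=${PET_NNODES} \
           --nproc_per_node=${PET_NPROC_PER_NODE} \
           --rdzv_id=${PET_RDZV_ID} \
           --rdzv_backend=${PET_RDZV_BACKEND} \
           --rdzv_endpoint=${PET_RDZV_ENDPOINT} \
           train.py'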