Last active
July 4, 2023 05:08
-
-
Save cyrildiagne/bd9242327e4f6f8b5f1455c5b8d61f0a to your computer and use it in GitHub Desktop.
Setup Knative Cluster with GPU on GKE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Sets up a Knative cluster with a GPU node pool on GKE.
#
# Requires gcloud and kubectl.
# Make sure you've enabled the API services using gcloud:
#   gcloud services enable \
#     cloudapis.googleapis.com \
#     container.googleapis.com \
#     containerregistry.googleapis.com
#
# Every setting below (except CLUSTER_USER_ADMIN) can be overridden from the
# environment; the value after `:-` is the default used when the variable is
# unset or empty.

# Exit on error.
# NOTE: `pipefail` is intentionally not enabled. The script probes for
# existing resources with `cmd | grep -q`, and grep exiting early can make the
# producer fail with SIGPIPE, which would be misread as "not found".
set -e

export CLUSTER_NAME="${CLUSTER_NAME:-kuda}"
export CLUSTER_ZONE="${CLUSTER_ZONE:-us-central1-a}"
export PROJECT="${PROJECT:-gpu-sh}"
export DOMAIN="${DOMAIN:-nip.io}"
export MASTER_MACHINE_TYPE="${MASTER_MACHINE_TYPE:-n1-standard-2}"
export GPU_MACHINE_TYPE="${GPU_MACHINE_TYPE:-n1-standard-2}"
export GPU_ACCELERATOR="${GPU_ACCELERATOR:-nvidia-tesla-k80}"
export USE_PREEMPTIBLE_GPU="${USE_PREEMPTIBLE_GPU:-true}"
export KNATIVE_VERSION="${KNATIVE_VERSION:-0.10.0}"

# Account of the currently configured gcloud user.
# Assign first, export afterwards: `export VAR=$(cmd)` returns the status of
# `export`, which would hide a failing gcloud call from `set -e`.
CLUSTER_USER_ADMIN=$(gcloud config get-value core/account)
export CLUSTER_USER_ADMIN
#######################################
# Creates the main Knative cluster with `gcloud beta container clusters create`.
# Globals:
#   CLUSTER_NAME        (read) name of the cluster to create
#   MASTER_MACHINE_TYPE (read) machine type passed to --machine-type
#   CLUSTER_ZONE        (read) zone passed to --zone
# Arguments:
#   None
# Returns:
#   Exit status of the gcloud command.
#######################################
function create_main_cluster() {
  # All expansions are quoted so each value reaches gcloud as a single
  # argument, whatever characters it contains.
  gcloud beta container clusters create "$CLUSTER_NAME" \
    --addons=HorizontalPodAutoscaling,HttpLoadBalancing,Istio \
    --machine-type="$MASTER_MACHINE_TYPE" \
    --cluster-version=latest \
    --zone="$CLUSTER_ZONE" \
    --enable-stackdriver-kubernetes \
    --enable-ip-alias \
    --enable-autoscaling \
    --min-nodes=1 \
    --max-nodes=8 \
    --enable-autorepair \
    --enable-autoupgrade \
    --scopes cloud-platform
}
#######################################
# Grants cluster-admin permissions to the current user by creating the
# `cluster-admin-binding` ClusterRoleBinding.
# Globals:
#   CLUSTER_USER_ADMIN (read) account to bind to the cluster-admin role
# Arguments:
#   None
# Returns:
#   1 if CLUSTER_USER_ADMIN is empty, otherwise the exit status of kubectl.
#######################################
function grant_admin() {
  # An empty account would be sent as a bare `--user=`; reject it up front
  # with a message that names the actual problem.
  if [[ -z "${CLUSTER_USER_ADMIN:-}" ]]; then
    echo "Error: CLUSTER_USER_ADMIN is empty; cannot grant cluster-admin." >&2
    return 1
  fi

  kubectl create clusterrolebinding cluster-admin-binding \
    --clusterrole=cluster-admin \
    --user="$CLUSTER_USER_ADMIN"
}
#######################################
# Creates the default GPU node pool. The pool is named after the accelerator
# type and attaches one accelerator per node.
# Globals:
#   USE_PREEMPTIBLE_GPU (read) "true" adds --preemptible; anything else does not
#   GPU_ACCELERATOR     (read) accelerator type, also used as the pool name
#   GPU_MACHINE_TYPE    (read) machine type passed to --machine-type
#   CLUSTER_ZONE        (read) zone of the cluster
#   CLUSTER_NAME        (read) cluster that receives the node pool
# Arguments:
#   None
# Returns:
#   Exit status of the gcloud command.
#######################################
function create_gpu_nodepools() {
  # Optional flags live in a local array: an empty array expands to zero
  # arguments, so no unquoted expansion is needed to drop the flag.
  local -a preemptible_mode=()
  if [[ "${USE_PREEMPTIBLE_GPU:-}" == "true" ]]; then
    preemptible_mode=(--preemptible)
  fi

  # Create the default GPU Node pool.
  gcloud container node-pools create "$GPU_ACCELERATOR" \
    --machine-type="$GPU_MACHINE_TYPE" \
    --accelerator "type=$GPU_ACCELERATOR,count=1" \
    --zone "$CLUSTER_ZONE" \
    --cluster "$CLUSTER_NAME" \
    --num-nodes 1 \
    --min-nodes 0 \
    --max-nodes 8 \
    --enable-autoupgrade \
    --enable-autoscaling \
    --metadata disable-legacy-endpoints=false \
    "${preemptible_mode[@]}"
}
#######################################
# Makes sure the NVIDIA drivers are installed by applying the
# `daemonset-preloaded.yaml` manifest fetched from the
# GoogleCloudPlatform/container-engine-accelerators repository on GitHub.
# Globals:
#   None
# Arguments:
#   None
# Returns:
#   Exit status of `kubectl apply`.
#######################################
function install_nvidia_drivers() {
  # Kept local so these helper values do not leak into the caller's scope.
  local -r driver_repo="GoogleCloudPlatform/container-engine-accelerators"
  local -r driver_path="master/nvidia-driver-installer/cos/daemonset-preloaded.yaml"

  kubectl apply -f "https://raw.githubusercontent.com/${driver_repo}/${driver_path}"
}
#######################################
# Installs the Knative Serving components from the GitHub release matching
# KNATIVE_VERSION. Monitoring and eventing are not installed, to save some
# resources.
# Globals:
#   KNATIVE_VERSION (read) release version without the leading "v"
# Arguments:
#   None
# Returns:
#   Non-zero if any kubectl step fails (including the CRD wait timing out).
#######################################
function install_knative() {
  local -r crd_selector="knative.dev/crd-install=true"
  local -r serving_yaml="https://github.com/knative/serving/releases/download/v${KNATIVE_VERSION}/serving.yaml"

  # First pass: install only the resources labelled as CRDs.
  kubectl apply --wait=true --selector "$crd_selector" \
    --filename "$serving_yaml" || return

  # Block until the API server reports those CRDs as Established instead of
  # sleeping for a fixed time, which races on slow clusters.
  kubectl wait --for=condition=Established --timeout=60s crd \
    --selector "$crd_selector" || return

  # Complete installation.
  kubectl apply \
    --filename "$serving_yaml"
}
#######################################
# Entry point: provisions whatever is missing, in this order:
#   1. the main cluster (plus cluster-admin binding for the current user),
#   2. the GPU node pool (plus NVIDIA drivers),
#   3. Knative Serving,
#   4. the Knative domain configuration.
# Steps whose resources already exist are skipped, so the script can be re-run.
# Globals:
#   PROJECT, CLUSTER_NAME, CLUSTER_ZONE, GPU_ACCELERATOR, KNATIVE_VERSION,
#   DOMAIN (all read)
# Arguments:
#   None
# Returns:
#   Non-zero if the ingress IP is needed but not available, or if the final
#   kubectl patch fails.
#######################################
function setup() {
  local external_ip=""
  local domain=""

  echo "Setup Knative cluster on GKE..."
  gcloud config set project "$PROJECT"

  # Check if cluster already exists otherwise create one.
  # Names are compared literally and as whole lines (-Fx) so that "kuda" does
  # not match "kuda-prod"; the lookup uses the same zone as cluster creation.
  if gcloud container clusters list \
      --zone "$CLUSTER_ZONE" \
      --format "value(name)" \
    | grep -Fxq -- "$CLUSTER_NAME"; then
    echo "→ Cluster already exists."
  else
    echo "Creating cluster $CLUSTER_NAME..."
    create_main_cluster
    grant_admin
    echo "→ Cluster created."
  fi

  # Get cluster's credentials to use kubectl.
  # The zone is passed explicitly so this does not depend on the user's
  # gcloud default zone matching CLUSTER_ZONE.
  gcloud container clusters get-credentials "$CLUSTER_NAME" \
    --zone "$CLUSTER_ZONE"

  # Check if GPU node pool exists otherwise create one.
  # The pool is named after the accelerator (see create_gpu_nodepools).
  if gcloud container node-pools list \
      --zone "$CLUSTER_ZONE" \
      --cluster "$CLUSTER_NAME" \
      --format "value(name)" \
    | grep -Fxq -- "$GPU_ACCELERATOR"; then
    echo "→ GPU node pool already exists."
  else
    echo "Creating new GPU node pool with default GPU $GPU_ACCELERATOR..."
    create_gpu_nodepools
    install_nvidia_drivers
    echo "→ GPU node pool created."
  fi

  # Install Knative.
  # -F: the dots in the version are literal characters, not regex wildcards.
  if kubectl get pods \
      --namespace knative-serving \
      --label-columns=serving.knative.dev/release \
    | grep -Fq -- "v$KNATIVE_VERSION"; then
    echo "→ Knative v$KNATIVE_VERSION is already installed."
  else
    echo "Installing Knative v$KNATIVE_VERSION..."
    install_knative
    echo "→ Knative installed."
  fi

  # Setup Domain name.
  case "$DOMAIN" in
    xip.io | nip.io)
      # Wildcard-DNS services resolve <ip>.<service> to <ip>, so the domain
      # has to be prefixed with the ingress gateway's external IP.
      # TODO: remove this after the next Knative release.
      external_ip=$(kubectl get svc istio-ingressgateway \
        --namespace istio-system \
        --output jsonpath="{.status.loadBalancer.ingress[*]['ip']}") || return
      if [[ -z "$external_ip" ]]; then
        echo "Error: istio-ingressgateway has no external IP yet;" \
          "re-run this script once the load balancer is ready." >&2
        return 1
      fi
      domain="${external_ip}.${DOMAIN}"
      ;;
    *)
      domain="$DOMAIN"
      ;;
  esac

  # Replace the placeholder example.com entry with the chosen domain.
  kubectl patch configmap config-domain \
    --namespace knative-serving \
    --patch "{\"data\": {\"example.com\": null, \"${domain}\": \"\"}}" || return

  echo "Done!"
}
# Run the whole provisioning flow.
setup
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment