Last active
January 3, 2023 00:52
-
-
Save MatrixManAtYrService/737cb408e5a27c2aaa19576b0f6ec18a to your computer and use it in GitHub Desktop.
Run a script as a specific service account on Google AI Platform
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Image with python3, wget, and the Google Cloud SDK (gcloud / gsutil),
# whose entrypoint can fetch a script from a GCS bucket and run it.
FROM centos:centos7

RUN yum install -y python3 wget && \
    yum clean all

# GCloud Access
# (the final rm removes the backup directory that the gcloud installer creates)
RUN wget -nv \
        https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz && \
    mkdir /root/tools && \
    tar xvzf google-cloud-sdk.tar.gz -C /root/tools && \
    rm google-cloud-sdk.tar.gz && \
    /root/tools/google-cloud-sdk/install.sh --usage-reporting=false \
        --path-update=false --bash-completion=false \
        --disable-installation-options && \
    rm -rf /root/.config/* && \
    ln -s /root/.config /config && \
    rm -rf /root/tools/google-cloud-sdk/.install/.backup

# Path configuration
ENV PATH=$PATH:/root/tools/google-cloud-sdk/bin

# Make sure gsutil will use the default service account.
# printf is used instead of echo: the shell's echo does not expand "\n"
# here, which would leave both settings on one (invalid) line.
RUN printf '[GoogleCompute]\nservice_account = default\n' > /etc/boto.cfg

COPY ./entrypoint /root/entrypoint
ENTRYPOINT ["/root/entrypoint"]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env bash

# Container entrypoint.
#
# If the first parameter looks like:
#     gs://some-bucket-name/foo_script
# then this entrypoint script will copy that script to /run/cmd, make it
# executable, and run it.
#
# The bucket will need to be in the project you're using to launch the
# training job, otherwise you'll need to give the default AI training service
# account access to the bucket
# (https://stackoverflow.com/questions/58478478/how-can-i-mount-a-gcs-bucket-in-a-custom-docker-image-on-ai-platform#comment103289576_58478549)

# abort on errors, on unset variables, and on failures inside pipelines
set -euo pipefail

# absolute path of the directory that holds this script
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"

echo "Entered Foo Container (container id: $(cat /etc/hostname))" >&2
# If the caller has staged a script in /run/cmd, run it.
#
# A script whose first line is a shebang is made executable and executed as
# its own process; anything else is sourced into this shell.
# Returns non-zero (after a message on stderr) when /run/cmd does not exist.
runcmd() {
    local shebang_re='^#!.*$'
    local first_line

    echo "examining /run/cmd:" >&2

    if [[ ! -f /run/cmd ]]
    then
        echo "  /run/cmd does not exist, nothing to run" >&2
        return 1
    fi

    first_line="$(head -n 1 /run/cmd)"
    if [[ $first_line =~ $shebang_re ]]
    then
        # the first line is passed as an argument rather than embedded in the
        # format string, so '%' or '\' in it are printed literally
        printf '  It has a shebang, execution via:\n\t%s\n' "$first_line" >&2
        chmod +x /run/cmd
        /run/cmd
    else
        echo "  No shebang detected, sourcing /run/cmd..." >&2
        source /run/cmd
    fi
}
# Runtime setup hook: sources init.sh from the directory this entrypoint
# lives in, then reports completion on stderr.
# NOTE(review): init.sh is not part of the files shown here — confirm the
# image actually ships it next to the entrypoint.
initcontainer() {
    . "${DIR}/init.sh"
    echo "Foo Container Initialization Complete" >&2
}
# if the user volume-mounted a gcloud directory, assume their identity
if [[ -d /root/.config/gcloud ]]
then
    initcontainer
fi

if (( $# == 0 ))
then
    # no arguments were supplied
    # (test file descriptor 0: an interactive shell needs a terminal on stdin)
    if [[ -t 0 ]]
    then
        # stdin is a tty
        echo "No command supplied, going interactive..." >&2
        bash -i
    else
        # stdin is not a tty, try /run/cmd
        echo "No command supplied" >&2
        runcmd
    fi
elif [[ "$1" =~ ^gs://.*/.*script$ ]]
then
    # the first argument names a script in a bucket: stage it and run it
    echo "Fetching command from google cloud storage bucket into /run/cmd" >&2
    gsutil cp "$1" /run/cmd
    runcmd
else
    # anything else is treated as a shell command line
    echo "Running: \`$*\`" >&2
    # quoted so the arguments are not word-split or glob-expanded before
    # eval re-parses them
    eval "$@"
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# not my code | |
from sh import rm, gcloud, gsutil | |
import json | |
import sys | |
# my code | |
from env import PROJECT, REGION | |
from show import service_account | |
# given: | |
# - A service account you want to run as | |
# - A script you want to run | |
# - A timeout for the job (so it doesn't get stuck and cost you $60 like it did for me) | |
# This function will: | |
# - Create a temporary service account key for this job | |
# - Create a storage bucket for this job | |
# - In it, put a script that: | |
# - activates the job-runner service account using the temporary key | |
# - runs the given script with a timeout | |
# - stashes stdout, stderr, and the return code from the script's execution into the bucket | |
# - deactivates the temporary key | |
# It returns the URL of the job's bucket; the launch script is stored in it as run_script, which is consumed by the entrypoint
def make_job_bucket(job_id, timeout, inner_script):
    """Create a bucket for one job and stage its launch script there.

    Steps performed:
      - create the bucket ``gs://{PROJECT}_job_{job_id}`` in ``REGION`` with a
        lifecycle rule that deletes its objects after 5 days
      - create a new key for the first service account reported by
        ``service_account()``
      - generate a bash launch script that activates that key, runs
        ``inner_script`` under ``timeout``, copies stdout, stderr and the exit
        code into the bucket, and finally deletes the key
      - upload the launch script to ``<bucket>/run_script``

    Args:
        job_id: identifier used for the bucket name and temporary file names.
        timeout: duration handed to ``timeout(1)``, e.g. ``'72h'``.
        inner_script: full text of the script to run (shebang included).

    Returns:
        The bucket URL (``gs://...``); the launch script lives at
        ``<bucket>/run_script``.
    """
    # some local filenames specific to this job
    key_file = f'/dev/shm/{job_id}_key.json'
    lifecycle_file = f'/dev/shm/{job_id}_lifecycle.json'
    launch_script = f'/dev/shm/{job_id}_script'
    rm(['-f', key_file, lifecycle_file, launch_script])

    # create the storage bucket for this job
    print("## Make a bucket for the pending job", file=sys.stderr)
    bucket_id = f'gs://{PROJECT}_job_{job_id}'
    gsutil(['mb', '-l', REGION, bucket_id])

    # define a lifecycle event for this bucket (so it self-deletes eventually)
    lifecycle = {
        "lifecycle": {
            "rule": [
                {
                    "action": {"type": "Delete"},
                    "condition": {"age": 5},
                },
            ],
        },
    }
    try:
        with open(lifecycle_file, 'w') as file:
            file.write(json.dumps(lifecycle))
        gsutil(['lifecycle', 'set', lifecycle_file, bucket_id])
    finally:
        rm(['-f', lifecycle_file])

    print("## Make a key for the service account to use for pending job",
          file=sys.stderr)
    # get the service account that it will run as
    sa = list(service_account().keys())[0]

    # make a new key for this job only; the key file is removed even when
    # creating or reading it fails
    try:
        gcloud(['iam', 'service-accounts', 'keys', 'create', key_file,
                '--iam-account', sa])
        with open(key_file, 'r') as file:
            key = file.read()
    finally:
        rm(['-f', key_file])
    key_id = json.loads(key)['private_key_id']

    print("## Generate a script for the job to run at startup", file=sys.stderr)
    # this script will be placed in /run/cmd
    # (where it is expected that /root/entrypoint looks for instructions)
    #
    # Notes on the template below:
    #   - it starts at column 0 so the shebang and the heredoc terminators
    #     land at the start of their lines
    #   - the heredoc delimiters are quoted so the shell copies the key and
    #     the inner script verbatim instead of expanding $ and backticks
    #   - CODE is captured immediately after the timeout command; any command
    #     in between would overwrite $?
    try:
        with open(launch_script, 'w') as file:
            file.write(f'''#!/usr/bin/env bash
set -euo pipefail
1>&2 echo "## Injected Script Started"
# assume gcloud and gsutil exist in remote image (otherwise how did it get this script?)
# activate gcloud with the job runner sa key
cat << 'EOF!!' > /dev/shm/sa_key
{key}
EOF!!
1>&2 echo "## Authenticating With Supplied Key"
set -x
gcloud auth activate-service-account --key-file=/dev/shm/sa_key
gcloud config set project {PROJECT}
1>&2 echo "## Running The Job"
# (which may rely on the activated account)
mkdir -p /run
cat << 'EOF!!' > /run/inner_script
{inner_script}
EOF!!
chmod +x /run/inner_script
set +ex
printf '\\n\\n\\n'
timeout {timeout} /run/inner_script 1> >(tee /tmp/stdout ) 2> >(tee /tmp/stderr >&2 )
CODE=$?
printf '\\n\\n\\n'
set -ex
# report what happened with the command
case $CODE in
0)
echo "Job Command Succeeded"
;;
124)
echo "Job Command Timed Out After {timeout}"
;;
*)
echo "Job command Failed With Code $CODE"
;;
esac
# extract run results into the bucket
echo $CODE > /tmp/code
gsutil cp /tmp/code {bucket_id}/code
gsutil cp /tmp/stdout {bucket_id}/stdout
gsutil cp /tmp/stderr {bucket_id}/stderr
echo "Copied stdout, stderr, and the return code to {bucket_id}, which will self destruct in 5 days"
# deactivate the key once finished
gcloud iam service-accounts keys delete --iam-account {sa} {key_id} --quiet
echo "Job {job_id} is done"
exit $CODE
''')
        # put the script in the bucket (it's up to the image entrypoint to
        # fetch and run it)
        gsutil(['cp', launch_script, f'{bucket_id}/run_script'])
    finally:
        # the local copy embeds the private key, so never leave it behind
        rm(['-f', launch_script])

    return bucket_id
class Job:
    """One AI Platform training job that runs a script from its own bucket.

    Constructing a Job allocates an id and stages the launch script via
    ``make_job_bucket``; the ``run_*`` methods submit the job.
    """

    def __init__(self, image, command, timeout='72h'):
        """Stage ``command`` (script text) to run in ``image`` under ``timeout``."""
        # NOTE(review): foo_id is not imported in the visible part of this
        # file — confirm where it comes from.
        job_id = foo_id.get()
        self.id = job_id
        self.image = image
        self.job_name = f'job_{job_id}'
        self.bucket = make_job_bucket(job_id, timeout, command)

    def _run_job(self):
        """Submit the training job, passing the launch script URL to the image."""
        launch_url = f'{self.bucket}/run_script'
        submit_args = [
            'ai-platform', 'jobs', 'submit', 'training', self.job_name,
            '--region', REGION,
            '--master-image-uri', self.image,
            '--',
            launch_url,
        ]
        gcloud(submit_args)

    def run_synchronous(self):
        """Submit the job, stream its logs, and return the bucket URL."""
        self._run_job()
        stream_args = ['ai-platform', 'jobs', 'stream-logs', self.job_name]
        gcloud(stream_args)
        return self.bucket

    def run_asynchronous(self):
        """Submit the job, describe it once, and return the bucket URL."""
        self._run_job()
        describe_args = ['ai-platform', 'jobs', 'describe', self.job_name]
        gcloud(describe_args)
        return self.bucket
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
# this script injects the command below into an ai-platform training job
# the script will be run using the service account configured by job.py
# its outputs will be stored in a gcp storage bucket
from env import JOB_IMAGE
from job import Job

# The injected script reports which account gcloud is acting as.
# sh captures a command's output instead of echoing it, so the result is
# printed explicitly; otherwise the job's stdout would stay empty.
script = '''#! /usr/bin/env python3
from sh import gcloud
print(gcloud(['config', 'list', 'account', '--format', 'value(core.account)']))
'''

a_job = Job(JOB_IMAGE, script)
a_job.run_synchronous()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment