Skip to content

Instantly share code, notes, and snippets.

@johnallen3d
Last active October 8, 2021 12:56
Show Gist options
  • Save johnallen3d/26a175ba23271cc63f65226aa996958a to your computer and use it in GitHub Desktop.
Save johnallen3d/26a175ba23271cc63f65226aa996958a to your computer and use it in GitHub Desktop.
Basic Kedro + Docker Setup

Basic Kedro + Docker Setup

Prerequisites

  • Docker
  • Docker Compose
  • nib

Setup

nib build --pull

Usage Examples

# kedro cli help
nib run kedro -h

# run a jupyter lab server: http://127.0.0.1:8888/lab?token=token
nib run --service-ports kedro jupyter

# run the kedro visualizer: http://localhost:4141/
nib run --service-ports kedro viz

# start an ipython session
nib run kedro ipython

# start a bash session
nib shell kedro

# run the test suite
nib run kedro test

Debugging

To add a "breakpoint" for interactive debugging (for example, when running the test suite), add the following to the function where you would like to start your interactive session.

import ipdb; ipdb.set_trace()

Adding a Library

Add new library dependencies to src/requirements.in, then run the following command.

nib run kedro build-reqs && nib build --pull

Packaging a Pipeline

pipeline_name="curate_repos" # ex. curate_repos
nib run kedro pipeline package $pipeline_name
# CI compose file: defines the primary service plus test/lint variants that
# reuse its build and mounts via a YAML anchor.
version: '2'
services:
  # the service here is not "web" but the template expects there to be a
  # primary service named web
  web: &base
    build:
      context: .
    volumes:
      # mount the checkout over the image's app dir so CI runs current code
      - '$PWD:/usr/src/app'
    ports:
      # quoted to avoid YAML sexagesimal/number parsing of port mappings
      - '4141:4141'
      - '8888:8888'
  test:
    <<: *base
    entrypoint: kedro test
  lint:
    <<: *base
    entrypoint: kedro lint
# Azure Pipelines: feature-branch CI pipeline.
# Verifies the build source and Azure Boards work-item linkage, then builds
# the Docker image via a shared template (no deploy on feature branches).
trigger:
  branches:
    include:
      - feature/*
pr: none

pool:
  vmImage: ubuntu-latest

resources:
  repositories:
    - repository: 'PSCorePipelineTemplates'
      type: github
      name: technekes/ps-core-pipeline-templates
      endpoint: technekes
      ref: 'refs/heads/version/1.0'

variables:
  - name: encryptionKey
    value: $(KEY)
  - name: secretLocation
    value: '.azure-pipelines/secrets.encrypted'
  - name: buildArgs
    value: $(BUILD_ARGS)
  - name: dockerCredentialLocation
    value: '.azure-pipelines/dockercfg.encrypted'
  - name: dockerComposeLocation
    value: '.azure-pipelines/docker-compose-ci.yml'
  - name: dockerComposeFileName
    value: 'docker-compose.yml'
  - name: buildRepositoryName
    value: $(Build.Repository.Name)
  - name: buildSourceVersion
    value: $(Build.SourceVersion)
  - name: regionName
    value: 'us-east-1'
  - name: imageSourcePush
    value: 'imagename'
  - name: imageSourcePull
    value: 'imagetag'
  - name: azureApiKey
    value: $(ADO_API_KEY)
  - name: herokuApiToken
    value: $(HEROKU_API_TOKEN)
  # IF COPYING THIS INTO A NEW API - CHANGE THE VALUES BELOW APPROPRIATELY
  - name: repositoryName
    value: 'REPO_NAME'
  - name: awsCredentials
    value: 'IAM - automation-ecr-APPLICATION_NAME'
  - name: clientName
    value: 'CLIENT_NAME'
  - name: applicationName
    value: 'APPLICATION_NAME'
  - name: projectName
    value: 'TKXS%20PowerSuite'

stages:
  - stage: PreBuildCheck
    displayName: 'Pre-Build Check(s)'
    jobs:
      - job: Verify
        steps:
          # fail fast unless the build came from a PR or a feature/* branch
          - task: Bash@3
            displayName: 'Verifying Build Request Source'
            inputs:
              targetType: 'inline'
              script: |
                if [ $(Build.Reason) == "PullRequest" ] || [ $(echo $(Build.SourceBranch) | awk -F '/' '{print $3}') == "feature" ]; then
                  echo "Source confirmed, proceeding."
                else
                  echo "##vso[task.LogIssue type=error;]Only Feature branches can be considered on this pipeline."
                  echo "Build Reason: $(Build.Reason)"
                  echo "Branch Detected: $(echo $(Build.SourceBranch) | awk -F '/' '{print $3}')"
                  exit 1
                fi
          # collect unique AB#<id> work-item references from the build's commits
          - task: Bash@3
            name: 'BuildLinkage'
            displayName: 'Verifying Work Linkage Candidates'
            inputs:
              targetType: 'inline'
              script: |
                # Associative array is used to ensure a unique list of User Story IDs
                declare -A STORIES
                # API call to get commit information and data for a specific Azure Pipelines build
                BUILD_CHANGES=$(curl -s -u $(ADO_API_KEY) -X GET "https://dev.azure.com/tkxs/TKXS%20PowerSuite/_apis/build/builds/$(Build.BuildId)/changes?api-version=6.0")
                # Get the number of commits in an Azure Pipelines build
                COMMIT_COUNT=$(echo $BUILD_CHANGES | jq -r .count)
                # Loop through each commit that leveraged the Azure Boards plugin and extract the Story ID
                for (( i=0; i<$COMMIT_COUNT; i++ ))
                do
                  BUILD_MESSAGE=$(echo $BUILD_CHANGES | jq -r .value[$i].message)
                  PLUGIN_STRINGS=( $(echo "$BUILD_MESSAGE" | grep -oE "(AB#)([0-9])+") )
                  for x in "${!PLUGIN_STRINGS[@]}"
                  do
                    STORY_ID="$(echo "${PLUGIN_STRINGS[$x]}" | grep -oE "([0-9])+")"
                    if [ -z "${STORIES[$STORY_ID]}" ]; then
                      echo "Unique Story ID ($STORY_ID) found."
                      STORIES[$STORY_ID]="TRUE"
                    fi
                  done
                done
                # Assign a special variable that is reachable across yaml stages
                echo "##vso[task.setvariable variable=StoryIDs;isOutput=true]${!STORIES[@]}"
  - stage: Build
    displayName: 'Build Project'
    dependsOn: PreBuildCheck
    condition: succeeded()
    variables:
      # surface the Story IDs emitted by the PreBuildCheck stage
      - name: StoryIDs
        value: $[ stageDependencies.PreBuildCheck.Verify.outputs['BuildLinkage.StoryIDs'] ]
    jobs:
      - job: Build
        steps:
          - template: ps-core-docker-build-template.yml@PSCorePipelineTemplates
            parameters:
              deployBuild: 'None'
              projectName: $(projectName)
              linkageIDs: $(StoryIDs)
              encryptionKey: $(encryptionKey)
              secretLocation: $(secretLocation)
              buildArgs: $(buildArgs)
              dockerComposeLocation: $(dockerComposeLocation)
              dockerComposeFileName: $(dockerComposeFileName)
              buildRepositoryName: $(buildRepositoryName)
              buildSourceVersion: $(buildSourceVersion)
              awsCredentials: $(awsCredentials)
              regionName: $(regionName)
              imageSource: $(imageSourcePush)
              repositoryName: $(repositoryName)
# Azure Pipelines: master/hotfix pipeline.
# Verifies source branch and work-item linkage, builds the image, and
# optionally deploys to Production behind a manual parameter + approval.
parameters:
  # - name: deployDev
  #   displayName: Deploy to Dev?
  #   type: string
  #   default: 'No'
  #   values:
  #     - 'No'
  #     - 'Yes'
  - name: deployProd
    displayName: Deploy to Production? (Approval Required)
    type: string
    # quoted: bare No/Yes are YAML 1.1 booleans, not the strings the
    # eq('${{ parameters.deployProd }}', 'Yes') condition compares against
    default: 'No'
    values:
      - 'No'
      - 'Yes'

trigger:
  branches:
    include:
      - master
      - hotfix/*
pr: none

pool:
  vmImage: ubuntu-latest

resources:
  repositories:
    - repository: 'PSCorePipelineTemplates'
      type: github
      name: technekes/ps-core-pipeline-templates
      endpoint: technekes
      ref: 'refs/heads/version/1.0'

variables:
  - name: encryptionKey
    value: $(KEY)
  - name: secretLocation
    value: '.azure-pipelines/secrets.encrypted'
  - name: buildArgs
    value: $(BUILD_ARGS)
  - name: dockerCredentialLocation
    value: '.azure-pipelines/dockercfg.encrypted'
  - name: dockerComposeLocation
    value: '.azure-pipelines/docker-compose-ci.yml'
  - name: dockerComposeFileName
    value: 'docker-compose.yml'
  - name: buildRepositoryName
    value: $(Build.Repository.Name)
  - name: buildSourceVersion
    value: $(Build.SourceVersion)
  - name: regionName
    value: 'us-east-1'
  - name: imageSourcePush
    value: 'imagename'
  - name: imageSourcePull
    value: 'imagetag'
  - name: azureApiKey
    value: $(ADO_API_KEY)
  - name: herokuApiToken
    value: $(HEROKU_API_TOKEN)
  # IF COPYING THIS INTO A NEW API - CHANGE THE VALUES BELOW APPROPRIATELY
  - name: repositoryName
    value: 'REPO_NAME'
  - name: awsCredentials
    value: 'IAM - automation-ecr-APPLICATION_NAME'
  - name: clientName
    value: 'CLIENT_NAME'
  - name: applicationName
    value: 'APPLICATION_NAME'
  - name: projectName
    value: 'TKXS%20PowerSuite'

stages:
  - stage: PreBuildCheck
    displayName: 'Pre-Build Check(s)'
    jobs:
      - job: Verify
        steps:
          # fail fast unless the build came from master or a hotfix/* branch
          - task: Bash@3
            displayName: 'Verifying Build Request Source'
            inputs:
              targetType: 'inline'
              script: |
                if [ $(echo $(Build.SourceBranch) | awk -F '/' '{print $3}') == "master" ] || [ $(echo $(Build.SourceBranch) | awk -F '/' '{print $3}') == "hotfix" ]; then
                  echo "Branch confirmed, proceeding . . ."
                else
                  echo "##vso[task.LogIssue type=error;]Only Master and Hotfix branches are valid for this pipeline."
                  echo "Build Reason: $(Build.Reason)"
                  echo "Branch Detected: $(echo $(Build.SourceBranch) | awk -F '/' '{print $3}')"
                  exit 1
                fi
          # collect unique AB#<id> work-item references from the build's commits
          - task: Bash@3
            name: 'BuildLinkage'
            displayName: 'Verifying Work Linkage Candidates'
            inputs:
              targetType: 'inline'
              script: |
                # Associative array is used to ensure a unique list of User Story IDs
                declare -A STORIES
                # API call to get commit information and data for a specific Azure Pipelines build
                BUILD_CHANGES=$(curl -s -u $(ADO_API_KEY) -X GET "https://dev.azure.com/tkxs/TKXS%20PowerSuite/_apis/build/builds/$(Build.BuildId)/changes?api-version=6.0")
                # Get the number of commits in an Azure Pipelines build
                COMMIT_COUNT=$(echo $BUILD_CHANGES | jq -r .count)
                # Loop through each commit that leveraged the Azure Boards plugin and extract the Story ID
                for (( i=0; i<$COMMIT_COUNT; i++ ))
                do
                  BUILD_MESSAGE=$(echo $BUILD_CHANGES | jq -r .value[$i].message)
                  PLUGIN_STRINGS=( $(echo "$BUILD_MESSAGE" | grep -oE "(AB#)([0-9])+") )
                  for x in "${!PLUGIN_STRINGS[@]}"
                  do
                    STORY_ID="$(echo "${PLUGIN_STRINGS[$x]}" | grep -oE "([0-9])+")"
                    if [ -z "${STORIES[$STORY_ID]}" ]; then
                      echo "Unique Story ID ($STORY_ID) found."
                      STORIES[$STORY_ID]="TRUE"
                    fi
                  done
                done
                # Assign a special variable that is reachable across yaml stages
                echo "##vso[task.setvariable variable=StoryIDs;isOutput=true]${!STORIES[@]}"
  - stage: Build
    displayName: 'Build Project'
    dependsOn: PreBuildCheck
    condition: succeeded()
    variables:
      # surface the Story IDs emitted by the PreBuildCheck stage
      - name: StoryIDs
        value: $[ stageDependencies.PreBuildCheck.Verify.outputs['BuildLinkage.StoryIDs'] ]
    jobs:
      - job: Build
        steps:
          - template: ps-core-docker-build-template.yml@PSCorePipelineTemplates
            parameters:
              deployBuild: 'true'
              projectName: $(projectName)
              linkageIDs: $(StoryIDs)
              encryptionKey: $(encryptionKey)
              secretLocation: $(secretLocation)
              buildArgs: $(buildArgs)
              dockerComposeLocation: $(dockerComposeLocation)
              dockerComposeFileName: $(dockerComposeFileName)
              buildRepositoryName: $(buildRepositoryName)
              buildSourceVersion: $(buildSourceVersion)
              awsCredentials: $(awsCredentials)
              regionName: $(regionName)
              imageSource: $(imageSourcePush)
              repositoryName: $(repositoryName)
  # - stage: DEV
  #   displayName: 'Deploy to DEV Environment'
  #   dependsOn:
  #     - PreBuildCheck
  #     - Build
  #   condition: and(eq(dependencies.Build.result, 'Succeeded'), eq('${{ parameters.deployDev }}', 'Yes'))
  #   variables:
  #     - name: StoryIDs
  #       value: $[ stageDependencies.PreBuildCheck.Verify.outputs['BuildLinkage.StoryIDs'] ]
  #   jobs:
  #     - template: ps-core-docker-deploy-template.yml@PSCorePipelineTemplates
  #       parameters:
  #         stageName: Development
  #         environmentName: dev
  #         linkageIDs: $(StoryIDs)
  #         clientName: $(clientName)
  #         applicationName: $(applicationName)
  #         projectName: $(projectName)
  #         dockerCredentialLocation: $(dockerCredentialLocation)
  #         herokuApiToken: $(herokuApiToken)
  #         encryptionKey: $(encryptionKey)
  #         buildRepositoryName: $(repositoryName)
  #         buildSourceVersion: $(buildSourceVersion)
  #         awsCredentials: $(awsCredentials)
  #         regionName: $(regionName)
  #         imageSource: $(imageSourcePull)
  #         imageName: 'IMAGE_NAME'
  - stage: Production
    displayName: 'Deploy to Production Environment'
    dependsOn:
      - PreBuildCheck
      - Build
    # fixed: the result check must be an eq() comparison; passing
    # dependencies.Build.result and 'Succeeded' as separate and() arguments
    # made the success check always-truthy
    condition: and(eq(dependencies.Build.result, 'Succeeded'), eq('${{ parameters.deployProd }}', 'Yes'))
    variables:
      - name: StoryIDs
        value: $[ stageDependencies.PreBuildCheck.Verify.outputs['BuildLinkage.StoryIDs'] ]
    jobs:
      - template: ps-core-docker-deploy-template.yml@PSCorePipelineTemplates
        parameters:
          stageName: Prod
          environmentName: prod
          linkageIDs: $(StoryIDs)
          clientName: $(clientName)
          applicationName: $(applicationName)
          projectName: $(projectName)
          dockerCredentialLocation: $(dockerCredentialLocation)
          herokuApiToken: $(herokuApiToken)
          encryptionKey: $(encryptionKey)
          buildRepositoryName: $(repositoryName)
          buildSourceVersion: $(buildSourceVersion)
          awsCredentials: $(awsCredentials)
          regionName: $(regionName)
          imageSource: $(imageSourcePull)
          imageName: 'IMAGE_NAME'
# Local development compose file: single `kedro` service; commands are routed
# through docker-entrypoint.sh (e.g. `jupyter`, `viz`, `test`).
version: '2'
services:
  kedro:
    build:
      context: .
    volumes:
      # mount the working copy over the image's app dir for live editing
      - '$PWD:/usr/src/app'
    ports:
      # quoted to avoid YAML number parsing of port mappings
      - '4141:4141'
      - '8888:8888'
#! /usr/bin/env bash
# Container entrypoint: expands shorthand commands (viz/jupyter/bash) into
# full invocations, then execs `kedro` with the resulting arguments.

# deal with `nib` inserting itself here (e.g. shell history)
args="$(echo "${@}" | tail -n 2 | sed -z '$ s/\n$//' | xargs)"

if [ "${args}" = 'viz' ]; then
  # bind to all interfaces so the host can reach the visualizer
  args="viz --host 0.0.0.0"
elif [ "${args}" = 'jupyter' ]; then
  args="jupyter lab --ip 0.0.0.0 --allow-root --no-browser --LabApp.token=token"
elif [ "${args}" = 'bash' ]; then
  # exec replaces this process, so nothing after this line runs
  # (the unreachable `exit 0` that used to follow has been removed)
  exec /bin/bash
fi

echo "Running: kedro ${args}"
# ${args} is intentionally unquoted so the accumulated flags word-split
exec kedro ${args}
# Image for a Kedro pipeline project; `project-name` is a placeholder that
# the scaffold script replaces via sed.
FROM python:3.8.11-buster as builder
ENV PROJECT_NAME project-name
ENV PROJECT_PATH src/${PROJECT_NAME}
# install top-level tooling requirements, then drop the copied file from the layer
COPY requirements.txt /tmp/requirements.txt
RUN \
pip install -r /tmp/requirements.txt && \
rm -f /tmp/requirements.txt
WORKDIR /usr/src/app
# COPY in files required by `kedro` to trick it into thinking we have a
# complete project
COPY pyproject.toml .
COPY ${PROJECT_PATH}/cli.py ${PROJECT_PATH}/
# the existence of this file is required by `kedro`, but if the actual
# file were used we would have to COPY more and more project contents
RUN touch ${PROJECT_PATH}/settings.py
# install project dependencies
COPY src/requirements.* src/
RUN kedro install
COPY . .
# jupyter lab port (the viz port 4141 is mapped in docker-compose)
EXPOSE 8888
RUN chmod +x docker-entrypoint.sh
ENTRYPOINT ["./docker-entrypoint.sh"]
CMD ["kedro", "run"]
# `kedro-version` is a placeholder replaced with the real version by the
# scaffold script via sed
kedro==kedro-version
# pinned because 3.1.6 incompatibility with Kedro.
# see this issue to track state:
# https://github.com/quantumblacklabs/kedro/issues/888
dynaconf==3.1.5
#! /usr/bin/env bash
# Scaffold a new Kedro + Docker project (see the Gist README for usage).
# First, confirm the operator has fresh AWS credentials before doing any work,
# since later steps (nib/ECR) depend on them.
while true; do
  # fixed prompt typo: "Do you a valid" -> "Do you have a valid"
  read -p "Do you have a valid/refreshed AWS CLI token? y/N " yn
  case $yn in
    [Yy]* ) break;;
    [Nn]* ) exit 1;;
    * ) echo "Please answer yes or no.";;
  esac
done
workdir="/usr/src/app"
gist="26a175ba23271cc63f65226aa996958a"
# $1: human-readable project name, e.g. "Awesome Pipeline"
project_name="${1}"
# derive names directly with parameter expansion (the previous
# `$(echo ...)` subshells were redundant):
#   "Awesome Pipeline" -> awesome-pipeline / awesome_pipeline
downcased_name="${project_name,,}"
repo_name="${downcased_name// /-}"
python_package="${downcased_name// /_}"
# $2: optional kedro version, defaulting to 0.17.2
kedro_version="${2:-0.17.2}"
# the repo-name prefix determines the client
if [[ $repo_name == cdt-* ]]; then
  client_name="cdt"
elif [[ $repo_name == ps-core-* ]]; then
  client_name="ps-core"
else
  echo "Unknown 'client name'. Currently supported client names 'cdt', 'ps-core'"
  exit 1
fi
# application name is the repo name with the client prefix stripped
application_name="${repo_name//$client_name-/}"
# create configuration file for `kedro new`
echo "output_dir: /usr/src/app
project_name: ${project_name}
repo_name: ${repo_name}
python_package: ${python_package}" > config.yml
# clone shell repo created by TechOps
rm -rf tmp
git clone [email protected]:technekes/${repo_name}.git tmp
echo "scaffolding project: ${repo_name}"
# scaffold the project in a one-off Docker container using
# * `kedro new`
# * a gist with additional files
# * some `sed`
# * `kedro build-reqs`
# NOTE(review): the entire script below is one double-quoted bash -c string;
# the unescaped inner quotes (e.g. around ./${gist}-master/) briefly close
# and reopen that string, so those fragments are expanded by the HOST shell.
# It appears to work because no unquoted whitespace is introduced — confirm
# before editing. Also note the duplicated src__requirements.in sed line.
docker run --rm --workdir $workdir -v $PWD:$workdir python:3.8-buster bash -c "
echo 'install kedro'
pip install kedro==${kedro_version}
echo 'run kedro new'
kedro new --config config.yml
rm config.yml
cd ${repo_name}
curl -L https://gist.github.com/johnallen3d/$gist/download \
--output gist
unzip gist
cd "./${gist}-master/"
mv * ../
cd ..
rm -rf "./${gist}-master/"
rm gist
rm -f scaffold
sed -i \"s/Basic Kedro + Docker Setup/$project_name/g\" README.md
sed -i \"s/project-name/$python_package/g\" Dockerfile
sed -i \"s/kedro-version/$kedro_version/g\" requirements.txt
sed -i \"s/kedro-version/$kedro_version/g\" src__requirements.in
sed -i \"s/kedro-version/$kedro_version/g\" src__requirements.in
sed -i \"s/line_length = 88/line_length = 80/g\" pyproject.toml
sed -i \"s/max-line-length=88/max-line-length=80/g\" setup.cfg
echo \$'\n[tool.black]\nline-length = 80\nexclude = \'cli\\.py\'' >> pyproject.toml
echo \$'exclude=\n src/**/__init__.py \n src/**/__main__.py \n src/**/cli.py \n src/**/hooks.py \n src/**/pipeline.py \n src/tests/test_run.py' >> setup.cfg
echo 'FOO=bar' > secrets.env
mv src__requirements.in src/requirements.in
mkdir -p .azure-pipelines
mv azure-pipelines__docker-compose-ci.yml .azure-pipelines/docker-compose-ci.yml
mv azure-pipelines__pipeline_feature.yml .azure-pipelines/pipeline_feature.yml
mv azure-pipelines__pipeline_master_hotfix.yml .azure-pipelines/pipeline_master_hotfix.yml
sed -i \"s/REPO_NAME/${repo_name}/g\" .azure-pipelines/pipeline_*.yml
sed -i \"s/CLIENT_NAME/${client_name}/g\" .azure-pipelines/pipeline_*.yml
sed -i \"s/APPLICATION_NAME/${application_name}/g\" .azure-pipelines/pipeline_*.yml
echo \$'\nsecrets.*' >> .gitignore
kedro build-reqs
"
# copy git directory from cloned repo into newly scaffolded project
mv tmp/.git/ ${repo_name}/
rm -rf tmp/
cd $repo_name
# initialize `nib` secret handling and encrypt the placeholder secrets file
# created inside the container above
nib crypt-init
nib encrypt secrets.env .azure-pipelines/secrets.encrypted
# build image and lint
nib build --pull
nib run kedro lint
# commit scaffolding to a new branch
git checkout -b feature/scaffold-kedro
git add .
git commit -m "Scaffold kedro project
New project scaffolded with a Dockerized environment using the scaffold
script provided here:
https://gist.github.com/johnallen3d/26a175ba23271cc63f65226aa996958a
"
# final instructions for the operator
echo "
Project generated -- push this branch and create a PR
> cd $repo_name
> gh pr create --label maintenance --fill
"
# src/requirements.in -- input to `kedro build-reqs` (pip-compile).
# `kedro-version` is a placeholder replaced by the scaffold script via sed.
black==v19.10b0
dynaconf==3.1.5
flake8-docstrings
flake8>=3.7.9, <4.0
gcsfs==0.7.2
great_expectations
ipdb
ipykernel~=5.3
ipython-sql
ipython>=7.10
isort>=4.3.21, <5.0
jaydebeapi
jupyter_client>=5.1, <7.0
jupyterlab==0.31.1
jupyter~=1.0
kedro-viz
kedro==kedro-version
kedro[pandas]==kedro-version
nbstripout==0.3.3
psycopg2
pytest-cov~=2.5
pytest-mock>=1.7.1, <2.0
pytest~=6.1.2
s3fs==0.4.0
sphinx~=3.4.3
sqlalchemy
wheel==0.32.2
@johnallen3d
Copy link
Author

johnallen3d commented Jun 4, 2021

Scaffold a new project:

# change directories to your pipelines folder
cd ~/pipelines
# download scaffold script from this Gist and make it executable
curl -s https://gist.githubusercontent.com/johnallen3d/26a175ba23271cc63f65226aa996958a/raw/scaffold > scaffold && chmod +x scaffold
# execute scaffold script providing a human readable name for your new pipeline
./scaffold "Awesome Pipeline"
# change directories to your new pipeline
cd awesome-pipeline/
# check out the README for "Usage Examples"
cat README.md

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment