Post Mortem
Date: 2017-04-11
Impact:
- np.k8s.saltside.io internal service disrupted
- tiller unavailable
- unable to create new pods or recover failed pods
#!/usr/bin/env bash | |
set -euo pipefail | |
main() { | |
local release="${1:-}" test_output pod | |
test_output="$(mktemp)" | |
if [ -z "${release}" ]; then | |
echo "USAGE: ${0} RELEASE" 1>&2 |
--- Cleaning | |
--> Deleting release test-bikroy | |
==> No release found; deleting manually | |
==> Deleting any dangling Service | |
No resources found | |
==> Deleting any dangling Deployment | |
No resources found | |
==> Deleting any dangling Pod | |
No resources found | |
==> Deleting any dangling Secret |
=> Attemping | |
--- Cleaning | |
--> Deleting release test-bikroy | |
==> No release found; deleting manually | |
==> Deleting any dangling Service | |
No resources found | |
==> Deleting any dangling Deployment | |
No resources found | |
==> Deleting any dangling Pod |
--- Cleaning | |
--> Deleting release test-bikroy | |
==> Found helm release; deleting with --purge | |
--> Awaiting resource deleting confirmation | |
--> Awaiting helm confirmation | |
--> Deleting release test-ikman | |
==> Found helm release; deleting with --purge | |
--> Awaiting resource deleting confirmation | |
test-ikman Pod still running. 0/15 tests completed; retrying. | |
NAME READY STATUS RESTARTS AGE |
=> Attemping | |
--- Cleaning | |
--> Deleting release test-bikroy | |
==> No release found; deleting manually | |
==> Deleting any dangling Service | |
No resources found | |
==> Deleting any dangling Deployment | |
No resources found | |
==> Deleting any dangling Pod |
#!/usr/bin/env bash | |
# This script is a workaround for https://github.com/kubernetes/helm/issues/2288. | |
# helm install --wait should do everything this script does. It should be deleted | |
# when the bug is fixed. | |
set -euo pipefail | |
main() { | |
local counter=0 release timeout pod_status |
Post Mortem
Date: 2017-04-11
Impact:
- name: Validate cluster name lengths | |
assert: | |
that: | |
- "len('bastions.' + item.name) <= 32" | |
- "len('api.' + item.name) <= 32" | |
with_items: "{{ production_k8s_deployments }}" |
- name: Check test pods | |
shell: > | |
kops export kubecfg {{ item[0].name }} | |
&& kubectl get pod image-pull-test -o yaml --template {%- raw -%}'{{.status.phase}}'{%- endraw -%} --namespace {{ item[1] }} | |
environment: | |
KOPS_STATE_STORE: "s3://{{ kops_prereqs.stack_outputs.Bucket }}" | |
with_subelements: | |
- "{{ production_k8s_deployments }}" | |
- namespaces | |
until: result.stdout.lower() == 'Running' |
let redis = require('redis'); | |
let retry = require('retry') | |
function ping(callback) { | |
let operation = retry.operation({ | |
retries: process.env.ATTEMPTS ? parseInt(process.env.ATTEMPTS) : 60, | |
factor: 1, | |
randomize: false | |
}); |