Created
December 4, 2022 21:37
-
-
Save mdrakiburrahman/50c0606491fdcde68ad90850c00007fa to your computer and use it in GitHub Desktop.
Chaos Mesh Workflow Jinja template for YAML generation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ========================================================================== | |
# SQLINSTANCE-HA | |
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | |
# WORKFLOW DURATION: 18m | |
# -------------------------------------------------------------------------- | |
# CHAOS DESCRIPTION | |
# -------------------------------------------------------------------------- | |
# In this chaos simulation we target a 2 or 3 replica MIAA: | |
# | |
# 1. Setup a User DB | |
# | |
# 2. Setup 10+ pods that continuously attempt to write transactions to the | |
# primary endpoint in an infinite loop | |
# | |
# 3. Kill replicas in an orchestrated manner to try to force a failover and
#    then promote a lagging replica, then repeatedly kill the previous 2
#    primary replicas
# | |
# 4. Kill random pods, and sqlserver and orchestrator containers | |
# | |
# 5. Degrade storage, burn CPU/Mem and partially corrupt networking | |
# | |
# -------------------------------------------------------------------------- | |
# VAR SUBSTITUTIONS | |
# -------------------------------------------------------------------------- | |
# - CHAOS_INSTANCE_NAME - Name of SQL Instance | |
# - CHAOS_PLUGIN_NAMESPACE - Arc Data Namespace | |
# - CHAOS_INSTANCE_USERNAME - Username of instance admin | |
# - CHAOS_INSTANCE_PASSWORD - Password of instance admin | |
# - CHAOS_PLUGIN - Sonobuoy Plugin & file; e.g. '{{ CHAOS_PLUGIN }}.yaml.tmpl' | |
# ========================================================================== | |
# Chaos Mesh Workflow resource; name and namespace are filled in by the
# template renderer.
apiVersion: chaos-mesh.org/v1alpha1
kind: Workflow
metadata:
  # Quoted so the template itself is valid YAML and the rendered values are
  # always parsed as strings (never as bool/int/null).
  name: "{{ CHAOS_PLUGIN }}"
  namespace: "{{ CHAOS_PLUGIN_NAMESPACE }}"
spec:
  # Template the workflow starts from.
  entry: serial-root
  templates:
# ==================== entry point ==================== | |
# Workflow entry template (Serial): DB setup, then the initial load phase,
# then the parallel chaos phase. The 18m deadline matches the workflow
# duration stated in the file header.
- name: serial-root
  deadline: 18m
  templateType: Serial
  children:
    - setup-user-db
    - fill-user-db
    - parallel-root
# Although the deadline is 2 minutes, the task pods stay up until the
# 'Workflow' Custom Resource is explicitly deleted. Chaos Mesh has a design
# gap that works in our favor here and keeps the transaction load running:
# it does not expect tasks to contain infinite loops, and relies on the
# pods exiting naturally.
#
# Load phase (Parallel, 2m): ten concurrent inserters plus replica-to-replica
# network faults.
- name: fill-user-db
  templateType: Parallel
  deadline: 2m
  children:
    # Ten copies of the infinite insert loop.
    - repeat-perform-transactions
    - repeat-perform-transactions
    - repeat-perform-transactions
    - repeat-perform-transactions
    - repeat-perform-transactions
    - repeat-perform-transactions
    - repeat-perform-transactions
    - repeat-perform-transactions
    - repeat-perform-transactions
    - repeat-perform-transactions
    # These kick in at the 1m mark; the goal is to block transactions from
    # being acked by replicas.
    - partial-database-to-database-corrupt
    - partial-database-to-database-duplicate
    - partial-database-to-database-delay
    - partial-database-to-database-loss
# Main chaos phase (Parallel): compute, storage, network and resource-stress
# faults, alongside one more inserter.
- name: parallel-root
  templateType: Parallel
  children:
    # Kept as a single inserter in case Chaos Mesh fixes the task pod cleanup
    # that currently keeps the fill-user-db inserters alive.
    - repeat-perform-transactions
    - parallel-compute-killer
    - parallel-storage-killer
    - parallel-network-killer
    - burn-cpu-mem
# ================== compute chaos ================== | |
# //////////////////// KILL PODS //////////////////// | |
# =================================================== | |
# ==================== parallels ==================== | |
# Compute chaos (Parallel, 13m): every pod/container killer in this file.
- name: parallel-compute-killer
  deadline: 13m
  templateType: Parallel
  children:
    # Should exit first; tries to get a low-sequence instance back.
    - repeat-kill-2-container
    # Should exit next; leaves two instances with similar sequence numbers.
    - orchestrated-0-1-kills
    # The remaining children exit at the end.
    - repeat-kill-ha-orchestrator-pod
    - repeat-kill-ha-supervisor-container
    - repeat-kill-random-database-pod
    - repeat-kill-controldb-pod
# Parallel group: the endless failover loop together with the repeated
# container kills on replicas 0 and 1.
- name: repeat-kill-0-1-container
  templateType: Parallel
  children:
    - repeat-force-failover-tsql
    - repeat-kill-0-container
    - repeat-kill-1-container
# ==================== orchestrated ==================== | |
# Scripted sequence (Serial, 10m): alternates failover requests, single
# kills and sleeps, then hands over to the repeating kills.
- name: orchestrated-0-1-kills
  deadline: 10m
  templateType: Serial
  children:
    - force-failover-tsql
    # So that network changes take effect immediately.
    - kill-ha-pod
    - kill-0-container
    - force-failover-tsql
    - sleep-for-paxos
    - kill-1-container
    - force-failover-tsql
    - sleep-for-paxos
    - repeat-kill-0-1-container
# ==================== repeat kills ==================== | |
# Every 10s, kill the 'arc-sqlmi' container in the pod named <instance>-0.
- name: repeat-kill-0-container
  templateType: Schedule
  schedule:
    schedule: '@every 10s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: PodChaos
    podChaos:
      action: container-kill
      mode: one
      containerNames:
        - arc-sqlmi
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          statefulset.kubernetes.io/pod-name: "{{ CHAOS_INSTANCE_NAME }}-0"
# Every 10s, kill the 'arc-sqlmi' container in the pod named <instance>-1.
- name: repeat-kill-1-container
  templateType: Schedule
  schedule:
    schedule: '@every 10s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: PodChaos
    podChaos:
      action: container-kill
      mode: one
      containerNames:
        - arc-sqlmi
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          statefulset.kubernetes.io/pod-name: "{{ CHAOS_INSTANCE_NAME }}-1"
# Every 10s for 5m, kill the 'arc-sqlmi' container in the pod named
# <instance>-2.
- name: repeat-kill-2-container
  templateType: Schedule
  deadline: 5m
  schedule:
    schedule: '@every 10s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: PodChaos
    podChaos:
      action: container-kill
      mode: one
      containerNames:
        - arc-sqlmi
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          statefulset.kubernetes.io/pod-name: "{{ CHAOS_INSTANCE_NAME }}-2"
# Every 7m, kill one pod labelled as the 'orchestrator' component of the
# instance.
- name: repeat-kill-ha-orchestrator-pod
  templateType: Schedule
  schedule:
    schedule: '@every 7m'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: PodChaos
    podChaos:
      action: pod-kill
      mode: one
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: orchestrator
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# Every 400s, kill the 'arc-ha-supervisor' container in one pod labelled as
# the 'database' component of the instance.
- name: repeat-kill-ha-supervisor-container
  templateType: Schedule
  schedule:
    schedule: '@every 400s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: PodChaos
    podChaos:
      action: container-kill
      mode: one
      containerNames:
        - arc-ha-supervisor
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: database
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# Every 500s, kill one pod labelled as the 'database' component of the
# instance (mode 'one' picks a single matching pod).
- name: repeat-kill-random-database-pod
  templateType: Schedule
  schedule:
    schedule: '@every 500s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: PodChaos
    podChaos:
      action: pod-kill
      mode: one
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: database
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# Every 7m, kill one pod labelled 'app: controldb' in the target namespace.
- name: repeat-kill-controldb-pod
  templateType: Schedule
  schedule:
    schedule: '@every 7m'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: PodChaos
    podChaos:
      action: pod-kill
      mode: one
      selector:
        # Templated value is quoted so it always renders as a string.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app: controldb
# ==================== single kills ==================== | |
# One-shot PodChaos (1m deadline): kill all pods labelled as the
# 'orchestrator' component of the instance.
- name: kill-ha-pod
  templateType: PodChaos
  deadline: 1m
  podChaos:
    action: pod-kill
    mode: all
    selector:
      # Templated values are quoted so they always render as strings.
      namespaces:
        - "{{ CHAOS_PLUGIN_NAMESPACE }}"
      labelSelectors:
        app.kubernetes.io/component: orchestrator
        app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# One-shot PodChaos (1m deadline): kill the 'arc-sqlmi' container in the pod
# named <instance>-0.
- name: kill-0-container
  templateType: PodChaos
  deadline: 1m
  podChaos:
    action: container-kill
    mode: all
    containerNames:
      - arc-sqlmi
    selector:
      # Templated values are quoted so they always render as strings.
      namespaces:
        - "{{ CHAOS_PLUGIN_NAMESPACE }}"
      labelSelectors:
        statefulset.kubernetes.io/pod-name: "{{ CHAOS_INSTANCE_NAME }}-0"
# One-shot PodChaos (1m deadline): kill the 'arc-sqlmi' container in the pod
# named <instance>-1.
- name: kill-1-container
  templateType: PodChaos
  deadline: 1m
  podChaos:
    action: container-kill
    mode: all
    containerNames:
      - arc-sqlmi
    selector:
      # Templated values are quoted so they always render as strings.
      namespaces:
        - "{{ CHAOS_PLUGIN_NAMESPACE }}"
      labelSelectors:
        statefulset.kubernetes.io/pod-name: "{{ CHAOS_INSTANCE_NAME }}-1"
# ==================== sleep ==================== | |
# Pause step: a busybox task that sleeps for 60 seconds (deadline 60s).
- name: sleep-for-paxos
  templateType: Task
  deadline: 60s
  task:
    container:
      # Container name typo ('main-contaienr') corrected.
      name: main-container
      image: busybox
      command:
        - sh
        - -c
        - sleep 60
# ====================== t-sql ====================== | |
# One pass over replicas 0..2: run
# "ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY)" against each one
# via sqlcmd. 'set +e' keeps the loop going when a replica is unreachable.
#
# The username and password are single-quoted so that shell metacharacters
# in the rendered values are not interpreted by /bin/sh. A value containing
# a single quote would still break the command.
- name: force-failover-tsql
  templateType: Task
  deadline: 60s
  task:
    container:
      name: tsql
      image: mcr.microsoft.com/mssql-tools
      command:
        - /bin/sh
        - -c
      args:
        - |-
          set +e;
          echo "Testing all 3 replicas for failover:";
          for replica in $(seq 0 2); do
            echo \"================================= {{ CHAOS_INSTANCE_NAME }}-$replica ====================================\";
            /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-$replica.{{ CHAOS_INSTANCE_NAME }}-svc,1433 -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY);";
            echo \"=============================================================================\";
          done;
# Chaos Mesh doesn't allow wrapping Schedule CRD on top of task CRDs | |
# (yet), so unfortunately we have to copy-paste the simple loop above. | |
# | |
# Endless version of force-failover-tsql: loops forever over replicas 0..2,
# running "ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY)" against
# each one. It has no deadline of its own.
#
# The username and password are single-quoted so that shell metacharacters
# in the rendered values are not interpreted by /bin/sh. A value containing
# a single quote would still break the command.
- name: repeat-force-failover-tsql
  templateType: Task
  task:
    container:
      name: tsql
      image: mcr.microsoft.com/mssql-tools
      command:
        - /bin/sh
        - -c
      args:
        - |-
          set +e;
          echo "Testing all 3 replicas for failover:";
          while true; do
            for replica in $(seq 0 2); do
              echo \"================================= {{ CHAOS_INSTANCE_NAME }}-$replica ====================================\";
              /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-$replica.{{ CHAOS_INSTANCE_NAME }}-svc,1433 -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "ALTER AVAILABILITY GROUP current SET (ROLE = SECONDARY);";
              echo \"=============================================================================\";
            done;
          done;
# Setup step (180s deadline): against the <instance>-p-svc endpoint, drop and
# recreate the [UserDB] database, then drop and recreate [dbo].[UserTable]
# (id int, name varchar(50)).
#
# The username and password are single-quoted so that shell metacharacters
# in the rendered values are not interpreted by /bin/sh. A value containing
# a single quote would still break the command.
- name: setup-user-db
  templateType: Task
  deadline: 180s
  task:
    container:
      name: tsql
      image: mcr.microsoft.com/mssql-tools
      command:
        - /bin/sh
        - -c
      args:
        - |-
          set +e;
          echo "Dropping database, if exists:";
          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "master" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "DROP DATABASE IF EXISTS [UserDB];";
          echo "Setting up database:";
          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "master" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "CREATE DATABASE [UserDB];";
          echo "Dropping table:";
          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "DROP TABLE IF EXISTS [dbo].[UserTable];";
          echo "Creating table:";
          /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "CREATE TABLE [dbo].[UserTable] (id int, name varchar(50));";
          echo "Done!";
# Infinite inserter: repeatedly INSERTs (i, '<plugin name>') into
# [dbo].[UserTable] through the <instance>-p-svc endpoint, incrementing i
# after each attempt. 'set +e' keeps the loop alive across failed inserts;
# sqlcmd is invoked with '-t 1 -l 1' (query and login timeouts).
#
# The username and password are single-quoted so that shell metacharacters
# in the rendered values are not interpreted by /bin/sh. A value containing
# a single quote would still break the command.
- name: repeat-perform-transactions
  templateType: Task
  task:
    container:
      name: tsql
      image: mcr.microsoft.com/mssql-tools
      command:
        - /bin/sh
        - -c
      args:
        - |-
          set +e;
          echo "Running infinite insertion loop";
          i=0;
          while true; do
            echo "Attempting - $i";
            /opt/mssql-tools/bin/sqlcmd -S {{ CHAOS_INSTANCE_NAME }}-p-svc,1433 -d "UserDB" -U '{{ CHAOS_INSTANCE_USERNAME }}' -P '{{ CHAOS_INSTANCE_PASSWORD }}' -Q "INSERT INTO [dbo].[UserTable] (id, name) VALUES ($i, '{{ CHAOS_PLUGIN }}');" -t 1 -l 1;
            i=$((i+1));
          done
# ================ storage parallel ================= | |
# //////////// DESTROY CRITICAL STORAGE ///////////// | |
# =================================================== | |
# Storage chaos (Parallel): write-latency and read-latency schedules.
- name: parallel-storage-killer
  templateType: Parallel
  children:
    - storage-delay-write
    - storage-delay-read
# ================== storage chaos ================== | |
# //////////// var/opt/mssql/data //////////// | |
# =================================================== | |
# ====================== write ====================== | |
# Every 120s, add 150ms latency to 15% of WRITE calls under /var/opt/mssql
# on all 'database' pods of the instance.
# NOTE(review): unlike the NetworkChaos schedules in this file, no `duration`
# is set here — confirm the fault is meant to stay active between runs.
- name: storage-delay-write
  templateType: Schedule
  schedule:
    schedule: '@every 120s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: IOChaos
    ioChaos:
      action: latency
      mode: all
      delay: 150ms
      percent: 15
      methods:
        - WRITE
      volumePath: /var/opt/mssql
      # Quoted because the value contains glob characters.
      path: '/var/opt/mssql/**/*'
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: database
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# ====================== read ====================== | |
# Every 80s, add 150ms latency to 15% of READ calls under /var/opt/mssql
# on all 'database' pods of the instance.
# NOTE(review): unlike the NetworkChaos schedules in this file, no `duration`
# is set here — confirm the fault is meant to stay active between runs.
- name: storage-delay-read
  templateType: Schedule
  schedule:
    schedule: '@every 80s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: IOChaos
    ioChaos:
      action: latency
      mode: all
      delay: 150ms
      percent: 15
      methods:
        - READ
      volumePath: /var/opt/mssql
      # Quoted because the value contains glob characters.
      path: '/var/opt/mssql/**/*'
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: database
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# ================== comms parallel ================= | |
# /////////// DESTROY COMMS CHANNELS ///////////// | |
# =================================================== | |
# Network chaos (Parallel): orchestrator-facing faults first, then the
# replica-to-replica faults.
- name: parallel-network-killer
  templateType: Parallel
  children:
    # Faults applied to the orchestrator pods.
    - partial-k8s-block-ha
    - partial-database-to-ha-block
    # Faults between database pods.
    - partial-database-to-database-corrupt
    - partial-database-to-database-duplicate
    - partial-database-to-database-delay
    - partial-database-to-database-loss
# ====================== k8s ====================== | |
# Every 60s, for 55s: corrupt 40% of packets (correlation 20) sent from the
# instance's 'orchestrator' pods to the listed external targets.
- name: partial-k8s-block-ha
  templateType: Schedule
  schedule:
    schedule: '@every 60s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: NetworkChaos
    networkChaos:
      action: corrupt
      mode: all
      duration: 55s
      direction: to
      corrupt:
        corrupt: '40'
        correlation: '20'
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: orchestrator
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
      externalTargets:
        - '10.0.0.0/16'  # AKS default
        - '100.64.0.0/16'  # Our Kubeadm setup
        - '10.42.0.0/24'  # K3s default
        - '172.30.0.0/16'  # OpenShift default
        - kubernetes.default.svc
# ====================== database-to-ha ====================== | |
# Every 60s, for 55s: corrupt 45% of packets (correlation 40) in both
# directions between the instance's 'orchestrator' pods and its 'database'
# pods.
- name: partial-database-to-ha-block
  templateType: Schedule
  schedule:
    schedule: '@every 60s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: NetworkChaos
    networkChaos:
      action: corrupt
      mode: all
      duration: 55s
      direction: both
      corrupt:
        corrupt: '45'
        correlation: '40'
      # Templated values below are quoted so they always render as strings.
      selector:
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: orchestrator
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
      target:
        mode: all
        selector:
          namespaces:
            - "{{ CHAOS_PLUGIN_NAMESPACE }}"
          labelSelectors:
            app.kubernetes.io/component: database
            app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# ====================== database-to-database ====================== | |
# Every 60s, for 15s: corrupt 40% of packets (correlation 25) sent between
# the instance's 'database' pods (source and target use the same selector).
- name: partial-database-to-database-corrupt
  templateType: Schedule
  schedule:
    schedule: '@every 60s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: NetworkChaos
    networkChaos:
      action: corrupt
      mode: all
      duration: 15s
      direction: to
      corrupt:
        corrupt: '40'
        correlation: '25'
      # Templated values below are quoted so they always render as strings.
      selector:
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: database
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
      target:
        mode: all
        selector:
          namespaces:
            - "{{ CHAOS_PLUGIN_NAMESPACE }}"
          labelSelectors:
            app.kubernetes.io/component: database
            app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# Every 60s, for 10s: duplicate 40% of packets (correlation 25) sent between
# the instance's 'database' pods (source and target use the same selector).
- name: partial-database-to-database-duplicate
  templateType: Schedule
  schedule:
    schedule: '@every 60s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: NetworkChaos
    networkChaos:
      action: duplicate
      mode: all
      duration: 10s
      direction: to
      duplicate:
        duplicate: '40'
        correlation: '25'
      # Templated values below are quoted so they always render as strings.
      selector:
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: database
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
      target:
        mode: all
        selector:
          namespaces:
            - "{{ CHAOS_PLUGIN_NAMESPACE }}"
          labelSelectors:
            app.kubernetes.io/component: database
            app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# Every 60s, for 5s: add 90ms latency with 90ms jitter (correlation 25) to
# traffic sent between the instance's 'database' pods (source and target use
# the same selector).
- name: partial-database-to-database-delay
  templateType: Schedule
  schedule:
    schedule: '@every 60s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: NetworkChaos
    networkChaos:
      action: delay
      mode: all
      duration: 5s
      direction: to
      delay:
        latency: '90ms'
        jitter: '90ms'
        correlation: '25'
      # Templated values below are quoted so they always render as strings.
      selector:
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: database
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
      target:
        mode: all
        selector:
          namespaces:
            - "{{ CHAOS_PLUGIN_NAMESPACE }}"
          labelSelectors:
            app.kubernetes.io/component: database
            app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# Every 60s, for 24s: drop 25% of packets (correlation 25) sent between the
# instance's 'database' pods (source and target use the same selector).
- name: partial-database-to-database-loss
  templateType: Schedule
  schedule:
    schedule: '@every 60s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: NetworkChaos
    networkChaos:
      action: loss
      mode: all
      duration: 24s
      direction: to
      loss:
        loss: '25'
        correlation: '25'
      # Templated values below are quoted so they always render as strings.
      selector:
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/component: database
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
      target:
        mode: all
        selector:
          namespaces:
            - "{{ CHAOS_PLUGIN_NAMESPACE }}"
          labelSelectors:
            app.kubernetes.io/component: database
            app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
# ==================== utilities ==================== | |
# //////////// MISC //////////// | |
# =================================================== | |
# Every 120s, stress all pods labelled 'part-of: SqlManagedInstance' for the
# instance: 2 memory workers at 256Mi and 4 CPU workers at load 100.
# NOTE(review): unlike the NetworkChaos schedules in this file, no `duration`
# is set here — confirm the stress is meant to stay active between runs.
- name: burn-cpu-mem
  templateType: Schedule
  schedule:
    schedule: '@every 120s'
    startingDeadlineSeconds: null
    concurrencyPolicy: Forbid
    historyLimit: 1
    type: StressChaos
    stressChaos:
      mode: all
      selector:
        # Templated values are quoted so they always render as strings.
        namespaces:
          - "{{ CHAOS_PLUGIN_NAMESPACE }}"
        labelSelectors:
          app.kubernetes.io/part-of: SqlManagedInstance
          app.kubernetes.io/instance: "{{ CHAOS_INSTANCE_NAME }}"
      stressors:
        cpu:
          workers: 4
          load: 100
        memory:
          workers: 2
          size: 256Mi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment