Skip to content

Instantly share code, notes, and snippets.

@tommeramber
Last active June 11, 2025 09:24
Show Gist options
  • Save tommeramber/ae68e102392a5f4ccba956813d1cb9e7 to your computer and use it in GitHub Desktop.
Save tommeramber/ae68e102392a5f4ccba956813d1cb9e7 to your computer and use it in GitHub Desktop.
---
# Reacts to a "detect-nfs-stale" / "node-health-check" alert by applying a
# rebooter pod manifest for the affected OpenShift node.
#
# Input: the alert is supplied through the `payload` variable. The play reads
# `payload.labels.env` plus either `payload.labels.node` or
# `payload.labels.instance`.
# The per-environment vars file is expected to provide `openshift_api` and
# `openshift_token` (both are used below but defined outside this playbook).
- name: playbook reacting to alerts nfs-stale OR node-health-check
  hosts: localhost
  gather_facts: false

  vars_prompt:
    - name: payload
      prompt: ""
      private: false

  pre_tasks:
    # Both candidate labels are tried in order; when the alert carries both,
    # the later one ("instance") wins because it is assigned last.
    - name: set fact node_name based on existing label from alert
      set_fact:
        problematic_node: "{{ payload.labels[item] }}"
      loop:
        - node
        - instance
      when: payload.labels is defined and item in payload.labels

    # Stop with a readable message instead of an "undefined variable" error
    # further down when the alert lacks the labels this play depends on.
    - name: validate alert payload
      assert:
        that:
          - problematic_node is defined
          - problematic_node | string | length > 0
          - payload.labels.env is defined
        fail_msg: >-
          Alert payload must carry an 'env' label and either a 'node' or an
          'instance' label.

    - name: pre_tasks message
      debug:
        msg: "NODE-STUCK alert from OCP: {{ payload.labels.env }}, Node: {{ problematic_node }}"

    # Path relative to the running playbook location in repo
    - name: Include vars file
      include_vars:
        file: "../vars/{{ payload.labels.env }}.yaml"

  tasks:
    - name: Print alert details
      debug:
        msg: "OCP API: {{ openshift_api }}"

    # argv form avoids shell interpretation of the values; no_log keeps the
    # API token out of task output.
    - name: oc login
      command:
        argv:
          - oc
          - login
          - "--insecure-skip-tls-verify=true"
          - "--token={{ openshift_token }}"
          - "{{ openshift_api }}"
      no_log: true

    # The node name originates from the alert payload, so it is passed as a
    # single argv element rather than spliced into a shell command line.
    - name: oc get node
      command:
        argv:
          - oc
          - get
          - node
          - "{{ problematic_node }}"
      register: oc_get_node
      changed_when: false

    - name: debug oc get node
      debug:
        msg: "{{ oc_get_node.stdout_lines }}"

    # Safety gate: counts distinct firing alerts of the two handled types (via
    # the Alertmanager API inside the alertmanager pod) and the nodes listed
    # as NotReady or SchedulingDisabled. Exits non-zero while more than 3
    # alerts or more than 1 such node exist; the task is then retried up to
    # 10 times, 10 minutes apart, before failing the play.
    # No -it on `oc exec`: the play runs without a terminal.
    - name: Don't reboot node if you have more then 1 NotReady nodes or 3 alert simultaniously
      shell: |
        alertmanagerPod=$(oc -n openshift-monitoring get pod -l alertmanager=main -o jsonpath='{.items[0].metadata.name}')
        numOfAlertsSimultaniuously=$(oc -n openshift-monitoring exec "$alertmanagerPod" -- curl -s http://localhost:9093/api/v2/alerts | jq '.[] | select(.labels.alertname=="detect-nfs-stale" or .labels.alertname=="node-health-check") | .fingerprint' | sort -u | wc -l)
        numOfNotReadyNodes=$(oc get nodes | grep -E 'NotReady|SchedulingDisabled' | wc -l)
        if [ "$numOfAlertsSimultaniuously" -gt 3 ] || [ "$numOfNotReadyNodes" -gt 1 ] ; then
          exit 1;
        else
          exit 0;
        fi
      register: validations_status
      until: validations_status.rc == 0
      retries: 10
      delay: 600  # 10 minutes, in seconds
      changed_when: false

    # src is relative to the playbook; dest is inside the ansible playbook pod.
    - name: Create reboot node script
      template:
        src: ../templates/reboot_node_pod.yaml.j2
        dest: "/tmp/reboot_node_pod-{{ problematic_node }}.yaml"

    - name: print rebooter pod
      command:
        argv:
          - cat
          - "/tmp/reboot_node_pod-{{ problematic_node }}.yaml"
      register: rebooter_pod
      changed_when: false

    - name: debug rebooter pod
      debug:
        msg: "{{ rebooter_pod.stdout_lines }}"

    - name: Run reboot script
      command:
        argv:
          - oc
          - apply
          - -f
          - "/tmp/reboot_node_pod-{{ problematic_node }}.yaml"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment