Last active
June 11, 2025 09:24
-
-
Save tommeramber/ae68e102392a5f4ccba956813d1cb9e7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Remediation playbook for the "detect-nfs-stale" / "node-health-check" alerts.
#
# Flow: resolve the affected node from the alert labels, load the
# per-environment vars file, log in to the cluster with `oc`, wait until the
# safety gate passes, then render and apply the "rebooter" pod manifest.
#
# Inputs:
#   payload  - alert object; this play reads payload.labels.node,
#              payload.labels.instance and payload.labels.env.
#              NOTE(review): presumably passed as structured extra-vars (which
#              suppresses the empty prompt below) - confirm with the caller.
#   openshift_api, openshift_token
#            - not defined in this file; presumably provided by
#              ../vars/<env>.yaml - verify.
- name: playbook reacting to alerts nfs-stale OR node-health-check
  hosts: localhost
  gather_facts: false
  vars_prompt:
    - name: payload
      prompt: ""
      private: false

  pre_tasks:
    # Both labels are tried in order and each match overwrites the fact, so
    # `instance` takes precedence over `node` when the alert carries both.
    - name: set fact node_name based on existing label from alert
      set_fact:
        problematic_node: "{{ payload.labels[item] }}"
      loop:
        - node
        - instance
      when: payload.labels is defined and item in payload.labels

    # Without this guard a payload lacking both labels only fails later, with
    # an "undefined variable" error that hides the real cause.
    - name: fail early when the alert has no node or instance label
      assert:
        that:
          - problematic_node is defined
        fail_msg: >-
          Alert payload has neither a 'node' nor an 'instance' label;
          cannot determine which node to act on.

    - name: pre_tasks message
      debug:
        msg: "NODE-STUCK alert from OCP: {{ payload.labels.env }}, Node: {{ problematic_node }}"

    - name: Include vars file
      # Path relative to the running playbook location in repo
      include_vars: "../vars/{{ payload.labels.env }}.yaml"

  tasks:
    - name: Print alert details
      debug:
        msg: "OCP API: {{ openshift_api }}"

    # argv avoids shell interpretation of the arguments; no_log keeps the
    # bearer token out of the task output and logs (it also hides the login
    # error text, so disable it temporarily when debugging login failures).
    - name: oc login
      command:
        argv:
          - oc
          - login
          - --insecure-skip-tls-verify=true
          - "--token={{ openshift_token }}"
          - "{{ openshift_api }}"
      no_log: true

    # The node name originates from the alert, so it is passed as a single
    # argv element rather than interpolated into a shell command line.
    - name: oc get node
      command:
        argv:
          - oc
          - get
          - node
          - "{{ problematic_node }}"
      register: oc_get_node
      changed_when: false

    - name: debug oc get node
      debug:
        msg: "{{ oc_get_node.stdout_lines }}"

    # Safety gate: exits 1 ("not safe yet") while more than 3 distinct
    # detect-nfs-stale/node-health-check alerts are present in Alertmanager or
    # more than 1 node line matches NotReady/SchedulingDisabled. The task is
    # retried up to 10 times, 10 minutes apart, and fails the play if the gate
    # never passes.
    - name: "Don't reboot node if you have more then 1 NotReady nodes or 3 alert simultaniously"
      shell: |
        # Fail closed: if the cluster state cannot be read, report "not safe"
        # instead of treating the missing data as zero alerts / zero nodes.
        alertmanager_pod=$(
          oc -n openshift-monitoring get pod -l alertmanager=main \
            -o jsonpath='{.items[0].metadata.name}'
        ) || exit 1
        # No -it here: the task has neither a TTY nor an interactive stdin.
        alerts_json=$(
          oc -n openshift-monitoring exec "$alertmanager_pod" -- \
            curl -s http://localhost:9093/api/v2/alerts
        ) || exit 1
        node_list=$(oc get nodes) || exit 1

        active_alerts=$(
          printf '%s\n' "$alerts_json" |
            jq '.[]
                | select(.labels.alertname=="detect-nfs-stale"
                         or .labels.alertname=="node-health-check")
                | .fingerprint' |
            sort -u | wc -l
        )
        unavailable_nodes=$(
          printf '%s\n' "$node_list" |
            grep -E 'NotReady|SchedulingDisabled' | wc -l
        )

        if [ "$active_alerts" -gt 3 ] || [ "$unavailable_nodes" -gt 1 ]; then
          exit 1
        fi
        exit 0
      register: validations_status
      until: validations_status.rc == 0
      retries: 10
      delay: 600  # 10 minutes, in seconds
      changed_when: false

    - name: Create reboot node script
      template:
        src: ../templates/reboot_node_pod.yaml.j2  # relative location to the playbook
        dest: "/tmp/reboot_node_pod-{{ problematic_node }}.yaml"  # inside the ansible playbook pod

    # Echo the rendered manifest into the play output before it is applied.
    - name: print rebooter pod
      command:
        argv:
          - cat
          - "/tmp/reboot_node_pod-{{ problematic_node }}.yaml"
      register: rebooter_pod
      changed_when: false

    - name: debug rebooter pod
      debug:
        msg: "{{ rebooter_pod.stdout_lines }}"

    - name: Run reboot script
      command:
        argv:
          - oc
          - apply
          - -f
          - "/tmp/reboot_node_pod-{{ problematic_node }}.yaml"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment