Skip to content

Instantly share code, notes, and snippets.

@dav1x
Created June 11, 2018 15:11
Show Gist options
  • Save dav1x/d499011568db55e145de684ea6878a90 to your computer and use it in GitHub Desktop.
Save dav1x/d499011568db55e145de684ea6878a90 to your computer and use it in GitHub Desktop.
Majority etcd set failure
Inventory file:
# Ansible inventory for the etcd members of the stretched cluster.
# All three masters that run etcd.
[etcd]
stretch-master-0.stretch.e2e.bos.redhat.com
stretch-master-1.stretch.e2e.bos.redhat.com
stretch-master-2.stretch.e2e.bos.redhat.com
# Members targeted by the "Simulate DR for ETCD" play, which stops etcd on
# them (the "primary DC" in that play's task name). Two of the three members.
[etcd-pri]
stretch-master-0.stretch.e2e.bos.redhat.com
stretch-master-1.stretch.e2e.bos.redhat.com
# Remaining member; the DR play runs against etcd-sec[0].
# NOTE(review): "sec" presumably means the secondary site - confirm.
[etcd-sec]
stretch-master-2.stretch.e2e.bos.redhat.com
---
# Failure simulation: stops the etcd unit on every host in the etcd-pri
# inventory group. Run this before the DR playbook to reproduce the outage.
- name: Simulate DR for ETCD
  hosts: etcd-pri
  tasks:
    - name: Stop the etcd services from primary DC
      systemd:
        name: etcd
        state: stopped
      # Managing a system unit needs root; escalate the same way the tasks
      # of the DR playbook do.
      become: true
[root@stretch-master-0 ~]# oc get node
Error from server (Timeout): the server was unable to return a response in the time allotted, but may still be processing the request (get nodes)
DR playbook:
---
# DR play: runs on the first host of the etcd-sec group. If etcdctl does not
# report a healthy cluster, etcd is restarted once with
# ETCD_FORCE_NEW_CLUSTER=true in /etc/etcd/etcd.conf, and the flag is then
# removed again so it does not persist in the configuration file.
- name: Ensure that we have a rw etcd
  hosts: etcd-sec[0]
  tasks:
    # stdout is the number of "cluster is healthy" lines printed by etcdctl:
    # '1' when healthy, '0' otherwise. The pipeline ends in `wc -l`, so the
    # task itself succeeds even when etcdctl exits non-zero.
    - name: Get the status of the etcd cluster
      shell: >-
        etcdctl -C https://{{ ansible_default_ipv4.address }}:2379
        --cert-file /etc/etcd/peer.crt
        --key-file /etc/etcd/peer.key
        --ca-file /etc/etcd/ca.crt
        cluster-health | grep "^cluster is healthy$" | wc -l
      become: true
      register: etcd_healthy_nodes
      # Read-only probe: never report it as a change.
      changed_when: false

    # Everything in this block runs only when the probe above found no
    # healthy cluster.
    - when: etcd_healthy_nodes.stdout == '0'
      become: true
      block:
        - name: Ensure we have the configuration folder
          file:
            path: /etc/systemd/system/etcd_container.service.d
            state: directory

        - name: If we don't have the quorum put the cluster in surviving mode
          lineinfile:
            path: /etc/etcd/etcd.conf
            regexp: '^ETCD_FORCE_NEW_CLUSTER='
            line: ETCD_FORCE_NEW_CLUSTER=true

        - name: Restart the etcd service in survive mode
          systemd:
            name: etcd
            state: restarted
            daemon_reload: true

        # The flag is only needed for the single restart above; drop it so a
        # later restart does not force a new cluster again.
        - name: Remove the surviving mode
          lineinfile:
            path: /etc/etcd/etcd.conf
            regexp: '^ETCD_FORCE_NEW_CLUSTER='
            state: absent
          register: change_file_result

    # `is changed` is the test syntax; the filter form `| changed` was
    # deprecated in Ansible 2.5 and removed in 2.9. A skipped
    # "Remove the surviving mode" task is not "changed", so this task is
    # skipped as well when the cluster was healthy.
    - name: Reload systemd daemons
      systemd:
        name: etcd
        state: started
        daemon_reload: true
      become: true
      when: change_file_result is changed
[root@stretch-master-0 ~]# oc get node
NAME STATUS ROLES AGE VERSION
stretch-app-0 Ready compute 3d v1.9.1+a0ce1bc657
stretch-app-1 Ready compute 3d v1.9.1+a0ce1bc657
[root@stretch-master-0 ~]# etcdctl -C https://stretch-master-2:2379 --cert-file /etc/etcd/peer.crt --key-file /etc/etcd/peer.key --ca-file /etc/etcd/ca.crt cluster-health
member 1f43aaf2ecf94cab is healthy: got healthy result from https://10.19.114.17:2379
cluster is healthy
Cluster recovery playbook:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment