Server names cl[0-9]*-hdp Master hdp-m-03.local:7051
###### missing #######
# Print (do not run) a "kudu remote_replica copy" command for every tablet
# that has a missing/warning replica and a healthy RUNNING leader.
#
# Flow: dump the ksck report, split it into one xx* file per "Tablet ..."
# section with csplit, then inspect each section on its own.
# Tablets whose leader is in "bad state" are skipped here.

# Work directory for the csplit pieces plus the directory holding the raw dump.
mkdir -p /tmp/test_kudu_data /tmp/csplit
rm -f /tmp/test_kudu_data/xx*
# csplit writes its xx* pieces into the current directory, so stop if we
# could not enter the work directory.
cd /tmp/test_kudu_data || exit 1

# The exit status of ksck is deliberately not checked: the report is needed
# precisely when ksck complains about the cluster.
sudo -u kudu kudu cluster ksck cl-hdp-m-03.local:7051 > /tmp/csplit/kudu_valery
csplit - '/^Tablet /' '{*}' < /tmp/csplit/kudu_valery

for file in xx*; do
  # An unmatched glob stays literal; skip it instead of grepping a non-file.
  [[ -e "${file}" ]] || continue

  grep -q missing "${file}" || continue
  # -F: match the text literally. As a regex, "[LEADER]" is a bracket
  # expression (one of the letters L, E, A, D, R) and never matches "[LEADER]".
  grep -qF 'bad state [LEADER]' "${file}" && continue

  # host:port of the running leader, taken from "uuid (host:port): RUNNING [LEADER]".
  healthy_s=$(grep -E 'cl[0-9]*-hdp' "${file}" \
    | grep -E ': RUNNING \[LEADER\]$' \
    | head -1 \
    | awk -F '(' '{print $2}' \
    | awk -F ')' '{print $1}' \
    | tr -d ' \n')

  # host:port of one replica reported as missing (or warning).
  missing_s=$(grep -E 'cl[0-9]*-hdp' "${file}" \
    | grep -E ': missing$|: warning$' \
    | sort -r \
    | head -1 \
    | awk -F '(' '{print $2}' \
    | awk -F ')' '{print $1}' \
    | tr -d ' \n')

  # Tablet id: second word of the "Tablet <id> of table ..." line.
  tablet_s=$(grep -E '^Tablet * ' "${file}" | awk '{print $2}')

  [[ -n "${healthy_s}" ]] || continue
  [[ -n "${missing_s}" ]] || continue
  [[ -n "${tablet_s}" ]] || continue

  echo "sudo -u kudu kudu remote_replica copy ${tablet_s} ${healthy_s} ${missing_s} -force_copy"
done

Server names cl[0-9]*-hdp Master hdp-m-03.local:7051
############# bad status ###############
# Print (do not run) a "kudu remote_replica copy" command for every tablet
# that has a follower replica in "bad state" and a healthy RUNNING leader.
# After each command, the replica lines of that tablet are printed for review.
#
# Tablets with a missing replica, or whose leader is in "bad state", are
# skipped here.

# Work directory for the csplit pieces plus the directory holding the raw dump.
mkdir -p /tmp/test_kudu_data /tmp/csplit
rm -f /tmp/test_kudu_data/xx*
# csplit writes its xx* pieces into the current directory, so stop if we
# could not enter the work directory.
cd /tmp/test_kudu_data || exit 1

# The exit status of ksck is deliberately not checked: the report is needed
# precisely when ksck complains about the cluster.
sudo -u kudu kudu cluster ksck hdp-m-03.local:7051 > /tmp/csplit/kudu_valery
csplit - '/^Tablet /' '{*}' < /tmp/csplit/kudu_valery

for file in xx*; do
  # An unmatched glob stays literal; skip it instead of grepping a non-file.
  [[ -e "${file}" ]] || continue

  grep -q missing "${file}" && continue
  # -F: match the text literally. As a regex, "[LEADER]" is a bracket
  # expression (one of the letters L, E, A, D, R) and never matches "[LEADER]".
  grep -qF 'bad state [LEADER]' "${file}" && continue

  # host:port of the running leader, taken from "uuid (host:port): RUNNING [LEADER]".
  healthy_s=$(grep -E 'cl[0-9]*-hdp' "${file}" \
    | grep -E ': RUNNING \[LEADER\]$' \
    | head -1 \
    | awk -F '(' '{print $2}' \
    | awk -F ')' '{print $1}' \
    | tr -d ' \n')

  # host:port of one follower replica reported as "bad state".
  badstate_s=$(grep -E 'cl[0-9]*-hdp' "${file}" \
    | grep -E ': bad state$' \
    | sort -r \
    | head -1 \
    | awk -F '(' '{print $2}' \
    | awk -F ')' '{print $1}' \
    | tr -d ' \n')

  # Tablet id: second word of the "Tablet <id> of table ..." line.
  tablet_s=$(grep -E '^Tablet * ' "${file}" | awk '{print $2}')

  [[ -n "${healthy_s}" ]] || continue
  [[ -n "${badstate_s}" ]] || continue
  [[ -n "${tablet_s}" ]] || continue

  echo "sudo -u kudu kudu remote_replica copy ${tablet_s} ${healthy_s} ${badstate_s} -force_copy"
  # Show the replica lines so the generated command can be checked by eye.
  grep -E 'cl[0-9]*-hdp' "${file}"
done

- Show issue
[root@cl01-hdp-m-01 test_kudu_data]# sudo -u kudu kudu cluster ksck --tablets=c5faaf8340af4af9945455855b241856 cl01-hdp-m-03.local:7051
Connected to the Master
Fetched info from all 6 Tablet Servers
Tablet c5faaf8340af4af9945455855b241856 of table 'table' is unavailable: 2 replica(s) not RUNNING
6a84969e6908461a84dd9dd7189dd769 (cl01-hdp-w-07.local:7050): missing
97fbdfe92ec542d0b27214a070d0d3cb (cl01-hdp-w-09.local:7050): bad state [LEADER]
State: NOT_STARTED
Data state: TABLET_DATA_TOMBSTONED
Last status: Tablet initializing...
fd98b625ed064b5b9f2d09bbb2d5f3c2 (cl01-hdp-w-05.local:7050): RUNNING
1 replicas' active configs differ from the master's.
All the peers reported by the master and tablet servers are:
A = 6a84969e6908461a84dd9dd7189dd769
B = 97fbdfe92ec542d0b27214a070d0d3cb
C = fd98b625ed064b5b9f2d09bbb2d5f3c2
The consensus matrix is:
Config source | Voters | Current term | Config index | Committed?
---------------+------------------------+--------------+--------------+------------
master | A B* C | | | Yes
A | [config not available] | | |
B | [config not available] | | |
C | A B C | 18 | 15 | Yes
Table table has 1 unavailable tablet(s)
WARNING: 1 out of 1 table(s) are not in a healthy state
==================
Errors:
==================
table consistency check error: Corruption: 1 table(s) are bad
FAILED
Runtime error: ksck discovered errors
- Fix
# Print (do not run) the repair sequence for tablets of f_events_demand_agg
# that have, at the same time:
#   - a replica reported as missing (or warning),
#   - a leader replica in "bad state", and
#   - a healthy RUNNING follower to copy from.
# For each such tablet the printed commands delete the bad leader replica,
# re-check the tablet, copy it from the healthy follower to the missing
# server, and re-check again.

# Work directory for the csplit pieces plus the directory holding the raw dump.
mkdir -p /tmp/test_kudu_data /tmp/csplit
rm -f /tmp/test_kudu_data/xx*
# csplit writes its xx* pieces into the current directory, so stop if we
# could not enter the work directory.
cd /tmp/test_kudu_data || exit 1

# The exit status of ksck is deliberately not checked: the report is needed
# precisely when ksck complains about the cluster.
sudo -u kudu kudu cluster ksck cl01-hdp-m-03.local:7051 > /tmp/csplit/kudu_valery
csplit - '/^Tablet /' '{*}' < /tmp/csplit/kudu_valery

for file in xx*; do
  # An unmatched glob stays literal; skip it instead of grepping a non-file.
  [[ -e "${file}" ]] || continue

  grep -q f_events_demand_agg "${file}" || continue
  grep -q missing "${file}" || continue
  # This case REQUIRES a leader in "bad state". The earlier
  # "grep -q 'bad state [LEADER]' ... && continue" was a copy-paste leftover
  # that only did nothing because "[LEADER]" is a regex bracket expression;
  # matched literally it would have skipped every tablet handled here.
  grep -qF 'bad state [LEADER]' "${file}" || continue

  # host:port of the leader replica that is in "bad state".
  badstate_l=$(grep -E 'cl[0-9]*-hdp' "${file}" \
    | grep -E 'bad state \[LEADER\]$' \
    | head -1 \
    | awk -F '(' '{print $2}' \
    | awk -F ')' '{print $1}' \
    | tr -d ' \n')

  # host:port of one replica reported as missing (or warning).
  missing_s=$(grep -E 'cl[0-9]*-hdp' "${file}" \
    | grep -E ': missing$|: warning$' \
    | sort -r \
    | head -1 \
    | awk -F '(' '{print $2}' \
    | awk -F ')' '{print $1}' \
    | tr -d ' \n')

  # host:port of a healthy follower (plain "RUNNING", not the leader).
  healthy_s=$(grep -E 'cl[0-9]*-hdp' "${file}" \
    | grep -E ': RUNNING$' \
    | head -1 \
    | awk -F '(' '{print $2}' \
    | awk -F ')' '{print $1}' \
    | tr -d ' \n')

  # Tablet id: second word of the "Tablet <id> of table ..." line.
  tablet_s=$(grep -E '^Tablet * ' "${file}" | awk '{print $2}')

  [[ -n "${badstate_l}" ]] || continue
  [[ -n "${missing_s}" ]] || continue
  [[ -n "${tablet_s}" ]] || continue
  [[ -n "${healthy_s}" ]] || continue

  # Table name: first word after "of table" on the "Tablet ..." line.
  table=$(grep -E '^Tablet * ' "${file}" | awk -F 'of table' '{print $2}' | awk '{print $1}')

  echo "==== ${tablet_s} on $table =========="
  echo "echo $table ;sudo -u kudu kudu remote_replica delete ${badstate_l} ${tablet_s} bugfix"
  echo "echo $table ;sleep 5;sudo -u kudu kudu cluster ksck --tablets=${tablet_s} cl01-hdp-m-03.local:7051|grep 'Deleted tablet blocks from disk'"
  echo "echo $table ;sudo -u kudu kudu remote_replica copy ${tablet_s} ${healthy_s} ${missing_s}"
  echo "echo $table ;sudo -u kudu kudu cluster ksck --tablets=${tablet_s} cl01-hdp-m-03.local:7051"
  # Show the replica lines so the generated commands can be checked by eye.
  grep -E 'cl[0-9]*-hdp' "${file}"
done

https://kudu.apache.org/docs/command_line_tools_reference.html#cluster-ksck
kudu cluster ksck hdp-m-01.local,hdp-m-02.local,hdp-m-03.local

Kudu tablet servers are not resistant to disk failure. When a disk containing a data directory or the write-ahead log (WAL) dies, the entire tablet server must be rebuilt. Kudu will automatically re-replicate tablets on other servers after a tablet server fails, but manual intervention is needed in order to restore the failed tablet server to a running state.
- stop kudu tablet server
- remove the existing data directories and WAL directory
rm -rf /opt/cloudera/data/*/kudu-t/*
- start kudu tablet server