Last active
May 24, 2023 21:02
-
-
Save hexfusion/dc9d82caa4b20ed8d1c29d01f9a57f0c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# sniff hypervisor | |
dmesg | grep Hypervisor | |
[ 0.000000] Hypervisor detected: KVM | |
# get key in bytes | |
etcdctl get $key -w fields | grep -oP "(?<=Value\" : \").*" | wc -c | |
# defrag status in MB | |
cat etcd_info/endpoint_status.json | jq '(.[0].Status.dbSize - .[0].Status.dbSizeInUse)/1000/1000' | |
3095.384064 | |
# count objects | |
sh-4.4# etcdctl get / --prefix --keys-only | sed '/^$/d' | cut -d/ -f3 | sort | uniq -c | sort -rn | |
# jq skip non json | |
# -R reads each line as a raw string; `fromjson?` drops lines that are not valid JSON.
cat "$json" | jq -R 'fromjson? | .query'
# time range | |
cat $json | jq -R 'fromjson? | select((.ts >= "2021-04-14T02:48") and (.ts <= "2021-04-14T03:50"))' | |
# hh new revision | |
oc patch etcd cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge | |
# parse protobuf | |
cat data-1/member/snap/*.snap | protoc --decode_raw | |
# debug | |
curl --cert $ETCDCTL_CERT --key $ETCDCTL_KEY --cacert $ETCDCTL_CACERT -L https://127.0.0.1:2379/config/local/log -XPUT -d '{"level": "error"}' | |
curl --cert $ETCDCTL_CERT --key $ETCDCTL_KEY --cacert $ETCDCTL_CACERT -L https://127.0.0.1:2379/config/local/log -XPUT -d '{"level": "debug"}' | |
# grab slow queries and leader elections | |
# First expression prints "<timestamp> <duration>" for "took too long (...)" lines;
# second prints "<timestamp> elected leader <id> at term <n>".
# Seconds and fractional seconds use [0-9] so values containing a zero (":10.", ".05") match.
sed -rn -e 's/.*\s([0-9]{4}-[0-9]{1,2}-[0-9]{2}\s[0-9]{1,2}\:[0-9]{1,2}\:[0-9]{1,2}\.[0-9]{1,10}).*\btook too long \(([0-9]{1,4}\.[0-9]{1,12}s).*/\1 \2/p' -e 's/.*\s([0-9]{4}-[0-9]{1,2}-[0-9]{2}\s[0-9]{1,2}\:[0-9]{1,2}\:[0-9]{1,2}\.[0-9]{1,10}).*(elected leader [0-9a-z]{1,20} at term [0-9]{1,5}).*/\1 \2/p' file
# metrics | |
grep -oP '(?<=took too long \().*(\d{1,6})' * | sort | |
# grafana dashboards | |
3070 # etcd default | |
https://github.com/cloud-bulldozer/arsenal/blob/master/openshift-performance-dashboard/grafana/on-cluster-latest.json | |
# run etcd metrics dump | |
curl https://gist.githubusercontent.com/hexfusion/f9a10ef97ca2bbd70b754a038c4e05c2/raw/9e9d2c877116c801417778e17027fc19d4798bd7/ocp4-etcd-get-metrics.sh | bash | |
# grab size from etcd metrics
grep -oP '(?<=size:)[0-9]+' | sort -n | |
# use bbolt to fix freelist | |
bbolt compact -o ./fixed.db ./snapshot-apa600001.db | |
# observe writes | |
echo 1 > /proc/sys/vm/block_dump | |
journalctl -f | |
# heavy compact | |
# Take the first "revision" value found in the endpoint status JSON, then compact up to it.
rev=$(etcdctl3 endpoint status --write-out="json" | grep -Eo '"revision":[0-9]*' | grep -Eo -m1 '[0-9]*')
etcdctl3 compact "$rev"
# metrics | |
histogram_quantile(0.95, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance,le)) | |
# number of daily leader elections | |
changes(etcd_server_leader_changes_seen_total{job="$etcd_name"}[1d]) | |
# percentage of roundtrip below 6.4ms | |
sum(rate(etcd_network_peer_round_trip_time_seconds_bucket{le="0.0064"}[5m])) by (instance) / sum(rate(etcd_network_peer_round_trip_time_seconds_count[5m])) by (instance) * 100 | |
# etcd_disk_backend_commit_duration_seconds_bucket lt 0.032 by percentage | |
sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{le="0.032"}[5m])) by (instance) / sum(rate(etcd_disk_backend_commit_duration_seconds_count[5m])) by (instance) * 100 | |
# fsync: the etcd FAQ says p99 should be less than 10ms; this query uses the 16ms bucket
sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{le="0.016"}[5m])) by (instance) / sum(rate(etcd_disk_wal_fsync_duration_seconds_count[5m])) by (instance) * 100 | |
# number of watch streams | |
sum(grpc_server_started_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) | |
# number of lease streams | |
sum(grpc_server_started_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) | |
# RPC Failed Rate | |
sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[5m])) | |
# RPC Rate | |
sum(rate(grpc_server_started_total{grpc_type="unary"}[5m])) | |
# DB Size | |
etcd_mvcc_db_total_size_in_bytes | |
# has leader | |
sum(etcd_server_has_leader) | |
# RSS
process_resident_memory_bytes{job="$etcd_name"} | |
# gRPC traffic in (Client) | |
rate(etcd_network_client_grpc_received_bytes_total{job="$etcd_name"}[5m]) | |
# gRPC traffic out (Client) | |
rate(etcd_network_client_grpc_sent_bytes_total{job="$etcd_name"}[5m]) | |
# gRPC traffic in (Peer) | |
sum(rate(etcd_network_peer_received_bytes_total{job="$etcd_name"}[5m])) by (instance) | |
# gRPC traffic out (Peer) | |
sum(rate(etcd_network_peer_sent_bytes_total{job="$etcd_name"}[5m])) by (instance) | |
# CPU % | |
rate(process_cpu_seconds_total{job="etcd"}[5m]) * 100 | |
# increase in CPU CFS throttled periods over the last 5m
increase(container_cpu_cfs_throttled_periods_total{container_name!="<your-container>",namespace="<your-namespace>"}[5m]) | |
# etcd CPU total with cadvisor | |
rate(container_cpu_usage_seconds_total{container_name=~"etcd.*",pod_name!=""}[5m])
# RSS per namespace | |
process_resident_memory_bytes{endpoint="etcd-metrics",job="etcd",namespace="openshift-etcd",service="etcd"} | |
##### etcd log grep strings | |
E | # errors | |
C | # catastrophic
pkg/osutil: received terminated # killed | |
# kubelet | |
etcd-member.yaml": invalid pod: [spec.initContainers[0].image: Required value] # MCO not sending image | |
Started container etcd-member | |
mcdorig | |
podman run --volume "$PWD:/mount:z" docker.io/ljishen/fio /mount/etcd.fio > result | |
## fio config | |
[global] | |
name=custom | |
filename=/mount/custom-delete-me | |
rw=write | |
bs=2300 | |
fdatasync=1 | |
iodepth=128 | |
[file1] | |
size=128M | |
ioengine=libaio | |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) | |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) | |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) | |
container_memory_rss{namespace="openshift-etcd", container="etcd-member"} / (1024 ^ 3) | |
# 4.4+ | |
container_memory_rss{namespace="openshift-etcd", container="etcd"} / (1024 ^ 3) | |
sum(rate(etcd_server_leader_changes_seen_total[5m])) | |
#3.11 memory query | |
container_memory_rss{namespace=~"kube-system",pod_name=~"master-etcd.*",container_name="etcd"} / (1024 ^ 3) |
Inject latency
#!/bin/bash
# Inject latency into the fdatasync calls of an etcd container chosen interactively.
# Uses oc, jq and fzf locally; crictl and strace are run on the node via `oc debug`.

# Each candidate line is "<nodeName> <containerID>". The [8:] slice drops the first
# 8 characters of containerID (presumably the "cri-o://" runtime prefix — confirm).
choice=$(oc get --namespace openshift-etcd --selector etcd pods -o json | jq -r '.items[] | .spec.nodeName + " " + (.status.containerStatuses[] | select(.name=="etcd") | .containerID[8:])' | fzf)

# Nothing selected (fzf cancelled or no pods listed): do not call oc debug with empty arguments.
if [[ -z "$choice" ]]; then
  echo "no etcd container selected" >&2
  exit 1
fi

IFS=' ' read -r node container_id <<< "$choice"

# Resolve the container's host pid so strace can attach to it.
pid=$(oc debug --quiet "nodes/$node" -- chroot /host crictl inspect -o go-template --template '{{.info.pid}}' "$container_id")
if [[ -z "$pid" ]]; then
  echo "could not determine pid for container $container_id on node $node" >&2
  exit 1
fi

# Delay every fdatasync entry; delay_enter is given in microseconds (800000 = 0.8s).
oc debug --quiet "nodes/$node" -- chroot /host strace -Tfe inject=fdatasync:delay_enter=800000 -e trace=fdatasync -p "$pid"
compose
#!/bin/bash
# Inject latency into the fdatasync calls of the first container listed by
# `docker-compose ps -q` in the current directory.
set -x

# One container id per array element, without relying on word splitting.
mapfile -t CONTAINER_IDS < <(docker-compose ps -q)
if (( ${#CONTAINER_IDS[@]} == 0 )); then
  echo "no docker-compose containers found" >&2
  exit 1
fi

PID=$(docker inspect --format '{{ .State.Pid }}' "${CONTAINER_IDS[0]}")
echo "injecting latency into container id ${CONTAINER_IDS[0]}"

# Delay every fdatasync entry; delay_enter is given in microseconds (2400000 = 2.4s).
sudo strace -Tfe inject=fdatasync:delay_enter=2400000 -e trace=fdatasync -p "$PID"
terms and latency parse
awk '
# Term transitions: print fields 1-2 (presumably date and time; confirm against
# the log format) followed by fields 7-11 of the "became ... at term" line.
/became.*at term/ {
  print $1, $2, $7, $8, $9, $10, $11
}
# Slow requests: the duration is the third-from-last field, e.g. "(1.234s)".
# The closing parenthesis is escaped because some awk implementations reject
# an unmatched ")" in a regex literal.
/took too long.*[0-9]+s\)/ {
  # Splitting on "(", "s" and ")" leaves the bare number in a[2].
  split($(NF-2), a, "[(s)]")
  # Only report requests that took longer than 1 (seconds, given the "s" suffix).
  if (a[2] > 1) {
    print $1, $2, "slow request took:", $(NF-2)
  }
}' | tr -d '()'
sort .. | sort -n -k 6
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
dump prom 3.11