Last active
February 6, 2024 09:49
-
-
Save hexfusion/0ac019af41a19e8e1fe62f6cfb7435cc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# routes | |
oc get routes --all-namespaces | |
# get the image shas from a release | |
oc adm release info quay.io/openshift-release-dev/ocp-release:4.14.10-x86_64 --pullspecs | awk '{print " - " $2}' | |
# use butane for machine configs | |
https://docs.openshift.com/container-platform/4.8/installing/install_config/installing-customizing.html | |
# p&f issues | |
grep -r 'apiserver.openshift.io/watch-rate-limit' 'gcp-audit/quay-io-openshift-release-dev-ocp-v4-0-art-dev-sha256-3090cd5333971e522a2cb54e8586308cb5388e8c8011ffbdfd8305db0b0d8a41/audit_logs/kube-apiserver' | grep "watch=true" | grep 429 | grep -v '"userAgent":"kubelet' | grep '"username"' | wc -l | |
# query loki for etcd | |
sum by (instance) (rate({job="etcd"} | json | duration > 400ms [1m])) | |
# | |
kinit [email protected] | |
# debug no api | |
https://docs.openshift.com/container-platform/4.6/support/gathering-cluster-data.html#support-generating-a-sosreport-archive_gathering-cluster-data | |
# top against node | |
oc adm top node | |
# etcd pprof crictl | |
crictl exec -it b2c59200242fe sh -c 'curl --cert $ETCDCTL_CERT --key $ETCDCTL_KEY --cacert $ETCDCTL_CACERT https://localhost:2379/debug/pprof/heap' > /tmp/heap | |
# atop | |
atop -r atop -1 -f -D -p | |
# count resources etcd | |
etcdctl get / --prefix --keys-only | grep -oP "(?<=/kubernetes.io\/).+?(?=\/)" | sort | uniq -c | |
# custom dep | |
replace github.com/openshift/library-go => github.com/hexfusion/library-go v0.0.0-20200729104859-26a400dab398 | |
# backup example | |
oc get cm cluster-backup-pod -n openshift-etcd -o "jsonpath={.data['backup-pod\.yaml']}" | |
# node reboots | |
oc get events -o json | jq '.items[] | select((.reason=="Rebooted") or .reason=="Reboot") | .lastTimestamp + " -> " + .reason + " -> " + .involvedObject.name + " -> " + .message' | |
# parse events | |
curl -s https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_cluster-etcd-operator/350/pull-ci-openshift-cluster-etcd-operator-master-e2e-aws/1463/artifacts/e2e-aws/gather-extra/events.json | jq '.items[] |select(.reason=="UnhealthyEtcdMember") | .message' | |
# list leases | |
for lease in $(etcdctl lease list);do etcdctl lease timetolive $lease; done | |
for token in $(etcdctl get --prefix /openshift.io/oauth/accesstokens --keys-only | sort -u);do etcdctl get $token -w json | etcdctl lease timetolive $(printf "%x\n" $(python -c 'import json,sys;print json.load(sys.stdin)["kvs"][0]["lease"]'));done | |
grep '"verb":' exlode | sort | uniq | |
# list event reasons | |
cat events.json | grep '\"reason\": \"' | sort -u -k2 | awk '{ print $2 }' | |
# base tel metrics | |
id_version_ebs_account_internal:cluster_subscribed | |
# metrics join | |
id_version_ebs_account_internal:cluster_subscribed + on(_id) (topk(1,cluster_version{type="current",version=~"4.\\d+\\.\\d+"})) | |
# more joins | |
id_version_ebs_account_internal:cluster_subscribed{managed="true"} + on(version) group_left() (topk(1,cluster_version{type="current",version=~"4.\\d+\\.\\d+"})) | |
# big join | |
label_replace(count(max by (_id) ((cluster_operator_up{name="ingress"} == 0) and on (_id) (cluster_operator_up{name="authentication"} == 0))), "which", "both", "mode", "") or | |
label_replace(count(max by (_id) ((cluster_operator_up{name="authentication"} == 0))), "which", "authentication", "mode", "") or | |
label_replace(count(max by (_id) ((cluster_operator_up{name="ingress"} == 0))), "which", "ingress", "mode", "") | |
# more prom | |
count by (version) (id_version_ebs_account_internal:cluster_subscribed + on(version) group_left(_blah) 0*(topk(1, cluster_version{type="current",version=~"4\\.\\d+\\.\\d+"}))) | |
# parse events | |
jq '.items[] | select(.source.component=="kube-apiserver-operator-revisioncontroller" and .reason=="RevisionTriggered") | {lastTimestamp: .lastTimestamp, message: .message, source: .source }' events.json | |
### vsphere get console logs | |
Navigate to datastore view -> click datastore -> files -> <virtual machine name> directory -> click "serial.log" -> click "download" | |
oc rsh -n openshift-etcd $(oc get pods -n openshift-etcd -o jsonpath='{.items[0].metadata.name}') | |
crictl ps -a --label "io.kubernetes.pod.namespace=openshift-etcd" -o json | jq -r '.containers[].id' | xargs -n1 crictl logs | |
crictl ps -a --label "io.kubernetes.pod.namespace=openshift-etcd" -o json | jq -r '.containers[] |"\(.id) \(.metadata.name)"' | xargs -n 2 bash -c 'crictl logs -t $0 &> $1-$0.log' | |
# cleanup | |
find ~/clusters/{aws,gcp,azure} -mindepth 2 -maxdepth 2 -type d -exec bash -c '$1/bin/openshift-install --dir "$1" destroy cluster' _ {} \; | |
# upgrade | |
oc adm upgrade --to-image registry.svc.ci.openshift.org/ocp/release:$UPGRADE_RELEASE --force | |
# download CI run | |
wget -r -e robots=off -np -H https://gcsweb-ci.svc.ci.openshift.org/gcs/origin-ci-test/pr-logs/pull/24458/pull-ci-openshift-origin-master-e2e-aws-serial/12333/artifacts/e2e-aws-serial/ | |
# grep kube log | |
sed -rn 's/^(\w+ [0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}) ([^ ]+) ([^ ]+): [^ ]+ ([^ ]+) .* (Readiness probe for "etcd-quorum-guard-[[:alnum:]]+-[[:alnum:]]+).* (\w+$)/\1 \2 \3 \4 \5 \6/p' kubelet_service.log | |
# grep kube newer | |
sed -rn 's/(\w+ [0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}.[0-9]+) ([^ ]+) ([^ ]+): [^ ]+ ([^ ]+) .* (Readiness probe for "etcd-quorum-guard-[[:alnum:]]+-[[:alnum:]]+).* (.*+$)/\1 \2 \6/p' kubelet_service.log | sort -k 2 | grep failure | |
# nicer way | |
cat host_service_logs/masters/kubelet_service.log | awk '/Readiness probe for \"etcd-quorum-guard.*/ { print $1, $2, $3, $4, $10, $11, $13, $14; }' | |
# check for crashing containers | |
oc get po -A | grep -vE "(Running|Completed)" | |
# patch to see upgrade paths | |
oc patch clusterversion/version --patch '{"spec":{"upstream":"https://openshift-release.svc.ci.openshift.org/graph"}}' --type=merge | |
# update latest on cluster | |
podman pull --authfile=/var/lib/kubelet/config.json $image | |
# grab MCO MachineConfig for etcd | |
oc get machineconfig 00-master -o jsonpath='{.spec.config.storage.files[?(@.path=="/etc/kubernetes/manifests/etcd-member.yaml")]contents.source}' | |
# grab ci runs | |
gsutil -m cp -r gs://origin-ci-test/logs/canary-openshift-ocp-installer-e2e-azure-4.2/290/ . | |
# vsphere install | |
https://github.com/openshift/installer/tree/master/upi/vsphere | |
https://vcsa.vmware.devcluster.openshift.com/ui | |
# merge json pullsecrets | |
jq -s '.[0] * .[1]' CORE_PULL_SECRET CI_PULL_SECRET &> MASTER_PULL_SECRET | |
# merge all json in dir into 1 file | |
jq -s '[.[][]]' *.json > manifest.json | |
# podman build rhel | |
sudo podman build --authfile=./PULL_SECRET_LOCATION -f images/tests/Dockerfile.rhel . | |
# search CI errors | |
https://ci-search-ci-search-next.svc.ci.openshift.org/?search=failed%3A.*API+data+in+etcd.*&maxAge=336h&context=2&type=all | |
# oc debug node | |
oc debug node/ip-10-0-137-127.us-east-2.compute.internal | |
chroot /host | |
## create/simulate latency | |
## https://www.enodev.fr/posts/emulate-a-slow-block-device-with-dm-delay.html | |
dmsetup create delayed | |
## watch keyspace and print counts by resource | |
ETCDCTL_API=3 etcdctl watch / --prefix -w fields > out | watch 'cat out | grep -oP "(?<=/kubernetes.io\/).+?(?=\/)" | sort | uniq -c' | |
# convert to decimal | |
printf "%.2f" 7.516192768e+09 | |
# p99 raw | |
echo $(( $(printf "%.f" 2.127424e+06) *99/100 )) | |
## refresh token | |
oc registry login --to=PULL_SECRET_LOCATION | |
## get IP of node | |
oc get node ip-10-0-143-125.us-east-2.compute.internal -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}' | |
## list registries | |
oc get secret pull-secret -n openshift-config -o jsonpath={.data.'\.dockerconfigjson'} | base64 -d | jq -r '.auths | to_entries[]' | jq -r '.key' | |
## release | |
oc get imagestream installer -n openshift -o jsonpath={.status.tags[0].items[0].dockerImageReference} | egrep -o '^[^@]+' | |
## image | |
oc adm release info --image-for kube-etcd-signer-server --registry-config=./PULL_SECRET_LOCATION | |
# check cluster version | |
oc --config=${INSTALL_DIR}/auth/kubeconfig get clusterversion -oyaml | |
# etcd logs | |
master-logs etcd etcd &> etcd_server.log | |
# etcd related tasks | |
https://docs.openshift.com/container-platform/3.11/day_two_guide/host_level_tasks.html#day-two-guide-etcd-backup | |
# external testing 2 stage docker | |
# https://mojo.redhat.com/docs/DOC-1178565?sr=search&searchId=429ba108-213b-4b81-87f2-b667aca3e228&searchIndex=0 | |
FROM openshift/origin-release:golang-1.10 AS builder | |
# openshift 4 | |
CONTAINER=$(runc list | grep `pgrep etcd` | awk '{print $1}'); runc exec $CONTAINER etcd --version | |
# etcd | |
oc get pods --all-namespaces | grep etcd | |
# get image of container | |
oc get pod -o "jsonpath={range .status.containerStatuses[*]}{.name}{'\t'}{.state}{'\t'}{.image}{'\n'}{end}" -n kube-system etcd-member-ip-10-0-18-84.ec2.internal | |
# send to docker hub with podman | |
sudo podman push --authfile ~sbatsche/.docker/config.json localhost/machine-config-operator:v3.11.0-699-g100373ce-dirty hexfusion/machine-config-operator:latest | |
# give kube-system perms for operator | |
oc create clusterrolebinding etcd_operator --clusterrole=cluster-admin --serviceaccount=kube-system:default | |
# location of certs on bootstrap | |
/var/opt/openshift/tls | |
/sysroot/ostree/deploy/redhat-coreos-maipo/var/opt/openshift/tls/ | |
# MCO regen | |
go test ./pkg/controller/template/... -u | |
my guess is | |
on bootstrap: | |
https://github.com/openshift/machine-config-operator/pull/517/files#diff-8fb88a4862bafc203a34072446df1407R52 | |
etcd-metrics-ca content is empty | |
in cluster: | |
https://github.com/openshift/machine-config-operator/pull/517/files#diff-554de5523753fda8c93e7008c9bd947fR287 | |
etcd-metrics-ca config map exists and is non-empty. | |
a good way would be | |
use the release image generated for your PR in CI and use it to create a cluster with the installer. | |
how to find release image: | |
```2019/03/06 20:48:58 Create release image registry.svc.ci.openshift.org/ci-op-l4qnhir7/release:latest``` | |
from https://openshift-gce-devel.appspot.com/build/origin-ci-test/pr-logs/pull/openshift_machine-config-operator/517/pull-ci-openshift-machine-config-operator-master-e2e-aws/2369?log#log | |
then `OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=<release-image> create cluster` | |
and on bootstrap node check `/etc/mcs/machine-configs` and compare them with `oc get machine-configs` the generated ones | |
## | |
sudo podman run --quiet --rm quay.io/hexfusion/origin-release:v4.0 image kube-client-agent | |
# check cert | |
openssl x509 -text -noout -in cert.pem | |
# check cert with output from oc | |
oc get cm -n openshift-config-managed csr-controller-ca -o json | jq -r '.data["ca-bundle.crt"]' | openssl x509 -text -noout -in - | |
# check CA created cert | |
openssl verify -verbose -x509_strict -CAfile ca.crt somecert.crt | |
# check csr | |
openssl req -text -noout -verify -in test.pem | |
# verify key was signed by csr | |
openssl verify -verbose -CAfile ca.crt domain.crt | |
# decode base64-encoded certs. | |
for ext in crt key; do oc -n openshift-config get secrets etcd-metrics-proxy-client -o yaml | grep tls.${ext} | awk '{ print $2 }' | base64 --decode > etcd-metrics-proxy-client.${ext}; done | |
### etcd watch keyspace | |
ETCDCTL_API=3 etcdctl watch / --prefix -w fields | |
### build latest CI release. | |
$ oc login and get link | |
## go to site https://openshift-release.svc.ci.openshift.org/ | |
## get a release IE registry.svc.ci.openshift.org/ocp/release:4.1.0-0.ci-2019-04-29-142604 | |
## OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=registry.svc.ci.openshift.org/ocp/release:4.1.0-0.ci-2019-04-29-142604 create cluster | |
# docker login | |
docker login -u hexfusion -p $(oc whoami -t) registry.svc.ci.openshift.org | |
# release | |
docker run -it -v $(pwd)/ci-operator:/ci-operator:z registry.svc.ci.openshift.org/ci/ci-operator-prowgen:latest --from-dir /ci-operator/config/ --to-dir /ci-operator/job | |
# exec into etcd | |
id=$(sudo crictl ps --name etcd-member | awk 'FNR==2{ print $1}') && sudo crictl exec -it $id /bin/sh | |
# export certs | |
export ETCDCTL_API=3 ETCDCTL_CACERT=/etc/ssl/etcd/ca.crt ETCDCTL_CERT=$(find /etc/ssl/ -name *peer*crt) ETCDCTL_KEY=$(find /etc/ssl/ -name *peer*key) | |
# use etcdctl | |
ETCDCTL_API=3 etcdctl --cert=$(find / -name 'system:etcd-peer*.crt') --key=$(find / -name 'system:etcd-peer*.key') --cacert=$(find / -name 'ca.crt') member list | |
# print cluster ID. | |
etcdctl member list -w fields | grep -oP '(?<=ClusterID\"\s:\s).*' | xargs printf '%x\n' | |
# list a records from SRV | |
dig +noall +answer SRV _etcd-server-ssl._tcp.hexfusion.local | grep -oP '(?<=2380 ).*[^\.]' | xargs| sed -e 's/ /,/g' | |
# list ipv4 address | |
ip -o addr | grep -oP '(?<=inet )(\d{1,3}\.?){4}' | |
# size of secrets | |
etcdctl get --prefix /kubernetes.io/secrets -w fields | grep -oP '(?<=Value\" : ")(.*)' | wc -c | |
# selinux | |
ausearch -m avc -c etcd | |
# create initial_cluster. | |
etcdctl member list -w json | jq -r '.members[] | [.name,.peerURLs[0]] | "\(.[0])=\(.[1])" ' | xargs | sed -e 's/ /,/g' | |
##### | |
# Operator debug | |
##### | |
# extract the payload for what CVO manages. | |
oc adm release extract --from=quay.io/hexfusion/origin-release:v4.2 --to=release-image | |
# Run git from the alpine/git container image instead of a host install.
# The current directory is bind-mounted at /root and, with the :z SELinux
# relabel option, at /git. All arguments are forwarded to the container
# unchanged.
# The paths are quoted so a working directory containing spaces still yields
# exactly one -v/--volume argument each; --rm is passed once (it was
# duplicated before).
function git () {
  (sudo podman run --rm -v "${PWD}:/root" --volume "${PWD}:/git:z" alpine/git "$@")
}
# regex to parse kubelet | |
https://gist.github.com/hexfusion/88e45f9d2c0ce6530bd4e3fa0bd9cfde | |
# cleanup clusters | |
find ./clusters/{aws,gcp} -maxdepth 1 -type d -exec bash -c 'openshift-install --dir "$1" destroy cluster' _ {} \; |
Author
hexfusion
commented
May 31, 2019
•
# Print the etcd metrics endpoint (https://127.0.0.1:9979/metrics) of every
# pod in the openshift-etcd namespace by running curl inside each pod's
# "etcd-member" container.

# Script executed inside the container. The find patterns are quoted so the
# container shell hands them to find instead of globbing them itself. Colons
# in the certificate path are backslash-escaped before being passed to
# curl --cert (presumably because curl splits that argument on ':' into
# certificate and password — confirm against the curl manual).
# NOTE(review): ${CERT//:/\\:} is a bash-style substitution; this assumes the
# container's sh supports it, as the original did.
FETCH_METRICS='
CERT=$(find /etc/ssl/etcd -name "*etcd-metric*crt")
CACERT=/etc/ssl/etcd/metric-ca.crt
KEY=$(find /etc/ssl/etcd -name "*metric*key")
curl --cacert "$CACERT" --key "$KEY" --cert "${CERT//:/\\:}" https://127.0.0.1:9979/metrics -k
'

# The jsonpath expression prints all pod names on one whitespace-separated
# line; read -a splits it into an array without glob expansion.
read -r -a ETCDS < <(oc get pods -n openshift-etcd -o jsonpath='{.items[*].metadata.name}')

for pod in "${ETCDS[@]}"; do
  # No -it: the output is captured, so no TTY or stdin is requested.
  OUT=$(oc exec "$pod" -n "openshift-etcd" -c "etcd-member" -- sh -c "$FETCH_METRICS")
  printf '%s\n' "$OUT"
done
#!/bin/sh
# fetch [LAST_BUILD]
#
# Download finished.json and the prometheus.tar metrics snapshot for builds
# 1..LAST_BUILD (default: 464) of the
# release-openshift-ocp-installer-e2e-aws-4.3 CI job, one directory per build
# number under the current directory.
#
# A prometheus.tar smaller than 1000 bytes is deleted, so only builds with a
# plausible snapshot keep one. curl runs with --fail so an HTTP error body is
# not saved as if it were the artifact; a failed download leaves an empty
# file, which the size check then removes.
#
# Written without `local` because the enclosing script is #!/bin/sh; the
# helper variables below are therefore global, as the loop counter already was.
fetch() {
  fetch_last_build="${1:-464}"
  fetch_base_url="https://storage.googleapis.com/origin-ci-test/logs/release-openshift-ocp-installer-e2e-aws-4.3"
  for i in $(seq "${fetch_last_build}"); do
    mkdir -p "${i}"
    curl -fsS "${fetch_base_url}/${i}/finished.json" > "${i}/finished.json"
    curl -fsS "${fetch_base_url}/${i}/artifacts/e2e-aws/metrics/prometheus.tar" > "${i}/prometheus.tar"
    # Reading from stdin makes wc print only the count; the arithmetic
    # expansion strips any padding some wc implementations add.
    fetch_size="$(($(wc -c < "${i}/prometheus.tar")))"
    if test "${fetch_size}" -lt 1000; then
      rm -rf "${i}/prometheus.tar"
    fi
  done
}
# analyze DIRECTORY
#
# Query the 99th-percentile etcd WAL fsync duration for one CI run and print
# the Prometheus query_range JSON response on stdout.
#
# DIRECTORY must contain finished.json (its .timestamp is used as the end of
# the query window; the window starts three hours earlier) and
# prometheus.tar, which is extracted to DIRECTORY/prometheus unless that
# directory already exists. A prom/prometheus v2.6.0 container is started on
# port 9090 with the extracted data mounted, queried after a fixed 10 second
# wait, then removed together with the extracted data.
#
# Returns non-zero, without starting a container, when DIRECTORY is not a
# directory or finished.json has no numeric .timestamp; otherwise returns
# curl's exit status. Diagnostics go to stderr because callers redirect
# stdout into result.json.
#
# Sets the globals DIRECTORY, END_TIME, START_TIME and CONTAINER (no `local`:
# the enclosing script is #!/bin/sh).
analyze() {
  if test -z "${1:-}" || test ! -d "${1}"; then
    echo "analyze: not a directory: '${1:-}'" >&2
    return 1
  fi
  DIRECTORY="$(realpath "${1}")" || return 1

  END_TIME="$(jq -r '.timestamp' "${DIRECTORY}/finished.json")"
  # jq prints "null" for a missing key and nothing on a parse error; either
  # would silently turn the arithmetic below into a bogus time range.
  case "${END_TIME}" in
    '' | *[!0-9]*)
      echo "analyze: no numeric .timestamp in ${DIRECTORY}/finished.json" >&2
      return 1
      ;;
  esac
  START_TIME="$((END_TIME - 3600*3))"

  if test ! -d "${DIRECTORY}/prometheus"; then
    mkdir "${DIRECTORY}/prometheus" || return 1
    tar -xzf "${DIRECTORY}/prometheus.tar" -C "${DIRECTORY}/prometheus"
    # Presumably opened up so the container's user can use the data — confirm.
    chmod -R 0777 "${DIRECTORY}/prometheus"
  fi

  CONTAINER="$(sudo podman run -d -v "${DIRECTORY}/prometheus:/prometheus:z" -p 9090:9090 docker.io/prom/prometheus:v2.6.0)"
  if test -z "${CONTAINER}"; then
    echo "analyze: failed to start prometheus container" >&2
    sudo rm -rf "${DIRECTORY:?}/prometheus"
    return 1
  fi
  # Give Prometheus time to load the snapshot before querying it.
  sleep 10
  curl "http://localhost:9090/api/v1/query_range?start=${START_TIME}&end=${END_TIME}&step=60&query=histogram_quantile(0.99%2C%20rate(etcd_disk_wal_fsync_duration_seconds_bucket%5B5m%5D))"
  analyze_status=$?

  sudo podman rm -f "${CONTAINER}" >/dev/null
  # :? aborts instead of expanding to "/prometheus" should DIRECTORY be empty.
  sudo rm -rf "${DIRECTORY:?}/prometheus"
  return "${analyze_status}"
}
# analyze_all
#
# Walk every entry of the current directory; for each one that holds a
# prometheus.tar, run analyze on it and store that run's stdout as
# result.json inside the same entry. Entries without a snapshot are skipped.
analyze_all() {
  for DIRECTORY in *; do
    # Nothing to analyze without a downloaded snapshot.
    [ -e "${DIRECTORY}/prometheus.tar" ] || continue
    analyze "${DIRECTORY}" >"${DIRECTORY}/result.json"
  done
}
# Download every run, analyze each one, then plot the maximum p99 fsync
# duration per run against the run's finish time.
fetch
analyze_all
# For each result.json emit "<ISO-8601 finish time> <max sample value>" and
# feed the pairs to gnuplot, which writes a PNG to result.png.
# - The sibling finished.json is derived with a POSIX suffix strip, since this
#   script runs under #!/bin/sh.
# - jq prints "null" as the max of an empty result set; those runs are skipped
#   rather than plotted.
# - timefmt must mirror jq's todateiso8601 output (YYYY-MM-DDTHH:MM:SSZ),
#   including the colon between minutes and seconds.
for RESULT in */result.json; do
  VALUE="$(jq '[.data.result[].values[][1] | tonumber] | max' "${RESULT}")"
  END_TIME="$(jq -r '.timestamp | todateiso8601' "${RESULT%result.json}finished.json")"
  if test -n "${VALUE}" && test "${VALUE}" != "null"; then
    echo "${END_TIME} ${VALUE}"
  fi
done | gnuplot -e 'set terminal png; set xdata time; set timefmt "%Y-%m-%dT%H:%M:%SZ"; plot "-" using 1:2 with points t "etcd fsync 99th"' >result.png
metrics
# Save the local etcd-member container's metrics endpoint
# (https://127.0.0.1:9979/metrics) to /tmp/<hostname>-metrics.log.

# -q prints bare container IDs, one per line, so the ID is not scraped out of
# the human-readable table; only the first match is used.
id=$(sudo crictl ps -q --name etcd-member | head -n 1)
if [ -z "$id" ]; then
  echo "etcd-member container not found" >&2
else
  # No -it: the output is captured, so no TTY or stdin is requested.
  # The find patterns are quoted so the container shell passes them to find
  # rather than globbing them. Colons in the certificate path are
  # backslash-escaped before being handed to curl --cert (presumably because
  # curl splits that argument on ':' — confirm against the curl manual).
  OUT=$(sudo crictl exec "$id" /bin/sh -c \
    'CERT=$(find /etc/ssl/etcd -name "*etcd-metric*crt"); \
    CACERT=/etc/ssl/etcd/metric-ca.crt ; \
    KEY=$(find /etc/ssl/etcd -name "*metric*key"); \
    curl --cacert "$CACERT" --key "$KEY" --cert "${CERT//:/\\:}" https://127.0.0.1:9979/metrics -k')
  printf '%s\n' "$OUT" > "/tmp/$(hostname)-metrics.log"
fi
dump prom
#!/bin/bash
# Snapshot the cluster Prometheus data directory into
# ./metrics/prometheus.tar.gz by streaming a gzipped tar of /prometheus out
# of the prometheus-k8s-0 pod in the openshift-monitoring namespace.
# Every path expansion is quoted so a working directory containing spaces
# still resolves to a single path.
ARTIFACT_DIR="${PWD}"
mkdir -p "${ARTIFACT_DIR}/metrics"
echo "Snapshotting prometheus ..."
oc --insecure-skip-tls-verify exec -n openshift-monitoring prometheus-k8s-0 -- tar cvzf - -C /prometheus . >"${ARTIFACT_DIR}/metrics/prometheus.tar.gz"
metrics
sum(container_memory_usage_bytes{image!="",}) by (namespace)
get commit sha of release image
$ sudo podman pull --authfile ~/.PULL_SECRET_BUILD $(oc get pods -n openshift-etcd-operator -o json | jq -r '.items[].spec.containers[0].env[] | select(.name=="OPERATOR_IMAGE")'.value)
Trying to pull registry.svc.ci.openshift.org/ocp/4.5-2020-03-16-085352@sha256:c6aac32c2ebb7fa9c3915617368f830473308450d53975644cc931801e60997a...
Getting image source signatures
Copying blob 4fbc3bafa3d4 skipped: already exists
Copying blob 34971b2d1eb9 skipped: already exists
Copying blob 2ccc210e15d6 done
Copying blob ca166bc0bd99 done
Copying blob 685a3b67eda6 done
Copying config f04b9935bd done
Writing manifest to image destination
Storing signatures
f04b9935bde071777630da63ebd1be66fdc692edeac8f67b69373e7d567c17cb
git checkout $(sudo podman inspect 84a4766806bb56652311518e75c8d8e9b77b8f16c0662c6f9052c254427b24c0 | jq -r .[].Labels.\"io.openshift.build.commit.id\")
$ git log -1 -p -m ":/cache the client based on the endpoints to avoid reconstruction"
POST /test-index/events
{
"items":{
"properties":{
"firstTimestamp":{ "type" : "date" },
"count":{ "type" : "int" },
"involvedObject":{
"properties":{
"kind":{"type" : "text"},
"name":{"type" : "text"},
"uuid":{"type" : "text"}
}
},
"kind":{"type" : "text" },
"lastTimestamp":{"type" : "date" },
"message":{ "type" :"text" },
"metadata":{
"properties":{
"creationTimestamp":{"type": "date"},
"name":{"type": "text"},
"namespace":{"type": "text"},
"resourceVersion":{"type": "text"},
"selfLink":{"type": "text"},
"uuid":{"type": "text"}
}
},
"reason":{ "type":"text" },
"reportingComponent":{ "type":"text" },
"reportingInstance" :{ "type":"text" },
"source":{
"properties":{
"component":{"type": "text"},
"host":{"type": "text"}
}
}
}
}
}
dump prom
#!/usr/bin/env bash
# queue TARGET COMMAND [ARG...]
#
# Launch COMMAND in the background with its stdout written to TARGET, after
# first blocking (polling once per second) until fewer than 45 background
# jobs are listed by `jobs`. The command line is echoed to stdout before it
# is started. When the FILTER variable is non-empty, COMMAND's output is
# piped through that command before landing in TARGET.
function queue() {
  local dest="${1}"
  shift

  # Throttle: wait for a free slot among the background jobs.
  local running
  running="$(jobs | wc -l)"
  until [[ "${running}" -lt 45 ]]; do
    sleep 1
    running="$(jobs | wc -l)"
  done

  echo "${@}"
  if [[ -z "${FILTER:-}" ]]; then
    "${@}" >"${dest}" &
    return
  fi
  "${@}" | "${FILTER}" >"${dest}" &
}
# Snapshot the local prometheus container into ./metrics: a gzipped tar of
# /prometheus plus the gzipped response of the targets/metadata API. Both
# downloads are started as background jobs through queue.
ARTIFACT_DIR="${PWD}"
mkdir -p "${ARTIFACT_DIR}/metrics"
# -q prints one container ID per line; only the first is used so the ID is
# always a single argument to crictl exec.
id="$(crictl ps -q --label "io.kubernetes.container.name=prometheus" | head -n 1)"
if [ -z "${id}" ]; then
  # Without a container ID there is nothing to exec into, so skip the
  # snapshot instead of running crictl exec with a missing argument.
  echo "prom container not found.." >&2
else
  echo "Snapshotting prometheus (may take 15s) ..."
  queue "${ARTIFACT_DIR}/metrics/prometheus.tar.gz" crictl exec "${id}" tar cvzf - -C /prometheus .
  FILTER=gzip queue "${ARTIFACT_DIR}/metrics/prometheus-target-metadata.json.gz" crictl exec "${id}" /bin/bash -c "curl -G http://localhost:9090/api/v1/targets/metadata --data-urlencode 'match_target={instance!=\"\"}'"
  # queue only starts background jobs; block until both files are complete.
  wait
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment