Last active
February 6, 2024 09:49
-
-
Save hexfusion/0ac019af41a19e8e1fe62f6cfb7435cc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# routes | |
oc get routes --all-namespaces | |
# get the image shas from a release | |
oc adm release info quay.io/openshift-release-dev/ocp-release:4.14.10-x86_64 --pullspecs | awk '{print " - " $2}' | |
# use butane for machine configs | |
https://docs.openshift.com/container-platform/4.8/installing/install_config/installing-customizing.html | |
# p&f issues | |
grep -r 'apiserver.openshift.io/watch-rate-limit' 'gcp-audit/quay-io-openshift-release-dev-ocp-v4-0-art-dev-sha256-3090cd5333971e522a2cb54e8586308cb5388e8c8011ffbdfd8305db0b0d8a41/audit_logs/kube-apiserver' | grep "watch=true" | grep 429 | grep -v '"userAgent":"kubelet' | grep '"username"' | wc -l | |
# query loki for etcd | |
sum by (instance) (rate({job="etcd"} | json | duration > 400ms [1m])) | |
# | |
kinit [email protected] | |
# debug no api | |
https://docs.openshift.com/container-platform/4.6/support/gathering-cluster-data.html#support-generating-a-sosreport-archive_gathering-cluster-data | |
# top against node | |
oc adm top node | |
# etcd pprof crictl | |
# Dump the etcd heap profile from inside the etcd container (take the container ID from `crictl ps`).
# No -it here: nothing is read from stdin, and a TTY would rewrite line endings
# in the binary profile before it reaches /tmp/heap.
crictl exec b2c59200242fe sh -c 'curl --cert "$ETCDCTL_CERT" --key "$ETCDCTL_KEY" --cacert "$ETCDCTL_CACERT" https://localhost:2379/debug/pprof/heap' > /tmp/heap
# atop | |
atop -r atop -1 -f -D -p | |
# count resources etcd | |
etcdctl get / --prefix --keys-only | grep -oP "(?<=/kubernetes.io\/).+?(?=\/)" | sort | uniq -c | |
# custom dep | |
replace github.com/openshift/library-go => github.com/hexfusion/library-go v0.0.0-20200729104859-26a400dab398 | |
# backup example | |
oc get cm cluster-backup-pod -n openshift-etcd -o "jsonpath={.data['backup-pod\.yaml']}" | |
# node reboots | |
oc get events -o json | jq '.items[] | select((.reason=="Rebooted") or .reason=="Reboot") | .lastTimestamp + " -> " + .reason + " -> " + .involvedObject.name + " -> " + .message' | |
# parse events | |
curl -s https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_cluster-etcd-operator/350/pull-ci-openshift-cluster-etcd-operator-master-e2e-aws/1463/artifacts/e2e-aws/gather-extra/events.json | jq '.items[] |select(.reason=="UnhealthyEtcdMember") | .message' | |
# list leases | |
for lease in $(etcdctl lease list);do etcdctl lease timetolive $lease; done | |
# Print the remaining lease TTL for every OAuth access token key.
# The lease ID read from the JSON output is converted to hex before being
# handed to `lease timetolive`. print() with parentheses works on Python 3,
# where the former `print x` statement is a syntax error.
for token in $(etcdctl get --prefix /openshift.io/oauth/accesstokens --keys-only | sort -u); do
  # Skip keys whose lease cannot be read instead of calling timetolive with an empty ID.
  lease_hex=$(etcdctl get "$token" -w json \
    | python3 -c 'import json,sys; print("%x" % json.load(sys.stdin)["kvs"][0]["lease"])') || continue
  etcdctl lease timetolive "$lease_hex"
done
grep '"verb":' exlode | sort | uniq | |
# list event reasons | |
cat events.json | grep '\"reason\": \"' | sort -u -k2 | awk '{ print $2 }' | |
# base tel metrics | |
id_version_ebs_account_internal:cluster_subscribed | |
# metrics join | |
id_version_ebs_account_internal:cluster_subscribed + on(_id) (topk(1,cluster_version{type="current",version=~"4.\\d+\\.\\d+"})) | |
# more joins | |
id_version_ebs_account_internal:cluster_subscribed{managed="true"} + on(version) group_left() (topk(1,cluster_version{type="current",version=~"4.\\d+\\.\\d+"})) | |
# big join | |
label_replace(count(max by (_id) ((cluster_operator_up{name="ingress"} == 0) and on (_id) (cluster_operator_up{name="authentication"} == 0))), "which", "both", "mode", "") or | |
label_replace(count(max by (_id) ((cluster_operator_up{name="authentication"} == 0))), "which", "authentication", "mode", "") or | |
label_replace(count(max by (_id) ((cluster_operator_up{name="ingress"} == 0))), "which", "ingress", "mode", "") | |
# more prom | |
count by (version) (id_version_ebs_account_internal:cluster_subscribed + on(version) group_left(_blah) 0*(topk(1, cluster_version{type="current",version=~"4\\.\\d+\\.\\d+"}))) | |
# parse events | |
jq '.items[] | select(.source.component=="kube-apiserver-operator-revisioncontroller" and .reason=="RevisionTriggered") | {lastTimestamp: .lastTimestamp, message: .message, source: .source }' events.json | |
### vsphere get console logs | |
Navigate to datastore view -> click datastore -> files -> <virtual machine name> directory -> click "serial.log" -> click "download" | |
oc rsh -n openshift-etcd $(oc get pods -n openshift-etcd -o jsonpath='{.items[0].metadata.name}') | |
crictl ps -a --label "io.kubernetes.pod.namespace=openshift-etcd" -o json | jq -r '.containers[].id' | xargs -n1 crictl logs | |
crictl ps -a --label "io.kubernetes.pod.namespace=openshift-etcd" -o json | jq -r '.containers[] |"\(.id) \(.metadata.name)"' | xargs -n 2 bash -c 'crictl logs -t $0 &> $1-$0.log' | |
# cleanup | |
find ~/clusters/{aws,gcp,azure} -mindepth 2 -maxdepth 2 -type d -exec bash -c '$1/bin/openshift-install --dir "$1" destroy cluster' _ {} \; | |
# upgrade | |
oc adm upgrade --to-image registry.svc.ci.openshift.org/ocp/release:$UPGRADE_RELEASE --force | |
# download CI run | |
wget -r -e robots=off -np -H https://gcsweb-ci.svc.ci.openshift.org/gcs/origin-ci-test/pr-logs/pull/24458/pull-ci-openshift-origin-master-e2e-aws-serial/12333/artifacts/e2e-aws-serial/ | |
# grep kube log | |
sed -rn 's/^(\w+ [0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}) ([^ ]+) ([^ ]+): [^ ]+ ([^ ]+) .* (Readiness probe for "etcd-quorum-guard-[[:alnum:]]+-[[:alnum:]]+).* (\w+$)/\1 \2 \3 \4 \5 \6/p' kubelet_service.log | |
# grep kube newer | |
sed -rn 's/(\w+ [0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}.[0-9]+) ([^ ]+) ([^ ]+): [^ ]+ ([^ ]+) .* (Readiness probe for "etcd-quorum-guard-[[:alnum:]]+-[[:alnum:]]+).* (.*+$)/\1 \2 \6/p' kubelet_service.log | sort -k 2 | grep failure | |
# nicer way | |
cat host_service_logs/masters/kubelet_service.log | awk '/Readiness probe for \"etcd-quorum-guard.*/ { print $1, $2, $3, $4, $10, $11, $13, $14; }' | |
# check for crashing containers | |
oc get po -A | grep -vE "(Running|Completed)" | |
# patch to see upgrade paths | |
oc patch clusterversion/version --patch '{"spec":{"upstream":"https://openshift-release.svc.ci.openshift.org/graph"}}' --type=merge | |
# update latest on cluster
podman pull --authfile=/var/lib/kubelet/config.json $image | |
# grab MCO MachineConfig for etcd | |
# Print the embedded source of the etcd-member static pod manifest from the 00-master MachineConfig.
# The '.' after the [?(...)] filter is required to step into the matched file entry.
oc get machineconfig 00-master -o jsonpath='{.spec.config.storage.files[?(@.path=="/etc/kubernetes/manifests/etcd-member.yaml")].contents.source}'
# grab ci runs | |
gsutil -m cp -r gs://origin-ci-test/logs/canary-openshift-ocp-installer-e2e-azure-4.2/290/ . | |
# vsphere install | |
https://github.com/openshift/installer/tree/master/upi/vsphere | |
https://vcsa.vmware.devcluster.openshift.com/ui | |
# merge json pullsecrets | |
# Deep-merge the two pull-secret JSON documents (the second wins on conflicts).
# Only stdout goes to the output file so jq error messages cannot end up inside the secret.
jq -s '.[0] * .[1]' CORE_PULL_SECRET CI_PULL_SECRET > MASTER_PULL_SECRET
# merge all json in dir into 1 file | |
# Slurp every *.json file in the current directory and flatten their elements into one array.
# The space between the filter and the glob matters: without it the shell
# glues both into a single word and jq never sees the input files.
jq -s '[.[][]]' *.json > manifest.json
# podman build rhel | |
sudo podman build --authfile=./PULL_SECRET_LOCATION -f images/tests/Dockerfile.rhel . | |
# search CI errors | |
https://ci-search-ci-search-next.svc.ci.openshift.org/?search=failed%3A.*API+data+in+etcd.*&maxAge=336h&context=2&type=all | |
# oc debug node | |
oc debug node/ip-10-0-137-127.us-east-2.compute.internal | |
chroot /host | |
## create/simulate latency | |
## https://www.enodev.fr/posts/emulate-a-slow-block-device-with-dm-delay.html | |
dmsetup create delayed | |
## watch keyspace and print counts by resource | |
ETCDCTL_API=3 etcdctl watch / --prefix -w fields > out | watch 'cat out | grep -oP "(?<=/kubernetes.io\/).+?(?=\/)" | sort | uniq -c' | |
# convert to decimal | |
printf "%.2f" 7.516192768e+09 | |
# p99 raw | |
echo $(( $(printf "%.f" 2.127424e+06) *99/100 )) | |
## refresh token | |
oc registry login --to=PULL_SECRET_LOCATION | |
## get IP of node | |
oc get node ip-10-0-143-125.us-east-2.compute.internal -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}' | |
## list registries
oc get secret pull-secret -n openshift-config -o jsonpath={.data.'\.dockerconfigjson'} | base64 -d | jq -r '.auths | to_entries[]' | jq -r '.key' | |
## release | |
oc get imagestream installer -n openshift -o jsonpath={.status.tags[0].items[0].dockerImageReference} | egrep -o '^[^@]+' | |
## image | |
oc adm release info --image-for kube-etcd-signer-server --registry-config=./PULL_SECRET_LOCATION | |
# check cluster version | |
oc --config=${INSTALL_DIR}/auth/kubeconfig get clusterversion -oyaml | |
# etcd logs | |
master-logs etcd etcd &> etcd_server.log | |
# etcd related tasks | |
https://docs.openshift.com/container-platform/3.11/day_two_guide/host_level_tasks.html#day-two-guide-etcd-backup | |
# external testing 2 stage docker | |
# https://mojo.redhat.com/docs/DOC-1178565?sr=search&searchId=429ba108-213b-4b81-87f2-b667aca3e228&searchIndex=0 | |
FROM openshift/origin-release:golang-1.10 AS builder | |
# openshift 4 | |
CONTAINER=$(runc list | grep `pgrep etcd` | awk '{print $1}'); runc exec $CONTAINER etcd --version | |
# etcd | |
oc get pods --all-namespaces | grep etcd | |
# get image of container | |
oc get pod -o "jsonpath={range .status.containerStatuses[*]}{.name}{'\t'}{.state}{'\t'}{.image}{'\n'}{end}" -n kube-system etcd-member-ip-10-0-18-84.ec2.internal | |
# send to docker hub with podman | |
sudo podman push --authfile ~sbatsche/.docker/config.json localhost/machine-config-operator:v3.11.0-699-g100373ce-dirty hexfusion/machine-config-operator:latest | |
# give kube-system perms for operator | |
oc create clusterrolebinding etcd_operator --clusterrole=cluster-admin --serviceaccount=kube-system:default | |
# location of certs on bootstrap | |
/var/opt/openshift/tls | |
/sysroot/ostree/deploy/redhat-coreos-maipo/var/opt/openshift/tls/ | |
# MCO regen | |
go test ./pkg/controller/template/... -u | |
my guess is | |
on bootstrap: | |
https://github.com/openshift/machine-config-operator/pull/517/files#diff-8fb88a4862bafc203a34072446df1407R52 | |
etcd-metrics-ca content is empty | |
in cluster: | |
https://github.com/openshift/machine-config-operator/pull/517/files#diff-554de5523753fda8c93e7008c9bd947fR287 | |
etcd-metrics-ca config map exists and is non-empty.
a good way would be | |
use the release image generated for your PR in CI and use that to create a cluster using the installer.
how to find release image: | |
```2019/03/06 20:48:58 Create release image registry.svc.ci.openshift.org/ci-op-l4qnhir7/release:latest``` | |
from https://openshift-gce-devel.appspot.com/build/origin-ci-test/pr-logs/pull/openshift_machine-config-operator/517/pull-ci-openshift-machine-config-operator-master-e2e-aws/2369?log#log | |
then `OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=<release-image> create cluster` | |
and on bootstrap node check `/etc/mcs/machine-configs` and compare them with `oc get machine-configs` the generated ones | |
## | |
sudo podman run --quiet --rm quay.io/hexfusion/origin-release:v4.0 image kube-client-agent | |
# check cert | |
openssl x509 -text -noout -in cert.pem | |
# check cert with output from oc | |
oc get cm -n openshift-config-managed csr-controller-ca -o json | jq -r '.data["ca-bundle.crt"]' | openssl x509 -text -noout -in - | |
# check CA created cert | |
openssl verify -verbose -x509_strict -CAfile ca.crt somecert.crt | |
# check csr | |
openssl req -text -noout -verify -in test.pem | |
# verify the cert was signed by the CA
# (the option is spelled -CAfile; openssl options are case-sensitive)
openssl verify -verbose -CAfile ca.crt domain.crt
# decode the base64-encoded cert and key stored in the secret.
# Writes etcd-metrics-proxy-client.crt and etcd-metrics-proxy-client.key to the current directory.
for ext in crt key; do
  # Select the exact data field with jq rather than grepping the YAML,
  # so a stray match elsewhere in the object cannot be picked up.
  oc -n openshift-config get secrets etcd-metrics-proxy-client -o json \
    | jq -r --arg field "tls.${ext}" '.data[$field]' \
    | base64 --decode > "etcd-metrics-proxy-client.${ext}"
done
### etcd watch keyspace | |
ETCDCTL_API=3 etcdctl watch / --prefix -w fields | |
### build latest CI release. | |
$ oc login and get link | |
## go to site https://openshift-release.svc.ci.openshift.org/ | |
## get a release IE registry.svc.ci.openshift.org/ocp/release:4.1.0-0.ci-2019-04-29-142604 | |
## OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE=registry.svc.ci.openshift.org/ocp/release:4.1.0-0.ci-2019-04-29-142604 create cluster | |
# docker login | |
# Log in to the CI registry with the current oc session token.
# The token is piped on stdin so it does not appear in the process list or shell history.
oc whoami -t | docker login -u hexfusion --password-stdin registry.svc.ci.openshift.org
# release | |
docker run -it -v $(pwd)/ci-operator:/ci-operator:z registry.svc.ci.openshift.org/ci/ci-operator-prowgen:latest --from-dir /ci-operator/config/ --to-dir /ci-operator/job | |
# exec into etcd | |
id=$(sudo crictl ps --name etcd-member | awk 'FNR==2{ print $1}') && sudo crictl exec -it $id /bin/sh | |
# export certs | |
# Point etcdctl at the v3 API and the peer cert/key found under /etc/ssl.
# The -name patterns are quoted so find receives them verbatim; unquoted, the
# shell would expand them against the current directory first.
export ETCDCTL_API=3 \
  ETCDCTL_CACERT=/etc/ssl/etcd/ca.crt \
  ETCDCTL_CERT="$(find /etc/ssl/ -name '*peer*crt')" \
  ETCDCTL_KEY="$(find /etc/ssl/ -name '*peer*key')"
# use etcdctl | |
ETCDCTL_API=3 etcdctl --cert=$(find / -name 'system:etcd-peer*.crt') --key=$(find / -name 'system:etcd-peer*.key') --cacert=$(find / -name 'ca.crt') member list | |
# print cluster ID. | |
etcdctl member list -w fields | grep -oP '(?<=ClusterID\"\s:\s).*' | xargs printf '%x\n' | |
# list a records from SRV | |
dig +noall +answer SRV _etcd-server-ssl._tcp.hexfusion.local | grep -oP '(?<=2380 ).*[^\.]' | xargs| sed -e 's/ /,/g' | |
# list ipv4 address | |
ip -o addr | grep -oP '(?<=inet )(\d{1,3}\.?){4}' | |
# size of secrets | |
etcdctl get --prefix /kubernetes.io/secrets -w fields | grep -oP '(?<=Value\" : ")(.*)' | wc -c | |
# selinux | |
ausearch -m avc -c etcd | |
# create initial_cluster.
etcdctl member list -w json | jq -r '.members[] | [.name,.peerURLs[0]] | "\(.[0])=\(.[1])" ' | xargs | sed -e 's/ /,/g' | |
##### | |
# Operator debug | |
##### | |
# extract the payload for what CVO manages. | |
oc adm release extract --from=quay.io/hexfusion/origin-release:v4.2 --to=release-image | |
#######################################
# Run git from the alpine/git container image instead of a host install.
# The current directory is mounted at /root and at /git; the latter carries
# the ':z' volume option.
# Arguments: passed through unchanged to the container's git entrypoint.
# Returns:   exit status of the podman invocation.
#######################################
git() {
  # Volume specs are quoted so a working directory containing whitespace
  # still reaches podman as a single argument.
  sudo podman run --rm \
    --volume "${PWD}:/root" \
    --volume "${PWD}:/git:z" \
    alpine/git "$@"
}
# regex to parse kubelet | |
https://gist.github.com/hexfusion/88e45f9d2c0ce6530bd4e3fa0bd9cfde | |
# cleanup clusters | |
# Destroy every cluster whose install dir sits directly under ./clusters/aws or ./clusters/gcp.
# -mindepth 1 keeps find from also running destroy on the provider directories themselves.
find ./clusters/{aws,gcp} -mindepth 1 -maxdepth 1 -type d -exec bash -c 'openshift-install --dir "$1" destroy cluster' _ {} \;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
dump prom