metrics server, prometheus, vpa

Raw

# Download the Helm v3.2.1 client and install the binary into /usr/local/bin.
wget https://get.helm.sh/helm-v3.2.1-linux-amd64.tar.gz
tar -zxvf helm-v3.2.1-linux-amd64.tar.gz
mv linux-amd64/helm /usr/local/bin/helm
# The former kubernetes-charts.storage.googleapis.com bucket has been
# decommissioned; the archived "stable" charts are served from charts.helm.sh.
helm repo add stable https://charts.helm.sh/stable
helm repo update
# Install the chart as release "grafana" (current namespace; the commands
# below assume "default").
helm install grafana stable/grafana
# Forward local port 8081 on all interfaces to port 80 of the grafana service.
kubectl port-forward --address 0.0.0.0 -n default svc/grafana 8081:80
# grafana password: stored base64-encoded in the "grafana" secret; the trailing
# echo only adds a newline after the decoded value.
kubectl get secret --namespace default grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo

Raw

kubectl.md

https://kubernetes.io/zh/docs/tasks/administer-cluster/access-cluster-api/

# Check all possible clusters, as your kubeconfig may have multiple contexts:
kubectl config view -o jsonpath='{"Cluster name\tServer\n"}{range .clusters[*]}{.name}{"\t"}{.cluster.server}{"\n"}{end}'

# Select the name of the cluster you want to interact with from the output above:
export CLUSTER_NAME="some_server_name"

# Point to the API server by looking up the URL registered for that cluster name
APISERVER=$(kubectl config view -o jsonpath="{.clusters[?(@.name==\"$CLUSTER_NAME\")].cluster.server}")

# Get the token of the "default" service account and base64-decode it
TOKEN=$(kubectl get secrets -o jsonpath="{.items[?(@.metadata.annotations['kubernetes\.io/service-account\.name']=='default')].data.token}" | base64 -d)

# Explore the API with TOKEN. The URL is quoted so the expansion is passed to
# curl as a single word; --insecure skips verification of the server certificate.
curl -X GET "$APISERVER/api" --header "Authorization: Bearer $TOKEN" --insecure

# Print the kubeconfig to find the cluster name, then select it.
kubectl config view
export CLUSTER_NAME="local"

# Look up the API server URL registered for $CLUSTER_NAME
APISERVER=$(kubectl config view --output jsonpath="{.clusters[?(@.name==\"$CLUSTER_NAME\")].cluster.server}")

# Request the scale endpoint of the tapp named example-tapp, authenticating
# with the admin client certificate and key.
curl --insecure \
  --cacert /var/run/kubernetes/server-ca.crt \
  --cert /var/run/kubernetes/client-admin.crt \
  --key /var/run/kubernetes/client-admin.key \
  "$APISERVER/apis/apps.tkestack.io/v1/namespaces/default/tapps/example-tapp/scale"

Raw

kubelet.md

# Extract the bearer token from the kubeconfig: for every line containing
# "token", take the text after the first ':' and keep its first word.
token=$(grep token ~/.kube/config | awk -F: '{print $2}' | awk '{print $1}')
printf '%s\n' "$token"

# Query the kubelet summary endpoint on port 10250. $ip is not set in this
# snippet and must hold the node address. The URL is quoted because it contains
# '?', which the shell would otherwise treat as a glob character.
curl -k -H "Authorization: Bearer ${token}" "https://$ip:10250/stats/summary?only_cpu_and_memory=true"

Raw

metrics-server.md

https://github.com/kubernetes/community/blob/master/contributors/design-proposals/instrumentation/resource-metrics-api.md

# Check which metrics resources the API server exposes
kubectl api-resources | grep metrics

# Obtain a bearer token. Use ONE of the two assignments below; if both are run,
# the second overwrites the first.
# Option 1: extract the token from the kubeconfig.
token=$(grep token ~/.kube/config | awk -F: '{print $2}' | awk '{print $1}')
# Option 2: read the service-account token file (the path Kubernetes mounts into pods).
token=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)

printf '%s\n' "$token"


curl -H "Authorization: Bearer ${token}" -k https://metrics-server.kube-system

## list
curl -H "Authorization: Bearer ${token}" -k https://kubernetes/apis/metrics.k8s.io/v1beta1/namespaces/demo/pods/

## single pod
curl -H "Authorization: Bearer ${token}" -k https://metrics-server.kube-system/apis/metrics.k8s.io/v1beta1/namespaces/demo/pods/rami-56c4565dc4-mjsfw

## all pods
curl -H "Authorization: Bearer ${token}" -k https://kubernetes/apis/metrics.k8s.io/v1beta1/pods

# https://github.com/feiskyer/kubernetes-handbook/blob/master/en/addons/metrics.md
kubectl get --raw /apis/metrics.k8s.io/v1beta1/namespaces/default/pods

# external metrics server
# $APISERVER is not set in this snippet; set it to the API server URL first.
# The URLs are quoted so each expansion reaches curl as a single word.

curl -H "Authorization: Bearer $token" -k "$APISERVER/apis/external.metrics.k8s.io/v1beta1/"

## list
curl -H "Authorization: Bearer $token" -k "$APISERVER/apis/external.metrics.k8s.io/v1beta1/externalmetricvalues"

## get
curl -H "Authorization: Bearer $token" -k "$APISERVER/apis/external.metrics.k8s.io/v1beta1/externalmetricvalues/ufa_backend_totalcount_smpl"

# get with kubectl --raw
kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/externalmetricvalues"

Raw

prometheus.md

# Forward local port 8081 (all interfaces) to port 9090 of the prometheus service in kube-system.
kubectl --namespace kube-system port-forward --address 0.0.0.0 svc/prometheus 8081:9090

http://$ip:8081/graph?g0.range_input=1h&g0.expr=rate(container_cpu_usage_seconds_total%7Bpod_name%3D~%22.%2B%22%7D%5B5m%5D)%5B8d%3A%5D&g0.tab=1

http://$ip:8081/api/v1/query?query=kube_pod_container_resource_requests{pod_name=~"ramists-.+"}[1d:]
kube_pod_container_resource_requests{pod_name=~"ramists-.+"}[1d:]

Raw

vpa.md

# For the VPA recommender to load history from Prometheus, the recommender must
# be patched to drop the job label from its query (TKEStack removes the job
# label); otherwise no metrics are returned.
      - args:
        - --v=4
        - --storage=prometheus
        - --prometheus-address=http://prometheus.kube-system.svc.cluster.local:9090
        - --prometheus-cadvisor-job-name=
        - --container-name-label=pod_name
        - --cpu-histogram-decay-half-life=10m
        - --pod-recommendation-min-memory-mb=10

# Delete the APIService of TKE's hpa-metrics-server; it has a bug and cannot
# return metrics for all namespaces.

kubectl delete APIService v1beta1.metrics.k8s.io

# Deploy metrics-server following the official instructions
kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/download/v0.3.6/components.yaml

# You will then run into the following error:
# E0527 16:23:16.112915       1 manager.go:111] unable to fully collect metrics: [unable to fully scrape metrics from source kubelet_summary:x.x.x.x: unable to fetch metrics from Kubelet x.x.x.x (x.x.x.x): Get https://x.x.x.x:10250/stats/summary?only_cpu_and_memory=true: x509: certificate signed by unknown authority...]

# Add these arguments to metrics-server:
          - --kubelet-preferred-address-types=InternalIP,Hostname,ExternalIP
          - --kubelet-insecure-tls

Patch that makes the VPA recommender omit the job label from its Prometheus query: https://github.com/chenchun/autoscaler/commit/a974fdf5fa813ca276747cb523c4ddca54d141a8

diff --git a/vertical-pod-autoscaler/pkg/recommender/input/history/history_provider.go b/vertical-pod-autoscaler/pkg/recommender/input/history/history_provider.go
index 7dafa34c9..f965bb105 100644
--- a/vertical-pod-autoscaler/pkg/recommender/input/history/history_provider.go
+++ b/vertical-pod-autoscaler/pkg/recommender/input/history/history_provider.go
@@ -190,8 +190,12 @@ func (p *prometheusHistoryProvider) readLastLabels(res map[model.PodID]*PodHisto
 
 func (p *prometheusHistoryProvider) GetClusterHistory() (map[model.PodID]*PodHistory, error) {
        res := make(map[model.PodID]*PodHistory)
-       podSelector := fmt.Sprintf("job=\"%s\", %s=~\".+\", %s!=\"POD\", %s!=\"\"",
-               p.config.CadvisorMetricsJobName, p.config.CtrPodNameLabel,
+       var podSelector string
+       if p.config.CadvisorMetricsJobName != "" {
+               podSelector = fmt.Sprintf("job=\"%s\", ", p.config.CadvisorMetricsJobName)
+       }
+       podSelector = fmt.Sprintf("%s%s=~\".+\", %s!=\"POD\", %s!=\"\"",
+               podSelector, p.config.CtrPodNameLabel,
                p.config.CtrNameLabel, p.config.CtrNameLabel)
 
        // This query uses Prometheus Subquery notation, to gives us a result of a five minute cpu rate by default evaluated every 1minute for last config.HistoryLength days/hours/minutes. In order to change the evaluation step, you need change Prometheus global.evaluation_interval configuration parameter.

chenchun/grafana.md