Last active
January 20, 2023 08:46
-
-
Save aneagoe/6e18aaff48333ec059d0c1283b06813f to your computer and use it in GitHub Desktop.
k8s garbage collector daemon set (for https://github.com/kubernetes/kubernetes/issues/106957)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# ConfigMap carrying the garbage-collector shell script under the key `gc`.
# The DaemonSet in this gist mounts that key as /tmp/gc.sh via subPath.
apiVersion: v1
metadata:
  name: gc-script
  namespace: garbage-collector
kind: ConfigMap
data:
  # Literal block scalar with the final newline stripped (`|-`). Every line of
  # the script must be indented deeper than `gc:` to belong to this value.
  gc: |-
#!/bin/bash
# set defaults
# Seconds to pause between garbage-collection runs; can be overridden with -s.
SLEEP_INTERVAL=300
# Cgroup scope names of the pods seen by the latest gc_pods run; gc_cgroups
# reads this list to avoid touching scopes that belong to those pods.
POD_SCOPES=()
# logger MESSAGE...
# Write MESSAGE to stdout, prefixed with the current UTC time in ISO-8601
# format (seconds precision). Multiple arguments are joined with single spaces.
# NOTE: being a shell function, this takes precedence over any external
# `logger` command on PATH for the rest of the script.
logger()
{
    printf '%s %s\n' "$(TZ=UTC date --iso-8601=seconds)" "$*"
}
# usage
# Print a one-line synopsis of the script's command-line options to stdout.
usage()
{
    echo "usage: $0 [-s SLEEP_INTERVAL (seconds)]"
}
# Parse command-line options:
#   -h            print usage and exit
#   -s SECONDS    override SLEEP_INTERVAL
# The leading ':' in the optstring puts getopts in silent mode, so unknown
# options and missing arguments have to be reported here. They are logged and
# otherwise ignored, so the script still starts with its defaults.
while getopts ":s:h" opt; do
    case "${opt}" in
        h )
            usage
            exit 0
            ;;
        s )
            SLEEP_INTERVAL=${OPTARG}
            ;;
        : )
            logger "Option -${OPTARG} requires an argument, ignoring it"
            ;;
        \? )
            logger "Unknown option -${OPTARG}, ignoring it"
            ;;
    esac
done

# Only non-negative integers are accepted: a negative value would be parsed by
# `sleep` as an option, the sleep would fail, and the main loop would then run
# without pausing.
if [[ ! ${SLEEP_INTERVAL} =~ ^[0-9]+$ ]]; then
    logger "${SLEEP_INTERVAL} is not a non-negative integer"
    usage
    exit 1
fi
# gc_pods
# Walk every pod sandbox reported by `crictl pods -q` and delete the ones that
# are no longer known to the Kubernetes API and have no processes left in their
# network namespace.
#
# Side effects:
#   - Rebuilds the global POD_SCOPES array with the cgroup scope name of every
#     inspected pod (gc_cgroups consults it to skip scopes of existing pods).
#   - May call `crictl stopp` / `crictl rmp` on orphaned pod sandboxes.
#
# Any pod whose metadata cannot be read is skipped rather than deleted.
gc_pods(){
    local POD_IDS POD_ID JSONDUMP POD_NAME POD_SCOPE POD_NAMESPACE POD_CREATED POD_NS pid_list
    local POD_PIDS=()
    local kubeconfig=/var/lib/kubelet/kubeconfig

    POD_IDS=($(crictl pods -q))
    POD_SCOPES=()

    for POD_ID in "${POD_IDS[@]}"; do
        if ! JSONDUMP="$(crictl inspectp "${POD_ID}")" || [[ -z "${JSONDUMP}" ]]; then
            logger "Error inspecting pod with ID ${POD_ID}"
            continue
        fi

        # `// empty` makes jq print nothing (instead of the literal "null") for
        # a missing field, so the -z checks below actually catch it. The JSON is
        # passed quoted so it is not word-split or glob-expanded by the shell.
        POD_SCOPE="$(jq -r '.info.runtimeSpec.linux.cgroupsPath // empty' <<<"${JSONDUMP}" | awk -F: '{print "crio-" $NF ".scope"}')"
        if [[ -z "${POD_SCOPE}" ]]; then
            logger "Error fetching pod SCOPE for pod with ID ${POD_ID}"
            continue
        fi
        # Recorded before the remaining checks so the scope stays protected in
        # gc_cgroups even if this pod is skipped further down.
        POD_SCOPES+=("${POD_SCOPE}")

        POD_NAME="$(jq -r '.status.metadata.name // empty' <<<"${JSONDUMP}")"
        if [[ -z "${POD_NAME}" ]]; then
            logger "Error fetching pod NAME for pod with ID ${POD_ID}"
            continue
        fi

        POD_NAMESPACE="$(jq -r '.status.metadata.namespace // empty' <<<"${JSONDUMP}")"
        if [[ -z "${POD_NAMESPACE}" ]]; then
            logger "Error fetching pod NAMESPACE for pod with ID ${POD_ID}"
            continue
        fi

        # Not used below; fetched only as a sanity check on the inspect output.
        POD_CREATED="$(jq -r '.status.createdAt // empty' <<<"${JSONDUMP}")"
        if [[ -z "${POD_CREATED}" ]]; then
            logger "Error fetching pod created timestamp for pod with ID ${POD_ID}"
            continue
        fi

        # Last path component of the namespace paths that contain "run".
        POD_NS="$(jq -r '.info.runtimeSpec.linux.namespaces[]|.path' <<<"${JSONDUMP}" | grep run | awk -F/ '{print $NF}' | sort -u)"
        if [[ -z "${POD_NS}" ]]; then
            logger "Error fetching pod namespace for pod with ID ${POD_ID}"
            continue
        fi

        POD_PIDS=()
        if ip netns list | grep -qF -- "${POD_NS}"; then
            # Capture into a scalar first so the exit status of `ip netns pids`
            # itself is what gets tested.
            if ! pid_list="$(ip netns pids "${POD_NS}")"; then
                logger "Error fetching pod PIDs for pod ${POD_NAME}"
                continue
            fi
            POD_PIDS=(${pid_list})
        fi

        # check if pod is known to k8s control plane
        if KUBECONFIG="${kubeconfig}" kubectl get pod "${POD_NAME}" -n "${POD_NAMESPACE}" &>/dev/null; then
            # logger "Pod ${POD_NAME} in namespace ${POD_NAMESPACE} is still known to control plane, skipping..."
            continue
        fi

        # additional check for safety, making sure that if there's a problem with apiserver we don't blindly remove pods with running processes
        if ! KUBECONFIG="${kubeconfig}" kubectl get --raw='/readyz' &>/dev/null; then
            logger "Kubernetes API unavailable. Could be false positive so skipping deletion of POD ${POD_NAME}"
            continue
        fi

        if [[ ${#POD_PIDS[@]} -eq 0 ]]; then
            logger "Found POD ${POD_NAME} unknown to k8s control plane and without any PIDs, will delete it..."
            # Removal is attempted even if the stop failed.
            crictl stopp "${POD_ID}" || logger "Failed to stop POD ${POD_NAME}"
            crictl rmp "${POD_ID}" || logger "Error removing POD ${POD_NAME}"
        fi
    done
}
# gc_cgroups
# Remove leftover `crio-*.scope` cgroup directories that the journal has
# complained about during the last 10 minutes and that do not belong to a pod
# recorded in the global POD_SCOPES array (filled by gc_pods).
#
# Candidate scopes come from two kinds of journal messages:
#   1. "Failed to update stats for container" / "Failed to create existing
#      container" lines, which name the scope directly.
#   2. "Unable to fetch pod log stats" lines, from which the trailing
#      '_'-separated field of the logged /var... path is taken, its dashes are
#      turned into underscores, and the result is looked up among the cgroup
#      directories to find the scopes underneath it.
#
# Directories are removed with rmdir only; a failed scope removal leaves the
# parent directory untouched.
gc_cgroups(){
    local CGROUPDIRS LEFTOVER_SCOPES_TMP LEFTOVER_SCOPES PODID SCOPENAME SCOPE
    # Same look-back window for both journal queries.
    local since="10m ago"

    CGROUPDIRS=($(find /sys/fs/cgroup -type d))

    LEFTOVER_SCOPES_TMP=($(journalctl --since "${since}" \
        | grep -E 'Failed to update stats for container|Failed to create existing container' \
        | grep -o 'crio-.*scope' \
        | sort -u))

    for PODID in $(journalctl --since "${since}" \
        | grep 'Unable to fetch pod log stats' \
        | grep -o '/var.*:' \
        | tr -d ':' \
        | awk -F_ '{print $NF}' \
        | sed 's/-/_/g' \
        | sort -u); do
        # Look the ID up among the cgroup directories collected above.
        LEFTOVER_SCOPES_TMP+=($(printf -- '%s\n' "${CGROUPDIRS[@]}" | grep -F -- "${PODID}" | grep -o 'crio.*$'))
    done

    # De-duplicate: the same scope name can be collected more than once.
    LEFTOVER_SCOPES=($(printf -- '%s\n' "${LEFTOVER_SCOPES_TMP[@]}" | sort -u))

    for SCOPENAME in "${LEFTOVER_SCOPES[@]}"; do
        if [[ " ${POD_SCOPES[*]} " =~ " ${SCOPENAME} " ]]; then
            logger "Scope ${SCOPENAME} found under running pod, skipping..."
            continue
        fi

        for SCOPE in $(printf -- '%s\n' "${CGROUPDIRS[@]}" | grep -F -- "${SCOPENAME}"); do
            logger "Removing CGROUP ${SCOPENAME} and its parent..."
            if rmdir "${SCOPE}"; then
                rmdir "$(dirname "${SCOPE}")" || logger "Failed to remove parent for CGROUP ${SCOPE}..."
            fi
        done
    done
}
# sleep for 1m to allow garbage collector to setup properly and avoid pod start race-condition
sleep 60

# Main loop: run both collectors, then pause for SLEEP_INTERVAL seconds.
# A run is skipped entirely when the API server's /readyz endpoint cannot be
# queried with the kubelet's kubeconfig.
while true; do
    logger "Starting k8s garbage collector run..."
    if KUBECONFIG=/var/lib/kubelet/kubeconfig kubectl get --raw='/readyz' &>/dev/null; then
        # Order matters: gc_pods rebuilds POD_SCOPES, which gc_cgroups reads.
        gc_pods
        gc_cgroups
    else
        logger "Kubernetes API unavailable. Cancelling run."
    fi
    logger "Sleeping for ${SLEEP_INTERVAL} seconds..."
    sleep "${SLEEP_INTERVAL}"
done
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# DaemonSet that runs the garbage-collector script on every Linux node.
# The container is privileged, shares the host network and PID namespaces and
# mounts the node's root filesystem at /host; the script is copied into the
# host's /tmp and executed inside a chroot of /host.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: k8s-gc
  namespace: garbage-collector
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: k8s-gc
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: k8s-gc
    spec:
      containers:
        - command: ["/bin/sh"]
          # Steps are chained with && so that a failed copy or chmod stops the
          # container instead of running a stale or missing /host/tmp/gc.sh.
          args: ["-c", "cp /tmp/gc.sh /host/tmp/gc.sh && chmod +x /host/tmp/gc.sh && chroot /host ./tmp/gc.sh -s $SLEEP_INTERVAL"]
          image: image-registry.openshift-image-registry.svc:5000/openshift/cli
          imagePullPolicy: IfNotPresent
          name: garbage-collector
          env:
            # Seconds between runs; passed to the script's -s option.
            - name: SLEEP_INTERVAL
              value: "600"
          securityContext:
            privileged: true
            runAsUser: 0
          volumeMounts:
            - mountPath: /host
              name: host
            # Single ConfigMap key `gc` mounted as a file.
            - mountPath: "/tmp/gc.sh"
              subPath: gc
              name: gc-script
      enableServiceLinks: true
      hostNetwork: true
      hostPID: true
      serviceAccount: garbage-collector
      serviceAccountName: garbage-collector
      terminationGracePeriodSeconds: 30
      volumes:
        - name: gc-script
          configMap:
            name: gc-script
        # The node's entire root filesystem.
        - hostPath:
            path: /
            type: Directory
          name: host
      nodeSelector:
        kubernetes.io/os: linux
      # Tolerate every taint.
      tolerations:
        - operator: Exists
  # Pods are only replaced when they are deleted manually.
  updateStrategy:
    type: OnDelete
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Namespace that holds the garbage-collector ConfigMap, DaemonSet and RBAC objects.
apiVersion: v1
kind: Namespace
metadata:
  name: garbage-collector
  annotations:
    # Empty node selector for this namespace.
    # NOTE(review): presumably overrides a cluster-wide default project node
    # selector so the DaemonSet can be scheduled on all nodes; confirm.
    openshift.io/node-selector: ""
  labels:
    openshift.io/cluster-monitoring: "true"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Service account referenced by the k8s-gc DaemonSet pods.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: garbage-collector
  namespace: garbage-collector
---
# Binds the garbage-collector service account to the
# system:openshift:scc:anyuid ClusterRole within this namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: system:openshift:scc:anyuid
  namespace: garbage-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:openshift:scc:anyuid
subjects:
  - kind: ServiceAccount
    name: garbage-collector
    namespace: garbage-collector
---
# Binds the garbage-collector service account to the
# system:openshift:scc:privileged ClusterRole within this namespace; the
# DaemonSet's container requests `privileged: true`.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: system:openshift:scc:privileged
  namespace: garbage-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:openshift:scc:privileged
subjects:
  - kind: ServiceAccount
    name: garbage-collector
    namespace: garbage-collector
Works great, thx!
Just got many (non blocking) error messages like this:
Error: Peer netns reference is invalid.
I can investigate, but maybe you know right off the bat whether it is a concern? For example, could some network namespace leakage be left behind?
It's harmless. Not sure why it's happening though.
Running the script directly on the node (i.e. not via the DaemonSet) doesn't have this issue. However, when running through the DaemonSet/chroot, you get that error whenever you run `ip a` or `ip netns list`. I didn't spend too much time on it since it's only used in sanity checks. If you're curious and look into it, let me know if you find something :)
Updated image as suggested by @kai-uwe-rommel
Updated configmap using suggestions from @msteenhu.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Review, adjust to your liking and run at your own risk. I have tested this on both my test and prod clusters and there it behaves fine (OKD 4.9, OKD 4.10).
This assumes an OpenShift/OKD deployment; on regular Kubernetes, some of the role bindings and labels/annotations won't be required.
The garbage collector tries to address a few issues of either leaking pods (https://bugzilla.redhat.com/show_bug.cgi?id=2074820) or log spam on nodes (https://bugzilla.redhat.com/show_bug.cgi?id=2080253, kubernetes/kubernetes#106957).