-
-
Save charlychiu/1b46cd60803e0871bbcf95637e0f40f9 to your computer and use it in GitHub Desktop.
k8s garbage collector daemon set (for https://github.com/kubernetes/kubernetes/issues/106957)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| apiVersion: v1 | |
| metadata: | |
| name: gc-script | |
| namespace: garbage-collector | |
| kind: ConfigMap | |
| data: | |
| gc: |- | |
| #!/bin/bash | |
| # set defaults | |
| SLEEP_INTERVAL=300 | |
| POD_SCOPES=() | |
| logger() | |
| { | |
| echo "`TZ=UTC date --iso-8601=seconds` $@" | |
| } | |
| usage() | |
| { | |
| echo "usage: $0 [-s SLEEP_INTERVAL (seconds)]" | |
| } | |
| while getopts ":s:h" opt; do | |
| case $opt in | |
| h ) | |
| usage | |
| exit 0 | |
| ;; | |
| s ) | |
| SLEEP_INTERVAL=${OPTARG} | |
| ;; | |
| esac | |
| done | |
| if [[ ${SLEEP_INTERVAL} != ?(-)+([0-9]) ]]; then | |
| logger "${SLEEP_INTERVAL} is not an integer" | |
| usage | |
| exit 1 | |
| fi | |
| gc_pods(){ | |
| POD_IDS=($(crictl pods -q)) | |
| POD_SCOPES=() | |
| for POD_ID in ${POD_IDS[@]}; do | |
| JSONDUMP="`crictl inspectp ${POD_ID}`" | |
| POD_NAME="`echo ${JSONDUMP} | jq -r '.status.metadata.name'`" | |
| POD_SCOPE="`echo ${JSONDUMP} | jq -r '.info.runtimeSpec.linux.cgroupsPath' | awk -F: '{print "crio-" $NF ".scope"}'`" | |
| if [[ $? -ne 0 || -z "${POD_SCOPE}" ]]; then | |
| logger "Error fetching pod SCOPE for pod with ID ${POD_ID}" | |
| continue | |
| else | |
| POD_SCOPES+=($POD_SCOPE) | |
| fi | |
| POD_NAMESPACE="`echo ${JSONDUMP} | jq -r '.status.metadata.namespace'`" | |
| if [[ $? -ne 0 || -z "${POD_NAMESPACE}" ]]; then | |
| logger "Error fetching pod NAMESPACE for pod with ID ${POD_ID}" | |
| continue | |
| fi | |
| POD_CREATED="`echo ${JSONDUMP} | jq -r '.status.createdAt'`" | |
| if [[ $? -ne 0 || -z "${POD_CREATED}" ]]; then | |
| logger "Error fetching pod created timestamp for pod with ID ${POD_ID}" | |
| continue | |
| fi | |
| POD_NS="`echo ${JSONDUMP} | jq -r '.info.runtimeSpec.linux.namespaces[]|.path' | grep run | awk -F\/ '{print $NF}' | sort -u`" | |
| if [[ $? -ne 0 || -z "${POD_NS}" ]]; then | |
| logger "Error fetching pod namespace for pod with ID ${POD_ID}" | |
| continue | |
| fi | |
| if ip netns list | grep -q ${POD_NS}; then | |
| POD_PIDS=($(ip netns pids ${POD_NS})) | |
| if [[ $? -ne 0 ]]; then | |
| logger "Error fetching pod PIDs for pod ${POD_NAME}" | |
| continue | |
| fi | |
| else | |
| POD_PIDS=() | |
| fi | |
| # check if pod is known to k8s control plane | |
| KUBECONFIG=/var/lib/kubelet/kubeconfig kubectl get pod ${POD_NAME} -n ${POD_NAMESPACE} &>/dev/null | |
| if [[ $? -ne 0 ]]; then | |
| # additional check for safety, making sure that if there's a problem with apiserver we don't blindly remove pods with running processes | |
| if [[ ${#POD_PIDS[@]} -eq 0 ]]; then | |
| logger "Found POD ${POD_NAME} unknown to k8s control plane and without any PIDs, will delete it..." | |
| crictl stopp ${POD_ID} || logger "Failed to stop POD ${POD_NAME}" | |
| crictl rmp ${POD_ID} || logger "Error removing POD ${POD_NAME}" | |
| fi | |
| # else | |
| # logger "Pod ${POD_NAME} in namespace ${POD_NAMESPACE} is still known to control plane, skipping..." | |
| fi | |
| done | |
| } | |
| gc_cgroups(){ | |
| CGROUPDIRS=($(find /sys/fs/cgroup -type d)) | |
| LEFTOVER_SCOPES_TMP=($(journalctl --since "10m ago" | egrep 'Failed to update stats for container|Failed to create existing container' | grep -o 'crio-.*scope' | sort -u)) | |
| for PODID in `journalctl --since "10 ago" | grep 'Unable to fetch pod log stats' | grep -o '\/var.*:' | tr -d ':' | awk -F_ '{print $NF}' | sed 's/-/_/g' | sort -u`; do | |
| LEFTOVER_SCOPES_TMP+=($(printf -- '%s\n' "${testarray[@]}" | grep ${PODID} | grep -o crio.*$)) | |
| done | |
| LEFTOVER_SCOPES=($(printf -- '%s\n' "${LEFTOVER_SCOPES_TMP[@]}" | sort -u)) | |
| for SCOPENAME in ${LEFTOVER_SCOPES[@]}; do | |
| if [[ " ${POD_SCOPES[*]} " =~ " ${SCOPENAME} " ]]; then | |
| logger "Scope ${SCOPENAME} found under running pod, skipping..." | |
| continue | |
| else | |
| for SCOPE in `printf -- '%s\n' "${CGROUPDIRS[@]}" | grep ${SCOPENAME}`; do | |
| logger "Removing CGROUP ${SCOPENAME} and its parent..." | |
| rmdir ${SCOPE} | |
| if [[ $? -eq 0 ]]; then | |
| rmdir `dirname ${SCOPE}` | |
| if [[ $? -ne 0 ]]; then | |
| logger "Failed to remove parent for CGROUP ${SCOPE}..." | |
| fi | |
| fi | |
| done | |
| fi | |
| done | |
| } | |
| cleanup_misc_cgroups(){ | |
| # From logging from the last 10 minutes (this could be relative to the configurable timeframe of the script) | |
| # extract all path does not exist errors. these happen at log level 2. | |
| # sort and then only get uniques. (the housekeeping loop triggering them will run multiple times in 10 minutes). | |
| # find the accompaning cgroup directory. (to make the workaround broader one could start from /sys/fs/cgroup, from what i saw in my case it was the misc cgroup that was left dangling) | |
| for cgroupdir in $(journalctl --since="10 min ago" | grep "Path does not exist" | sed -E 's|.*/var/lib/kubelet/pods/([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}).*|\1|' | sed 's/-/_/g' | sort | uniq | xargs -I {} find /sys/fs/cgroup/misc -name "*{}.slice"); do | |
| echo "cleaning up cgroupdir: ${cgroupdir}" | |
| # because we can't rmdir the cgroup without closing the scopes, first find all scopes in the cgroup | |
| for scopedir in $(find ${cgroupdir} -name "*.scope" -type d); do | |
| echo "cleaning up scopedir: ${scopedir}" | |
| # and close them | |
| rmdir ${scopedir} | |
| done | |
| # and close the cgroup | |
| rmdir ${cgroupdir} | |
| done | |
| } | |
| # sleep for 1m to allow garbage collector to setup properly and avoid pod start race-condition | |
| sleep 60 | |
| while true; do | |
| logger "Starting k8s garbage collector run..." | |
| gc_pods | |
| gc_cgroups | |
| cleanup_misc_cgroups | |
| logger "Sleeping for ${SLEEP_INTERVAL} seconds..." | |
| sleep ${SLEEP_INTERVAL} | |
| done |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| apiVersion: apps/v1 | |
| kind: DaemonSet | |
| metadata: | |
| name: k8s-gc | |
| namespace: garbage-collector | |
| spec: | |
| revisionHistoryLimit: 10 | |
| selector: | |
| matchLabels: | |
| app: k8s-gc | |
| template: | |
| metadata: | |
| creationTimestamp: null | |
| labels: | |
| app: k8s-gc | |
| spec: | |
| containers: | |
| - command: ["/bin/sh"] | |
| args: ["-c", "cp /tmp/gc.sh /host/tmp/gc.sh ; chmod +x /host/tmp/gc.sh ; chroot /host ./tmp/gc.sh -s $SLEEP_INTERVAL"] | |
| image: quay.io/openshift/okd-content@sha256:452b2b079bf21bb715d6c915ed38103cc6000c5c444e6484e70660535474a86c | |
| imagePullPolicy: IfNotPresent | |
| name: garbage-collector | |
| env: | |
| - name: SLEEP_INTERVAL | |
| value: "600" | |
| securityContext: | |
| privileged: true | |
| runAsUser: 0 | |
| volumeMounts: | |
| - mountPath: /host | |
| name: host | |
| - mountPath: "/tmp/gc.sh" | |
| subPath: gc | |
| name: gc-script | |
| enableServiceLinks: true | |
| hostNetwork: true | |
| hostPID: true | |
| serviceAccount: garbage-collector | |
| serviceAccountName: garbage-collector | |
| terminationGracePeriodSeconds: 30 | |
| volumes: | |
| - name: gc-script | |
| configMap: | |
| name: gc-script | |
| - hostPath: | |
| path: / | |
| type: Directory | |
| name: host | |
| nodeSelector: | |
| kubernetes.io/os: linux | |
| tolerations: | |
| - operator: Exists | |
| updateStrategy: | |
| type: OnDelete |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| apiVersion: v1 | |
| kind: Namespace | |
| metadata: | |
| name: garbage-collector | |
| annotations: | |
| openshift.io/node-selector: "" | |
| labels: | |
| openshift.io/cluster-monitoring: "true" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| apiVersion: v1 | |
| kind: ServiceAccount | |
| metadata: | |
| name: garbage-collector | |
| namespace: garbage-collector | |
| --- | |
| apiVersion: rbac.authorization.k8s.io/v1 | |
| kind: RoleBinding | |
| metadata: | |
| name: system:openshift:scc:anyuid | |
| namespace: garbage-collector | |
| roleRef: | |
| apiGroup: rbac.authorization.k8s.io | |
| kind: ClusterRole | |
| name: system:openshift:scc:anyuid | |
| subjects: | |
| - kind: ServiceAccount | |
| name: garbage-collector | |
| namespace: garbage-collector | |
| --- | |
| apiVersion: rbac.authorization.k8s.io/v1 | |
| kind: RoleBinding | |
| metadata: | |
| name: system:openshift:scc:privileged | |
| namespace: garbage-collector | |
| roleRef: | |
| apiGroup: rbac.authorization.k8s.io | |
| kind: ClusterRole | |
| name: system:openshift:scc:privileged | |
| subjects: | |
| - kind: ServiceAccount | |
| name: garbage-collector | |
| namespace: garbage-collector |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment