# rke2 etcd check & troubleshooting ## etcd static pod using kubectl * etcdctl 檢查性能 ``` $ for etcdpod in $(kubectl -n kube-system get pod -l component=etcd --no-headers -o custom-columns=NAME:.metadata.name); do kubectl -n kube-system exec $etcdpod -- sh -c "ETCDCTL_ENDPOINTS='https://127.0.0.1:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl check perf"; done ``` * etcdctl 後端 status 可以查看誰是 leader,並且 member 有哪些 ``` $ for etcdpod in $(kubectl -n kube-system get pod -l component=etcd --no-headers -o custom-columns=NAME:.metadata.name); do kubectl -n kube-system exec $etcdpod -- sh -c "ETCDCTL_ENDPOINTS='https://127.0.0.1:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl endpoint status"; done ``` ``` # 第一個 true 代表這台是 leader # ENDPOINT|ID|VERSION|DB SIZE|IS LEADER|IS LEARNER|RAFT TERM|RAFT INDEX|RAFT APPLIED INDEX|ERRORS https://127.0.0.1:2379, cbb02c5017743456, 3.5.5, 24 MB, true, false, 2, 67715, 67715, ``` * etcdctl endpoint health * 檢查同步時間 ``` $ for etcdpod in $(kubectl -n kube-system get pod -l component=etcd --no-headers -o custom-columns=NAME:.metadata.name); do kubectl -n kube-system exec $etcdpod -- sh -c "ETCDCTL_ENDPOINTS='https://127.0.0.1:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl endpoint health"; done ``` * etcdctl 警告列表 ``` $ for etcdpod in $(kubectl -n kube-system get pod -l component=etcd --no-headers -o custom-columns=NAME:.metadata.name); do kubectl -n kube-system exec $etcdpod -- sh -c 
"ETCDCTL_ENDPOINTS='https://127.0.0.1:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl alarm list"; done ``` * etcdctl 壓縮歷史版本(compact) ``` $ rev=$(kubectl -n kube-system exec $(kubectl -n kube-system get pod -l component=etcd --no-headers -o custom-columns=NAME:.metadata.name | head -1) -- sh -c "ETCDCTL_ENDPOINTS='https://127.0.0.1:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl endpoint status --write-out fields | grep Revision | cut -d: -f2") kubectl -n kube-system exec $(kubectl -n kube-system get pod -l component=etcd --no-headers -o custom-columns=NAME:.metadata.name | head -1) -- sh -c "ETCDCTL_ENDPOINTS='https://127.0.0.1:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl compact \"$(echo $rev)\"" ``` * etcdctl 碎片整理,如果發現 DB SIZE 不一致,可以透過 defrag 整理 etcd 之間的 DB SIZE ``` $ kubectl -n kube-system exec $(kubectl -n kube-system get pod -l component=etcd --no-headers -o custom-columns=NAME:.metadata.name | head -1) -- sh -c "ETCDCTL_ENDPOINTS='https://127.0.0.1:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl defrag --cluster" ``` * 發現 etcd DB SIZE 大小不一致 ``` $ ETCDCTL_ENDPOINTS='https://127.0.0.1:2379,https://192.168.11.143:2379,https://192.168.11.144:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' 
ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl endpoint status -w table +-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ | ENDPOINT | ID | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS | +-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ | https://127.0.0.1:2379 | bcde571486a77aca | 3.5.13 | 119 MB | false | false | 13 | 249407 | 249407 | | | https://192.168.11.143:2379 | ad7c7c84e3f9412e | 3.5.13 | 119 MB | true | false | 13 | 249408 | 249408 | | | https://192.168.11.144:2379 | e29ceba20fb42eff | 3.5.13 | 42 MB | false | false | 13 | 249409 | 249409 | | +-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ # 整理 etcd 大小 $ ETCDCTL_ENDPOINTS='https://127.0.0.1:2379,https://192.168.11.143:2379,https://192.168.11.144:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl defrag --cluster # 再次檢查三個 etcd DB SIZE 大小都一致了 $ ETCDCTL_ENDPOINTS='https://127.0.0.1:2379,https://192.168.11.143:2379,https://192.168.11.144:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl endpoint status -w table +-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ | ENDPOINT | ID 
| VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS | +-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ | https://127.0.0.1:2379 | bcde571486a77aca | 3.5.13 | 38 MB | false | false | 13 | 251304 | 251304 | | | https://192.168.11.143:2379 | ad7c7c84e3f9412e | 3.5.13 | 38 MB | true | false | 13 | 251304 | 251304 | | | https://192.168.11.144:2379 | e29ceba20fb42eff | 3.5.13 | 38 MB | false | false | 13 | 251304 | 251304 | | +-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ ``` * 不進入 pod 直接使用 etcdctl 查詢,需先安裝 etcdctl 指令 ``` $ ETCDCTL_ENDPOINTS='https://127.0.0.1:2379,https://192.168.11.143:2379,https://192.168.11.144:2379' ETCDCTL_CACERT='/var/lib/rancher/rke2/server/tls/etcd/server-ca.crt' ETCDCTL_CERT='/var/lib/rancher/rke2/server/tls/etcd/server-client.crt' ETCDCTL_KEY='/var/lib/rancher/rke2/server/tls/etcd/server-client.key' ETCDCTL_API=3 etcdctl endpoint status -w table +-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ | ENDPOINT | ID | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS | +-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ | https://127.0.0.1:2379 | bcde571486a77aca | 3.5.13 | 119 MB | false | false | 13 | 249407 | 249407 | | | https://192.168.11.143:2379 | ad7c7c84e3f9412e | 3.5.13 | 119 MB | true | false | 13 | 249408 | 249408 | | | https://192.168.11.144:2379 | e29ceba20fb42eff | 3.5.13 | 42 MB | false | false | 13 | 249409 | 249409 | | 
+-----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ ``` ### rke2 etcd troubleshooting * 要加入第三個節點失敗,看 `journalctl -u rke2-server.service` 發現可能是 etcd 問題 ``` Apr 18 16:07:32 antony-rsm3 rke2[12137]: time="2023-04-18T16:07:32+08:00" level=fatal msg="ETCD join failed: duplicate node name found, please use a unique name for this node" ``` * 查看 etcd 只有兩個節點,但是進入 etcd pod 發現 * member 有三個,`rsm3-2126c18e` 應該就是他搶到名字,因此把它移除 ``` rancher@antony-rsm1:~> kubectl get pod -n kube-system NAME READY STATUS RESTARTS AGE cloud-controller-manager-rsm1 1/1 Running 3 (11m ago) 3h22m cloud-controller-manager-rsm2 1/1 Running 2 (11m ago) 163m etcd-rsm1 1/1 Running 1 (12m ago) 3h22m etcd-rsm2 1/1 Running 1 (12m ago) 163m helm-install-rke2-canal-qxj99 0/1 Completed 0 3h23m helm-install-rke2-coredns-dzkjq 0/1 Completed 0 3h23m helm-install-rke2-ingress-nginx-c9rj5 0/1 Completed 0 3h23m helm-install-rke2-metrics-server-ts5h5 0/1 Completed 0 3h23m kube-apiserver-rsm1 1/1 Running 1 (12m ago) 3h23m kube-apiserver-rsm2 1/1 Running 1 (12m ago) 163m kube-controller-manager-rsm1 1/1 Running 2 (12m ago) 3h23m kube-controller-manager-rsm2 1/1 Running 2 (11m ago) 163m kube-proxy-rsm1 1/1 Running 1 (12m ago) 3h23m kube-proxy-rsm2 1/1 Running 1 (12m ago) 163m kube-scheduler-rsm1 1/1 Running 1 (12m ago) 3h22m kube-scheduler-rsm2 1/1 Running 1 (12m ago) 163m rke2-canal-2lfmv 2/2 Running 2 (12m ago) 163m rke2-canal-f5fnn 2/2 Running 2 (12m ago) 3h22m rke2-coredns-rke2-coredns-545d64676-czkn5 1/1 Running 1 (12m ago) 163m rke2-coredns-rke2-coredns-545d64676-dmghg 1/1 Running 1 (12m ago) 3h22m rke2-coredns-rke2-coredns-autoscaler-5dd676f5c7-v7h2t 1/1 Running 1 (12m ago) 3h22m rke2-ingress-nginx-controller-hgfh4 1/1 Running 1 3h21m rke2-ingress-nginx-controller-plx6r 1/1 Running 1 (12m ago) 163m rke2-metrics-server-6564db4569-hnrt2 1/1 Running 1 (12m ago) 3h22m rancher@antony-rsm1:~> kubectl -n kube-system exec -it 
etcd-rsm1 -- bash bash-4.2# etcdctl --cert /var/lib/rancher/rke2/server/tls/etcd/server-client.crt --key /var/lib/rancher/rke2/server/tls/etcd/server-client.key --endpoints https://127.0.0.1:2379 --cacert /var/lib/rancher/rke2/server/tls/etcd/server-ca.crt member list 121f9e856446fcc4, started, rsm2-f826f76c, https://192.168.11.62:2380, https://192.168.11.62:2379, false 67f4840cc79f9560, started, rsm1-168e267e, https://192.168.11.61:2380, https://192.168.11.61:2379, false 6b20c6a3411cab14, started, rsm3-2126c18e, https://192.168.11.63:2380, https://192.168.11.63:2379, false # 移除後第三個節點就可以順利加回來了 bash-4.2# etcdctl --cert /var/lib/rancher/rke2/server/tls/etcd/server-client.crt --key /var/lib/rancher/rke2/server/tls/etcd/server-client.key --endpoints https://127.0.0.1:2379 --cacert /var/lib/rancher/rke2/server/tls/etcd/server-ca.crt member remove 6b20c6a3411cab14 Member 6b20c6a3411cab14 removed from cluster 9ea915bf6adaa894 bash-4.2# etcdctl --cert /var/lib/rancher/rke2/server/tls/etcd/server-client.crt --key /var/lib/rancher/rke2/server/tls/etcd/server-client.key --endpoints https://127.0.0.1:2379 --cacert /var/lib/rancher/rke2/server/tls/etcd/server-ca.crt member list 121f9e856446fcc4, started, rsm2-f826f76c, https://192.168.11.62:2380, https://192.168.11.62:2379, false 67f4840cc79f9560, started, rsm1-168e267e, https://192.168.11.61:2380, https://192.168.11.61:2379, false bash-4.2# exit ``` ### rke etcd troubleshooting * 此時還沒有 m2 這台機器,這時再加入會有問題,因此需把它移除 ``` $ docker exec etcd etcdctl member list 152a703c84689c2b, started, etcd-m3, https://192.168.11.73:2380, https://192.168.11.73:2379, false 79ffc5f67991d1f9, started, etcd-d1, https://192.168.11.70:2380, https://192.168.11.70:2379, false b474e4803693b4d4, started, etcd-m2, https://192.168.11.72:2380, https://192.168.11.72:2379, false ``` * 移除舊的 m2 節點資訊 ``` $ docker exec etcd etcdctl member remove b474e4803693b4d4 Member b474e4803693b4d4 removed from cluster 70e7b07b0f3befab ``` * 檢查 ``` $ docker exec etcd etcdctl 
member list 152a703c84689c2b, started, etcd-m3, https://192.168.11.73:2380, https://192.168.11.73:2379, false 79ffc5f67991d1f9, started, etcd-d1, https://192.168.11.70:2380, https://192.168.11.70:2379, false ``` ###### tags: `trouble`