# etcd ``` $ sudo docker ps -a -f=name=etcd$ fc3aa636f5c4 harbor.twlottery.com.internal/rancher/mirrored-coreos-etcd:v3.5.6 "/usr/local/bin/etcd…" 3 hours ago Up 3 hours etcd $ sudo docker logs etcd ``` * Check etcd members on all nodes * Check the etcd endpoint status * Check endpoint health * Check connectivity between nodes * Check alarms * Check for space issues * Increase the logging level to Debug ## rke etcd check * Check etcd Members ``` $ docker exec etcd etcdctl member list 152a703c84689c2b, started, etcd-m3, https://192.168.11.73:2380, https://192.168.11.73:2379, false 5ff1c0689a05d740, started, etcd-m2, https://192.168.11.72:2380, https://192.168.11.72:2379, false 79ffc5f67991d1f9, started, etcd-m1, https://192.168.11.70:2380, https://192.168.11.70:2379, false ``` * 檢查 Endpoint 狀態 * 正常 etcd 同步時間要在 100 毫秒以下 ``` $ docker exec -e ETCDCTL_ENDPOINTS=$(docker exec etcd etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ',') etcd etcdctl endpoint status --write-out table ``` * 檢查 Endpoint 健康狀態 ``` $ docker exec -e ETCDCTL_ENDPOINTS=$(docker exec etcd etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ',') etcd etcdctl endpoint health ``` * 檢查 Port TCP/2379 的連接狀況 ``` $ for endpoint in $(docker exec etcd etcdctl member list | cut -d, -f5); do echo "Validating connection to ${endpoint}/health" docker run --net=host -v $(docker inspect kubelet --format '{{ range .Mounts }}{{ if eq .Destination "/etc/kubernetes" }}{{ .Source }}{{ end }}{{ end }}')/ssl:/etc/kubernetes/ssl:ro appropriate/curl -s -w "\n" --cacert $(docker inspect -f '{{range $index, $value := .Config.Env}}{{if eq (index (split $value "=") 0) "ETCDCTL_CACERT" }}{{range $i, $part := (split $value "=")}}{{if gt $i 1}}{{print "="}}{{end}}{{if gt $i 0}}{{print $part}}{{end}}{{end}}{{end}}{{end}}' etcd) --cert $(docker inspect -f '{{range $index, $value := .Config.Env}}{{if eq (index (split $value "=") 0) "ETCDCTL_CERT" }}{{range $i, $part := (split $value "=")}}{{if gt $i 
1}}{{print "="}}{{end}}{{if gt $i 0}}{{print $part}}{{end}}{{end}}{{end}}{{end}}' etcd) --key $(docker inspect -f '{{range $index, $value := .Config.Env}}{{if eq (index (split $value "=") 0) "ETCDCTL_KEY" }}{{range $i, $part := (split $value "=")}}{{if gt $i 1}}{{print "="}}{{end}}{{if gt $i 0}}{{print $part}}{{end}}{{end}}{{end}}{{end}}' etcd) "${endpoint}/health" done ``` * 檢查 Port TCP/2380 的連接狀況 ``` $ for endpoint in $(docker exec etcd etcdctl member list | cut -d, -f4); do echo "Validating connection to ${endpoint}/version"; docker run --net=host -v $(docker inspect kubelet --format '{{ range .Mounts }}{{ if eq .Destination "/etc/kubernetes" }}{{ .Source }}{{ end }}{{ end }}')/ssl:/etc/kubernetes/ssl:ro appropriate/curl --http1.1 -s -w "\n" --cacert $(docker inspect -f '{{range $index, $value := .Config.Env}}{{if eq (index (split $value "=") 0) "ETCDCTL_CACERT" }}{{range $i, $part := (split $value "=")}}{{if gt $i 1}}{{print "="}}{{end}}{{if gt $i 0}}{{print $part}}{{end}}{{end}}{{end}}{{end}}' etcd) --cert $(docker inspect -f '{{range $index, $value := .Config.Env}}{{if eq (index (split $value "=") 0) "ETCDCTL_CERT" }}{{range $i, $part := (split $value "=")}}{{if gt $i 1}}{{print "="}}{{end}}{{if gt $i 0}}{{print $part}}{{end}}{{end}}{{end}}{{end}}' etcd) --key $(docker inspect -f '{{range $index, $value := .Config.Env}}{{if eq (index (split $value "=") 0) "ETCDCTL_KEY" }}{{range $i, $part := (split $value "=")}}{{if gt $i 1}}{{print "="}}{{end}}{{if gt $i 0}}{{print $part}}{{end}}{{end}}{{end}}{{end}}' etcd) "${endpoint}/version" done ``` * 不進入 container 直接使用 etcdctl 查詢,需先安裝 etcdctl 指令 ``` $ ETCDCTL_ENDPOINTS='https://127.0.0.1:2379,https://192.168.11.53:2379,https://192.168.11.54:2379' ETCDCTL_CACERT='/etc/kubernetes/ssl/kube-ca.pem' ETCDCTL_CERT='/etc/kubernetes/ssl/kube-etcd-192-168-11-102.pem' ETCDCTL_KEY='/etc/kubernetes/ssl/kube-etcd-192-168-11-102-key.pem' ETCDCTL_API=3 etcdctl endpoint status -w table 
+----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ | ENDPOINT | ID | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS | +----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ | https://127.0.0.1:2379 | 37f57fc09a11af9e | 3.5.10 | 39 MB | false | false | 30 | 5868573 | 5868573 | | | https://192.168.11.53:2379 | 42c538fc0ebfad32 | 3.5.10 | 38 MB | false | false | 30 | 5868574 | 5868574 | | | https://192.168.11.54:2379 | 84645341c9406c2f | 3.5.10 | 43 MB | true | false | 30 | 5868574 | 5868574 | | +----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+ ``` * etcd 告警 ``` $ docker exec etcd etcdctl alarm list ``` * 以下為 NOSPACE 告警內容 * 錯誤訊息有可能為 `etcdserver: mvcc: database space exceeded` 或 `applying raft message exceeded backend quota` ``` memberID:x alarm:NOSPACE memberID:x alarm:NOSPACE memberID:x alarm:NOSPACE ``` * 查看 etcd log 時可能出現以下資訊 ``` $ docker logs etcd ``` | Log | 解釋 | | --------------------------------------------------------------------------------------------- | -------- | | `health check for peer xxx could not connect: dial tcp IP:2380: getsockopt: connection refused` | 無法建立與連接埠 2380 上顯示的位址的連線。檢查 etcd 容器是否正在顯示位址的主機上執行。 | | `xxx is starting a new election at term x ` | etcd 集群失去了法定人數,正在嘗試建立新的領導者。當大多數運行 etcd 的節點發生故障/無法存取時,就會發生這種情況。 | | `connection error: desc = "transport: Error while dialing dial tcp 0.0.0.0:2379: i/o timeout"; Reconnecting to {0.0.0.0:2379 0 <nil>}` | 主機防火牆正在阻止網路通訊。 | | `rafthttp: request cluster ID mismatch` | 具有 etcd 實例日誌記錄 rafthttp: 請求群集 ID 不符的節點正在嘗試加入已經與另一個對等點形成的群集。應從叢集中刪除該節點,然後重新新增。 | | `rafthttp: failed to find member` |叢集狀態 (/var/lib/etcd) 包含加入叢集的錯誤訊息。應從叢集中刪除節點,清理狀態目錄並重新新增節點。 | # Troubleshooting Control 
Plane ``` $ sudo docker ps -a -f=name='kube-apiserver|kube-controller-manager|kube-scheduler' fb97614a6494 harbor.twlottery.com.internal/rancher/hyperkube:v1.25.6-rancher4 "/opt/rke-tools/entr…" 2 hours ago Up 2 hours kube-apiserver 015d1c138bd5 harbor.twlottery.com.internal/rancher/hyperkube:v1.25.6-rancher4 "/opt/rke-tools/entr…" 3 hours ago Up 2 hours kube-scheduler 9030194d2208 harbor.twlottery.com.internal/rancher/hyperkube:v1.25.6-rancher4 "/opt/rke-tools/entr…" 3 hours ago Up 2 hours kube-controller-manager ``` ``` $ sudo docker logs kube-apiserver $ sudo docker logs kube-controller-manager $ sudo docker logs kube-scheduler ``` # Troubleshooting NGINX Proxy * nginx-proxy 部署在每個不具 controlplane 角色的 node 上 > The nginx-proxy container is deployed on every node that does not have the controlplane role. > > It provides access to all the nodes with the controlplane role by dynamically generating the NGINX configuration based on available nodes with the controlplane role. > > To check that this is functioning correctly we can do the following. > • Check the container is running. > • Check the generated NGINX configuration. 
> • Check the nginx-proxy container logging # Troubleshooting Worker Nodes * 主要檢查 kubelet & kube-proxy ``` $ sudo docker ps -a -f=name=kubelet CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES c637c00d557f harbor.twlottery.com.internal/rancher/hyperkube:v1.25.6-rancher4 "/opt/rke-tools/entr…" 3 hours ago Up 3 hours kubelet $ sudo docker ps -a -f=name=kube-proxy CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 04715c577a54 harbor.cooloo9871.com/rancher/hyperkube:v1.26.8-rancher1 "/opt/rke-tools/entr…" 6 weeks ago Up 3 weeks kube-proxy ``` ``` $ sudo docker logs kubelet $ sudo docker logs kube-proxy ``` ## 收集 log ``` # 收集 kube-apiserver $ mkdir -p /tmp/rancher-logs-$HOSTNAME/kube-apiserver; $ cp -rf $(docker inspect $(docker ps -a | grep kube-apiserver | awk '{print $1}') -f '{{.LogPath}}')* /tmp/rancher-logs-$HOSTNAME/kube-apiserver # 收集 kube-proxy $ mkdir -p /tmp/rancher-logs-$HOSTNAME/kube-proxy; $ cp -rf $(docker inspect $(docker ps -a | grep kube-proxy | awk '{print $1}') -f '{{.LogPath}}')* /tmp/rancher-logs-$HOSTNAME/kube-proxy # 收集 kubelet $ mkdir -p /tmp/rancher-logs-$HOSTNAME/kubelet; $ cp -rf $(docker inspect $(docker ps -a | grep kubelet | awk '{print $1}') -f '{{.LogPath}}')* /tmp/rancher-logs-$HOSTNAME/kubelet ``` #### 連結 https://ranchermanager.docs.rancher.com/troubleshooting/kubernetes-components/troubleshooting-etcd-nodes