machine_cpu_cores
machine_cpu_cores{endpoint="https-metrics",instance="<node2_ip>:10250",job="kubelet",metrics_path="/metrics/cadvisor",namespace="kube-system",node="node2",service="po-prometheus-operator-kubelet"} 72
machine_cpu_cores{endpoint="https-metrics",instance="<node1_ip>:10250",job="kubelet",metrics_path="/metrics/cadvisor",namespace="kube-system",node="node1",service="po-prometheus-operator-kubelet"} 72
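These instant values come from the Prometheus console; the same query can also be issued against the Prometheus HTTP API, for example (assuming Prometheus is reachable at <prometheus_ip>:9090, a placeholder):
# curl -s -G 'http://<prometheus_ip>:9090/api/v1/query' --data-urlencode 'query=machine_cpu_cores'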
node_cpu_seconds_total
Listing only cpu 19, as follows:
node_cpu_seconds_total{cpu="19",endpoint="metrics",instance="<node2_ip>:9100",job="node-exporter",mode="idle",namespace="default",pod="po-prometheus-node-exporter-vhdst",service="po-prometheus-node-exporter"} 4011071.24
node_cpu_seconds_total{cpu="19",endpoint="metrics",instance="<node2_ip>:9100",job="node-exporter",mode="iowait",namespace="default",pod="po-prometheus-node-exporter-vhdst",service="po-prometheus-node-exporter"} 2676.3
node_cpu_seconds_total{cpu="19",endpoint="metrics",instance="<node2_ip>:9100",job="node-exporter",mode="irq",namespace="default",pod="po-prometheus-node-exporter-vhdst",service="po-prometheus-node-exporter"} 0
node_cpu_seconds_total{cpu="19",endpoint="metrics",instance="<node2_ip>:9100",job="node-exporter",mode="nice",namespace="default",pod="po-prometheus-node-exporter-vhdst",service="po-prometheus-node-exporter"} 0.95
node_cpu_seconds_total{cpu="19",endpoint="metrics",instance="<node2_ip>:9100",job="node-exporter",mode="softirq",namespace="default",pod="po-prometheus-node-exporter-vhdst",service="po-prometheus-node-exporter"} 98.03
node_cpu_seconds_total{cpu="19",endpoint="metrics",instance="<node2_ip>:9100",job="node-exporter",mode="steal",namespace="default",pod="po-prometheus-node-exporter-vhdst",service="po-prometheus-node-exporter"} 0
node_cpu_seconds_total{cpu="19",endpoint="metrics",instance="<node2_ip>:9100",job="node-exporter",mode="system",namespace="default",pod="po-prometheus-node-exporter-vhdst",service="po-prometheus-node-exporter"} 5134.41
node_cpu_seconds_total{cpu="19",endpoint="metrics",instance="<node2_ip>:9100",job="node-exporter",mode="user",namespace="default",pod="po-prometheus-node-exporter-vhdst",service="po-prometheus-node-exporter"} 31405.68
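node_cpu_seconds_total is a counter, so in practice it is wrapped in rate() to obtain per-mode CPU utilization. A sketch of such a query against the HTTP API (same <prometheus_ip> placeholder as above):
# curl -s -G 'http://<prometheus_ip>:9090/api/v1/query' \
    --data-urlencode 'query=rate(node_cpu_seconds_total{cpu="19",mode!="idle"}[5m])'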
The textfile collector is similar to the Pushgateway in that it allows exporting statistics from batch jobs. To use it, set the --collector.textfile.directory flag on the node exporter. The collector will parse all files in that directory matching the glob *.prom, using the text format. Note: timestamps are not supported.
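For instance, a minimal file in that format could look like the following; the metric name, labels, and value here are purely illustrative:
# cat /home/node_exporter/textfile_collector/example.prom
# HELP batch_job_last_success_unixtime Last time the batch job succeeded.
# TYPE batch_job_last_success_unixtime gauge
batch_job_last_success_unixtime 1587900000
There are two node exporters in my Kubernetes cluster.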
# kubectl get pod | grep node-exporter
po-prometheus-node-exporter-p7m8w 1/1 Running 0 3d19h
po-prometheus-node-exporter-vhdst 1/1 Running 0 3d19h
Before beginning, we should edit the DaemonSet of the node-exporter, po-prometheus-node-exporter. Note that after editing, the DaemonSet will restart all the node-exporter pods.
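You can watch the rollout complete before continuing:
# kubectl rollout status daemonset po-prometheus-node-exporter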
# kubectl describe daemonsets.apps po-prometheus-node-exporter
Add the directory location with the --collector.textfile.directory argument:
Containers:
node-exporter:
Args:
--collector.textfile.directory=/home/node_exporter/textfile_collector
/* omitted */
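Instead of editing the DaemonSet interactively, the same change can be scripted with kubectl patch. This sketch assumes node-exporter is the first container in the pod template and already has an args list:
# kubectl patch daemonset po-prometheus-node-exporter --type=json \
  -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--collector.textfile.directory=/home/node_exporter/textfile_collector"}]'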
[Notice] Do not wrap the value in quotation marks, like the following:
❌ --collector.textfile.directory="/home/node_exporter/textfile_collector"
Otherwise you will get this error message in the pod logs:
time="2020-04-26T11:17:42Z" level=error msg="Error reading textfile collector directory \"\\\"/home/node_exporter/textfile_collector\\\"\": open \"/home/
node_exporter/textfile_collector\": no such file or directory" source="textfile.go:192"
Go into the node-exporter container of each pod:
# kubectl exec -it <node_exporter_pod_name> sh
Create a script:
# vi /home/numa.sh
#!/bin/bash

# Dir Location
TEXTFILE_COLLECTOR_DIR=/home/node_exporter/textfile_collector
#TEXTFILE_COLLECTOR_DIR=/home
SYS_FS_CGROUPS_CPUSET_DIR=/sys/fs/cgroup/cpuset
NODE_NUMA_LSCPU_FILENAME=node_numa.prom
# Variables
SOCKETS_NUM=$(cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l)
PROCESSOR_NUM=$(cat /proc/cpuinfo | grep "processor" | sort -u | wc -l)
#PROCESSOR_NUM=$(egrep -e "processor" /proc/cpuinfo | wc -l)
ON_LINE_CPUS_LIST=$(cat $SYS_FS_CGROUPS_CPUSET_DIR/cpuset.effective_cpus)
ON_LINE_MEMS_LIST=$(cat $SYS_FS_CGROUPS_CPUSET_DIR/cpuset.effective_mems)
SIBLINGS_NUM=$(cat /proc/cpuinfo | grep "siblings" | sort -u | cut -d" " -f 2)
CPU_CORES_NUM=$(cat /proc/cpuinfo | grep "cores" | sort -u | cut -d" " -f 3)
THREADS_PER_CORE=$((SIBLINGS_NUM / CPU_CORES_NUM))
CORES_PER_SOCKET=$CPU_CORES_NUM
NUMA_NUM=$(find /sys/devices/system/node/node* -name "node*" | wc -l)
init()
{
    if [ ! -d "$TEXTFILE_COLLECTOR_DIR" ]; then
        mkdir -p "$TEXTFILE_COLLECTOR_DIR"
    fi
    if [ -f "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME" ]; then
        rm "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME"
    fi
}
numa_node_cpu_mapping()
{
    numa_num_index=0
    while [[ $numa_num_index -lt $NUMA_NUM ]]; do
        printf "node$numa_num_index " >> temp
        start_interval_core_num=""
        index_interval_core_num=""
        end_interval_core_num=""
        for cpu_num in $(ls -1 /sys/devices/system/node/node$numa_num_index | sort -n -k1.4 | grep "^cpu"); do
            # Keep only entries of the form cpuN (skip cpumap, cpulist, ...).
            if [[ -n $(echo "${cpu_num##*/}" | tr -d '[A-Za-z]') ]]; then
                # Track contiguous runs of cpu numbers, e.g. 0 1 2 ... -> "0-2"
                if [[ -z $start_interval_core_num ]] && [[ -z $index_interval_core_num ]] && [[ -z $end_interval_core_num ]]; then
                    index_interval_core_num=$(echo $cpu_num | tr -d '[A-Za-z]')
                    start_interval_core_num=$index_interval_core_num
                    end_interval_core_num=$index_interval_core_num
                elif [[ $((index_interval_core_num + 1)) -eq $(echo $cpu_num | tr -d '[A-Za-z]') ]]; then
                    index_interval_core_num=$(echo $cpu_num | tr -d '[A-Za-z]')
                    end_interval_core_num=$index_interval_core_num
                fi
                # Reached the end of a contiguous interval; write it out.
                if [[ ! -d /sys/devices/system/node/node$numa_num_index/cpu$((index_interval_core_num + 1)) ]]; then
                    printf "$start_interval_core_num"-"$end_interval_core_num", >> temp
                    start_interval_core_num=""
                    index_interval_core_num=""
                    end_interval_core_num=""
                fi
            fi
        done
        echo >> temp
        # Strip the trailing comma from the line just written.
        sed -i '$ s/,$//' temp
        #
        # Write out metrics to a temporary file.
        #
        printf "node_numa_cpus_per_node_total{numa_node=\"node$numa_num_index\",cpus_num=\"$(sed "$((numa_num_index + 1))q;d" temp | cut -d' ' -f2)\"}" \
            >> "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$"
        printf " $(ls /sys/devices/system/node/node$numa_num_index | grep -Ec "^cpu[0-9]+")\n" \
            >> "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$"
        numa_num_index=$((numa_num_index + 1))
    done
    # numa_num_index=0
    # while IFS= read -r line; do
    #     echo "node_numa_cpus_per_node_total{numa_node=\"$(echo $line | cut -d' ' -f1)\",cpus_num=\"$(echo $line | cut -d' ' -f2)\"} 0"
    # done < temp
    rm temp
}
count_on_line_cpus_total()
{
    result=0
    while IFS= read -r line; do
        line=${line//[[:blank:]]/}
        # Count the commas to get the number of "a-b" intervals in the list.
        num="${line//[^,]}"
        interval_num=$(( ${#num} + 1 ))
        for i in $(seq 1 $interval_num); do
            interval_string=$(echo $line | cut -d"," -f $i)
            result=$((result + $(echo $interval_string | cut -d"-" -f 2) - $(echo $interval_string | cut -d"-" -f 1) + 1))
        done
    done < /sys/fs/cgroup/cpuset/cpuset.effective_cpus
    echo "node_numa_on_line_cpus_total{on_line_cpus_list=\"$ON_LINE_CPUS_LIST\"} $result" \
        >> "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$"
}
#
# NUMA Total Information as Label
#
init
echo "node_numa_total{\
cpus=\"$PROCESSOR_NUM\",\
on_line_cpus_list=\"$ON_LINE_CPUS_LIST\",\
on_line_mems_list=\"$ON_LINE_MEMS_LIST\",\
threads_per_core=\"$THREADS_PER_CORE\",\
cores_per_socket=\"$CORES_PER_SOCKET\",\
sockets=\"$SOCKETS_NUM\"} $NUMA_NUM" \
>> "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$"
count_on_line_cpus_total
echo "node_numa_threads_per_core{siblings_total=\"$SIBLINGS_NUM\",cpu_cores=\"$CPU_CORES_NUM\"} $THREADS_PER_CORE" \
>> "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$"
echo "node_numa_cores_per_socket{cpu_cores=\"$CPU_CORES_NUM\"} $CORES_PER_SOCKET" >> "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$"
echo "node_numa_sockets_total{} $SOCKETS_NUM" >> "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$"
numa_node_cpu_mapping
# Write out metrics to a temporary file.
#i=0
#while [ $i -lt $((PROCESSOR_NUM)) ]
#do
# echo "node_numa_core_numuber{cpu=\"$i\"} $i" \
# >> $TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$
# i=`expr $i + 1`
# if [ $i -eq $((PROCESSOR_NUM)) ]
# then
# break
# fi
#done
mv "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME.$$" "$TEXTFILE_COLLECTOR_DIR/$NODE_NUMA_LSCPU_FILENAME"
Each metric or label is derived from the following source on the node:

| Metric / Label | Source |
| --- | --- |
| on_line_mems_list | /sys/fs/cgroup/cpuset/cpuset.effective_mems |
| node_numa_on_line_cpus_total | /sys/fs/cgroup/cpuset/cpuset.effective_cpus |
| node_numa_threads_per_core | /proc/cpuinfo |
| node_numa_cores_per_socket | /proc/cpuinfo |
| node_numa_sockets_total | /proc/cpuinfo |
| node_numa_cpus_per_node_total | /sys/devices/system/node/node<number> |
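These sources can be inspected directly on the node; on this 72-CPU machine they look like this:
# cat /sys/fs/cgroup/cpuset/cpuset.effective_cpus
0-71
# cat /sys/fs/cgroup/cpuset/cpuset.effective_mems
0-1
# ls -d /sys/devices/system/node/node*
/sys/devices/system/node/node0  /sys/devices/system/node/node1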
Then we get the following node_numa.prom:
node_numa_total{cpus="72",on_line_cpus_list="0-71",on_line_mems_list="0-1",threads_per_core="2",cores_per_socket="18",sockets="2"} 2
node_numa_on_line_cpus_total{on_line_cpus_list="0-71"} 72
node_numa_threads_per_core{siblings_total="36",cpu_cores="18"} 2
node_numa_cores_per_socket{cpu_cores="18"} 18
node_numa_sockets_total 2
node_numa_cpus_per_node_total{numa_node="node0",cpus_num="0-17,36-53"} 36
node_numa_cpus_per_node_total{numa_node="node1",cpus_num="18-35,54-71"} 36
In the node-exporter container of the pod, run ./bin/node_exporter --help to see all available flags. Here we use --collector.textfile.directory to expose our own metrics. After executing the command we can ignore the final error message; the DaemonSet's exporter already holds port 9100, so this run only confirms that the flag is accepted.
$ ./bin/node_exporter --collector.textfile.directory=/home/node_exporter/textfile_collector
INFO[0000] Starting node_exporter (version=0.18.1, branch=HEAD, revision=3db77732e925c08f675d7404a8c46466b2ece83e) source="node_exporter.go:156"
INFO[0000] Build context (go=go1.12.5, user=root@b50852a1acba, date=20190604-16:41:18) source="node_exporter.go:157"
INFO[0000] Enabled collectors: source="node_exporter.go:97"
INFO[0000] - arp source="node_exporter.go:104"
INFO[0000] - bcache source="node_exporter.go:104"
INFO[0000] - bonding source="node_exporter.go:104"
INFO[0000] - conntrack source="node_exporter.go:104"
INFO[0000] - cpu source="node_exporter.go:104"
INFO[0000] - cpufreq source="node_exporter.go:104"
INFO[0000] - diskstats source="node_exporter.go:104"
INFO[0000] - edac source="node_exporter.go:104"
INFO[0000] - entropy source="node_exporter.go:104"
INFO[0000] - filefd source="node_exporter.go:104"
INFO[0000] - filesystem source="node_exporter.go:104"
INFO[0000] - hwmon source="node_exporter.go:104"
INFO[0000] - infiniband source="node_exporter.go:104"
INFO[0000] - ipvs source="node_exporter.go:104"
INFO[0000] - loadavg source="node_exporter.go:104"
INFO[0000] - mdadm source="node_exporter.go:104"
INFO[0000] - meminfo source="node_exporter.go:104"
INFO[0000] - netclass source="node_exporter.go:104"
INFO[0000] - netdev source="node_exporter.go:104"
INFO[0000] - netstat source="node_exporter.go:104"
INFO[0000] - nfs source="node_exporter.go:104"
INFO[0000] - nfsd source="node_exporter.go:104"
INFO[0000] - pressure source="node_exporter.go:104"
INFO[0000] - sockstat source="node_exporter.go:104"
INFO[0000] - stat source="node_exporter.go:104"
INFO[0000] - textfile source="node_exporter.go:104"
INFO[0000] - time source="node_exporter.go:104"
INFO[0000] - timex source="node_exporter.go:104"
INFO[0000] - uname source="node_exporter.go:104"
INFO[0000] - vmstat source="node_exporter.go:104"
INFO[0000] - xfs source="node_exporter.go:104"
INFO[0000] - zfs source="node_exporter.go:104"
INFO[0000] Listening on :9100 source="node_exporter.go:170"
FATA[0000] listen tcp :9100: bind: address already in use source="node_exporter.go:172"
Now curl a node exporter to get our own NUMA metrics:
# curl -s localhost:9100/metrics | grep node_numa
# HELP node_numa_cores_per_socket Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# TYPE node_numa_cores_per_socket untyped
node_numa_cores_per_socket{cpu_cores="18"} 18
# HELP node_numa_cpus_per_node_total Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# TYPE node_numa_cpus_per_node_total untyped
node_numa_cpus_per_node_total{cpus_num="0-17,36-53",numa_node="node0"} 36
node_numa_cpus_per_node_total{cpus_num="18-35,54-71",numa_node="node1"} 36
# HELP node_numa_on_line_cpus_total Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# TYPE node_numa_on_line_cpus_total untyped
node_numa_on_line_cpus_total{on_line_cpus_list="0-71"} 72
# HELP node_numa_sockets_total Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# TYPE node_numa_sockets_total untyped
node_numa_sockets_total 2
# HELP node_numa_threads_per_core Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# TYPE node_numa_threads_per_core untyped
node_numa_threads_per_core{cpu_cores="18",siblings_total="36"} 2
# HELP node_numa_total Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# TYPE node_numa_total untyped
node_numa_total{cores_per_socket="18",cpus="72",on_line_cpus_list="0-71",on_line_mems_list="0-1",sockets="2",threads_per_core="2"} 2
node_textfile_mtime_seconds{file="node_numa.prom"} 1.588160907e+09
We can also curl the other node exporter.
# curl -s <node2_ip>:9100/metrics | grep node_numa
Finally, curl the metrics to check the textfile collector's own status:
# curl -s localhost:9100/metrics | grep textfile
# HELP node_numa_cores_per_socket Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# HELP node_numa_cpus_per_node_total Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# HELP node_numa_on_line_cpus_total Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# HELP node_numa_sockets_total Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# HELP node_numa_threads_per_core Metric read from /home/node_exporter/textfile_collector/node_numa.prom
# HELP node_numa_total Metric read from /home/node_exporter/textfile_collector/node_numa.prom
node_scrape_collector_duration_seconds{collector="textfile"} 0.00225249
node_scrape_collector_success{collector="textfile"} 1
# HELP node_textfile_mtime_seconds Unixtime mtime of textfiles successfully read.
# TYPE node_textfile_mtime_seconds gauge
node_textfile_mtime_seconds{file="node_numa.prom"} 1.588160907e+09
# HELP node_textfile_scrape_error 1 if there was an error opening or reading a file, 0 otherwise
# TYPE node_textfile_scrape_error gauge
node_textfile_scrape_error 0
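Since node_numa.prom is only as fresh as the last run of the script, it is worth checking for staleness. A query like the following (via the HTTP API; the 300-second threshold is an arbitrary choice) returns a series only when the file has not been rewritten recently:
# curl -s -G 'http://<prometheus_ip>:9090/api/v1/query' \
    --data-urlencode 'query=time() - node_textfile_mtime_seconds{file="node_numa.prom"} > 300'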