owned this note
owned this note
Published
Linked with GitHub
# Grafana Cluster Monitoring for Kubernetes - Customer edit
[prometheus & prometheus Adapter ArgoCD Application Setting](https://hackmd.io/scJpKQkbRLm6-czFs-ujqw)
[org](https://grafana.com/grafana/dashboards/10000)
[ref](https://grafana.com/grafana/dashboards/6336)
```json=
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "This dashboard provides cluster admins with the ability to monitor nodes and identify workload bottlenecks. It can be deployed with PSPs enabled using the following helm chart - https://github.com/pivotal-cf/charts-grafana",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 10000,
"graphTooltip": 0,
"id": 11,
"iteration": 1658049306578,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 34,
"panels": [],
"title": "Summary",
"type": "row"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)"
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 65
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 0,
"y": 1
},
"id": 4,
"links": [],
"maxDataPoints": 100,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (container_memory_working_set_bytes{image!=\"\",pod!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Cluster memory usage",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)"
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 65
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 8,
"y": 1
},
"id": 6,
"links": [],
"maxDataPoints": 100,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[$interval])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 10
}
],
"title": "Cluster CPU usage ($interval avg)",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "rgba(50, 172, 45, 0.97)"
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 65
},
{
"color": "rgba(245, 54, 54, 0.9)",
"value": 90
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 8,
"x": 16,
"y": 1
},
"id": 7,
"links": [],
"maxDataPoints": 100,
"options": {
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (container_fs_usage_bytes{id=\"/\"}) / sum (container_fs_limit_bytes{id=\"/\"}) * 100",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"metric": "",
"refId": "A",
"step": 10
}
],
"title": "Cluster filesystem usage",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 0,
"y": 6
},
"id": 9,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (container_memory_working_set_bytes{image!=\"\",pod!=\"\",kubernetes_io_hostname=~\"^$Node$\"})",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Used",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 4,
"y": 6
},
"id": 10,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Total",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 8,
"y": 6
},
"id": 11,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[$interval]))",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 10
}
],
"title": "Used",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 12,
"y": 6
},
"id": 12,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"refId": "A",
"step": 10
}
],
"title": "Total",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 16,
"y": 6
},
"id": 13,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (container_fs_usage_bytes{id=\"/\"})",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 10
}
],
"title": "Used",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [
{
"options": {
"match": "null",
"result": {
"text": "N/A"
}
},
"type": "special"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "red",
"value": 80
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 4,
"x": 20,
"y": 6
},
"id": 14,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "none",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "8.3.1",
"targets": [
{
"expr": "sum (container_fs_limit_bytes{id=\"/\"})",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A",
"step": 10
}
],
"title": "Total",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 35,
"panels": [],
"title": "Memory",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"decimals": 2,
"editable": true,
"error": false,
"fill": 0,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 10
},
"hiddenSeries": false,
"id": 25,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"hideEmpty": false,
"max": false,
"min": false,
"show": true,
"sideWidth": 200,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(container_memory_working_set_bytes{image!=\"\",pod!=\"\",kubernetes_io_hostname=~\"^$Node$\"}) by (pod)",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "{{ pod }}",
"metric": "container_memory_usage:sort_desc",
"refId": "A",
"step": 10
}
],
"thresholds": [],
"timeRegions": [],
"title": "Pods memory usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"logBase": 1,
"show": true
},
{
"format": "short",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 17
},
"id": 37,
"panels": [],
"title": "CPU",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"decimals": 3,
"editable": true,
"error": false,
"fill": 0,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 18
},
"height": "",
"hiddenSeries": false,
"id": 17,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",pod!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[$interval])) by (pod)",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "{{ pod }}",
"metric": "container_cpu",
"refId": "A",
"step": 10
}
],
"thresholds": [],
"timeRegions": [],
"title": "Pods CPU usage ($interval avg)",
"tooltip": {
"msResolution": true,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": "cores",
"logBase": 1,
"show": true
},
{
"format": "short",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 25
},
"id": 33,
"panels": [],
"title": "Network I/O",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 26
},
"height": "200px",
"hiddenSeries": false,
"id": 32,
"legend": {
"alignAsTable": false,
"avg": true,
"current": true,
"max": false,
"min": false,
"rightSide": false,
"show": false,
"sideWidth": 200,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[$interval]))",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Received",
"metric": "network",
"refId": "A",
"step": 10
},
{
"expr": "- sum (rate (container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\"}[$interval]))",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "Sent",
"metric": "network",
"refId": "B",
"step": 10
}
],
"thresholds": [],
"timeRegions": [],
"title": "Network I/O pressure",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"logBase": 1,
"show": true
},
{
"format": "Bps",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 31
},
"hiddenSeries": false,
"id": 16,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": false,
"min": false,
"show": true,
"sideWidth": 200,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "8.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",pod!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[$interval])) by (pod)",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "-> {{ pod }}",
"metric": "network",
"refId": "A",
"step": 10
},
{
"expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",pod!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[$interval])) by (pod)",
"format": "time_series",
"interval": "10s",
"intervalFactor": 1,
"legendFormat": "<- {{ pod }}",
"metric": "network",
"refId": "B",
"step": 10
}
],
"thresholds": [],
"timeRegions": [],
"title": "Pods network I/O ($interval avg)",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"mode": "time",
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"logBase": 1,
"show": true
},
{
"format": "short",
"logBase": 1,
"show": false
}
],
"yaxis": {
"align": false
}
}
],
"refresh": "10s",
"schemaVersion": 33,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"auto": true,
"auto_count": 20,
"auto_min": "2m",
"current": {
"selected": false,
"text": "auto",
"value": "$__auto_interval_interval"
},
"hide": 2,
"name": "interval",
"options": [
{
"selected": true,
"text": "auto",
"value": "$__auto_interval_interval"
},
{
"selected": false,
"text": "1m",
"value": "1m"
},
{
"selected": false,
"text": "10m",
"value": "10m"
},
{
"selected": false,
"text": "30m",
"value": "30m"
},
{
"selected": false,
"text": "1h",
"value": "1h"
},
{
"selected": false,
"text": "6h",
"value": "6h"
},
{
"selected": false,
"text": "12h",
"value": "12h"
},
{
"selected": false,
"text": "1d",
"value": "1d"
},
{
"selected": false,
"text": "7d",
"value": "7d"
},
{
"selected": false,
"text": "14d",
"value": "14d"
},
{
"selected": false,
"text": "30d",
"value": "30d"
}
],
"query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
"refresh": 2,
"skipUrlSync": false,
"type": "interval"
},
{
"current": {
"selected": false,
"text": "default",
"value": "default"
},
"hide": 0,
"includeAll": false,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "${datasource}"
},
"definition": "",
"hide": 0,
"includeAll": true,
"multi": false,
"name": "Node",
"options": [],
"query": {
"query": "label_values(kubernetes_io_hostname)",
"refId": "Prometheus-Node-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Cluster Monitoring for Kubernetes Copy",
"uid": "5RDCdNgVz",
"version": 24,
"weekStart": ""
}
```