#main.tf
# Look up the existing Prometheus data source by name
data "grafana_data_source" "prometheus" {
# Must match the data source name exactly as configured in Grafana;
# the looked-up uid is referenced by the Prometheus alert rules below.
name = "Prometheus"
}
# Likewise, look up the Loki data source by name
data "grafana_data_source" "loki" {
# NOTE(review): lower-case "loki" while the other data source uses
# capitalized "Prometheus" - confirm this matches the actual data source
# name registered in Grafana, otherwise this lookup fails at plan time.
name = "loki"
}
# Create contact point
# Contact point delivering metric (Prometheus) alerts to a Microsoft Teams
# channel via an incoming webhook.
resource "grafana_contact_point" "base_alert" {
  name = "Base Alert via Microsoft Team"

  teams {
    disable_resolve_message = true
    url                     = var.webhook_team
    title                   = "Prod InFlow24 Metric Error Alert !!!"
    # Render every firing alert through the shared message template defined
    # in grafana_message_template.base_alert.
    message = <<EOT
{{ range .Alerts.Firing }}
Alert summaries:
{{ template "Alert Instance Template" . }}
{{ end }}
EOT
  }
}
# Contact point delivering log (Loki) alerts to the same Microsoft Teams
# webhook, with a log-specific title and message template.
resource "grafana_contact_point" "log_alert" {
  name = "Log Alert via Microsoft Team"

  teams {
    url                     = var.webhook_team
    title                   = "Prod InFlow24 Log Error Alert !!!"
    disable_resolve_message = true
    # Delegates rendering to the template defined in
    # grafana_message_template.log_alert.
    message = <<EOT
{{ template "ErrorLogMessage" . }}
EOT
  }
}
# Alert template Grafana will send to webhook
resource "grafana_message_template" "base_alert" {
name = "Prometheus Alert Template"
# Go template invoked by the base_alert contact point ("Alert Instance
# Template"); prints each annotation of one alert instance as "- name = value".
template = <<EOT
{{ define "Alert Instance Template" }}
{{ range .Annotations.SortedPairs }}
- {{ .Name }} = {{ .Value }}
{{ end }}
{{ end}}
EOT
}
resource "grafana_message_template" "log_alert" {
name = "Loki Alert Template"
# Go template invoked by the log_alert contact point ("ErrorLogMessage").
# When at least one alert is firing it prints each alert's AlertValues
# annotation, then a Grafana explore link taken from the GrafanaLink
# annotation (both annotations are set on the Loki rule below).
template = <<EOT
{{ define "ErrorLogMessage" }}
{{ if gt (len .Alerts.Firing) 0 }}
{{ range .Alerts.Firing }}
{{ .Annotations.AlertValues }}
{{ end }}
{{ range .Alerts.Firing }}
View URL: [Link to grafana]({{ .Annotations.GrafanaLink }})
{{ end }}
{{ end }}
{{ end }}
EOT
}
# Notification policy
# Routing tree: the root defaults to the base contact point; the nested
# policies route on the "datasource" label to the matching contact point.
resource "grafana_notification_policy" "notification_policy" {
  contact_point   = grafana_contact_point.base_alert.name
  group_by        = ["..."]
  group_wait      = "45s"
  group_interval  = "4m"
  repeat_interval = "1h"

  # Metric alerts (label datasource=Prometheus) -> base_alert contact point.
  policy {
    matcher {
      label = "datasource"
      match = "="
      value = "Prometheus"
    }
    contact_point   = grafana_contact_point.base_alert.name
    group_by        = ["alertname"]
    continue        = true
    group_wait      = "45s"
    group_interval  = "4m"
    repeat_interval = "1h"
  }

  # Log alerts (label datasource=Loki) -> log_alert contact point.
  policy {
    matcher {
      label = "datasource"
      match = "="
      value = "Loki"
    }
    contact_point   = grafana_contact_point.log_alert.name
    group_by        = ["alertname"]
    continue        = true
    group_wait      = "45s"
    group_interval  = "4m"
    repeat_interval = "1h"
  }
}
# Alert rules must be provisioned into a folder (required by Grafana)
# Folder holding the Prometheus rule group; its uid is referenced below.
resource "grafana_folder" "prometheus_alert" {
title = "Prometheus Alert Provisioning by Terraform"
}
# Folder holding the Loki rule group; its uid is referenced below.
resource "grafana_folder" "loki_alert" {
title = "Loki Alert Provisioning by Terraform"
}
# Create the alert rule groups
# Rule group evaluated every 60s. Each rule pairs a Prometheus query (ref A)
# with a server-side "reduce" expression (ref B, the alert condition): the
# rule fires when the reduced (last) value of A is non-zero.
# NOTE(review): the "conditions" payload inside each B model is the legacy
# classic-condition format and appears unused for a "reduce"-type expression
# - confirm against the Grafana version in use.
resource "grafana_rule_group" "prometheus_alert" {
name = "Prometheus Alert Rule Group"
folder_uid = grafana_folder.prometheus_alert.uid
interval_seconds = 60
org_id = 1
# Fires when container CPU usage stays above 85% of its limit for 2 minutes.
rule {
name = "Pod High CPU Resource"
for = "2m"
condition = "B"
no_data_state = "OK"
exec_err_state = "Alerting"
is_paused = false
annotations = {
"Pod" = "{{ $labels.pod }}"
"Summary" = "Pod have high resource than permit for 2 minutes - (CPU > 85%)"
}
labels = {
"datasource" = "Prometheus"
}
data {
ref_id = "A"
query_type = ""
relative_time_range {
from = 600
to = 0
}
datasource_uid = data.grafana_data_source.prometheus.uid
model = jsonencode({
editorMode = "code"
datasource = {
type = "prometheus",
uid = "${data.grafana_data_source.prometheus.uid}"
}
expr = "(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{cluster=~\".*\", namespace=~\".*\", pod=~\".*\"}) by (container,pod,namespace) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{cluster=~\".*\", namespace=~\".*\", pod=~\".*\"}) by (container,pod,namespace)) \u003e= 0.85"
intervalMs = 1000
legendFormat = "__auto"
maxDataPoints = 43200
range = true
hide = false
refId = "A"
})
}
data {
ref_id = "B"
query_type = ""
relative_time_range {
from = 0
to = 0
}
datasource_uid = "-100" # Expression query type ("-100" is the legacy expression uid; newer Grafana also accepts "__expr__")
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
EOT
}
}
# Fires when container memory usage stays above 95% of its limit for 2 minutes.
rule {
name = "Pod High Memory Resource"
for = "2m"
no_data_state = "OK"
exec_err_state = "Alerting"
condition = "B"
annotations = {
"Pod" = "{{ $labels.pod }}"
"Summary" = "Pod have high resource than permit for 2 minutes - (Memory > 95%)"
}
labels = {
"datasource" = "Prometheus"
}
is_paused = false
data {
ref_id = "A"
query_type = ""
relative_time_range {
from = 600
to = 0
}
datasource_uid = data.grafana_data_source.prometheus.uid
model = jsonencode({
editorMode = "code"
datasource = {
type = "prometheus",
uid = "${data.grafana_data_source.prometheus.uid}"
}
expr = "(sum(container_memory_working_set_bytes{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\", cluster=~\".*\", namespace=~\".*\", pod=~\".*\", container!=\"\", image!=\"\"}) by (container,pod,namespace) / sum(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{cluster=~\".*\", namespace=~\".*\", pod=~\".*\"}) by (container,pod,namespace)) \u003e= 0.95"
intervalMs = 1000
legendFormat = "__auto"
maxDataPoints = 43200
range = true
hide = false
refId = "A"
})
}
data {
ref_id = "B"
query_type = ""
relative_time_range {
from = 0
to = 0
}
datasource_uid = "-100"
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
EOT
}
}
# Fires when a non-Job-owned pod sits in Pending/Unknown/Failed for 5 minutes.
rule {
name = "Pod Not Ready"
for = "5m"
condition = "B"
annotations = {
"Pod" = "{{ $labels.pod }}"
"Summary" = "Pod has been in a non-ready state for more than 5 minutes"
}
labels = {
"datasource" = "Prometheus"
}
no_data_state = "OK"
exec_err_state = "Alerting"
data {
ref_id = "A"
datasource_uid = data.grafana_data_source.prometheus.uid
relative_time_range {
from = 600
to = 0
}
query_type = ""
model = jsonencode({
editorMode = "code"
datasource = {
type = "prometheus",
uid = "${data.grafana_data_source.prometheus.uid}"
}
# FIX: namespace selector previously used exact match (namespace=".*"),
# which only matches the literal string ".*" and never fired; use regex
# match (=~) like every other rule in this group.
expr = "sum by (namespace, pod, cluster) (max by(namespace, pod, cluster) (kube_pod_status_phase{job=\"kube-state-metrics\", namespace=~\".*\", phase=~\"Pending|Unknown|Failed\"}) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!=\"Job\"}))) > 0"
intervalMs = 1000
legendFormat = "__auto"
maxDataPoints = 43200
range = true
hide = false
refId = "A"
})
}
data {
ref_id = "B"
datasource_uid = "-100"
relative_time_range {
from = 0
to = 0
}
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
EOT
}
}
# Fires when a Job in the "infrastructure" namespace reports failed runs.
rule {
name = "Job Failed"
for = "1h"
condition = "B"
annotations = {
"Job Name" = "{{ $labels.job_name }}"
"Summary" = "Job failed to complete"
}
labels = {
"datasource" = "Prometheus"
}
no_data_state = "OK"
exec_err_state = "Alerting"
data {
ref_id = "A"
datasource_uid = data.grafana_data_source.prometheus.uid
relative_time_range {
from = 3600
to = 0
}
query_type = ""
model = jsonencode({
editorMode = "code"
datasource = {
type = "prometheus",
uid = "${data.grafana_data_source.prometheus.uid}"
}
expr = "kube_job_status_failed{job=\"kube-state-metrics\",namespace=\"infrastructure\"} > 0"
intervalMs = 1000
legendFormat = "__auto"
maxDataPoints = 43200
range = true
hide = false
refId = "A"
})
}
data {
ref_id = "B"
datasource_uid = "-100"
relative_time_range {
from = 0
to = 0
}
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
EOT
}
}
# Fires immediately when a container was terminated with exit code 137 (SIGKILL/OOMKilled).
rule {
name = "OOMKilled"
for = "0s"
condition = "B"
annotations = {
"Pod" = "{{ $labels.pod }}"
"Summary" = "Immediate termination (SIGKILL) OOMKilled during 10Min before"
}
labels = {
"datasource" = "Prometheus"
}
no_data_state = "OK"
exec_err_state = "Alerting"
data {
ref_id = "A"
datasource_uid = data.grafana_data_source.prometheus.uid
relative_time_range {
from = 600
to = 0
}
query_type = ""
model = jsonencode({
editorMode = "code"
datasource = {
type = "prometheus",
uid = "${data.grafana_data_source.prometheus.uid}"
}
expr = "(kube_pod_container_status_terminated{namespace=~\".*\"} > 0) and (kube_pod_container_status_last_terminated_exitcode{namespace=~\".*\"} == 137)"
intervalMs = 1000
legendFormat = "__auto"
maxDataPoints = 43200
range = true
hide = false
refId = "A"
})
}
data {
ref_id = "B"
datasource_uid = "-100"
relative_time_range {
from = 0
to = 0
}
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
EOT
}
}
# Fires immediately when a waiting container last terminated with exit code 134 (SIGABRT).
rule {
name = "Error Termination"
for = "0s"
condition = "B"
annotations = {
"Pod" = "{{ $labels.pod }}"
"Summary" = "Abnormal termination (SIGABRT) during 10Min before"
}
labels = {
"datasource" = "Prometheus"
}
no_data_state = "OK"
exec_err_state = "Alerting"
data {
ref_id = "A"
datasource_uid = data.grafana_data_source.prometheus.uid
relative_time_range {
from = 600
to = 0
}
query_type = ""
model = jsonencode({
editorMode = "code"
datasource = {
type = "prometheus",
uid = "${data.grafana_data_source.prometheus.uid}"
}
expr = "(kube_pod_container_status_waiting{namespace=~\".*\"} > 0) and (kube_pod_container_status_last_terminated_exitcode{namespace=~\".*\"} == 134)"
intervalMs = 1000
legendFormat = "__auto"
maxDataPoints = 43200
range = true
hide = false
refId = "A"
})
}
data {
ref_id = "B"
datasource_uid = "-100"
relative_time_range {
from = 0
to = 0
}
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
EOT
}
}
# Fires immediately when a container is waiting because its image cannot be pulled.
rule {
name = "ErrPullImages"
for = "0s"
condition = "B"
annotations = {
"Pod" = "{{ $labels.pod }}"
"Summary" = "Cannot pull images during 10Min before"
}
labels = {
"datasource" = "Prometheus"
}
no_data_state = "OK"
exec_err_state = "Alerting"
data {
ref_id = "A"
datasource_uid = data.grafana_data_source.prometheus.uid
relative_time_range {
from = 600
to = 0
}
query_type = ""
model = jsonencode({
editorMode = "code"
datasource = {
type = "prometheus",
uid = "${data.grafana_data_source.prometheus.uid}"
}
# FIX: this expr was a copy-paste of the "Error Termination" query
# (exit code 134) and never detected image-pull failures; match the
# kube-state-metrics waiting reason for pull errors instead.
expr = "kube_pod_container_status_waiting_reason{reason=~\"ErrImagePull|ImagePullBackOff\", namespace=~\".*\"} > 0"
intervalMs = 1000
legendFormat = "__auto"
maxDataPoints = 43200
range = true
hide = false
refId = "A"
})
}
data {
ref_id = "B"
datasource_uid = "-100"
relative_time_range {
from = 0
to = 0
}
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
EOT
}
}
# Fires when any container restarted within the last 15 minutes.
rule {
name = "ServiceRestarted"
for = "0s"
condition = "B"
annotations = {
"Pod" = "{{ $labels.pod }}"
"Summary" = "Service restarted during 15m ago"
}
labels = {
"datasource" = "Prometheus"
}
no_data_state = "OK"
exec_err_state = "Alerting"
data {
ref_id = "A"
datasource_uid = data.grafana_data_source.prometheus.uid
relative_time_range {
from = 900
to = 0
}
query_type = ""
model = jsonencode({
editorMode = "code"
datasource = {
type = "prometheus",
uid = "${data.grafana_data_source.prometheus.uid}"
}
expr = "increase(kube_pod_container_status_restarts_total[15m]) > 0"
intervalMs = 1000
legendFormat = "__auto"
maxDataPoints = 43200
range = true
hide = false
refId = "A"
})
}
data {
ref_id = "B"
datasource_uid = "-100"
relative_time_range {
from = 0
to = 0
}
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
3
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "B",
"type": "reduce"
}
EOT
}
}
}
# Log-based rule group: counts error-level log lines in Loki (ref A) and
# fires via a classic-condition expression (ref B) when any are present.
resource "grafana_rule_group" "loki_alert" {
name = "Loki Alert Rule Group"
folder_uid = grafana_folder.loki_alert.uid
interval_seconds = 60
org_id = 1
rule {
name = "ServiceLogErrors"
no_data_state = "OK"
exec_err_state = "Alerting"
for = "0s"
condition = "B"
annotations = {
# Rendered by the "ErrorLogMessage" message template: one bold app name
# and message per alerting series.
"AlertValues" = <<EOT
{{ with $values }}
{{ range $k, $v := . }}
**{{toUpper $v.Labels.app}}** => {{$v.Labels.message}}
{{ end }}
{{ end }}
EOT
# Pre-built (URL-encoded) Grafana Explore link to the same Loki query.
"GrafanaLink" = "https://${var.monitoring_domain_config}/explore?orgId=1&left=%7B\"datasource\":\"loki\",\"queries\":%5B%7B\"refId\":\"A\",\"editorMode\":\"code\",\"expr\":\"sum%20by%28app,message%29%20%28count_over_time%28%7Bapp%3D~%5C\".%2A%5C\",level%3D%5C\"Error%5C\"%7D%20%7C%20pattern%20%5C\"<app>%20<message>%5C\"%20%5B$__interval%5D%29%29\",\"queryType\":\"range\"%7D%5D,\"range\":%7B\"from\":\"now-15m\",\"to\":\"now\"%7D%7D"
}
labels = {
"level" = "error"
"type" = "api"
"datasource" = "Loki"
}
data {
ref_id = "A"
query_type = "range"
relative_time_range {
from = 900
to = 0
}
datasource_uid = data.grafana_data_source.loki.uid
model = jsonencode({
expr = <<EOT
sum by(app,message) (count_over_time({app=~".*",level="Error"} | pattern "<app> <message>" [${var.time_log}]))
EOT
editorMode = "builder"
hide = false
intervalMs = 1000
legendFormat = ""
maxDataPoints = 43200
# FIX: these JSON keys were mis-cased (query_type / refID); Grafana's
# query model expects camelCase queryType / refId, as used by every
# other model in this file, so the mis-cased fields were ignored.
queryType = "range"
refId = "A"
})
}
data {
ref_id = "B"
query_type = ""
relative_time_range {
from = 0
to = 0
}
datasource_uid = "-100" # Expression query type ("-100" is the legacy expression uid)
model = <<EOT
{
"conditions": [
{
"evaluator": {
"params": [
0
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A"
]
},
"reducer": {
"params": [],
"type": "count"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "-100"
},
"expression": "A",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "B",
"type": "classic_conditions"
}
EOT
}
}
}