Add prometheus/alert.rules.yml
This commit is contained in:
40
prometheus/alert.rules.yml
Normal file
40
prometheus/alert.rules.yml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Prometheus alerting rules: scrape-target availability plus basic host
# resource checks (CPU, memory, disk) based on node_exporter metrics.
groups:
  # Fires when a scrape target stops answering.
  - name: instance-health
    rules:
      - alert: InstanceDown
        # `up` is recorded by Prometheus per scrape target; 0 means the
        # last scrape failed.
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 5 minutes."

  # Resource-saturation warnings per host.
  - name: host-health
    rules:
      - alert: HighCPULoad
        # CPU utilisation is computed as 1 - idle fraction, averaged over
        # all cores of an instance. Averaging the non-idle per-mode rates
        # directly (one series per cpu AND mode) dilutes the value by the
        # number of modes, so it could never reach the 0.85 threshold.
        expr: >-
          (1 - avg by (instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])))
          > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "Average CPU usage > 85% for 10m."

      - alert: HighMemoryUsage
        # Used fraction = (total - available) / total.
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes
          > 0.9
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage > 90% for 10m."

      - alert: DiskSpaceLow
        # Used fraction per filesystem, skipping tmpfs and overlay mounts.
        # NOTE(review): this uses node_filesystem_free_bytes; confirm
        # whether node_filesystem_avail_bytes (space usable by
        # unprivileged users) is the intended basis.
        expr: >-
          (node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}
          - node_filesystem_free_bytes{fstype!~"tmpfs|overlay"})
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}
          > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Disk getting full on {{ $labels.instance }}"
          description: "Disk usage > 85% on {{ $labels.mountpoint }}."
|
||||||
Reference in New Issue
Block a user