diff --git a/prometheus/alert.rules.yml b/prometheus/alert.rules.yml
new file mode 100644
index 0000000..a411f7f
--- /dev/null
+++ b/prometheus/alert.rules.yml
@@ -0,0 +1,51 @@
+# Prometheus alerting rules: scrape-target liveness and basic host resources.
+groups:
+  # Liveness of every scrape target.
+  - name: instance-health
+    rules:
+      # Fires when a target's `up` series has been 0 for 5 minutes.
+      - alert: InstanceDown
+        expr: up == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Instance {{ $labels.instance }} down"
+          description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 5 minutes."
+
+  # Host-level CPU, memory and filesystem thresholds.
+  - name: host-health
+    rules:
+      # CPU utilisation is computed as 1 - idle fraction, averaged over all
+      # cores of an instance. Averaging the non-idle modes directly would
+      # spread the busy time across every (cpu, mode) series and keep the
+      # result far below the 0.85 threshold.
+      - alert: HighCPULoad
+        expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.85
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU load on {{ $labels.instance }}"
+          description: "Average CPU usage > 85% for 10m."
+
+      # Used fraction of memory: (total - available) / total.
+      - alert: HighMemoryUsage
+        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage on {{ $labels.instance }}"
+          description: "Memory usage > 90% for 10m."
+
+      # Used fraction per filesystem: (size - free) / size. Series whose
+      # fstype is tmpfs or overlay are excluded by the label matchers.
+      - alert: DiskSpaceLow
+        expr: (node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} - node_filesystem_free_bytes{fstype!~"tmpfs|overlay"}) / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"} > 0.85
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Disk getting full on {{ $labels.instance }}"
+          description: "Disk usage > 85% on {{ $labels.mountpoint }}."