---
# Prometheus alerting rules for node-level health.
#
# Layout: one rule group (`node_alerts`) holding two alerts. Each alert has
#   expr        - PromQL condition; the alert is pending while it returns series
#   for         - how long the condition must hold before the alert fires
#   labels      - extra labels attached to the alert (used here for severity)
#   annotations - templated, human-readable text; {{ $labels.<name> }} expands
#                 to the label values of the series that triggered the alert
groups:
  - name: node_alerts
    rules:
      # Fires when a target's `up` series has been 0 for a full minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."

      # Fires when per-instance CPU usage stays above 80% for two minutes.
      # Usage is derived as 100 minus the idle percentage, where the idle
      # percentage is the per-instance average of the idle-mode rate of
      # node_cpu_seconds_total over a 5m window.
      #
      # `rate` is used instead of `irate`: irate only looks at the last two
      # samples in the window, so a single low sample can drop the expression
      # below the threshold and reset the `for` timer. Prometheus recommends
      # `rate` for alerting expressions.
      - alert: HighCpuUsage
        # Folded scalar: the lines below are joined with single spaces into
        # one PromQL expression.
        expr: >-
          100 - (avg by (instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for more than 2 minutes on {{ $labels.instance }}"