---
# Configuration tree for `prometheus_agent`: alert rule definitions plus TLS,
# exporter, proxy and scrape settings.
#
# Templating notes:
#   * `{{ inventory_hostname }}` is a Jinja expression, so every rule below is
#     scoped to a single host through its `instance` label matcher.
#   * `{%raw%}...{%endraw%}` keeps Prometheus' own `{{ $labels.instance }}`
#     template from being evaluated by Jinja.
prometheus_agent:
  # Alert definitions keyed by a unique rule name. Entries that share the same
  # `alert` value form a warning/critical pair distinguished by `severity`.
  alerts:
    # `up` is 0, or the series is missing altogether, for 10 minutes.
    NodeDown:
      group: nodeexporter
      enabled: true
      expr: >-
        up{job="node", instance="{{ inventory_hostname }}"} == 0
        or absent(up{job="node", instance="{{ inventory_hostname }}"})
      for: 10m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: down{%endraw%}'
        description: '{%raw%}{{ $labels.instance }} is down{%endraw%}'

    # Used memory (total - free - cached - buffers) above 80 % of total.
    MemoryWarning:
      group: nodeexporter
      alert: LowMem
      enabled: true
      expr: |
        (
          node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"}
          - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"}
          - (
            node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"}
            + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}
          )
        ) / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: MemoryWarning{%endraw%}'
        description: "Node memory is filling up"

    # Same computation as MemoryWarning with a 90 % threshold.
    MemoryCritical:
      group: nodeexporter
      alert: LowMem
      enabled: true
      expr: |
        (
          node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"}
          - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"}
          - (
            node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"}
            + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}
          )
        ) / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 90
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: MemoryCritical{%endraw%}'
        description: "Node memory is full"

    # Available filesystem space below 10 % of the filesystem size.
    DiscWarning:
      group: nodeexporter
      alert: LowDisc
      enabled: true
      expr: >-
        (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
        / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 10
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscWarning{%endraw%}'
        description: "Node disc is full soon"

    # Available filesystem space below 5 % of the filesystem size.
    DiscCritical:
      group: nodeexporter
      alert: LowDisc
      enabled: true
      expr: >-
        (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
        / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 5
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscCritical{%endraw%}'
        description: "Node disc is full"

    # Linear extrapolation of the last hour predicts no free bytes within
    # 12 hours (tmpfs filesystems excluded).
    DiscFillingWarning:
      group: nodeexporter
      alert: DiscFilling
      enabled: true
      expr: >-
        predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[1h],
        12 * 3600) < 0
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscFillingWarning{%endraw%}'
        description: "Node disc is filling"

    # Same prediction as DiscFillingWarning with a 1 hour horizon.
    DiscFillingCritical:
      group: nodeexporter
      alert: DiscFilling
      enabled: true
      expr: >-
        predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[1h],
        1 * 3600) < 0
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscFillingCritical{%endraw%}'
        description: "Node disc is filling fast"

    # Free inodes below 15 % of all inodes on a filesystem.
    DiscInodesWarning:
      group: nodeexporter
      alert: DiscInodes
      enabled: true
      expr: >-
        node_filesystem_files_free{job="node", instance="{{ inventory_hostname }}"}
        / node_filesystem_files{job="node", instance="{{ inventory_hostname }}"} * 100 < 15
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscInodesWarning{%endraw%}'
        description: "Disk is almost running out of available inodes"

    # Average non-idle CPU above 80 %, or 5-minute load above the core count.
    # `avg without(cpu, mode)` keeps the `instance` label so the title template
    # below renders, and so both branches of the `or` share one label set.
    CPUWarning:
      group: nodeexporter
      alert: CPULoad
      enabled: true
      expr: |
        100 - avg without(cpu, mode) (
          irate(node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}[5m])
        ) * 100 > 80
        or
        node_load5{job="node", instance="{{ inventory_hostname }}"}
          > count without(cpu, mode) (
            node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}
          )
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: CPUWarning{%endraw%}'
        description: "High CPU load or usage"

    # Escalation of CPUWarning: usage above 90 %, or 5-minute load above twice
    # the core count. The thresholds are deliberately stricter than the warning
    # rule so that the critical alert signals a distinct condition.
    # NOTE(review): 90 % and 2x cores are chosen to mirror the 80/90 pattern of
    # the memory and temperature rules - adjust to local policy if needed.
    CPUCritical:
      group: nodeexporter
      alert: CPULoad
      enabled: true
      expr: |
        100 - avg without(cpu, mode) (
          irate(node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}[5m])
        ) * 100 > 90
        or
        node_load5{job="node", instance="{{ inventory_hostname }}"}
          > 2 * count without(cpu, mode) (
            node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}
          )
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: CPUCritical{%endraw%}'
        description: "Very high CPU load or usage"

    # Used swap above 50 % of total swap.
    SwapWarning:
      group: nodeexporter
      alert: SwapLow
      enabled: true
      expr: >-
        (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"}
        / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 50
      # Hold-off added for parity with SwapCritical and every other rule here;
      # without it a single sample above the threshold fires the alert.
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: SwapWarning{%endraw%}'
        description: "Swap running full"

    # Used swap above 80 % of total swap.
    SwapCritical:
      group: nodeexporter
      alert: SwapLow
      enabled: true
      expr: >-
        (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"}
        / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 80
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: SwapCritical{%endraw%}'
        description: "Swap running full"

    # Software RAID: total and active disk counts differ, or a disk is in
    # state "fail", or an array is in state "inactive".
    MDADMRaidCritical:
      group: nodeexporter
      enabled: true
      expr: |
        node_md_disks{job="node", instance="{{ inventory_hostname }}"}
          - node_md_disks_active{job="node", instance="{{ inventory_hostname }}"} != 0
        or
        node_md_disks{state="fail", job="node", instance="{{ inventory_hostname }}"} > 0
        or
        node_md_state{state="inactive", job="node", instance="{{ inventory_hostname }}"} > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: MDADMRaidCritical{%endraw%}'
        description: "Raid is missing a disk"

    # The OOM kill counter increased within the last 10 minutes.
    OOM:
      group: nodeexporter
      enabled: true
      expr: increase(node_vmstat_oom_kill{job="node", instance="{{ inventory_hostname }}"}[10m]) > 0
      for: 1m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: OOM{%endraw%}'
        description: "OOM killer run"

    # Any hwmon temperature sensor above 80 degrees Celsius.
    TempWarning:
      group: nodeexporter
      alert: TempHigh
      enabled: true
      expr: |
        node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: TempWarning{%endraw%}'
        description: "Temperature too high"

    # A hwmon temperature alarm is raised, or a sensor is above 90 degrees
    # Celsius.
    TempCritical:
      group: nodeexporter
      alert: TempHigh
      enabled: true
      expr: |
        node_hwmon_temp_alarm{job="node", instance="{{ inventory_hostname }}"} > 0
        or
        node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 90
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: TempCritical{%endraw%}'
        description: "Too hot!"

  # NOTE(review): the nesting of the keys below was reconstructed from a
  # whitespace-collapsed source; they are assumed to be direct children of
  # `prometheus_agent` - confirm against the consumer of this variable.
  tls:
    mode: stunnel
    manage: true
    pki: null

  agents:
    nodeexporter:
      # Exporter arguments; the listen address is bound to IPv6 loopback.
      args:
        'web.listen-address': "[::1]:9100"

  proxy:
    mappings: {}

  scrape_timeout: null
  scrape_interval: null
  scrapers: {}
  ansible_groups_as_labels: true
  labels: {}