# Settings under the `prometheus_agent` key: TLS handling, the exporters
# ("agents") to run, scrape parameters, static labels and the alert rules
# attached to this host.
#
# NOTE(review): this file was reconstructed from a whitespace-flattened
# source. The nesting of `proxy`, `jobs`, `promtail` and of the
# `scrape_*` / `metrics_path` / `scrapers` keys is a best-effort reading;
# confirm it against the templates that consume these variables.
prometheus_agent:
  tls:
    mode: stunnel
    manage: true
    pki: null

  agents:
    nodeexporter:
      args:
        'web.listen-address': '[::1]:9100'
    proxy:
      mappings: {}
    blackbox:
      enable: false
      args:
        'web.listen-address': '[::1]:9115'
        'config.file': '/etc/prometheus/blackbox.yml'
      config:
        modules:
          http_2xx:
            prober: http
            http:
              preferred_ip_protocol: ip4
          http_post_2xx:
            prober: http
            http:
              method: POST
              preferred_ip_protocol: ip4
          tcp_connect:
            prober: tcp
            tcp:
              preferred_ip_protocol: ip4
          ssh_banner:
            prober: tcp
            tcp:
              preferred_ip_protocol: ip4
              query_response:
                - expect: '^SSH-2.0-'
          icmp:
            icmp:
              preferred_ip_protocol: ip4
            prober: icmp
      jobs: {}
    promtail:
      enable: false

  scrape_timeout: null
  scrape_interval: null
  metrics_path: null
  scrapers: {}
  ansible_groups_as_labels: true
  labels: {}

  # Alert rules. `{{ inventory_hostname }}` is expanded by Jinja when this
  # file is templated, so every expression is pinned to a single instance.
  # Titles are wrapped in {%raw%}…{%endraw%} so that `{{ $labels.instance }}`
  # passes through Jinja untouched. Several rules share an `alert` name
  # (e.g. LowMem) and differ only in threshold and severity.
  alerts:
    NodeDown:
      group: nodeexporter
      enabled: true
      # `absent()` covers the case where the `up` series does not exist at all.
      expr: >-
        up{job="node", instance="{{ inventory_hostname }}"} == 0
        or absent(up{job="node", instance="{{ inventory_hostname }}"})
      for: 10m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: down{%endraw%}'
        description: '{%raw%}{{ $labels.instance }} is down{%endraw%}'

    MemoryWarning:
      group: nodeexporter
      alert: LowMem
      enabled: true
      # Percentage of memory in use, not counting page cache and buffers.
      expr: |
        (
          node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"}
          - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"}
          - (
            node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"}
            + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}
          )
        )
        / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"}
        * 100 > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: MemoryWarning{%endraw%}'
        description: 'Node memory is filling up'

    MemoryCritical:
      group: nodeexporter
      alert: LowMem
      enabled: true
      # Same expression as MemoryWarning with a 90 % threshold.
      expr: |
        (
          node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"}
          - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"}
          - (
            node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"}
            + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}
          )
        )
        / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"}
        * 100 > 90
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: MemoryCritical{%endraw%}'
        description: 'Node memory is full'

    DiscWarning:
      group: nodeexporter
      alert: LowDisc
      enabled: true
      # Fires when less than 10 % of a filesystem is still available.
      expr: >-
        (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
        / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 10
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscWarning{%endraw%}'
        description: 'Node disc is full soon'

    DiscCritical:
      group: nodeexporter
      alert: LowDisc
      enabled: true
      # Fires when less than 5 % of a filesystem is still available.
      expr: >-
        (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
        / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 5
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscCritical{%endraw%}'
        description: 'Node disc is full'

    DiscFillingWarning:
      group: nodeexporter
      alert: DiscFilling
      enabled: true
      # Linear extrapolation of the last 12h, projected 24h ahead.
      expr: >-
        predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node",
        instance="{{ inventory_hostname }}"}[12h], 24 * 3600) < 0
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscFillingWarning{%endraw%}'
        description: 'Node disc is filling in 24h'

    DiscFillingCritical:
      group: nodeexporter
      alert: DiscFilling
      enabled: true
      # Linear extrapolation of the last 12h, projected 3h ahead.
      expr: >-
        predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node",
        instance="{{ inventory_hostname }}"}[12h], 3 * 3600) < 0
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscFillingCritical{%endraw%}'
        description: 'Node disc is filling in 3h'

    DiscInodesWarning:
      group: nodeexporter
      enabled: true
      alert: DiscInodes
      # Fires when fewer than 15 % of the inodes are free.
      expr: >-
        node_filesystem_files_free{job="node", instance="{{ inventory_hostname }}"}
        / node_filesystem_files{job="node", instance="{{ inventory_hostname }}"} * 100 < 15
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: DiscInodesWarning{%endraw%}'
        description: 'Disk is almost running out of available inodes'

    CPUWarning:
      group: nodeexporter
      alert: CPULoad
      enabled: true
      # Compares the 5-minute load average with 10x the number of CPUs
      # (counted via the per-CPU "idle" series).
      expr: |
        node_load5{job="node", instance="{{ inventory_hostname }}"}
        > count without(cpu, mode) (
          node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}
        ) * 10
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: CPUWarning{%endraw%}'
        description: 'High CPU load or usage'

    CPUCritical:
      group: nodeexporter
      alert: CPULoad
      enabled: true
      # Same comparison as CPUWarning with a factor of 20.
      expr: |
        node_load5{job="node", instance="{{ inventory_hostname }}"}
        > count without(cpu, mode) (
          node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}
        ) * 20
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: CPUCritical{%endraw%}'
        description: 'Very high CPU load or usage'

    SwapWarning:
      group: nodeexporter
      enabled: true
      alert: SwapLow
      # Fires when more than 50 % of swap is in use.
      expr: >-
        (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"}
        / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 50
      # Added for parity with SwapCritical and every other rule in this file;
      # without it the warning fires on the first evaluation that matches.
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: SwapWarning{%endraw%}'
        description: 'Swap running full'

    SwapCritical:
      group: nodeexporter
      enabled: true
      alert: SwapLow
      # Fires when more than 80 % of swap is in use.
      expr: >-
        (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"}
        / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 80
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: SwapCritical{%endraw%}'
        description: 'Swap running full'

    MDADMRaidCritical:
      group: nodeexporter
      enabled: true
      # Three independent conditions: disk count differs from active count,
      # any disk in state "fail", or any array in state "inactive".
      expr: |
        node_md_disks{job="node", instance="{{ inventory_hostname }}"}
          - node_md_disks_active{job="node", instance="{{ inventory_hostname }}"} != 0
        or node_md_disks{state="fail", job="node", instance="{{ inventory_hostname }}"} > 0
        or node_md_state{state="inactive", job="node", instance="{{ inventory_hostname }}"} > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: MDADMRaidCritical{%endraw%}'
        description: 'Raid is missing a disk'

    OOM:
      group: nodeexporter
      enabled: true
      # Any increase of the OOM-kill counter within the last 10 minutes.
      expr: >-
        increase(node_vmstat_oom_kill{job="node",
        instance="{{ inventory_hostname }}"}[10m]) > 0
      for: 1m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: OOM{%endraw%}'
        description: 'OOM killer run'

    TempWarning:
      group: nodeexporter
      enabled: true
      alert: TempHigh
      expr: |
        node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: TempWarning{%endraw%}'
        description: 'Temperature rising'

    TempCritical:
      group: nodeexporter
      enabled: true
      alert: TempHigh
      # Either the hwmon alarm series is raised or the reading exceeds 90.
      expr: |
        node_hwmon_temp_alarm{job="node", instance="{{ inventory_hostname }}"} > 0
        or node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 90
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: TempCritical{%endraw%}'
        description: 'Temperature much too high, shutdown soon pending'

    SystemdServices:
      group: nodeexporter
      enabled: true
      alert: systemd
      # Exact match (`=`), like every other rule here: with `=~` the dots of
      # a hostname are regex wildcards and could match other instances.
      expr: |
        node_systemd_units{job="node", instance="{{ inventory_hostname }}", state="failed"} > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        title: '{%raw%}{{ $labels.instance }}: Not all systemd services are running{%endraw%}'
        description: 'Service not running'

    Uptime:
      group: nodeexporter
      enabled: true
      alert: uptime
      # Current time minus boot time, compared with 30 days in seconds.
      expr: |
        node_time_seconds{job="node", instance="{{ inventory_hostname }}"}
        - node_boot_time_seconds{job="node", instance="{{ inventory_hostname }}"}
        > 3600*24*30
      for: 5m
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: Uptime{%endraw%}'
        description: 'Uptime is more than 30 days, please reboot soon'

    AptUpgradesPending:
      group: nodeexporter
      enabled: true
      alert: UpgradesPending
      expr: |
        sum(apt_upgrades_pending{job="node", instance="{{ inventory_hostname }}"}) > 0
        or sum(apt_upgrades_held{job="node", instance="{{ inventory_hostname }}"}) > 0
      # The condition must hold for 25h before the alert fires.
      for: 25h
      labels:
        severity: warning
      annotations:
        title: '{%raw%}{{ $labels.instance }}: AptUpgradesPending{%endraw%}'
        description: 'Upgrades pending for more than 24 hours'