ansible-role-prometheus-agent/defaults/main.yml
2020-11-20 22:56:28 +01:00

249 lines
9.3 KiB
YAML

prometheus_agent:
alerts:
# NodeDown: the `up` series of the "node" job for this host has been 0, or
# missing entirely, for 10 minutes.
NodeDown:
  group: nodeexporter
  enabled: true
  # Folded scalar: the line break inside the expression is read back as a
  # single space, so the value is one line without a trailing newline.
  expr: >-
    up{job="node", instance="{{ inventory_hostname }}"} == 0
    or absent(up{job="node", instance="{{ inventory_hostname }}"})
  for: 10m
  labels:
    severity: critical
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: down{%endraw%}'
    description: '{%raw%}{{ $labels.instance }} is down{%endraw%}'
# MemoryWarning: memory in use (total minus free, cached and buffers) has been
# above 80 % of total memory for 5 minutes.
# NOTE(review): `alert: LowMem` is shared with MemoryCritical; presumably the
# alert name emitted by the role's templates - confirm there.
MemoryWarning:
  alert: LowMem
  group: nodeexporter
  enabled: true
  expr: |
    (node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"} -
    (node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"} + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}))
    / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: MemoryWarning{%endraw%}'
    description: 'Node memory is filling up'
# MemoryCritical: memory in use (total minus free, cached and buffers) has
# been above 90 % of total memory for 5 minutes.
# NOTE(review): `alert: LowMem` is shared with MemoryWarning; presumably the
# alert name emitted by the role's templates - confirm there.
MemoryCritical:
  alert: LowMem
  group: nodeexporter
  enabled: true
  expr: |
    (node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"} -
    (node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"} + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}))
    / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 90
  for: 5m
  labels:
    severity: critical
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: MemoryCritical{%endraw%}'
    description: 'Node memory is full'
# DiscWarning: a filesystem on this host has had less than 10 % of its bytes
# available for 5 minutes.
DiscWarning:
  alert: LowDisc
  group: nodeexporter
  enabled: true
  # Folded scalar: each line break is read back as a single space.
  expr: >-
    (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
    / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"}
    < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: DiscWarning{%endraw%}'
    description: 'Node disc is full soon'
# DiscCritical: a filesystem on this host has had less than 5 % of its bytes
# available for 5 minutes.
DiscCritical:
  alert: LowDisc
  group: nodeexporter
  enabled: true
  # Folded scalar: each line break is read back as a single space.
  expr: >-
    (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
    / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"}
    < 5
  for: 5m
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: DiscCritical{%endraw%}'
    description: 'Node disc is full'
# DiscFillingWarning: a linear extrapolation of the last 4 hours of free bytes
# predicts a filesystem dropping below zero within 12 hours (12 * 3600 s).
# Filesystems whose fstype is tmpfs are excluded.
DiscFillingWarning:
  alert: DiscFilling
  group: nodeexporter
  enabled: true
  expr: 'predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[4h], 12 * 3600) < 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: DiscFillingWarning{%endraw%}'
    description: 'Node disc is filling in 12h'
# DiscFillingCritical: a linear extrapolation of the last 4 hours of free
# bytes predicts a filesystem dropping below zero within 1 hour (1 * 3600 s).
# Filesystems whose fstype is tmpfs are excluded.
DiscFillingCritical:
  alert: DiscFilling
  group: nodeexporter
  enabled: true
  expr: 'predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[4h], 1 * 3600) < 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: DiscFillingCritical{%endraw%}'
    description: 'Node disc is filling in 1h'
# DiscInodesWarning: fewer than 15 % of a filesystem's inodes (files) have
# been free for 5 minutes.
DiscInodesWarning:
  alert: DiscInodes
  group: nodeexporter
  enabled: true
  # Folded scalar: each line break is read back as a single space.
  expr: >-
    node_filesystem_files_free{job="node", instance="{{ inventory_hostname }}"}
    / node_filesystem_files{job="node", instance="{{ inventory_hostname }}"}
    * 100 < 15
  for: 5m
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: DiscInodesWarning{%endraw%}'
    description: 'Disk is almost running out of available inodes'
# CPUWarning: average non-idle CPU time over the last 5 minutes is above 80 %,
# or the 5-minute load average exceeds the number of CPUs, for 5 minutes.
CPUWarning:
  group: nodeexporter
  alert: CPULoad
  enabled: true
  # The usage branch aggregates with `without(cpu, mode)` instead of a bare
  # avg(): a bare avg() drops every label, which leaves the instance label
  # used in the title empty. It also gives both sides of `or` the same label
  # set, matching the `count without(cpu, mode)` in the load branch.
  expr: |
    100 - avg without(cpu, mode) (irate(node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}[5m])) * 100 > 80 or
    node_load5{job="node", instance="{{ inventory_hostname }}"} > count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"})
  for: 5m
  labels:
    severity: warning
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: CPUWarning{%endraw%}'
    description: "High CPU load or usage"
# CPUCritical: average non-idle CPU time over the last 5 minutes is above 90 %,
# or the 5-minute load average exceeds twice the number of CPUs, for 5 minutes.
#
# The expression used to be identical to CPUWarning (80 % / 1x CPU count), so
# the critical alert always fired together with the warning. The thresholds
# below follow the 80/90 split used by the memory and temperature alerts.
# NOTE(review): 90 % and 2x CPU count are chosen values - tune as needed.
CPUCritical:
  group: nodeexporter
  alert: CPULoad
  enabled: true
  # The usage branch aggregates with `without(cpu, mode)` instead of a bare
  # avg(): a bare avg() drops every label, which leaves the instance label
  # used in the title empty. It also gives both sides of `or` the same label
  # set, matching the `count without(cpu, mode)` in the load branch.
  expr: |
    100 - avg without(cpu, mode) (irate(node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}[5m])) * 100 > 90 or
    node_load5{job="node", instance="{{ inventory_hostname }}"} > 2 * count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"})
  for: 5m
  labels:
    severity: critical
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: CPUCritical{%endraw%}'
    description: "Very high CPU load or usage"
# SwapWarning: more than 50 % of swap has been in use for 5 minutes.
SwapWarning:
  group: nodeexporter
  enabled: true
  alert: SwapLow
  expr: (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"} / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 50
  # Hold-off added to match SwapCritical and the other alerts in this file;
  # without it the warning fired on a single evaluation above the threshold.
  for: 5m
  labels:
    severity: warning
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: SwapWarning{%endraw%}'
    description: "Swap running full"
# SwapCritical: more than 80 % of swap has been in use for 5 minutes.
SwapCritical:
  alert: SwapLow
  group: nodeexporter
  enabled: true
  # Folded scalar: each line break is read back as a single space.
  expr: >-
    (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"}
    / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"}))
    * 100 > 80
  for: 5m
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: SwapCritical{%endraw%}'
    description: 'Swap running full'
# MDADMRaidCritical: an md RAID array on this host has been degraded for
# 5 minutes. Three conditions are OR-ed together:
#   1. total disks minus active disks is non-zero,
#   2. at least one disk is reported in the failed state,
#   3. the array state is reported as inactive.
MDADMRaidCritical:
  group: nodeexporter
  enabled: true
  # The state label value is "failed": node_exporter reports the states
  # active/failed/spare on node_md_disks, so the previous matcher
  # state="fail" could never match.
  # NOTE(review): node_md_disks_active looks like the pre-1.0 node_exporter
  # metric name; branch 1 is kept unchanged for hosts still exporting it.
  expr: |
    node_md_disks{job="node", instance="{{ inventory_hostname }}"} - node_md_disks_active{job="node", instance="{{ inventory_hostname }}"} != 0 or
    node_md_disks{state="failed", job="node", instance="{{ inventory_hostname }}"} > 0 or
    node_md_state{state="inactive", job="node", instance="{{ inventory_hostname }}"} > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: MDADMRaidCritical{%endraw%}'
    description: "Raid is missing a disk"
# OOM: the node_vmstat_oom_kill counter increased within the last 10 minutes,
# and that has held true for 1 minute.
OOM:
  group: nodeexporter
  enabled: true
  expr: 'increase(node_vmstat_oom_kill{job="node", instance="{{ inventory_hostname }}"}[10m]) > 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: OOM{%endraw%}'
    description: 'OOM killer run'
# TempWarning: a hardware-monitor temperature sensor on this host has read
# above 80 degrees Celsius for 5 minutes.
TempWarning:
  group: nodeexporter
  enabled: true
  alert: TempHigh
  expr: |
    node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: TempWarning{%endraw%}'
    # Spelling corrected (was "Temperatur raising").
    description: "Temperature rising"
# TempCritical: a hardware-monitor temperature alarm is raised, or a sensor
# has read above 90 degrees Celsius, for 5 minutes.
TempCritical:
  group: nodeexporter
  enabled: true
  alert: TempHigh
  expr: |
    node_hwmon_temp_alarm{job="node", instance="{{ inventory_hostname }}"} > 0 or
    node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 90
  for: 5m
  labels:
    severity: critical
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: TempCritical{%endraw%}'
    # Spelling corrected (was "Temperatur ...").
    description: "Temperature much too high, shutdown soon pending"
# SystemdServices: at least one systemd unit on this host has been in the
# "failed" state for 5 minutes.
SystemdServices:
  group: nodeexporter
  enabled: true
  alert: systemd
  # Exact match (`=`) on the instance label, like every other alert in this
  # file. The previous regex matcher (`=~`) treated the dots of the hostname
  # as "any character" wildcards and could match unrelated instances.
  expr: |
    node_systemd_units{job="node", instance="{{ inventory_hostname }}", state="failed"} > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    # The raw/endraw markers stop Jinja from expanding the `$labels`
    # placeholder, so it is passed on literally.
    title: '{%raw%}{{ $labels.instance }}: Not all systemd services are running{%endraw%}'
    description: "Service not running"
# TLS settings.
# NOTE(review): what `mode`, `manage` and `pki` control is decided by the
# role's tasks/templates, which are not visible here.
tls:
  mode: stunnel
  # Canonical boolean; the YAML 1.1 loader used by Ansible reads the former
  # `yes` as the same value.
  manage: true
  # Explicit null (previously written as `~`).
  pki: null
# Agent definitions. The node exporter gets a `web.listen-address` argument
# pointing at the IPv6 loopback address, port 9100.
agents:
  nodeexporter:
    args:
      # The value must stay quoted: a plain scalar starting with `[` would be
      # parsed as a flow sequence.
      web.listen-address: '[::1]:9100'
# Proxy settings; `mappings` defaults to an empty mapping.
# NOTE(review): indentation was lost in this copy of the file, so the parent
# key of `proxy` cannot be determined here - confirm the nesting against the
# role's templates before relying on it.
proxy:
  mappings: {}
# Blackbox exporter agent. `enable` defaults to false. `args` carries the
# listen address (IPv6 loopback, port 9115) and the config file path.
# `config.modules` defines five probe modules, all preferring IPv4.
blackbox:
  enable: false
  args:
    # The address must stay quoted: a plain scalar starting with `[` would
    # be parsed as a flow sequence.
    web.listen-address: '[::1]:9115'
    config.file: '/etc/prometheus/blackbox.yml'
  config:
    modules:
      # HTTP probe with default settings.
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: ip4
      # HTTP probe using the POST method.
      http_post_2xx:
        prober: http
        http:
          method: POST
          preferred_ip_protocol: ip4
      # Plain TCP probe.
      tcp_connect:
        prober: tcp
        tcp:
          preferred_ip_protocol: ip4
      # TCP probe that expects a response matching the regex below.
      ssh_banner:
        prober: tcp
        tcp:
          preferred_ip_protocol: ip4
          query_response:
            - expect: '^SSH-2.0-'
      # ICMP probe.
      icmp:
        prober: icmp
        icmp:
          preferred_ip_protocol: ip4
# Remaining defaults: empty mappings for jobs, scrapers and labels, unset
# (null) scrape timeout and interval, and the groups-as-labels switch on.
# NOTE(review): indentation was lost in this copy of the file, so the parent
# key of these settings cannot be determined here; they are kept as siblings
# of one another.
jobs: {}
scrape_timeout: null
scrape_interval: null
scrapers: {}
ansible_groups_as_labels: true
labels: {}