230 lines
8.6 KiB
YAML
230 lines
8.6 KiB
YAML
prometheus_agent:
|
|
alerts:
|
|
# NodeDown: fires when the `up` series of this host's "node" job is 0, or is
# missing altogether, for 10 minutes.
# `{{ inventory_hostname }}` is a template placeholder filled in at render time.
NodeDown:
  enabled: true
  group: nodeexporter
  expr: >-
    up{job="node", instance="{{ inventory_hostname }}"} == 0
    or absent(up{job="node", instance="{{ inventory_hostname }}"})
  for: "10m"
  labels:
    severity: critical
  annotations:
    # {%raw%} … {%endraw%} keeps `{{ $labels.instance }}` literal during the
    # Jinja pass.
    title: "{%raw%}{{ $labels.instance }}: down{%endraw%}"
    description: "{%raw%}{{ $labels.instance }} is down{%endraw%}"
|
|
# MemoryWarning: used memory, computed as
# MemTotal - MemFree - (Cached + Buffers), exceeds 80 % of MemTotal for 5 minutes.
MemoryWarning:
  enabled: true
  group: nodeexporter
  # Same `alert` value as MemoryCritical (presumably the emitted alert name —
  # confirm against the rule template).
  alert: LowMem
  for: "5m"
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: MemoryWarning{%endraw%}"
    description: 'Node memory is filling up'
  expr: |
    (node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"} -
    (node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"} + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}))
    / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 80
|
|
# MemoryCritical: used memory, computed as
# MemTotal - MemFree - (Cached + Buffers), exceeds 90 % of MemTotal for 5 minutes.
MemoryCritical:
  enabled: true
  group: nodeexporter
  # Same `alert` value as MemoryWarning (presumably the emitted alert name —
  # confirm against the rule template).
  alert: LowMem
  for: "5m"
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: MemoryCritical{%endraw%}"
    description: 'Node memory is full'
  expr: |
    (node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"} -
    (node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"} + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}))
    / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 90
|
|
# DiscWarning: available filesystem space is below 10 % of the filesystem size
# for 5 minutes.
DiscWarning:
  enabled: true
  group: nodeexporter
  alert: LowDisc
  # Folded scalar: the two lines below are joined by a single space.
  expr: >-
    (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
    / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 10
  for: "5m"
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscWarning{%endraw%}"
    description: 'Node disc is full soon'
|
|
# DiscCritical: available filesystem space is below 5 % of the filesystem size
# for 5 minutes.
DiscCritical:
  enabled: true
  group: nodeexporter
  alert: LowDisc
  # Folded scalar: the two lines below are joined by a single space.
  expr: >-
    (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
    / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 5
  for: "5m"
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscCritical{%endraw%}"
    description: 'Node disc is full'
|
|
# DiscFillingWarning: a linear extrapolation of the last hour of
# node_filesystem_free_bytes (fstype not matching "tmpfs") drops below 0
# within 12 * 3600 seconds; must hold for 5 minutes.
DiscFillingWarning:
  enabled: true
  group: nodeexporter
  alert: DiscFilling
  # Folded scalar: the two lines below are joined by a single space.
  expr: >-
    predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[1h],
    12 * 3600) < 0
  for: "5m"
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscFillingWarning{%endraw%}"
    description: 'Node disc is filling'
|
|
# DiscFillingCritical: a linear extrapolation of the last hour of
# node_filesystem_free_bytes (fstype not matching "tmpfs") drops below 0
# within 1 * 3600 seconds; must hold for 5 minutes.
DiscFillingCritical:
  enabled: true
  group: nodeexporter
  alert: DiscFilling
  # Folded scalar: the two lines below are joined by a single space.
  expr: >-
    predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[1h],
    1 * 3600) < 0
  for: "5m"
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscFillingCritical{%endraw%}"
    description: 'Node disc is filling fast'
|
|
# DiscInodesWarning: free inodes (node_filesystem_files_free) are below 15 %
# of all inodes (node_filesystem_files) for 5 minutes.
DiscInodesWarning:
  group: nodeexporter
  alert: DiscInodes
  enabled: true
  # Folded scalar: the two lines below are joined by a single space.
  expr: >-
    node_filesystem_files_free{job="node", instance="{{ inventory_hostname }}"}
    / node_filesystem_files{job="node", instance="{{ inventory_hostname }}"} * 100 < 15
  for: "5m"
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscInodesWarning{%endraw%}"
    description: 'Disk is almost running out of available inodes'
|
|
# CPUWarning: fires after 5 minutes when either
#   * average non-idle CPU time across all cores exceeds 80 %, or
#   * the 5-minute load average exceeds the number of CPU cores
#     (cores are counted via the per-CPU mode="idle" series).
CPUWarning:
  group: nodeexporter
  alert: CPULoad
  enabled: true
  # `avg without(cpu, mode)` (instead of a bare `avg`) keeps the `instance` and
  # `job` labels on the result. A bare `avg(...)` drops every label, which left
  # `{{ $labels.instance }}` in the title empty whenever the CPU-usage branch
  # triggered the alert.
  expr: |
    100 - avg without(cpu, mode) (irate(node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}[5m])) * 100 > 80 or
    node_load5{job="node", instance="{{ inventory_hostname }}"} > count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"})
  for: 5m
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: CPUWarning{%endraw%}'
    description: "High CPU load or usage"
|
|
# CPUCritical: fires after 5 minutes when either
#   * average non-idle CPU time across all cores exceeds 90 %, or
#   * the 5-minute load average exceeds twice the number of CPU cores.
#
# NOTE(review): the previous expression was identical to CPUWarning (> 80 % and
# load5 > core count), so the critical alert always fired together with the
# warning. The 90 % / 2x-cores thresholds are a proposed escalation step (90 %
# mirrors the 80/90 split of the memory alerts) — tune to the fleet if needed.
CPUCritical:
  group: nodeexporter
  alert: CPULoad
  enabled: true
  # `avg without(cpu, mode)` keeps the `instance` label so that
  # `{{ $labels.instance }}` in the title is populated; a bare `avg(...)`
  # drops all labels.
  expr: |
    100 - avg without(cpu, mode) (irate(node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}[5m])) * 100 > 90 or
    node_load5{job="node", instance="{{ inventory_hostname }}"} > 2 * count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"})
  for: 5m
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: CPUCritical{%endraw%}'
    description: "Very high CPU load or usage"
|
|
# SwapWarning: more than 50 % of swap is in use
# ((1 - SwapFree / SwapTotal) * 100 > 50) for 5 minutes.
SwapWarning:
  group: nodeexporter
  enabled: true
  alert: SwapLow
  expr: (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"} / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 50
  # Previously missing: without `for`, the alert fired on a single sample.
  # Aligned with SwapCritical, which waits 5 minutes.
  for: 5m
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: SwapWarning{%endraw%}'
    description: "Swap running full"
|
|
# SwapCritical: more than 80 % of swap is in use
# ((1 - SwapFree / SwapTotal) * 100 > 80) for 5 minutes.
SwapCritical:
  enabled: true
  group: nodeexporter
  alert: SwapLow
  # Folded scalar: the two lines below are joined by a single space.
  expr: >-
    (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"}
    / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 80
  for: "5m"
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: SwapCritical{%endraw%}"
    description: 'Swap running full'
|
|
# MDADMRaidCritical: fires after 5 minutes when any of these hold for an md
# array on this host:
#   * node_md_disks differs from node_md_disks_active
#     (NOTE(review): presumably kept for older exporters that still publish
#     node_md_disks_active — confirm),
#   * at least one disk is reported in the "failed" state,
#   * the array itself is reported as "inactive".
MDADMRaidCritical:
  group: nodeexporter
  enabled: true
  # The state label value published by node_exporter's mdadm collector is
  # "failed" (alongside "active" and "spare"); the former matcher
  # state="fail" never selected any series, so this branch could not fire.
  expr: |
    node_md_disks{job="node", instance="{{ inventory_hostname }}"} - node_md_disks_active{job="node", instance="{{ inventory_hostname }}"} != 0 or
    node_md_disks{state="failed", job="node", instance="{{ inventory_hostname }}"} > 0 or
    node_md_state{state="inactive", job="node", instance="{{ inventory_hostname }}"} > 0
  for: 5m
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: MDADMRaidCritical{%endraw%}'
    description: "Raid is missing a disk"
|
|
# OOM: the node_vmstat_oom_kill counter increased within the last 10 minutes;
# the condition must hold for 1 minute.
OOM:
  enabled: true
  group: nodeexporter
  expr: 'increase(node_vmstat_oom_kill{job="node", instance="{{ inventory_hostname }}"}[10m]) > 0'
  for: "1m"
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: OOM{%endraw%}"
    description: 'OOM killer run'
|
|
# TempWarning: a hwmon temperature reading on this host is above 80 °C for
# 5 minutes.
TempWarning:
  group: nodeexporter
  enabled: true
  alert: TempHigh
  expr: |
    node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: TempWarning{%endraw%}'
    # Spelling fixed ("Temperatur" -> "Temperature") in the user-facing text.
    description: "Temperature too high"
|
|
# TempCritical: a hwmon temperature alarm flag is raised
# (node_hwmon_temp_alarm > 0) or a temperature reading is above 90 °C,
# for 5 minutes.
TempCritical:
  group: nodeexporter
  alert: TempHigh
  enabled: true
  for: "5m"
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: TempCritical{%endraw%}"
    description: 'Too hot!'
  expr: |
    node_hwmon_temp_alarm{job="node", instance="{{ inventory_hostname }}"} > 0 or
    node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 90
|
|
|
|
# TLS settings for the agent endpoints.
# NOTE(review): key meanings are inferred from their names — `mode` presumably
# selects the TLS wrapper and `manage` whether the role sets it up; confirm
# against the role's tasks.
tls:
  mode: stunnel
  # Canonical boolean: `yes` is a boolean only under YAML 1.1 and is read as
  # the string "yes" by YAML 1.2 parsers.
  manage: true
  # Explicit null instead of the `~` shorthand; no PKI value is set by default.
  pki: null
|
|
# Per-agent settings. The node exporter is given a single argument that sets
# its listen address to the IPv6 loopback on port 9100.
agents:
  nodeexporter:
    args:
      # Value must stay quoted: a leading `[` would otherwise start a flow
      # sequence.
      web.listen-address: '[::1]:9100'
|
|
# Proxy section; the default is an empty mapping table.
# NOTE(review): the parent of this key is not recoverable from the flattened
# source — re-indent to match the surrounding structure.
proxy:
  mappings: {}
|
|
# Blackbox exporter settings: switched off by default, listening on the IPv6
# loopback (port 9115) and reading /etc/prometheus/blackbox.yml.
# `config.modules` defines five probe modules: two HTTP (GET default / POST),
# plain TCP connect, an SSH banner check over TCP, and ICMP.
blackbox:
  enable: false
  args:
    # Value must stay quoted: a leading `[` would otherwise start a flow
    # sequence.
    web.listen-address: '[::1]:9115'
    config.file: /etc/prometheus/blackbox.yml
  config:
    modules:
      http_2xx:
        prober: http
        # Explicit null — the original left this key without a value.
        http: null
      http_post_2xx:
        prober: http
        http:
          method: POST
      tcp_connect:
        prober: tcp
      ssh_banner:
        prober: tcp
        tcp:
          query_response:
            # Expect the peer's first line to start with the SSH-2.0 banner.
            - expect: '^SSH-2.0-'
      icmp:
        prober: icmp
|
|
# Remaining defaults: empty job/scraper/label maps, unset scrape timing, and
# Ansible-groups-as-labels switched on.
# NOTE(review): the parent of these keys is not recoverable from the flattened
# source — re-indent to match the surrounding structure.
jobs: {}
# Explicit null instead of the `~` shorthand (no value set).
scrape_timeout: null
scrape_interval: null
scrapers: {}
ansible_groups_as_labels: true
labels: {}
|