# ansible-role-prometheus-agent/defaults/main.yml
# (file-viewer metadata: 318 lines, 12 KiB, YAML)

prometheus_agent:
# TLS settings for the agent endpoints.
tls:
  # NOTE(review): "stunnel" presumably selects the TLS wrapper implementation —
  # confirm against the role's tasks.
  mode: stunnel
  # Canonical boolean. The previous `yes` is a boolean only under YAML 1.1;
  # YAML 1.2 parsers read it as the string "yes".
  manage: true
  # No PKI is configured by default.
  pki: null
agents:
# node_exporter defaults. `args` maps option names to values (presumably passed
# to the exporter as command-line flags — confirm against the role's templates).
nodeexporter:
  args:
    # Bind to the IPv6 loopback address only, port 9100.
    "web.listen-address": '[::1]:9100'
    # Mount points matching this regex are ignored by the filesystem collector.
    # NOTE(review): newer node_exporter releases appear to have renamed this
    # option to `collector.filesystem.mount-points-exclude` — verify before
    # upgrading the exporter.
    "collector.filesystem.ignored-mount-points": "^/(dev|proc|run|sys|mnt|media|var/lib/docker|var/chroots)($|/)"
# Proxy defaults: no mappings are defined out of the box.
proxy:
  mappings: {}
# blackbox exporter defaults. The agent is disabled until `enable` is overridden.
blackbox:
  enable: false
  args:
    # Bind to the IPv6 loopback address only, port 9115.
    'web.listen-address': '[::1]:9115'
    'config.file': /etc/prometheus/blackbox.yml
  # Probe module definitions (presumably rendered into the file named by
  # `config.file` — confirm). Every module prefers IPv4.
  config:
    modules:
      # HTTP probes: one with the default method, one using POST.
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: ip4
      http_post_2xx:
        prober: http
        http:
          preferred_ip_protocol: ip4
          method: POST
      # Plain TCP probe.
      tcp_connect:
        prober: tcp
        tcp:
          preferred_ip_protocol: ip4
      # TCP probe that expects the peer to send a banner matching ^SSH-2.0-.
      ssh_banner:
        prober: tcp
        tcp:
          preferred_ip_protocol: ip4
          query_response:
            - expect: '^SSH-2.0-'
      # ICMP probe.
      icmp:
        prober: icmp
        icmp:
          preferred_ip_protocol: ip4
  # No probe jobs are defined by default.
  jobs: {}
# Promtail defaults. The agent is disabled until `enable` is overridden.
promtail:
  enable: false
  config:
    server:
      disable: true
      http_listen_port: 9080
      grpc_listen_port: 0
    positions:
      filename: /var/lib/promtail/positions.yaml
    # Two sections are deliberately absent here because they are generated:
    #   - `clients` from prometheus_agent.scrapers
    #   - `scrape_configs` from prometheus_agent.agents.promtail.scrape_jobs
  # Every `scrape_jobs` entry is written like one item of promtail's
  # `scrape_configs` list. A dictionary is used instead of a list so that the
  # default scrape configs are easy to extend or override. Entries with an
  # empty value are ignored, and `job_name` defaults to the entry's key.
  scrape_jobs:
    journal:
      journal:
        max_age: 12h
        labels:
          job: systemd-journal
      # Copy the systemd unit and the priority keyword into the `unit` and
      # `level` labels.
      relabel_configs:
        - source_labels:
            - __journal__systemd_unit
          target_label: unit
        - source_labels:
            - __journal_priority_keyword
          target_label: level
      pipeline_stages:
        - structured_metadata:
            # Explicit null (same value as the bare `level:` it replaces).
            level: null
# snmp exporter defaults. The agent is disabled until `enable` is overridden.
snmp:
  enable: false
  mib_path: /usr/share/snmp/mibs
  args:
    # Bind to the IPv6 loopback address only, port 9116.
    'web.listen-address': '[::1]:9116'
    'config.file': /etc/prometheus/snmp.yml
  # MIB sources. Each list item is an object with exactly one key, either
  # `url` or `file`, for example:
  #   - url: https://example.com/path/to/mib
  #   - file: /path/to/mib
  mibs: []
  # Generator configuration. Format reference:
  # https://github.com/prometheus/snmp_exporter/blob/main/generator/generator.yml
  config:
    modules: {}
  # No SNMP jobs are defined by default.
  jobs: {}
# Scrape parameters; null means no value is set here.
scrape_timeout: null
scrape_interval: null
metrics_path: null
# Empty by default. According to the promtail comments in this file, promtail's
# `clients` section is generated from prometheus_agent.scrapers.
scrapers: {}
# NOTE(review): presumably turns the host's Ansible groups into labels — confirm.
ansible_groups_as_labels: true
# No extra labels by default.
labels: {}
alerts:
# Critical: the `up` series of job "node" for this host has been 0, or missing
# altogether, for 10 minutes.
NodeDown:
  group: nodeexporter
  enabled: true
  for: 10m
  expr: >-
    up{job="node", instance="{{ inventory_hostname }}"} == 0
    or absent(up{job="node", instance="{{ inventory_hostname }}"})
  labels:
    severity: critical
  annotations:
    # The raw/endraw markers stop Ansible's Jinja from expanding the Prometheus
    # template expression.
    title: "{%raw%}{{ $labels.instance }}: down{%endraw%}"
    description: "{%raw%}{{ $labels.instance }} is down{%endraw%}"
# Warning tier of the LowMem alert: memory in use (total minus free, cached and
# buffers) has been above 80 % of total memory for 5 minutes.
MemoryWarning:
  alert: LowMem
  group: nodeexporter
  enabled: true
  for: 5m
  expr: |
    (node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"} -
    (node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"} + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}))
    / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 80
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: MemoryWarning{%endraw%}"
    description: 'Node memory is filling up'
# Critical tier of the LowMem alert: memory in use (total minus free, cached
# and buffers) has been above 90 % of total memory for 5 minutes.
MemoryCritical:
  alert: LowMem
  group: nodeexporter
  enabled: true
  for: 5m
  expr: |
    (node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"} -
    (node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"} + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}))
    / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 90
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: MemoryCritical{%endraw%}"
    description: 'Node memory is full'
# Warning tier of the LowDisc alert: less than 10 % of a filesystem's size has
# been available for 5 minutes.
DiscWarning:
  alert: LowDisc
  group: nodeexporter
  enabled: true
  for: 5m
  expr: >-
    (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
    / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 10
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscWarning{%endraw%}"
    description: 'Node disc is full soon'
# Critical tier of the LowDisc alert: less than 5 % of a filesystem's size has
# been available for 5 minutes.
DiscCritical:
  alert: LowDisc
  group: nodeexporter
  enabled: true
  for: 5m
  expr: >-
    (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100)
    / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 5
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscCritical{%endraw%}"
    description: 'Node disc is full'
# Warning tier of the DiscFilling alert: a linear extrapolation of the last 12h
# of free bytes (tmpfs excluded) drops below zero within 24 hours.
DiscFillingWarning:
  alert: DiscFilling
  group: nodeexporter
  enabled: true
  for: 5m
  expr: 'predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[12h], 24 * 3600) < 0'
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscFillingWarning{%endraw%}"
    description: 'Node disc is filling in 24h'
# Critical tier of the DiscFilling alert: a linear extrapolation of the last
# 12h of free bytes (tmpfs excluded) drops below zero within 3 hours.
DiscFillingCritical:
  alert: DiscFilling
  group: nodeexporter
  enabled: true
  for: 5m
  expr: 'predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[12h], 3 * 3600) < 0'
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscFillingCritical{%endraw%}"
    description: 'Node disc is filling in 3h'
# Warning: fewer than 15 % of a filesystem's inodes have been free for 5 minutes.
DiscInodesWarning:
  alert: DiscInodes
  group: nodeexporter
  enabled: true
  for: 5m
  expr: >-
    node_filesystem_files_free{job="node", instance="{{ inventory_hostname }}"}
    / node_filesystem_files{job="node", instance="{{ inventory_hostname }}"} * 100 < 15
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: DiscInodesWarning{%endraw%}"
    description: 'Disk is almost running out of available inodes'
# Warning tier of the CPULoad alert: the 5-minute load average has exceeded
# 10x the number of CPUs (counted from the idle-mode CPU series) for 5 minutes.
CPUWarning:
  alert: CPULoad
  group: nodeexporter
  enabled: true
  for: 5m
  expr: |
    node_load5{job="node", instance="{{ inventory_hostname }}"} > count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}) * 10
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: CPUWarning{%endraw%}'
    # Spelling fixed in the user-facing text ("ussage" -> "usage").
    description: "High CPU load or usage"
# Critical tier of the CPULoad alert: the 5-minute load average has exceeded
# 20x the number of CPUs (counted from the idle-mode CPU series) for 5 minutes.
CPUCritical:
  alert: CPULoad
  group: nodeexporter
  enabled: true
  for: 5m
  expr: |
    node_load5{job="node", instance="{{ inventory_hostname }}"} > count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}) * 20
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: CPUCritical{%endraw%}'
    # Spelling fixed in the user-facing text ("ussage" -> "usage").
    description: "Very high CPU load or usage"
# Warning tier of the SwapLow alert: more than 50 % of swap has been in use
# for 5 minutes.
SwapWarning:
  alert: SwapLow
  group: nodeexporter
  enabled: true
  # Added: this was the only alert without a `for` clause, so a single
  # evaluation above the threshold fired it immediately. 5m matches
  # SwapCritical and the other threshold alerts in this file.
  for: 5m
  expr: (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"} / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 50
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: SwapWarning{%endraw%}'
    description: "Swap running full"
# Critical tier of the SwapLow alert: more than 80 % of swap has been in use
# for 5 minutes.
SwapCritical:
  alert: SwapLow
  group: nodeexporter
  enabled: true
  for: 5m
  expr: >-
    (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"}
    / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 80
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: SwapCritical{%endraw%}"
    description: 'Swap running full'
# Critical: an md RAID array has had fewer active disks than total disks, a
# failed disk, or an inactive state for 5 minutes.
MDADMRaidCritical:
  group: nodeexporter
  enabled: true
  for: 5m
  # The first clause uses node_md_disks_active (NOTE(review): this looks like
  # the pre-`state`-label metric layout — confirm which exporter versions must
  # be supported). The second clause now matches state="failed", the value
  # node_exporter exports; the previous state="fail" never matched any series.
  expr: |
    node_md_disks{job="node", instance="{{ inventory_hostname }}"} - node_md_disks_active{job="node", instance="{{ inventory_hostname }}"} != 0 or
    node_md_disks{state="failed", job="node", instance="{{ inventory_hostname }}"} > 0 or
    node_md_state{state="inactive", job="node", instance="{{ inventory_hostname }}"} > 0
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: MDADMRaidCritical{%endraw%}'
    description: "Raid is missing a disk"
# Critical: the OOM-kill counter increased within the last 10 minutes, and the
# condition has held for 1 minute.
OOM:
  group: nodeexporter
  enabled: true
  for: 1m
  expr: 'increase(node_vmstat_oom_kill{job="node", instance="{{ inventory_hostname }}"}[10m]) > 0'
  labels:
    severity: critical
  annotations:
    title: "{%raw%}{{ $labels.instance }}: OOM{%endraw%}"
    description: 'OOM killer run'
# Warning tier of the TempHigh alert: a hwmon temperature sensor has read above
# 80 degrees Celsius for 5 minutes.
TempWarning:
  alert: TempHigh
  group: nodeexporter
  enabled: true
  for: 5m
  expr: |
    node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 80
  labels:
    severity: warning
  annotations:
    title: '{%raw%}{{ $labels.instance }}: TempWarning{%endraw%}'
    # Spelling fixed in the user-facing text ("Temperatur raising").
    description: "Temperature rising"
# Critical tier of the TempHigh alert: a hwmon temperature alarm is raised, or
# a sensor has read above 90 degrees Celsius, for 5 minutes.
TempCritical:
  alert: TempHigh
  group: nodeexporter
  enabled: true
  for: 5m
  expr: |
    node_hwmon_temp_alarm{job="node", instance="{{ inventory_hostname }}"} > 0 or
    node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 90
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: TempCritical{%endraw%}'
    # Spelling fixed in the user-facing text ("Temperatur").
    description: "Temperature much too high, shutdown soon pending"
# Critical: at least one systemd unit on this host has been in the "failed"
# state for 5 minutes.
SystemdServices:
  alert: systemd
  group: nodeexporter
  enabled: true
  for: 5m
  # Uses an exact `instance=` match like every other alert in this file. The
  # previous `=~` treated the hostname as a regular expression, so each "." in
  # a host name matched any character.
  expr: |
    node_systemd_units{job="node", instance="{{ inventory_hostname }}", state="failed"} > 0
  labels:
    severity: critical
  annotations:
    title: '{%raw%}{{ $labels.instance }}: Not all systemd services are running{%endraw%}'
    description: "Service not running"
# Warning: the node's current time minus its boot time has exceeded 30 days
# (3600*24*30 seconds) for 5 minutes.
Uptime:
  alert: uptime
  group: nodeexporter
  enabled: true
  for: 5m
  expr: |
    node_time_seconds{job="node", instance="{{ inventory_hostname }}"} -
    node_boot_time_seconds{job="node", instance="{{ inventory_hostname }}"} > 3600*24*30
  labels:
    severity: warning
  annotations:
    title: "{%raw%}{{ $labels.instance }}: Uptime{%endraw%}"
    description: 'Uptime is more than 30 days, please reboot soon'
# Warning: the summed count of pending or held apt upgrades has been above
# zero for 25 hours.
# NOTE(review): `for` is 25h while the description says "more than 24 hours" —
# confirm which of the two is intended.
AptUpgradesPending:
  alert: UpgradesPending
  group: nodeexporter
  enabled: true
  for: 25h
  expr: |
    sum(apt_upgrades_pending{job="node", instance="{{ inventory_hostname }}"}) > 0 or
    sum(apt_upgrades_held{job="node", instance="{{ inventory_hostname }}"}) > 0
  labels:
    severity: warning
    # Set explicitly; presumably needed because sum() drops the instance label
    # that the title below reads — confirm.
    instance: '{{ inventory_hostname }}'
  annotations:
    title: "{%raw%}{{ $labels.instance }}: AptUpgradesPending{%endraw%}"
    description: 'Upgrades pending for more than 24 hours'