Add uptime alert (fires when a host has been up for more than 120 days)

This commit is contained in:
nd 2021-12-04 15:08:20 +01:00
parent 3e3f45958c
commit 57e6238d72
No known key found for this signature in database
GPG key ID: 21B5CD4DEE3670E9

View file

@ -1,4 +1,53 @@
prometheus_agent:
# TLS settings; `mode` selects stunnel.
# NOTE(review): what `manage` toggles and how a null `pki` is handled are not
# visible here — confirm against the role's tasks.
tls:
  mode: stunnel
  manage: true
  pki: null
agents:
# node exporter agent: one listen-address argument and an empty proxy
# mapping table.
nodeexporter:
  args:
    web.listen-address: '[::1]:9100'
  proxy:
    mappings: {}
# blackbox exporter agent; `enable` is false by default.
# NOTE(review): nesting is reconstructed from the blackbox_exporter module
# schema (module -> prober + per-prober settings); confirm that `jobs` sits
# directly under `blackbox`.
blackbox:
  enable: false
  args:
    "web.listen-address": "[::1]:9115"
    "config.file": "/etc/prometheus/blackbox.yml"
  config:
    modules:
      # Every module pins probing to IPv4.
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: ip4
      http_post_2xx:
        prober: http
        http:
          method: POST
          preferred_ip_protocol: ip4
      tcp_connect:
        prober: tcp
        tcp:
          preferred_ip_protocol: ip4
      ssh_banner:
        prober: tcp
        tcp:
          preferred_ip_protocol: ip4
          # Expect the peer's banner to begin with the SSH 2.0 identifier.
          query_response:
            - expect: "^SSH-2.0-"
      icmp:
        prober: icmp
        icmp:
          preferred_ip_protocol: ip4
  jobs: {}
# NOTE(review): indentation is lost in this view, so which of the keys after
# `enable` nest under `promtail` cannot be told from here; the lines stay
# flush until that is confirmed. Only the scalar spellings are normalised
# (true/false/null instead of True/False/~).
promtail:
enable: false
scrape_timeout: null
scrape_interval: null
metrics_path: null
scrapers: {}
ansible_groups_as_labels: true
labels: {}
alerts:
NodeDown:
group: nodeexporter
@ -198,53 +247,16 @@ prometheus_agent:
annotations:
title: '{%raw%}{{ $labels.instance }}: Not all systemd services are running{%endraw%}'
description: "Service not running"
# TLS settings; `mode` selects stunnel.
# NOTE(review): what `manage` toggles and how a null `pki` is handled are not
# visible here — confirm against the role's tasks.
tls:
  mode: stunnel
  manage: true
  pki: null
agents:
# node exporter agent: one listen-address argument and an empty proxy
# mapping table.
nodeexporter:
  args:
    web.listen-address: '[::1]:9100'
  proxy:
    mappings: {}
# blackbox exporter agent; `enable` is false by default.
# NOTE(review): nesting is reconstructed from the blackbox_exporter module
# schema (module -> prober + per-prober settings); confirm that `jobs` sits
# directly under `blackbox`.
blackbox:
  enable: false
  args:
    "web.listen-address": "[::1]:9115"
    "config.file": "/etc/prometheus/blackbox.yml"
  config:
    modules:
      # Every module pins probing to IPv4.
      http_2xx:
        prober: http
        http:
          preferred_ip_protocol: ip4
      http_post_2xx:
        prober: http
        http:
          method: POST
          preferred_ip_protocol: ip4
      tcp_connect:
        prober: tcp
        tcp:
          preferred_ip_protocol: ip4
      ssh_banner:
        prober: tcp
        tcp:
          preferred_ip_protocol: ip4
          # Expect the peer's banner to begin with the SSH 2.0 identifier.
          query_response:
            - expect: "^SSH-2.0-"
      icmp:
        prober: icmp
        icmp:
          preferred_ip_protocol: ip4
  jobs: {}
# NOTE(review): indentation is lost in this view, so which of the keys after
# `enable` nest under `promtail` cannot be told from here; the lines stay
# flush until that is confirmed. Only the scalar spellings are normalised
# (true/false/null instead of True/False/~).
promtail:
enable: false
scrape_timeout: null
scrape_interval: null
metrics_path: null
scrapers: {}
ansible_groups_as_labels: true
labels: {}
# Reboot reminder: fires once a host's uptime (node_time_seconds minus
# node_boot_time_seconds) exceeds 120 days.
Uptime:
  group: nodeexporter
  enabled: true
  alert: uptime
  # `{{ inventory_hostname }}` is deliberately NOT wrapped in raw tags, unlike
  # the title below — presumably it is filled in when the role renders the
  # rule; confirm against the rule template.
  # `sum by (instance)` keeps the instance label on the result. A bare sum()
  # discards every label, which leaves `$labels.instance` in the title empty.
  # 3600*24*120 is 120 days expressed in seconds.
  expr: |
    sum by (instance) (node_time_seconds{job="node", instance="{{ inventory_hostname }}"}) -
    sum by (instance) (node_boot_time_seconds{job="node", instance="{{ inventory_hostname }}"}) > 3600*24*120
  for: 5m
  labels:
    severity: critical
  annotations:
    # The raw tags leave `{{ $labels.instance }}` untouched by Jinja so that
    # it survives into the generated rule.
    title: '{%raw%}{{ $labels.instance }}: Uptime{%endraw%}'
    description: "Uptime is more than 120 days, please reboot soon"