add uptime alert
This commit is contained in:
parent
3e3f45958c
commit
57e6238d72
1 changed file with 62 additions and 50 deletions
|
|
@@ -1,4 +1,53 @@
|
|||
prometheus_agent:
|
||||
tls:
|
||||
mode: stunnel
|
||||
manage: yes
|
||||
pki: ~
|
||||
agents:
|
||||
nodeexporter:
|
||||
args:
|
||||
'web.listen-address': "[::1]:9100"
|
||||
proxy:
|
||||
mappings: {}
|
||||
blackbox:
|
||||
enable: False
|
||||
args:
|
||||
"web.listen-address": "[::1]:9115"
|
||||
"config.file": "/etc/prometheus/blackbox.yml"
|
||||
config:
|
||||
modules:
|
||||
http_2xx:
|
||||
prober: http
|
||||
http:
|
||||
preferred_ip_protocol: ip4
|
||||
http_post_2xx:
|
||||
prober: http
|
||||
http:
|
||||
method: POST
|
||||
preferred_ip_protocol: ip4
|
||||
tcp_connect:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: ip4
|
||||
ssh_banner:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: ip4
|
||||
query_response:
|
||||
- expect: "^SSH-2.0-"
|
||||
icmp:
|
||||
icmp:
|
||||
preferred_ip_protocol: ip4
|
||||
prober: icmp
|
||||
jobs: {}
|
||||
promtail:
|
||||
enable: False
|
||||
scrape_timeout: ~
|
||||
scrape_interval: ~
|
||||
metrics_path: ~
|
||||
scrapers: {}
|
||||
ansible_groups_as_labels: True
|
||||
labels: {}
|
||||
alerts:
|
||||
NodeDown:
|
||||
group: nodeexporter
|
||||
|
|
@@ -198,53 +247,16 @@ prometheus_agent:
|
|||
annotations:
|
||||
title: '{%raw%}{{ $labels.instance }}: Not all systemd services are running{%endraw%}'
|
||||
description: "Service not running"
|
||||
|
||||
tls:
|
||||
mode: stunnel
|
||||
manage: yes
|
||||
pki: ~
|
||||
agents:
|
||||
nodeexporter:
|
||||
args:
|
||||
'web.listen-address': "[::1]:9100"
|
||||
proxy:
|
||||
mappings: {}
|
||||
blackbox:
|
||||
enable: False
|
||||
args:
|
||||
"web.listen-address": "[::1]:9115"
|
||||
"config.file": "/etc/prometheus/blackbox.yml"
|
||||
config:
|
||||
modules:
|
||||
http_2xx:
|
||||
prober: http
|
||||
http:
|
||||
preferred_ip_protocol: ip4
|
||||
http_post_2xx:
|
||||
prober: http
|
||||
http:
|
||||
method: POST
|
||||
preferred_ip_protocol: ip4
|
||||
tcp_connect:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: ip4
|
||||
ssh_banner:
|
||||
prober: tcp
|
||||
tcp:
|
||||
preferred_ip_protocol: ip4
|
||||
query_response:
|
||||
- expect: "^SSH-2.0-"
|
||||
icmp:
|
||||
icmp:
|
||||
preferred_ip_protocol: ip4
|
||||
prober: icmp
|
||||
jobs: {}
|
||||
promtail:
|
||||
enable: False
|
||||
scrape_timeout: ~
|
||||
scrape_interval: ~
|
||||
metrics_path: ~
|
||||
scrapers: {}
|
||||
ansible_groups_as_labels: True
|
||||
labels: {}
|
||||
Uptime:
|
||||
group: nodeexporter
|
||||
enabled: True
|
||||
alert: uptime
|
||||
expr: |
|
||||
sum(node_time_seconds{job="node", instance="{{ inventory_hostname }}"}) -
|
||||
sum(node_boot_time_seconds{job="node", instance="{{ inventory_hostname }}"}) > 3600*24*120
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
title: '{%raw%}{{ $labels.instance }}: Uptime{%endraw%}'
|
||||
description: "Uptime is more than 120 days, please reboot soon"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue