From 57e6238d728841da7063e9f77d0e5630742e7d10 Mon Sep 17 00:00:00 2001
From: nd
Date: Sat, 4 Dec 2021 15:08:20 +0100
Subject: [PATCH] add uptime alert

Move the tls/agents/scrape defaults above the alerts mapping and add an
Uptime alert that fires once a host has gone more than 120 days without
a reboot.

The alert expression aggregates with "sum by (instance)" rather than a
bare sum(): a bare aggregation drops every label, which would leave
$labels.instance empty in the alert title.
---
NOTE(review): this patch was reconstructed from a whitespace-collapsed
copy. The YAML nesting depth of the moved keys is inferred, and the
post-image blob id on the "index" line predates the expr/comment change.
Verify the hunks against defaults/main.yml before applying.

 defaults/main.yml | 115 ++++++++++++++++++++++++++---------------------
 1 file changed, 65 insertions(+), 50 deletions(-)

diff --git a/defaults/main.yml b/defaults/main.yml
index 9e356d1..a66840a 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -1,4 +1,53 @@
 prometheus_agent:
+  tls:
+    mode: stunnel
+    manage: yes
+    pki: ~
+  agents:
+    nodeexporter:
+      args:
+        'web.listen-address': "[::1]:9100"
+    proxy:
+      mappings: {}
+    blackbox:
+      enable: False
+      args:
+        "web.listen-address": "[::1]:9115"
+        "config.file": "/etc/prometheus/blackbox.yml"
+      config:
+        modules:
+          http_2xx:
+            prober: http
+            http:
+              preferred_ip_protocol: ip4
+          http_post_2xx:
+            prober: http
+            http:
+              method: POST
+              preferred_ip_protocol: ip4
+          tcp_connect:
+            prober: tcp
+            tcp:
+              preferred_ip_protocol: ip4
+          ssh_banner:
+            prober: tcp
+            tcp:
+              preferred_ip_protocol: ip4
+              query_response:
+                - expect: "^SSH-2.0-"
+          icmp:
+            icmp:
+              preferred_ip_protocol: ip4
+            prober: icmp
+      jobs: {}
+    promtail:
+      enable: False
+  scrape_timeout: ~
+  scrape_interval: ~
+  metrics_path: ~
+  scrapers: {}
+  ansible_groups_as_labels: True
+  labels: {}
   alerts:
     NodeDown:
       group: nodeexporter
@@ -198,53 +247,19 @@ prometheus_agent:
       annotations:
         title: '{%raw%}{{ $labels.instance }}: Not all systemd services are running{%endraw%}'
         description: "Service not running"
-
-  tls:
-    mode: stunnel
-    manage: yes
-    pki: ~
-  agents:
-    nodeexporter:
-      args:
-        'web.listen-address': "[::1]:9100"
-    proxy:
-      mappings: {}
-    blackbox:
-      enable: False
-      args:
-        "web.listen-address": "[::1]:9115"
-        "config.file": "/etc/prometheus/blackbox.yml"
-      config:
-        modules:
-          http_2xx:
-            prober: http
-            http:
-              preferred_ip_protocol: ip4
-          http_post_2xx:
-            prober: http
-            http:
-              method: POST
-              preferred_ip_protocol: ip4
-          tcp_connect:
-            prober: tcp
-            tcp:
-              preferred_ip_protocol: ip4
-          ssh_banner:
-            prober: tcp
-            tcp:
-              preferred_ip_protocol: ip4
-              query_response:
-                - expect: "^SSH-2.0-"
-          icmp:
-            icmp:
-              preferred_ip_protocol: ip4
-            prober: icmp
-      jobs: {}
-    promtail:
-      enable: False
-  scrape_timeout: ~
-  scrape_interval: ~
-  metrics_path: ~
-  scrapers: {}
-  ansible_groups_as_labels: True
-  labels: {}
+    # Reboot reminder: fires once node_time_seconds - node_boot_time_seconds
+    # has exceeded 120 days for 5 minutes. Aggregating "by (instance)" keeps
+    # the instance label that the title annotation below renders.
+    Uptime:
+      group: nodeexporter
+      enabled: True
+      alert: uptime
+      expr: |
+        sum by (instance) (node_time_seconds{job="node", instance="{{ inventory_hostname }}"}) -
+        sum by (instance) (node_boot_time_seconds{job="node", instance="{{ inventory_hostname }}"}) > 3600*24*120
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: Uptime{%endraw%}'
+        description: "Uptime is more than 120 days, please reboot soon"