From 57e6238d728841da7063e9f77d0e5630742e7d10 Mon Sep 17 00:00:00 2001
From: nd <git@notandy.de>
Date: Sat, 4 Dec 2021 15:08:20 +0100
Subject: [PATCH] add uptime alert and move agent defaults above alerts

---
 defaults/main.yml | 112 +++++++++++++++++++++++++---------------------
 1 file changed, 62 insertions(+), 50 deletions(-)

diff --git a/defaults/main.yml b/defaults/main.yml
index 9e356d1..a66840a 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -1,4 +1,53 @@
 prometheus_agent:
+  tls:  # TLS settings (default mode: stunnel)
+    mode: stunnel
+    manage: true
+    pki: null
+  agents:  # per-agent defaults
+    nodeexporter:
+      args:
+        'web.listen-address': '[::1]:9100'  # IPv6 loopback only
+    proxy:
+      mappings: {}
+    blackbox:
+      enable: false
+      args:
+        'web.listen-address': '[::1]:9115'  # IPv6 loopback only
+        'config.file': '/etc/prometheus/blackbox.yml'
+      config:
+        modules:  # every module below pins preferred_ip_protocol to ip4
+          http_2xx:
+            prober: http
+            http:
+              preferred_ip_protocol: ip4
+          http_post_2xx:
+            prober: http
+            http:
+              method: POST
+              preferred_ip_protocol: ip4
+          tcp_connect:
+            prober: tcp
+            tcp:
+              preferred_ip_protocol: ip4
+          ssh_banner:
+            prober: tcp
+            tcp:
+              preferred_ip_protocol: ip4
+              query_response:
+                - expect: '^SSH-2.0-'
+          icmp:
+            icmp:
+              preferred_ip_protocol: ip4
+            prober: icmp
+      jobs: {}
+    promtail:
+      enable: false
+  scrape_timeout: null
+  scrape_interval: null
+  metrics_path: null
+  scrapers: {}
+  ansible_groups_as_labels: true
+  labels: {}
   alerts:
     NodeDown:
       group: nodeexporter
@@ -198,53 +247,16 @@ prometheus_agent:
       annotations:
         title: '{%raw%}{{ $labels.instance }}: Not all systemd services are running{%endraw%}'
         description: "Service not running"
-
-  tls:
-    mode: stunnel
-    manage: yes
-    pki: ~
-  agents:
-    nodeexporter:
-      args:
-        'web.listen-address': "[::1]:9100"
-    proxy:
-      mappings: {}
-    blackbox:
-      enable: False
-      args:
-        "web.listen-address": "[::1]:9115"
-        "config.file": "/etc/prometheus/blackbox.yml"
-      config:
-        modules:
-          http_2xx:
-            prober: http
-            http:
-              preferred_ip_protocol: ip4
-          http_post_2xx:
-            prober: http
-            http:
-              method: POST
-              preferred_ip_protocol: ip4
-          tcp_connect:
-            prober: tcp
-            tcp:
-              preferred_ip_protocol: ip4
-          ssh_banner:
-            prober: tcp
-            tcp:
-              preferred_ip_protocol: ip4
-              query_response:
-              - expect: "^SSH-2.0-"
-          icmp:
-            icmp:
-              preferred_ip_protocol: ip4
-            prober: icmp
-      jobs: {}
-    promtail:
-      enable: False
-  scrape_timeout: ~
-  scrape_interval: ~
-  metrics_path: ~
-  scrapers: {}
-  ansible_groups_as_labels: True
-  labels: {}
+    Uptime:  # fires once a host has been up for more than 120 days
+      group: nodeexporter
+      enabled: True
+      alert: uptime
+      expr: |  # `by (instance)` keeps the label that the title's $labels.instance reads
+        sum by (instance) (node_time_seconds{job="node", instance="{{ inventory_hostname }}"}) -
+        sum by (instance) (node_boot_time_seconds{job="node", instance="{{ inventory_hostname }}"}) > 3600*24*120
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: Uptime{%endraw%}'
+        description: "Uptime is more than 120 days, please reboot soon"