From 7997ebfb593f7bef89d49488123c16031c63de79 Mon Sep 17 00:00:00 2001
From: nd
Date: Tue, 2 Jun 2020 17:34:03 +0200
Subject: [PATCH] added default alerts for nodes

---
 defaults/main.yml        | 192 ++++++++++++++++++++++++++++++++++++++-
 tasks/main.yml           |   8 +-
 templates/node-alerts.j2 |  10 ++
 3 files changed, 208 insertions(+), 2 deletions(-)
 create mode 100644 templates/node-alerts.j2

diff --git a/defaults/main.yml b/defaults/main.yml
index bf67ebe..d469ff7 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -1,4 +1,194 @@
 prometheus_agent:
+  alerts:
+    NodeDown:
+      group: nodeexporter
+      enabled: True
+      expr: 'up{job="node", instance="{{ inventory_hostname }}"} == 0 or absent(up{job="node", instance="{{ inventory_hostname }}"})'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: down{%endraw%}'
+        description: '{%raw%}{{ $labels.instance }} is down{%endraw%}'
+    MemoryWarning:
+      group: nodeexporter
+      alert: LowMem
+      enabled: True
+      expr: |
+        (node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"} -
+        (node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"} + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}))
+        / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 80
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: MemoryWarning{%endraw%}'
+        description: "Node memory is filling up"
+    MemoryCritical:
+      group: nodeexporter
+      alert: LowMem
+      enabled: True
+      expr: |
+        (node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} - node_memory_MemFree_bytes{instance="{{ inventory_hostname }}",job="node"} -
+        (node_memory_Cached_bytes{instance="{{ inventory_hostname }}",job="node"} + node_memory_Buffers_bytes{instance="{{ inventory_hostname }}",job="node"}))
+        / node_memory_MemTotal_bytes{instance="{{ inventory_hostname }}",job="node"} * 100 > 90
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: MemoryCritical{%endraw%}'
+        description: "Node memory is almost full"
+    DiscWarning:
+      group: nodeexporter
+      alert: LowDisc
+      enabled: True
+      expr: (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100) / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 10
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: DiscWarning{%endraw%}'
+        description: "Node disk is filling up"
+    DiscCritical:
+      group: nodeexporter
+      alert: LowDisc
+      enabled: True
+      expr: (node_filesystem_avail_bytes{job="node", instance="{{ inventory_hostname }}"} * 100) / node_filesystem_size_bytes{job="node", instance="{{ inventory_hostname }}"} < 5
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: DiscCritical{%endraw%}'
+        description: "Node disk is almost full"
+    DiscFillingWarning:
+      group: nodeexporter
+      alert: DiscFilling
+      enabled: True
+      expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[1h], 12 * 3600) < 0
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: DiscFillingWarning{%endraw%}'
+        description: "Node disk is predicted to fill up within 12 hours"
+    DiscFillingCritical:
+      group: nodeexporter
+      alert: DiscFilling
+      enabled: True
+      expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs", job="node", instance="{{ inventory_hostname }}"}[1h], 1 * 3600) < 0
instance="{{ inventory_hostname }}"}[1h], 1 * 3600) < 0 + for: 5m + labels: + severity: critical + annotations: + title: '{%raw%}{{ $labels.instance }}: DiscFillingCritical{%endraw%}' + description: "Node disc is filling fast" + DiscInodesWarning: + group: nodeexporter + enabled: True + alert: DiscInodes + expr: node_filesystem_files_free{job="node", instance="{{ inventory_hostname }}"} / node_filesystem_files{job="node", instance="{{ inventory_hostname }}"} * 100 < 15 + for: 5m + labels: + severity: warning + annotations: + title: '{%raw%}{{ $labels.instance }}: DiscInodesWarning{%endraw%}' + description: "Disk is almost running out of available inodes" + CPUWarning: + group: nodeexporter + alert: CPULoad + enabled: True + expr: | + 100 - avg(irate(node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}[5m])) * 100 > 80 or + node_load5{job="node", instance="{{ inventory_hostname }}"} > count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}) + for: 5m + labels: + severity: warning + annotations: + title: '{%raw%}{{ $labels.instance }}: CPUWarning{%endraw%}' + description: "High CPU load or ussage" + CPUCritical: + group: nodeexporter + alert: CPULoad + enabled: True + expr: | + 100 - avg(irate(node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}[5m])) * 100 > 80 or + node_load5{job="node", instance="{{ inventory_hostname }}"} > count without(cpu, mode) (node_cpu_seconds_total{mode="idle", job="node", instance="{{ inventory_hostname }}"}) + for: 5m + labels: + severity: critical + annotations: + title: '{%raw%}{{ $labels.instance }}: CPUCritical{%endraw%}' + description: "Very high CPU load or ussage" + SwapWarning: + group: nodeexporter + enabled: True + alert: SwapLow + expr: (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"} / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 50 + labels: + severity: warning + annotations: + title: '{%raw%}{{ $labels.instance }}: SwapWarning{%endraw%}' + description: "Swap running full" + SwapCritical: + group: nodeexporter + enabled: True + alert: SwapLow + expr: (1 - (node_memory_SwapFree_bytes{job="node", instance="{{ inventory_hostname }}"} / node_memory_SwapTotal_bytes{job="node", instance="{{ inventory_hostname }}"})) * 100 > 80 + for: 5m + labels: + severity: critical + annotations: + title: '{%raw%}{{ $labels.instance }}: SwapCritical{%endraw%}' + description: "Swap running full" + MDADMRaidCritical: + group: nodeexporter + enabled: True + expr: | + node_md_disks{job="node", instance="{{ inventory_hostname }}"} - node_md_disks_active{job="node", instance="{{ inventory_hostname }}"} != 0 or + node_md_disks{state="fail", job="node", instance="{{ inventory_hostname }}"} > 0 or + node_md_state{state="inactive", job="node", instance="{{ inventory_hostname }}"} > 0 + for: 5m + labels: + severity: critical + annotations: + title: '{%raw%}{{ $labels.instance }}: MDADMRaidCritical{%endraw%}' + description: "Raid is missing a disk" + OOM: + group: nodeexporter + enabled: True + expr: increase(node_vmstat_oom_kill{job="node", instance="{{ inventory_hostname }}"}[10m]) > 0 + for: 1m + labels: + severity: critical + annotations: + title: '{%raw%}{{ $labels.instance }}: OOM{%endraw%}' + description: "OOM killer run" + TempWarning: + group: nodeexporter + enabled: True + alert: TempHigh + expr: | + node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 80 + 
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: TempWarning{%endraw%}'
+        description: "Temperature too high"
+    TempCritical:
+      group: nodeexporter
+      enabled: True
+      alert: TempHigh
+      expr: |
+        node_hwmon_temp_alarm{job="node", instance="{{ inventory_hostname }}"} > 0 or
+        node_hwmon_temp_celsius{job="node", instance="{{ inventory_hostname }}"} > 90
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        title: '{%raw%}{{ $labels.instance }}: TempCritical{%endraw%}'
+        description: "Temperature critically high"
+
   tls:
     mode: stunnel
     manage: yes
@@ -12,5 +202,5 @@ prometheus_agent:
   scrape_timeout: ~
   scrape_interval: ~
   scrapers: {}
-  ansible_groups_as_labels: False
+  ansible_groups_as_labels: True
   labels: {}
diff --git a/tasks/main.yml b/tasks/main.yml
index 1c01612..2bd81e8 100644
--- a/tasks/main.yml
+++ b/tasks/main.yml
@@ -62,7 +62,7 @@
     dest: "/etc/prometheus/targetcerts/{{ inventory_hostname }}.crt"
 
 - set_fact:
-    labels_ansible_groups: '{ {% for g in group_names %}"ansible_group_{{g}}": True{% if not loop.last %}, {% endif %}{% endfor %} }'
+    labels_ansible_groups: '{ {% for g in group_names %}"ansible_group_{{g}}": 1{% if not loop.last %}, {% endif %}{% endfor %} }'
 - set_fact:
     merged_prometheus_labels: "{{ {}|combine((labels_ansible_groups if prometheus_agent.ansible_groups_as_labels else {}), prometheus_agent.labels) }}"
 - name: setup scraper
@@ -71,3 +71,9 @@
   template:
     src: node-scraper.j2
     dest: /etc/prometheus/conf.d/scrape_configs/agent_{{ inventory_hostname }}.conf
+- name: setup alerts
+  loop: "{{ prometheus_agent.scrapers.keys()|list }}"
+  delegate_to: "{{ item }}"
+  template:
+    src: node-alerts.j2
+    dest: /etc/prometheus/conf.d/rule_files/agent_{{ inventory_hostname }}.conf
diff --git a/templates/node-alerts.j2 b/templates/node-alerts.j2
new file mode 100644
index 0000000..c38d15b
--- /dev/null
+++ b/templates/node-alerts.j2
@@ -0,0 +1,10 @@
+groups:
+{% for groupname, alerts in prometheus_agent.alerts|dict2items|groupby("value.group") %}
+- name: "{{inventory_hostname}}: {{ groupname }}"
+  rules:
+{% for alertdict in alerts if alertdict.value.enabled %}
+  - alert: {{ alertdict.value.alert|d(alertdict.key) }}
+{% set alert=alertdict.value|dict2items|rejectattr('key', 'in', ["enabled", "group", "alert"])|list|items2dict %}
+{{ alert|to_nice_yaml(indent=2, width=9999)|indent(width=4, first=True) }}
+{% endfor %}
+{% endfor %}
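
For reference, the node-alerts.j2 template should render each host's enabled
alerts into a standard Prometheus rule file on the scraper host. A rough
sketch of the expected output for the NodeDown alert (the hostname "web01" is
illustrative; to_nice_yaml sorts the alert's keys alphabetically):

  groups:
  - name: "web01: nodeexporter"
    rules:
    - alert: NodeDown
      annotations:
        description: '{{ $labels.instance }} is down'
        title: '{{ $labels.instance }}: down'
      expr: up{job="node", instance="web01"} == 0 or absent(up{job="node", instance="web01"})
      for: 10m
      labels:
        severity: critical

The {%raw%} guards in the defaults keep the $labels templating literal, so it
is expanded by Alertmanager rather than by Ansible. A rendered rule file can
be sanity-checked with "promtool check rules <file>".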