From 0f748dd02acb946eae778540334493fa0c06bb41 Mon Sep 17 00:00:00 2001
From: nd
Date: Tue, 2 Jun 2020 17:35:56 +0200
Subject: [PATCH] Initial commit

---
 defaults/main.yml                    | 49 +++++++++++++++++++++
 handlers/main.yml                    |  4 ++
 tasks/main.yml                       | 18 ++++++++
 templates/prometheus-alertmanager.j2 | 65 ++++++++++++++++++++++++++++
 4 files changed, 136 insertions(+)
 create mode 100644 defaults/main.yml
 create mode 100644 handlers/main.yml
 create mode 100644 tasks/main.yml
 create mode 100644 templates/prometheus-alertmanager.j2

diff --git a/defaults/main.yml b/defaults/main.yml
new file mode 100644
index 0000000..d2c306d
--- /dev/null
+++ b/defaults/main.yml
@@ -0,0 +1,49 @@
+prometheus_alertmanager:
+  args:
+    "web.listen-address": "[::1]:9093"
+  config:
+    global:
+      # SMTP sender for mail notifications. NOTE(review): smtp_smarthost is not set here.
+      smtp_from: 'alertmanager@{{ inventory_hostname }}'
+    templates:
+      - '/etc/prometheus/alertmanager_templates/*.tmpl'
+    route:
+      # The labels by which incoming alerts are grouped together. For example,
+      # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+      # be batched into a single group.
+      group_by: ['alertname', 'cluster', 'service']
+      # When a new group of alerts is created by an incoming alert, wait at
+      # least 'group_wait' to send the initial notification.
+      # This ensures that multiple alerts for the same group that start
+      # firing shortly after one another are batched together on the first
+      # notification.
+      group_wait: 30s
+      # When the first notification was sent, wait 'group_interval' to send a batch
+      # of new alerts that started firing for that group.
+      group_interval: 5m
+      # If an alert has successfully been sent, wait 'repeat_interval' to
+      # resend it.
+      repeat_interval: 3h
+      # A default receiver
+      receiver: mail-default
+      # All the above attributes are inherited by all child routes and can
+      # be overwritten on each.
+      # The child route trees.
+      routes: []
+    # Inhibition rules allow muting a set of alerts given that another alert is
+    # firing.
+    # We use this to mute any warning-level notifications if the same alert is
+    # already critical.
+    inhibit_rules:
+      - source_match:
+          severity: 'critical'
+        target_match:
+          severity: 'warning'
+        # Apply inhibition only if alertname, cluster and service all match.
+        equal: ['alertname', 'cluster', 'service']
+    receivers:
+      - name: 'blackhole'
+      - name: 'mail-default'
+        email_configs:
+          - to: 'root@localhost'
+            send_resolved: true
diff --git a/handlers/main.yml b/handlers/main.yml
new file mode 100644
index 0000000..b2d5e92
--- /dev/null
+++ b/handlers/main.yml
@@ -0,0 +1,4 @@
+- name: restart alertmanager
+  service:
+    name: prometheus-alertmanager
+    state: restarted
diff --git a/tasks/main.yml b/tasks/main.yml
new file mode 100644
index 0000000..38d966b
--- /dev/null
+++ b/tasks/main.yml
@@ -0,0 +1,18 @@
+- name: install alertmanager
+  apt:
+    pkg: prometheus-alertmanager
+
+- name: write alertmanager service config
+  notify: restart alertmanager
+  template:
+    src: prometheus-alertmanager.j2
+    dest: /etc/default/prometheus-alertmanager
+
+- name: write alertmanager config
+  notify: restart alertmanager
+  copy:
+    owner: root
+    group: root
+    mode: "0644"
+    dest: /etc/prometheus/alertmanager.yml
+    content: "{{ prometheus_alertmanager.config|to_nice_yaml(indent=2) }}"
diff --git a/templates/prometheus-alertmanager.j2 b/templates/prometheus-alertmanager.j2
new file mode 100644
index 0000000..b7ac709
--- /dev/null
+++ b/templates/prometheus-alertmanager.j2
@@ -0,0 +1,65 @@
+# Set the command-line arguments to pass to the server.
+ARGS="{% for i in prometheus_alertmanager.args %} --{{ i }}{% if prometheus_alertmanager.args[i] and prometheus_alertmanager.args[i] != {} %}='{{ prometheus_alertmanager.args[i] }}'{% endif %} {% endfor %}"
+
+# The alert manager supports the following options:
+
+#  --config.file="/etc/prometheus/alertmanager.yml"
+#    Alertmanager configuration file name.
+#  --storage.path="/var/lib/prometheus/alertmanager/"
+#    Base path for data storage.
+#  --data.retention=120h
+#    How long to keep data for.
+#  --alerts.gc-interval=30m
+#    Interval between alert GC.
+#  --log.level=info
+#    Only log messages with the given severity or above.
+#  --web.external-url=WEB.EXTERNAL-URL
+#    The URL under which Alertmanager is externally reachable (for example,
+#    if Alertmanager is served via a reverse proxy). Used for generating
+#    relative and absolute links back to Alertmanager itself. If the URL has
+#    a path portion, it will be used to prefix all HTTP endpoints served by
+#    Alertmanager. If omitted, relevant URL components will be derived
+#    automatically.
+#  --web.route-prefix=WEB.ROUTE-PREFIX
+#    Prefix for the internal routes of web endpoints. Defaults to path of
+#    --web.external-url.
+#  --web.listen-address=":9093"
+#    Address to listen on for the web interface and API.
+#  --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
+#    Path to static UI directory.
+#  --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
+#    Path to default notification template.
+#  --cluster.listen-address="0.0.0.0:9094"
+#    Listen address for cluster.
+#  --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
+#    Explicit address to advertise in cluster.
+#  --cluster.peer=CLUSTER.PEER ...
+#    Initial peers (may be repeated).
+#  --cluster.peer-timeout=15s
+#    Time to wait between peers to send notifications.
+#  --cluster.gossip-interval=200ms
+#    Interval between sending gossip messages. By lowering this value (more
+#    frequent) gossip messages are propagated across the cluster more
+#    quickly at the expense of increased bandwidth.
+#  --cluster.pushpull-interval=1m0s
+#    Interval for gossip state syncs. Setting this interval lower (more
+#    frequent) will increase convergence speeds across larger clusters at
+#    the expense of increased bandwidth usage.
+#  --cluster.tcp-timeout=10s
+#    Timeout for establishing a stream connection with a remote node for a
+#    full state sync, and for stream read and write operations.
+#  --cluster.probe-timeout=500ms
+#    Timeout to wait for an ack from a probed node before assuming it is
+#    unhealthy. This should be set to 99-percentile of RTT (round-trip time)
+#    on your network.
+#  --cluster.probe-interval=1s
+#    Interval between random node probes. Setting this lower (more frequent)
+#    will cause the cluster to detect failed nodes more quickly at the
+#    expense of increased bandwidth usage.
+#  --cluster.settle-timeout=1m0s
+#    Maximum time to wait for cluster connections to settle before
+#    evaluating notifications.
+#  --cluster.reconnect-interval=10s
+#    Interval between attempting to reconnect to lost peers.
+#  --cluster.reconnect-timeout=6h0m0s
+#    Length of time to attempt to reconnect to a lost peer.