Initial commit

2020-06-02 17:35:56 +02:00 · 2020-06-02 17:35:56 +02:00 · 0f748dd02a
commit 0f748dd02a
4 changed files with 136 additions and 0 deletions
--- a/defaults/main.yml
+++ b/defaults/main.yml
@ -0,0 +1,49 @@
 # Role defaults for Alertmanager.
 #   args:   command-line flags, rendered into the ARGS variable of
 #           /etc/default/prometheus-alertmanager (key = flag name without
 #           the leading "--", value = flag value).
 #   config: written as YAML to /etc/prometheus/alertmanager.yml.
 prometheus_alertmanager:
  args:
    # Only listen on the IPv6 loopback address by default.
    'web.listen-address': '[::1]:9093'
  config:
    global:
      # The smarthost and SMTP sender used for mail notifications.
      # Alertmanager refuses a configuration with an email receiver but no
      # smarthost, so default to the local MTA; override as needed.
      smtp_smarthost: 'localhost:25'
      smtp_from: 'alertmanager@{{ inventory_hostname }}'
    templates:
    - '/etc/prometheus/alertmanager_templates/*.tmpl'
    route:
      # The labels by which incoming alerts are grouped together. For example,
      # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
      # be batched into a single group.
      group_by: ['alertname', 'cluster', 'service']
      # When a new group of alerts is created by an incoming alert, wait at
      # least 'group_wait' before sending the initial notification.
      # This ensures that multiple alerts for the same group that start
      # firing shortly after one another are batched together in the first
      # notification.
      group_wait: 30s
      # After the first notification was sent, wait 'group_interval' before
      # sending a batch of new alerts that started firing for that group.
      group_interval: 5m
      # If a notification has already been sent successfully, wait
      # 'repeat_interval' before sending it again.
      repeat_interval: 3h
      # The default receiver.
      receiver: 'mail-default'
      # All the above attributes are inherited by all child routes and can
      # be overwritten on each.
      # The child route trees.
      routes: []
    # Inhibition rules allow muting a set of alerts while another alert is
    # firing.
    # We use this to mute any warning-level notifications if the same alert is
    # already critical.
    inhibit_rules:
    - source_match:
        severity: 'critical'
      target_match:
        severity: 'warning'
      # Apply inhibition only if alertname, cluster and service are the same.
      equal: ['alertname', 'cluster', 'service']
    receivers:
    # A receiver without any notification integration; it is not referenced
    # by the default route above.
    - name: 'blackhole'
    - name: 'mail-default'
      email_configs:
      - to: 'root@localhost'
        # 'send_resolved' is a per-integration option, so it has to live
        # inside the email_configs entry rather than on the receiver.
        send_resolved: true
--- a/handlers/main.yml
+++ b/handlers/main.yml
@ -0,0 +1,4 @@
 # Handler: restarts the prometheus-alertmanager service. Triggered via
 # `notify: restart alertmanager`, so the name below must stay unchanged.
 - name: restart alertmanager
  service:
    state: restarted
    name: prometheus-alertmanager
--- a/tasks/main.yml
+++ b/tasks/main.yml
@ -0,0 +1,18 @@
 # Make sure the prometheus-alertmanager package is installed via apt.
 - name: install alertmanager
  apt:
    name: prometheus-alertmanager
    state: present
 # Render the service defaults file from the prometheus-alertmanager.j2
 # template and restart the service whenever the result changes.
 - name: write alertmanager service config
  notify: restart alertmanager
  template:
    src: prometheus-alertmanager.j2
    dest: /etc/default/prometheus-alertmanager
    # Explicit ownership and permissions, matching the config file task.
    owner: root
    group: root
    mode: "0644"
 # Serialise prometheus_alertmanager.config to YAML, write it to
 # /etc/prometheus/alertmanager.yml and restart the service on change.
 - name: write alertmanager config
  notify: restart alertmanager
  copy:
    dest: /etc/prometheus/alertmanager.yml
    owner: root
    group: root
    # Quoted so the mode is passed as an octal string rather than being
    # re-typed as an integer by the YAML parser.
    mode: "0644"
    content: "{{ prometheus_alertmanager.config | to_nice_yaml(indent=2) }}"
--- a/templates/prometheus-alertmanager.j2
+++ b/templates/prometheus-alertmanager.j2
@ -0,0 +1,65 @@
 # Set the command-line arguments to pass to the server.
 ARGS="{# one " --flag" per key of prometheus_alertmanager.args; "='value'" is appended only for truthy values that are not an empty mapping #}{% for flag, value in prometheus_alertmanager.args.items() %} --{{ flag }}{% if value and value != {} %}='{{ value }}'{% endif %} {% endfor %}"
 # The alert manager supports the following options:
 #  --config.file="/etc/prometheus/alertmanager.yml"
 #       Alertmanager configuration file name.
 #  --storage.path="/var/lib/prometheus/alertmanager/"
 #       Base path for data storage.
 #  --data.retention=120h
 #       How long to keep data for.
 #  --alerts.gc-interval=30m
 #       Interval between alert GC.
 #  --log.level=info
 #       Only log messages with the given severity or above.
 #  --web.external-url=WEB.EXTERNAL-URL
 #       The URL under which Alertmanager is externally reachable (for example,
 #       if Alertmanager is served via a reverse proxy). Used for generating
 #       relative and absolute links back to Alertmanager itself. If the URL has
 #       a path portion, it will be used to prefix all HTTP endpoints served by
 #       Alertmanager. If omitted, relevant URL components will be derived
 #       automatically.
 #  --web.route-prefix=WEB.ROUTE-PREFIX
 #       Prefix for the internal routes of web endpoints. Defaults to path of
 #       --web.external-url.
 #  --web.listen-address=":9093"
 #       Address to listen on for the web interface and API.
 #  --web.ui-path="/usr/share/prometheus/alertmanager/ui/"
 #       Path to static UI directory.
 #  --template.default="/usr/share/prometheus/alertmanager/default.tmpl"
 #       Path to default notification template.
 #  --cluster.listen-address="0.0.0.0:9094"
 #       Listen address for cluster.
 #  --cluster.advertise-address=CLUSTER.ADVERTISE-ADDRESS
 #       Explicit address to advertise in cluster.
 #  --cluster.peer=CLUSTER.PEER ...
 #       Initial peers (may be repeated).
 #  --cluster.peer-timeout=15s
 #       Time to wait between peers to send notifications.
 #  --cluster.gossip-interval=200ms
 #       Interval between sending gossip messages. By lowering this value (more
 #       frequent) gossip messages are propagated across the cluster more
 #       quickly at the expense of increased bandwidth.
 #  --cluster.pushpull-interval=1m0s
 #       Interval for gossip state syncs. Setting this interval lower (more
 #       frequent) will increase convergence speeds across larger clusters at
 #       the expense of increased bandwidth usage.
 #  --cluster.tcp-timeout=10s
 #       Timeout for establishing a stream connection with a remote node for a
 #       full state sync, and for stream read and write operations.
 #  --cluster.probe-timeout=500ms
 #       Timeout to wait for an ack from a probed node before assuming it is
 #       unhealthy. This should be set to 99-percentile of RTT (round-trip time)
 #       on your network.
 #  --cluster.probe-interval=1s
 #       Interval between random node probes. Setting this lower (more frequent)
 #       will cause the cluster to detect failed nodes more quickly at the
 #       expense of increased bandwidth usage.
 #  --cluster.settle-timeout=1m0s
 #       Maximum time to wait for cluster connections to settle before
 #       evaluating notifications.
 #  --cluster.reconnect-interval=10s
 #       Interval between attempting to reconnect to lost peers.
 #  --cluster.reconnect-timeout=6h0m0s
 #       Length of time to attempt to reconnect to a lost peer.