Alerting: Support Unified Alerting with Grafana HA (#37920)

* Alerting: Support Unified Alerting in Grafana's HA mode.
gotjosh 4 years ago committed by GitHub
parent 92209f1011
commit 7db97097c9
  1. conf/defaults.ini (34 lines changed)
  2. conf/sample.ini (33 lines changed)
  3. devenv/docker/ha-test-unified-alerting/.gitignore (1 line changed)
  4. devenv/docker/ha-test-unified-alerting/README.md (66 lines changed)
  5. devenv/docker/ha-test-unified-alerting/docker-compose.yaml (90 lines changed)
  6. devenv/docker/ha-test-unified-alerting/grafana/provisioning/alerts.jsonnet (203 lines changed)
  7. devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/alerts/overview.json (172 lines changed)
  8. devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/dashboards.yaml (14 lines changed)
  9. devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/mysql/overview.json (5397 lines changed)
  10. devenv/docker/ha-test-unified-alerting/grafana/provisioning/datasources/datasources.yaml (16 lines changed)
  11. devenv/docker/ha-test-unified-alerting/prometheus/prometheus.yml (47 lines changed)
  12. docs/sources/administration/configuration.md (46 lines changed)
  13. pkg/services/ngalert/metrics/ngalert.go (2 lines changed)
  14. pkg/services/ngalert/ngalert.go (8 lines changed)
  15. pkg/services/ngalert/notifier/alertmanager.go (44 lines changed)
  16. pkg/services/ngalert/notifier/alertmanager_test.go (2 lines changed)
  17. pkg/services/ngalert/notifier/multiorg_alertmanager.go (78 lines changed)
  18. pkg/services/ngalert/notifier/multiorg_alertmanager_test.go (22 lines changed)
  19. pkg/services/ngalert/schedule/schedule_unit_test.go (4 lines changed)
  20. pkg/setting/setting.go (28 lines changed)
  21. pkg/setting/setting_unified_alerting.go (57 lines changed)
  22. pkg/setting/setting_unified_alerting_test.go (39 lines changed)
  23. pkg/tests/api/alerting/api_admin_configuration_test.go (6 lines changed)
  24. pkg/tests/api/alerting/api_alertmanager_configuration_test.go (14 lines changed)
  25. pkg/tests/testinfra/testinfra.go (33 lines changed)

@ -211,7 +211,7 @@ rudderstack_data_plane_url =
# Application Insights connection string. Specify a URL string to enable this feature.
application_insights_connection_string =
# Optional. Specifies an Application Insights endpoint URL where the endpoint string is wrapped in backticks ``.
application_insights_endpoint_url =
#################################### Security ############################
@ -732,7 +732,37 @@ global_alert_rule = -1
#################################### Unified Alerting ####################
[unified_alerting]
# Specify the frequency of polling for admin config changes.
admin_config_poll_interval_seconds = 60
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
admin_config_poll_interval = 60s
# Specify the frequency of polling for Alertmanager config changes.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
alertmanager_config_poll_interval = 60s
# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port.
ha_listen_address = "0.0.0.0:9094"
# Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP.
ha_advertise_address = ""
# Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting.
ha_peers = ""
# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will
# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long
# each instance should wait before sending the notification, to account for replication lag.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_peer_timeout = 15s
# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated
# across the cluster more quickly at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_gossip_interval = 200ms
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_push_pull_interval = 60s
#################################### Alerting ############################
[alerting]

@ -709,7 +709,38 @@
#################################### Unified Alerting ####################
[unified_alerting]
# Specify the frequency of polling for admin config changes.
;admin_config_poll_interval_seconds = 60
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;admin_config_poll_interval = 60s
# Specify the frequency of polling for Alertmanager config changes.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;alertmanager_config_poll_interval = 60s
# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. The default value is `0.0.0.0:9094`.
;ha_listen_address = "0.0.0.0:9094"
# Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP.
;ha_advertise_address = ""
# Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting.
;ha_peers = ""
# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will
# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long
# each instance should wait before sending the notification, to account for replication lag.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_peer_timeout = "15s"
# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated
# across the cluster more quickly at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_gossip_interval = "200ms"
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_push_pull_interval = "60s"
#################################### Alerting ############################
[alerting]

@ -0,0 +1 @@
grafana/provisioning/dashboards/alerts/alert-*

@ -0,0 +1,66 @@
# Grafana Unified Alerting High Availability (HA) test setup
A set of Docker Compose services that together create a Grafana HA test setup for unified alerting.
Included services:
- Grafana
- MySQL - Grafana configuration database and session storage, plus an exporter for metrics
- Prometheus - Monitors Grafana and is used as a data source
- Nginx - Reverse proxy for Grafana and Prometheus, enabling access to the Grafana/Prometheus UI via a hostname
## Prerequisites
### Build Grafana Docker container
Build a Grafana Docker container from the current branch and commit, and tag it as grafana/grafana:dev.
```bash
$ cd <grafana repo>
$ make build-docker-full
```
### Virtual host names
#### Alternative 1 - Use dnsmasq
```bash
$ sudo apt-get install dnsmasq
$ echo 'address=/loc/127.0.0.1' | sudo tee /etc/dnsmasq.d/dnsmasq-loc.conf > /dev/null
$ sudo /etc/init.d/dnsmasq restart
$ ping whatever.loc
PING whatever.loc (127.0.0.1) 56(84) bytes of data.
64 bytes from localhost (127.0.0.1): icmp_seq=1 ttl=64 time=0.076 ms
--- whatever.loc ping statistics ---
1 packet transmitted, 1 received, 0% packet loss, time 1998ms
```
#### Alternative 2 - Manually update /etc/hosts
Update your `/etc/hosts` to be able to access Grafana and/or Prometheus UI using a hostname.
```bash
$ cat /etc/hosts
127.0.0.1 grafana.loc
127.0.0.1 prometheus.loc
```
## Start services
```bash
$ docker-compose up -d
```
Browse
- http://grafana.loc/
- http://prometheus.loc/
## Test alerting
### Create contact points
TBD
### Create alerts
TBD
### Create silences
TBD

@ -0,0 +1,90 @@
version: "2.1"
services:
db:
image: mysql:5.6
platform: linux/x86_64
environment:
MYSQL_ROOT_PASSWORD: rootpass
MYSQL_DATABASE: grafana
MYSQL_USER: grafana
MYSQL_PASSWORD: password
command: [mysqld, --character-set-server=utf8mb4, --collation-server=utf8mb4_unicode_ci, --innodb_monitor_enable=all, --max-connections=1001]
ports:
- 3306
healthcheck:
test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"]
timeout: 10s
retries: 10
mysqld-exporter:
image: prom/mysqld-exporter
environment:
- DATA_SOURCE_NAME=root:rootpass@(db:3306)/
ports:
- 9104
depends_on:
db:
condition: service_healthy
prometheus:
image: prom/prometheus:v2.4.2
volumes:
- ./prometheus/:/etc/prometheus/
environment:
- VIRTUAL_HOST=prometheus.loc
ports:
- 909
nginx-proxy:
image: jwilder/nginx-proxy
ports:
- "80:80"
volumes:
- /var/run/docker.sock:/tmp/docker.sock:ro
depends_on:
db:
condition: service_healthy
grafana1:
image: grafana/grafana:dev
volumes:
- ./grafana/provisioning/:/etc/grafana/provisioning/
environment:
- VIRTUAL_HOST=grafana.loc
- GF_FEATURE_TOGGLES_ENABLE=ngalert
- GF_UNIFIED_ALERTING_HA_PEERS=ha-test-unified-alerting_grafana2_1:9094,ha-test-unified-alerting_grafana1_1:9094
- GF_SERVER_ROOT_URL=http://grafana.loc
- GF_DATABASE_NAME=grafana
- GF_DATABASE_USER=grafana
- GF_DATABASE_PASSWORD=password
- GF_DATABASE_TYPE=mysql
- GF_DATABASE_HOST=db:3306
- GF_DATABASE_MAX_OPEN_CONN=300
- GF_SESSION_PROVIDER=mysql
- GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(db:3306)/grafana?allowNativePasswords=true
ports:
- 3010:3000
depends_on:
db:
condition: service_healthy
grafana2:
image: grafana/grafana:dev
volumes:
- ./grafana/provisioning/:/etc/grafana/provisioning/
environment:
- VIRTUAL_HOST=grafana.loc
- GF_FEATURE_TOGGLES_ENABLE=ngalert
- GF_UNIFIED_ALERTING_HA_PEERS=ha-test-unified-alerting_grafana2_1:9094,ha-test-unified-alerting_grafana1_1:9094
- GF_SERVER_ROOT_URL=http://grafana.loc
- GF_DATABASE_NAME=grafana
- GF_DATABASE_USER=grafana
- GF_DATABASE_PASSWORD=password
- GF_DATABASE_TYPE=mysql
- GF_DATABASE_HOST=db:3306
- GF_DATABASE_MAX_OPEN_CONN=300
- GF_SESSION_PROVIDER=mysql
- GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(db:3306)/grafana?allowNativePasswords=true
ports:
- 3020:3000
depends_on:
db:
condition: service_healthy
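Both grafana services above configure HA purely through environment variables; Grafana maps `GF_<SECTION>_<KEY>` variables onto ini options, so `GF_UNIFIED_ALERTING_HA_PEERS` corresponds to `ha_peers` under `[unified_alerting]`. A minimal sketch of that naming convention (the `envKey` helper below is hypothetical, for illustration only):

```go
package main

import (
	"fmt"
	"strings"
)

// envKey shows the GF_<SECTION>_<KEY> convention Grafana uses to override
// ini options via the environment: section and key are upper-cased and
// prefixed with GF_.
func envKey(section, key string) string {
	return "GF_" + strings.ToUpper(section) + "_" + strings.ToUpper(key)
}

func main() {
	// Prints GF_UNIFIED_ALERTING_HA_PEERS, matching the compose file above.
	fmt.Println(envKey("unified_alerting", "ha_peers"))
}
```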

@ -0,0 +1,203 @@
local numAlerts = std.extVar('alerts');
local condition = std.extVar('condition');
local arr = std.range(1, numAlerts);
local alertDashboardTemplate = {
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"alert": {
"conditions": [
{
"evaluator": {
"params": [
65
],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"A",
"5m",
"now"
]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"frequency": "10s",
"handler": 1,
"for": "1m",
"name": "bulk alerting",
"noDataState": "no_data",
"notifications": [
{
"id": 2
}
]
},
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 0
},
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"$$hashKey": "object:117",
"expr": "go_goroutines",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": [
{
"colorMode": "critical",
"fill": true,
"line": true,
"op": "gt",
"value": 50
}
],
"timeFrom": null,
"timeShift": null,
"title": "Panel Title",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"schemaVersion": 16,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "New dashboard",
"uid": null,
"version": 0
};
{
['alert-' + std.toString(x) + '.json']:
alertDashboardTemplate + {
panels: [
alertDashboardTemplate.panels[0] +
{
alert+: {
name: 'Alert rule ' + x,
conditions: [
alertDashboardTemplate.panels[0].alert.conditions[0] +
{
evaluator+: {
params: [condition]
}
},
],
},
},
],
uid: 'alert-' + x,
title: 'Alert ' + x
},
for x in arr
}

@ -0,0 +1,172 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"links": [],
"panels": [
{
"aliasColors": {
"Active alerts": "#bf1b00"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 12,
"w": 24,
"x": 0,
"y": 0
},
"id": 2,
"interval": "",
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "Active grafana instances",
"dashes": true,
"fill": 0
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(increase(grafana_alerting_notification_sent_total[1m])) by(job)",
"format": "time_series",
"instant": false,
"interval": "1m",
"intervalFactor": 1,
"legendFormat": "Notifications sent",
"refId": "A"
},
{
"expr": "min(grafana_alerting_active_alerts) without(instance)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 1,
"legendFormat": "Active alerts",
"refId": "B"
},
{
"expr": "count(up{job=\"grafana\"})",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Active grafana instances",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Notifications sent vs active alerts",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": 3
}
}
],
"schemaVersion": 16,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Overview",
"uid": "xHy7-hAik",
"version": 6
}

@ -0,0 +1,14 @@
apiVersion: 1

providers:
  - name: 'Alerts'
    folder: 'Alerts'
    type: file
    options:
      path: /etc/grafana/provisioning/dashboards/alerts
  - name: 'MySQL'
    folder: 'MySQL'
    type: file
    options:
      path: /etc/grafana/provisioning/dashboards/mysql

@ -0,0 +1,16 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    jsonData:
      timeInterval: 10s
      queryTimeout: 30s
      httpMethod: POST
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100

@ -0,0 +1,47 @@
# my global config
global:
  scrape_interval: 10s     # Scrape targets every 10 seconds.
  evaluation_interval: 10s # Evaluate rules every 10 seconds.
  # scrape_timeout is set to the global default (10s).

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
#rule_files:
#  - "alert.rules"
#  - "first.rules"
#  - "second.rules"

# alerting:
#   alertmanagers:
#     - scheme: http
#       static_configs:
#         - targets:
#             - "127.0.0.1:9093"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'grafana'
    dns_sd_configs:
      - names:
          - 'grafana'
        type: 'A'
        port: 3000
        refresh_interval: 10s

  - job_name: 'mysql'
    dns_sd_configs:
      - names:
          - 'mysqld-exporter'
        type: 'A'
        port: 9104
        refresh_interval: 10s

  - job_name: 'loki'
    dns_sd_configs:
      - names:
          - 'loki'
        type: 'A'
        port: 3100
        refresh_interval: 10s

@ -1119,9 +1119,51 @@ Sets a global limit on number of alert rules that can be created. Default is -1
For more information about the Grafana 8 alerts, refer to [Unified Alerting]({{< relref "../alerting/unified-alerting/_index.md" >}}).
### admin_config_poll_interval_seconds
### admin_config_poll_interval
Specify the frequency of polling for admin config changes. The default value is `60`.
Specify the frequency of polling for admin config changes. The default value is `60s`.
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
### alertmanager_config_poll_interval
Specify the frequency of polling for Alertmanager config changes. The default value is `60s`.
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
### ha_listen_address
Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. The default value is `0.0.0.0:9094`.
### ha_advertise_address
Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP.
### ha_peers
Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting.
### ha_peer_timeout
Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will
be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long
each instance should wait before sending the notification, to account for replication lag. The default value is `15s`.
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
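For example, with the default `ha_peer_timeout` of `15s`, the instance at position 0 notifies immediately, position 1 waits 15s, and position 2 waits 30s before flushing. A minimal sketch of that calculation, mirroring the `waitFunc` added in this change (the helper name below is illustrative only):

```go
package main

import (
	"fmt"
	"time"
)

// notificationWait scales the wait by cluster position: position 0 sends
// right away, and each later peer waits one extra ha_peer_timeout so the
// notification log has time to replicate.
func notificationWait(position int, peerTimeout time.Duration) time.Duration {
	return time.Duration(position) * peerTimeout
}

func main() {
	peerTimeout := 15 * time.Second // ha_peer_timeout default
	for pos := 0; pos < 3; pos++ {
		fmt.Printf("position %d waits %s before flushing notifications\n", pos, notificationWait(pos, peerTimeout))
	}
}
```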
### ha_gossip_interval
The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated
across the cluster more quickly at the expense of increased bandwidth usage. The default value is `200ms`.
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
### ha_push_pull_interval
The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
across larger clusters at the expense of increased bandwidth usage. The default value is `60s`.
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
<hr>

@ -52,6 +52,7 @@ type Scheduler struct {
}
type MultiOrgAlertmanager struct {
Registerer prometheus.Registerer
ActiveConfigurations prometheus.Gauge
DiscoveredConfigurations prometheus.Gauge
registries *OrgRegistries
@ -178,6 +179,7 @@ func newStateMetrics(r prometheus.Registerer) *State {
func newMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
return &MultiOrgAlertmanager{
Registerer: r,
registries: NewOrgRegistries(),
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: Namespace,

@ -84,6 +84,8 @@ type AlertNG struct {
}
func (ng *AlertNG) init() error {
var err error
baseInterval := ng.Cfg.AlertingBaseInterval
if baseInterval <= 0 {
baseInterval = defaultBaseIntervalSeconds
@ -97,7 +99,11 @@ func (ng *AlertNG) init() error {
Logger: ng.Log,
}
ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore, ng.Metrics.GetMultiOrgAlertmanagerMetrics())
multiOrgMetrics := ng.Metrics.GetMultiOrgAlertmanagerMetrics()
ng.MultiOrgAlertmanager, err = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore, multiOrgMetrics, log.New("ngalert.multiorg.alertmanager"))
if err != nil {
return err
}
// Let's make sure we're able to complete an initial sync of Alertmanagers before we start the alerting components.
if err := ng.MultiOrgAlertmanager.LoadAndSyncAlertmanagersForOrgs(context.Background()); err != nil {

@ -15,6 +15,7 @@ import (
gokit_log "github.com/go-kit/kit/log"
amv2 "github.com/prometheus/alertmanager/api/v2/models"
"github.com/prometheus/alertmanager/cluster"
"github.com/prometheus/alertmanager/dispatch"
"github.com/prometheus/alertmanager/inhibit"
"github.com/prometheus/alertmanager/nflog"
@ -24,6 +25,7 @@ import (
"github.com/prometheus/alertmanager/silence"
"github.com/prometheus/alertmanager/template"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/grafana/grafana/pkg/components/securejsondata"
@ -77,9 +79,16 @@ const (
`
)
type ClusterPeer interface {
AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel
Position() int
WaitReady(context.Context) error
}
type Alertmanager struct {
logger log.Logger
gokitLogger gokit_log.Logger
OrgID int64
Settings *setting.Cfg
Store store.AlertingStore
@ -90,6 +99,8 @@ type Alertmanager struct {
marker types.Marker
alerts *mem.Alerts
route *dispatch.Route
peer ClusterPeer
peerTimeout time.Duration
dispatcher *dispatch.Dispatcher
inhibitor *inhibit.Inhibitor
@ -111,7 +122,7 @@ type Alertmanager struct {
orgID int64
}
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Alertmanager) (*Alertmanager, error) {
func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, peer ClusterPeer, m *metrics.Alertmanager) (*Alertmanager, error) {
am := &Alertmanager{
Settings: cfg,
stopc: make(chan struct{}),
@ -120,6 +131,8 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k
stageMetrics: notify.NewMetrics(m.Registerer),
dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer),
Store: store,
peer: peer,
peerTimeout: cfg.HAPeerTimeout,
Metrics: m,
orgID: orgID,
}
@ -148,6 +161,9 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k
if err != nil {
return nil, fmt.Errorf("unable to initialize the notification log component of alerting: %w", err)
}
c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.OrgID), am.notificationLog, m.Registerer)
am.notificationLog.SetBroadcast(c.Broadcast)
// Initialize silences
am.silences, err = silence.New(silence.Options{
Metrics: m.Registerer,
@ -158,6 +174,9 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k
return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err)
}
c = am.peer.AddState(fmt.Sprintf("silences:%d", am.OrgID), am.silences, m.Registerer)
am.silences.SetBroadcast(c.Broadcast)
am.wg.Add(1)
go func() {
am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) {
@ -392,15 +411,16 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig
am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.gokitLogger)
am.silencer = silence.NewSilencer(am.silences, am.marker, am.gokitLogger)
meshStage := notify.NewGossipSettleStage(am.peer)
inhibitionStage := notify.NewMuteStage(am.inhibitor)
silencingStage := notify.NewMuteStage(am.silencer)
for name := range integrationsMap {
stage := am.createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog)
routingStage[name] = notify.MultiStage{silencingStage, inhibitionStage, stage}
stage := am.createReceiverStage(name, integrationsMap[name], am.waitFunc, am.notificationLog)
routingStage[name] = notify.MultiStage{meshStage, silencingStage, inhibitionStage, stage}
}
am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil)
am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics)
am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, am.timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics)
am.wg.Add(1)
go func() {
@ -701,21 +721,17 @@ func (am *Alertmanager) createReceiverStage(name string, integrations []notify.I
return fs
}
func waitFunc() time.Duration {
// When it's a single instance, we don't need additional wait. The routing policies will have their own group wait.
// We need >0 wait here in case we have peers to sync the notification state with. 0 wait in that case can result
// in duplicate notifications being sent.
// TODO: we have setting.AlertingNotificationTimeout in legacy settings. Either use that or separate set of config
// for clustering with intuitive name, like "PeerTimeout".
return 0
func (am *Alertmanager) waitFunc() time.Duration {
return time.Duration(am.peer.Position()) * am.peerTimeout
}
func timeoutFunc(d time.Duration) time.Duration {
// TODO: What does MinTimeout mean here?
func (am *Alertmanager) timeoutFunc(d time.Duration) time.Duration {
// time.Duration d relates to the receiver's group_interval. Even with a group interval of 1s,
// we need to make sure (non-position-0) peers in the cluster wait before flushing the notifications.
if d < notify.MinTimeout {
d = notify.MinTimeout
}
return d + waitFunc()
return d + am.waitFunc()
}
type nilLimits struct{}

@ -48,7 +48,7 @@ func setupAMTest(t *testing.T) *Alertmanager {
}
kvStore := newFakeKVStore(t)
am, err := newAlertmanager(1, cfg, s, kvStore, m)
am, err := newAlertmanager(1, cfg, s, kvStore, &NilPeer{}, m)
require.NoError(t, err)
return am
}

@ -6,6 +6,12 @@ import (
"sync"
"time"
"github.com/grafana/grafana/pkg/services/ngalert/logging"
gokit_log "github.com/go-kit/kit/log"
"github.com/prometheus/alertmanager/cluster"
"github.com/prometheus/client_golang/prometheus"
"github.com/grafana/grafana/pkg/infra/kvstore"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
@ -14,7 +20,6 @@ import (
)
var (
SyncOrgsPollInterval = 1 * time.Minute
ErrNoAlertmanagerForOrg = fmt.Errorf("Alertmanager does not exist for this organization")
ErrAlertmanagerNotReady = fmt.Errorf("Alertmanager is not ready yet")
)
@ -26,6 +31,10 @@ type MultiOrgAlertmanager struct {
settings *setting.Cfg
logger log.Logger
// clusterPeer represents the clustering peers of Alertmanagers between Grafana instances.
peer ClusterPeer
settleCancel context.CancelFunc
configStore store.AlertingStore
orgStore store.OrgStore
kvStore kvstore.KVStore
@ -33,16 +42,52 @@ type MultiOrgAlertmanager struct {
metrics *metrics.MultiOrgAlertmanager
}
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager) *MultiOrgAlertmanager {
return &MultiOrgAlertmanager{
func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager, l log.Logger) (*MultiOrgAlertmanager, error) {
moa := &MultiOrgAlertmanager{
logger: l,
settings: cfg,
logger: log.New("multiorg.alertmanager"),
alertmanagers: map[int64]*Alertmanager{},
configStore: configStore,
orgStore: orgStore,
kvStore: kvStore,
metrics: m,
}
clusterLogger := gokit_log.With(gokit_log.NewLogfmtLogger(logging.NewWrapper(l)), "component", "cluster")
moa.peer = &NilPeer{}
if len(cfg.HAPeers) > 0 {
peer, err := cluster.Create(
clusterLogger,
m.Registerer,
cfg.HAListenAddr,
cfg.HAAdvertiseAddr,
cfg.HAPeers, // peers
true,
cfg.HAPushPullInterval,
cfg.HAGossipInterval,
cluster.DefaultTcpTimeout,
cluster.DefaultProbeTimeout,
cluster.DefaultProbeInterval,
nil,
)
if err != nil {
return nil, fmt.Errorf("unable to initialize gossip mesh: %w", err)
}
err = peer.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout)
if err != nil {
l.Error("msg", "unable to join gossip mesh while initializing cluster for high availability mode", "err", err)
}
// Attempt to verify the number of peers for 30s, every 2s. The risk here is that we send a notification "too soon",
// which should _never_ happen given we share the notification log via the database, so the risk of double notification is very low.
var ctx context.Context
ctx, moa.settleCancel = context.WithTimeout(context.Background(), 30*time.Second)
go peer.Settle(ctx, cluster.DefaultGossipInterval*10)
moa.peer = peer
}
return moa, nil
}
func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error {
@ -53,7 +98,7 @@ func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error {
case <-ctx.Done():
moa.StopAndWait()
return nil
case <-time.After(SyncOrgsPollInterval):
case <-time.After(moa.settings.AlertmanagerConfigPollInterval):
if err := moa.LoadAndSyncAlertmanagersForOrgs(ctx); err != nil {
moa.logger.Error("error while synchronizing Alertmanager orgs", "err", err)
}
@ -90,7 +135,7 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) {
// To export them, we need to translate the metrics from each individual registry and,
// then aggregate them on the main registry.
m := metrics.NewAlertmanagerMetrics(moa.metrics.GetOrCreateOrgRegistry(orgID))
am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, m)
am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, moa.peer, m)
if err != nil {
moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "err", err)
}
@ -130,6 +175,14 @@ func (moa *MultiOrgAlertmanager) StopAndWait() {
for _, am := range moa.alertmanagers {
am.StopAndWait()
}
p, ok := moa.peer.(*cluster.Peer)
if ok {
moa.settleCancel()
if err := p.Leave(10 * time.Second); err != nil {
moa.logger.Warn("unable to leave the gossip mesh", "err", err)
}
}
}
// AlertmanagerFor returns the Alertmanager instance for the organization provided.
@ -150,3 +203,16 @@ func (moa *MultiOrgAlertmanager) AlertmanagerFor(orgID int64) (*Alertmanager, er
return orgAM, nil
}
// NilPeer and NilChannel implement the Alertmanager clustering interface.
type NilPeer struct{}
func (p *NilPeer) Position() int { return 0 }
func (p *NilPeer) WaitReady(context.Context) error { return nil }
func (p *NilPeer) AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel {
return &NilChannel{}
}
type NilChannel struct{}
func (c *NilChannel) Broadcast([]byte) {}

@ -8,6 +8,7 @@ import (
"testing"
"time"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/setting"
@ -18,7 +19,6 @@ import (
)
func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
t.Skipf("Skipping multiorg alertmanager tests for now")
configStore := &FakeConfigStore{
configs: map[int64]*models.AlertConfiguration{},
}
@ -28,12 +28,15 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
tmpDir, err := ioutil.TempDir("", "test")
require.NoError(t, err)
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
kvStore := newFakeKVStore(t)
reg := prometheus.NewPedanticRegistry()
m := metrics.NewNGAlert(reg)
mam := NewMultiOrgAlertmanager(&setting.Cfg{DataPath: tmpDir}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
cfg := &setting.Cfg{
DataPath: tmpDir,
AlertmanagerConfigPollInterval: 3 * time.Minute, // do not poll in tests
}
mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger"))
require.NoError(t, err)
ctx := context.Background()
t.Cleanup(cleanOrgDirectories(tmpDir, t))
@ -82,22 +85,23 @@ grafana_alerting_discovered_configurations 4
}
func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
t.Skipf("Skipping multiorg alertmanager tests for now")
configStore := &FakeConfigStore{
configs: map[int64]*models.AlertConfiguration{},
}
orgStore := &FakeOrgStore{
orgs: []int64{1, 2, 3},
}
tmpDir, err := ioutil.TempDir("", "test")
require.NoError(t, err)
SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests.
cfg := &setting.Cfg{
DataPath: tmpDir,
AlertmanagerConfigPollInterval: 3 * time.Minute, // do not poll in tests
}
kvStore := newFakeKVStore(t)
reg := prometheus.NewPedanticRegistry()
m := metrics.NewNGAlert(reg)
mam := NewMultiOrgAlertmanager(&setting.Cfg{DataPath: tmpDir}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics())
mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger"))
require.NoError(t, err)
ctx := context.Background()
t.Cleanup(cleanOrgDirectories(tmpDir, t))

@ -231,6 +231,8 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
mockedClock := clock.NewMock()
logger := log.New("ngalert schedule test")
m := metrics.NewNGAlert(prometheus.NewPedanticRegistry())
moa, err := notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}, nil, log.New("testlogger"))
require.NoError(t, err)
schedCfg := SchedulerCfg{
C: mockedClock,
BaseInterval: time.Second,
@ -239,7 +241,7 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac
RuleStore: rs,
InstanceStore: is,
AdminConfigStore: acs,
MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}, nil),
MultiOrgNotifier: moa,
Logger: logger,
Metrics: m.GetSchedulerMetrics(),
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.

@ -18,15 +18,14 @@ import (
"strings"
"time"
"github.com/gobwas/glob"
"github.com/prometheus/common/model"
"gopkg.in/ini.v1"
"github.com/grafana/grafana-aws-sdk/pkg/awsds"
"github.com/grafana/grafana/pkg/components/gtime"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/util"
"github.com/gobwas/glob"
"github.com/prometheus/common/model"
"gopkg.in/ini.v1"
)
type Scheme string
@ -420,7 +419,14 @@ type Cfg struct {
GeomapEnableCustomBaseLayers bool
// Unified Alerting
AdminConfigPollInterval time.Duration
AdminConfigPollInterval time.Duration
AlertmanagerConfigPollInterval time.Duration
HAListenAddr string
HAAdvertiseAddr string
HAPeers []string
HAPeerTimeout time.Duration
HAGossipInterval time.Duration
HAPushPullInterval time.Duration
}
// IsLiveConfigEnabled returns true if live should be able to save configs to SQL tables
@ -916,8 +922,7 @@ func (cfg *Cfg) Load(args CommandLineArgs) error {
if err := readAlertingSettings(iniFile); err != nil {
return err
}
if err := cfg.readUnifiedAlertingSettings(iniFile); err != nil {
if err := cfg.ReadUnifiedAlertingSettings(iniFile); err != nil {
return err
}
@ -1374,13 +1379,6 @@ func (cfg *Cfg) readRenderingSettings(iniFile *ini.File) error {
return nil
}
func (cfg *Cfg) readUnifiedAlertingSettings(iniFile *ini.File) error {
ua := iniFile.Section("unified_alerting")
s := ua.Key("admin_config_poll_interval_seconds").MustInt(60)
cfg.AdminConfigPollInterval = time.Second * time.Duration(s)
return nil
}
func readAlertingSettings(iniFile *ini.File) error {
alerting := iniFile.Section("alerting")
AlertingEnabled = alerting.Key("enabled").MustBool(true)

@ -0,0 +1,57 @@
package setting
import (
"strings"
"time"
"github.com/grafana/grafana/pkg/components/gtime"
"github.com/prometheus/alertmanager/cluster"
"gopkg.in/ini.v1"
)
const (
AlertmanagerDefaultClusterAddr = "0.0.0.0:9094"
AlertmanagerDefaultPeerTimeout = 15 * time.Second
AlertmanagerDefaultGossipInterval = cluster.DefaultGossipInterval
AlertmanagerDefaultPushPullInterval = cluster.DefaultPushPullInterval
SchedulerDefaultAdminConfigPollInterval = 60 * time.Second
AlertmanagerDefaultConfigPollInterval = 60 * time.Second
)
func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
ua := iniFile.Section("unified_alerting")
var err error
cfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (SchedulerDefaultAdminConfigPollInterval).String()))
if err != nil {
return err
}
cfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (AlertmanagerDefaultConfigPollInterval).String()))
if err != nil {
return err
}
cfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (AlertmanagerDefaultPeerTimeout).String()))
if err != nil {
return err
}
cfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (AlertmanagerDefaultGossipInterval).String()))
if err != nil {
return err
}
cfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (AlertmanagerDefaultPushPullInterval).String()))
if err != nil {
return err
}
cfg.HAListenAddr = ua.Key("ha_listen_address").MustString(AlertmanagerDefaultClusterAddr)
cfg.HAAdvertiseAddr = ua.Key("ha_advertise_address").MustString("")
peers := ua.Key("ha_peers").MustString("")
cfg.HAPeers = make([]string, 0)
if peers != "" {
for _, peer := range strings.Split(peers, ",") {
peer = strings.TrimSpace(peer)
cfg.HAPeers = append(cfg.HAPeers, peer)
}
}
return nil
}
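All of the new interval options go through `gtime.ParseDuration`, so they accept the duration strings described in the ini comments (ms, s, m, h, d). A small sketch of that parsing from a caller's perspective, assuming a Grafana checkout so the `gtime` package used above is importable:

```go
package main

import (
	"fmt"

	"github.com/grafana/grafana/pkg/components/gtime"
)

func main() {
	// The same kind of values the [unified_alerting] section accepts,
	// e.g. ha_gossip_interval = 200ms or ha_push_pull_interval = 60s.
	for _, raw := range []string{"200ms", "60s", "1m", "1d"} {
		d, err := gtime.ParseDuration(raw)
		if err != nil {
			fmt.Printf("invalid interval %q: %v\n", raw, err)
			continue
		}
		fmt.Printf("%s parses to %s\n", raw, d)
	}
}
```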

@ -0,0 +1,39 @@
package setting
import (
"testing"
"time"
"github.com/stretchr/testify/require"
)
func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
cfg := NewCfg()
err := cfg.Load(CommandLineArgs{HomePath: "../../", Config: "../../conf/defaults.ini"})
require.NoError(t, err)
// It sets the correct defaults.
{
require.Equal(t, 60*time.Second, cfg.AdminConfigPollInterval)
require.Equal(t, 60*time.Second, cfg.AlertmanagerConfigPollInterval)
require.Equal(t, 15*time.Second, cfg.HAPeerTimeout)
require.Equal(t, "0.0.0.0:9094", cfg.HAListenAddr)
require.Equal(t, "", cfg.HAAdvertiseAddr)
require.Len(t, cfg.HAPeers, 0)
require.Equal(t, 200*time.Millisecond, cfg.HAGossipInterval)
require.Equal(t, 60*time.Second, cfg.HAPushPullInterval)
}
// With peers set, it correctly parses them.
{
require.Len(t, cfg.HAPeers, 0)
s, err := cfg.Raw.NewSection("unified_alerting")
require.NoError(t, err)
_, err = s.NewKey("ha_peers", "hostname1:9090,hostname2:9090,hostname3:9090")
require.NoError(t, err)
require.NoError(t, cfg.ReadUnifiedAlertingSettings(cfg.Raw))
require.Len(t, cfg.HAPeers, 3)
require.ElementsMatch(t, []string{"hostname1:9090", "hostname2:9090", "hostname3:9090"}, cfg.HAPeers)
}
}

@ -21,9 +21,9 @@ import (
func TestAdminConfiguration_SendingToExternalAlertmanagers(t *testing.T) {
dir, path := testinfra.CreateGrafDir(t, testinfra.GrafanaOpts{
EnableFeatureToggles: []string{"ngalert"},
DisableAnonymous: true,
NGAlertAdminConfigIntervalSeconds: 2,
EnableFeatureToggles: []string{"ngalert"},
DisableAnonymous: true,
NGAlertAdminConfigPollInterval: 2 * time.Second,
})
grafanaListedAddr, s := testinfra.StartGrafana(t, dir, path)

@ -8,8 +8,6 @@ import (
"testing"
"time"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
"github.com/grafana/grafana/pkg/bus"
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
@ -19,16 +17,10 @@ import (
)
func TestAlertmanagerConfigurationIsTransactional(t *testing.T) {
// TODO: We need a reliable way to ensure Alertmanagers have synced correctly.
// For now, make them sync quicker.
p := notifier.SyncOrgsPollInterval
notifier.SyncOrgsPollInterval = 2 * time.Second
t.Cleanup(func() {
notifier.SyncOrgsPollInterval = p
})
dir, path := testinfra.CreateGrafDir(t, testinfra.GrafanaOpts{
EnableFeatureToggles: []string{"ngalert"},
DisableAnonymous: true,
EnableFeatureToggles: []string{"ngalert"},
NGAlertAlertmanagerConfigPollInterval: 2 * time.Second,
DisableAnonymous: true,
})
grafanaListedAddr, store := testinfra.StartGrafana(t, dir, path)

@ -10,6 +10,7 @@ import (
"path/filepath"
"strings"
"testing"
"time"
"github.com/grafana/grafana/pkg/api"
"github.com/grafana/grafana/pkg/infra/fs"
@ -204,13 +205,18 @@ func CreateGrafDir(t *testing.T, opts ...GrafanaOpts) (string, string) {
_, err = featureSection.NewKey("enable", strings.Join(o.EnableFeatureToggles, " "))
require.NoError(t, err)
}
if o.NGAlertAdminConfigIntervalSeconds != 0 {
ngalertingSection, err := cfg.NewSection("ngalerting")
if o.NGAlertAdminConfigPollInterval != 0 {
ngalertingSection, err := cfg.NewSection("unified_alerting")
require.NoError(t, err)
_, err = ngalertingSection.NewKey("admin_config_poll_interval_seconds", fmt.Sprintf("%d", o.NGAlertAdminConfigIntervalSeconds))
_, err = ngalertingSection.NewKey("admin_config_poll_interval", o.NGAlertAdminConfigPollInterval.String())
require.NoError(t, err)
}
if o.NGAlertAlertmanagerConfigPollInterval != 0 {
ngalertingSection, err := cfg.NewSection("unified_alerting")
require.NoError(t, err)
_, err = ngalertingSection.NewKey("alertmanager_config_poll_interval", o.NGAlertAlertmanagerConfigPollInterval.String())
require.NoError(t, err)
}
if o.AnonymousUserRole != "" {
_, err = anonSect.NewKey("org_role", string(o.AnonymousUserRole))
require.NoError(t, err)
@ -252,13 +258,14 @@ func CreateGrafDir(t *testing.T, opts ...GrafanaOpts) (string, string) {
}
type GrafanaOpts struct {
EnableCSP bool
EnableFeatureToggles []string
NGAlertAdminConfigIntervalSeconds int
AnonymousUserRole models.RoleType
EnableQuota bool
DisableAnonymous bool
CatalogAppEnabled bool
ViewersCanEdit bool
PluginAdminEnabled bool
EnableCSP bool
EnableFeatureToggles []string
NGAlertAdminConfigPollInterval time.Duration
NGAlertAlertmanagerConfigPollInterval time.Duration
AnonymousUserRole models.RoleType
EnableQuota bool
DisableAnonymous bool
CatalogAppEnabled bool
ViewersCanEdit bool
PluginAdminEnabled bool
}
