mirror of https://github.com/grafana/grafana
Alerting: Support Unified Alerting with Grafana HA (#37920)
* Alerting: Support Unified Alerting in Grafana's HA mode.pull/39298/head
parent
92209f1011
commit
7db97097c9
@ -0,0 +1 @@ |
||||
grafana/provisioning/dashboards/alerts/alert-* |
@ -0,0 +1,66 @@ |
||||
# Grafana Unified Alerting High Availability (HA) test setup |
||||
|
||||
A set of docker compose services which together create a Grafana HA test setup for unified alerting. |
||||
|
||||
Included services |
||||
|
||||
- Grafana |
||||
- MySQL - Grafana configuration database and session storage, plus an exporter for metrics |
||||
- Prometheus - Monitoring of Grafana and used as data source |
||||
- Nginx - Reverse proxy for Grafana and Prometheus. Enables browsing Grafana/Prometheus UI using a hostname |
||||
|
||||
## Prerequisites |
||||
|
||||
### Build grafana docker container |
||||
|
||||
Build a Grafana docker image from the current branch and commit, and tag it as `grafana/grafana:dev`. |
||||
|
||||
```bash |
||||
$ cd <grafana repo> |
||||
$ make build-docker-full |
||||
``` |
||||
|
||||
### Virtual host names |
||||
|
||||
#### Alternative 1 - Use dnsmasq |
||||
|
||||
```bash |
||||
$ sudo apt-get install dnsmasq |
||||
$ echo 'address=/loc/127.0.0.1' | sudo tee /etc/dnsmasq.d/dnsmasq-loc.conf > /dev/null |
||||
$ sudo /etc/init.d/dnsmasq restart |
||||
$ ping whatever.loc |
||||
PING whatever.loc (127.0.0.1) 56(84) bytes of data. |
||||
64 bytes from localhost (127.0.0.1): icmp_seq=1 ttl=64 time=0.076 ms |
||||
--- whatever.loc ping statistics --- |
||||
1 packet transmitted, 1 received, 0% packet loss, time 1998ms |
||||
``` |
||||
|
||||
#### Alternative 2 - Manually update /etc/hosts |
||||
|
||||
Update your `/etc/hosts` to be able to access Grafana and/or Prometheus UI using a hostname. |
||||
|
||||
```bash |
||||
$ cat /etc/hosts |
||||
127.0.0.1 grafana.loc |
||||
127.0.0.1 prometheus.loc |
||||
``` |
||||
|
||||
## Start services |
||||
|
||||
```bash |
||||
$ docker-compose up -d |
||||
``` |
||||
|
||||
Browse |
||||
- http://grafana.loc/ |
||||
- http://prometheus.loc/ |
||||
|
||||
|
||||
## Test alerting |
||||
|
||||
### Create contact points |
||||
TBD |
||||
### Create alerts |
||||
TBD |
||||
### Create silences |
||||
TBD |
@ -0,0 +1,90 @@ |
||||
version: "2.1" |
||||
|
||||
services: |
||||
db: |
||||
image: mysql:5.6 |
||||
platform: linux/x86_64 |
||||
environment: |
||||
MYSQL_ROOT_PASSWORD: rootpass |
||||
MYSQL_DATABASE: grafana |
||||
MYSQL_USER: grafana |
||||
MYSQL_PASSWORD: password |
||||
command: [mysqld, --character-set-server=utf8mb4, --collation-server=utf8mb4_unicode_ci, --innodb_monitor_enable=all, --max-connections=1001] |
||||
ports: |
||||
- 3306 |
||||
healthcheck: |
||||
test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"] |
||||
timeout: 10s |
||||
retries: 10 |
||||
mysqld-exporter: |
||||
image: prom/mysqld-exporter |
||||
environment: |
||||
- DATA_SOURCE_NAME=root:rootpass@(db:3306)/ |
||||
ports: |
||||
- 9104 |
||||
depends_on: |
||||
db: |
||||
condition: service_healthy |
||||
prometheus:
  image: prom/prometheus:v2.4.2
  volumes:
    - ./prometheus/:/etc/prometheus/
  environment:
    - VIRTUAL_HOST=prometheus.loc
  ports:
    # Prometheus is addressed on 9090 throughout this setup (the provisioned
    # datasource uses http://prometheus:9090 and the self-scrape job targets
    # localhost:9090), so expose that port rather than 909.
    - 9090
||||
nginx-proxy: |
||||
image: jwilder/nginx-proxy |
||||
ports: |
||||
- "80:80" |
||||
volumes: |
||||
- /var/run/docker.sock:/tmp/docker.sock:ro |
||||
depends_on: |
||||
db: |
||||
condition: service_healthy |
||||
grafana1:
  image: grafana/grafana:dev
  volumes:
    - ./grafana/provisioning/:/etc/grafana/provisioning/
  environment:
    - VIRTUAL_HOST=grafana.loc
    - GF_FEATURE_TOGGLES_ENABLE=ngalert
    # Peers are addressed by compose service name. Generated container names
    # (<project>_<service>_<n>) change with the checkout directory name and
    # with the Compose version's separator, whereas service names always
    # resolve on the compose network.
    - GF_UNIFIED_ALERTING_HA_PEERS=grafana2:9094,grafana1:9094
    - GF_SERVER_ROOT_URL=http://grafana.loc
    - GF_DATABASE_NAME=grafana
    - GF_DATABASE_USER=grafana
    - GF_DATABASE_PASSWORD=password
    - GF_DATABASE_TYPE=mysql
    - GF_DATABASE_HOST=db:3306
    - GF_DATABASE_MAX_OPEN_CONN=300
    - GF_SESSION_PROVIDER=mysql
    - GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(db:3306)/grafana?allowNativePasswords=true
  ports:
    # Direct access to this instance, bypassing the nginx proxy.
    - 3010:3000
  depends_on:
    db:
      condition: service_healthy
||||
|
||||
grafana2:
  image: grafana/grafana:dev
  volumes:
    - ./grafana/provisioning/:/etc/grafana/provisioning/
  environment:
    - VIRTUAL_HOST=grafana.loc
    - GF_FEATURE_TOGGLES_ENABLE=ngalert
    # Peers are addressed by compose service name. Generated container names
    # (<project>_<service>_<n>) change with the checkout directory name and
    # with the Compose version's separator, whereas service names always
    # resolve on the compose network.
    - GF_UNIFIED_ALERTING_HA_PEERS=grafana2:9094,grafana1:9094
    - GF_SERVER_ROOT_URL=http://grafana.loc
    - GF_DATABASE_NAME=grafana
    - GF_DATABASE_USER=grafana
    - GF_DATABASE_PASSWORD=password
    - GF_DATABASE_TYPE=mysql
    - GF_DATABASE_HOST=db:3306
    - GF_DATABASE_MAX_OPEN_CONN=300
    - GF_SESSION_PROVIDER=mysql
    - GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(db:3306)/grafana?allowNativePasswords=true
  ports:
    # Direct access to this instance, bypassing the nginx proxy.
    - 3020:3000
  depends_on:
    db:
      condition: service_healthy
||||
|
@ -0,0 +1,203 @@ |
||||
// External variables that parameterize the generated alert dashboards.
// 'alerts' is the number of dashboards to generate; it is fed to std.range
// below, so it presumably has to be supplied as a number (e.g. via --ext-code
// rather than --ext-str) — TODO confirm how the generator is invoked.
local numAlerts = std.extVar('alerts'); |
||||
// 'condition' replaces the evaluator threshold of every generated alert rule.
local condition = std.extVar('condition'); |
||||
// Indices 1..numAlerts, one per generated dashboard file.
local arr = std.range(1, numAlerts); |
||||
|
||||
local alertDashboardTemplate = { |
||||
"editable": true, |
||||
"gnetId": null, |
||||
"graphTooltip": 0, |
||||
"id": null, |
||||
"links": [], |
||||
"panels": [ |
||||
{ |
||||
"alert": { |
||||
"conditions": [ |
||||
{ |
||||
"evaluator": { |
||||
"params": [ |
||||
65 |
||||
], |
||||
"type": "gt" |
||||
}, |
||||
"operator": { |
||||
"type": "and" |
||||
}, |
||||
"query": { |
||||
"params": [ |
||||
"A", |
||||
"5m", |
||||
"now" |
||||
] |
||||
}, |
||||
"reducer": { |
||||
"params": [], |
||||
"type": "avg" |
||||
}, |
||||
"type": "query" |
||||
} |
||||
], |
||||
"executionErrorState": "alerting", |
||||
"frequency": "10s", |
||||
"handler": 1, |
||||
"for": "1m", |
||||
"name": "bulk alerting", |
||||
"noDataState": "no_data", |
||||
"notifications": [ |
||||
{ |
||||
"id": 2 |
||||
} |
||||
] |
||||
}, |
||||
"aliasColors": {}, |
||||
"bars": false, |
||||
"dashLength": 10, |
||||
"dashes": false, |
||||
"datasource": "Prometheus", |
||||
"fill": 1, |
||||
"gridPos": { |
||||
"h": 9, |
||||
"w": 12, |
||||
"x": 0, |
||||
"y": 0 |
||||
}, |
||||
"id": 2, |
||||
"legend": { |
||||
"avg": false, |
||||
"current": false, |
||||
"max": false, |
||||
"min": false, |
||||
"show": true, |
||||
"total": false, |
||||
"values": false |
||||
}, |
||||
"lines": true, |
||||
"linewidth": 1, |
||||
"nullPointMode": "null", |
||||
"percentage": false, |
||||
"pointradius": 5, |
||||
"points": false, |
||||
"renderer": "flot", |
||||
"seriesOverrides": [], |
||||
"spaceLength": 10, |
||||
"stack": false, |
||||
"steppedLine": false, |
||||
"targets": [ |
||||
{ |
||||
"$$hashKey": "object:117", |
||||
"expr": "go_goroutines", |
||||
"format": "time_series", |
||||
"intervalFactor": 1, |
||||
"refId": "A" |
||||
} |
||||
], |
||||
"thresholds": [ |
||||
{ |
||||
"colorMode": "critical", |
||||
"fill": true, |
||||
"line": true, |
||||
"op": "gt", |
||||
"value": 50 |
||||
} |
||||
], |
||||
"timeFrom": null, |
||||
"timeShift": null, |
||||
"title": "Panel Title", |
||||
"tooltip": { |
||||
"shared": true, |
||||
"sort": 0, |
||||
"value_type": "individual" |
||||
}, |
||||
"type": "graph", |
||||
"xaxis": { |
||||
"buckets": null, |
||||
"mode": "time", |
||||
"name": null, |
||||
"show": true, |
||||
"values": [] |
||||
}, |
||||
"yaxes": [ |
||||
{ |
||||
"format": "short", |
||||
"label": null, |
||||
"logBase": 1, |
||||
"max": null, |
||||
"min": null, |
||||
"show": true |
||||
}, |
||||
{ |
||||
"format": "short", |
||||
"label": null, |
||||
"logBase": 1, |
||||
"max": null, |
||||
"min": null, |
||||
"show": true |
||||
} |
||||
] |
||||
} |
||||
], |
||||
"schemaVersion": 16, |
||||
"style": "dark", |
||||
"tags": [], |
||||
"templating": { |
||||
"list": [] |
||||
}, |
||||
"time": { |
||||
"from": "now-6h", |
||||
"to": "now" |
||||
}, |
||||
"timepicker": { |
||||
"refresh_intervals": [ |
||||
"5s", |
||||
"10s", |
||||
"30s", |
||||
"1m", |
||||
"5m", |
||||
"15m", |
||||
"30m", |
||||
"1h", |
||||
"2h", |
||||
"1d" |
||||
], |
||||
"time_options": [ |
||||
"5m", |
||||
"15m", |
||||
"1h", |
||||
"6h", |
||||
"12h", |
||||
"24h", |
||||
"2d", |
||||
"7d", |
||||
"30d" |
||||
] |
||||
}, |
||||
"timezone": "", |
||||
"title": "New dashboard", |
||||
"uid": null, |
||||
"version": 0 |
||||
}; |
||||
|
||||
|
||||
// Emit one dashboard file per index in `arr`, named alert-<n>.json. Each file
// is the template dashboard with its own uid/title, and with the single
// panel's alert rule renamed and its evaluator threshold set to `condition`.
{
  ['alert-' + std.toString(n) + '.json']:
    local basePanel = alertDashboardTemplate.panels[0];
    local baseCondition = basePanel.alert.conditions[0];
    alertDashboardTemplate {
      uid: 'alert-' + n,
      title: 'Alert ' + n,
      panels: [
        basePanel {
          alert+: {
            name: 'Alert rule ' + n,
            conditions: [
              baseCondition {
                evaluator+: { params: [condition] },
              },
            ],
          },
        },
      ],
    }
  for n in arr
}
@ -0,0 +1,172 @@ |
||||
{ |
||||
"annotations": { |
||||
"list": [ |
||||
{ |
||||
"builtIn": 1, |
||||
"datasource": "-- Grafana --", |
||||
"enable": true, |
||||
"hide": true, |
||||
"iconColor": "rgba(0, 211, 255, 1)", |
||||
"name": "Annotations & Alerts", |
||||
"type": "dashboard" |
||||
} |
||||
] |
||||
}, |
||||
"editable": true, |
||||
"gnetId": null, |
||||
"graphTooltip": 0, |
||||
"links": [], |
||||
"panels": [ |
||||
{ |
||||
"aliasColors": { |
||||
"Active alerts": "#bf1b00" |
||||
}, |
||||
"bars": false, |
||||
"dashLength": 10, |
||||
"dashes": false, |
||||
"datasource": "Prometheus", |
||||
"fill": 1, |
||||
"gridPos": { |
||||
"h": 12, |
||||
"w": 24, |
||||
"x": 0, |
||||
"y": 0 |
||||
}, |
||||
"id": 2, |
||||
"interval": "", |
||||
"legend": { |
||||
"alignAsTable": true, |
||||
"avg": false, |
||||
"current": true, |
||||
"max": false, |
||||
"min": false, |
||||
"rightSide": true, |
||||
"show": true, |
||||
"total": false, |
||||
"values": true |
||||
}, |
||||
"lines": true, |
||||
"linewidth": 2, |
||||
"links": [], |
||||
"nullPointMode": "null", |
||||
"percentage": false, |
||||
"pointradius": 5, |
||||
"points": false, |
||||
"renderer": "flot", |
||||
"seriesOverrides": [ |
||||
{ |
||||
"alias": "Active grafana instances", |
||||
"dashes": true, |
||||
"fill": 0 |
||||
} |
||||
], |
||||
"spaceLength": 10, |
||||
"stack": false, |
||||
"steppedLine": false, |
||||
"targets": [ |
||||
{ |
||||
"expr": "sum(increase(grafana_alerting_notification_sent_total[1m])) by(job)", |
||||
"format": "time_series", |
||||
"instant": false, |
||||
"interval": "1m", |
||||
"intervalFactor": 1, |
||||
"legendFormat": "Notifications sent", |
||||
"refId": "A" |
||||
}, |
||||
{ |
||||
"expr": "min(grafana_alerting_active_alerts) without(instance)", |
||||
"format": "time_series", |
||||
"interval": "1m", |
||||
"intervalFactor": 1, |
||||
"legendFormat": "Active alerts", |
||||
"refId": "B" |
||||
}, |
||||
{ |
||||
"expr": "count(up{job=\"grafana\"})", |
||||
"format": "time_series", |
||||
"intervalFactor": 1, |
||||
"legendFormat": "Active grafana instances", |
||||
"refId": "C" |
||||
} |
||||
], |
||||
"thresholds": [], |
||||
"timeFrom": null, |
||||
"timeShift": null, |
||||
"title": "Notifications sent vs active alerts", |
||||
"tooltip": { |
||||
"shared": true, |
||||
"sort": 0, |
||||
"value_type": "individual" |
||||
}, |
||||
"type": "graph", |
||||
"xaxis": { |
||||
"buckets": null, |
||||
"mode": "time", |
||||
"name": null, |
||||
"show": true, |
||||
"values": [] |
||||
}, |
||||
"yaxes": [ |
||||
{ |
||||
"format": "short", |
||||
"label": null, |
||||
"logBase": 1, |
||||
"max": null, |
||||
"min": "0", |
||||
"show": true |
||||
}, |
||||
{ |
||||
"format": "short", |
||||
"label": null, |
||||
"logBase": 1, |
||||
"max": null, |
||||
"min": null, |
||||
"show": true |
||||
} |
||||
], |
||||
"yaxis": { |
||||
"align": false, |
||||
"alignLevel": 3 |
||||
} |
||||
} |
||||
], |
||||
"schemaVersion": 16, |
||||
"style": "dark", |
||||
"tags": [], |
||||
"templating": { |
||||
"list": [] |
||||
}, |
||||
"time": { |
||||
"from": "now-1h", |
||||
"to": "now" |
||||
}, |
||||
"timepicker": { |
||||
"refresh_intervals": [ |
||||
"5s", |
||||
"10s", |
||||
"30s", |
||||
"1m", |
||||
"5m", |
||||
"15m", |
||||
"30m", |
||||
"1h", |
||||
"2h", |
||||
"1d" |
||||
], |
||||
"time_options": [ |
||||
"5m", |
||||
"15m", |
||||
"1h", |
||||
"6h", |
||||
"12h", |
||||
"24h", |
||||
"2d", |
||||
"7d", |
||||
"30d" |
||||
] |
||||
}, |
||||
"timezone": "", |
||||
"title": "Overview", |
||||
"uid": "xHy7-hAik", |
||||
"version": 6 |
||||
} |
@ -0,0 +1,14 @@ |
||||
apiVersion: 1 |
||||
|
||||
providers: |
||||
- name: 'Alerts' |
||||
folder: 'Alerts' |
||||
type: file |
||||
options: |
||||
path: /etc/grafana/provisioning/dashboards/alerts |
||||
|
||||
- name: 'MySQL' |
||||
folder: 'MySQL' |
||||
type: file |
||||
options: |
||||
path: /etc/grafana/provisioning/dashboards/mysql |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,16 @@ |
||||
apiVersion: 1 |
||||
|
||||
datasources: |
||||
- name: Prometheus |
||||
type: prometheus |
||||
access: proxy |
||||
url: http://prometheus:9090 |
||||
jsonData: |
||||
timeInterval: 10s |
||||
queryTimeout: 30s |
||||
httpMethod: POST |
||||
|
||||
- name: Loki |
||||
type: loki |
||||
access: proxy |
||||
url: http://loki:3100 |
@ -0,0 +1,47 @@ |
||||
# my global config |
||||
global: |
||||
scrape_interval: 10s # Scrape targets every 10 seconds. |
||||
evaluation_interval: 10s # Evaluate rules every 10 seconds. |
||||
# scrape_timeout is set to the global default (10s). |
||||
|
||||
# Load and evaluate rules in this file every 'evaluation_interval' seconds. |
||||
#rule_files: |
||||
# - "alert.rules" |
||||
# - "first.rules" |
||||
# - "second.rules" |
||||
|
||||
# alerting: |
||||
# alertmanagers: |
||||
# - scheme: http |
||||
# static_configs: |
||||
# - targets: |
||||
# - "127.0.0.1:9093" |
||||
|
||||
scrape_configs: |
||||
- job_name: 'prometheus' |
||||
static_configs: |
||||
- targets: ['localhost:9090'] |
||||
|
||||
- job_name: 'grafana'
  dns_sd_configs:
    - names:
        # The compose file defines the Grafana instances as the services
        # grafana1 and grafana2; there is no service named "grafana", so
        # that name would not resolve to any scrape target.
        - 'grafana1'
        - 'grafana2'
      type: 'A'
      port: 3000
      refresh_interval: 10s
||||
|
||||
- job_name: 'mysql' |
||||
dns_sd_configs: |
||||
- names: |
||||
- 'mysqld-exporter' |
||||
type: 'A' |
||||
port: 9104 |
||||
refresh_interval: 10s |
||||
|
||||
- job_name: 'loki' |
||||
dns_sd_configs: |
||||
- names: |
||||
- 'loki' |
||||
type: 'A' |
||||
port: 3100 |
||||
refresh_interval: 10s |
@ -0,0 +1,57 @@ |
||||
package setting |
||||
|
||||
import ( |
||||
"strings" |
||||
"time" |
||||
|
||||
"github.com/grafana/grafana/pkg/components/gtime" |
||||
|
||||
"github.com/prometheus/alertmanager/cluster" |
||||
"gopkg.in/ini.v1" |
||||
) |
||||
|
||||
// Fallback values used by ReadUnifiedAlertingSettings when the corresponding
// key is absent from the [unified_alerting] section.
const ( |
||||
// AlertmanagerDefaultClusterAddr is the fallback for ha_listen_address.
AlertmanagerDefaultClusterAddr = "0.0.0.0:9094" |
||||
// AlertmanagerDefaultPeerTimeout is the fallback for ha_peer_timeout.
AlertmanagerDefaultPeerTimeout = 15 * time.Second |
||||
// AlertmanagerDefaultGossipInterval is the fallback for ha_gossip_interval;
// it reuses the Alertmanager cluster package's default.
AlertmanagerDefaultGossipInterval = cluster.DefaultGossipInterval |
||||
// AlertmanagerDefaultPushPullInterval is the fallback for
// ha_push_pull_interval; it reuses the Alertmanager cluster package's default.
AlertmanagerDefaultPushPullInterval = cluster.DefaultPushPullInterval |
||||
// SchedulerDefaultAdminConfigPollInterval is the fallback for
// admin_config_poll_interval.
SchedulerDefaultAdminConfigPollInterval = 60 * time.Second |
||||
// AlertmanagerDefaultConfigPollInterval is the fallback for
// alertmanager_config_poll_interval.
AlertmanagerDefaultConfigPollInterval = 60 * time.Second |
||||
) |
||||
|
||||
func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { |
||||
ua := iniFile.Section("unified_alerting") |
||||
var err error |
||||
cfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (SchedulerDefaultAdminConfigPollInterval).String())) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
cfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (AlertmanagerDefaultConfigPollInterval).String())) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
cfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (AlertmanagerDefaultPeerTimeout).String())) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
cfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (AlertmanagerDefaultGossipInterval).String())) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
cfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (AlertmanagerDefaultPushPullInterval).String())) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
cfg.HAListenAddr = ua.Key("ha_listen_address").MustString(AlertmanagerDefaultClusterAddr) |
||||
cfg.HAAdvertiseAddr = ua.Key("ha_advertise_address").MustString("") |
||||
peers := ua.Key("ha_peers").MustString("") |
||||
cfg.HAPeers = make([]string, 0) |
||||
if peers != "" { |
||||
for _, peer := range strings.Split(peers, ",") { |
||||
peer = strings.TrimSpace(peer) |
||||
cfg.HAPeers = append(cfg.HAPeers, peer) |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
@ -0,0 +1,39 @@ |
||||
package setting |
||||
|
||||
import ( |
||||
"testing" |
||||
"time" |
||||
|
||||
"github.com/stretchr/testify/require" |
||||
) |
||||
|
||||
func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) { |
||||
cfg := NewCfg() |
||||
err := cfg.Load(CommandLineArgs{HomePath: "../../", Config: "../../conf/defaults.ini"}) |
||||
require.NoError(t, err) |
||||
|
||||
// It sets the correct defaults.
|
||||
{ |
||||
require.Equal(t, 60*time.Second, cfg.AdminConfigPollInterval) |
||||
require.Equal(t, 60*time.Second, cfg.AlertmanagerConfigPollInterval) |
||||
require.Equal(t, 15*time.Second, cfg.HAPeerTimeout) |
||||
require.Equal(t, "0.0.0.0:9094", cfg.HAListenAddr) |
||||
require.Equal(t, "", cfg.HAAdvertiseAddr) |
||||
require.Len(t, cfg.HAPeers, 0) |
||||
require.Equal(t, 200*time.Millisecond, cfg.HAGossipInterval) |
||||
require.Equal(t, 60*time.Second, cfg.HAPushPullInterval) |
||||
} |
||||
|
||||
// With peers set, it correctly parses them.
|
||||
{ |
||||
require.Len(t, cfg.HAPeers, 0) |
||||
s, err := cfg.Raw.NewSection("unified_alerting") |
||||
require.NoError(t, err) |
||||
_, err = s.NewKey("ha_peers", "hostname1:9090,hostname2:9090,hostname3:9090") |
||||
require.NoError(t, err) |
||||
|
||||
require.NoError(t, cfg.ReadUnifiedAlertingSettings(cfg.Raw)) |
||||
require.Len(t, cfg.HAPeers, 3) |
||||
require.ElementsMatch(t, []string{"hostname1:9090", "hostname2:9090", "hostname3:9090"}, cfg.HAPeers) |
||||
} |
||||
} |
Loading…
Reference in new issue