From 7db97097c9d4586f45fbff4a7ab7b85dc7cba15b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 16 Sep 2021 15:33:51 +0100 Subject: [PATCH] Alerting: Support Unified Alerting with Grafana HA (#37920) * Alerting: Support Unified Alerting in Grafana's HA mode. --- conf/defaults.ini | 34 +- conf/sample.ini | 33 +- .../ha-test-unified-alerting/.gitignore | 1 + .../docker/ha-test-unified-alerting/README.md | 66 + .../docker-compose.yaml | 90 + .../grafana/provisioning/alerts.jsonnet | 203 + .../dashboards/alerts/overview.json | 172 + .../provisioning/dashboards/dashboards.yaml | 14 + .../dashboards/mysql/overview.json | 5397 +++++++++++++++++ .../provisioning/datasources/datasources.yaml | 16 + .../prometheus/prometheus.yml | 47 + docs/sources/administration/configuration.md | 46 +- pkg/services/ngalert/metrics/ngalert.go | 2 + pkg/services/ngalert/ngalert.go | 8 +- pkg/services/ngalert/notifier/alertmanager.go | 44 +- .../ngalert/notifier/alertmanager_test.go | 2 +- .../ngalert/notifier/multiorg_alertmanager.go | 78 +- .../notifier/multiorg_alertmanager_test.go | 22 +- .../ngalert/schedule/schedule_unit_test.go | 4 +- pkg/setting/setting.go | 28 +- pkg/setting/setting_unified_alerting.go | 57 + pkg/setting/setting_unified_alerting_test.go | 39 + .../alerting/api_admin_configuration_test.go | 6 +- .../api_alertmanager_configuration_test.go | 14 +- pkg/tests/testinfra/testinfra.go | 33 +- 25 files changed, 6377 insertions(+), 79 deletions(-) create mode 100644 devenv/docker/ha-test-unified-alerting/.gitignore create mode 100644 devenv/docker/ha-test-unified-alerting/README.md create mode 100644 devenv/docker/ha-test-unified-alerting/docker-compose.yaml create mode 100644 devenv/docker/ha-test-unified-alerting/grafana/provisioning/alerts.jsonnet create mode 100644 devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/alerts/overview.json create mode 100644 devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/dashboards.yaml create mode 
100644 devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/mysql/overview.json create mode 100644 devenv/docker/ha-test-unified-alerting/grafana/provisioning/datasources/datasources.yaml create mode 100644 devenv/docker/ha-test-unified-alerting/prometheus/prometheus.yml create mode 100644 pkg/setting/setting_unified_alerting.go create mode 100644 pkg/setting/setting_unified_alerting_test.go diff --git a/conf/defaults.ini b/conf/defaults.ini index af81a9e01c9..cb894729a52 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -211,7 +211,7 @@ rudderstack_data_plane_url = # Application Insights connection string. Specify an URL string to enable this feature. application_insights_connection_string = -# Optional. Specifies an Application Insights endpoint URL where the endpoint string is wrapped in backticks ``. +# Optional. Specifies an Application Insights endpoint URL where the endpoint string is wrapped in backticks ``. application_insights_endpoint_url = #################################### Security ############################ @@ -732,7 +732,37 @@ global_alert_rule = -1 #################################### Unified Alerting #################### [unified_alerting] # Specify the frequency of polling for admin config changes. -admin_config_poll_interval_seconds = 60 +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +admin_config_poll_interval = 60s + +# Specify the frequency of polling for Alertmanager config changes. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +alertmanager_config_poll_interval = 60s + +# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. 
+ha_listen_address = "0.0.0.0:9094" + +# Explicit address/hostname and port to advertise other Grafana instances. The port is used for both TCP and UDP. +ha_advertise_address = "" + +# Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting. +ha_peers = "" + +# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will +# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long should +# each instance wait before sending the notification to take into account replication lag. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +ha_peer_timeout = 15s + +# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated +# across cluster more quickly at the expense of increased bandwidth usage. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +ha_gossip_interval = 200ms + +# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds +# across larger clusters at the expense of increased bandwidth usage. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +ha_push_pull_interval = 60s #################################### Alerting ############################ [alerting] diff --git a/conf/sample.ini b/conf/sample.ini index 1c75d145d2f..54bfd8d3259 100644 --- a/conf/sample.ini +++ b/conf/sample.ini @@ -709,7 +709,38 @@ #################################### Unified Alerting #################### [unified_alerting] # Specify the frequency of polling for admin config changes. 
-;admin_config_poll_interval_seconds = 60 +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;admin_config_poll_interval = 60s + +# Specify the frequency of polling for Alertmanager config changes. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;alertmanager_config_poll_interval = 60s + +# Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. The default value is `0.0.0.0:9094`. +;ha_listen_address = "0.0.0.0:9094" + +# Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP. +;ha_advertise_address = "" + +# Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting. +;ha_peers = "" + +# Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will +# be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long should +# each instance wait before sending the notification to take into account replication lag. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;ha_peer_timeout = "15s" + +# The interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated +# across cluster more quickly at the expense of increased bandwidth usage. 
+# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;ha_gossip_interval = "200ms" + +# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds +# across larger clusters at the expense of increased bandwidth usage. +# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. +;ha_push_pull_interval = "60s" + #################################### Alerting ############################ [alerting] diff --git a/devenv/docker/ha-test-unified-alerting/.gitignore b/devenv/docker/ha-test-unified-alerting/.gitignore new file mode 100644 index 00000000000..0f4e139e204 --- /dev/null +++ b/devenv/docker/ha-test-unified-alerting/.gitignore @@ -0,0 +1 @@ +grafana/provisioning/dashboards/alerts/alert-* \ No newline at end of file diff --git a/devenv/docker/ha-test-unified-alerting/README.md b/devenv/docker/ha-test-unified-alerting/README.md new file mode 100644 index 00000000000..48400d91abc --- /dev/null +++ b/devenv/docker/ha-test-unified-alerting/README.md @@ -0,0 +1,66 @@ +# Grafana Unified Alerting High Availability (HA) test setup + +A set of docker compose services which together create a Grafana HA test setup for unified alerting. + +Included services + +- Grafana +- MySQL - Grafana configuration database, exporter for metrics and session storage +- Prometheus - Monitoring of Grafana and used as data source +- Nginx - Reverse proxy for Grafana and Prometheus. Enables browsing Grafana/Prometheus UI using a hostname + +## Prerequisites + +### Build grafana docker container + +Build a Grafana docker container from the current branch and commit, and tag it as grafana/grafana:dev. 
+ +```bash +$ cd +$ make build-docker-full +``` + +### Virtual host names + +#### Alternative 1 - Use dnsmasq + +```bash +$ sudo apt-get install dnsmasq +$ echo 'address=/loc/127.0.0.1' | sudo tee /etc/dnsmasq.d/dnsmasq-loc.conf > /dev/null +$ sudo /etc/init.d/dnsmasq restart +$ ping whatever.loc +PING whatever.loc (127.0.0.1) 56(84) bytes of data. +64 bytes from localhost (127.0.0.1): icmp_seq=1 ttl=64 time=0.076 ms +--- whatever.loc ping statistics --- +1 packet transmitted, 1 received, 0% packet loss, time 1998ms +``` + +#### Alternative 2 - Manually update /etc/hosts + +Update your `/etc/hosts` to be able to access Grafana and/or Prometheus UI using a hostname. + +```bash +$ cat /etc/hosts +127.0.0.1 grafana.loc +127.0.0.1 prometheus.loc +``` + +## Start services + +```bash +$ docker-compose up -d +``` + +Browse +- http://grafana.loc/ +- http://prometheus.loc/ + + +## Test alerting + +### Create contact points +TBD +### Create alerts +TBD +### Create silences +TBD diff --git a/devenv/docker/ha-test-unified-alerting/docker-compose.yaml b/devenv/docker/ha-test-unified-alerting/docker-compose.yaml new file mode 100644 index 00000000000..8fe5305674f --- /dev/null +++ b/devenv/docker/ha-test-unified-alerting/docker-compose.yaml @@ -0,0 +1,90 @@ +version: "2.1" + +services: + db: + image: mysql:5.6 + platform: linux/x86_64 + environment: + MYSQL_ROOT_PASSWORD: rootpass + MYSQL_DATABASE: grafana + MYSQL_USER: grafana + MYSQL_PASSWORD: password + command: [mysqld, --character-set-server=utf8mb4, --collation-server=utf8mb4_unicode_ci, --innodb_monitor_enable=all, --max-connections=1001] + ports: + - 3306 + healthcheck: + test: ["CMD", "mysqladmin" ,"ping", "-h", "localhost"] + timeout: 10s + retries: 10 + mysqld-exporter: + image: prom/mysqld-exporter + environment: + - DATA_SOURCE_NAME=root:rootpass@(db:3306)/ + ports: + - 9104 + depends_on: + db: + condition: service_healthy + prometheus: + image: prom/prometheus:v2.4.2 + volumes: + - ./prometheus/:/etc/prometheus/ + 
environment: + - VIRTUAL_HOST=prometheus.loc + ports: + - 9090 + nginx-proxy: + image: jwilder/nginx-proxy + ports: + - "80:80" + volumes: + - /var/run/docker.sock:/tmp/docker.sock:ro + depends_on: + db: + condition: service_healthy + grafana1: + image: grafana/grafana:dev + volumes: + - ./grafana/provisioning/:/etc/grafana/provisioning/ + environment: + - VIRTUAL_HOST=grafana.loc + - GF_FEATURE_TOGGLES_ENABLE=ngalert + - GF_UNIFIED_ALERTING_HA_PEERS=ha-test-unified-alerting_grafana2_1:9094,ha-test-unified-alerting_grafana1_1:9094 + - GF_SERVER_ROOT_URL=http://grafana.loc + - GF_DATABASE_NAME=grafana + - GF_DATABASE_USER=grafana + - GF_DATABASE_PASSWORD=password + - GF_DATABASE_TYPE=mysql + - GF_DATABASE_HOST=db:3306 + - GF_DATABASE_MAX_OPEN_CONN=300 + - GF_SESSION_PROVIDER=mysql + - GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(db:3306)/grafana?allowNativePasswords=true + ports: + - 3010:3000 + depends_on: + db: + condition: service_healthy + + grafana2: + image: grafana/grafana:dev + volumes: + - ./grafana/provisioning/:/etc/grafana/provisioning/ + environment: + - VIRTUAL_HOST=grafana.loc + - GF_FEATURE_TOGGLES_ENABLE=ngalert + - GF_UNIFIED_ALERTING_HA_PEERS=ha-test-unified-alerting_grafana2_1:9094,ha-test-unified-alerting_grafana1_1:9094 + - GF_SERVER_ROOT_URL=http://grafana.loc + - GF_DATABASE_NAME=grafana + - GF_DATABASE_USER=grafana + - GF_DATABASE_PASSWORD=password + - GF_DATABASE_TYPE=mysql + - GF_DATABASE_HOST=db:3306 + - GF_DATABASE_MAX_OPEN_CONN=300 + - GF_SESSION_PROVIDER=mysql + - GF_SESSION_PROVIDER_CONFIG=grafana:password@tcp(db:3306)/grafana?allowNativePasswords=true + ports: + - 3020:3000 + depends_on: + db: + condition: service_healthy + + diff --git a/devenv/docker/ha-test-unified-alerting/grafana/provisioning/alerts.jsonnet b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/alerts.jsonnet new file mode 100644 index 00000000000..e9b8abfbb9c --- /dev/null +++ 
b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/alerts.jsonnet 
@@ -0,0 +1,203 @@ +local numAlerts = std.extVar('alerts'); +local condition = std.extVar('condition'); +local arr = std.range(1, numAlerts); + +local alertDashboardTemplate = { + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 65 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "10s", + "handler": 1, + "for": "1m", + "name": "bulk alerting", + "noDataState": "no_data", + "notifications": [ + { + "id": 2 + } + ] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:117", + "expr": "go_goroutines", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 50 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Panel Title", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "New dashboard", + "uid": null, + "version": 0 +}; + + +{ + ['alert-' + std.toString(x) + '.json']: + alertDashboardTemplate + { + panels: [ + alertDashboardTemplate.panels[0] + + { + alert+: { + name: 'Alert rule ' + x, + conditions: [ + alertDashboardTemplate.panels[0].alert.conditions[0] + + { + evaluator+: { + params: [condition] + } + }, + ], + }, + }, + ], + uid: 'alert-' + x, + title: 'Alert ' + x + }, + for x in arr +} \ No newline at end of file diff --git a/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/alerts/overview.json b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/alerts/overview.json new file mode 100644 index 00000000000..53e33c37b1f --- /dev/null +++ b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/alerts/overview.json @@ -0,0 +1,172 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "aliasColors": { + "Active alerts": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "fill": 1, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + 
"min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Active grafana instances", + "dashes": true, + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(grafana_alerting_notification_sent_total[1m])) by(job)", + "format": "time_series", + "instant": false, + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "Notifications sent", + "refId": "A" + }, + { + "expr": "min(grafana_alerting_active_alerts) without(instance)", + "format": "time_series", + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "Active alerts", + "refId": "B" + }, + { + "expr": "count(up{job=\"grafana\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Active grafana instances", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Notifications sent vs active alerts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 3 + } + } + ], + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + 
"30d" + ] + }, + "timezone": "", + "title": "Overview", + "uid": "xHy7-hAik", + "version": 6 +} \ No newline at end of file diff --git a/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/dashboards.yaml b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/dashboards.yaml new file mode 100644 index 00000000000..ad85bb7036f --- /dev/null +++ b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/dashboards.yaml @@ -0,0 +1,14 @@ +apiVersion: 1 + +providers: + - name: 'Alerts' + folder: 'Alerts' + type: file + options: + path: /etc/grafana/provisioning/dashboards/alerts + + - name: 'MySQL' + folder: 'MySQL' + type: file + options: + path: /etc/grafana/provisioning/dashboards/mysql diff --git a/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/mysql/overview.json b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/mysql/overview.json new file mode 100644 index 00000000000..d072e4c1d28 --- /dev/null +++ b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/dashboards/mysql/overview.json @@ -0,0 +1,5397 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": false, + "hide": true, + "iconColor": "#e0752d", + "limit": 100, + "name": "PMM Annotations", + "showIn": 0, + "tags": [ + "pmm_annotation" + ], + "type": "tags" + }, + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": false, + "hide": true, + "iconColor": "#6ed0e0", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ + + ], + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "iteration": 1540971751770, + "links": [ + { + "icon": "dashboard", + "includeVars": true, + "keepTime": true, + "tags": [ + "QAN" + ], + "targetBlank": false, + "title": "Query Analytics", + "type": "link", + "url": "/graph/dashboard/db/_pmm-query-analytics" + }, + { + "asDropdown": true, + 
"includeVars": true, + "keepTime": true, + "tags": [ + "OS" + ], + "targetBlank": false, + "title": "OS", + "type": "dashboards" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "MySQL" + ], + "targetBlank": false, + "title": "MySQL", + "type": "dashboards" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "MongoDB" + ], + "targetBlank": false, + "title": "MongoDB", + "type": "dashboards" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "PostgreSQL" + ], + "targetBlank": false, + "title": "PostgreSQL", + "type": "dashboards" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "HA" + ], + "targetBlank": false, + "title": "HA", + "type": "dashboards" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "Cloud" + ], + "targetBlank": false, + "title": "Cloud", + "type": "dashboards" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "Insight" + ], + "targetBlank": false, + "title": "Insight", + "type": "dashboards" + }, + { + "asDropdown": true, + "includeVars": true, + "keepTime": true, + "tags": [ + "PMM" + ], + "targetBlank": false, + "title": "PMM", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 382, + "panels": [ + + ], + "repeat": null, + "title": "", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "decimals": 1, + "description": "**MySQL Uptime**\n\nThe amount of time since the last restart of the MySQL server process.", + "editable": true, + "error": false, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, 
+ "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 1 + }, + "height": "125px", + "id": 12, + "interval": "$interval", + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "s", + "postfixFontSize": "80%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "calculatedInterval": "10m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_status_uptime{instance=\"$host\"}", + "format": "time_series", + "interval": "5m", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 300 + } + ], + "thresholds": "300,3600", + "title": "MySQL Uptime", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "decimals": 2, + "description": "**Current QPS**\n\nBased on the queries reported by MySQL's ``SHOW STATUS`` command, it is the number of statements executed by the server within the last second. This variable includes statements executed within stored programs, unlike the Questions variable. 
It does not count \n``COM_PING`` or ``COM_STATISTICS`` commands.", + "editable": true, + "error": false, + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 6, + "y": 1 + }, + "height": "125px", + "id": 13, + "interval": "$interval", + "links": [ + { + "targetBlank": true, + "title": "MySQL Server Status Variables", + "type": "absolute", + "url": "https://dev.mysql.com/doc/refman/5.7/en/server-status-variables.html#statvar_Queries" + } + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "calculatedInterval": "10m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_queries{instance=\"$host\"}[$interval]) or irate(mysql_global_status_queries{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": "35,75", + "title": "Current QPS", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "Prometheus", + "decimals": 0, + "description": "**InnoDB Buffer Pool Size**\n\nInnoDB maintains a 
storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.", + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 1 + }, + "height": "125px", + "id": 51, + "interval": "$interval", + "links": [ + { + "targetBlank": true, + "title": "Tuning the InnoDB Buffer Pool Size", + "type": "absolute", + "url": "https://www.percona.com/blog/2015/06/02/80-ram-tune-innodb_buffer_pool_size/" + } + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "calculatedInterval": "10m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_variables_innodb_buffer_pool_size{instance=\"$host\"}", + "format": "time_series", + "interval": "5m", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 300 + } + ], + "thresholds": "90,95", + "title": "InnoDB Buffer Pool Size", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + + ], + "valueName": "current" + }, + { + 
"cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "Prometheus", + "decimals": 0, + "description": "**InnoDB Buffer Pool Size % of Total RAM**\n\nInnoDB maintains a storage area called the buffer pool for caching data and indexes in memory. Knowing how the InnoDB buffer pool works, and taking advantage of it to keep frequently accessed data in memory, is one of the most important aspects of MySQL tuning. The goal is to keep the working set in memory. In most cases, this should be between 60%-90% of available memory on a dedicated database host, but depends on many factors.", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 1 + }, + "height": "125px", + "id": 52, + "interval": "$interval", + "links": [ + { + "targetBlank": true, + "title": "Tuning the InnoDB Buffer Pool Size", + "type": "absolute", + "url": "https://www.percona.com/blog/2015/06/02/80-ram-tune-innodb_buffer_pool_size/" + } + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "repeat": null, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "maxValue": 100, + "minValue": 0, + "show": true + }, + "tableColumn": "", + "targets": [ + { + "calculatedInterval": "10m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": 
"(mysql_global_variables_innodb_buffer_pool_size{instance=\"$host\"} * 100) / on (instance) node_memory_MemTotal{instance=\"$host\"}", + "format": "time_series", + "interval": "5m", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 300 + } + ], + "thresholds": "40,80", + "title": "Buffer Pool Size of Total RAM", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + + ], + "valueName": "current" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 383, + "panels": [ + + ], + "repeat": null, + "title": "Connections", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 0, + "description": "**Max Connections** \n\nMax Connections is the maximum permitted number of simultaneous client connections. By default, this is 151. Increasing this value increases the number of file descriptors that mysqld requires. If the required number of descriptors are not available, the server reduces the value of Max Connections.\n\nmysqld actually permits Max Connections + 1 clients to connect. 
The extra connection is reserved for use by accounts that have the SUPER privilege, such as root.\n\nMax Used Connections is the maximum number of connections that have been in use simultaneously since the server started.\n\nConnections is the number of connection attempts (successful or not) to the MySQL server.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 4 + }, + "height": "250px", + "id": 92, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + { + "targetBlank": true, + "title": "MySQL Server System Variables", + "type": "absolute", + "url": "https://dev.mysql.com/doc/refman/5.7/en/server-system-variables.html#sysvar_max_connections" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Max Connections", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "max(max_over_time(mysql_global_status_threads_connected{instance=\"$host\"}[$interval]) or mysql_global_status_threads_connected{instance=\"$host\"} )", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Connections", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_status_max_used_connections{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Max Used Connections", + "metric": "", + "refId": "C", + "step": 20, + "target": "" + }, + { + "calculatedInterval": "2m", + 
"datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_variables_max_connections{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Max Connections", + "metric": "", + "refId": "B", + "step": 20, + "target": "" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Connections", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Active Threads**\n\nThreads Connected is the number of open connections, while Threads Running is the number of threads not sleeping.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Peak Threads Running", + "color": "#E24D42", + "lines": false, + "pointradius": 1, + "points": true + }, + { + "alias": "Peak Threads Connected", + "color": "#1F78C1" + }, + { + "alias": "Avg Threads Running", + 
"color": "#EAB839" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "max_over_time(mysql_global_status_threads_connected{instance=\"$host\"}[$interval]) or\nmax_over_time(mysql_global_status_threads_connected{instance=\"$host\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Peak Threads Connected", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "max_over_time(mysql_global_status_threads_running{instance=\"$host\"}[$interval]) or\nmax_over_time(mysql_global_status_threads_running{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Peak Threads Running", + "metric": "", + "refId": "B", + "step": 20 + }, + { + "expr": "avg_over_time(mysql_global_status_threads_running{instance=\"$host\"}[$interval]) or \navg_over_time(mysql_global_status_threads_running{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Avg Threads Running", + "refId": "C", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Client Thread Activity", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + "total" + ] + }, + "yaxes": [ + { + "format": "short", + "label": "Threads", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + 
"gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 384, + "panels": [ + + ], + "repeat": null, + "title": "Table Locks", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Questions**\n\nThe number of statements executed by the server. This includes only statements sent to the server by clients and not statements executed within stored programs, unlike the Queries used in the QPS calculation. \n\nThis variable does not count the following commands:\n* ``COM_PING``\n* ``COM_STATISTICS``\n* ``COM_STMT_PREPARE``\n* ``COM_STMT_CLOSE``\n* ``COM_STMT_RESET``", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 53, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + { + "targetBlank": true, + "title": "MySQL Queries and Questions", + "type": "absolute", + "url": "https://www.percona.com/blog/2014/05/29/how-mysql-queries-and-questions-are-measured/" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_questions{instance=\"$host\"}[$interval]) or irate(mysql_global_status_questions{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Questions", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": 
"MySQL Questions", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Thread Cache**\n\nThe thread_cache_size variable sets how many threads the server should cache to reuse. When a client disconnects, the client's threads are put in the cache if the cache is not full. It is autosized in MySQL 5.6.8 and above (capped to 100). Requests for threads are satisfied by reusing threads taken from the cache if possible, and only when the cache is empty is a new thread created.\n\n* *Threads_created*: The number of threads created to handle connections.\n* *Threads_cached*: The number of threads in the thread cache.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + { + "title": "Tuning information", + "type": "absolute", + "url": "https://dev.mysql.com/doc/refman/5.6/en/server-system-variables.html#sysvar_thread_cache_size" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Threads Created", + "fill": 0 + } + ], + "spaceLength": 
10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_variables_thread_cache_size{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Thread Cache Size", + "metric": "", + "refId": "B", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_status_threads_cached{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Threads Cached", + "metric": "", + "refId": "C", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_threads_created{instance=\"$host\"}[$interval]) or irate(mysql_global_status_threads_created{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Threads Created", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Thread Cache", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 385, + "panels": [ + + ], + "repeat": null, + "title": "Temporary Objects", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, 
+ "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_created_tmp_tables{instance=\"$host\"}[$interval]) or irate(mysql_global_status_created_tmp_tables{instance=\"$host\"}[5m])", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Created Tmp Tables", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_created_tmp_disk_tables{instance=\"$host\"}[$interval]) or irate(mysql_global_status_created_tmp_disk_tables{instance=\"$host\"}[5m])", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Created Tmp Disk Tables", + "metric": "", + "refId": "B", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_created_tmp_files{instance=\"$host\"}[$interval]) or irate(mysql_global_status_created_tmp_files{instance=\"$host\"}[5m])", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Created Tmp Files", + "metric": "", + "refId": "C", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Temporary Objects", + "tooltip": { + "msResolution": 
false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Select Types**\n\nAs with most relational databases, selecting based on indexes is more efficient than scanning an entire table's data. Here we see the counters for selects not done with indexes.\n\n* ***Select Scan*** is how many queries caused full table scans, in which all the data in the table had to be read and either discarded or returned.\n* ***Select Range*** is how many queries used a range scan, which means MySQL scanned all rows in a given range.\n* ***Select Full Join*** is the number of joins that are not joined on an index, this is usually a huge performance hit.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 20 + }, + "height": "250px", + "id": 311, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": 
"rate(mysql_global_status_select_full_join{instance=\"$host\"}[$interval]) or irate(mysql_global_status_select_full_join{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Select Full Join", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_select_full_range_join{instance=\"$host\"}[$interval]) or irate(mysql_global_status_select_full_range_join{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Select Full Range Join", + "metric": "", + "refId": "B", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_select_range{instance=\"$host\"}[$interval]) or irate(mysql_global_status_select_range{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Select Range", + "metric": "", + "refId": "C", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_select_range_check{instance=\"$host\"}[$interval]) or irate(mysql_global_status_select_range_check{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Select Range Check", + "metric": "", + "refId": "D", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_select_scan{instance=\"$host\"}[$interval]) or irate(mysql_global_status_select_scan{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Select Scan", + "metric": "", + "refId": "E", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + 
"timeShift": null, + "title": "MySQL Select Types", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 386, + "panels": [ + + ], + "repeat": null, + "title": "Sorts", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Sorts**\n\nDue to a query's structure, order, or other requirements, MySQL sorts the rows before returning them. For example, if a table is ordered 1 to 10 but you want the results reversed, MySQL then has to sort the rows to return 10 to 1.\n\nThis graph also shows when sorts had to scan a whole table or a given range of a table in order to return the results and which could not have been sorted via an index.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 30, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + 
"datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_sort_rows{instance=\"$host\"}[$interval]) or irate(mysql_global_status_sort_rows{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Sort Rows", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_sort_range{instance=\"$host\"}[$interval]) or irate(mysql_global_status_sort_range{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Sort Range", + "metric": "", + "refId": "B", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_sort_merge_passes{instance=\"$host\"}[$interval]) or irate(mysql_global_status_sort_merge_passes{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Sort Merge Passes", + "metric": "", + "refId": "C", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_sort_scan{instance=\"$host\"}[$interval]) or irate(mysql_global_status_sort_scan{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Sort Scan", + "metric": "", + "refId": "D", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Sorts", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + 
"logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Slow Queries**\n\nSlow queries are defined as queries being slower than the long_query_time setting. For example, if you have long_query_time set to 3, all queries that take longer than 3 seconds to complete will show on this graph.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 48, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_slow_queries{instance=\"$host\"}[$interval]) or irate(mysql_global_status_slow_queries{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Slow Queries", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Slow Queries", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": 
"short", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 387, + "panels": [ + + ], + "repeat": null, + "title": "Aborted", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**Aborted Connections**\n\nWhen a given host connects to MySQL and the connection is interrupted in the middle (for example due to bad credentials), MySQL keeps that info in a system table (since 5.6 this table is exposed in performance_schema).\n\nIf the amount of failed requests without a successful connection reaches the value of max_connect_errors, mysqld assumes that something is wrong and blocks the host from further connection.\n\nTo allow connections from that host again, you need to issue the ``FLUSH HOSTS`` statement.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 47, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_aborted_connects{instance=\"$host\"}[$interval]) or irate(mysql_global_status_aborted_connects{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Aborted Connects 
(attempts)", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_aborted_clients{instance=\"$host\"}[$interval]) or irate(mysql_global_status_aborted_clients{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Aborted Clients (timeout)", + "metric": "", + "refId": "B", + "step": 20, + "target": "" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Aborted Connections", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**Table Locks**\n\nMySQL takes a number of different locks for varying reasons. In this graph we see how many Table level locks MySQL has requested from the storage engine. In the case of InnoDB, many times the locks could actually be row locks as it only takes table level locks in a few specific cases.\n\nIt is most useful to compare Locks Immediate and Locks Waited. If Locks waited is rising, it means you have lock contention. 
Otherwise, Locks Immediate rising and falling is normal activity.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 32, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_table_locks_immediate{instance=\"$host\"}[$interval]) or irate(mysql_global_status_table_locks_immediate{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Table Locks Immediate", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_table_locks_waited{instance=\"$host\"}[$interval]) or irate(mysql_global_status_table_locks_waited{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Table Locks Waited", + "metric": "", + "refId": "B", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Table Locks", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": 
"short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 43 + }, + "id": 388, + "panels": [ + + ], + "repeat": null, + "title": "Network", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Network Traffic**\n\nHere we can see how much network traffic is generated by MySQL. Outbound is network traffic sent from MySQL and Inbound is network traffic MySQL has received.", + "editable": true, + "error": false, + "fill": 6, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_bytes_received{instance=\"$host\"}[$interval]) or irate(mysql_global_status_bytes_received{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Inbound", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_bytes_sent{instance=\"$host\"}[$interval]) or irate(mysql_global_status_bytes_sent{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + 
"legendFormat": "Outbound", + "metric": "", + "refId": "B", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Network Traffic", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "none", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Network Usage Hourly**\n\nHere we can see how much network traffic is generated by MySQL per hour. You can use the bar graph to compare data sent by MySQL and data received by MySQL.", + "editable": true, + "error": false, + "fill": 6, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 44 + }, + "height": "250px", + "id": 381, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "increase(mysql_global_status_bytes_received{instance=\"$host\"}[1h])", + "format": "time_series", + "interval": "1h", + "intervalFactor": 1, + "legendFormat": "Received", + "metric": "", + "refId": "A", + "step": 3600 + }, + { + 
"calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "increase(mysql_global_status_bytes_sent{instance=\"$host\"}[1h])", + "format": "time_series", + "interval": "1h", + "intervalFactor": 1, + "legendFormat": "Sent", + "metric": "", + "refId": "B", + "step": 3600 + } + ], + "thresholds": [ + + ], + "timeFrom": "24h", + "timeShift": null, + "title": "MySQL Network Usage Hourly", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "none", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 51 + }, + "id": 389, + "panels": [ + + ], + "repeat": null, + "title": "Memory", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 0, + "description": "***System Memory***: Total Memory for the system.\\\n***InnoDB Buffer Pool Data***: InnoDB maintains a storage area called the buffer pool for caching data and indexes in memory.\\\n***TokuDB Cache Size***: Similar in function to the InnoDB Buffer Pool, TokuDB will allocate 50% of the installed RAM for its own cache.\\\n***Key Buffer Size***: Index blocks for MYISAM tables are buffered and are shared by all threads. 
key_buffer_size is the size of the buffer used for index blocks.\\\n***Adaptive Hash Index Size***: When InnoDB notices that some index values are being accessed very frequently, it builds a hash index for them in memory on top of B-Tree indexes.\\\n ***Query Cache Size***: The query cache stores the text of a SELECT statement together with the corresponding result that was sent to the client. The query cache has huge scalability problems in that only one thread can do an operation in the query cache at the same time.\\\n***InnoDB Dictionary Size***: The data dictionary is InnoDB ‘s internal catalog of tables. InnoDB stores the data dictionary on disk, and loads entries into memory while the server is running.\\\n***InnoDB Log Buffer Size***: The MySQL InnoDB log buffer allows transactions to run without having to write the log to disk before the transactions commit.", + "editable": true, + "error": false, + "fill": 6, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 52 + }, + "id": 50, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + { + "title": "Detailed descriptions about metrics", + "type": "absolute", + "url": "https://www.percona.com/doc/percona-monitoring-and-management/dashboard.mysql-overview.html#mysql-internal-memory-overview" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "System Memory", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$host\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "System Memory", + "refId": "G", + 
"step": 4 + }, + { + "expr": "mysql_global_status_innodb_page_size{instance=\"$host\"} * on (instance) mysql_global_status_buffer_pool_pages{instance=\"$host\",state=\"data\"}", + "format": "time_series", + "hide": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "InnoDB Buffer Pool Data", + "refId": "A", + "step": 20 + }, + { + "expr": "mysql_global_variables_innodb_log_buffer_size{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "InnoDB Log Buffer Size", + "refId": "D", + "step": 20 + }, + { + "expr": "mysql_global_variables_innodb_additional_mem_pool_size{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 2, + "legendFormat": "InnoDB Additional Memory Pool Size", + "refId": "H", + "step": 40 + }, + { + "expr": "mysql_global_status_innodb_mem_dictionary{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "InnoDB Dictionary Size", + "refId": "F", + "step": 20 + }, + { + "expr": "mysql_global_variables_key_buffer_size{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Key Buffer Size", + "refId": "B", + "step": 20 + }, + { + "expr": "mysql_global_variables_query_cache_size{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Query Cache Size", + "refId": "C", + "step": 20 + }, + { + "expr": "mysql_global_status_innodb_mem_adaptive_hash{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Adaptive Hash Index Size", + "refId": "E", + "step": 20 + }, + { + "expr": "mysql_global_variables_tokudb_cache_size{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "TokuDB Cache Size", + "refId": "I", + "step": 20 + } + 
], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Internal Memory Overview", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 59 + }, + "id": 390, + "panels": [ + + ], + "repeat": null, + "title": "Command, Handlers, Processes", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**Top Command Counters**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. 
Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 14, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + { + "title": "Server Status Variables (Com_xxx)", + "type": "absolute", + "url": "https://dev.mysql.com/doc/refman/5.7/en/server-status-variables.html#statvar_Com_xxx" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "topk(5, rate(mysql_global_status_commands_total{instance=\"$host\"}[$interval])>0) or topk(5, irate(mysql_global_status_commands_total{instance=\"$host\"}[5m])>0)", + "format": "time_series", + "hide": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Com_{{ command }}", + "metric": "", + "refId": "B", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Command Counters", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", 
+ "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**Top Command Counters Hourly**\n\nThe Com_{{xxx}} statement counter variables indicate the number of times each xxx statement has been executed. There is one status variable for each type of statement. For example, Com_delete and Com_update count [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements, respectively. Com_delete_multi and Com_update_multi are similar but apply to [``DELETE``](https://dev.mysql.com/doc/refman/5.7/en/delete.html) and [``UPDATE``](https://dev.mysql.com/doc/refman/5.7/en/update.html) statements that use multiple-table syntax.", + "editable": true, + "error": false, + "fill": 6, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 67 + }, + "id": 39, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 2, + "links": [ + { + "dashboard": "https://dev.mysql.com/doc/refman/5.7/en/server-status-variables.html#statvar_Com_xxx", + "title": "Server Status Variables (Com_xxx)", + "type": "absolute", + "url": "https://dev.mysql.com/doc/refman/5.7/en/server-status-variables.html#statvar_Com_xxx" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "topk(5, 
increase(mysql_global_status_commands_total{instance=\"$host\"}[1h])>0)", + "format": "time_series", + "interval": "1h", + "intervalFactor": 1, + "legendFormat": "Com_{{ command }}", + "metric": "", + "refId": "A", + "step": 3600 + } + ], + "thresholds": [ + + ], + "timeFrom": "24h", + "timeShift": null, + "title": "Top Command Counters Hourly", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Handlers**\n\nHandler statistics are internal statistics on how MySQL is selecting, updating, inserting, and modifying rows, tables, and indexes.\n\nThis is in fact the layer between the Storage Engine and MySQL.\n\n* `read_rnd_next` is incremented when the server performs a full table scan and this is a counter you don't really want to see with a high value.\n* `read_key` is incremented when a read is done with an index.\n* `read_next` is incremented when the storage engine is asked to 'read the next index entry'. 
A high value means a lot of index scans are being done.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 74 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_handlers_total{instance=\"$host\", handler!~\"commit|rollback|savepoint.*|prepare\"}[$interval]) or irate(mysql_global_status_handlers_total{instance=\"$host\", handler!~\"commit|rollback|savepoint.*|prepare\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{ handler }}", + "metric": "", + "refId": "J", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Handlers", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, 
+ "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 81 + }, + "id": 28, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_handlers_total{instance=\"$host\", handler=~\"commit|rollback|savepoint.*|prepare\"}[$interval]) or irate(mysql_global_status_handlers_total{instance=\"$host\", handler=~\"commit|rollback|savepoint.*|prepare\"}[5m])", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{ handler }}", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Transaction Handlers", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 88 + }, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": true, + "current": 
false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_info_schema_threads{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{ state }}", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Process States", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 6, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 95 + }, + "id": 49, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "topk(5, avg_over_time(mysql_info_schema_threads{instance=\"$host\"}[1h]))", + "interval": "1h", + "intervalFactor": 1, + "legendFormat": "{{ state }}", + "metric": "", + "refId": "A", + "step": 3600 + } + ], + "thresholds": [ + + ], + "timeFrom": "24h", + "timeShift": null, + "title": "Top Process States Hourly", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 102 + }, + "id": 391, + "panels": [ + + ], + "repeat": null, + "title": "Query Cache", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Query Cache Memory**\n\nThe query cache has huge scalability problems in that only one thread can do an operation in the query cache at the same time. This serialization is true not only for SELECTs, but also for INSERT/UPDATE/DELETE.\n\nThis also means that the larger the `query_cache_size` is set to, the slower those operations become. In concurrent environments, the MySQL Query Cache quickly becomes a contention point, decreasing performance. 
MariaDB and AWS Aurora have done work to try and eliminate the query cache contention in their flavors of MySQL, while MySQL 8.0 has eliminated the query cache feature.\n\nThe recommended settings for most environments is to set:\n ``query_cache_type=0``\n ``query_cache_size=0``\n\nNote that while you can dynamically change these values, to completely remove the contention point you have to restart the database.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 103 + }, + "id": 46, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_status_qcache_free_memory{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Free Memory", + "metric": "", + "refId": "F", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_variables_query_cache_size{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Query Cache Size", + "metric": "", + "refId": "E", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Query Cache Memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + 
"show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Query Cache Activity**\n\nThe query cache has huge scalability problems in that only one thread can do an operation in the query cache at the same time. This serialization is true not only for SELECTs, but also for INSERT/UPDATE/DELETE.\n\nThis also means that the larger the `query_cache_size` is set to, the slower those operations become. In concurrent environments, the MySQL Query Cache quickly becomes a contention point, decreasing performance. MariaDB and AWS Aurora have done work to try and eliminate the query cache contention in their flavors of MySQL, while MySQL 8.0 has eliminated the query cache feature.\n\nThe recommended settings for most environments is to set:\n``query_cache_type=0``\n``query_cache_size=0``\n\nNote that while you can dynamically change these values, to completely remove the contention point you have to restart the database.", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 103 + }, + "height": "", + "id": 45, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + 
"calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_qcache_hits{instance=\"$host\"}[$interval]) or irate(mysql_global_status_qcache_hits{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Hits", + "metric": "", + "refId": "B", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_qcache_inserts{instance=\"$host\"}[$interval]) or irate(mysql_global_status_qcache_inserts{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Inserts", + "metric": "", + "refId": "C", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_qcache_not_cached{instance=\"$host\"}[$interval]) or irate(mysql_global_status_qcache_not_cached{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Not Cached", + "metric": "", + "refId": "D", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_qcache_lowmem_prunes{instance=\"$host\"}[$interval]) or irate(mysql_global_status_qcache_lowmem_prunes{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Prunes", + "metric": "", + "refId": "F", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_status_qcache_queries_in_cache{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Queries in Cache", + "metric": "", + "refId": "E", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": 
"MySQL Query Cache Activity", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 110 + }, + "id": 392, + "panels": [ + + ], + "repeat": null, + "title": "Files and Tables", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 111 + }, + "id": 43, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_opened_files{instance=\"$host\"}[$interval]) or irate(mysql_global_status_opened_files{instance=\"$host\"}[5m])", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Openings", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL File Openings", + "tooltip": { + "msResolution": false, + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 111 + }, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_status_open_files{instance=\"$host\"}", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Open Files", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_variables_open_files_limit{instance=\"$host\"}", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Open Files Limit", + "metric": "", + "refId": "D", + "step": 20 + }, + { + "expr": "mysql_global_status_innodb_num_open_files{instance=\"$host\"}", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "InnoDB Open Files", + "refId": "B", + "step": 20 + } + ], + 
"thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Open Files", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 118 + }, + "id": 393, + "panels": [ + + ], + "repeat": null, + "title": "Table Openings", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Table Open Cache Status**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. 
If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 119 + }, + "id": 44, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + { + "title": "Server Status Variables (table_open_cache)", + "type": "absolute", + "url": "http://dev.mysql.com/doc/refman/5.6/en/server-system-variables.html#sysvar_table_open_cache" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Table Open Cache Hit Ratio", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(mysql_global_status_opened_tables{instance=\"$host\"}[$interval]) or irate(mysql_global_status_opened_tables{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Openings", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "expr": "rate(mysql_global_status_table_open_cache_hits{instance=\"$host\"}[$interval]) or irate(mysql_global_status_table_open_cache_hits{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Hits", + "refId": "B", + "step": 20 + }, + { + "expr": "rate(mysql_global_status_table_open_cache_misses{instance=\"$host\"}[$interval]) or 
irate(mysql_global_status_table_open_cache_misses{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Misses", + "refId": "C", + "step": 20 + }, + { + "expr": "rate(mysql_global_status_table_open_cache_overflows{instance=\"$host\"}[$interval]) or irate(mysql_global_status_table_open_cache_overflows{instance=\"$host\"}[5m])", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Misses due to Overflows", + "refId": "D", + "step": 20 + }, + { + "expr": "(rate(mysql_global_status_table_open_cache_hits{instance=\"$host\"}[$interval]) or irate(mysql_global_status_table_open_cache_hits{instance=\"$host\"}[5m]))/((rate(mysql_global_status_table_open_cache_hits{instance=\"$host\"}[$interval]) or irate(mysql_global_status_table_open_cache_hits{instance=\"$host\"}[5m]))+(rate(mysql_global_status_table_open_cache_misses{instance=\"$host\"}[$interval]) or irate(mysql_global_status_table_open_cache_misses{instance=\"$host\"}[5m])))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Table Open Cache Hit Ratio", + "refId": "E", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Table Open Cache Status", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "percentunit", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Open Tables**\n\nThe 
recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 119 + }, + "id": 42, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + { + "title": "Server Status Variables (table_open_cache)", + "type": "absolute", + "url": "http://dev.mysql.com/doc/refman/5.6/en/server-system-variables.html#sysvar_table_open_cache" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_status_open_tables{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Open Tables", + "metric": "", + "refId": "B", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_variables_table_open_cache{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Table Open Cache", + "metric": "", + "refId": "C", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Open Tables", + "tooltip": { 
+ "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 126 + }, + "id": 394, + "panels": [ + + ], + "repeat": null, + "title": "MySQL Table Definition Cache", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "description": "**MySQL Table Definition Cache**\n\nThe recommendation is to set the `table_open_cache_instances` to a loose correlation to virtual CPUs, keeping in mind that more instances means the cache is split more times. 
If you have a cache set to 500 but it has 10 instances, each cache will only have 50 cached.\n\nThe `table_definition_cache` and `table_open_cache` can be left as default as they are auto-sized MySQL 5.6 and above (ie: do not set them to any value).", + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 127 + }, + "id": 54, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + { + "title": "Server Status Variables (table_open_cache)", + "type": "absolute", + "url": "http://dev.mysql.com/doc/refman/5.6/en/server-system-variables.html#sysvar_table_open_cache" + } + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Opened Table Definitions", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_status_open_table_definitions{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Open Table Definitions", + "metric": "", + "refId": "B", + "step": 20 + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "mysql_global_variables_table_definition_cache{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Table Definitions Cache Size", + "metric": "", + "refId": "C", + "step": 20 + }, + { + "expr": "rate(mysql_global_status_opened_table_definitions{instance=\"$host\"}[$interval]) or irate(mysql_global_status_opened_table_definitions{instance=\"$host\"}[5m])", + "format": 
"time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Opened Table Definitions", + "refId": "A", + "step": 20 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "MySQL Table Definition Cache", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 134 + }, + "id": 395, + "panels": [ + + ], + "repeat": null, + "title": "System Charts", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 135 + }, + "id": 31, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(node_vmstat_pgpgin{instance=\"$host\"}[$interval]) * 1024 or irate(node_vmstat_pgpgin{instance=\"$host\"}[5m]) * 1024", + "format": "time_series", + "interval": "$interval", 
+ "intervalFactor": 1, + "legendFormat": "Page In", + "metric": "", + "refId": "A", + "step": 20, + "target": "" + }, + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(node_vmstat_pgpgout{instance=\"$host\"}[$interval]) * 1024 or irate(node_vmstat_pgpgout{instance=\"$host\"}[5m]) * 1024", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Page Out", + "metric": "", + "refId": "B", + "step": 20, + "target": "" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "I/O Activity", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": null, + "editable": true, + "error": false, + "fill": 6, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 135 + }, + "height": "250px", + "id": 37, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2s", + 
"datasourceErrors": { + + }, + "errors": { + + }, + "expr": "max(node_memory_MemTotal{instance=\"$host\"}) without(job) - \n(max(node_memory_MemFree{instance=\"$host\"}) without(job) + \nmax(node_memory_Buffers{instance=\"$host\"}) without(job) + \n(max(node_memory_Cached{instance=\"$host\",job=~\"rds-enhanced|linux\"}) without (job) or \nmax(node_memory_Cached{instance=\"$host\",job=\"rds-basic\"}) without (job)))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Used", + "metric": "", + "refId": "A", + "step": 20, + "target": "" + }, + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "node_memory_MemFree{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Free", + "metric": "", + "refId": "B", + "step": 20, + "target": "" + }, + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "node_memory_Buffers{instance=\"$host\"}", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Buffers", + "metric": "", + "refId": "D", + "step": 20, + "target": "" + }, + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "max(node_memory_Cached{instance=~\"$host\",job=~\"rds-enhanced|linux\"}) without (job) or \nmax(node_memory_Cached{instance=~\"$host\",job=~\"rds-basic\"}) without (job)", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Cached", + "metric": "", + "refId": "E", + "step": 20, + "target": "" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Distribution", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + 
+ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Load 1m": "#58140C", + "Max Core Utilization": "#bf1b00", + "iowait": "#e24d42", + "nice": "#1f78c1", + "softirq": "#806eb7", + "system": "#eab839", + "user": "#508642" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": null, + "editable": true, + "error": false, + "fill": 6, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 142 + }, + "height": "", + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Max Core Utilization", + "lines": false, + "pointradius": 1, + "points": true, + "stack": false + }, + { + "alias": "Load 1m", + "color": "#58140C", + "fill": 2, + "legend": false, + "stack": false, + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "clamp_max(((avg by (mode) ( (clamp_max(rate(node_cpu{instance=\"$host\",mode!=\"idle\"}[$interval]),1)) or (clamp_max(irate(node_cpu{instance=\"$host\",mode!=\"idle\"}[5m]),1)) ))*100 or (avg_over_time(node_cpu_average{instance=~\"$host\", mode!=\"total\", mode!=\"idle\"}[$interval]) or avg_over_time(node_cpu_average{instance=~\"$host\", mode!=\"total\", mode!=\"idle\"}[5m]))),100)", + 
"format": "time_series", + "hide": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{ mode }}", + "metric": "", + "refId": "A", + "step": 20 + }, + { + "expr": "clamp_max(max by () (sum by (cpu) ( (clamp_max(rate(node_cpu{instance=\"$host\",mode!=\"idle\",mode!=\"iowait\"}[$interval]),1)) or (clamp_max(irate(node_cpu{instance=\"$host\",mode!=\"idle\",mode!=\"iowait\"}[5m]),1)) ))*100,100)", + "format": "time_series", + "hide": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Max Core Utilization", + "refId": "B", + "step": 20 + }, + { + "expr": "node_load1{instance=\"$host\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Load 1m", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage / Load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "decimals": 1, + "format": "percent", + "label": "", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "none", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 142 + }, + "height": "250px", + "id": 36, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 2, + "links": [ + + ], + 
"nullPointMode": "null", + "percentage": false, + "pointradius": 1, + "points": true, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "sum((rate(node_disk_read_time_ms{device!~\"dm-.+\", instance=\"$host\"}[$interval]) / rate(node_disk_reads_completed{device!~\"dm-.+\", instance=\"$host\"}[$interval])) or (irate(node_disk_read_time_ms{device!~\"dm-.+\", instance=\"$host\"}[5m]) / irate(node_disk_reads_completed{device!~\"dm-.+\", instance=\"$host\"}[5m]))\nor avg_over_time(aws_rds_read_latency_average{instance=\"$host\"}[$interval]) or avg_over_time(aws_rds_read_latency_average{instance=\"$host\"}[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Read", + "metric": "", + "refId": "A", + "step": 20, + "target": "" + }, + { + "calculatedInterval": "2m", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "sum((rate(node_disk_write_time_ms{device!~\"dm-.+\", instance=\"$host\"}[$interval]) / rate(node_disk_writes_completed{device!~\"dm-.+\", instance=\"$host\"}[$interval])) or (irate(node_disk_write_time_ms{device!~\"dm-.+\", instance=\"$host\"}[5m]) / irate(node_disk_writes_completed{device!~\"dm-.+\", instance=\"$host\"}[5m])) or \navg_over_time(aws_rds_write_latency_average{instance=\"$host\"}[$interval]) or avg_over_time(aws_rds_write_latency_average{instance=\"$host\"}[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Write", + "metric": "", + "refId": "B", + "step": 20, + "target": "" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Latency", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": "", + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": null, + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 149 + }, + "height": "250px", + "id": 21, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Outbound", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "sum(rate(node_network_receive_bytes{instance=\"$host\", device!=\"lo\"}[$interval])) or sum(irate(node_network_receive_bytes{instance=\"$host\", device!=\"lo\"}[5m])) or sum(max_over_time(rdsosmetrics_network_rx{instance=\"$host\"}[$interval])) or sum(max_over_time(rdsosmetrics_network_rx{instance=\"$host\"}[5m])) ", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Inbound", + "metric": "", + "refId": "B", + "step": 20, + "target": "" + }, + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "sum(rate(node_network_transmit_bytes{instance=\"$host\", device!=\"lo\"}[$interval])) or 
sum(irate(node_network_transmit_bytes{instance=\"$host\", device!=\"lo\"}[5m])) or\nsum(max_over_time(rdsosmetrics_network_tx{instance=\"$host\"}[$interval])) or sum(max_over_time(rdsosmetrics_network_tx{instance=\"$host\"}[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Outbound", + "metric": "", + "refId": "A", + "step": 20, + "target": "" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Traffic", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Outbound (-) / Inbound (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "Prometheus", + "decimals": null, + "editable": true, + "error": false, + "fill": 2, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 149 + }, + "id": 38, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": 
"rate(node_vmstat_pswpin{instance=\"$host\"}[$interval]) * 4096 or irate(node_vmstat_pswpin{instance=\"$host\"}[5m]) * 4096", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Swap In (Reads)", + "metric": "", + "refId": "A", + "step": 20, + "target": "" + }, + { + "calculatedInterval": "2s", + "datasourceErrors": { + + }, + "errors": { + + }, + "expr": "rate(node_vmstat_pswpout{instance=\"$host\"}[$interval]) * 4096 or irate(node_vmstat_pswpout{instance=\"$host\"}[5m]) * 4096", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Swap Out (Writes)", + "metric": "", + "refId": "B", + "step": 20, + "target": "" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Swap Activity", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "1m", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "Percona", + "MySQL" + ], + "templating": { + "list": [ + { + "allFormat": "glob", + "auto": true, + "auto_count": 200, + "auto_min": "1s", + "current": { + "text": "auto", + "value": "$__auto_interval_interval" + }, + "datasource": "Prometheus", + "hide": 0, + "includeAll": false, + "label": "Interval", + "multi": false, + "multiFormat": "glob", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "5s", + "value": 
"5s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + } + ], + "query": "1s,5s,1m,5m,1h,6h,1d", + "refresh": 2, + "type": "interval" + }, + { + "allFormat": "glob", + "allValue": null, + "current": { + + }, + "datasource": "Prometheus", + "hide": 0, + "includeAll": false, + "label": "Host", + "multi": false, + "multiFormat": "regex values", + "name": "host", + "options": [ + + ], + "query": "label_values(mysql_up, instance)", + "refresh": 1, + "refresh_on_load": false, + "regex": "", + "sort": 1, + "tagValuesQuery": null, + "tags": [ + + ], + "tagsQuery": null, + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "collapse": false, + "enable": true, + "hidden": false, + "notice": false, + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "status": "Stable", + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ], + "type": "timepicker" + }, + "timezone": "browser", + "title": "MySQL Overview", + "uid": "MQWgroiiz", + "version": 1 +} \ No newline at end of file diff --git a/devenv/docker/ha-test-unified-alerting/grafana/provisioning/datasources/datasources.yaml b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/datasources/datasources.yaml new file mode 100644 index 00000000000..3cf02b6a9b6 --- /dev/null +++ b/devenv/docker/ha-test-unified-alerting/grafana/provisioning/datasources/datasources.yaml @@ -0,0 +1,16 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + jsonData: + timeInterval: 10s + queryTimeout: 30s + httpMethod: POST + + - name: 
Loki + type: loki + access: proxy + url: http://loki:3100 diff --git a/devenv/docker/ha-test-unified-alerting/prometheus/prometheus.yml b/devenv/docker/ha-test-unified-alerting/prometheus/prometheus.yml new file mode 100644 index 00000000000..ab1711e9c6e --- /dev/null +++ b/devenv/docker/ha-test-unified-alerting/prometheus/prometheus.yml @@ -0,0 +1,47 @@ +# my global config +global: + scrape_interval: 10s # Scrape targets every 10 seconds. + evaluation_interval: 10s # Evaluate rules every 10 seconds. + # scrape_timeout is set to the global default (10s). + +# Load and evaluate rules in this file every 'evaluation_interval' seconds. +#rule_files: +# - "alert.rules" +# - "first.rules" +# - "second.rules" + +# alerting: +# alertmanagers: +# - scheme: http +# static_configs: +# - targets: +# - "127.0.0.1:9093" + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'grafana' + dns_sd_configs: + - names: + - 'grafana' + type: 'A' + port: 3000 + refresh_interval: 10s + + - job_name: 'mysql' + dns_sd_configs: + - names: + - 'mysqld-exporter' + type: 'A' + port: 9104 + refresh_interval: 10s + + - job_name: 'loki' + dns_sd_configs: + - names: + - 'loki' + type: 'A' + port: 3100 + refresh_interval: 10s \ No newline at end of file diff --git a/docs/sources/administration/configuration.md b/docs/sources/administration/configuration.md index 1b542fb42f2..20be5059a01 100644 --- a/docs/sources/administration/configuration.md +++ b/docs/sources/administration/configuration.md @@ -1119,9 +1119,51 @@ Sets a global limit on number of alert rules that can be created. Default is -1 For more information about the Grafana 8 alerts, refer to [Unified Alerting]({{< relref "../alerting/unified-alerting/_index.md" >}}). -### admin_config_poll_interval_seconds +### admin_config_poll_interval -Specify the frequency of polling for admin config changes. The default value is `60`.
+Specify the frequency of polling for admin config changes. The default value is `60s`. + +The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. + +### alertmanager_config_poll_interval + +Specify the frequency of polling for Alertmanager config changes. The default value is `60s`. + +The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. + +### ha_listen_address + +Listen address/hostname and port to receive unified alerting messages from other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. The default value is `0.0.0.0:9094`. + +### ha_advertise_address + +Explicit address/hostname and port to advertise to other Grafana instances. The port is used for both TCP and UDP. + +### ha_peers + +Comma-separated list of initial instances (in a format of host:port) that will form the HA cluster. Configuring this setting will enable High Availability mode for alerting. + +### ha_peer_timeout + +Time to wait for an instance to send a notification via the Alertmanager. In HA, each Grafana instance will +be assigned a position (e.g. 0, 1). We then multiply this position with the timeout to indicate how long each +instance should wait before sending the notification to take into account replication lag. The default value is `15s`. + +The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. + +### ha_gossip_interval + +The interval between sending gossip messages. By lowering this value (more frequent), gossip messages are propagated +across the cluster more quickly at the expense of increased bandwidth usage. The default value is `200ms`. + +The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
+ +### ha_push_pull_interval + +The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds +across larger clusters at the expense of increased bandwidth usage. The default value is `60s`. + +The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
diff --git a/pkg/services/ngalert/metrics/ngalert.go b/pkg/services/ngalert/metrics/ngalert.go index 93dd28adfeb..da2b238b141 100644 --- a/pkg/services/ngalert/metrics/ngalert.go +++ b/pkg/services/ngalert/metrics/ngalert.go @@ -52,6 +52,7 @@ type Scheduler struct { } type MultiOrgAlertmanager struct { + Registerer prometheus.Registerer ActiveConfigurations prometheus.Gauge DiscoveredConfigurations prometheus.Gauge registries *OrgRegistries @@ -178,6 +179,7 @@ func newStateMetrics(r prometheus.Registerer) *State { func newMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager { return &MultiOrgAlertmanager{ + Registerer: r, registries: NewOrgRegistries(), DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{ Namespace: Namespace, diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index 4652ef33afa..10bb85da85f 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -84,6 +84,8 @@ type AlertNG struct { } func (ng *AlertNG) init() error { + var err error + baseInterval := ng.Cfg.AlertingBaseInterval if baseInterval <= 0 { baseInterval = defaultBaseIntervalSeconds @@ -97,7 +99,11 @@ func (ng *AlertNG) init() error { Logger: ng.Log, } - ng.MultiOrgAlertmanager = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore, ng.Metrics.GetMultiOrgAlertmanagerMetrics()) + multiOrgMetrics := ng.Metrics.GetMultiOrgAlertmanagerMetrics() + ng.MultiOrgAlertmanager, err = notifier.NewMultiOrgAlertmanager(ng.Cfg, store, store, ng.KVStore, multiOrgMetrics, log.New("ngalert.multiorg.alertmanager")) + if err != nil { + return err + } // Let's make sure we're able to complete an initial sync of Alertmanagers before we start the alerting components. 
if err := ng.MultiOrgAlertmanager.LoadAndSyncAlertmanagersForOrgs(context.Background()); err != nil { diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index cde9ec5fbdb..d7fec832c3b 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -15,6 +15,7 @@ import ( gokit_log "github.com/go-kit/kit/log" amv2 "github.com/prometheus/alertmanager/api/v2/models" + "github.com/prometheus/alertmanager/cluster" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/inhibit" "github.com/prometheus/alertmanager/nflog" @@ -24,6 +25,7 @@ import ( "github.com/prometheus/alertmanager/silence" "github.com/prometheus/alertmanager/template" "github.com/prometheus/alertmanager/types" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "github.com/grafana/grafana/pkg/components/securejsondata" @@ -77,9 +79,16 @@ const ( ` ) +type ClusterPeer interface { + AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel + Position() int + WaitReady(context.Context) error +} + type Alertmanager struct { logger log.Logger gokitLogger gokit_log.Logger + OrgID int64 Settings *setting.Cfg Store store.AlertingStore @@ -90,6 +99,8 @@ type Alertmanager struct { marker types.Marker alerts *mem.Alerts route *dispatch.Route + peer ClusterPeer + peerTimeout time.Duration dispatcher *dispatch.Dispatcher inhibitor *inhibit.Inhibitor @@ -111,7 +122,7 @@ type Alertmanager struct { orgID int64 } -func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, m *metrics.Alertmanager) (*Alertmanager, error) { +func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, kvStore kvstore.KVStore, peer ClusterPeer, m *metrics.Alertmanager) (*Alertmanager, error) { am := &Alertmanager{ Settings: cfg, stopc: make(chan struct{}), @@ -120,6 +131,8 @@ func 
newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k stageMetrics: notify.NewMetrics(m.Registerer), dispatcherMetrics: dispatch.NewDispatcherMetrics(false, m.Registerer), Store: store, + peer: peer, + peerTimeout: cfg.HAPeerTimeout, Metrics: m, orgID: orgID, } @@ -148,6 +161,9 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k if err != nil { return nil, fmt.Errorf("unable to initialize the notification log component of alerting: %w", err) } + c := am.peer.AddState(fmt.Sprintf("notificationlog:%d", am.OrgID), am.notificationLog, m.Registerer) + am.notificationLog.SetBroadcast(c.Broadcast) + // Initialize silences am.silences, err = silence.New(silence.Options{ Metrics: m.Registerer, @@ -158,6 +174,9 @@ func newAlertmanager(orgID int64, cfg *setting.Cfg, store store.AlertingStore, k return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err) } + c = am.peer.AddState(fmt.Sprintf("silences:%d", am.OrgID), am.silences, m.Registerer) + am.silences.SetBroadcast(c.Broadcast) + am.wg.Add(1) go func() { am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) { @@ -392,15 +411,16 @@ func (am *Alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig am.inhibitor = inhibit.NewInhibitor(am.alerts, cfg.AlertmanagerConfig.InhibitRules, am.marker, am.gokitLogger) am.silencer = silence.NewSilencer(am.silences, am.marker, am.gokitLogger) + meshStage := notify.NewGossipSettleStage(am.peer) inhibitionStage := notify.NewMuteStage(am.inhibitor) silencingStage := notify.NewMuteStage(am.silencer) for name := range integrationsMap { - stage := am.createReceiverStage(name, integrationsMap[name], waitFunc, am.notificationLog) - routingStage[name] = notify.MultiStage{silencingStage, inhibitionStage, stage} + stage := am.createReceiverStage(name, integrationsMap[name], am.waitFunc, am.notificationLog) + routingStage[name] = notify.MultiStage{meshStage, 
silencingStage, inhibitionStage, stage} } am.route = dispatch.NewRoute(cfg.AlertmanagerConfig.Route, nil) - am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics) + am.dispatcher = dispatch.NewDispatcher(am.alerts, am.route, routingStage, am.marker, am.timeoutFunc, &nilLimits{}, am.gokitLogger, am.dispatcherMetrics) am.wg.Add(1) go func() { @@ -701,21 +721,17 @@ func (am *Alertmanager) createReceiverStage(name string, integrations []notify.I return fs } -func waitFunc() time.Duration { - // When it's a single instance, we don't need additional wait. The routing policies will have their own group wait. - // We need >0 wait here in case we have peers to sync the notification state with. 0 wait in that case can result - // in duplicate notifications being sent. - // TODO: we have setting.AlertingNotificationTimeout in legacy settings. Either use that or separate set of config - // for clustering with intuitive name, like "PeerTimeout". - return 0 +func (am *Alertmanager) waitFunc() time.Duration { + return time.Duration(am.peer.Position()) * am.peerTimeout } -func timeoutFunc(d time.Duration) time.Duration { - //TODO: What does MinTimeout means here? +func (am *Alertmanager) timeoutFunc(d time.Duration) time.Duration { + // time.Duration d relates to the receiver's group_interval. Even with a group interval of 1s, + // we need to make sure (non-position-0) peers in the cluster wait before flushing the notifications. 
if d < notify.MinTimeout { d = notify.MinTimeout } - return d + waitFunc() + return d + am.waitFunc() } type nilLimits struct{} diff --git a/pkg/services/ngalert/notifier/alertmanager_test.go b/pkg/services/ngalert/notifier/alertmanager_test.go index ecc56f6e658..f790b2bce15 100644 --- a/pkg/services/ngalert/notifier/alertmanager_test.go +++ b/pkg/services/ngalert/notifier/alertmanager_test.go @@ -48,7 +48,7 @@ func setupAMTest(t *testing.T) *Alertmanager { } kvStore := newFakeKVStore(t) - am, err := newAlertmanager(1, cfg, s, kvStore, m) + am, err := newAlertmanager(1, cfg, s, kvStore, &NilPeer{}, m) require.NoError(t, err) return am } diff --git a/pkg/services/ngalert/notifier/multiorg_alertmanager.go b/pkg/services/ngalert/notifier/multiorg_alertmanager.go index 51c2e21145f..ee7ac1a94b1 100644 --- a/pkg/services/ngalert/notifier/multiorg_alertmanager.go +++ b/pkg/services/ngalert/notifier/multiorg_alertmanager.go @@ -6,6 +6,12 @@ import ( "sync" "time" + "github.com/grafana/grafana/pkg/services/ngalert/logging" + + gokit_log "github.com/go-kit/kit/log" + "github.com/prometheus/alertmanager/cluster" + "github.com/prometheus/client_golang/prometheus" + "github.com/grafana/grafana/pkg/infra/kvstore" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/services/ngalert/metrics" @@ -14,7 +20,6 @@ import ( ) var ( - SyncOrgsPollInterval = 1 * time.Minute ErrNoAlertmanagerForOrg = fmt.Errorf("Alertmanager does not exist for this organization") ErrAlertmanagerNotReady = fmt.Errorf("Alertmanager is not ready yet") ) @@ -26,6 +31,10 @@ type MultiOrgAlertmanager struct { settings *setting.Cfg logger log.Logger + // peer represents the clustering peers of Alertmanagers between Grafana instances.
+ peer ClusterPeer + settleCancel context.CancelFunc + + configStore store.AlertingStore orgStore store.OrgStore kvStore kvstore.KVStore @@ -33,16 +42,52 @@ type MultiOrgAlertmanager struct { metrics *metrics.MultiOrgAlertmanager } -func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager) *MultiOrgAlertmanager { - return &MultiOrgAlertmanager{ +func NewMultiOrgAlertmanager(cfg *setting.Cfg, configStore store.AlertingStore, orgStore store.OrgStore, kvStore kvstore.KVStore, m *metrics.MultiOrgAlertmanager, l log.Logger) (*MultiOrgAlertmanager, error) { + moa := &MultiOrgAlertmanager{ + logger: l, settings: cfg, - logger: log.New("multiorg.alertmanager"), alertmanagers: map[int64]*Alertmanager{}, configStore: configStore, orgStore: orgStore, kvStore: kvStore, metrics: m, } + + clusterLogger := gokit_log.With(gokit_log.NewLogfmtLogger(logging.NewWrapper(l)), "component", "cluster") + moa.peer = &NilPeer{} + if len(cfg.HAPeers) > 0 { + peer, err := cluster.Create( + clusterLogger, + m.Registerer, + cfg.HAListenAddr, + cfg.HAAdvertiseAddr, + cfg.HAPeers, // peers + true, + cfg.HAPushPullInterval, + cfg.HAGossipInterval, + cluster.DefaultTcpTimeout, + cluster.DefaultProbeTimeout, + cluster.DefaultProbeInterval, + nil, + ) + + if err != nil { + return nil, fmt.Errorf("unable to initialize gossip mesh: %w", err) + } + + err = peer.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout) + if err != nil { + l.Error("msg", "unable to join gossip mesh while initializing cluster for high availability mode", "err", err) + } + // Attempt to verify the number of peers for 30s every 2s. The risk here is that we send a notification "too soon". + // Which should _never_ happen given we share the notification log via the database so the risk of double notification is very low.
+ var ctx context.Context + ctx, moa.settleCancel = context.WithTimeout(context.Background(), 30*time.Second) + go peer.Settle(ctx, cluster.DefaultGossipInterval*10) + moa.peer = peer + } + + return moa, nil } func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error { @@ -53,7 +98,7 @@ func (moa *MultiOrgAlertmanager) Run(ctx context.Context) error { case <-ctx.Done(): moa.StopAndWait() return nil - case <-time.After(SyncOrgsPollInterval): + case <-time.After(moa.settings.AlertmanagerConfigPollInterval): if err := moa.LoadAndSyncAlertmanagersForOrgs(ctx); err != nil { moa.logger.Error("error while synchronizing Alertmanager orgs", "err", err) } @@ -90,7 +135,7 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(orgIDs []int64) { // To export them, we need to translate the metrics from each individual registry and, // then aggregate them on the main registry. m := metrics.NewAlertmanagerMetrics(moa.metrics.GetOrCreateOrgRegistry(orgID)) - am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, m) + am, err := newAlertmanager(orgID, moa.settings, moa.configStore, moa.kvStore, moa.peer, m) if err != nil { moa.logger.Error("unable to create Alertmanager for org", "org", orgID, "err", err) } @@ -130,6 +175,14 @@ func (moa *MultiOrgAlertmanager) StopAndWait() { for _, am := range moa.alertmanagers { am.StopAndWait() } + + p, ok := moa.peer.(*cluster.Peer) + if ok { + moa.settleCancel() + if err := p.Leave(10 * time.Second); err != nil { + moa.logger.Warn("unable to leave the gossip mesh", "err", err) + } + } } // AlertmanagerFor returns the Alertmanager instance for the organization provided. @@ -150,3 +203,16 @@ func (moa *MultiOrgAlertmanager) AlertmanagerFor(orgID int64) (*Alertmanager, er return orgAM, nil } + +// NilPeer and NilChannel implement the Alertmanager clustering interface.
+type NilPeer struct{} + +func (p *NilPeer) Position() int { return 0 } +func (p *NilPeer) WaitReady(context.Context) error { return nil } +func (p *NilPeer) AddState(string, cluster.State, prometheus.Registerer) cluster.ClusterChannel { + return &NilChannel{} +} + +type NilChannel struct{} + +func (c *NilChannel) Broadcast([]byte) {} diff --git a/pkg/services/ngalert/notifier/multiorg_alertmanager_test.go b/pkg/services/ngalert/notifier/multiorg_alertmanager_test.go index 17759642ae0..3259f9902fc 100644 --- a/pkg/services/ngalert/notifier/multiorg_alertmanager_test.go +++ b/pkg/services/ngalert/notifier/multiorg_alertmanager_test.go @@ -8,6 +8,7 @@ import ( "testing" "time" + "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/services/ngalert/metrics" "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/setting" @@ -18,7 +19,6 @@ import ( ) func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) { - t.Skipf("Skipping multiorg alertmanager tests for now") configStore := &FakeConfigStore{ configs: map[int64]*models.AlertConfiguration{}, } @@ -28,12 +28,15 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) { tmpDir, err := ioutil.TempDir("", "test") require.NoError(t, err) - - SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests. 
kvStore := newFakeKVStore(t) reg := prometheus.NewPedanticRegistry() m := metrics.NewNGAlert(reg) - mam := NewMultiOrgAlertmanager(&setting.Cfg{DataPath: tmpDir}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics()) + cfg := &setting.Cfg{ + DataPath: tmpDir, + AlertmanagerConfigPollInterval: 3 * time.Minute, // do not poll in tests + } + mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger")) + require.NoError(t, err) ctx := context.Background() t.Cleanup(cleanOrgDirectories(tmpDir, t)) @@ -82,22 +85,23 @@ grafana_alerting_discovered_configurations 4 } func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) { - t.Skipf("Skipping multiorg alertmanager tests for now") configStore := &FakeConfigStore{ configs: map[int64]*models.AlertConfiguration{}, } orgStore := &FakeOrgStore{ orgs: []int64{1, 2, 3}, } - tmpDir, err := ioutil.TempDir("", "test") require.NoError(t, err) - - SyncOrgsPollInterval = 10 * time.Minute // Don't poll in unit tests. 
+ cfg := &setting.Cfg{ + DataPath: tmpDir, + AlertmanagerConfigPollInterval: 3 * time.Minute, // do not poll in tests + } kvStore := newFakeKVStore(t) reg := prometheus.NewPedanticRegistry() m := metrics.NewNGAlert(reg) - mam := NewMultiOrgAlertmanager(&setting.Cfg{DataPath: tmpDir}, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics()) + mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger")) + require.NoError(t, err) ctx := context.Background() t.Cleanup(cleanOrgDirectories(tmpDir, t)) diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go index f07d478feba..4f4284879af 100644 --- a/pkg/services/ngalert/schedule/schedule_unit_test.go +++ b/pkg/services/ngalert/schedule/schedule_unit_test.go @@ -231,6 +231,8 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac mockedClock := clock.NewMock() logger := log.New("ngalert schedule test") m := metrics.NewNGAlert(prometheus.NewPedanticRegistry()) + moa, err := notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}, nil, log.New("testlogger")) + require.NoError(t, err) schedCfg := SchedulerCfg{ C: mockedClock, BaseInterval: time.Second, @@ -239,7 +241,7 @@ func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, ac RuleStore: rs, InstanceStore: is, AdminConfigStore: acs, - MultiOrgNotifier: notifier.NewMultiOrgAlertmanager(&setting.Cfg{}, &notifier.FakeConfigStore{}, &notifier.FakeOrgStore{}, &notifier.FakeKVStore{}, nil), + MultiOrgNotifier: moa, Logger: logger, Metrics: m.GetSchedulerMetrics(), AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
diff --git a/pkg/setting/setting.go b/pkg/setting/setting.go index 6948e10c638..d554e0dd5b0 100644 --- a/pkg/setting/setting.go +++ b/pkg/setting/setting.go @@ -18,15 +18,14 @@ import ( "strings" "time" - "github.com/gobwas/glob" - - "github.com/prometheus/common/model" - "gopkg.in/ini.v1" - "github.com/grafana/grafana-aws-sdk/pkg/awsds" "github.com/grafana/grafana/pkg/components/gtime" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/util" + + "github.com/gobwas/glob" + "github.com/prometheus/common/model" + "gopkg.in/ini.v1" ) type Scheme string @@ -420,7 +419,14 @@ type Cfg struct { GeomapEnableCustomBaseLayers bool // Unified Alerting - AdminConfigPollInterval time.Duration + AdminConfigPollInterval time.Duration + AlertmanagerConfigPollInterval time.Duration + HAListenAddr string + HAAdvertiseAddr string + HAPeers []string + HAPeerTimeout time.Duration + HAGossipInterval time.Duration + HAPushPullInterval time.Duration } // IsLiveConfigEnabled returns true if live should be able to save configs to SQL tables @@ -916,8 +922,7 @@ func (cfg *Cfg) Load(args CommandLineArgs) error { if err := readAlertingSettings(iniFile); err != nil { return err } - - if err := cfg.readUnifiedAlertingSettings(iniFile); err != nil { + if err := cfg.ReadUnifiedAlertingSettings(iniFile); err != nil { return err } @@ -1374,13 +1379,6 @@ func (cfg *Cfg) readRenderingSettings(iniFile *ini.File) error { return nil } -func (cfg *Cfg) readUnifiedAlertingSettings(iniFile *ini.File) error { - ua := iniFile.Section("unified_alerting") - s := ua.Key("admin_config_poll_interval_seconds").MustInt(60) - cfg.AdminConfigPollInterval = time.Second * time.Duration(s) - return nil -} - func readAlertingSettings(iniFile *ini.File) error { alerting := iniFile.Section("alerting") AlertingEnabled = alerting.Key("enabled").MustBool(true) diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go new file mode 100644 index 
00000000000..3432c01acc5 --- /dev/null +++ b/pkg/setting/setting_unified_alerting.go @@ -0,0 +1,57 @@ +package setting + +import ( + "strings" + "time" + + "github.com/grafana/grafana/pkg/components/gtime" + + "github.com/prometheus/alertmanager/cluster" + "gopkg.in/ini.v1" +) + +const ( + AlertmanagerDefaultClusterAddr = "0.0.0.0:9094" + AlertmanagerDefaultPeerTimeout = 15 * time.Second + AlertmanagerDefaultGossipInterval = cluster.DefaultGossipInterval + AlertmanagerDefaultPushPullInterval = cluster.DefaultPushPullInterval + SchedulerDefaultAdminConfigPollInterval = 60 * time.Second + AlertmanagerDefaultConfigPollInterval = 60 * time.Second +) + +func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { + ua := iniFile.Section("unified_alerting") + var err error + cfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (SchedulerDefaultAdminConfigPollInterval).String())) + if err != nil { + return err + } + cfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (AlertmanagerDefaultConfigPollInterval).String())) + if err != nil { + return err + } + cfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (AlertmanagerDefaultPeerTimeout).String())) + if err != nil { + return err + } + cfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (AlertmanagerDefaultGossipInterval).String())) + if err != nil { + return err + } + cfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (AlertmanagerDefaultPushPullInterval).String())) + if err != nil { + return err + } + cfg.HAListenAddr = ua.Key("ha_listen_address").MustString(AlertmanagerDefaultClusterAddr) + cfg.HAAdvertiseAddr = ua.Key("ha_advertise_address").MustString("") + peers := ua.Key("ha_peers").MustString("") + cfg.HAPeers = make([]string, 0) + if peers != "" { + for _, peer := range 
strings.Split(peers, ",") { + peer = strings.TrimSpace(peer) + cfg.HAPeers = append(cfg.HAPeers, peer) + } + } + + return nil +} diff --git a/pkg/setting/setting_unified_alerting_test.go b/pkg/setting/setting_unified_alerting_test.go new file mode 100644 index 00000000000..1011df3a98f --- /dev/null +++ b/pkg/setting/setting_unified_alerting_test.go @@ -0,0 +1,39 @@ +package setting + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) { + cfg := NewCfg() + err := cfg.Load(CommandLineArgs{HomePath: "../../", Config: "../../conf/defaults.ini"}) + require.NoError(t, err) + + // It sets the correct defaults. + { + require.Equal(t, 60*time.Second, cfg.AdminConfigPollInterval) + require.Equal(t, 60*time.Second, cfg.AlertmanagerConfigPollInterval) + require.Equal(t, 15*time.Second, cfg.HAPeerTimeout) + require.Equal(t, "0.0.0.0:9094", cfg.HAListenAddr) + require.Equal(t, "", cfg.HAAdvertiseAddr) + require.Len(t, cfg.HAPeers, 0) + require.Equal(t, 200*time.Millisecond, cfg.HAGossipInterval) + require.Equal(t, 60*time.Second, cfg.HAPushPullInterval) + } + + // With peers set, it correctly parses them. 
+ { + require.Len(t, cfg.HAPeers, 0) + s, err := cfg.Raw.NewSection("unified_alerting") + require.NoError(t, err) + _, err = s.NewKey("ha_peers", "hostname1:9090,hostname2:9090,hostname3:9090") + require.NoError(t, err) + + require.NoError(t, cfg.ReadUnifiedAlertingSettings(cfg.Raw)) + require.Len(t, cfg.HAPeers, 3) + require.ElementsMatch(t, []string{"hostname1:9090", "hostname2:9090", "hostname3:9090"}, cfg.HAPeers) + } +} diff --git a/pkg/tests/api/alerting/api_admin_configuration_test.go b/pkg/tests/api/alerting/api_admin_configuration_test.go index f6cabe2f55d..b9dab774617 100644 --- a/pkg/tests/api/alerting/api_admin_configuration_test.go +++ b/pkg/tests/api/alerting/api_admin_configuration_test.go @@ -21,9 +21,9 @@ import ( func TestAdminConfiguration_SendingToExternalAlertmanagers(t *testing.T) { dir, path := testinfra.CreateGrafDir(t, testinfra.GrafanaOpts{ - EnableFeatureToggles: []string{"ngalert"}, - DisableAnonymous: true, - NGAlertAdminConfigIntervalSeconds: 2, + EnableFeatureToggles: []string{"ngalert"}, + DisableAnonymous: true, + NGAlertAdminConfigPollInterval: 2 * time.Second, }) grafanaListedAddr, s := testinfra.StartGrafana(t, dir, path) diff --git a/pkg/tests/api/alerting/api_alertmanager_configuration_test.go b/pkg/tests/api/alerting/api_alertmanager_configuration_test.go index 770a5809add..7bd6e1804df 100644 --- a/pkg/tests/api/alerting/api_alertmanager_configuration_test.go +++ b/pkg/tests/api/alerting/api_alertmanager_configuration_test.go @@ -8,8 +8,6 @@ import ( "testing" "time" - "github.com/grafana/grafana/pkg/services/ngalert/notifier" - "github.com/grafana/grafana/pkg/bus" "github.com/grafana/grafana/pkg/models" "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" @@ -19,16 +17,10 @@ import ( ) func TestAlertmanagerConfigurationIsTransactional(t *testing.T) { - // TODO: We need a reliable way to ensure Alertmanagers have synced correctly. - // For now, make them sync quicker. 
- p := notifier.SyncOrgsPollInterval - notifier.SyncOrgsPollInterval = 2 * time.Second - t.Cleanup(func() { - notifier.SyncOrgsPollInterval = p - }) dir, path := testinfra.CreateGrafDir(t, testinfra.GrafanaOpts{ - EnableFeatureToggles: []string{"ngalert"}, - DisableAnonymous: true, + EnableFeatureToggles: []string{"ngalert"}, + NGAlertAlertmanagerConfigPollInterval: 2 * time.Second, + DisableAnonymous: true, }) grafanaListedAddr, store := testinfra.StartGrafana(t, dir, path) diff --git a/pkg/tests/testinfra/testinfra.go b/pkg/tests/testinfra/testinfra.go index 3668a22a7c7..81f28980848 100644 --- a/pkg/tests/testinfra/testinfra.go +++ b/pkg/tests/testinfra/testinfra.go @@ -10,6 +10,7 @@ import ( "path/filepath" "strings" "testing" + "time" "github.com/grafana/grafana/pkg/api" "github.com/grafana/grafana/pkg/infra/fs" @@ -204,13 +205,18 @@ func CreateGrafDir(t *testing.T, opts ...GrafanaOpts) (string, string) { _, err = featureSection.NewKey("enable", strings.Join(o.EnableFeatureToggles, " ")) require.NoError(t, err) } - if o.NGAlertAdminConfigIntervalSeconds != 0 { - ngalertingSection, err := cfg.NewSection("ngalerting") + if o.NGAlertAdminConfigPollInterval != 0 { + ngalertingSection, err := cfg.NewSection("unified_alerting") require.NoError(t, err) - _, err = ngalertingSection.NewKey("admin_config_poll_interval_seconds", fmt.Sprintf("%d", o.NGAlertAdminConfigIntervalSeconds)) + _, err = ngalertingSection.NewKey("admin_config_poll_interval", o.NGAlertAdminConfigPollInterval.String()) + require.NoError(t, err) + } + if o.NGAlertAlertmanagerConfigPollInterval != 0 { + ngalertingSection, err := cfg.NewSection("unified_alerting") + require.NoError(t, err) + _, err = ngalertingSection.NewKey("alertmanager_config_poll_interval", o.NGAlertAlertmanagerConfigPollInterval.String()) require.NoError(t, err) } - if o.AnonymousUserRole != "" { _, err = anonSect.NewKey("org_role", string(o.AnonymousUserRole)) require.NoError(t, err) @@ -252,13 +258,14 @@ func CreateGrafDir(t 
*testing.T, opts ...GrafanaOpts) (string, string) { } type GrafanaOpts struct { - EnableCSP bool - EnableFeatureToggles []string - NGAlertAdminConfigIntervalSeconds int - AnonymousUserRole models.RoleType - EnableQuota bool - DisableAnonymous bool - CatalogAppEnabled bool - ViewersCanEdit bool - PluginAdminEnabled bool + EnableCSP bool + EnableFeatureToggles []string + NGAlertAdminConfigPollInterval time.Duration + NGAlertAlertmanagerConfigPollInterval time.Duration + AnonymousUserRole models.RoleType + EnableQuota bool + DisableAnonymous bool + CatalogAppEnabled bool + ViewersCanEdit bool + PluginAdminEnabled bool }