Add monitoring mixing for Grafana (#28285)

Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
pull/28314/head
Carl Bergquist 5 years ago committed by GitHub
parent febdad4da2
commit 6002df580f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 3
      grafana-mixin/.gitignore
  2. 21
      grafana-mixin/Makefile
  3. 28
      grafana-mixin/README.md
  4. 14
      grafana-mixin/alerts/alerts.yaml
  5. 528
      grafana-mixin/dashboards/grafana-overview.json
  6. 15
      grafana-mixin/mixin.libsonnet
  7. 7
      grafana-mixin/rules/rules.yaml
  8. 1
      scripts/lib.star
  9. 7
      scripts/mixin-check.sh

@ -0,0 +1,3 @@
alerts.yaml
rules.yaml
dashboards_out

@ -0,0 +1,21 @@
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
all: fmt lint build clean
fmt:
find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
xargs -n 1 -- $(JSONNET_FMT) -i
lint:
find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
while read f; do \
$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
done
mixtool lint mixin.libsonnet
build:
mixtool generate all mixin.libsonnet
clean:
rm -rf dashboards_out alerts.yaml rules.yaml

@ -0,0 +1,28 @@
# Grafana Mixin
_This is a work in progress. We aim for it to become a good role model for alerts
and dashboards eventually, but it is not quite there yet._
The Grafana Mixin is a set of configurable, reusable, and extensible alerts and
dashboards based on the metrics exported by Grafana. The mixin creates
recording and alerting rules for Prometheus and suitable dashboard descriptions
for Grafana.
To use them, you need to have `mixtool` and `jsonnetfmt` installed. If you
have a working Go development environment, it's easiest to run the following:
```bash
$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool
$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt
```
You can then build the Prometheus rules files `alerts.yaml` and
`rules.yaml` and a directory `dashboard_out` with the JSON dashboard files
for Grafana:
```bash
$ make build
```
For more advanced uses of mixins, see
https://github.com/monitoring-mixins/docs.

@ -0,0 +1,14 @@
groups:
- name: GrafanaAlerts
rules:
- alert: GrafanaRequestsFailing
for: 5m
expr: |
100 * namespace_job_handler_statuscode:http_request_total:rate5m{handler!~"/datasources/proxy/:id.*|/ds/query|/tsdb/query", statuscode=~"5.."}
/
namespace_job_handler_statuscode:http_request_total:rate5m{handler!~"/datasources/proxy/:id.*|/ds/query|/tsdb/query"}
> 0.5
labels:
severity: 'critical'
annotations:
message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors"

@ -0,0 +1,528 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 35,
"iteration": 1602761142538,
"links": [],
"panels": [
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"noValue": "0",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 0,
"y": 0
},
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
}
},
"pluginVersion": "7.0.4",
"targets": [
{
"expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Firing Alerts",
"type": "stat"
},
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 6,
"x": 6,
"y": 0
},
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
}
},
"pluginVersion": "7.0.4",
"targets": [
{
"expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})",
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Dashboards",
"type": "stat"
},
{
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {
"align": null
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 5,
"w": 12,
"x": 12,
"y": 0
},
"id": 10,
"options": {
"showHeader": true
},
"pluginVersion": "7.0.4",
"targets": [
{
"expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}",
"instant": true,
"interval": "",
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "Build Info",
"transformations": [
{
"id": "labelsToFields",
"options": {}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Time": true,
"Value": true,
"branch": true,
"container": true,
"goversion": true,
"namespace": true,
"pod": true,
"revision": true
},
"indexByName": {
"Time": 7,
"Value": 11,
"branch": 4,
"container": 8,
"edition": 2,
"goversion": 6,
"instance": 1,
"job": 0,
"namespace": 9,
"pod": 10,
"revision": 5,
"version": 3
},
"renameByName": {}
}
}
],
"type": "table"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 5
},
"hiddenSeries": false,
"id": 2,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (statuscode) (irate(http_request_total{job=~\"$job\", instance=~\"$instance\"}[1m])) ",
"interval": "",
"legendFormat": "{{statuscode}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "RPS",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:157",
"format": "reqps",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:158",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 5
},
"hiddenSeries": false,
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "max(http_request_duration_milliseconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.99\"})",
"interval": "",
"legendFormat": "max-99th",
"refId": "A"
},
{
"expr": "max(http_request_duration_milliseconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.9\"})",
"interval": "",
"legendFormat": "max-90th",
"refId": "B"
},
{
"expr": "sum(irate(http_request_duration_milliseconds_sum{job=~\"$job\", instance=~\"$instance\"}[$__interval])) / sum(irate(http_request_duration_milliseconds_count{job=~\"$job\", instance=~\"$instance\"}[$__interval])) ",
"interval": "",
"legendFormat": "avg",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Request Latency",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"$$hashKey": "object:210",
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"$$hashKey": "object:211",
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"schemaVersion": 25,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "prometheus",
"value": "prometheus"
},
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
},
{
"allValue": ".*",
"current": {
"selected": true,
"tags": [],
"text": "All",
"value": [
"$__all"
]
},
"datasource": "$datasource",
"definition": "label_values(grafana_build_info, job)",
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "job",
"options": [],
"query": "label_values(grafana_build_info, job)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"definition": "label_values(grafana_build_info, instance)",
"hide": 0,
"includeAll": true,
"label": null,
"multi": true,
"name": "instance",
"options": [],
"query": "label_values(grafana_build_info, instance)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Grafana Overview",
"uid": "6be0s85Mk",
"version": 4
}

@ -0,0 +1,15 @@
{
grafanaDashboards: {
'grafana-overview.json': (import 'dashboards/grafana-overview.json'),
},
// Helper function to ensure that we don't override other rules, by forcing
// the patching of the groups list, and not the overall rules object.
local importRules(rules) = {
groups+: std.native('parseYaml')(rules)[0].groups,
},
prometheusRules+: importRules(importstr 'rules/rules.yaml'),
prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'),
}

@ -0,0 +1,7 @@
groups:
- name: grafana_rules
rules:
# Record error rate of http requests excluding dataproxy, /ds/query and /tsdb/query requests
- record: namespace_job_handler_statuscode:http_request_total:rate5m
expr: |
sum by (namespace, job, handler, statuscode) (rate(http_request_total[5m]))

@ -224,6 +224,7 @@ def lint_backend_step(edition):
'revive -formatter stylish -config scripts/go/configs/revive.toml ./pkg/...',
'./scripts/revive-strict',
'./scripts/tidy-check.sh',
'./scripts/mixin-check.sh,
],
}

@ -0,0 +1,7 @@
#!/bin/bash
set -eo pipefail
cd grafana-mixin
go install github.com/monitoring-mixins/mixtool/cmd/mixtool
go install github.com/google/go-jsonnet/cmd/jsonnetfmt
make lint build
Loading…
Cancel
Save