mirror of https://github.com/grafana/grafana
Add monitoring mixing for Grafana (#28285)
Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>pull/28314/head
parent
febdad4da2
commit
6002df580f
@ -0,0 +1,3 @@ |
||||
alerts.yaml |
||||
rules.yaml |
||||
dashboards_out |
@ -0,0 +1,21 @@ |
||||
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
|
||||
|
||||
all: fmt lint build clean |
||||
|
||||
fmt: |
||||
find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
|
||||
xargs -n 1 -- $(JSONNET_FMT) -i
|
||||
|
||||
# lint: fail if ANY jsonnet/libsonnet file is not formatted, then run mixtool lint. |
||||
# The loop remembers a failing diff in rc and exits with it; without that, the |
||||
# pipeline's status would be the status of the last file's diff only. |
||||
lint: |
||||
find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
|
||||
{ rc=0; while read -r f; do \
|
||||
$(JSONNET_FMT) "$$f" | diff -u "$$f" - || rc=1; \
|
||||
done; exit $$rc; }
|
||||
|
||||
mixtool lint mixin.libsonnet
|
||||
|
||||
build: |
||||
mixtool generate all mixin.libsonnet
|
||||
|
||||
clean: |
||||
rm -rf dashboards_out alerts.yaml rules.yaml
|
@ -0,0 +1,28 @@ |
||||
# Grafana Mixin |
||||
|
||||
_This is a work in progress. We aim for it to become a good role model for alerts |
||||
and dashboards eventually, but it is not quite there yet._ |
||||
|
||||
The Grafana Mixin is a set of configurable, reusable, and extensible alerts and |
||||
dashboards based on the metrics exported by Grafana. The mixin creates |
||||
recording and alerting rules for Prometheus and suitable dashboard descriptions |
||||
for Grafana. |
||||
|
||||
To use them, you need to have `mixtool` and `jsonnetfmt` installed. If you |
||||
have a working Go development environment, it's easiest to run the following: |
||||
|
||||
```bash |
||||
$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool |
||||
$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt |
||||
``` |
||||
|
||||
You can then build the Prometheus rules files `alerts.yaml` and |
||||
`rules.yaml` and a directory `dashboards_out` with the JSON dashboard files |
||||
for Grafana: |
||||
|
||||
```bash |
||||
$ make build |
||||
``` |
||||
|
||||
For more advanced uses of mixins, see |
||||
https://github.com/monitoring-mixins/docs. |
@ -0,0 +1,14 @@ |
||||
groups: |
||||
- name: GrafanaAlerts |
||||
rules: |
||||
- alert: GrafanaRequestsFailing |
||||
for: 5m |
||||
expr: | |
||||
# Sum away statuscode on both sides: with default matching every 5xx series would divide by itself and always yield 100%. |
||||
100 * sum by (namespace, job, handler) (namespace_job_handler_statuscode:http_request_total:rate5m{handler!~"/datasources/proxy/:id.*|/ds/query|/tsdb/query", statuscode=~"5.."})
||||
/ |
||||
sum by (namespace, job, handler) (namespace_job_handler_statuscode:http_request_total:rate5m{handler!~"/datasources/proxy/:id.*|/ds/query|/tsdb/query"})
||||
> 0.5 |
||||
labels: |
||||
severity: 'critical' |
||||
annotations: |
||||
message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors" |
@ -0,0 +1,528 @@ |
||||
{ |
||||
"annotations": { |
||||
"list": [ |
||||
{ |
||||
"builtIn": 1, |
||||
"datasource": "-- Grafana --", |
||||
"enable": true, |
||||
"hide": true, |
||||
"iconColor": "rgba(0, 211, 255, 1)", |
||||
"name": "Annotations & Alerts", |
||||
"type": "dashboard" |
||||
} |
||||
] |
||||
}, |
||||
"editable": true, |
||||
"gnetId": null, |
||||
"graphTooltip": 0, |
||||
"id": 35, |
||||
"iteration": 1602761142538, |
||||
"links": [], |
||||
"panels": [ |
||||
{ |
||||
"datasource": "$datasource", |
||||
"fieldConfig": { |
||||
"defaults": { |
||||
"custom": {}, |
||||
"mappings": [], |
||||
"noValue": "0", |
||||
"thresholds": { |
||||
"mode": "absolute", |
||||
"steps": [ |
||||
{ |
||||
"color": "green", |
||||
"value": null |
||||
}, |
||||
{ |
||||
"color": "red", |
||||
"value": 80 |
||||
} |
||||
] |
||||
} |
||||
}, |
||||
"overrides": [] |
||||
}, |
||||
"gridPos": { |
||||
"h": 5, |
||||
"w": 6, |
||||
"x": 0, |
||||
"y": 0 |
||||
}, |
||||
"id": 6, |
||||
"options": { |
||||
"colorMode": "value", |
||||
"graphMode": "area", |
||||
"justifyMode": "auto", |
||||
"orientation": "auto", |
||||
"reduceOptions": { |
||||
"calcs": [ |
||||
"mean" |
||||
], |
||||
"fields": "", |
||||
"values": false |
||||
} |
||||
}, |
||||
"pluginVersion": "7.0.4", |
||||
"targets": [ |
||||
{ |
||||
"expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}", |
||||
"instant": true, |
||||
"interval": "", |
||||
"legendFormat": "", |
||||
"refId": "A" |
||||
} |
||||
], |
||||
"timeFrom": null, |
||||
"timeShift": null, |
||||
"title": "Firing Alerts", |
||||
"type": "stat" |
||||
}, |
||||
{ |
||||
"datasource": "$datasource", |
||||
"fieldConfig": { |
||||
"defaults": { |
||||
"custom": {}, |
||||
"mappings": [], |
||||
"thresholds": { |
||||
"mode": "absolute", |
||||
"steps": [ |
||||
{ |
||||
"color": "green", |
||||
"value": null |
||||
}, |
||||
{ |
||||
"color": "red", |
||||
"value": 80 |
||||
} |
||||
] |
||||
} |
||||
}, |
||||
"overrides": [] |
||||
}, |
||||
"gridPos": { |
||||
"h": 5, |
||||
"w": 6, |
||||
"x": 6, |
||||
"y": 0 |
||||
}, |
||||
"id": 8, |
||||
"options": { |
||||
"colorMode": "value", |
||||
"graphMode": "area", |
||||
"justifyMode": "auto", |
||||
"orientation": "auto", |
||||
"reduceOptions": { |
||||
"calcs": [ |
||||
"mean" |
||||
], |
||||
"fields": "", |
||||
"values": false |
||||
} |
||||
}, |
||||
"pluginVersion": "7.0.4", |
||||
"targets": [ |
||||
{ |
||||
"expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})", |
||||
"interval": "", |
||||
"legendFormat": "", |
||||
"refId": "A" |
||||
} |
||||
], |
||||
"timeFrom": null, |
||||
"timeShift": null, |
||||
"title": "Dashboards", |
||||
"type": "stat" |
||||
}, |
||||
{ |
||||
"datasource": "$datasource", |
||||
"fieldConfig": { |
||||
"defaults": { |
||||
"custom": { |
||||
"align": null |
||||
}, |
||||
"mappings": [], |
||||
"thresholds": { |
||||
"mode": "absolute", |
||||
"steps": [ |
||||
{ |
||||
"color": "green", |
||||
"value": null |
||||
}, |
||||
{ |
||||
"color": "red", |
||||
"value": 80 |
||||
} |
||||
] |
||||
} |
||||
}, |
||||
"overrides": [] |
||||
}, |
||||
"gridPos": { |
||||
"h": 5, |
||||
"w": 12, |
||||
"x": 12, |
||||
"y": 0 |
||||
}, |
||||
"id": 10, |
||||
"options": { |
||||
"showHeader": true |
||||
}, |
||||
"pluginVersion": "7.0.4", |
||||
"targets": [ |
||||
{ |
||||
"expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}", |
||||
"instant": true, |
||||
"interval": "", |
||||
"legendFormat": "", |
||||
"refId": "A" |
||||
} |
||||
], |
||||
"timeFrom": null, |
||||
"timeShift": null, |
||||
"title": "Build Info", |
||||
"transformations": [ |
||||
{ |
||||
"id": "labelsToFields", |
||||
"options": {} |
||||
}, |
||||
{ |
||||
"id": "organize", |
||||
"options": { |
||||
"excludeByName": { |
||||
"Time": true, |
||||
"Value": true, |
||||
"branch": true, |
||||
"container": true, |
||||
"goversion": true, |
||||
"namespace": true, |
||||
"pod": true, |
||||
"revision": true |
||||
}, |
||||
"indexByName": { |
||||
"Time": 7, |
||||
"Value": 11, |
||||
"branch": 4, |
||||
"container": 8, |
||||
"edition": 2, |
||||
"goversion": 6, |
||||
"instance": 1, |
||||
"job": 0, |
||||
"namespace": 9, |
||||
"pod": 10, |
||||
"revision": 5, |
||||
"version": 3 |
||||
}, |
||||
"renameByName": {} |
||||
} |
||||
} |
||||
], |
||||
"type": "table" |
||||
}, |
||||
{ |
||||
"aliasColors": {}, |
||||
"bars": false, |
||||
"dashLength": 10, |
||||
"dashes": false, |
||||
"datasource": "$datasource", |
||||
"fieldConfig": { |
||||
"defaults": { |
||||
"custom": {} |
||||
}, |
||||
"overrides": [] |
||||
}, |
||||
"fill": 1, |
||||
"fillGradient": 0, |
||||
"gridPos": { |
||||
"h": 8, |
||||
"w": 12, |
||||
"x": 0, |
||||
"y": 5 |
||||
}, |
||||
"hiddenSeries": false, |
||||
"id": 2, |
||||
"legend": { |
||||
"avg": false, |
||||
"current": false, |
||||
"max": false, |
||||
"min": false, |
||||
"show": true, |
||||
"total": false, |
||||
"values": false |
||||
}, |
||||
"lines": true, |
||||
"linewidth": 1, |
||||
"nullPointMode": "null", |
||||
"options": { |
||||
"dataLinks": [] |
||||
}, |
||||
"percentage": false, |
||||
"pointradius": 2, |
||||
"points": false, |
||||
"renderer": "flot", |
||||
"seriesOverrides": [], |
||||
"spaceLength": 10, |
||||
"stack": true, |
||||
"steppedLine": false, |
||||
"targets": [ |
||||
{ |
||||
"expr": "sum by (statuscode) (irate(http_request_total{job=~\"$job\", instance=~\"$instance\"}[1m])) ", |
||||
"interval": "", |
||||
"legendFormat": "{{statuscode}}", |
||||
"refId": "A" |
||||
} |
||||
], |
||||
"thresholds": [], |
||||
"timeFrom": null, |
||||
"timeRegions": [], |
||||
"timeShift": null, |
||||
"title": "RPS", |
||||
"tooltip": { |
||||
"shared": true, |
||||
"sort": 0, |
||||
"value_type": "individual" |
||||
}, |
||||
"type": "graph", |
||||
"xaxis": { |
||||
"buckets": null, |
||||
"mode": "time", |
||||
"name": null, |
||||
"show": true, |
||||
"values": [] |
||||
}, |
||||
"yaxes": [ |
||||
{ |
||||
"$$hashKey": "object:157", |
||||
"format": "reqps", |
||||
"label": null, |
||||
"logBase": 1, |
||||
"max": null, |
||||
"min": null, |
||||
"show": true |
||||
}, |
||||
{ |
||||
"$$hashKey": "object:158", |
||||
"format": "short", |
||||
"label": null, |
||||
"logBase": 1, |
||||
"max": null, |
||||
"min": null, |
||||
"show": false |
||||
} |
||||
], |
||||
"yaxis": { |
||||
"align": false, |
||||
"alignLevel": null |
||||
} |
||||
}, |
||||
{ |
||||
"aliasColors": {}, |
||||
"bars": false, |
||||
"dashLength": 10, |
||||
"dashes": false, |
||||
"datasource": "$datasource", |
||||
"fieldConfig": { |
||||
"defaults": { |
||||
"custom": {} |
||||
}, |
||||
"overrides": [] |
||||
}, |
||||
"fill": 1, |
||||
"fillGradient": 0, |
||||
"gridPos": { |
||||
"h": 8, |
||||
"w": 12, |
||||
"x": 12, |
||||
"y": 5 |
||||
}, |
||||
"hiddenSeries": false, |
||||
"id": 4, |
||||
"legend": { |
||||
"avg": false, |
||||
"current": false, |
||||
"max": false, |
||||
"min": false, |
||||
"show": true, |
||||
"total": false, |
||||
"values": false |
||||
}, |
||||
"lines": true, |
||||
"linewidth": 1, |
||||
"nullPointMode": "null", |
||||
"options": { |
||||
"dataLinks": [] |
||||
}, |
||||
"percentage": false, |
||||
"pointradius": 2, |
||||
"points": false, |
||||
"renderer": "flot", |
||||
"seriesOverrides": [], |
||||
"spaceLength": 10, |
||||
"stack": false, |
||||
"steppedLine": false, |
||||
"targets": [ |
||||
{ |
||||
"expr": "max(http_request_duration_milliseconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.99\"})", |
||||
"interval": "", |
||||
"legendFormat": "max-99th", |
||||
"refId": "A" |
||||
}, |
||||
{ |
||||
"expr": "max(http_request_duration_milliseconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.9\"})", |
||||
"interval": "", |
||||
"legendFormat": "max-90th", |
||||
"refId": "B" |
||||
}, |
||||
{ |
||||
"expr": "sum(irate(http_request_duration_milliseconds_sum{job=~\"$job\", instance=~\"$instance\"}[$__interval])) / sum(irate(http_request_duration_milliseconds_count{job=~\"$job\", instance=~\"$instance\"}[$__interval])) ", |
||||
"interval": "", |
||||
"legendFormat": "avg", |
||||
"refId": "C" |
||||
} |
||||
], |
||||
"thresholds": [], |
||||
"timeFrom": null, |
||||
"timeRegions": [], |
||||
"timeShift": null, |
||||
"title": "Request Latency", |
||||
"tooltip": { |
||||
"shared": true, |
||||
"sort": 0, |
||||
"value_type": "individual" |
||||
}, |
||||
"type": "graph", |
||||
"xaxis": { |
||||
"buckets": null, |
||||
"mode": "time", |
||||
"name": null, |
||||
"show": true, |
||||
"values": [] |
||||
}, |
||||
"yaxes": [ |
||||
{ |
||||
"$$hashKey": "object:210", |
||||
"format": "ms", |
||||
"label": null, |
||||
"logBase": 1, |
||||
"max": null, |
||||
"min": null, |
||||
"show": true |
||||
}, |
||||
{ |
||||
"$$hashKey": "object:211", |
||||
"format": "short", |
||||
"label": null, |
||||
"logBase": 1, |
||||
"max": null, |
||||
"min": null, |
||||
"show": true |
||||
} |
||||
], |
||||
"yaxis": { |
||||
"align": false, |
||||
"alignLevel": null |
||||
} |
||||
} |
||||
], |
||||
"schemaVersion": 25, |
||||
"style": "dark", |
||||
"tags": [], |
||||
"templating": { |
||||
"list": [ |
||||
{ |
||||
"current": { |
||||
"selected": false, |
||||
"text": "prometheus", |
||||
"value": "prometheus" |
||||
}, |
||||
"hide": 0, |
||||
"includeAll": false, |
||||
"label": null, |
||||
"multi": false, |
||||
"name": "datasource", |
||||
"options": [], |
||||
"query": "prometheus", |
||||
"queryValue": "", |
||||
"refresh": 1, |
||||
"regex": "", |
||||
"skipUrlSync": false, |
||||
"type": "datasource" |
||||
}, |
||||
{ |
||||
"allValue": ".*", |
||||
"current": { |
||||
"selected": true, |
||||
"tags": [], |
||||
"text": "All", |
||||
"value": [ |
||||
"$__all" |
||||
] |
||||
}, |
||||
"datasource": "$datasource", |
||||
"definition": "label_values(grafana_build_info, job)", |
||||
"hide": 0, |
||||
"includeAll": true, |
||||
"label": null, |
||||
"multi": true, |
||||
"name": "job", |
||||
"options": [], |
||||
"query": "label_values(grafana_build_info, job)", |
||||
"refresh": 1, |
||||
"regex": "", |
||||
"skipUrlSync": false, |
||||
"sort": 0, |
||||
"tagValuesQuery": "", |
||||
"tags": [], |
||||
"tagsQuery": "", |
||||
"type": "query", |
||||
"useTags": false |
||||
}, |
||||
{ |
||||
"allValue": ".*", |
||||
"current": { |
||||
"selected": false, |
||||
"text": "All", |
||||
"value": "$__all" |
||||
}, |
||||
"datasource": "$datasource", |
||||
"definition": "label_values(grafana_build_info, instance)", |
||||
"hide": 0, |
||||
"includeAll": true, |
||||
"label": null, |
||||
"multi": true, |
||||
"name": "instance", |
||||
"options": [], |
||||
"query": "label_values(grafana_build_info, instance)", |
||||
"refresh": 1, |
||||
"regex": "", |
||||
"skipUrlSync": false, |
||||
"sort": 0, |
||||
"tagValuesQuery": "", |
||||
"tags": [], |
||||
"tagsQuery": "", |
||||
"type": "query", |
||||
"useTags": false |
||||
} |
||||
] |
||||
}, |
||||
"time": { |
||||
"from": "now-6h", |
||||
"to": "now" |
||||
}, |
||||
"timepicker": { |
||||
"refresh_intervals": [ |
||||
"10s", |
||||
"30s", |
||||
"1m", |
||||
"5m", |
||||
"15m", |
||||
"30m", |
||||
"1h", |
||||
"2h", |
||||
"1d" |
||||
] |
||||
}, |
||||
"timezone": "", |
||||
"title": "Grafana Overview", |
||||
"uid": "6be0s85Mk", |
||||
"version": 4 |
||||
} |
@ -0,0 +1,15 @@ |
||||
{ |
||||
grafanaDashboards: { |
||||
'grafana-overview.json': (import 'dashboards/grafana-overview.json'), |
||||
}, |
||||
|
||||
// Helper function to ensure that we don't override other rules, by forcing |
||||
// the patching of the groups list, and not the overall rules object. |
||||
local importRules(rules) = { |
||||
groups+: std.native('parseYaml')(rules)[0].groups, |
||||
}, |
||||
|
||||
prometheusRules+: importRules(importstr 'rules/rules.yaml'), |
||||
|
||||
prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'), |
||||
} |
@ -0,0 +1,7 @@ |
||||
groups: |
||||
- name: grafana_rules |
||||
rules: |
||||
# Record the 5m rate of all http requests per namespace, job, handler and statuscode; consumers filter by handler and statuscode as needed |
||||
- record: namespace_job_handler_statuscode:http_request_total:rate5m |
||||
expr: | |
||||
sum by (namespace, job, handler, statuscode) (rate(http_request_total[5m])) |
@ -0,0 +1,7 @@ |
||||
#!/bin/bash |
||||
set -eo pipefail |
||||
|
||||
cd grafana-mixin |
||||
go install github.com/monitoring-mixins/mixtool/cmd/mixtool |
||||
go install github.com/google/go-jsonnet/cmd/jsonnetfmt |
||||
make lint build |
Loading…
Reference in new issue