mirror of https://github.com/grafana/grafana
grafana-mixin: Fix `GrafanaRequestsFailing` alert (#43116)
Due to PromQL's label matching, both sides of the division will have the same series; that means that, whenever there's a 5xx error, both sides will have the same value and the division will be `1`. I believe the idea was to get the ratio of 5xx responses compared with all status codes, and to do that, we need to aggregate the `status_code` dimension away.
pull/43379/head
parent
4dc63698ac
commit
18fdb89554
@ -0,0 +1,31 @@ |
|||||||
|
{ |
||||||
|
_config+:: { |
||||||
|
grafanaRequestsFailingThresholdPercent: 50, |
||||||
|
}, |
||||||
|
|
||||||
|
prometheusAlerts+:: { |
||||||
|
groups+: [ |
||||||
|
{ |
||||||
|
name: 'GrafanaAlerts', |
||||||
|
rules: [ |
||||||
|
{ |
||||||
|
alert: 'GrafanaRequestsFailing', |
||||||
|
expr: ||| |
||||||
|
# Sum 5xx series per handler so both operands drop status_code and match one-to-one. |
||||||
|
100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}) |
||||||
|
/ |
||||||
|
sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) |
||||||
|
> %(grafanaRequestsFailingThresholdPercent)s |
||||||
|
||| % $._config, |
||||||
|
labels: { |
||||||
|
severity: 'warning', |
||||||
|
}, |
||||||
|
annotations: { |
||||||
|
message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors', |
||||||
|
}, |
||||||
|
'for': '5m', |
||||||
|
}, |
||||||
|
], |
||||||
|
}, |
||||||
|
], |
||||||
|
}, |
||||||
|
} |
@ -1,14 +0,0 @@ |
|||||||
groups: |
|
||||||
- name: GrafanaAlerts |
|
||||||
rules: |
|
||||||
- alert: GrafanaRequestsFailing |
|
||||||
for: 5m |
|
||||||
expr: | |
|
||||||
100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} |
|
||||||
/ |
|
||||||
namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"} |
|
||||||
> 0.5 |
|
||||||
labels: |
|
||||||
severity: 'warning' |
|
||||||
annotations: |
|
||||||
message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors" |
|
@ -0,0 +1,5 @@ |
|||||||
|
{ |
||||||
|
grafanaDashboards+:: { |
||||||
|
'grafana-overview.json': (import 'grafana-overview.json'), |
||||||
|
}, |
||||||
|
} |
@ -1,15 +1,3 @@ |
|||||||
{ |
(import 'alerts/alerts.libsonnet') + |
||||||
grafanaDashboards: { |
(import 'dashboards/dashboards.libsonnet') + |
||||||
'grafana-overview.json': (import 'dashboards/grafana-overview.json'), |
(import 'rules/rules.libsonnet') |
||||||
}, |
|
||||||
|
|
||||||
// Helper function to ensure that we don't override other rules, by forcing |
|
||||||
// the patching of the groups list, and not the overall rules object. |
|
||||||
local importRules(rules) = { |
|
||||||
groups+: std.native('parseYaml')(rules)[0].groups, |
|
||||||
}, |
|
||||||
|
|
||||||
prometheusRules+: importRules(importstr 'rules/rules.yaml'), |
|
||||||
|
|
||||||
prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'), |
|
||||||
} |
|
||||||
|
@ -0,0 +1,17 @@ |
|||||||
|
{ |
||||||
|
prometheusRules+:: { |
||||||
|
groups+: [ |
||||||
|
{ |
||||||
|
name: 'grafana_rules', |
||||||
|
rules: [ |
||||||
|
{ |
||||||
|
record: 'namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m', |
||||||
|
expr: ||| |
||||||
|
sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) |
||||||
|
|||, |
||||||
|
}, |
||||||
|
], |
||||||
|
}, |
||||||
|
], |
||||||
|
}, |
||||||
|
} |
@ -1,7 +0,0 @@ |
|||||||
groups: |
|
||||||
- name: grafana_rules |
|
||||||
rules: |
|
||||||
# Record error rate of http requests excluding dataproxy, /ds/query and /tsdb/query requests |
|
||||||
- record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m |
|
||||||
expr: | |
|
||||||
sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) |
|
Loading…
Reference in new issue