grafana-mixin: Fix `GrafanaRequestsFailing` alert (#43116)
Due to PromQL's label matching, both sides of the division will have the same series; that means that, whenever there's a 5xx error, both sides will have the same value and the division will be `1`. I believe the idea was to get the ratio of 5xx requests compared with all status codes, and to do that, we need to aggregate the `status_code` dimension away.
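A minimal sketch of the matching behaviour (here `rate5m` is a hypothetical shorthand for the recording rule used below, with the handler filters omitted):

    # Old form: the right-hand side has no status_code filter, but one-to-one
    # matching only pairs series whose label sets are identical, so each 5xx
    # series is divided by itself and the ratio is always 1 (100% after scaling).
    rate5m{status_code=~"5.."} / rate5m

    # Fixed form: aggregate status_code away on the right and ignore it on the
    # left when matching, so 5xx traffic is compared against all status codes.
    rate5m{status_code=~"5.."} / ignoring (status_code) sum without (status_code) (rate5m)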
parent 4dc63698ac
commit 18fdb89554
@@ -0,0 +1,31 @@
+{
+  _config+:: {
+    grafanaRequestsFailingThresholdPercent: 50,
+  },
+
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'GrafanaAlerts',
+        rules: [
+          {
+            alert: 'GrafanaRequestsFailing',
+            expr: |||
+              100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
+              / ignoring (status_code)
+              sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
+              > %(grafanaRequestsFailingThresholdPercent)s
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors',
+            },
+            'for': '5m',
+          },
+        ],
+      },
+    ],
+  },
+}
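The `%(grafanaRequestsFailingThresholdPercent)s` placeholder is filled from `_config`, so the threshold can be tuned without editing the alert itself. A minimal sketch of such an override, assuming the mixin is consumed through its top-level `mixin.libsonnet`:

    // Hypothetical consumer snippet: raise the alerting threshold to 75%.
    (import 'mixin.libsonnet') + {
      _config+:: {
        grafanaRequestsFailingThresholdPercent: 75,
      },
    }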
@@ -1,14 +0,0 @@
-groups:
-  - name: GrafanaAlerts
-    rules:
-      - alert: GrafanaRequestsFailing
-        for: 5m
-        expr: |
-          100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
-          /
-          namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}
-          > 0.5
-        labels:
-          severity: 'warning'
-        annotations:
-          message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors"
@@ -0,0 +1,5 @@
+{
+  grafanaDashboards+:: {
+    'grafana-overview.json': (import 'grafana-overview.json'),
+  },
+}
@@ -1,15 +1,3 @@
-{
-  grafanaDashboards: {
-    'grafana-overview.json': (import 'dashboards/grafana-overview.json'),
-  },
-
-  // Helper function to ensure that we don't override other rules, by forcing
-  // the patching of the groups list, and not the overall rules object.
-  local importRules(rules) = {
-    groups+: std.native('parseYaml')(rules)[0].groups,
-  },
-
-  prometheusRules+: importRules(importstr 'rules/rules.yaml'),
-
-  prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'),
-}
+(import 'alerts/alerts.libsonnet') +
+(import 'dashboards/dashboards.libsonnet') +
+(import 'rules/rules.libsonnet')
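With the YAML folded into libsonnet, the top-level mixin is now just the sum of three objects and no longer needs `std.native('parseYaml')`. A quick, hypothetical way to check that the composed object still exposes all three fields (evaluated from the mixin directory so the imports resolve):

    // Hypothetical smoke test for the restructured mixin.
    local mixin = import 'mixin.libsonnet';
    {
      alerts: mixin.prometheusAlerts,
      rules: mixin.prometheusRules,
      dashboards: mixin.grafanaDashboards,
    }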
@@ -0,0 +1,17 @@
+{
+  prometheusRules+:: {
+    groups+: [
+      {
+        name: 'grafana_rules',
+        rules: [
+          {
+            record: 'namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m',
+            expr: |||
+              sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
+            |||,
+          },
+        ],
+      },
+    ],
+  },
+}
@@ -1,7 +0,0 @@
-groups:
-  - name: grafana_rules
-    rules:
-      # Record error rate of http requests excluding dataproxy, /ds/query and /tsdb/query requests
-      - record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
-        expr: |
-          sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))