From f60dc4441f39e8c06c730f20b9c86fe6bb76e949 Mon Sep 17 00:00:00 2001
From: Alex Moreno
Date: Thu, 23 Feb 2023 12:38:27 +0100
Subject: [PATCH] Alerting: Add status label to GroupRules metric (#63454)

* Add status label to GroupRules metric

* Add state (active and paused) label to GroupRules

* Add active/paused metrics tests
---
 pkg/services/ngalert/metrics/scheduler.go    |   9 +-
 pkg/services/ngalert/schedule/schedule.go    |  13 +-
 .../ngalert/schedule/schedule_unit_test.go   | 115 ++++++++++++++++--
 3 files changed, 120 insertions(+), 17 deletions(-)

diff --git a/pkg/services/ngalert/metrics/scheduler.go b/pkg/services/ngalert/metrics/scheduler.go
index a82e6266b11..06531db8c69 100644
--- a/pkg/services/ngalert/metrics/scheduler.go
+++ b/pkg/services/ngalert/metrics/scheduler.go
@@ -7,6 +7,11 @@ import (
 	"github.com/grafana/grafana/pkg/util/ticker"
 )
 
+const (
+	AlertRuleActiveLabelValue = "active"
+	AlertRulePausedLabelValue = "paused"
+)
+
 type Scheduler struct {
 	Registerer    prometheus.Registerer
 	BehindSeconds prometheus.Gauge
@@ -69,9 +74,9 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 				Namespace: Namespace,
 				Subsystem: Subsystem,
 				Name:      "rule_group_rules",
-				Help:      "The number of rules.",
+				Help:      "The number of alert rules that are scheduled, both active and paused.",
 			},
-			[]string{"org"},
+			[]string{"org", "state"},
 		),
 		SchedulePeriodicDuration: promauto.With(r).NewHistogram(
 			prometheus.HistogramOpts{
diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go
index 1361de96ad2..fc2a8ad2dca 100644
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -209,12 +209,19 @@ type readyToRunItem struct {
 }
 
 func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
-	orgs := make(map[int64]int64)
+	orgs := make(map[int64]int64, len(alertRules))
+	orgsPaused := make(map[int64]int64, len(alertRules))
 	for _, rule := range alertRules {
 		orgs[rule.OrgID]++
+		if rule.IsPaused {
+			orgsPaused[rule.OrgID]++
+		}
 	}
-	for org, numRules := range orgs {
-		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(org)).Set(float64(numRules))
+
+	for orgID, numRules := range orgs {
+		numRulesPaused := orgsPaused[orgID]
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused))
+		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused))
 	}
 
 	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go
index 44b19f02cbe..abd94a4ae55 100644
--- a/pkg/services/ngalert/schedule/schedule_unit_test.go
+++ b/pkg/services/ngalert/schedule/schedule_unit_test.go
@@ -114,11 +114,12 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
 
-	t.Run("after 1st tick rule metrics should report one rule", func(t *testing.T) {
+	t.Run("after 1st tick rule metrics should report one active alert rule", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 	# TYPE grafana_alerting_rule_group_rules gauge
-	grafana_alerting_rule_group_rules{org="%[1]d"} 1
+	grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+	grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 	`, alertRule1.OrgID)
 
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
 		require.NoError(t, err)
 	})
@@ -140,11 +141,12 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
 
-	t.Run("after 2nd tick rule metrics should report two rules", func(t *testing.T) {
+	t.Run("after 2nd tick rule metrics should report two active alert rules", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 	# TYPE grafana_alerting_rule_group_rules gauge
-	grafana_alerting_rule_group_rules{org="%[1]d"} 2
+	grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
+	grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 	`, alertRule1.OrgID)
 
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
 		require.NoError(t, err)
 	})
@@ -180,7 +182,95 @@ func TestProcessTicks(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
 	})
 
-	t.Run("on 5th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
+	t.Run("on 5th tick an alert rule is paused (it still enters evaluation but is skipped early)", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+
+		alertRule1.IsPaused = true
+
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+
+		require.Len(t, scheduled, 1)
+		require.Equal(t, alertRule1, scheduled[0].rule)
+		require.Equal(t, tick, scheduled[0].scheduledAt)
+		require.Emptyf(t, stopped, "No rules are expected to be stopped")
+
+		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
+	})
+
+	t.Run("after 5th tick rule metrics should report one active and one paused alert rule", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+	# TYPE grafana_alerting_rule_group_rules gauge
+	grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+	grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 1
+	`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
+	t.Run("on 6th tick all alert rules are paused (they still enter evaluation but are skipped early)", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+
+		alertRule2.IsPaused = true
+
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+
+		require.Len(t, scheduled, 2)
+		var keys []models.AlertRuleKey
+		for _, item := range scheduled {
+			keys = append(keys, item.rule.GetKey())
+			require.Equal(t, tick, item.scheduledAt)
+		}
+		require.Contains(t, keys, alertRule1.GetKey())
+		require.Contains(t, keys, alertRule2.GetKey())
+
+		require.Emptyf(t, stopped, "No rules are expected to be stopped")
+
+		assertEvalRun(t, evalAppliedCh, tick, keys...)
+	})
+
+	t.Run("after 6th tick rule metrics should report two paused alert rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+	# TYPE grafana_alerting_rule_group_rules gauge
+	grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 0
+	grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 2
+	`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
+	t.Run("on 7th tick unpause all alert rules", func(t *testing.T) {
+		tick = tick.Add(cfg.BaseInterval)
+
+		alertRule1.IsPaused = false
+		alertRule2.IsPaused = false
+
+		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
+
+		require.Len(t, scheduled, 1)
+		require.Equal(t, alertRule1, scheduled[0].rule)
+		require.Equal(t, tick, scheduled[0].scheduledAt)
+		require.Emptyf(t, stopped, "No rules are expected to be stopped")
+
+		assertEvalRun(t, evalAppliedCh, tick, alertRule1.GetKey())
+	})
+
+	t.Run("after 7th tick rule metrics should report two active alert rules", func(t *testing.T) {
+		expectedMetric := fmt.Sprintf(
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
+	# TYPE grafana_alerting_rule_group_rules gauge
+	grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 2
+	grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
+	`, alertRule1.OrgID)
+
+		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
+		require.NoError(t, err)
+	})
+
+	t.Run("on 8th tick deleted rule should not be evaluated but stopped", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
 
 		ruleStore.DeleteRule(alertRule1)
@@ -195,18 +285,19 @@ func TestProcessTicks(t *testing.T) {
 		require...
 		assertStopRun(t, stopAppliedCh, alertRule1.GetKey())
 	})
 
-	t.Run("after 5th tick rule metrics should report one rules", func(t *testing.T) {
+	t.Run("after 8th tick rule metrics should report one active alert rule", func(t *testing.T) {
 		expectedMetric := fmt.Sprintf(
-			`# HELP grafana_alerting_rule_group_rules The number of rules.
+			`# HELP grafana_alerting_rule_group_rules The number of alert rules that are scheduled, both active and paused.
 	# TYPE grafana_alerting_rule_group_rules gauge
-	grafana_alerting_rule_group_rules{org="%[1]d"} 1
+	grafana_alerting_rule_group_rules{org="%[1]d",state="active"} 1
+	grafana_alerting_rule_group_rules{org="%[1]d",state="paused"} 0
 	`, alertRule1.OrgID)
 
 		err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_group_rules")
 		require.NoError(t, err)
 	})
 
-	t.Run("on 6th tick one alert rule should be evaluated", func(t *testing.T) {
+	t.Run("on 9th tick one alert rule should be evaluated", func(t *testing.T) {
 		tick = tick.Add(cfg.BaseInterval)
 
 		scheduled, stopped := sched.processTick(ctx, dispatcherGroup, tick)
 
 		require...
 		assertEvalRun(t, evalAppliedCh, tick, alertRule2.GetKey())
 	})
 
-	t.Run("on 7th tick a new alert rule should be evaluated", func(t *testing.T) {
+	t.Run("on 10th tick a new alert rule should be evaluated", func(t *testing.T) {
 		// create alert rule with one base interval
 		alertRule3 := models.AlertRuleGen(models.WithOrgID(mainOrgID), models.WithInterval(cfg.BaseInterval), models.WithTitle("rule-3"))()
 		ruleStore.PutRule(ctx, alertRule3)
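
---

The heart of the patch is the counting pass in updateRulesMetrics: one walk over the scheduled rules tallies a per-org total and a per-org paused count, and the "active" gauge is derived as total minus paused. Below is a minimal standalone sketch of that bookkeeping against a bare prometheus/client_golang GaugeVec. It is not Grafana code: the rule struct, label constants, and main packaging are simplified stand-ins for ngmodels.AlertRule and the metrics.Scheduler wiring.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// rule is a stand-in for ngmodels.AlertRule, reduced to the two fields
// that updateRulesMetrics reads.
type rule struct {
	OrgID    int64
	IsPaused bool
}

// Stand-ins for metrics.AlertRuleActiveLabelValue / AlertRulePausedLabelValue.
const (
	activeLabel = "active"
	pausedLabel = "paused"
)

func main() {
	// Mirrors the patched GaugeVec: full name grafana_alerting_rule_group_rules,
	// now keyed by "org" and the new "state" label.
	groupRules := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_group_rules",
		Help:      "The number of alert rules that are scheduled, both active and paused.",
	}, []string{"org", "state"})
	prometheus.MustRegister(groupRules)

	rules := []rule{
		{OrgID: 1, IsPaused: false},
		{OrgID: 1, IsPaused: true},
		{OrgID: 2, IsPaused: false},
	}

	// One pass: count all rules per org and, separately, the paused ones.
	orgs := make(map[int64]int64, len(rules))
	orgsPaused := make(map[int64]int64, len(rules))
	for _, r := range rules {
		orgs[r.OrgID]++
		if r.IsPaused {
			orgsPaused[r.OrgID]++
		}
	}

	// Active is derived as total minus paused; both series are set on every
	// pass for every org that still has rules.
	for orgID, numRules := range orgs {
		numPaused := orgsPaused[orgID]
		groupRules.WithLabelValues(fmt.Sprint(orgID), activeLabel).Set(float64(numRules - numPaused))
		groupRules.WithLabelValues(fmt.Sprint(orgID), pausedLabel).Set(float64(numPaused))
	}

	// Resulting series:
	//   grafana_alerting_rule_group_rules{org="1",state="active"} 1
	//   grafana_alerting_rule_group_rules{org="1",state="paused"} 1
	//   grafana_alerting_rule_group_rules{org="2",state="active"} 1
	//   grafana_alerting_rule_group_rules{org="2",state="paused"} 0
}

Because both label values are set unconditionally for each org with scheduled rules, every org exports an active/paused pair on each pass; that is why the test expectations above assert state="paused" 0 explicitly rather than the absence of the series.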