From d44728f4e5e9fd9fd73711f1b92f3c1aad66efbb Mon Sep 17 00:00:00 2001 From: Alexander Akhmetov Date: Wed, 5 Mar 2025 14:02:28 +0100 Subject: [PATCH] Alerting: Metric to count imported from Prometheus rules (#100847) --- .../ngalert/api/api_convert_prometheus.go | 6 +- .../api/api_convert_prometheus_test.go | 5 +- pkg/services/ngalert/metrics/scheduler.go | 10 +++ pkg/services/ngalert/models/alert_rule.go | 17 +++-- .../ngalert/models/alert_rule_test.go | 67 +++++++++++++++++++ pkg/services/ngalert/schedule/metrics.go | 10 +++ .../ngalert/schedule/schedule_unit_test.go | 57 ++++++++++++++++ pkg/services/ngalert/store/alert_rule.go | 5 +- pkg/services/ngalert/tests/fakes/rules.go | 6 +- 9 files changed, 161 insertions(+), 22 deletions(-) diff --git a/pkg/services/ngalert/api/api_convert_prometheus.go b/pkg/services/ngalert/api/api_convert_prometheus.go index 9f38b47db17..56deec061f5 100644 --- a/pkg/services/ngalert/api/api_convert_prometheus.go +++ b/pkg/services/ngalert/api/api_convert_prometheus.go @@ -457,9 +457,9 @@ func grafanaRuleGroupToPrometheus(group string, rules []models.AlertRule) (apimo } for i, rule := range rules { - promDefinition := rule.PrometheusRuleDefinition() - if promDefinition == "" { - return apimodels.PrometheusRuleGroup{}, fmt.Errorf("failed to get the Prometheus definition of the rule with UID %s", rule.UID) + promDefinition, err := rule.PrometheusRuleDefinition() + if err != nil { + return apimodels.PrometheusRuleGroup{}, fmt.Errorf("failed to get the Prometheus definition of the rule with UID %s: %w", rule.UID, err) } var r apimodels.PrometheusRule if err := yaml.Unmarshal([]byte(promDefinition), &r); err != nil { diff --git a/pkg/services/ngalert/api/api_convert_prometheus_test.go b/pkg/services/ngalert/api/api_convert_prometheus_test.go index 5fd920438d0..44d1973f519 100644 --- a/pkg/services/ngalert/api/api_convert_prometheus_test.go +++ b/pkg/services/ngalert/api/api_convert_prometheus_test.go @@ -115,7 +115,10 @@ func TestRouteConvertPrometheusPostRuleGroup(t *testing.T) { require.Equal(t, fmt.Sprintf("[%s] %s", simpleGroup.Name, simpleGroup.Rules[0].Alert), remaining[0].Title) promRuleYAML, err := yaml.Marshal(simpleGroup.Rules[0]) require.NoError(t, err) - require.Equal(t, string(promRuleYAML), remaining[0].PrometheusRuleDefinition()) + + promDefinition, err := remaining[0].PrometheusRuleDefinition() + require.NoError(t, err) + require.Equal(t, string(promRuleYAML), promDefinition) }) t.Run("should fail to replace a provisioned rule group", func(t *testing.T) { diff --git a/pkg/services/ngalert/metrics/scheduler.go b/pkg/services/ngalert/metrics/scheduler.go index 76b0dbf22a1..fb28e273ebb 100644 --- a/pkg/services/ngalert/metrics/scheduler.go +++ b/pkg/services/ngalert/metrics/scheduler.go @@ -32,6 +32,7 @@ type Scheduler struct { Ticker *ticker.Metrics EvaluationMissed *prometheus.CounterVec SimplifiedEditorRules *prometheus.GaugeVec + PrometheusImportedRules *prometheus.GaugeVec } func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler { @@ -192,5 +193,14 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler { }, []string{"org", "setting"}, ), + PrometheusImportedRules: promauto.With(r).NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: Namespace, + Subsystem: Subsystem, + Name: "prometheus_imported_rules", + Help: "The number of rules imported from a Prometheus-compatible source.", + }, + []string{"org"}, + ), } } diff --git a/pkg/services/ngalert/models/alert_rule.go b/pkg/services/ngalert/models/alert_rule.go index 743d2295389..2536c900b1c 100644 --- a/pkg/services/ngalert/models/alert_rule.go +++ b/pkg/services/ngalert/models/alert_rule.go @@ -397,19 +397,18 @@ func WithoutInternalLabels() LabelOption { } func (alertRule *AlertRule) ImportedFromPrometheus() bool { - if alertRule.Metadata.PrometheusStyleRule == nil { - return false - } - - return alertRule.Metadata.PrometheusStyleRule.OriginalRuleDefinition != "" + _, err := alertRule.PrometheusRuleDefinition() + return err == nil } -func (alertRule *AlertRule) PrometheusRuleDefinition() string { - if !alertRule.ImportedFromPrometheus() { - return "" +func (alertRule *AlertRule) PrometheusRuleDefinition() (string, error) { + if alertRule.Metadata.PrometheusStyleRule != nil { + if alertRule.Metadata.PrometheusStyleRule.OriginalRuleDefinition != "" { + return alertRule.Metadata.PrometheusStyleRule.OriginalRuleDefinition, nil + } } - return alertRule.Metadata.PrometheusStyleRule.OriginalRuleDefinition + return "", fmt.Errorf("prometheus rule definition is missing") } // GetLabels returns the labels specified as part of the alert rule. diff --git a/pkg/services/ngalert/models/alert_rule_test.go b/pkg/services/ngalert/models/alert_rule_test.go index 43d5092842a..b8e47df7bbf 100644 --- a/pkg/services/ngalert/models/alert_rule_test.go +++ b/pkg/services/ngalert/models/alert_rule_test.go @@ -1017,3 +1017,70 @@ func TestGeneratorFillsAllFields(t *testing.T) { require.FailNow(t, "AlertRule generator does not populate fields", "skipped fields: %v", maps.Keys(fields)) } + +func TestAlertRule_PrometheusRuleDefinition(t *testing.T) { + tests := []struct { + name string + rule AlertRule + expectedResult string + expectedErrorMsg string + }{ + { + name: "rule with prometheus definition", + rule: AlertRule{ + Metadata: AlertRuleMetadata{ + PrometheusStyleRule: &PrometheusStyleRule{ + OriginalRuleDefinition: "groups:\n- name: example\n rules:\n - alert: HighRequestLatency\n expr: request_latency_seconds{job=\"myjob\"} > 0.5\n for: 10m\n labels:\n severity: page\n annotations:\n summary: High request latency", + }, + }, + }, + expectedResult: "groups:\n- name: example\n rules:\n - alert: HighRequestLatency\n expr: request_latency_seconds{job=\"myjob\"} > 0.5\n for: 10m\n labels:\n severity: page\n annotations:\n summary: High request latency", + expectedErrorMsg: "", + }, + { + name: "rule with empty prometheus definition", + rule: AlertRule{ + Metadata: AlertRuleMetadata{ + PrometheusStyleRule: &PrometheusStyleRule{ + OriginalRuleDefinition: "", + }, + }, + }, + expectedResult: "", + expectedErrorMsg: "prometheus rule definition is missing", + }, + { + name: "rule with nil prometheus style rule", + rule: AlertRule{ + Metadata: AlertRuleMetadata{ + PrometheusStyleRule: nil, + }, + }, + expectedResult: "", + expectedErrorMsg: "prometheus rule definition is missing", + }, + { + name: "rule with empty metadata", + rule: AlertRule{}, + expectedResult: "", + expectedErrorMsg: "prometheus rule definition is missing", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := tt.rule.PrometheusRuleDefinition() + isPrometheusRule := tt.rule.ImportedFromPrometheus() + + if tt.expectedErrorMsg != "" { + require.Error(t, err) + require.Equal(t, tt.expectedErrorMsg, err.Error()) + require.False(t, isPrometheusRule) + } else { + require.NoError(t, err) + require.Equal(t, tt.expectedResult, result) + require.True(t, isPrometheusRule) + } + }) + } +} diff --git a/pkg/services/ngalert/schedule/metrics.go b/pkg/services/ngalert/schedule/metrics.go index 1eb50efaa71..440403e3492 100644 --- a/pkg/services/ngalert/schedule/metrics.go +++ b/pkg/services/ngalert/schedule/metrics.go @@ -46,6 +46,8 @@ func (sch *schedule) updateRulesMetrics(alertRules []*models.AlertRule) { orgsNfSettings := make(map[int64]int64) // gauge for groups per org groupsPerOrg := make(map[int64]map[string]struct{}) + // gauge for rules imported from Prometheus per org + orgsRulesPrometheusImported := make(map[int64]int64) simplifiedEditorSettingsPerOrg := make(map[int64]map[string]int64) // orgID -> setting -> count @@ -86,6 +88,10 @@ func (sch *schedule) updateRulesMetrics(alertRules []*models.AlertRule) { } } + if rule.ImportedFromPrometheus() { + orgsRulesPrometheusImported[rule.OrgID]++ + } + // Count groups per org orgGroups, ok := groupsPerOrg[rule.OrgID] if !ok { @@ -100,6 +106,7 @@ func (sch *schedule) updateRulesMetrics(alertRules []*models.AlertRule) { sch.metrics.SimpleNotificationRules.Reset() sch.metrics.Groups.Reset() sch.metrics.SimplifiedEditorRules.Reset() + sch.metrics.PrometheusImportedRules.Reset() // Set metrics for key, count := range buckets { @@ -111,6 +118,9 @@ func (sch *schedule) updateRulesMetrics(alertRules []*models.AlertRule) { for orgID, groups := range groupsPerOrg { sch.metrics.Groups.WithLabelValues(fmt.Sprint(orgID)).Set(float64(len(groups))) } + for orgID, count := range orgsRulesPrometheusImported { + sch.metrics.PrometheusImportedRules.WithLabelValues(fmt.Sprint(orgID)).Set(float64(count)) + } for orgID, settings := range simplifiedEditorSettingsPerOrg { for setting, count := range settings { sch.metrics.SimplifiedEditorRules.WithLabelValues(fmt.Sprint(orgID), setting).Set(float64(count)) diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go index 390c966786f..825f2f2342d 100644 --- a/pkg/services/ngalert/schedule/schedule_unit_test.go +++ b/pkg/services/ngalert/schedule/schedule_unit_test.go @@ -562,6 +562,7 @@ func TestSchedule_updateRulesMetrics(t *testing.T) { sch := setupScheduler(t, ruleStore, nil, reg, nil, nil, nil) ctx := context.Background() const firstOrgID int64 = 1 + const secondOrgID int64 = 2 t.Run("grafana_alerting_rule_group_rules metric should reflect the current state", func(t *testing.T) { // Without any rules there are no metrics @@ -640,6 +641,62 @@ func TestSchedule_updateRulesMetrics(t *testing.T) { }) }) + t.Run("prometheus_imported_rules metric should reflect the current state", func(t *testing.T) { + // Without any imported rules there are no metrics + t.Run("it should not show metrics", func(t *testing.T) { + sch.updateRulesMetrics([]*models.AlertRule{}) + + expectedMetric := "" + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_prometheus_imported_rules") + require.NoError(t, err) + }) + + alertRule1 := models.RuleGen.With( + models.RuleGen.WithOrgID(firstOrgID), + models.RuleGen.WithPrometheusOriginalRuleDefinition("1"), + ).GenerateRef() + + t.Run("it should show one imported rule in a single org", func(t *testing.T) { + sch.updateRulesMetrics([]*models.AlertRule{alertRule1}) + + expectedMetric := fmt.Sprintf( + `# HELP grafana_alerting_prometheus_imported_rules The number of rules imported from a Prometheus-compatible source. + # TYPE grafana_alerting_prometheus_imported_rules gauge + grafana_alerting_prometheus_imported_rules{org="%[1]d"} 1 + `, alertRule1.OrgID) + + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_prometheus_imported_rules") + require.NoError(t, err) + }) + + alertRule2 := models.RuleGen.With( + models.RuleGen.WithOrgID(secondOrgID), + models.RuleGen.WithPrometheusOriginalRuleDefinition("1"), + ).GenerateRef() + + t.Run("it should show two imported rules in two orgs", func(t *testing.T) { + sch.updateRulesMetrics([]*models.AlertRule{alertRule1, alertRule2}) + + expectedMetric := fmt.Sprintf( + `# HELP grafana_alerting_prometheus_imported_rules The number of rules imported from a Prometheus-compatible source. + # TYPE grafana_alerting_prometheus_imported_rules gauge + grafana_alerting_prometheus_imported_rules{org="%[1]d"} 1 + grafana_alerting_prometheus_imported_rules{org="%[2]d"} 1 + `, alertRule1.OrgID, alertRule2.OrgID) + + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_prometheus_imported_rules") + require.NoError(t, err) + }) + + t.Run("after removing all rules it should not show any metrics", func(t *testing.T) { + sch.updateRulesMetrics([]*models.AlertRule{}) + + expectedMetric := "" + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_prometheus_imported_rules") + require.NoError(t, err) + }) + }) + t.Run("rule_groups metric should reflect the current state", func(t *testing.T) { const firstOrgID int64 = 1 const secondOrgID int64 = 2 diff --git a/pkg/services/ngalert/store/alert_rule.go b/pkg/services/ngalert/store/alert_rule.go index 7a97d7f02ca..089543d51b2 100644 --- a/pkg/services/ngalert/store/alert_rule.go +++ b/pkg/services/ngalert/store/alert_rule.go @@ -607,10 +607,7 @@ func (st DBstore) ListAlertRules(ctx context.Context, query *ngmodels.ListAlertR } } if query.ImportedPrometheusRule != nil { // remove false-positive hits from the result - hasOriginalRuleDefinition := converted.Metadata.PrometheusStyleRule != nil && len(converted.Metadata.PrometheusStyleRule.OriginalRuleDefinition) > 0 - if *query.ImportedPrometheusRule && !hasOriginalRuleDefinition { - continue - } else if !*query.ImportedPrometheusRule && hasOriginalRuleDefinition { + if *query.ImportedPrometheusRule != converted.ImportedFromPrometheus() { continue } } diff --git a/pkg/services/ngalert/tests/fakes/rules.go b/pkg/services/ngalert/tests/fakes/rules.go index cb90d3bfb77..993cdcf2049 100644 --- a/pkg/services/ngalert/tests/fakes/rules.go +++ b/pkg/services/ngalert/tests/fakes/rules.go @@ -215,11 +215,7 @@ func (f *RuleStore) ListAlertRules(_ context.Context, q *models.ListAlertRulesQu continue } if q.ImportedPrometheusRule != nil { - hasOriginalRuleDefinition := r.PrometheusRuleDefinition() != "" - if *q.ImportedPrometheusRule && !hasOriginalRuleDefinition { - continue - } - if !*q.ImportedPrometheusRule && hasOriginalRuleDefinition { + if *q.ImportedPrometheusRule != r.ImportedFromPrometheus() { continue } }