From 199996cbf9459f1638c5c7364ae6ee59d21dc31c Mon Sep 17 00:00:00 2001 From: Yuriy Tseretyan Date: Wed, 21 Sep 2022 13:24:47 -0400 Subject: [PATCH] Alerting: Resolve stale state + add state reason to notifications (#49352) * adds a new reserved annotation `grafana_state_reason` * explicitly resolve stale states --- pkg/services/ngalert/models/alert_rule.go | 7 ++ pkg/services/ngalert/schedule/compat.go | 7 ++ pkg/services/ngalert/schedule/compat_test.go | 7 ++ pkg/services/ngalert/state/manager.go | 19 +++- pkg/services/ngalert/state/manager_test.go | 98 ++++++++++++++++++++ 5 files changed, 133 insertions(+), 5 deletions(-) diff --git a/pkg/services/ngalert/models/alert_rule.go b/pkg/services/ngalert/models/alert_rule.go index a25b6be156d..1f545cdb9ad 100644 --- a/pkg/services/ngalert/models/alert_rule.go +++ b/pkg/services/ngalert/models/alert_rule.go @@ -98,6 +98,13 @@ const ( // FolderTitleLabel is the label that will contain the title of an alert's folder/namespace. FolderTitleLabel = GrafanaReservedLabelPrefix + "folder" + + // StateReasonAnnotation is the name of the annotation that explains the difference between evaluation state and alert state (i.e. changing state when NoData or Error). + StateReasonAnnotation = GrafanaReservedLabelPrefix + "state_reason" +) + +var ( + StateReasonMissingSeries = "MissingSeries" ) var ( diff --git a/pkg/services/ngalert/schedule/compat.go b/pkg/services/ngalert/schedule/compat.go index 42870988b20..4aa597a9c28 100644 --- a/pkg/services/ngalert/schedule/compat.go +++ b/pkg/services/ngalert/schedule/compat.go @@ -43,6 +43,10 @@ func stateToPostableAlert(alertState *state.State, appURL *url.URL) *models.Post nA[ngModels.ImageTokenAnnotation] = alertState.Image.Token } + if alertState.StateReason != "" { + nA[ngModels.StateReasonAnnotation] = alertState.StateReason + } + var urlStr string if uid := nL[ngModels.RuleUIDLabel]; len(uid) > 0 && appURL != nil { u := *appURL @@ -124,6 +128,9 @@ func FromAlertStateToPostableAlerts(firingStates []*state.State, stateManager *s } alert := stateToPostableAlert(alertState, appURL) alerts.PostableAlerts = append(alerts.PostableAlerts, *alert) + if alertState.StateReason == ngModels.StateReasonMissingSeries { // do not put stale state back to state manager + continue + } alertState.LastSentAt = ts sentAlerts = append(sentAlerts, alertState) } diff --git a/pkg/services/ngalert/schedule/compat_test.go b/pkg/services/ngalert/schedule/compat_test.go index ff7a2218610..51d101e2f08 100644 --- a/pkg/services/ngalert/schedule/compat_test.go +++ b/pkg/services/ngalert/schedule/compat_test.go @@ -135,6 +135,13 @@ func Test_stateToPostableAlert(t *testing.T) { }) }) + t.Run("should add state reason annotation if not empty", func(t *testing.T) { + alertState := randomState(tc.state) + alertState.StateReason = "TEST_STATE_REASON" + result := stateToPostableAlert(alertState, appURL) + require.Equal(t, alertState.StateReason, result.Annotations[ngModels.StateReasonAnnotation]) + }) + switch tc.state { case eval.NoData: t.Run("should keep existing labels and change name", func(t *testing.T) { diff --git a/pkg/services/ngalert/state/manager.go b/pkg/services/ngalert/state/manager.go index f7fba240804..b36ba1cc4b0 100644 --- a/pkg/services/ngalert/state/manager.go +++ b/pkg/services/ngalert/state/manager.go @@ -182,7 +182,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time states = append(states, s) processedResults[s.CacheId] = s } - st.staleResultsHandler(ctx, evaluatedAt, alertRule, 
processedResults) + resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults) if len(states) > 0 { logger.Debug("saving new states to the database", "count", len(states)) for _, state := range states { @@ -191,7 +191,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time } } } - return states + return append(states, resolvedStates...) } // Maybe take a screenshot. Do it if: @@ -404,7 +404,8 @@ func (st *Manager) annotateState(ctx context.Context, alertRule *ngModels.AlertR } } -func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) { +func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) []*State { + var resolvedStates []*State allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID) for _, s := range allStates { _, ok := states[s.CacheId] @@ -422,12 +423,20 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim } if s.State == eval.Alerting { + previousState := InstanceStateAndReason{State: s.State, Reason: s.StateReason} + s.State = eval.Normal + s.StateReason = ngModels.StateReasonMissingSeries + s.EndsAt = evaluatedAt + s.Resolved = true st.annotateState(ctx, alertRule, s.Labels, evaluatedAt, - InstanceStateAndReason{State: eval.Normal, Reason: ""}, - InstanceStateAndReason{State: s.State, Reason: s.StateReason}) + InstanceStateAndReason{State: eval.Normal, Reason: s.StateReason}, + previousState, + ) + resolvedStates = append(resolvedStates, s) } } } + return resolvedStates } func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool { diff --git a/pkg/services/ngalert/state/manager_test.go b/pkg/services/ngalert/state/manager_test.go index 1382c72ea38..7d6799d5732 100644 --- a/pkg/services/ngalert/state/manager_test.go +++ b/pkg/services/ngalert/state/manager_test.go @@ -2156,3 +2156,101 @@ func TestStaleResultsHandler(t *testing.T) { assert.Equal(t, tc.finalStateCount, len(existingStatesForRule)) } } + +func TestStaleResults(t *testing.T) { + getCacheID := func(t *testing.T, rule *models.AlertRule, result eval.Result) string { + t.Helper() + labels := data.Labels{} + for key, value := range rule.Labels { + labels[key] = value + } + for key, value := range result.Instance { + labels[key] = value + } + lbls := models.InstanceLabels(labels) + key, err := lbls.StringKey() + require.NoError(t, err) + return key + } + + checkExpectedStates := func(t *testing.T, actual []*state.State, expected map[string]struct{}) { + t.Helper() + require.Len(t, actual, len(expected)) + for _, currentState := range actual { + _, ok := expected[currentState.CacheId] + require.Truef(t, ok, "State %s is not expected. 
States: %v", currentState.CacheId, expected) + } + } + + t.Run("should mark missing states as stale", func(t *testing.T) { + // init + ctx := context.Background() + _, dbstore := tests.SetupTestEnv(t, 1) + clk := clock.NewMock() + clk.Set(time.Now()) + + st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clk) + + orgID := rand.Int63() + rule := tests.CreateTestAlertRule(t, ctx, dbstore, 10, orgID) + + initResults := eval.Results{ + eval.Result{ + Instance: data.Labels{"test1": "testValue1"}, + State: eval.Alerting, + EvaluatedAt: clk.Now(), + }, + eval.Result{ + Instance: data.Labels{"test1": "testValue2"}, + State: eval.Alerting, + EvaluatedAt: clk.Now(), + }, + eval.Result{ + Instance: data.Labels{"test1": "testValue3"}, + State: eval.Normal, + EvaluatedAt: clk.Now(), + }, + } + + initStates := map[string]struct{}{ + getCacheID(t, rule, initResults[0]): {}, + getCacheID(t, rule, initResults[1]): {}, + getCacheID(t, rule, initResults[2]): {}, + } + + // Init + processed := st.ProcessEvalResults(ctx, clk.Now(), rule, initResults, nil) + checkExpectedStates(t, processed, initStates) + currentStates := st.GetStatesForRuleUID(orgID, rule.UID) + checkExpectedStates(t, currentStates, initStates) + + staleDuration := 2 * time.Duration(rule.IntervalSeconds) * time.Second + clk.Add(staleDuration) + results := eval.Results{ + eval.Result{ + Instance: data.Labels{"test1": "testValue1"}, + State: eval.Alerting, + EvaluatedAt: clk.Now(), + }, + } + clk.Add(time.Nanosecond) // we use time now when calculate stale states. Evaluation tick and real time are not the same. usually, difference is way greater than nanosecond. + expectedStaleReturned := getCacheID(t, rule, initResults[1]) + processed = st.ProcessEvalResults(ctx, clk.Now(), rule, results, nil) + checkExpectedStates(t, processed, map[string]struct{}{ + getCacheID(t, rule, results[0]): {}, + expectedStaleReturned: {}, + }) + for _, s := range processed { + if s.CacheId == expectedStaleReturned { + assert.Truef(t, s.Resolved, "Returned stale state should have Resolved set to true") + assert.Equal(t, eval.Normal, s.State) + assert.Equal(t, models.StateReasonMissingSeries, s.StateReason) + break + } + } + currentStates = st.GetStatesForRuleUID(orgID, rule.UID) + checkExpectedStates(t, currentStates, map[string]struct{}{ + getCacheID(t, rule, results[0]): {}, + }) + }) +}