diff --git a/go.work.sum b/go.work.sum index c9cfa4ae15a..f29575e7e8d 100644 --- a/go.work.sum +++ b/go.work.sum @@ -692,7 +692,6 @@ github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiG github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o= github.com/couchbase/moss v0.2.0/go.mod h1:9MaHIaRuy9pvLPUJxB8sh8OrLfyDczECVL37grCIubs= github.com/cpuguy83/go-md2man v1.0.10 h1:BSKMNlYxDvnunlTymqtgONjNnaRV1sTpcovwwjF22jk= -github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/creasty/defaults v1.8.0 h1:z27FJxCAa0JKt3utc0sCImAEb+spPucmKoOdLHvHYKk= @@ -923,6 +922,7 @@ github.com/grafana/tail v0.0.0-20230510142333-77b18831edf0/go.mod h1:7t5XR+2IA8P github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79 h1:+ngKgrYPPJrOjhax5N+uePQ0Fh1Z7PheYoUI/0nzkPA= github.com/gregjones/httpcache v0.0.0-20190611155906-901d90724c79/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/go-grpc-middleware v1.3.0/go.mod h1:z0ButlSOZa5vEBq9m2m2hlwIgKw+rp3sdCBRoJY+30Y= +github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.3.0/go.mod h1:qOchhhIlmRcqk/O9uCo/puJlyo07YINaIqdZfZG3Jkc= github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1/go.mod h1:5SN9VR2LTsRFsrEC6FHgRbTWrTHu6tqPeKxEQv15giM= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I= @@ -1197,7 +1197,6 @@ github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5I github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/spf13/afero v1.10.0/go.mod h1:UBogFpq8E9Hx+xc5CNTTEpTnuHVmXDwZcZcE1eb/UhQ= github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= -github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/jwalterweatherman v1.1.0 h1:ue6voC5bR5F8YxI5S67j9i582FU4Qvo2bmqnqMYADFk= github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad h1:fiWzISvDn0Csy5H0iwgAuJGQTUpVfEMJJd4nRFXogbc= github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= @@ -1460,7 +1459,6 @@ golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= -golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= golang.org/x/oauth2 v0.15.0/go.mod h1:q48ptWNTY5XWf+JNten23lcvHpLJ0ZSxF5ttTHKVCAM= golang.org/x/oauth2 v0.19.0/go.mod h1:vYi7skDa1x015PmRRYZ7+s1cWyPgrPiSYRe4rnsexc8= golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= diff --git a/pkg/services/ngalert/models/alert_rule.go b/pkg/services/ngalert/models/alert_rule.go index 69219b8b09f..022599a7158 100644 --- a/pkg/services/ngalert/models/alert_rule.go +++ b/pkg/services/ngalert/models/alert_rule.go @@ -294,6 +294,11 @@ type AlertRule struct { IsPaused bool NotificationSettings []NotificationSettings Metadata AlertRuleMetadata + // MissingSeriesEvalsToResolve specifies the number of consecutive evaluation intervals + // required before resolving an alert state (a dimension) when data is missing. + // If nil, alerts resolve after 2 missing evaluation intervals + // (i.e., resolution occurs during the second evaluation where data is absent). + MissingSeriesEvalsToResolve *int } type AlertRuleMetadata struct { @@ -578,6 +583,18 @@ func (alertRule *AlertRule) GetGroupKey() AlertRuleGroupKey { return AlertRuleGroupKey{OrgID: alertRule.OrgID, NamespaceUID: alertRule.NamespaceUID, RuleGroup: alertRule.RuleGroup} } +// GetMissingSeriesEvalsToResolve returns the number of consecutive evaluation intervals +// to wait before resolving an alert rule instance when its data is missing. +// If not configured, it returns the default value (2), which means the alert +// resolves after missing for two evaluation intervals. +func (alertRule *AlertRule) GetMissingSeriesEvalsToResolve() int { + if alertRule.MissingSeriesEvalsToResolve == nil { + return 2 // default value + } + + return *alertRule.MissingSeriesEvalsToResolve +} + // PreSave sets default values and loads the updated model for each alert query. func (alertRule *AlertRule) PreSave(timeNow func() time.Time, userUID *UserUID) error { for i, q := range alertRule.Data { @@ -659,6 +676,10 @@ func validateAlertRuleFields(rule *AlertRule) error { return err } + if rule.MissingSeriesEvalsToResolve != nil && *rule.MissingSeriesEvalsToResolve <= 0 { + return fmt.Errorf("%w: field `missing_series_evals_to_resolve` must be greater than 0", ErrAlertRuleFailedValidation) + } + return nil } @@ -708,25 +729,26 @@ func (alertRule *AlertRule) Copy() *AlertRule { return nil } result := AlertRule{ - ID: alertRule.ID, - GUID: alertRule.GUID, - OrgID: alertRule.OrgID, - Title: alertRule.Title, - Condition: alertRule.Condition, - Updated: alertRule.Updated, - UpdatedBy: alertRule.UpdatedBy, - IntervalSeconds: alertRule.IntervalSeconds, - Version: alertRule.Version, - UID: alertRule.UID, - NamespaceUID: alertRule.NamespaceUID, - RuleGroup: alertRule.RuleGroup, - RuleGroupIndex: alertRule.RuleGroupIndex, - NoDataState: alertRule.NoDataState, - ExecErrState: alertRule.ExecErrState, - For: alertRule.For, - Record: alertRule.Record, - IsPaused: alertRule.IsPaused, - Metadata: alertRule.Metadata, + ID: alertRule.ID, + GUID: alertRule.GUID, + OrgID: alertRule.OrgID, + Title: alertRule.Title, + Condition: alertRule.Condition, + Updated: alertRule.Updated, + UpdatedBy: alertRule.UpdatedBy, + IntervalSeconds: alertRule.IntervalSeconds, + Version: alertRule.Version, + UID: alertRule.UID, + NamespaceUID: alertRule.NamespaceUID, + RuleGroup: alertRule.RuleGroup, + RuleGroupIndex: alertRule.RuleGroupIndex, + NoDataState: alertRule.NoDataState, + ExecErrState: alertRule.ExecErrState, + For: alertRule.For, + Record: alertRule.Record, + IsPaused: alertRule.IsPaused, + Metadata: alertRule.Metadata, + MissingSeriesEvalsToResolve: alertRule.MissingSeriesEvalsToResolve, } if alertRule.DashboardUID != nil { @@ -789,6 +811,7 @@ func ClearRecordingRuleIgnoredFields(rule *AlertRule) { rule.Condition = "" rule.For = 0 rule.NotificationSettings = nil + rule.MissingSeriesEvalsToResolve = nil } // GetAlertRuleByUIDQuery is the query for retrieving/deleting an alert rule by UID and organisation ID. diff --git a/pkg/services/ngalert/models/alert_rule_test.go b/pkg/services/ngalert/models/alert_rule_test.go index b8e47df7bbf..978b6d49127 100644 --- a/pkg/services/ngalert/models/alert_rule_test.go +++ b/pkg/services/ngalert/models/alert_rule_test.go @@ -18,6 +18,7 @@ import ( "golang.org/x/exp/maps" "gopkg.in/yaml.v3" + "github.com/grafana/grafana/pkg/setting" "github.com/grafana/grafana/pkg/util" "github.com/grafana/grafana/pkg/util/cmputil" ) @@ -386,6 +387,7 @@ func TestPatchPartialAlertRule(t *testing.T) { }) } +// nolint:gocyclo func TestDiff(t *testing.T) { t.Run("should return nil if there is no diff", func(t *testing.T) { rule1 := RuleGen.GenerateRef() @@ -406,7 +408,9 @@ func TestDiff(t *testing.T) { t.Run("should find diff in simple fields", func(t *testing.T) { rule1 := RuleGen.GenerateRef() - rule2 := RuleGen.GenerateRef() + rule2 := RuleGen.With( + RuleGen.WithMissingSeriesEvalsToResolve(*rule1.MissingSeriesEvalsToResolve + 1), + ).GenerateRef() diffs := rule1.Diff(rule2, "Data", "Annotations", "Labels", "NotificationSettings", "Metadata") // these fields will be tested separately @@ -540,6 +544,13 @@ func TestDiff(t *testing.T) { assert.Equal(t, rule2.Record, diff[0].Right.String()) difCnt++ } + if rule1.MissingSeriesEvalsToResolve != rule2.MissingSeriesEvalsToResolve { + diff := diffs.GetDiffsForField("MissingSeriesEvalsToResolve") + assert.Len(t, diff, 1) + assert.Equal(t, *rule1.MissingSeriesEvalsToResolve, int(diff[0].Left.Int())) + assert.Equal(t, *rule2.MissingSeriesEvalsToResolve, int(diff[0].Right.Int())) + difCnt++ + } require.Lenf(t, diffs, difCnt, "Got some unexpected diffs. Either add to ignore or add assert to it") @@ -963,6 +974,21 @@ func TestAlertRuleGetKeyWithGroup(t *testing.T) { }) } +func TestAlertRuleGetMissingSeriesEvalsToResolve(t *testing.T) { + t.Run("should return the default 2 if MissingSeriesEvalsToResolve is nil", func(t *testing.T) { + rule := RuleGen.GenerateRef() + rule.MissingSeriesEvalsToResolve = nil + require.Equal(t, 2, rule.GetMissingSeriesEvalsToResolve()) + }) + + t.Run("should return the correct value", func(t *testing.T) { + rule := RuleGen.With( + RuleMuts.WithMissingSeriesEvalsToResolve(3), + ).GenerateRef() + require.Equal(t, 3, rule.GetMissingSeriesEvalsToResolve()) + }) +} + func TestAlertRuleCopy(t *testing.T) { t.Run("should return a copy of the rule", func(t *testing.T) { for i := 0; i < 100; i++ { @@ -1084,3 +1110,54 @@ func TestAlertRule_PrometheusRuleDefinition(t *testing.T) { }) } } + +func TestMissingSeriesEvalsToResolveValidation(t *testing.T) { + testCases := []struct { + name string + missingSeriesEvalsToResolve *int + expectedErrorContains string + }{ + { + name: "should allow nil value", + missingSeriesEvalsToResolve: nil, + }, + { + name: "should reject negative value", + missingSeriesEvalsToResolve: util.Pointer(-1), + expectedErrorContains: "field `missing_series_evals_to_resolve` must be greater than 0", + }, + { + name: "should reject 0", + missingSeriesEvalsToResolve: util.Pointer(0), + expectedErrorContains: "field `missing_series_evals_to_resolve` must be greater than 0", + }, + { + name: "should accept positive value", + missingSeriesEvalsToResolve: util.Pointer(2), + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + baseIntervalSeconds := int64(10) + cfg := setting.UnifiedAlertingSettings{ + BaseInterval: time.Duration(baseIntervalSeconds) * time.Second, + } + + rule := RuleGen.With( + RuleMuts.WithIntervalSeconds(baseIntervalSeconds * 2), + ).Generate() + rule.MissingSeriesEvalsToResolve = tc.missingSeriesEvalsToResolve + + err := rule.ValidateAlertRule(cfg) + + if tc.expectedErrorContains != "" { + require.Error(t, err) + require.ErrorIs(t, err, ErrAlertRuleFailedValidation) + require.Contains(t, err.Error(), tc.expectedErrorContains) + } else { + require.NoError(t, err) + } + }) + } +} diff --git a/pkg/services/ngalert/models/testing.go b/pkg/services/ngalert/models/testing.go index eb9d838f8df..242aafc0025 100644 --- a/pkg/services/ngalert/models/testing.go +++ b/pkg/services/ngalert/models/testing.go @@ -103,29 +103,30 @@ func (g *AlertRuleGenerator) Generate() AlertRule { } rule := AlertRule{ - ID: 0, - GUID: uuid.NewString(), - OrgID: rand.Int63n(1500) + 1, // Prevent OrgID=0 as this does not pass alert rule validation. - Title: fmt.Sprintf("title-%s", util.GenerateShortUID()), - Condition: "A", - Data: []AlertQuery{g.GenerateQuery()}, - Updated: time.Now().Add(-time.Duration(rand.Intn(100) + 1)), - UpdatedBy: updatedBy, - IntervalSeconds: rand.Int63n(60) + 1, - Version: rand.Int63n(1500), // Don't generate a rule ID too big for postgres - UID: util.GenerateShortUID(), - NamespaceUID: util.GenerateShortUID(), - DashboardUID: dashUID, - PanelID: panelID, - RuleGroup: fmt.Sprintf("group-%s,", util.GenerateShortUID()), - RuleGroupIndex: rand.Intn(1500), - NoDataState: randNoDataState(), - ExecErrState: randErrState(), - For: forInterval, - Annotations: annotations, - Labels: labels, - NotificationSettings: ns, - Metadata: GenerateMetadata(), + ID: 0, + GUID: uuid.NewString(), + OrgID: rand.Int63n(1500) + 1, // Prevent OrgID=0 as this does not pass alert rule validation. + Title: fmt.Sprintf("title-%s", util.GenerateShortUID()), + Condition: "A", + Data: []AlertQuery{g.GenerateQuery()}, + Updated: time.Now().Add(-time.Duration(rand.Intn(100) + 1)), + UpdatedBy: updatedBy, + IntervalSeconds: rand.Int63n(60) + 1, + Version: rand.Int63n(1500), // Don't generate a rule ID too big for postgres + UID: util.GenerateShortUID(), + NamespaceUID: util.GenerateShortUID(), + DashboardUID: dashUID, + PanelID: panelID, + RuleGroup: fmt.Sprintf("group-%s,", util.GenerateShortUID()), + RuleGroupIndex: rand.Intn(1500), + NoDataState: randNoDataState(), + ExecErrState: randErrState(), + For: forInterval, + Annotations: annotations, + Labels: labels, + NotificationSettings: ns, + Metadata: GenerateMetadata(), + MissingSeriesEvalsToResolve: util.Pointer(2), } for _, mutator := range g.mutators { @@ -499,6 +500,15 @@ func (a *AlertRuleMutators) WithSameGroup() AlertRuleMutator { } } +func (a *AlertRuleMutators) WithMissingSeriesEvalsToResolve(timesOfInterval int) AlertRuleMutator { + return func(rule *AlertRule) { + if timesOfInterval <= 0 { + panic("timesOfInterval must be greater than 0") + } + rule.MissingSeriesEvalsToResolve = util.Pointer(timesOfInterval) + } +} + func (a *AlertRuleMutators) WithNotificationSettingsGen(ns func() NotificationSettings) AlertRuleMutator { return func(rule *AlertRule) { rule.NotificationSettings = []NotificationSettings{ns()} @@ -1343,6 +1353,7 @@ func ConvertToRecordingRule(rule *AlertRule) { rule.ExecErrState = "" rule.For = 0 rule.NotificationSettings = nil + rule.MissingSeriesEvalsToResolve = nil } func nameToUid(name string) string { // Avoid legacy_storage.NameToUid import cycle. diff --git a/pkg/services/ngalert/schedule/registry_test.go b/pkg/services/ngalert/schedule/registry_test.go index 2a52f67b264..01acc80fa41 100644 --- a/pkg/services/ngalert/schedule/registry_test.go +++ b/pkg/services/ngalert/schedule/registry_test.go @@ -12,6 +12,7 @@ import ( "github.com/stretchr/testify/require" "github.com/grafana/grafana/pkg/services/ngalert/models" + "github.com/grafana/grafana/pkg/util" ) func TestSchedulableAlertRulesRegistry(t *testing.T) { @@ -211,6 +212,7 @@ func TestRuleWithFolderFingerprint(t *testing.T) { SimplifiedNotificationsSection: false, }, }, + MissingSeriesEvalsToResolve: util.Pointer(2), } r2 := &models.AlertRule{ ID: 2, @@ -255,6 +257,7 @@ func TestRuleWithFolderFingerprint(t *testing.T) { SimplifiedQueryAndExpressionsSection: true, }, }, + MissingSeriesEvalsToResolve: util.Pointer(1), } excludedFields := map[string]struct{}{ diff --git a/pkg/services/ngalert/state/manager.go b/pkg/services/ngalert/state/manager.go index f66002bbb3b..e23d3bce143 100644 --- a/pkg/services/ngalert/state/manager.go +++ b/pkg/services/ngalert/state/manager.go @@ -519,7 +519,7 @@ func (st *Manager) deleteStaleStatesFromCache(logger log.Logger, evaluatedAt tim // If we are removing two or more stale series it makes sense to share the resolved image as the alert rule is the same. // TODO: We will need to change this when we support images without screenshots as each series will have a different image staleStates := st.cache.deleteRuleStates(alertRule.GetKey(), func(s *State) bool { - return stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds) + return stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds, alertRule.GetMissingSeriesEvalsToResolve()) }) resolvedStates := make([]StateTransition, 0, len(staleStates)) @@ -551,8 +551,18 @@ func (st *Manager) deleteStaleStatesFromCache(logger log.Logger, evaluatedAt tim return resolvedStates } -func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool { - return !lastEval.Add(2 * time.Duration(intervalSeconds) * time.Second).After(evaluatedAt) +// stateIsStale determines whether the evaluation state is considered stale. +// A state is considered stale if the data has been missing for at least missingSeriesEvalsToResolve evaluation intervals. +func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64, missingSeriesEvalsToResolve int) bool { + // If the last evaluation time equals the current evaluation time, the state is not stale. + if evaluatedAt.Equal(lastEval) { + return false + } + + resolveIfMissingDuration := time.Duration(int64(missingSeriesEvalsToResolve)*intervalSeconds) * time.Second + + // timeSinceLastEval >= resolveIfMissingDuration + return evaluatedAt.Sub(lastEval) >= resolveIfMissingDuration } func StatesToRuleStatus(states []*State) ngModels.RuleStatus { diff --git a/pkg/services/ngalert/state/manager_private_test.go b/pkg/services/ngalert/state/manager_private_test.go index 16de5cd57ea..42fcddf41a3 100644 --- a/pkg/services/ngalert/state/manager_private_test.go +++ b/pkg/services/ngalert/state/manager_private_test.go @@ -31,40 +31,81 @@ func TestStateIsStale(t *testing.T) { now := time.Now() intervalSeconds := rand.Int63n(10) + 5 + threeIntervals := time.Duration(intervalSeconds) * time.Second * 3 + fourIntervals := time.Duration(intervalSeconds) * time.Second * 4 + fiveIntervals := time.Duration(intervalSeconds) * time.Second * 5 + testCases := []struct { - name string - lastEvaluation time.Time - expectedResult bool + name string + lastEvaluation time.Time + expectedResult bool + missingSeriesEvalsToResolve int }{ { - name: "false if last evaluation is now", - lastEvaluation: now, - expectedResult: false, + name: "false if last evaluation is now", + lastEvaluation: now, + missingSeriesEvalsToResolve: 2, + expectedResult: false, + }, + { + name: "false if last evaluation is 1 interval before now", + lastEvaluation: now.Add(-time.Duration(intervalSeconds)), + missingSeriesEvalsToResolve: 2, + expectedResult: false, + }, + { + name: "false if last evaluation is little less than 2 interval before now", + lastEvaluation: now.Add(-time.Duration(intervalSeconds) * time.Second * 2).Add(100 * time.Millisecond), + missingSeriesEvalsToResolve: 2, + expectedResult: false, + }, + { + name: "true if last evaluation is 2 intervals from now", + lastEvaluation: now.Add(-time.Duration(intervalSeconds) * time.Second * 2), + missingSeriesEvalsToResolve: 2, + expectedResult: true, + }, + { + name: "true if last evaluation is 3 intervals from now", + lastEvaluation: now.Add(-time.Duration(intervalSeconds) * time.Second * 3), + missingSeriesEvalsToResolve: 2, + expectedResult: true, }, { - name: "false if last evaluation is 1 interval before now", - lastEvaluation: now.Add(-time.Duration(intervalSeconds)), - expectedResult: false, + name: "false if last evaluation is within custom resolve after missing for", + lastEvaluation: now.Add(-threeIntervals), + missingSeriesEvalsToResolve: 4, + expectedResult: false, }, { - name: "false if last evaluation is little less than 2 interval before now", - lastEvaluation: now.Add(-time.Duration(intervalSeconds) * time.Second * 2).Add(100 * time.Millisecond), - expectedResult: false, + name: "true if last evaluation equals custom resolve after missing for", + lastEvaluation: now.Add(-fourIntervals), + missingSeriesEvalsToResolve: 4, + expectedResult: true, }, { - name: "true if last evaluation is 2 intervals from now", - lastEvaluation: now.Add(-time.Duration(intervalSeconds) * time.Second * 2), - expectedResult: true, + name: "true if last evaluation exceeds custom resolve after missing for", + lastEvaluation: now.Add(-fiveIntervals), + missingSeriesEvalsToResolve: 4, + expectedResult: true, }, { - name: "true if last evaluation is 3 intervals from now", - lastEvaluation: now.Add(-time.Duration(intervalSeconds) * time.Second * 3), - expectedResult: true, + name: "when missingSeriesEvalsToResolve is 1 and the state is just created", + lastEvaluation: now, + missingSeriesEvalsToResolve: 1, + expectedResult: false, + }, + { + name: "when missingSeriesEvalsToResolve is 1 and the state is created in the past", + lastEvaluation: now.Add(-time.Duration(intervalSeconds) * time.Second * 1), + missingSeriesEvalsToResolve: 1, + expectedResult: true, }, } + for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - require.Equal(t, tc.expectedResult, stateIsStale(now, tc.lastEvaluation, intervalSeconds)) + require.Equal(t, tc.expectedResult, stateIsStale(now, tc.lastEvaluation, intervalSeconds, tc.missingSeriesEvalsToResolve)) }) } } @@ -115,6 +156,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) { t1 := tN(1) t2 := tN(2) t3 := tN(3) + t4 := tN(4) baseRule := &ngmodels.AlertRule{ OrgID: 1, @@ -738,6 +780,308 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) { }, }, }, + { + desc: "t1[1:alerting] t2[NoData] t3[NoData] at t2,t3", + alertRule: baseRule, + results: map[time.Time]eval.Results{ + t1: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + t2: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + t3: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + }, + expectedTransitions: map[time.Time][]StateTransition{ + t1: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Alerting, + LatestResult: newEvaluation(t1, eval.Alerting), + StartsAt: t1, + EndsAt: t1.Add(ResendDelay * 4), + LastEvaluationTime: t1, + LastSentAt: &t1, + }, + }, + }, + t2: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule + no-data"], + State: eval.NoData, + LatestResult: newEvaluation(t2, eval.NoData), + StartsAt: t2, + EndsAt: t2.Add(ResendDelay * 4), + LastEvaluationTime: t2, + LastSentAt: &t2, + }, + }, + }, + t3: { + { + PreviousState: eval.NoData, + State: &State{ + Labels: labels["system + rule + no-data"], + State: eval.NoData, + LatestResult: newEvaluation(t3, eval.NoData), + StartsAt: t2, + EndsAt: t3.Add(ResendDelay * 4), + LastSentAt: &t2, + LastEvaluationTime: t3, + }, + }, + // This is the transition of the alerting state from t1 to Normal + // after 2 evaluations as it became stale. + { + PreviousState: eval.Alerting, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Normal, + StateReason: ngmodels.StateReasonMissingSeries, + LatestResult: newEvaluation(t1, eval.Alerting), + StartsAt: t1, + EndsAt: t3, + LastEvaluationTime: t3, + ResolvedAt: &t3, + LastSentAt: &t3, + }, + }, + }, + }, + }, + { + desc: "t1[1:alerting] t2[NoData] t3[NoData] t4[NoData] with missing_series_evals_to_resolve=3 at t3,t4", + alertRule: baseRuleWith(ngmodels.RuleMuts.WithMissingSeriesEvalsToResolve(3)), + results: map[time.Time]eval.Results{ + t1: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + t2: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + t3: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + t4: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + }, + expectedTransitions: map[time.Time][]StateTransition{ + t3: { + { + PreviousState: eval.NoData, + State: &State{ + Labels: labels["system + rule + no-data"], + State: eval.NoData, + LatestResult: newEvaluation(t3, eval.NoData), + StartsAt: t2, + EndsAt: t3.Add(ResendDelay * 4), + LastEvaluationTime: t3, + LastSentAt: &t2, + }, + }, + }, + t4: { + { + PreviousState: eval.NoData, + State: &State{ + Labels: labels["system + rule + no-data"], + State: eval.NoData, + LatestResult: newEvaluation(t4, eval.NoData), + StartsAt: t2, + EndsAt: t4.Add(ResendDelay * 4), + LastSentAt: &t2, + LastEvaluationTime: t4, + }, + }, + // This is the transition of the alerting state from t1 to Normal + // after 3 evaluations as it became stale. + { + PreviousState: eval.Alerting, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Normal, + StateReason: ngmodels.StateReasonMissingSeries, + LatestResult: newEvaluation(t1, eval.Alerting), + StartsAt: t1, + EndsAt: t4, + LastEvaluationTime: t4, + ResolvedAt: &t4, + LastSentAt: &t4, + }, + }, + }, + }, + }, + { + desc: "t1[1:alerting] t2[NoData] t3[NoData] with missing_series_evals_to_resolve=1 at t2,t3", + alertRule: baseRuleWith(ngmodels.RuleMuts.WithMissingSeriesEvalsToResolve(1)), + results: map[time.Time]eval.Results{ + t1: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + }, + t2: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + t3: { + newResult(eval.WithState(eval.NoData), eval.WithLabels(noDataLabels)), + }, + }, + expectedTransitions: map[time.Time][]StateTransition{ + t1: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Alerting, + LatestResult: newEvaluation(t1, eval.Alerting), + StartsAt: t1, + EndsAt: t1.Add(ResendDelay * 4), + LastEvaluationTime: t1, + LastSentAt: &t1, + }, + }, + }, + t2: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule + no-data"], + State: eval.NoData, + LatestResult: newEvaluation(t2, eval.NoData), + StartsAt: t2, + EndsAt: t2.Add(ResendDelay * 4), + LastEvaluationTime: t2, + LastSentAt: &t2, + }, + }, + // This is the transition of the alerting state from t1 to Normal + // after 2 evaluations as it became stale. + { + PreviousState: eval.Alerting, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Normal, + StateReason: ngmodels.StateReasonMissingSeries, + LatestResult: newEvaluation(t1, eval.Alerting), + StartsAt: t1, + EndsAt: t2, + LastEvaluationTime: t2, + ResolvedAt: &t2, + LastSentAt: &t2, + }, + }, + }, + t3: { + { + PreviousState: eval.NoData, + State: &State{ + Labels: labels["system + rule + no-data"], + State: eval.NoData, + LatestResult: newEvaluation(t3, eval.NoData), + StartsAt: t2, + EndsAt: t3.Add(ResendDelay * 4), + LastSentAt: &t2, + LastEvaluationTime: t3, + }, + }, + }, + }, + }, + { + desc: "t1[1:alerting,2:alerting] t2[1:alerting] t3[1:alerting] with missing_series_evals_to_resolve=1 at t2,t3", + alertRule: baseRuleWith(ngmodels.RuleMuts.WithMissingSeriesEvalsToResolve(1)), + results: map[time.Time]eval.Results{ + t1: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels2)), + }, + t2: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels2)), + }, + t3: { + newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels2)), + }, + }, + expectedTransitions: map[time.Time][]StateTransition{ + t1: { + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Alerting, + LatestResult: newEvaluation(t1, eval.Alerting), + StartsAt: t1, + EndsAt: t1.Add(ResendDelay * 4), + LastEvaluationTime: t1, + LastSentAt: &t1, + }, + }, + { + PreviousState: eval.Normal, + State: &State{ + Labels: labels["system + rule + labels2"], + State: eval.Alerting, + LatestResult: newEvaluation(t1, eval.Alerting), + StartsAt: t1, + EndsAt: t1.Add(ResendDelay * 4), + LastEvaluationTime: t1, + LastSentAt: &t1, + }, + }, + }, + t2: { + { + PreviousState: eval.Alerting, + State: &State{ + Labels: labels["system + rule + labels2"], + State: eval.Alerting, + LatestResult: newEvaluation(t2, eval.Alerting), + StartsAt: t1, + EndsAt: t2.Add(ResendDelay * 4), + LastEvaluationTime: t2, + LastSentAt: &t1, + }, + }, + // This is the transition of the alerting state from t1 to Normal + // after 2 evaluations as it became stale. + { + PreviousState: eval.Alerting, + State: &State{ + Labels: labels["system + rule + labels1"], + State: eval.Normal, + StateReason: ngmodels.StateReasonMissingSeries, + LatestResult: newEvaluation(t1, eval.Alerting), + StartsAt: t1, + EndsAt: t2, + LastEvaluationTime: t2, + ResolvedAt: &t2, + LastSentAt: &t2, + }, + }, + }, + t3: { + { + PreviousState: eval.Alerting, + State: &State{ + Labels: labels["system + rule + labels2"], + State: eval.Alerting, + LatestResult: newEvaluation(t3, eval.Alerting), + StartsAt: t1, + EndsAt: t3.Add(ResendDelay * 4), + LastEvaluationTime: t3, + LastSentAt: &t1, + }, + }, + }, + }, + }, { desc: "t1[{}:normal] t2[{}:alerting] at t2", alertRule: baseRule, diff --git a/pkg/services/ngalert/state/manager_test.go b/pkg/services/ngalert/state/manager_test.go index b6b863f40df..1af185871b8 100644 --- a/pkg/services/ngalert/state/manager_test.go +++ b/pkg/services/ngalert/state/manager_test.go @@ -1906,7 +1906,7 @@ func TestStaleResults(t *testing.T) { st := state.NewManager(cfg, state.NewNoopPersister()) gen := models.RuleGen - rule := gen.With(gen.WithFor(0)).GenerateRef() + rule := gen.With(gen.WithFor(0), gen.WithMissingSeriesEvalsToResolve(2)).GenerateRef() initResults := eval.Results{ eval.ResultGen(eval.WithState(eval.Alerting), eval.WithEvaluatedAt(clk.Now()))(), diff --git a/pkg/services/ngalert/store/compat.go b/pkg/services/ngalert/store/compat.go index 8021233ee93..8d489a2b240 100644 --- a/pkg/services/ngalert/store/compat.go +++ b/pkg/services/ngalert/store/compat.go @@ -18,23 +18,24 @@ func alertRuleToModelsAlertRule(ar alertRule, l log.Logger) (models.AlertRule, e } result := models.AlertRule{ - ID: ar.ID, - OrgID: ar.OrgID, - GUID: ar.GUID, - Title: ar.Title, - Condition: ar.Condition, - Data: data, - Updated: ar.Updated, - IntervalSeconds: ar.IntervalSeconds, - Version: ar.Version, - UID: ar.UID, - NamespaceUID: ar.NamespaceUID, - DashboardUID: ar.DashboardUID, - PanelID: ar.PanelID, - RuleGroup: ar.RuleGroup, - RuleGroupIndex: ar.RuleGroupIndex, - For: ar.For, - IsPaused: ar.IsPaused, + ID: ar.ID, + OrgID: ar.OrgID, + GUID: ar.GUID, + Title: ar.Title, + Condition: ar.Condition, + Data: data, + Updated: ar.Updated, + IntervalSeconds: ar.IntervalSeconds, + Version: ar.Version, + UID: ar.UID, + NamespaceUID: ar.NamespaceUID, + DashboardUID: ar.DashboardUID, + PanelID: ar.PanelID, + RuleGroup: ar.RuleGroup, + RuleGroupIndex: ar.RuleGroupIndex, + For: ar.For, + IsPaused: ar.IsPaused, + MissingSeriesEvalsToResolve: ar.MissingSeriesEvalsToResolve, } if ar.UpdatedBy != nil { @@ -107,24 +108,25 @@ func parseNotificationSettings(s string) ([]models.NotificationSettings, error) func alertRuleFromModelsAlertRule(ar models.AlertRule) (alertRule, error) { result := alertRule{ - ID: ar.ID, - GUID: ar.GUID, - OrgID: ar.OrgID, - Title: ar.Title, - Condition: ar.Condition, - Updated: ar.Updated, - IntervalSeconds: ar.IntervalSeconds, - Version: ar.Version, - UID: ar.UID, - NamespaceUID: ar.NamespaceUID, - DashboardUID: ar.DashboardUID, - PanelID: ar.PanelID, - RuleGroup: ar.RuleGroup, - RuleGroupIndex: ar.RuleGroupIndex, - NoDataState: ar.NoDataState.String(), - ExecErrState: ar.ExecErrState.String(), - For: ar.For, - IsPaused: ar.IsPaused, + ID: ar.ID, + GUID: ar.GUID, + OrgID: ar.OrgID, + Title: ar.Title, + Condition: ar.Condition, + Updated: ar.Updated, + IntervalSeconds: ar.IntervalSeconds, + Version: ar.Version, + UID: ar.UID, + NamespaceUID: ar.NamespaceUID, + DashboardUID: ar.DashboardUID, + PanelID: ar.PanelID, + RuleGroup: ar.RuleGroup, + RuleGroupIndex: ar.RuleGroupIndex, + NoDataState: ar.NoDataState.String(), + ExecErrState: ar.ExecErrState.String(), + For: ar.For, + IsPaused: ar.IsPaused, + MissingSeriesEvalsToResolve: ar.MissingSeriesEvalsToResolve, } if ar.UpdatedBy != nil { @@ -181,30 +183,31 @@ func alertRuleFromModelsAlertRule(ar models.AlertRule) (alertRule, error) { func alertRuleToAlertRuleVersion(rule alertRule) alertRuleVersion { return alertRuleVersion{ - RuleOrgID: rule.OrgID, - RuleGUID: rule.GUID, - RuleUID: rule.UID, - RuleNamespaceUID: rule.NamespaceUID, - RuleGroup: rule.RuleGroup, - RuleGroupIndex: rule.RuleGroupIndex, - ParentVersion: 0, - RestoredFrom: 0, - Version: rule.Version, - Created: rule.Updated, // assuming the Updated time as the creation time - CreatedBy: rule.UpdatedBy, - Title: rule.Title, - Condition: rule.Condition, - Data: rule.Data, - IntervalSeconds: rule.IntervalSeconds, - Record: rule.Record, - NoDataState: rule.NoDataState, - ExecErrState: rule.ExecErrState, - For: rule.For, - Annotations: rule.Annotations, - Labels: rule.Labels, - IsPaused: rule.IsPaused, - NotificationSettings: rule.NotificationSettings, - Metadata: rule.Metadata, + RuleOrgID: rule.OrgID, + RuleGUID: rule.GUID, + RuleUID: rule.UID, + RuleNamespaceUID: rule.NamespaceUID, + RuleGroup: rule.RuleGroup, + RuleGroupIndex: rule.RuleGroupIndex, + ParentVersion: 0, + RestoredFrom: 0, + Version: rule.Version, + Created: rule.Updated, // assuming the Updated time as the creation time + CreatedBy: rule.UpdatedBy, + Title: rule.Title, + Condition: rule.Condition, + Data: rule.Data, + IntervalSeconds: rule.IntervalSeconds, + Record: rule.Record, + NoDataState: rule.NoDataState, + ExecErrState: rule.ExecErrState, + For: rule.For, + Annotations: rule.Annotations, + Labels: rule.Labels, + IsPaused: rule.IsPaused, + NotificationSettings: rule.NotificationSettings, + Metadata: rule.Metadata, + MissingSeriesEvalsToResolve: rule.MissingSeriesEvalsToResolve, } } @@ -224,18 +227,19 @@ func alertRuleVersionToAlertRule(version alertRuleVersion) alertRule { NamespaceUID: version.RuleNamespaceUID, // Versions do not store Dashboard\Panel as separate column. // However, these fields are part of annotations and information in these fields is redundant - DashboardUID: nil, - PanelID: nil, - RuleGroup: version.RuleGroup, - RuleGroupIndex: version.RuleGroupIndex, - Record: version.Record, - NoDataState: version.NoDataState, - ExecErrState: version.ExecErrState, - For: version.For, - Annotations: version.Annotations, - Labels: version.Labels, - IsPaused: version.IsPaused, - NotificationSettings: version.NotificationSettings, - Metadata: version.Metadata, + DashboardUID: nil, + PanelID: nil, + RuleGroup: version.RuleGroup, + RuleGroupIndex: version.RuleGroupIndex, + Record: version.Record, + NoDataState: version.NoDataState, + ExecErrState: version.ExecErrState, + For: version.For, + Annotations: version.Annotations, + Labels: version.Labels, + IsPaused: version.IsPaused, + NotificationSettings: version.NotificationSettings, + Metadata: version.Metadata, + MissingSeriesEvalsToResolve: version.MissingSeriesEvalsToResolve, } } diff --git a/pkg/services/ngalert/store/models.go b/pkg/services/ngalert/store/models.go index 8c5d44a9d8e..63cb4e21cd4 100644 --- a/pkg/services/ngalert/store/models.go +++ b/pkg/services/ngalert/store/models.go @@ -4,31 +4,32 @@ import "time" // alertRule represents a record in alert_rule table type alertRule struct { - ID int64 `xorm:"pk autoincr 'id'"` - GUID string `xorm:"guid"` - OrgID int64 `xorm:"org_id"` - Title string - Condition string - Data string - Updated time.Time - UpdatedBy *string `xorm:"updated_by"` - IntervalSeconds int64 - Version int64 `xorm:"version"` // this tag makes xorm add optimistic lock (see https://xorm.io/docs/chapter-06/1.lock/) - UID string `xorm:"uid"` - NamespaceUID string `xorm:"namespace_uid"` - DashboardUID *string `xorm:"dashboard_uid"` - PanelID *int64 `xorm:"panel_id"` - RuleGroup string - RuleGroupIndex int `xorm:"rule_group_idx"` - Record string - NoDataState string - ExecErrState string - For time.Duration - Annotations string - Labels string - IsPaused bool - NotificationSettings string `xorm:"notification_settings"` - Metadata string `xorm:"metadata"` + ID int64 `xorm:"pk autoincr 'id'"` + GUID string `xorm:"guid"` + OrgID int64 `xorm:"org_id"` + Title string + Condition string + Data string + Updated time.Time + UpdatedBy *string `xorm:"updated_by"` + IntervalSeconds int64 + Version int64 `xorm:"version"` // this tag makes xorm add optimistic lock (see https://xorm.io/docs/chapter-06/1.lock/) + UID string `xorm:"uid"` + NamespaceUID string `xorm:"namespace_uid"` + DashboardUID *string `xorm:"dashboard_uid"` + PanelID *int64 `xorm:"panel_id"` + RuleGroup string + RuleGroupIndex int `xorm:"rule_group_idx"` + Record string + NoDataState string + ExecErrState string + For time.Duration + Annotations string + Labels string + IsPaused bool + NotificationSettings string `xorm:"notification_settings"` + Metadata string `xorm:"metadata"` + MissingSeriesEvalsToResolve *int `xorm:"missing_series_evals_to_resolve"` } func (a alertRule) TableName() string { @@ -59,12 +60,13 @@ type alertRuleVersion struct { ExecErrState string // ideally this field should have been apimodels.ApiDuration // but this is currently not possible because of circular dependencies - For time.Duration - Annotations string - Labels string - IsPaused bool - NotificationSettings string `xorm:"notification_settings"` - Metadata string `xorm:"metadata"` + For time.Duration + Annotations string + Labels string + IsPaused bool + NotificationSettings string `xorm:"notification_settings"` + Metadata string `xorm:"metadata"` + MissingSeriesEvalsToResolve *int `xorm:"missing_series_evals_to_resolve"` } // EqualSpec compares two alertRuleVersion objects for equality based on their specifications and returns true if they match. @@ -88,7 +90,8 @@ func (a alertRuleVersion) EqualSpec(b alertRuleVersion) bool { a.Labels == b.Labels && a.IsPaused == b.IsPaused && a.NotificationSettings == b.NotificationSettings && - a.Metadata == b.Metadata + a.Metadata == b.Metadata && + a.MissingSeriesEvalsToResolve == b.MissingSeriesEvalsToResolve } func (a alertRuleVersion) TableName() string { diff --git a/pkg/services/sqlstore/migrations/migrations.go b/pkg/services/sqlstore/migrations/migrations.go index 47cc25c2888..edf822bb0b6 100644 --- a/pkg/services/sqlstore/migrations/migrations.go +++ b/pkg/services/sqlstore/migrations/migrations.go @@ -147,4 +147,6 @@ func (oss *OSSMigrations) AddMigration(mg *Migrator) { ualert.AddAlertRuleStateTable(mg) ualert.AddAlertRuleGuidMigration(mg) + + ualert.AddAlertRuleMissingSeriesEvalsToResolve(mg) } diff --git a/pkg/services/sqlstore/migrations/ualert/alert_rule_missing_series_evals_to_resolve.go b/pkg/services/sqlstore/migrations/ualert/alert_rule_missing_series_evals_to_resolve.go new file mode 100644 index 00000000000..a114a12aaa3 --- /dev/null +++ b/pkg/services/sqlstore/migrations/ualert/alert_rule_missing_series_evals_to_resolve.go @@ -0,0 +1,17 @@ +package ualert + +import "github.com/grafana/grafana/pkg/services/sqlstore/migrator" + +// AddAlertRuleMissingSeriesEvalsToResolve adds missing_series_evals_to_resolve column to alert_rule and alert_rule_version tables. +func AddAlertRuleMissingSeriesEvalsToResolve(mg *migrator.Migrator) { + column := &migrator.Column{Name: "missing_series_evals_to_resolve", Type: migrator.DB_SmallInt, Nullable: true} + + mg.AddMigration( + "add missing_series_evals_to_resolve column to alert_rule", + migrator.NewAddColumnMigration(migrator.Table{Name: "alert_rule"}, column), + ) + mg.AddMigration( + "add missing_series_evals_to_resolve column to alert_rule_version", + migrator.NewAddColumnMigration(migrator.Table{Name: "alert_rule_version"}, column), + ) +}