From 9d57b1c72eb817c5fe1b43ad7bc9706a264b2da0 Mon Sep 17 00:00:00 2001 From: Yuri Tseretyan Date: Fri, 13 Jan 2023 18:29:29 -0500 Subject: [PATCH] Alerting: Do not persist noop transition from Normal state. (#61201) * add feature flag `alertingNoNormalState` * update instance database to support exclusion of state in list operation * do not save normal state and delete transitions to normal * update get methods to filter out normal state --- .../feature-toggles/index.md | 1 + .../src/types/featureToggles.gen.ts | 1 + pkg/services/featuremgmt/registry.go | 6 ++ pkg/services/featuremgmt/toggles_gen.go | 4 + pkg/services/ngalert/models/instance.go | 6 +- pkg/services/ngalert/ngalert.go | 13 +-- pkg/services/ngalert/state/cache.go | 10 ++- pkg/services/ngalert/state/manager.go | 40 ++++++--- .../ngalert/state/manager_private_test.go | 82 +++++++++++++++++++ pkg/services/ngalert/state/state.go | 5 ++ .../ngalert/store/instance_database.go | 10 +-- .../ngalert/store/instance_database_test.go | 65 +++++++++++++-- pkg/services/ngalert/tests/util.go | 14 +--- pkg/services/quota/quotaimpl/quota_test.go | 3 +- 14 files changed, 206 insertions(+), 54 deletions(-) diff --git a/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md b/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md index cbb3ed87204..e204c799f47 100644 --- a/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md +++ b/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md @@ -45,6 +45,7 @@ Some stable features are enabled by default. You can disable a stable feature by | `autoMigrateGraphPanels` | Replace the angular graph panel with timeseries | | `datasourceLogger` | Logs all datasource requests | | `accessControlOnCall` | Access control primitives for OnCall | +| `alertingNoNormalState` | Stop maintaining state of alerts that are not firing | ## Alpha feature toggles diff --git a/packages/grafana-data/src/types/featureToggles.gen.ts b/packages/grafana-data/src/types/featureToggles.gen.ts index 2e91c7d3fa4..829f798f5b9 100644 --- a/packages/grafana-data/src/types/featureToggles.gen.ts +++ b/packages/grafana-data/src/types/featureToggles.gen.ts @@ -88,4 +88,5 @@ export interface FeatureToggles { sessionRemoteCache?: boolean; disablePrometheusExemplarSampling?: boolean; alertingBacktesting?: boolean; + alertingNoNormalState?: boolean; } diff --git a/pkg/services/featuremgmt/registry.go b/pkg/services/featuremgmt/registry.go index 8214e24ad9f..5dd409da495 100644 --- a/pkg/services/featuremgmt/registry.go +++ b/pkg/services/featuremgmt/registry.go @@ -403,5 +403,11 @@ var ( Description: "Rule backtesting API for alerting", State: FeatureStateAlpha, }, + { + Name: "alertingNoNormalState", + Description: "Stop maintaining state of alerts that are not firing", + State: FeatureStateBeta, + RequiresRestart: false, + }, } ) diff --git a/pkg/services/featuremgmt/toggles_gen.go b/pkg/services/featuremgmt/toggles_gen.go index 8ab5ddce906..1ec1a331b04 100644 --- a/pkg/services/featuremgmt/toggles_gen.go +++ b/pkg/services/featuremgmt/toggles_gen.go @@ -294,4 +294,8 @@ const ( // FlagAlertingBacktesting // Rule backtesting API for alerting FlagAlertingBacktesting = "alertingBacktesting" + + // FlagAlertingNoNormalState + // Stop maintaining state of alerts that are not firing + FlagAlertingNoNormalState = "alertingNoNormalState" ) diff --git a/pkg/services/ngalert/models/instance.go b/pkg/services/ngalert/models/instance.go index 3637cb83d00..7a62fcb1fce 100644 --- a/pkg/services/ngalert/models/instance.go +++ b/pkg/services/ngalert/models/instance.go @@ -50,10 +50,8 @@ func (i InstanceStateType) IsValid() bool { // ListAlertInstancesQuery is the query list alert Instances. type ListAlertInstancesQuery struct { - RuleOrgID int64 `json:"-"` - RuleUID string - State InstanceStateType - StateReason string + RuleUID string + RuleOrgID int64 `json:"-"` Result []*AlertInstance } diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index beb3722bba8..993b35bd2da 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -210,12 +210,13 @@ func (ng *AlertNG) init() error { return err } cfg := state.ManagerCfg{ - Metrics: ng.Metrics.GetStateMetrics(), - ExternalURL: appUrl, - InstanceStore: store, - Images: ng.imageService, - Clock: clk, - Historian: history, + Metrics: ng.Metrics.GetStateMetrics(), + ExternalURL: appUrl, + InstanceStore: store, + Images: ng.imageService, + Clock: clk, + Historian: history, + DoNotSaveNormalState: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState), } stateManager := state.NewManager(cfg) scheduler := schedule.NewScheduler(schedCfg, stateManager) diff --git a/pkg/services/ngalert/state/cache.go b/pkg/services/ngalert/state/cache.go index 7cee0e1a30d..97531420c58 100644 --- a/pkg/services/ngalert/state/cache.go +++ b/pkg/services/ngalert/state/cache.go @@ -209,19 +209,22 @@ func (c *cache) get(orgID int64, alertRuleUID, stateId string) *State { return nil } -func (c *cache) getAll(orgID int64) []*State { +func (c *cache) getAll(orgID int64, skipNormalState bool) []*State { var states []*State c.mtxStates.RLock() defer c.mtxStates.RUnlock() for _, v1 := range c.states[orgID] { for _, v2 := range v1.states { + if skipNormalState && IsNormalStateWithNoReason(v2) { + continue + } states = append(states, v2) } } return states } -func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string) []*State { +func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string, skipNormalState bool) []*State { c.mtxStates.RLock() defer c.mtxStates.RUnlock() orgRules, ok := c.states[orgID] @@ -234,6 +237,9 @@ func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string) []*State { } result := make([]*State, 0, len(rs.states)) for _, state := range rs.states { + if skipNormalState && IsNormalStateWithNoReason(state) { + continue + } result = append(result, state) } return result diff --git a/pkg/services/ngalert/state/manager.go b/pkg/services/ngalert/state/manager.go index fd31375de7b..d937a7328a5 100644 --- a/pkg/services/ngalert/state/manager.go +++ b/pkg/services/ngalert/state/manager.go @@ -37,6 +37,8 @@ type Manager struct { images ImageCapturer historian Historian externalURL *url.URL + + doNotSaveNormalState bool } type ManagerCfg struct { @@ -46,19 +48,22 @@ type ManagerCfg struct { Images ImageCapturer Clock clock.Clock Historian Historian + // DoNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods + DoNotSaveNormalState bool } func NewManager(cfg ManagerCfg) *Manager { return &Manager{ - cache: newCache(), - ResendDelay: ResendDelay, // TODO: make this configurable - log: log.New("ngalert.state.manager"), - metrics: cfg.Metrics, - instanceStore: cfg.InstanceStore, - images: cfg.Images, - historian: cfg.Historian, - clock: cfg.Clock, - externalURL: cfg.ExternalURL, + cache: newCache(), + ResendDelay: ResendDelay, // TODO: make this configurable + log: log.New("ngalert.state.manager"), + metrics: cfg.Metrics, + instanceStore: cfg.InstanceStore, + images: cfg.Images, + historian: cfg.Historian, + clock: cfg.Clock, + externalURL: cfg.ExternalURL, + doNotSaveNormalState: cfg.DoNotSaveNormalState, } } @@ -271,11 +276,11 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu } func (st *Manager) GetAll(orgID int64) []*State { - return st.cache.getAll(orgID) + allStates := st.cache.getAll(orgID, st.doNotSaveNormalState) + return allStates } - func (st *Manager) GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State { - return st.cache.getStatesForRuleUID(orgID, alertRuleUID) + return st.cache.getStatesForRuleUID(orgID, alertRuleUID, st.doNotSaveNormalState) } func (st *Manager) Put(states []*State) { @@ -294,9 +299,14 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state instances := make([]ngModels.AlertInstance, 0, len(states)) for _, s := range states { + // Do not save normal state to database and remove transition to Normal state but keep mapped states + if st.doNotSaveNormalState && IsNormalStateWithNoReason(s.State) && !s.Changed() { + continue + } + key, err := s.GetAlertInstanceKey() if err != nil { - logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err) + logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err, "labels", s.Labels.String()) continue } fields := ngModels.AlertInstance{ @@ -311,6 +321,10 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state instances = append(instances, fields) } + if len(instances) == 0 { + return + } + if err := st.instanceStore.SaveAlertInstances(ctx, instances...); err != nil { type debugInfo struct { State string diff --git a/pkg/services/ngalert/state/manager_private_test.go b/pkg/services/ngalert/state/manager_private_test.go index a23d7fe36b9..44cc72043b5 100644 --- a/pkg/services/ngalert/state/manager_private_test.go +++ b/pkg/services/ngalert/state/manager_private_test.go @@ -7,9 +7,13 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/grafana/grafana/pkg/infra/log/logtest" + "github.com/grafana/grafana/pkg/services/ngalert/eval" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" + "github.com/grafana/grafana/pkg/util" ) // Not for parallel tests. @@ -65,3 +69,81 @@ func TestStateIsStale(t *testing.T) { }) } } + +func TestManager_saveAlertStates(t *testing.T) { + type stateWithReason struct { + State eval.State + Reason string + } + create := func(s eval.State, r string) stateWithReason { + return stateWithReason{ + State: s, + Reason: r, + } + } + allStates := [...]stateWithReason{ + create(eval.Normal, ""), + create(eval.Normal, eval.NoData.String()), + create(eval.Normal, eval.Error.String()), + create(eval.Normal, util.GenerateShortUID()), + create(eval.Alerting, ""), + create(eval.Pending, ""), + create(eval.NoData, ""), + create(eval.Error, ""), + } + + transitionToKey := map[ngmodels.AlertInstanceKey]StateTransition{} + transitions := make([]StateTransition, 0) + for _, fromState := range allStates { + for i, toState := range allStates { + tr := StateTransition{ + State: &State{ + State: toState.State, + StateReason: toState.Reason, + Labels: ngmodels.GenerateAlertLabels(5, fmt.Sprintf("%d--", i)), + }, + PreviousState: fromState.State, + PreviousStateReason: fromState.Reason, + } + key, err := tr.GetAlertInstanceKey() + require.NoError(t, err) + transitionToKey[key] = tr + transitions = append(transitions, tr) + } + } + + t.Run("should save all transitions if doNotSaveNormalState is false", func(t *testing.T) { + st := &FakeInstanceStore{} + m := Manager{instanceStore: st, doNotSaveNormalState: false} + m.saveAlertStates(context.Background(), &logtest.Fake{}, transitions...) + + savedKeys := map[ngmodels.AlertInstanceKey]ngmodels.AlertInstance{} + for _, op := range st.RecordedOps { + saved := op.(ngmodels.AlertInstance) + savedKeys[saved.AlertInstanceKey] = saved + } + assert.Len(t, transitionToKey, len(savedKeys)) + + for key, tr := range transitionToKey { + assert.Containsf(t, savedKeys, key, "state %s (%s) was not saved but should be", tr.State.State, tr.StateReason) + } + }) + + t.Run("should not save Normal->Normal if doNotSaveNormalState is true", func(t *testing.T) { + st := &FakeInstanceStore{} + m := Manager{instanceStore: st, doNotSaveNormalState: true} + m.saveAlertStates(context.Background(), &logtest.Fake{}, transitions...) + + savedKeys := map[ngmodels.AlertInstanceKey]ngmodels.AlertInstance{} + for _, op := range st.RecordedOps { + saved := op.(ngmodels.AlertInstance) + savedKeys[saved.AlertInstanceKey] = saved + } + for key, tr := range transitionToKey { + if tr.State.State == eval.Normal && tr.StateReason == "" && tr.PreviousState == eval.Normal && tr.PreviousStateReason == "" { + continue + } + assert.Containsf(t, savedKeys, key, "state %s (%s) was not saved but should be", tr.State.State, tr.StateReason) + } + }) +} diff --git a/pkg/services/ngalert/state/state.go b/pkg/services/ngalert/state/state.go index 03e33036d2b..8468fae25db 100644 --- a/pkg/services/ngalert/state/state.go +++ b/pkg/services/ngalert/state/state.go @@ -140,6 +140,11 @@ func (a *State) Maintain(interval int64, evaluatedAt time.Time) { a.EndsAt = nextEndsTime(interval, evaluatedAt) } +// IsNormalStateWithNoReason returns true if the state is Normal and reason is empty +func IsNormalStateWithNoReason(s *State) bool { + return s.State == eval.Normal && s.StateReason == "" +} + // StateTransition describes the transition from one state to another. type StateTransition struct { *State diff --git a/pkg/services/ngalert/store/instance_database.go b/pkg/services/ngalert/store/instance_database.go index 4723a323683..bb3df4259ea 100644 --- a/pkg/services/ngalert/store/instance_database.go +++ b/pkg/services/ngalert/store/instance_database.go @@ -30,15 +30,9 @@ func (st DBstore) ListAlertInstances(ctx context.Context, cmd *models.ListAlertI if cmd.RuleUID != "" { addToQuery(` AND rule_uid = ?`, cmd.RuleUID) } - - if cmd.State != "" { - addToQuery(` AND current_state = ?`, cmd.State) - } - - if cmd.StateReason != "" { - addToQuery(` AND current_reason = ?`, cmd.StateReason) + if st.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState) { + s.WriteString(fmt.Sprintf(" AND NOT (current_state = '%s' AND current_reason = '')", models.InstanceStateNormal)) } - if err := sess.SQL(s.String(), params...).Find(&alertInstances); err != nil { return err } diff --git a/pkg/services/ngalert/store/instance_database_test.go b/pkg/services/ngalert/store/instance_database_test.go index 80b9358d682..2fcc287f2c9 100644 --- a/pkg/services/ngalert/store/instance_database_test.go +++ b/pkg/services/ngalert/store/instance_database_test.go @@ -7,8 +7,10 @@ import ( "github.com/stretchr/testify/require" + "github.com/grafana/grafana/pkg/services/featuremgmt" "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/tests" + "github.com/grafana/grafana/pkg/util" ) const baseIntervalSeconds = 10 @@ -17,7 +19,7 @@ func BenchmarkAlertInstanceOperations(b *testing.B) { b.StopTimer() ctx := context.Background() _, dbstore := tests.SetupTestEnv(b, baseIntervalSeconds) - dbstore.FeatureToggles.(*tests.FakeFeatures).BigTransactions = false + dbstore.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertingBigTransactions) const mainOrgID int64 = 1 @@ -86,7 +88,8 @@ func TestIntegrationAlertInstanceBulkWrite(t *testing.T) { } for _, bigStmts := range []bool{false, true} { - dbstore.FeatureToggles.(*tests.FakeFeatures).BigTransactions = bigStmts + dbstore.FeatureToggles = featuremgmt.WithFeatures([]interface{}{featuremgmt.FlagAlertingBigTransactions, bigStmts}) + t.Log("Saving") err := dbstore.SaveAlertInstances(ctx, instances...) require.NoError(t, err) t.Log("Finished database write") @@ -126,6 +129,16 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { const mainOrgID int64 = 1 + containsHash := func(t *testing.T, instances []*models.AlertInstance, hash string) { + t.Helper() + for _, i := range instances { + if i.LabelsHash == hash { + return + } + } + require.Fail(t, "%v does not contain an instance with hash %s", instances, hash) + } + alertRule1 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID) orgID := alertRule1.OrgID @@ -249,16 +262,56 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { require.Len(t, listQuery.Result, 4) }) - t.Run("can list all added instances in org filtered by current state", func(t *testing.T) { + t.Run("should ignore Normal state with no reason if feature flag is enabled", func(t *testing.T) { + labels := models.InstanceLabels{"test": util.GenerateShortUID()} + instance1 := models.AlertInstance{ + AlertInstanceKey: models.AlertInstanceKey{ + RuleOrgID: orgID, + RuleUID: util.GenerateShortUID(), + LabelsHash: util.GenerateShortUID(), + }, + CurrentState: models.InstanceStateNormal, + CurrentReason: "", + Labels: labels, + } + instance2 := models.AlertInstance{ + AlertInstanceKey: models.AlertInstanceKey{ + RuleOrgID: orgID, + RuleUID: util.GenerateShortUID(), + LabelsHash: util.GenerateShortUID(), + }, + CurrentState: models.InstanceStateNormal, + CurrentReason: models.StateReasonError, + Labels: labels, + } + err := dbstore.SaveAlertInstances(ctx, instance1, instance2) + require.NoError(t, err) + listQuery := &models.ListAlertInstancesQuery{ RuleOrgID: orgID, - State: models.InstanceStateNormal, } - err := dbstore.ListAlertInstances(ctx, listQuery) + err = dbstore.ListAlertInstances(ctx, listQuery) require.NoError(t, err) - require.Len(t, listQuery.Result, 1) + containsHash(t, listQuery.Result, instance1.LabelsHash) + + f := dbstore.FeatureToggles + dbstore.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertingNoNormalState) + t.Cleanup(func() { + dbstore.FeatureToggles = f + }) + + err = dbstore.ListAlertInstances(ctx, listQuery) + require.NoError(t, err) + + containsHash(t, listQuery.Result, instance2.LabelsHash) + + for _, instance := range listQuery.Result { + if instance.CurrentState == models.InstanceStateNormal && instance.CurrentReason == "" { + require.Fail(t, "List operation expected to return all states except Normal but the result contains Normal states") + } + } }) t.Run("update instance with same org_id, uid and different state", func(t *testing.T) { diff --git a/pkg/services/ngalert/tests/util.go b/pkg/services/ngalert/tests/util.go index c7df12d3774..098a4af36aa 100644 --- a/pkg/services/ngalert/tests/util.go +++ b/pkg/services/ngalert/tests/util.go @@ -43,18 +43,6 @@ import ( "github.com/grafana/grafana/pkg/util" ) -type FakeFeatures struct { - BigTransactions bool -} - -func (f *FakeFeatures) IsEnabled(feature string) bool { - if feature == featuremgmt.FlagAlertingBigTransactions { - return f.BigTransactions - } - - return false -} - // SetupTestEnv initializes a store to used by the tests. func SetupTestEnv(tb testing.TB, baseInterval time.Duration) (*ngalert.AlertNG, *store.DBstore) { tb.Helper() @@ -100,7 +88,7 @@ func SetupTestEnv(tb testing.TB, baseInterval time.Duration) (*ngalert.AlertNG, folderService := folderimpl.ProvideService(ac, bus, cfg, dashboardService, dashboardStore, nil, features, folderPermissions, nil) ng, err := ngalert.ProvideService( - cfg, &FakeFeatures{}, nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotatest.New(false, nil), + cfg, featuremgmt.WithFeatures(), nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotatest.New(false, nil), secretsService, nil, m, folderService, ac, &dashboards.FakeDashboardService{}, nil, bus, ac, annotationstest.NewFakeAnnotationsRepo(), &plugins.FakePluginStore{}, tracer, ) require.NoError(tb, err) diff --git a/pkg/services/quota/quotaimpl/quota_test.go b/pkg/services/quota/quotaimpl/quota_test.go index 3bacfb87d96..0a622b678f4 100644 --- a/pkg/services/quota/quotaimpl/quota_test.go +++ b/pkg/services/quota/quotaimpl/quota_test.go @@ -29,7 +29,6 @@ import ( "github.com/grafana/grafana/pkg/services/ngalert" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngalertmodels "github.com/grafana/grafana/pkg/services/ngalert/models" - ngalerttests "github.com/grafana/grafana/pkg/services/ngalert/tests" "github.com/grafana/grafana/pkg/services/org" "github.com/grafana/grafana/pkg/services/org/orgimpl" "github.com/grafana/grafana/pkg/services/quota" @@ -478,7 +477,7 @@ func setupEnv(t *testing.T, sqlStore *sqlstore.SQLStore, b bus.Bus, quotaService require.NoError(t, err) m := metrics.NewNGAlert(prometheus.NewRegistry()) _, err = ngalert.ProvideService( - sqlStore.Cfg, &ngalerttests.FakeFeatures{}, nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotaService, + sqlStore.Cfg, featuremgmt.WithFeatures(), nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotaService, secretsService, nil, m, &foldertest.FakeService{}, &acmock.Mock{}, &dashboards.FakeDashboardService{}, nil, b, &acmock.Mock{}, annotationstest.NewFakeAnnotationsRepo(), &plugins.FakePluginStore{}, tracer, ) require.NoError(t, err)