diff --git a/conf/defaults.ini b/conf/defaults.ini index ee703f76d2a..8fc8296fead 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -1406,6 +1406,14 @@ resolved_alert_retention = 15m # 0 value means no limit rule_version_record_limit = 0 +# The retention period for deleted alerting rules. +# Determines how long deleted rules are retained before being permanently removed. +# The retention duration must be specified using a time format with unit suffixes +# such as ms, s, m, h (e.g., 720h for 30 days). The d suffix is not parsed; such a value falls back to the default. +# Default: 720h (30 days) +# A value of 0 means that deleted rules are removed permanently right away, without being retained. +deleted_rule_retention = 720h + [unified_alerting.screenshots] # Enable screenshots in notifications. You must have either installed the Grafana image rendering # plugin, or set up Grafana to use a remote rendering service. diff --git a/conf/sample.ini b/conf/sample.ini index 207fa535410..152fbf6fb96 100644 --- a/conf/sample.ini +++ b/conf/sample.ini @@ -1389,6 +1389,14 @@ # 0 value means no limit ;rule_version_record_limit= 0 +# The retention period for deleted alerting rules. +# Determines how long deleted rules are retained before being permanently removed. +# The retention duration must be specified using a time format with unit suffixes +# such as ms, s, m, h (e.g., 720h for 30 days). The d suffix is not parsed; such a value falls back to the default. +# Default: 720h (30 days) +# A value of 0 means that deleted rules are removed permanently right away, without being retained. +;deleted_rule_retention = 720h + [unified_alerting.screenshots] # Enable screenshots in notifications. You must have either installed the Grafana image rendering # plugin, or set up Grafana to use a remote rendering service. 
diff --git a/pkg/server/wire.go b/pkg/server/wire.go index 82b7e891669..c24a40e80d4 100644 --- a/pkg/server/wire.go +++ b/pkg/server/wire.go @@ -10,6 +10,7 @@ import ( "github.com/google/wire" sdkhttpclient "github.com/grafana/grafana-plugin-sdk-go/backend/httpclient" + "github.com/grafana/grafana/pkg/api" "github.com/grafana/grafana/pkg/api/avatar" "github.com/grafana/grafana/pkg/api/routing" @@ -421,6 +422,7 @@ var wireSet = wire.NewSet( prefimpl.ProvideService, oauthtoken.ProvideService, wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtoken.Service)), + wire.Bind(new(cleanup.AlertRuleService), new(*ngstore.DBstore)), ) var wireCLISet = wire.NewSet( @@ -453,6 +455,7 @@ var wireTestSet = wire.NewSet( oauthtoken.ProvideService, oauthtokentest.ProvideService, wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtokentest.Service)), + wire.Bind(new(cleanup.AlertRuleService), new(*ngstore.DBstore)), ) func Initialize(cfg *setting.Cfg, opts Options, apiOpts api.ServerOptions) (*Server, error) { diff --git a/pkg/services/cleanup/cleanup.go b/pkg/services/cleanup/cleanup.go index 240bcf532fd..479d34a6a75 100644 --- a/pkg/services/cleanup/cleanup.go +++ b/pkg/services/cleanup/cleanup.go @@ -27,6 +27,10 @@ import ( "github.com/grafana/grafana/pkg/setting" ) +type AlertRuleService interface { + CleanUpDeletedAlertRules(ctx context.Context) (int64, error) +} + type CleanUpService struct { log log.Logger tracer tracing.Tracer @@ -41,12 +45,13 @@ type CleanUpService struct { tempUserService tempuser.Service annotationCleaner annotations.Cleaner dashboardService dashboards.DashboardService + alertRuleService AlertRuleService } func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockService, shortURLService shorturls.Service, sqlstore db.DB, queryHistoryService queryhistory.Service, dashboardVersionService dashver.Service, dashSnapSvc dashboardsnapshots.Service, deleteExpiredImageService *image.DeleteExpiredService, - tempUserService 
tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner, dashboardService dashboards.DashboardService) *CleanUpService { + tempUserService tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner, dashboardService dashboards.DashboardService, service AlertRuleService) *CleanUpService { s := &CleanUpService{ Cfg: cfg, ServerLockService: serverLockService, @@ -61,6 +66,7 @@ func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockSe tracer: tracer, annotationCleaner: annotationCleaner, dashboardService: dashboardService, + alertRuleService: service, } return s } @@ -112,6 +118,10 @@ func (srv *CleanUpService) clean(ctx context.Context) { cleanupJobs = append(cleanupJobs, cleanUpJob{"delete stale short URLs", srv.deleteStaleShortURLs}) } + if srv.Cfg.UnifiedAlerting.DeletedRuleRetention > 0 { + cleanupJobs = append(cleanupJobs, cleanUpJob{"cleanup trash alert rules", srv.cleanUpTrashAlertRules}) + } + logger := srv.log.FromContext(ctx) logger.Debug("Starting cleanup jobs", "jobs", fmt.Sprintf("%v", cleanupJobs)) @@ -313,3 +323,13 @@ func (srv *CleanUpService) cleanUpTrashDashboards(ctx context.Context) { logger.Debug("Cleaned up deleted dashboards", "dashboards affected", affected) } } + +func (srv *CleanUpService) cleanUpTrashAlertRules(ctx context.Context) { + logger := srv.log.FromContext(ctx) + affected, err := srv.alertRuleService.CleanUpDeletedAlertRules(ctx) + if err != nil { + logger.Error("Problem cleaning up deleted alert rules", "error", err) + } else { + logger.Debug("Cleaned up deleted alert rules", "rows affected", affected) + } +} diff --git a/pkg/services/ngalert/store/alert_rule.go b/pkg/services/ngalert/store/alert_rule.go index d4fb0202a78..ccba09c5a3b 100644 --- a/pkg/services/ngalert/store/alert_rule.go +++ b/pkg/services/ngalert/store/alert_rule.go @@ -73,7 +73,7 @@ func (st DBstore) DeleteAlertRulesByUID(ctx context.Context, orgID int64, user * logger.Debug("Deleted alert rule 
state", "count", rows) var versions []alertRuleVersion - if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) { + if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) && st.Cfg.DeletedRuleRetention > 0 { // save deleted version only if retention is greater than 0 versions, err = st.getLatestVersionOfRulesByUID(ctx, orgID, ruleUID) if err != nil { logger.Error("Failed to get latest version of deleted alert rules. The recovery will not be possible", "error", err) @@ -1243,6 +1243,24 @@ func (st DBstore) GetNamespacesByRuleUID(ctx context.Context, orgID int64, uids return result, err } +func (st DBstore) CleanUpDeletedAlertRules(ctx context.Context) (int64, error) { + affectedRows := int64(-1) + err := st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error { + expire := TimeNow().Add(-st.Cfg.DeletedRuleRetention) + st.Logger.Debug("Permanently remove expired deleted rules", "deletedBefore", expire) + result, err := sess.Exec("DELETE FROM alert_rule_version WHERE rule_uid='' AND created <= ?", expire) + if err != nil { + return err + } + affectedRows, err = result.RowsAffected() + if err != nil { + st.Logger.Warn("Failed to get rows affected by the delete operation", "error", err) + } + return nil + }) + return affectedRows, err +} + func getINSubQueryArgs[T any](inputSlice []T) ([]any, []string) { args := make([]any, 0, len(inputSlice)) in := make([]string, 0, len(inputSlice)) diff --git a/pkg/services/ngalert/store/alert_rule_test.go b/pkg/services/ngalert/store/alert_rule_test.go index c3e94e7e806..46fc19eb1fb 100644 --- a/pkg/services/ngalert/store/alert_rule_test.go +++ b/pkg/services/ngalert/store/alert_rule_test.go @@ -784,13 +784,15 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) { require.Empty(t, savedInstances) }) - t.Run("should remove all version and insert one with empty rule_uid", func(t *testing.T) { + t.Run("should remove all version and insert one with empty rule_uid when 
DeletedRuleRetention is set", func(t *testing.T) { orgID := int64(rand.Intn(1000)) gen = gen.With(gen.WithOrgID(orgID)) // Create a new store to pass the custom bus to check the signal b := &fakeBus{} logger := log.New("test-dbstore") + cfg.UnifiedAlerting.DeletedRuleRetention = 1000 * time.Hour + store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b) store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore) @@ -848,6 +850,59 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) { return nil }) }) + + t.Run("should remove all versions and not keep history if DeletedRuleRetention = 0", func(t *testing.T) { + orgID := int64(rand.Intn(1000)) + gen = gen.With(gen.WithOrgID(orgID)) + // Create a new store to pass the custom bus to check the signal + b := &fakeBus{} + logger := log.New("test-dbstore") + + cfg.UnifiedAlerting.DeletedRuleRetention = 0 + + store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b) + store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore) + + result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3)) + uids := make([]string, 0, len(result)) + for _, rule := range result { + uids = append(uids, rule.UID) + } + require.NoError(t, err) + rules, err := store.ListAlertRules(context.Background(), &models.ListAlertRulesQuery{OrgID: orgID, RuleUIDs: uids}) + require.NoError(t, err) + + updates := make([]models.UpdateRule, 0, len(rules)) + for _, rule := range rules { + rule2 := models.CopyRule(rule, gen.WithTitle(util.GenerateShortUID())) + updates = append(updates, models.UpdateRule{ + Existing: rule, + New: *rule2, + }) + } + err = store.UpdateAlertRules(context.Background(), &models.AlertingUserUID, updates) + require.NoError(t, err) + + versions, err := store.GetAlertRuleVersions(context.Background(), orgID, rules[0].GUID) + require.NoError(t, err) + require.Len(t, versions, 2) + + err = 
store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uids...) + require.NoError(t, err) + + guids := make([]string, 0, len(rules)) + for _, rule := range rules { + guids = append(guids, rule.GUID) + } + + _ = sqlStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error { + var versions []alertRuleVersion + err = sess.Table(alertRuleVersion{}).Where(`rule_uid = ''`).In("rule_guid", guids).Find(&versions) + require.NoError(t, err) + require.Emptyf(t, versions, "some rules were not permanently deleted") // should be one version per GUID + return nil + }) + }) } func TestIntegrationInsertAlertRules(t *testing.T) { @@ -1962,6 +2017,7 @@ func TestIntegration_ListDeletedRules(t *testing.T) { cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{ BaseInterval: 1 * time.Second, RuleVersionRecordLimit: -1, + DeletedRuleRetention: 10 * time.Hour, } sqlStore := db.InitTestDB(t) folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures()) @@ -2011,6 +2067,72 @@ func TestIntegration_ListDeletedRules(t *testing.T) { }) } +func TestIntegration_CleanUpDeletedAlertRules(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test") + } + + oldClk := TimeNow + t.Cleanup(func() { + TimeNow = oldClk + }) + + t0 := time.Now().UTC().Truncate(time.Second) + TimeNow = func() time.Time { + return t0 + } + + sqlStore := db.InitTestDB(t, sqlstore.InitTestDBOpt{ + Cfg: nil, + }) + cfg := setting.NewCfg() + cfg.UnifiedAlerting.BaseInterval = 1 * time.Second + cfg.UnifiedAlerting.RuleVersionRecordLimit = -1 + cfg.UnifiedAlerting.DeletedRuleRetention = 10 * time.Second + + folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures()) + logger := log.New("test-dbstore") + store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, &fakeBus{}) + store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore) + + gen := models.RuleGen + orgID := 
int64(rand.Intn(1000)) + + gen = gen.With(gen.WithOrgID(orgID)) + + result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3)) + uids := make([]string, 0, len(result)) + for _, rule := range result { + uids = append(uids, rule.UID) + } + require.NoError(t, err) + + // simulate rule deletion at different time. + // t0, t0+10s, t0+20s + for idx, uid := range uids { + TimeNow = func() time.Time { + return t0.Add(time.Duration(idx) * 10 * time.Second) + } + err = store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uid) + require.NoError(t, err) + } + + before, err := store.ListDeletedRules(context.Background(), orgID) + require.NoError(t, err) + require.Len(t, before, 3) + + // retention is 10s, now=t+20s, therefore, only one row should be deleted + _, err = store.CleanUpDeletedAlertRules(context.Background()) + require.NoError(t, err) + + after, err := store.ListDeletedRules(context.Background(), orgID) + require.NoError(t, err) + assert.Len(t, after, 1) + for _, rule := range after { + assert.GreaterOrEqual(t, rule.Updated, TimeNow().Add(-cfg.UnifiedAlerting.DeletedRuleRetention)) + } +} + func createTestStore( sqlStore db.DB, folderService folder.Service, diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go index 33f002cb571..b14d22b97bc 100644 --- a/pkg/setting/setting_unified_alerting.go +++ b/pkg/setting/setting_unified_alerting.go @@ -129,6 +129,9 @@ type UnifiedAlertingSettings struct { // should be stored in the database for each alert_rule in an organization including the current one. // 0 value means no limit RuleVersionRecordLimit int + + // DeletedRuleRetention defines the maximum duration to retain deleted alerting rules before permanent removal. 
+ DeletedRuleRetention time.Duration } type RecordingRuleSettings struct { @@ -477,6 +480,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer are allowed") } + uaCfg.DeletedRuleRetention = ua.Key("deleted_rule_retention").MustDuration(30 * 24 * time.Hour) + if uaCfg.DeletedRuleRetention < 0 { + return fmt.Errorf("setting 'deleted_rule_retention' is invalid, only 0 or a positive duration are allowed") + } + cfg.UnifiedAlerting = uaCfg return nil }