Alerting: Add scheduled clean-up of deleted rules (#101963)

* add scheduled clean up of deleted rules


---------

Signed-off-by: Yuri Tseretyan <yuriy.tseretyan@grafana.com>
1036-docs-add-documentation-for-alert-rule-history
Yuri Tseretyan 4 months ago committed by GitHub
parent 9870718c3a
commit 943b73a682
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 8
      conf/defaults.ini
  2. 8
      conf/sample.ini
  3. 3
      pkg/server/wire.go
  4. 22
      pkg/services/cleanup/cleanup.go
  5. 20
      pkg/services/ngalert/store/alert_rule.go
  6. 124
      pkg/services/ngalert/store/alert_rule_test.go
  7. 8
      pkg/setting/setting_unified_alerting.go

@ -1406,6 +1406,14 @@ resolved_alert_retention = 15m
# 0 value means no limit # 0 value means no limit
rule_version_record_limit = 0 rule_version_record_limit = 0
# The retention period for deleted alerting rules.
# Determines how long deleted rules are retained before being permanently removed.
# The retention duration must be specified using a time format with unit suffixes
# such as ms, s, m, h, d (e.g., 30d for 30 days).
# Default: 30d
# A value of 0 disables retention: deleted rules are removed permanently right away.
deleted_rule_retention = 30d
[unified_alerting.screenshots] [unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering # Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service. # plugin, or set up Grafana to use a remote rendering service.

@ -1389,6 +1389,14 @@
# 0 value means no limit # 0 value means no limit
;rule_version_record_limit= 0 ;rule_version_record_limit= 0
# The retention period for deleted alerting rules.
# Determines how long deleted rules are retained before being permanently removed.
# The retention duration must be specified using a time format with unit suffixes
# such as ms, s, m, h, d (e.g., 30d for 30 days).
# Default: 30d
# A value of 0 disables retention: deleted rules are removed permanently right away.
;deleted_rule_retention = 30d
[unified_alerting.screenshots] [unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering # Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service. # plugin, or set up Grafana to use a remote rendering service.

@ -10,6 +10,7 @@ import (
"github.com/google/wire" "github.com/google/wire"
sdkhttpclient "github.com/grafana/grafana-plugin-sdk-go/backend/httpclient" sdkhttpclient "github.com/grafana/grafana-plugin-sdk-go/backend/httpclient"
"github.com/grafana/grafana/pkg/api" "github.com/grafana/grafana/pkg/api"
"github.com/grafana/grafana/pkg/api/avatar" "github.com/grafana/grafana/pkg/api/avatar"
"github.com/grafana/grafana/pkg/api/routing" "github.com/grafana/grafana/pkg/api/routing"
@ -421,6 +422,7 @@ var wireSet = wire.NewSet(
prefimpl.ProvideService, prefimpl.ProvideService,
oauthtoken.ProvideService, oauthtoken.ProvideService,
wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtoken.Service)), wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtoken.Service)),
wire.Bind(new(cleanup.AlertRuleService), new(*ngstore.DBstore)),
) )
var wireCLISet = wire.NewSet( var wireCLISet = wire.NewSet(
@ -453,6 +455,7 @@ var wireTestSet = wire.NewSet(
oauthtoken.ProvideService, oauthtoken.ProvideService,
oauthtokentest.ProvideService, oauthtokentest.ProvideService,
wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtokentest.Service)), wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtokentest.Service)),
wire.Bind(new(cleanup.AlertRuleService), new(*ngstore.DBstore)),
) )
func Initialize(cfg *setting.Cfg, opts Options, apiOpts api.ServerOptions) (*Server, error) { func Initialize(cfg *setting.Cfg, opts Options, apiOpts api.ServerOptions) (*Server, error) {

@ -27,6 +27,10 @@ import (
"github.com/grafana/grafana/pkg/setting" "github.com/grafana/grafana/pkg/setting"
) )
// AlertRuleService is the alert rule operation the clean-up service depends on.
type AlertRuleService interface {
// CleanUpDeletedAlertRules permanently removes deleted alert rules.
// The returned count is logged by the clean-up job as the number of rows affected.
CleanUpDeletedAlertRules(ctx context.Context) (int64, error)
}
type CleanUpService struct { type CleanUpService struct {
log log.Logger log log.Logger
tracer tracing.Tracer tracer tracing.Tracer
@ -41,12 +45,13 @@ type CleanUpService struct {
tempUserService tempuser.Service tempUserService tempuser.Service
annotationCleaner annotations.Cleaner annotationCleaner annotations.Cleaner
dashboardService dashboards.DashboardService dashboardService dashboards.DashboardService
alertRuleService AlertRuleService
} }
func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockService, func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockService,
shortURLService shorturls.Service, sqlstore db.DB, queryHistoryService queryhistory.Service, shortURLService shorturls.Service, sqlstore db.DB, queryHistoryService queryhistory.Service,
dashboardVersionService dashver.Service, dashSnapSvc dashboardsnapshots.Service, deleteExpiredImageService *image.DeleteExpiredService, dashboardVersionService dashver.Service, dashSnapSvc dashboardsnapshots.Service, deleteExpiredImageService *image.DeleteExpiredService,
tempUserService tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner, dashboardService dashboards.DashboardService) *CleanUpService { tempUserService tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner, dashboardService dashboards.DashboardService, service AlertRuleService) *CleanUpService {
s := &CleanUpService{ s := &CleanUpService{
Cfg: cfg, Cfg: cfg,
ServerLockService: serverLockService, ServerLockService: serverLockService,
@ -61,6 +66,7 @@ func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockSe
tracer: tracer, tracer: tracer,
annotationCleaner: annotationCleaner, annotationCleaner: annotationCleaner,
dashboardService: dashboardService, dashboardService: dashboardService,
alertRuleService: service,
} }
return s return s
} }
@ -112,6 +118,10 @@ func (srv *CleanUpService) clean(ctx context.Context) {
cleanupJobs = append(cleanupJobs, cleanUpJob{"delete stale short URLs", srv.deleteStaleShortURLs}) cleanupJobs = append(cleanupJobs, cleanUpJob{"delete stale short URLs", srv.deleteStaleShortURLs})
} }
if srv.Cfg.UnifiedAlerting.DeletedRuleRetention > 0 {
cleanupJobs = append(cleanupJobs, cleanUpJob{"cleanup trash alert rules", srv.cleanUpTrashAlertRules})
}
logger := srv.log.FromContext(ctx) logger := srv.log.FromContext(ctx)
logger.Debug("Starting cleanup jobs", "jobs", fmt.Sprintf("%v", cleanupJobs)) logger.Debug("Starting cleanup jobs", "jobs", fmt.Sprintf("%v", cleanupJobs))
@ -313,3 +323,13 @@ func (srv *CleanUpService) cleanUpTrashDashboards(ctx context.Context) {
logger.Debug("Cleaned up deleted dashboards", "dashboards affected", affected) logger.Debug("Cleaned up deleted dashboards", "dashboards affected", affected)
} }
} }
// cleanUpTrashAlertRules asks the alert rule service to permanently remove
// deleted alert rules and logs the outcome. Failures are logged at error level
// and are not propagated to the caller.
func (srv *CleanUpService) cleanUpTrashAlertRules(ctx context.Context) {
	ctxLogger := srv.log.FromContext(ctx)

	removed, cleanErr := srv.alertRuleService.CleanUpDeletedAlertRules(ctx)
	if cleanErr != nil {
		ctxLogger.Error("Problem cleaning up deleted alert rules", "error", cleanErr)
		return
	}

	ctxLogger.Debug("Cleaned up deleted alert rules", "rows affected", removed)
}

@ -73,7 +73,7 @@ func (st DBstore) DeleteAlertRulesByUID(ctx context.Context, orgID int64, user *
logger.Debug("Deleted alert rule state", "count", rows) logger.Debug("Deleted alert rule state", "count", rows)
var versions []alertRuleVersion var versions []alertRuleVersion
if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) { if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) && st.Cfg.DeletedRuleRetention > 0 { // save deleted version only if retention is greater than 0
versions, err = st.getLatestVersionOfRulesByUID(ctx, orgID, ruleUID) versions, err = st.getLatestVersionOfRulesByUID(ctx, orgID, ruleUID)
if err != nil { if err != nil {
logger.Error("Failed to get latest version of deleted alert rules. The recovery will not be possible", "error", err) logger.Error("Failed to get latest version of deleted alert rules. The recovery will not be possible", "error", err)
@ -1243,6 +1243,24 @@ func (st DBstore) GetNamespacesByRuleUID(ctx context.Context, orgID int64, uids
return result, err return result, err
} }
// CleanUpDeletedAlertRules permanently deletes the alert_rule_version rows that
// have an empty rule_uid (the marker used for deleted rules) and whose created
// timestamp is at or before TimeNow() minus the configured DeletedRuleRetention.
//
// It returns the number of rows removed, or -1 when that number is unknown:
// either the transaction failed (a non-nil error is returned as well), or the
// driver could not report the affected-row count. The latter is only logged as
// a warning because the delete statement itself has already succeeded.
func (st DBstore) CleanUpDeletedAlertRules(ctx context.Context) (int64, error) {
	// -1 is the "unknown" sentinel; it is only replaced by a count the driver
	// actually reported.
	affectedRows := int64(-1)
	err := st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
		expire := TimeNow().Add(-st.Cfg.DeletedRuleRetention)
		st.Logger.Debug("Permanently remove expired deleted rules", "deletedBefore", expire)
		result, err := sess.Exec("DELETE FROM alert_rule_version WHERE rule_uid='' AND created <= ?", expire)
		if err != nil {
			return err
		}
		rows, err := result.RowsAffected()
		if err != nil {
			// Keep the sentinel instead of whatever value accompanied the error.
			st.Logger.Warn("Failed to get rows affected by the delete operation", "error", err)
			return nil
		}
		affectedRows = rows
		return nil
	})
	if err != nil {
		// The transaction did not complete, so any count collected inside it is
		// not meaningful.
		return -1, err
	}
	return affectedRows, nil
}
func getINSubQueryArgs[T any](inputSlice []T) ([]any, []string) { func getINSubQueryArgs[T any](inputSlice []T) ([]any, []string) {
args := make([]any, 0, len(inputSlice)) args := make([]any, 0, len(inputSlice))
in := make([]string, 0, len(inputSlice)) in := make([]string, 0, len(inputSlice))

@ -784,13 +784,15 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
require.Empty(t, savedInstances) require.Empty(t, savedInstances)
}) })
t.Run("should remove all version and insert one with empty rule_uid", func(t *testing.T) { t.Run("should remove all version and insert one with empty rule_uid when DeletedRuleRetention is set", func(t *testing.T) {
orgID := int64(rand.Intn(1000)) orgID := int64(rand.Intn(1000))
gen = gen.With(gen.WithOrgID(orgID)) gen = gen.With(gen.WithOrgID(orgID))
// Create a new store to pass the custom bus to check the signal // Create a new store to pass the custom bus to check the signal
b := &fakeBus{} b := &fakeBus{}
logger := log.New("test-dbstore") logger := log.New("test-dbstore")
cfg.UnifiedAlerting.DeletedRuleRetention = 1000 * time.Hour
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b) store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b)
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore) store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
@ -848,6 +850,59 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
return nil return nil
}) })
}) })
t.Run("should remove all versions and not keep history if DeletedRuleRetention = 0", func(t *testing.T) {
orgID := int64(rand.Intn(1000))
gen = gen.With(gen.WithOrgID(orgID))
// Create a new store to pass the custom bus to check the signal
b := &fakeBus{}
logger := log.New("test-dbstore")
cfg.UnifiedAlerting.DeletedRuleRetention = 0
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b)
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3))
uids := make([]string, 0, len(result))
for _, rule := range result {
uids = append(uids, rule.UID)
}
require.NoError(t, err)
rules, err := store.ListAlertRules(context.Background(), &models.ListAlertRulesQuery{OrgID: orgID, RuleUIDs: uids})
require.NoError(t, err)
updates := make([]models.UpdateRule, 0, len(rules))
for _, rule := range rules {
rule2 := models.CopyRule(rule, gen.WithTitle(util.GenerateShortUID()))
updates = append(updates, models.UpdateRule{
Existing: rule,
New: *rule2,
})
}
err = store.UpdateAlertRules(context.Background(), &models.AlertingUserUID, updates)
require.NoError(t, err)
versions, err := store.GetAlertRuleVersions(context.Background(), orgID, rules[0].GUID)
require.NoError(t, err)
require.Len(t, versions, 2)
err = store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uids...)
require.NoError(t, err)
guids := make([]string, 0, len(rules))
for _, rule := range rules {
guids = append(guids, rule.GUID)
}
_ = sqlStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
var versions []alertRuleVersion
err = sess.Table(alertRuleVersion{}).Where(`rule_uid = ''`).In("rule_guid", guids).Find(&versions)
require.NoError(t, err)
require.Emptyf(t, versions, "some rules were not permanently deleted") // should be one version per GUID
return nil
})
})
} }
func TestIntegrationInsertAlertRules(t *testing.T) { func TestIntegrationInsertAlertRules(t *testing.T) {
@ -1962,6 +2017,7 @@ func TestIntegration_ListDeletedRules(t *testing.T) {
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{ cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{
BaseInterval: 1 * time.Second, BaseInterval: 1 * time.Second,
RuleVersionRecordLimit: -1, RuleVersionRecordLimit: -1,
DeletedRuleRetention: 10 * time.Hour,
} }
sqlStore := db.InitTestDB(t) sqlStore := db.InitTestDB(t)
folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures()) folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures())
@ -2011,6 +2067,72 @@ func TestIntegration_ListDeletedRules(t *testing.T) {
}) })
} }
// TestIntegration_CleanUpDeletedAlertRules verifies that CleanUpDeletedAlertRules
// removes only the deleted rules that are older than the configured retention.
// It deletes three rules at controlled points in time (by overriding TimeNow),
// runs the clean-up with a 10s retention and checks which deleted rules remain.
func TestIntegration_CleanUpDeletedAlertRules(t *testing.T) {
if testing.Short() {
t.Skip("skipping integration test")
}
// Replace the package-level clock for the duration of the test and restore it afterwards.
oldClk := TimeNow
t.Cleanup(func() {
TimeNow = oldClk
})
t0 := time.Now().UTC().Truncate(time.Second)
TimeNow = func() time.Time {
return t0
}
sqlStore := db.InitTestDB(t, sqlstore.InitTestDBOpt{
Cfg: nil,
})
cfg := setting.NewCfg()
cfg.UnifiedAlerting.BaseInterval = 1 * time.Second
cfg.UnifiedAlerting.RuleVersionRecordLimit = -1
// Retention window that the clean-up below is checked against.
cfg.UnifiedAlerting.DeletedRuleRetention = 10 * time.Second
folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures())
logger := log.New("test-dbstore")
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, &fakeBus{})
// The test relies on deleted rules being listable afterwards, so the restore feature is enabled.
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
gen := models.RuleGen
orgID := int64(rand.Intn(1000))
gen = gen.With(gen.WithOrgID(orgID))
result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3))
uids := make([]string, 0, len(result))
for _, rule := range result {
uids = append(uids, rule.UID)
}
require.NoError(t, err)
// simulate rule deletion at different time.
// t0, t0+10s, t0+20s
// NOTE(review): presumably the deleted-rule record is stamped with TimeNow() at deletion — confirm in DeleteAlertRulesByUID.
for idx, uid := range uids {
TimeNow = func() time.Time {
return t0.Add(time.Duration(idx) * 10 * time.Second)
}
err = store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uid)
require.NoError(t, err)
}
before, err := store.ListDeletedRules(context.Background(), orgID)
require.NoError(t, err)
require.Len(t, before, 3)
// retention is 10s and the clock is left at t0+20s, so the cut-off is t0+10s.
// The cut-off is inclusive, therefore the rules deleted at t0 and t0+10s are
// purged and only the one deleted at t0+20s should remain.
_, err = store.CleanUpDeletedAlertRules(context.Background())
require.NoError(t, err)
after, err := store.ListDeletedRules(context.Background(), orgID)
require.NoError(t, err)
assert.Len(t, after, 1)
// Every surviving deleted rule must not be older than the retention cut-off.
for _, rule := range after {
assert.GreaterOrEqual(t, rule.Updated, TimeNow().Add(-cfg.UnifiedAlerting.DeletedRuleRetention))
}
}
func createTestStore( func createTestStore(
sqlStore db.DB, sqlStore db.DB,
folderService folder.Service, folderService folder.Service,

@ -129,6 +129,9 @@ type UnifiedAlertingSettings struct {
// should be stored in the database for each alert_rule in an organization including the current one. // should be stored in the database for each alert_rule in an organization including the current one.
// 0 value means no limit // 0 value means no limit
RuleVersionRecordLimit int RuleVersionRecordLimit int
// DeletedRuleRetention defines how long deleted alerting rules are retained before permanent removal.
// A value of 0 means deleted rules are not retained at all.
DeletedRuleRetention time.Duration
} }
type RecordingRuleSettings struct { type RecordingRuleSettings struct {
@ -477,6 +480,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer are allowed") return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer are allowed")
} }
uaCfg.DeletedRuleRetention = ua.Key("deleted_rule_retention").MustDuration(30 * 24 * time.Hour)
if uaCfg.DeletedRuleRetention < 0 {
return fmt.Errorf("setting 'deleted_rule_retention' is invalid, only 0 or a positive duration are allowed")
}
cfg.UnifiedAlerting = uaCfg cfg.UnifiedAlerting = uaCfg
return nil return nil
} }

Loading…
Cancel
Save