Alerting: Add scheduled clean-up of deleted rules (#101963)

* add scheduled clean up of deleted rules


---------

Signed-off-by: Yuri Tseretyan <yuriy.tseretyan@grafana.com>
1036-docs-add-documentation-for-alert-rule-history
Yuri Tseretyan 4 months ago committed by GitHub
parent 9870718c3a
commit 943b73a682
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 8
      conf/defaults.ini
  2. 8
      conf/sample.ini
  3. 3
      pkg/server/wire.go
  4. 22
      pkg/services/cleanup/cleanup.go
  5. 20
      pkg/services/ngalert/store/alert_rule.go
  6. 124
      pkg/services/ngalert/store/alert_rule_test.go
  7. 8
      pkg/setting/setting_unified_alerting.go

@ -1406,6 +1406,14 @@ resolved_alert_retention = 15m
# 0 value means no limit
rule_version_record_limit = 0
# The retention period for deleted alerting rules.
# Determines how long deleted rules are retained before being permanently removed.
# The retention duration must be specified using a time format with unit suffixes
# such as ms, s, m, h, d (e.g., 30d for 30 days).
# Default: 30d
# A value of 0 disables retention: deleted rules are removed permanently right away.
deleted_rule_retention = 30d
[unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service.

@ -1389,6 +1389,14 @@
# 0 value means no limit
;rule_version_record_limit= 0
# The retention period for deleted alerting rules.
# Determines how long deleted rules are retained before being permanently removed.
# The retention duration must be specified using a time format with unit suffixes
# such as ms, s, m, h, d (e.g., 30d for 30 days).
# Default: 30d
# A value of 0 disables retention: deleted rules are removed permanently right away.
;deleted_rule_retention = 30d
[unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service.

@ -10,6 +10,7 @@ import (
"github.com/google/wire"
sdkhttpclient "github.com/grafana/grafana-plugin-sdk-go/backend/httpclient"
"github.com/grafana/grafana/pkg/api"
"github.com/grafana/grafana/pkg/api/avatar"
"github.com/grafana/grafana/pkg/api/routing"
@ -421,6 +422,7 @@ var wireSet = wire.NewSet(
prefimpl.ProvideService,
oauthtoken.ProvideService,
wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtoken.Service)),
wire.Bind(new(cleanup.AlertRuleService), new(*ngstore.DBstore)),
)
var wireCLISet = wire.NewSet(
@ -453,6 +455,7 @@ var wireTestSet = wire.NewSet(
oauthtoken.ProvideService,
oauthtokentest.ProvideService,
wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtokentest.Service)),
wire.Bind(new(cleanup.AlertRuleService), new(*ngstore.DBstore)),
)
func Initialize(cfg *setting.Cfg, opts Options, apiOpts api.ServerOptions) (*Server, error) {

@ -27,6 +27,10 @@ import (
"github.com/grafana/grafana/pkg/setting"
)
// AlertRuleService is the part of alert rule storage that the clean-up service depends on.
type AlertRuleService interface {
// CleanUpDeletedAlertRules permanently removes deleted alert rules that are due for removal.
// The returned int64 is logged by the clean-up job as the number of rows affected.
CleanUpDeletedAlertRules(ctx context.Context) (int64, error)
}
type CleanUpService struct {
log log.Logger
tracer tracing.Tracer
@ -41,12 +45,13 @@ type CleanUpService struct {
tempUserService tempuser.Service
annotationCleaner annotations.Cleaner
dashboardService dashboards.DashboardService
alertRuleService AlertRuleService
}
func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockService,
shortURLService shorturls.Service, sqlstore db.DB, queryHistoryService queryhistory.Service,
dashboardVersionService dashver.Service, dashSnapSvc dashboardsnapshots.Service, deleteExpiredImageService *image.DeleteExpiredService,
tempUserService tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner, dashboardService dashboards.DashboardService) *CleanUpService {
tempUserService tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner, dashboardService dashboards.DashboardService, service AlertRuleService) *CleanUpService {
s := &CleanUpService{
Cfg: cfg,
ServerLockService: serverLockService,
@ -61,6 +66,7 @@ func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockSe
tracer: tracer,
annotationCleaner: annotationCleaner,
dashboardService: dashboardService,
alertRuleService: service,
}
return s
}
@ -112,6 +118,10 @@ func (srv *CleanUpService) clean(ctx context.Context) {
cleanupJobs = append(cleanupJobs, cleanUpJob{"delete stale short URLs", srv.deleteStaleShortURLs})
}
if srv.Cfg.UnifiedAlerting.DeletedRuleRetention > 0 {
cleanupJobs = append(cleanupJobs, cleanUpJob{"cleanup trash alert rules", srv.cleanUpTrashAlertRules})
}
logger := srv.log.FromContext(ctx)
logger.Debug("Starting cleanup jobs", "jobs", fmt.Sprintf("%v", cleanupJobs))
@ -313,3 +323,13 @@ func (srv *CleanUpService) cleanUpTrashDashboards(ctx context.Context) {
logger.Debug("Cleaned up deleted dashboards", "dashboards affected", affected)
}
}
// cleanUpTrashAlertRules is the clean-up job that asks the alert rule service to
// purge deleted alert rules. The outcome is only logged: a failure is reported at
// error level, a success at debug level together with the affected row count.
func (srv *CleanUpService) cleanUpTrashAlertRules(ctx context.Context) {
	lg := srv.log.FromContext(ctx)
	removed, cleanErr := srv.alertRuleService.CleanUpDeletedAlertRules(ctx)
	if cleanErr == nil {
		lg.Debug("Cleaned up deleted alert rules", "rows affected", removed)
		return
	}
	lg.Error("Problem cleaning up deleted alert rules", "error", cleanErr)
}

@ -73,7 +73,7 @@ func (st DBstore) DeleteAlertRulesByUID(ctx context.Context, orgID int64, user *
logger.Debug("Deleted alert rule state", "count", rows)
var versions []alertRuleVersion
if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) {
if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) && st.Cfg.DeletedRuleRetention > 0 { // save deleted version only if retention is greater than 0
versions, err = st.getLatestVersionOfRulesByUID(ctx, orgID, ruleUID)
if err != nil {
logger.Error("Failed to get latest version of deleted alert rules. The recovery will not be possible", "error", err)
@ -1243,6 +1243,24 @@ func (st DBstore) GetNamespacesByRuleUID(ctx context.Context, orgID int64, uids
return result, err
}
// CleanUpDeletedAlertRules permanently removes the retained copies of deleted
// alert rules (alert_rule_version rows with an empty rule_uid) whose created
// timestamp is at or before now minus Cfg.DeletedRuleRetention.
//
// It returns the number of rows removed. The count is -1 when it is unknown:
// either the delete itself failed (the error is returned), or the driver could
// not report the affected rows (a warning is logged and the error is nil,
// because the delete itself succeeded).
func (st DBstore) CleanUpDeletedAlertRules(ctx context.Context) (int64, error) {
	affectedRows := int64(-1)
	err := st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
		expire := TimeNow().Add(-st.Cfg.DeletedRuleRetention)
		st.Logger.Debug("Permanently remove expired deleted rules", "deletedBefore", expire)
		result, err := sess.Exec("DELETE FROM alert_rule_version WHERE rule_uid='' AND created <= ?", expire)
		if err != nil {
			return err
		}
		rows, err := result.RowsAffected()
		if err != nil {
			// Not being able to count is not a failure of the delete; keep the
			// -1 sentinel instead of overwriting it with the value returned
			// alongside the error.
			st.Logger.Warn("Failed to get rows affected by the delete operation", "error", err)
			return nil
		}
		affectedRows = rows
		return nil
	})
	return affectedRows, err
}
func getINSubQueryArgs[T any](inputSlice []T) ([]any, []string) {
args := make([]any, 0, len(inputSlice))
in := make([]string, 0, len(inputSlice))

@ -784,13 +784,15 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
require.Empty(t, savedInstances)
})
t.Run("should remove all version and insert one with empty rule_uid", func(t *testing.T) {
t.Run("should remove all version and insert one with empty rule_uid when DeletedRuleRetention is set", func(t *testing.T) {
orgID := int64(rand.Intn(1000))
gen = gen.With(gen.WithOrgID(orgID))
// Create a new store to pass the custom bus to check the signal
b := &fakeBus{}
logger := log.New("test-dbstore")
cfg.UnifiedAlerting.DeletedRuleRetention = 1000 * time.Hour
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b)
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
@ -848,6 +850,59 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
return nil
})
})
// With DeletedRuleRetention = 0, deleting rules must not leave any retained copy
// (a version row with empty rule_uid) behind, even when the restore feature is enabled.
t.Run("should remove all versions and not keep history if DeletedRuleRetention = 0", func(t *testing.T) {
orgID := int64(rand.Intn(1000))
gen = gen.With(gen.WithOrgID(orgID))
// Create a new store to pass the custom bus to check the signal
b := &fakeBus{}
logger := log.New("test-dbstore")
// zero retention: the store is expected to skip saving the deleted version
cfg.UnifiedAlerting.DeletedRuleRetention = 0
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b)
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3))
uids := make([]string, 0, len(result))
for _, rule := range result {
uids = append(uids, rule.UID)
}
require.NoError(t, err)
rules, err := store.ListAlertRules(context.Background(), &models.ListAlertRulesQuery{OrgID: orgID, RuleUIDs: uids})
require.NoError(t, err)
// update every rule once so that each has more than one version before deletion
updates := make([]models.UpdateRule, 0, len(rules))
for _, rule := range rules {
rule2 := models.CopyRule(rule, gen.WithTitle(util.GenerateShortUID()))
updates = append(updates, models.UpdateRule{
Existing: rule,
New: *rule2,
})
}
err = store.UpdateAlertRules(context.Background(), &models.AlertingUserUID, updates)
require.NoError(t, err)
versions, err := store.GetAlertRuleVersions(context.Background(), orgID, rules[0].GUID)
require.NoError(t, err)
require.Len(t, versions, 2)
err = store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uids...)
require.NoError(t, err)
guids := make([]string, 0, len(rules))
for _, rule := range rules {
guids = append(guids, rule.GUID)
}
_ = sqlStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
var versions []alertRuleVersion
err = sess.Table(alertRuleVersion{}).Where(`rule_uid = ''`).In("rule_guid", guids).Find(&versions)
require.NoError(t, err)
require.Emptyf(t, versions, "some rules were not permanently deleted") // no version with empty rule_uid may remain for any GUID
return nil
})
})
}
func TestIntegrationInsertAlertRules(t *testing.T) {
@ -1962,6 +2017,7 @@ func TestIntegration_ListDeletedRules(t *testing.T) {
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{
BaseInterval: 1 * time.Second,
RuleVersionRecordLimit: -1,
DeletedRuleRetention: 10 * time.Hour,
}
sqlStore := db.InitTestDB(t)
folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures())
@ -2011,6 +2067,72 @@ func TestIntegration_ListDeletedRules(t *testing.T) {
})
}
// TestIntegration_CleanUpDeletedAlertRules deletes three rules at different
// (faked) times and checks that CleanUpDeletedAlertRules purges only those whose
// retention period has elapsed.
func TestIntegration_CleanUpDeletedAlertRules(t *testing.T) {
if testing.Short() {
t.Skip("skipping integration test")
}
// TimeNow is a package-level clock; restore it once the test is done.
oldClk := TimeNow
t.Cleanup(func() {
TimeNow = oldClk
})
t0 := time.Now().UTC().Truncate(time.Second)
TimeNow = func() time.Time {
return t0
}
sqlStore := db.InitTestDB(t, sqlstore.InitTestDBOpt{
Cfg: nil,
})
cfg := setting.NewCfg()
cfg.UnifiedAlerting.BaseInterval = 1 * time.Second
cfg.UnifiedAlerting.RuleVersionRecordLimit = -1
cfg.UnifiedAlerting.DeletedRuleRetention = 10 * time.Second
folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures())
logger := log.New("test-dbstore")
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, &fakeBus{})
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
gen := models.RuleGen
orgID := int64(rand.Intn(1000))
gen = gen.With(gen.WithOrgID(orgID))
result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3))
uids := make([]string, 0, len(result))
for _, rule := range result {
uids = append(uids, rule.UID)
}
require.NoError(t, err)
// simulate rule deletion at different time.
// t0, t0+10s, t0+20s
// The clock is left at the last value (t0+20s) once the loop finishes.
for idx, uid := range uids {
TimeNow = func() time.Time {
return t0.Add(time.Duration(idx) * 10 * time.Second)
}
err = store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uid)
require.NoError(t, err)
}
before, err := store.ListDeletedRules(context.Background(), orgID)
require.NoError(t, err)
require.Len(t, before, 3)
// retention is 10s and now=t0+20s, so the cut-off is t0+10s (inclusive): the rules deleted
// at t0 and t0+10s are purged and only the one deleted at t0+20s should remain
_, err = store.CleanUpDeletedAlertRules(context.Background())
require.NoError(t, err)
after, err := store.ListDeletedRules(context.Background(), orgID)
require.NoError(t, err)
assert.Len(t, after, 1)
// every surviving rule must still be inside the retention window
for _, rule := range after {
assert.GreaterOrEqual(t, rule.Updated, TimeNow().Add(-cfg.UnifiedAlerting.DeletedRuleRetention))
}
}
func createTestStore(
sqlStore db.DB,
folderService folder.Service,

@ -129,6 +129,9 @@ type UnifiedAlertingSettings struct {
// should be stored in the database for each alert_rule in an organization including the current one.
// 0 value means no limit
RuleVersionRecordLimit int
// DeletedRuleRetention defines the maximum duration to retain deleted alerting rules before permanent removal.
DeletedRuleRetention time.Duration
}
type RecordingRuleSettings struct {
@ -477,6 +480,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer are allowed")
}
uaCfg.DeletedRuleRetention = ua.Key("deleted_rule_retention").MustDuration(30 * 24 * time.Hour)
if uaCfg.DeletedRuleRetention < 0 {
return fmt.Errorf("setting 'deleted_rule_retention' is invalid, only 0 or a positive duration are allowed")
}
cfg.UnifiedAlerting = uaCfg
return nil
}

Loading…
Cancel
Save