Alerting: Limit and clean up old alert rules versions (#89754)

pull/94297/head
Brandon committed 8 months ago (via GitHub)
parent 4ec75bcc60
commit fbad76007d
  1. conf/defaults.ini (9 changed lines)
  2. conf/sample.ini (5 changed lines)
  3. pkg/services/ngalert/store/alert_rule.go (57 changed lines)
  4. pkg/services/ngalert/store/alert_rule_test.go (129 changed lines)
  5. pkg/setting/setting_unified_alerting.go (10 changed lines)

@ -1346,6 +1346,11 @@ notification_log_retention = 5d
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
resolved_alert_retention = 15m
# Defines how many alert rule versions, including the current one,
# should be stored in the database for each alert rule in an organization.
# A value of 0 means no limit.
rule_version_record_limit = 0
[unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service.
@ -1560,8 +1565,8 @@ expire_time = 7
#################################### Internal Grafana Metrics ############
# Metrics available at HTTP URL /metrics and /metrics/plugins/:pluginId
[metrics]
enabled = true
interval_seconds = 10
# Disable total stats (stat_totals_*) metrics to be generated
disable_total_stats = false
# The interval at which the total stats collector will update the stats. Default is 1800 seconds.

@ -1335,6 +1335,11 @@
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
;resolved_alert_retention = 15m
# Defines how many alert rule versions, including the current one,
# should be stored in the database for each alert rule in an organization.
# A value of 0 means no limit.
;rule_version_record_limit = 0
[unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service.

@ -274,9 +274,64 @@ func (st DBstore) UpdateAlertRules(ctx context.Context, rules []ngmodels.UpdateR
			if _, err := sess.Insert(&ruleVersions); err != nil {
				return fmt.Errorf("failed to create new rule versions: %w", err)
			}
			for _, rule := range ruleVersions {
				// delete old versions of alert rule
				_, err = st.deleteOldAlertRuleVersions(ctx, rule.RuleUID, rule.RuleOrgID, st.Cfg.RuleVersionRecordLimit)
				if err != nil {
					st.Logger.Warn("Failed to delete old alert rule versions", "org", rule.RuleOrgID, "rule", rule.RuleUID, "error", err)
				}
			}
		}
		return nil
	})
}
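// deleteOldAlertRuleVersions keeps the `limit` most recent alert_rule_version
// rows for the given rule and deletes everything older, returning the number of
// deleted rows. A limit of 0 disables the cleanup; a negative limit is rejected.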
func (st DBstore) deleteOldAlertRuleVersions(ctx context.Context, ruleUID string, orgID int64, limit int) (int64, error) {
	if limit < 0 {
		return 0, fmt.Errorf("failed to delete old alert rule versions: limit is set to '%d' but needs to be >= 0", limit)
	}

	if limit < 1 {
		return 0, nil
	}

	var affectedRows int64
	err := st.SQLStore.WithDbSession(ctx, func(sess *db.Session) error {
		highest := &alertRuleVersion{}
		ok, err := sess.Table("alert_rule_version").Desc("id").Where("rule_org_id = ?", orgID).Where("rule_uid = ?", ruleUID).Limit(1, limit).Get(highest)
		if err != nil {
			return err
		}
		if !ok {
			// No alert rule versions past the limit exist. Nothing to clean up.
			affectedRows = 0
			return nil
		}

		res, err := sess.Exec(`
			DELETE FROM
				alert_rule_version
			WHERE
				rule_org_id = ? AND rule_uid = ?
			AND
				id <= ?
		`, orgID, ruleUID, highest.ID)
		if err != nil {
			return err
		}
		rows, err := res.RowsAffected()
		if err != nil {
			return err
		}
		affectedRows = rows
		if affectedRows > 0 {
			st.Logger.Info("Deleted old alert_rule_version(s)", "org", orgID, "limit", limit, "delete_count", affectedRows)
		}
		return nil
	})
	return affectedRows, err
}
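For readers unfamiliar with xorm's Limit(count, offset), the lookup above fetches the single version row sitting at offset `limit` when the versions are ordered newest-first; that row and everything older is then deleted. A rough plain-SQL sketch of the same two steps, written against database/sql with `?` placeholders (an illustration only, not code from this commit; deleteOldVersionsSketch is a hypothetical helper):

// Sketch only: approximates deleteOldAlertRuleVersions using database/sql.
// Table and column names are taken from the queries in this commit.
// (requires: import "database/sql")
func deleteOldVersionsSketch(db *sql.DB, orgID int64, ruleUID string, limit int) (int64, error) {
	// Newest-first, skip `limit` rows: the first remaining row is the cutoff.
	var cutoffID int64
	err := db.QueryRow(`
		SELECT id FROM alert_rule_version
		WHERE rule_org_id = ? AND rule_uid = ?
		ORDER BY id DESC
		LIMIT 1 OFFSET ?`, orgID, ruleUID, limit).Scan(&cutoffID)
	if err == sql.ErrNoRows {
		return 0, nil // fewer than `limit` versions exist; nothing to clean up
	}
	if err != nil {
		return 0, err
	}
	// Delete the cutoff row and everything older than it.
	res, err := db.Exec(`
		DELETE FROM alert_rule_version
		WHERE rule_org_id = ? AND rule_uid = ? AND id <= ?`, orgID, ruleUID, cutoffID)
	if err != nil {
		return 0, err
	}
	return res.RowsAffected()
}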
// preventIntermediateUniqueConstraintViolations prevents unique constraint violations caused by an intermediate update.
@ -352,7 +407,7 @@ func newTitlesOverlapExisting(rules []ngmodels.UpdateRule) bool {
// CountInFolder is a handler for retrieving the number of alert rules of
// specific organisation associated with a given namespace (parent folder).
func (st DBstore) CountInFolders(ctx context.Context, orgID int64, folderUIDs []string, u identity.Requester) (int64, error) {
func (st DBstore) CountInFolders(ctx context.Context, orgID int64, folderUIDs []string, _ identity.Requester) (int64, error) {
	if len(folderUIDs) == 0 {
		return 0, nil
	}

@ -1472,3 +1472,132 @@ func setupFolderService(t *testing.T, sqlStore db.DB, cfg *setting.Cfg, features
	return testutil.SetupFolderService(t, cfg, sqlStore, dashboardStore, folderStore, inProcBus, features, &actest.FakeAccessControl{ExpectedEvaluate: true})
}
func TestIntegration_AlertRuleVersionsCleanup(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test")
	}
	cfg := setting.NewCfg()
	cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{
		BaseInterval: time.Duration(rand.Int63n(100)+1) * time.Second,
	}
	sqlStore := db.InitTestDB(t)
	store := &DBstore{
		SQLStore:      sqlStore,
		Cfg:           cfg.UnifiedAlerting,
		FolderService: setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures()),
		Logger:        &logtest.Fake{},
	}
	generator := models.RuleGen
	generator = generator.With(generator.WithIntervalMatching(store.Cfg.BaseInterval), generator.WithUniqueOrgID())
t.Run("when calling the cleanup with fewer records than the limit all records should stay", func(t *testing.T) {
alertingCfgSnapshot := cfg.UnifiedAlerting
defer func() {
cfg.UnifiedAlerting = alertingCfgSnapshot
}()
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{BaseInterval: alertingCfgSnapshot.BaseInterval, RuleVersionRecordLimit: 10}
rule := createRule(t, store, generator)
firstNewRule := models.CopyRule(rule)
firstNewRule.Title = util.GenerateShortUID()
err := store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
Existing: rule,
New: *firstNewRule,
},
})
require.NoError(t, err)
firstNewRule.Version = firstNewRule.Version + 1
secondNewRule := models.CopyRule(firstNewRule)
secondNewRule.Title = util.GenerateShortUID()
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
Existing: firstNewRule,
New: *secondNewRule,
},
})
require.NoError(t, err)
titleMap := map[string]bool{
secondNewRule.Title: false,
rule.Title: false,
}
err = sqlStore.WithDbSession(context.Background(), func(sess *db.Session) error {
alertRuleVersions := make([]*alertRuleVersion, 0)
err := sess.Table(alertRuleVersion{}).Desc("id").Where("rule_org_id = ? and rule_uid = ?", rule.OrgID, rule.UID).Find(&alertRuleVersions)
if err != nil {
return err
}
require.NoError(t, err)
assert.Len(t, alertRuleVersions, 2)
for _, value := range alertRuleVersions {
assert.False(t, titleMap[value.Title])
titleMap[value.Title] = true
}
assert.Equal(t, true, titleMap[firstNewRule.Title])
assert.Equal(t, true, titleMap[secondNewRule.Title])
return err
})
require.NoError(t, err)
})
t.Run("only oldest records surpassing the limit should be deleted", func(t *testing.T) {
alertingCfgSnapshot := cfg.UnifiedAlerting
defer func() {
cfg.UnifiedAlerting = alertingCfgSnapshot
}()
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{BaseInterval: alertingCfgSnapshot.BaseInterval, RuleVersionRecordLimit: 1}
rule := createRule(t, store, generator)
oldRule := models.CopyRule(rule)
oldRule.Title = "old-record"
err := store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
Existing: rule,
New: *oldRule,
}}) // first entry in `rule_version_history` table happens here
require.NoError(t, err)
rule.Version = rule.Version + 1
middleRule := models.CopyRule(rule)
middleRule.Title = "middle-record"
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
Existing: rule,
New: *middleRule,
}}) //second entry in `rule_version_history` table happens here
require.NoError(t, err)
rule.Version = rule.Version + 1
newerRule := models.CopyRule(rule)
newerRule.Title = "newer-record"
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
Existing: rule,
New: *newerRule,
}}) //second entry in `rule_version_history` table happens here
require.NoError(t, err)
// only the `old-record` should be deleted since limit is set to 1 and there are total 2 records
rowsAffected, err := store.deleteOldAlertRuleVersions(context.Background(), rule.UID, rule.OrgID, 1)
require.NoError(t, err)
require.Equal(t, int64(2), rowsAffected)
err = sqlStore.WithDbSession(context.Background(), func(sess *db.Session) error {
var alertRuleVersions []*alertRuleVersion
err := sess.Table(alertRuleVersion{}).Desc("id").Where("rule_org_id = ? and rule_uid = ?", rule.OrgID, rule.UID).Find(&alertRuleVersions)
if err != nil {
return err
}
require.NoError(t, err)
assert.Len(t, alertRuleVersions, 1)
assert.Equal(t, "newer-record", alertRuleVersions[0].Title)
return err
})
require.NoError(t, err)
})
t.Run("limit set to 0 should not fail", func(t *testing.T) {
count, err := store.deleteOldAlertRuleVersions(context.Background(), "", 1, 0)
require.NoError(t, err)
require.Equal(t, int64(0), count)
})
t.Run("limit set to negative should fail", func(t *testing.T) {
_, err := store.deleteOldAlertRuleVersions(context.Background(), "", 1, -1)
require.Error(t, err)
})
}

@ -121,6 +121,11 @@ type UnifiedAlertingSettings struct {
	// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
	ResolvedAlertRetention time.Duration
	// RuleVersionRecordLimit defines how many alert rule versions, including the current one,
	// should be stored in the database for each alert rule in an organization.
	// A value of 0 means no limit.
	RuleVersionRecordLimit int
}
type RecordingRuleSettings struct {
@ -455,6 +460,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
		return err
	}

	uaCfg.RuleVersionRecordLimit = ua.Key("rule_version_record_limit").MustInt(0)
	if uaCfg.RuleVersionRecordLimit < 0 {
		return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer is allowed")
	}

	cfg.UnifiedAlerting = uaCfg
	return nil
}
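To illustrate how the new key behaves when parsed, here is a minimal sketch assuming gopkg.in/ini.v1 (the library Grafana's setting package is built on); readRuleVersionRecordLimit is a hypothetical helper that mirrors only the MustInt(0) default and the negative-value check shown above, not code from this commit:

// Sketch only: parsing and validation behaviour of rule_version_record_limit.
// (requires: import "fmt" and "gopkg.in/ini.v1")
func readRuleVersionRecordLimit(raw []byte) (int, error) {
	f, err := ini.Load(raw)
	if err != nil {
		return 0, err
	}
	// A missing key falls back to 0, i.e. "keep every version".
	limit := f.Section("unified_alerting").Key("rule_version_record_limit").MustInt(0)
	if limit < 0 {
		return 0, fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer is allowed")
	}
	return limit, nil
}

// Example: readRuleVersionRecordLimit([]byte("[unified_alerting]\nrule_version_record_limit = 5\n"))
// returns (5, nil); a negative value returns an error, matching ReadUnifiedAlertingSettings.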
