Alerting: Change default for max_attempts to 3. (#97461)

Currently the default is 1, which means that by default users will see transient
query errors reflected as alert evaluation failures, even though an immediate
retry is often sufficient to evaluate the rule successfully.

Enabling retries by default leads to a better experience out of the box.
pull/97517/head
Steve Simpson 7 months ago committed by GitHub
parent 6a1685ab5e
commit c440bd2bda
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 4
      conf/defaults.ini
  2. 4
      conf/sample.ini
  3. 2
      pkg/setting/setting_unified_alerting.go
  4. 8
      pkg/setting/setting_unified_alerting_test.go

@@ -1338,8 +1338,8 @@ execute_alerts = true
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
evaluation_timeout = 30s
-# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1.
-max_attempts = 1
+# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 3.
+max_attempts = 3
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

@@ -1322,8 +1322,8 @@
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;evaluation_timeout = 30s
-# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1.
-;max_attempts = 1
+# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 3.
+;max_attempts = 3
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

@@ -49,7 +49,7 @@ const (
evaluatorDefaultEvaluationTimeout = 30 * time.Second
schedulerDefaultAdminConfigPollInterval = time.Minute
schedulerDefaultExecuteAlerts = true
-schedulerDefaultMaxAttempts = 1
+schedulerDefaultMaxAttempts = 3
schedulerDefaultLegacyMinInterval = 1
screenshotsDefaultCapture = false
screenshotsDefaultCaptureTimeout = 10 * time.Second

@@ -120,14 +120,14 @@ func TestUnifiedAlertingSettings(t *testing.T) {
"evaluation_timeout": evaluatorDefaultEvaluationTimeout.String(),
},
alertingOptions: map[string]string{
-"max_attempts": "1",
+"max_attempts": "1", // Note: Ignored, setting does not exist.
"min_interval_seconds": "120",
"execute_alerts": "true",
"evaluation_timeout_seconds": "160",
},
verifyCfg: func(t *testing.T, cfg Cfg) {
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval)
-require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts)
+require.Equal(t, int64(3), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
require.Equal(t, true, cfg.UnifiedAlerting.ExecuteAlerts)
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
@@ -168,14 +168,14 @@ func TestUnifiedAlertingSettings(t *testing.T) {
"evaluation_timeout": "invalid",
},
alertingOptions: map[string]string{
-"max_attempts": "1",
+"max_attempts": "1", // Note: Ignored, setting does not exist.
"min_interval_seconds": "120",
"execute_alerts": "false",
"evaluation_timeout_seconds": "160",
},
verifyCfg: func(t *testing.T, cfg Cfg) {
require.Equal(t, alertmanagerDefaultConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval)
-require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts)
+require.Equal(t, int64(3), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts)
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)

Loading…
Cancel
Save