[v9.4.x] Unified Alerting: Set to 1 by default (#79108)

Unified Alerting: Set `max_attempts` to 1 by default (#79095)

* Unified Alerting: Set `max_attempts` to 1 by default

The retry logic for unified alerting has been broken as far as v9.4.x, rather than fixing it in one go and causing a headache to our users with rules putting extra load on their datasources - I think a better approach is to simply set 1 as a default and then let our users change it.

I see two cons with this approach:

- Configuration for legacy to unified alerting cannot be ported over automatically, users will have to manually set `max_attempts` to 3 when migrating.
- Users expecting to get any sort of retrying (as with legacy alerting) will not have it out of the box and will have to manually edit the configuration.

Signed-off-by: gotjosh <josue.abreu@gmail.com>
---------

Signed-off-by: gotjosh <josue.abreu@gmail.com>
(cherry picked from commit 0c9356a3c7)
pull/79180/head
gotjosh 2 years ago committed by GitHub
parent 0cc6d80d52
commit d0c178499c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 4
      conf/defaults.ini
  2. 4
      conf/sample.ini
  3. 2
      docs/sources/setup-grafana/configure-grafana/_index.md
  4. 12
      pkg/setting/setting_unified_alerting.go
  5. 9
      pkg/setting/setting_unified_alerting_test.go

@ -957,8 +957,8 @@ execute_alerts = true
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
evaluation_timeout = 30s
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence.
max_attempts = 3
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1.
max_attempts = 1
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

@ -943,8 +943,8 @@
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;evaluation_timeout = 30s
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence.
;max_attempts = 3
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1.
;max_attempts = 1
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

@ -1440,7 +1440,7 @@ The timeout string is a possibly signed sequence of decimal numbers, followed by
### max_attempts
Sets a maximum number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is `3`. This option has a [legacy version in the alerting section]({{< relref "#max_attempts-1">}}) that takes precedence.
Sets a maximum number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is `1`.
### min_interval

@ -46,7 +46,7 @@ const (
evaluatorDefaultEvaluationTimeout = 30 * time.Second
schedulerDefaultAdminConfigPollInterval = time.Minute
schedulereDefaultExecuteAlerts = true
schedulerDefaultMaxAttempts = 3
schedulerDefaultMaxAttempts = 1
schedulerDefaultLegacyMinInterval = 1
screenshotsDefaultCapture = false
screenshotsDefaultCaptureTimeout = 10 * time.Second
@ -259,15 +259,7 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
}
uaCfg.EvaluationTimeout = uaEvaluationTimeout
uaMaxAttempts := ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
if uaMaxAttempts == schedulerDefaultMaxAttempts { // unified option or equals the default
legacyMaxAttempts := alerting.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
if legacyMaxAttempts != schedulerDefaultMaxAttempts {
cfg.Logger.Warn("falling back to legacy setting of 'max_attempts'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
}
uaMaxAttempts = legacyMaxAttempts
}
uaCfg.MaxAttempts = uaMaxAttempts
uaCfg.MaxAttempts = ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
uaCfg.BaseInterval = SchedulerBaseInterval

@ -77,20 +77,19 @@ func TestUnifiedAlertingSettings(t *testing.T) {
desc: "when the unified options equal the defaults, it should apply the legacy ones",
unifiedAlertingOptions: map[string]string{
"admin_config_poll_interval": "120s",
"max_attempts": strconv.FormatInt(schedulerDefaultMaxAttempts, 10),
"min_interval": SchedulerBaseInterval.String(),
"execute_alerts": strconv.FormatBool(schedulereDefaultExecuteAlerts),
"evaluation_timeout": evaluatorDefaultEvaluationTimeout.String(),
},
alertingOptions: map[string]string{
"max_attempts": "12",
"max_attempts": "1",
"min_interval_seconds": "120",
"execute_alerts": "true",
"evaluation_timeout_seconds": "160",
},
verifyCfg: func(t *testing.T, cfg Cfg) {
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval)
require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
require.Equal(t, true, cfg.UnifiedAlerting.ExecuteAlerts)
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
@ -131,14 +130,14 @@ func TestUnifiedAlertingSettings(t *testing.T) {
"evaluation_timeout": "invalid",
},
alertingOptions: map[string]string{
"max_attempts": "12",
"max_attempts": "1",
"min_interval_seconds": "120",
"execute_alerts": "false",
"evaluation_timeout_seconds": "160",
},
verifyCfg: func(t *testing.T, cfg Cfg) {
require.Equal(t, alertmanagerDefaultConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval)
require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts)
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)

Loading…
Cancel
Save