From d0c178499cb66829a1c0e615fe63d49818c5bff3 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 5 Dec 2023 20:14:01 +0000 Subject: [PATCH] [v9.4.x] Unified Alerting: Set `max_attempts` to 1 by default (#79108) Unified Alerting: Set `max_attempts` to 1 by default (#79095) * Unified Alerting: Set `max_attempts` to 1 by default The retry logic for unified alerting has been broken as far back as v9.4.x. Rather than fixing it in one go and causing a headache for our users with rules putting extra load on their datasources, I think a better approach is to simply set 1 as the default and then let our users change it. I see two cons with this approach: - Configuration for legacy to unified alerting cannot be ported over automatically; users will have to manually set `max_attempts` to 3 when migrating. - Users expecting to get any sort of retrying (as with legacy alerting) will not have it out of the box and will have to manually edit the configuration. Signed-off-by: gotjosh --------- Signed-off-by: gotjosh (cherry picked from commit 0c9356a3c78b83b8bb5abf99263053b10de84d48) --- conf/defaults.ini | 4 ++-- conf/sample.ini | 4 ++-- .../setup-grafana/configure-grafana/_index.md | 2 +- pkg/setting/setting_unified_alerting.go | 12 ++---------- pkg/setting/setting_unified_alerting_test.go | 9 ++++----- 5 files changed, 11 insertions(+), 20 deletions(-) diff --git a/conf/defaults.ini b/conf/defaults.ini index c7876f60952..a2427ae8f82 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -957,8 +957,8 @@ execute_alerts = true # The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. evaluation_timeout = 30s -# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence. -max_attempts = 3 +# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1. 
+max_attempts = 1 # Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. diff --git a/conf/sample.ini b/conf/sample.ini index c3e8f5f4beb..5c7b331f5c2 100644 --- a/conf/sample.ini +++ b/conf/sample.ini @@ -943,8 +943,8 @@ # The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. ;evaluation_timeout = 30s -# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence. -;max_attempts = 3 +# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1. +;max_attempts = 1 # Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. 
diff --git a/docs/sources/setup-grafana/configure-grafana/_index.md b/docs/sources/setup-grafana/configure-grafana/_index.md index 4a19f3993b0..f9a0ce913b2 100644 --- a/docs/sources/setup-grafana/configure-grafana/_index.md +++ b/docs/sources/setup-grafana/configure-grafana/_index.md @@ -1440,7 +1440,7 @@ The timeout string is a possibly signed sequence of decimal numbers, followed by ### max_attempts -Sets a maximum number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is `3`. This option has a [legacy version in the alerting section]({{< relref "#max_attempts-1">}}) that takes precedence. +Sets a maximum number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is `1`. ### min_interval diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go index bf80548abbf..45df6ac956a 100644 --- a/pkg/setting/setting_unified_alerting.go +++ b/pkg/setting/setting_unified_alerting.go @@ -46,7 +46,7 @@ const ( evaluatorDefaultEvaluationTimeout = 30 * time.Second schedulerDefaultAdminConfigPollInterval = time.Minute schedulereDefaultExecuteAlerts = true - schedulerDefaultMaxAttempts = 3 + schedulerDefaultMaxAttempts = 1 schedulerDefaultLegacyMinInterval = 1 screenshotsDefaultCapture = false screenshotsDefaultCaptureTimeout = 10 * time.Second @@ -259,15 +259,7 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { } uaCfg.EvaluationTimeout = uaEvaluationTimeout - uaMaxAttempts := ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts) - if uaMaxAttempts == schedulerDefaultMaxAttempts { // unified option or equals the default - legacyMaxAttempts := alerting.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts) - if legacyMaxAttempts != schedulerDefaultMaxAttempts { - cfg.Logger.Warn("falling back to legacy setting of 'max_attempts'; please use the configuration option in the `unified_alerting` section if 
Grafana 8 alerts are enabled.") - } - uaMaxAttempts = legacyMaxAttempts - } - uaCfg.MaxAttempts = uaMaxAttempts + uaCfg.MaxAttempts = ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts) uaCfg.BaseInterval = SchedulerBaseInterval diff --git a/pkg/setting/setting_unified_alerting_test.go b/pkg/setting/setting_unified_alerting_test.go index 92fa07648ad..558cd277215 100644 --- a/pkg/setting/setting_unified_alerting_test.go +++ b/pkg/setting/setting_unified_alerting_test.go @@ -77,20 +77,19 @@ func TestUnifiedAlertingSettings(t *testing.T) { desc: "when the unified options equal the defaults, it should apply the legacy ones", unifiedAlertingOptions: map[string]string{ "admin_config_poll_interval": "120s", - "max_attempts": strconv.FormatInt(schedulerDefaultMaxAttempts, 10), "min_interval": SchedulerBaseInterval.String(), "execute_alerts": strconv.FormatBool(schedulereDefaultExecuteAlerts), "evaluation_timeout": evaluatorDefaultEvaluationTimeout.String(), }, alertingOptions: map[string]string{ - "max_attempts": "12", + "max_attempts": "1", "min_interval_seconds": "120", "execute_alerts": "true", "evaluation_timeout_seconds": "160", }, verifyCfg: func(t *testing.T, cfg Cfg) { require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval) - require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts) + require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts) require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval) require.Equal(t, true, cfg.UnifiedAlerting.ExecuteAlerts) require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout) @@ -131,14 +130,14 @@ func TestUnifiedAlertingSettings(t *testing.T) { "evaluation_timeout": "invalid", }, alertingOptions: map[string]string{ - "max_attempts": "12", + "max_attempts": "1", "min_interval_seconds": "120", "execute_alerts": "false", "evaluation_timeout_seconds": "160", }, verifyCfg: func(t *testing.T, cfg Cfg) { require.Equal(t, alertmanagerDefaultConfigPollInterval, 
cfg.UnifiedAlerting.AdminConfigPollInterval) - require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts) + require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts) require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval) require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts) require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)