From 8bc001484596703d2282264d2dc1b003c7ffbcb3 Mon Sep 17 00:00:00 2001 From: Fayzal Ghantiwala <114010985+fayzal-g@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:46:50 +0000 Subject: [PATCH] [v11.3.x] Alerting: Make context deadline on AlertNG service startup configurable (#96135) Alerting: Make context deadline on AlertNG service startup configurable (#96053) * Make alerting context deadline configurable * Remove debug logs * Change default timeout * Update tests (cherry picked from commit 1fdc48fabafe8b8480a58fb4a169d01ebb535fb2) --- conf/defaults.ini | 3 +++ conf/sample.ini | 3 +++ .../cloudmigrationimpl/cloudmigration_test.go | 1 + pkg/services/ngalert/ngalert.go | 2 +- pkg/services/ngalert/tests/util.go | 3 ++- pkg/services/quota/quotaimpl/quota_test.go | 1 + pkg/setting/setting_unified_alerting.go | 7 +++++++ pkg/setting/setting_unified_alerting_test.go | 4 ++++ 8 files changed, 22 insertions(+), 2 deletions(-) diff --git a/conf/defaults.ini b/conf/defaults.ini index 662e300834f..970a24abc39 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -1209,6 +1209,9 @@ enabled = # Comma-separated list of organization IDs for which to disable unified alerting. Only supported if unified alerting is enabled. disabled_orgs = +# Specify how long to wait for the alerting service to initialize +initialization_timeout = 30s + # Specify the frequency of polling for admin config changes. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. admin_config_poll_interval = 60s diff --git a/conf/sample.ini b/conf/sample.ini index 366b04d884a..b207a7829aa 100644 --- a/conf/sample.ini +++ b/conf/sample.ini @@ -1197,6 +1197,9 @@ # Comma-separated list of organization IDs for which to disable unified alerting. Only supported if unified alerting is enabled. ;disabled_orgs = +# Specify how long to wait for the alerting service to initialize +;initialization_timeout = 30s + # Specify the frequency of polling for admin config changes. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. ;admin_config_poll_interval = 60s diff --git a/pkg/services/cloudmigration/cloudmigrationimpl/cloudmigration_test.go b/pkg/services/cloudmigration/cloudmigrationimpl/cloudmigration_test.go index 74944e407fe..17a7ae1c2ff 100644 --- a/pkg/services/cloudmigration/cloudmigrationimpl/cloudmigration_test.go +++ b/pkg/services/cloudmigration/cloudmigrationimpl/cloudmigration_test.go @@ -782,6 +782,7 @@ func setUpServiceTest(t *testing.T, withDashboardMock bool) cloudmigration.Servi fakeAccessControlService := actest.FakeService{} alertMetrics := metrics.NewNGAlert(prometheus.NewRegistry()) + cfg.UnifiedAlerting.InitializationTimeout = 30 * time.Second ruleStore, err := ngalertstore.ProvideDBStore(cfg, featureToggles, sqlStore, mockFolder, dashboardService, fakeAccessControl) require.NoError(t, err) diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index ab2c1f4d9b7..e163206e3fb 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -161,7 +161,7 @@ type AlertNG struct { func (ng *AlertNG) init() error { // AlertNG should be initialized before the cancellation deadline of initCtx - initCtx, cancelFunc := context.WithTimeout(context.Background(), 30*time.Second) + initCtx, cancelFunc := context.WithTimeout(context.Background(), ng.Cfg.UnifiedAlerting.InitializationTimeout) defer cancelFunc() ng.store.Logger = ng.Log diff --git a/pkg/services/ngalert/tests/util.go b/pkg/services/ngalert/tests/util.go index ae5664adeec..1dfa2819bec 100644 --- a/pkg/services/ngalert/tests/util.go +++ b/pkg/services/ngalert/tests/util.go @@ -47,7 +47,8 @@ func SetupTestEnv(tb testing.TB, baseInterval time.Duration) (*ngalert.AlertNG, cfg := setting.NewCfg() cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{ - BaseInterval: setting.SchedulerBaseInterval, + BaseInterval: setting.SchedulerBaseInterval, + InitializationTimeout: 30 * time.Second, } // AlertNG database migrations run and the relative database tables are created only when it's enabled cfg.UnifiedAlerting.Enabled = new(bool) diff --git a/pkg/services/quota/quotaimpl/quota_test.go b/pkg/services/quota/quotaimpl/quota_test.go index 4422ed67ebe..dedb5cdb072 100644 --- a/pkg/services/quota/quotaimpl/quota_test.go +++ b/pkg/services/quota/quotaimpl/quota_test.go @@ -498,6 +498,7 @@ func setupEnv(t *testing.T, sqlStore db.DB, cfg *setting.Cfg, b bus.Bus, quotaSe ac := acimpl.ProvideAccessControl(featuremgmt.WithFeatures(), zanzana.NewNoopClient()) ruleStore, err := ngstore.ProvideDBStore(cfg, featuremgmt.WithFeatures(), sqlStore, &foldertest.FakeService{}, &dashboards.FakeDashboardService{}, ac) require.NoError(t, err) + cfg.UnifiedAlerting.InitializationTimeout = 30 * time.Second _, err = ngalert.ProvideService( cfg, featuremgmt.WithFeatures(), nil, nil, routing.NewRouteRegister(), sqlStore, ngalertfakes.NewFakeKVStore(t), nil, nil, quotaService, secretsService, nil, m, &foldertest.FakeService{}, &acmock.Mock{}, &dashboards.FakeDashboardService{}, nil, b, &acmock.Mock{}, diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go index 8fe863247b7..a39636533da 100644 --- a/pkg/setting/setting_unified_alerting.go +++ b/pkg/setting/setting_unified_alerting.go @@ -45,6 +45,7 @@ const ( } } ` + alertingDefaultInitializationTimeout = 30 * time.Second evaluatorDefaultEvaluationTimeout = 30 * time.Second schedulerDefaultAdminConfigPollInterval = time.Minute schedulerDefaultExecuteAlerts = true @@ -90,6 +91,7 @@ type UnifiedAlertingSettings struct { HARedisMaxConns int HARedisTLSEnabled bool HARedisTLSConfig dstls.ClientConfig + InitializationTimeout time.Duration MaxAttempts int64 MinInterval time.Duration EvaluationTimeout time.Duration @@ -229,6 +231,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { uaCfg.DisabledOrgs[orgID] = struct{}{} } + uaCfg.InitializationTimeout, err = gtime.ParseDuration(valueAsString(ua, "initialization_timeout", (alertingDefaultInitializationTimeout).String())) + if err != nil { + return err + } + uaCfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (schedulerDefaultAdminConfigPollInterval).String())) if err != nil { return err diff --git a/pkg/setting/setting_unified_alerting_test.go b/pkg/setting/setting_unified_alerting_test.go index e60dedf8fb6..0b453b46e12 100644 --- a/pkg/setting/setting_unified_alerting_test.go +++ b/pkg/setting/setting_unified_alerting_test.go @@ -26,6 +26,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) { require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval) require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval) require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout) + require.Equal(t, alertingDefaultInitializationTimeout, cfg.UnifiedAlerting.InitializationTimeout) } // With peers set, it correctly parses them. @@ -35,10 +36,13 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) { require.NoError(t, err) _, err = s.NewKey("ha_peers", "hostname1:9090,hostname2:9090,hostname3:9090") require.NoError(t, err) + _, err = s.NewKey("initialization_timeout", "123s") + require.NoError(t, err) require.NoError(t, cfg.ReadUnifiedAlertingSettings(cfg.Raw)) require.Len(t, cfg.UnifiedAlerting.HAPeers, 3) require.ElementsMatch(t, []string{"hostname1:9090", "hostname2:9090", "hostname3:9090"}, cfg.UnifiedAlerting.HAPeers) + require.Equal(t, 123*time.Second, cfg.UnifiedAlerting.InitializationTimeout) } t.Run("should read 'scheduler_tick_interval'", func(t *testing.T) {