From faef3a825882a94fbf3f3a30a829176271fce8cf Mon Sep 17 00:00:00 2001 From: Alexander Weaver Date: Mon, 13 Mar 2023 15:54:46 -0500 Subject: [PATCH] Alerting: Log error but don't fail initialization if state history connection test fails (#64699) Don't return init error if ping fails, add tests --- pkg/services/ngalert/ngalert.go | 6 +-- pkg/services/ngalert/ngalert_test.go | 80 ++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index 75fd61edfff..dca37115d89 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -210,7 +210,7 @@ func (ng *AlertNG) init() error { Tracer: ng.tracer, } - history, err := configureHistorianBackend(initCtx, ng.Cfg.UnifiedAlerting.StateHistory, ng.annotationsRepo, ng.dashboardService, ng.store, ng.Metrics.GetHistorianMetrics()) + history, err := configureHistorianBackend(initCtx, ng.Cfg.UnifiedAlerting.StateHistory, ng.annotationsRepo, ng.dashboardService, ng.store, ng.Metrics.GetHistorianMetrics(), ng.Log) if err != nil { return err } @@ -389,7 +389,7 @@ type Historian interface { state.Historian } -func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingStateHistorySettings, ar annotations.Repository, ds dashboards.DashboardService, rs historian.RuleStore, met *metrics.Historian) (Historian, error) { +func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingStateHistorySettings, ar annotations.Repository, ds dashboards.DashboardService, rs historian.RuleStore, met *metrics.Historian, l log.Logger) (Historian, error) { if !cfg.Enabled { met.Info.WithLabelValues("noop").Set(0) return historian.NewNopHistorian(), nil @@ -410,7 +410,7 @@ func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingS testConnCtx, cancelFunc := context.WithTimeout(ctx, 10*time.Second) defer cancelFunc() if err := backend.TestConnection(testConnCtx); err != nil { - return nil, fmt.Errorf("failed to ping the remote loki historian: %w", err) + l.Error("Failed to communicate with configured remote Loki backend, state history may not be persisted", "error", err) } return backend, nil } diff --git a/pkg/services/ngalert/ngalert_test.go b/pkg/services/ngalert/ngalert_test.go index a8b7d6c6f39..547981418e8 100644 --- a/pkg/services/ngalert/ngalert_test.go +++ b/pkg/services/ngalert/ngalert_test.go @@ -1,11 +1,14 @@ package ngalert import ( + "bytes" "context" "math/rand" "testing" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" @@ -14,9 +17,11 @@ import ( "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/folder" + "github.com/grafana/grafana/pkg/services/ngalert/metrics" "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/schedule" "github.com/grafana/grafana/pkg/services/ngalert/tests/fakes" + "github.com/grafana/grafana/pkg/setting" "github.com/grafana/grafana/pkg/util" ) @@ -72,3 +77,78 @@ func Test_subscribeToFolderChanges(t *testing.T) { scheduler.AssertCalled(t, "UpdateAlertRule", rule.GetKey(), rule.Version, false) } } + +func TestConfigureHistorianBackend(t *testing.T) { + t.Run("fail initialization if invalid backend", func(t *testing.T) { + met := metrics.NewHistorianMetrics(prometheus.NewRegistry()) + logger := log.NewNopLogger() + cfg := setting.UnifiedAlertingStateHistorySettings{ + Enabled: true, + Backend: "invalid-backend", + } + + _, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger) + + require.ErrorContains(t, err, "unrecognized") + }) + + t.Run("do not fail initialization if pinging Loki fails", func(t *testing.T) { + met := metrics.NewHistorianMetrics(prometheus.NewRegistry()) + logger := log.NewNopLogger() + cfg := setting.UnifiedAlertingStateHistorySettings{ + Enabled: true, + Backend: "loki", + // Should never resolve at the DNS level: https://www.rfc-editor.org/rfc/rfc6761#section-6.4 + LokiReadURL: "http://gone.invalid", + LokiWriteURL: "http://gone.invalid", + } + + h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger) + + require.NotNil(t, h) + require.NoError(t, err) + }) + + t.Run("emit metric describing chosen backend", func(t *testing.T) { + reg := prometheus.NewRegistry() + met := metrics.NewHistorianMetrics(reg) + logger := log.NewNopLogger() + cfg := setting.UnifiedAlertingStateHistorySettings{ + Enabled: true, + Backend: "annotations", + } + + h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger) + + require.NotNil(t, h) + require.NoError(t, err) + exp := bytes.NewBufferString(` +# HELP grafana_alerting_state_history_info Information about the state history store. +# TYPE grafana_alerting_state_history_info gauge +grafana_alerting_state_history_info{backend="annotations"} 1 +`) + err = testutil.GatherAndCompare(reg, exp, "grafana_alerting_state_history_info") + require.NoError(t, err) + }) + + t.Run("emit special zero metric if state history disabled", func(t *testing.T) { + reg := prometheus.NewRegistry() + met := metrics.NewHistorianMetrics(reg) + logger := log.NewNopLogger() + cfg := setting.UnifiedAlertingStateHistorySettings{ + Enabled: false, + } + + h, err := configureHistorianBackend(context.Background(), cfg, nil, nil, nil, met, logger) + + require.NotNil(t, h) + require.NoError(t, err) + exp := bytes.NewBufferString(` +# HELP grafana_alerting_state_history_info Information about the state history store. +# TYPE grafana_alerting_state_history_info gauge +grafana_alerting_state_history_info{backend="noop"} 0 +`) + err = testutil.GatherAndCompare(reg, exp, "grafana_alerting_state_history_info") + require.NoError(t, err) + }) +}