Alerting: Use exponential backoff in the remote Alertmanager readiness check (#99756)

* Alerting: Use exponential backoff in the remote Alertmanager readiness check

* fix capitalized error

* remove unnecessary 'for'

* refactor, use time.After() instead of channel
pull/99775/head
Santiago 4 months ago committed by GitHub
parent 243ce03b93
commit 7d4895c3c9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 14
      pkg/services/ngalert/remote/alertmanager.go
  2. 104
      pkg/services/ngalert/remote/client/alertmanager.go

@ -232,19 +232,15 @@ func (am *Alertmanager) ApplyConfig(ctx context.Context, config *models.AlertCon
}
func (am *Alertmanager) checkReadiness(ctx context.Context) error {
ready, err := am.amClient.IsReadyWithBackoff(ctx)
err := am.amClient.IsReadyWithBackoff(ctx)
if err != nil {
return err
}
if ready {
am.log.Debug("Alertmanager readiness check successful")
am.metrics.LastReadinessCheck.SetToCurrentTime()
am.ready = true
return nil
}
return notifier.ErrAlertmanagerNotReady
am.log.Debug("Alertmanager readiness check successful")
am.metrics.LastReadinessCheck.SetToCurrentTime()
am.ready = true
return nil
}
// CompareAndSendConfiguration checks whether a given configuration is being used by the remote Alertmanager.

@ -66,69 +66,65 @@ func (am *Alertmanager) GetAuthedClient() client.Requester {
}
// IsReadyWithBackoff executes a readiness check against the `/-/ready` Alertmanager endpoint.
// If it takes more than 10s to get a response back - we abort the check.
func (am *Alertmanager) IsReadyWithBackoff(ctx context.Context) (bool, error) {
ctx, cancel := context.WithCancel(ctx)
// It uses exponential backoff (100ms * 2^attempts) with a 10s timeout.
func (am *Alertmanager) IsReadyWithBackoff(ctx context.Context) error {
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
readyURL := am.url.JoinPath(alertmanagerAPIMountPath, alertmanagerReadyPath)
attempt := func() (int, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, readyURL.String(), nil)
if err != nil {
return 0, fmt.Errorf("error creating the readiness request: %w", err)
}
var wait time.Duration
for attempt := 1; ; attempt++ {
select {
case <-ctx.Done():
return fmt.Errorf("readiness check timed out")
case <-time.After(wait):
wait = time.Duration(100<<attempt-1) * time.Millisecond
status, err := am.checkReadiness(ctx)
if err != nil {
am.logger.Debug("Readiness check attempt failed", "attempt", attempt, "err", err)
break
}
res, err := am.httpClient.Do(req)
if err != nil {
return 0, fmt.Errorf("error performing the readiness check: %w", err)
}
if status == http.StatusOK {
return nil
}
defer func() {
if err := res.Body.Close(); err != nil {
am.logger.Warn("Error closing response body", "err", err)
if status == http.StatusNotAcceptable {
// Mimir returns a 406 when the Alertmanager for the tenant is not running.
// This is expected if the Grafana Alertmanager configuration is default or not promoted.
// We can still use the endpoints to store and retrieve configuration/state.
am.logger.Debug("Remote Alertmanager not initialized for tenant")
return nil
}
}()
return res.StatusCode, nil
if status >= 400 && status < 500 {
return fmt.Errorf("readiness check failed on attempt %d with non-retriable status code %d", attempt, status)
}
am.logger.Debug("Readiness check failed, status code is not 200", "attempt", attempt, "status", status, "err", err)
}
}
}
var attempts int
ticker := time.NewTicker(100 * time.Millisecond)
deadlineCh := time.After(10 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
attempts++
status, err := attempt()
if err != nil {
am.logger.Debug("Ready check attempt failed", "attempt", attempts, "err", err)
continue
}
func (am *Alertmanager) checkReadiness(ctx context.Context) (int, error) {
req, err := http.NewRequestWithContext(
ctx,
http.MethodGet,
am.url.JoinPath(alertmanagerAPIMountPath, alertmanagerReadyPath).String(),
nil,
)
if err != nil {
return 0, fmt.Errorf("error creating the readiness request: %w", err)
}
if status != http.StatusOK {
if status >= 400 && status < 500 {
if status == http.StatusNotAcceptable {
// Mimir returns a 406 when the Alertmanager for the tenant is not running.
// This is expected if the Grafana Alertmanager configuration is default or not promoted.
// We can still use the endpoints to store and retrieve configuration/state.
am.logger.Debug("Remote Alertmanager not initialized for tenant", "attempt", attempts, "status", status)
return true, nil
}
am.logger.Debug("Ready check failed with non-retriable status code", "attempt", attempts, "status", status)
return false, fmt.Errorf("ready check failed with non-retriable status code %d", status)
}
am.logger.Debug("Ready check failed, status code is not 200", "attempt", attempts, "status", status, "err", err)
continue
}
res, err := am.httpClient.Do(req)
if err != nil {
return 0, fmt.Errorf("error performing the readiness check: %w", err)
}
return true, nil
case <-deadlineCh:
cancel()
return false, fmt.Errorf("ready check timed out after %d attempts", attempts)
defer func() {
if err := res.Body.Close(); err != nil {
am.logger.Warn("Error closing response body", "err", err)
}
}
}()
return res.StatusCode, nil
}

Loading…
Cancel
Save