loki/pkg/querier/queryrange/queryrangebase/retry.go

package queryrangebase

import (
	"context"
	"reflect"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/grpcutil"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"

	"github.com/grafana/loki/v3/pkg/util"
	util_log "github.com/grafana/loki/v3/pkg/util/log"
)
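
// RetryMiddlewareMetrics holds the histogram tracking how many times each
// query-frontend request was retried.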
type RetryMiddlewareMetrics struct {
	retriesCount prometheus.Histogram
}
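
// NewRetryMiddlewareMetrics creates the retries histogram and registers it with
// the provided registerer (promauto skips registration when the registerer is nil).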
func NewRetryMiddlewareMetrics(registerer prometheus.Registerer, metricsNamespace string) *RetryMiddlewareMetrics {
	return &RetryMiddlewareMetrics{
		retriesCount: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Name:      "query_frontend_retries",
			Help:      "Number of times a request is retried.",
			Buckets:   []float64{0, 1, 2, 3, 4, 5},
		}),
	}
}
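
// retry wraps a Handler and re-issues failed requests up to maxRetries times.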
type retry struct {
	log        log.Logger
	next       Handler
	maxRetries int
	metrics    *RetryMiddlewareMetrics
}

// NewRetryMiddleware returns a middleware that retries requests if they
// fail with a 5xx status or a non-HTTP error.
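//
// A minimal usage sketch (hypothetical wiring; downstream is assumed to be an
// existing Handler, and Middleware exposes Wrap as elsewhere in this package):
//
//	mw := NewRetryMiddleware(logger, 5, nil, "loki")
//	handler := mw.Wrap(downstream)
//	resp, err := handler.Do(ctx, req)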
func NewRetryMiddleware(log log.Logger, maxRetries int, metrics *RetryMiddlewareMetrics, metricsNamespace string) Middleware {
	if metrics == nil {
		metrics = NewRetryMiddlewareMetrics(nil, metricsNamespace)
	}

	return MiddlewareFunc(func(next Handler) Handler {
		return retry{
			log:        log,
			next:       next,
			maxRetries: maxRetries,
			metrics:    metrics,
		}
	})
}
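
// Do sends the request to the wrapped handler and, while the response is a
// retryable error, backs off and tries again until maxRetries attempts have
// been made or the context is cancelled.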
func (r retry) Do(ctx context.Context, req Request) (Response, error) {
	tries := 0
	defer func() { r.metrics.retriesCount.Observe(float64(tries)) }()
	var lastErr error

	// For the default of 5 tries
	// try 0: no delay
	// try 1: 250ms wait
	// try 2: 500ms wait
	// try 3: 1s wait
	// try 4: 2s wait
	cfg := backoff.Config{
		MinBackoff: 250 * time.Millisecond,
		MaxBackoff: 5 * time.Second,
		MaxRetries: 0, // no cap from the backoff itself; the loop below is bounded by r.maxRetries
	}
	bk := backoff.New(ctx, cfg)

	start := req.GetStart()
	end := req.GetEnd()
	query := req.GetQuery()
	for ; tries < r.maxRetries; tries++ {
		// Make sure the context isn't done before sending the request
		if ctx.Err() != nil {
			return nil, ctx.Err()
		}

		resp, err := r.next.Do(ctx, req)
		if err == nil {
			return resp, nil
		}

		// Make sure the context isn't done before retrying the request
		if ctx.Err() != nil {
			return nil, ctx.Err()
		}

		code := grpcutil.ErrorToStatusCode(err)
		// Error handling is tricky: there are many places where we wrap an error and set an HTTP-style
		// status code, but there are also places where we return an existing gRPC object, which uses
		// gRPC status codes. If the code is < 100 it's a gRPC status code; currently we retry all of
		// these, even codes.Canceled, because when our pools close connections they do so with a cancel,
		// and we want to retry those. If the code is >= 100 it's an HTTP code, and we only retry 5xx.
		if code < 100 || code/100 == 5 {
			lastErr = err
			level.Error(util_log.WithContext(ctx, r.log)).Log(
				"msg", "error processing request",
				"try", tries,
				"type", logImplementingType(req),
				"query", query,
				"query_hash", util.HashedQuery(query),
				"start", start.Format(time.RFC3339Nano),
				"end", end.Format(time.RFC3339Nano),
				"start_delta", time.Since(start),
				"end_delta", time.Since(end),
				"length", end.Sub(start),
				"retry_in", bk.NextDelay(),
				"code", code,
				"err", err,
			)
			bk.Wait()
			continue
		} else {
			level.Warn(util_log.WithContext(ctx, r.log)).Log("msg", "received an error but not a retryable code, this is possibly a bug.", "code", code, "err", err)
		}

		return nil, err
	}

	return nil, lastErr
}
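
// logImplementingType returns the concrete type name of the request, for logging.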
func logImplementingType(i Request) string {
	if i == nil {
		return "nil"
	}

	t := reflect.TypeOf(i)

	// Check if it's a pointer and get the underlying type if so
	if t.Kind() == reflect.Ptr {
		t = t.Elem()
	}

	return t.String()
}