loki/pkg/querier/queryrange/queryrangebase/retry.go

package queryrangebase

import (
	"context"
	"reflect"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/grpcutil"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"

	"github.com/grafana/loki/v3/pkg/util"
	util_log "github.com/grafana/loki/v3/pkg/util/log"
)
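
// RetryMiddlewareMetrics holds the histogram tracking how many times each
// query-frontend request was retried.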
type RetryMiddlewareMetrics struct {
	retriesCount prometheus.Histogram
}
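
// NewRetryMiddlewareMetrics creates the retries histogram and registers it with
// the provided registerer (promauto skips registration when the registerer is nil).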
func NewRetryMiddlewareMetrics(registerer prometheus.Registerer, metricsNamespace string) *RetryMiddlewareMetrics {
	return &RetryMiddlewareMetrics{
		retriesCount: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
			Namespace: metricsNamespace,
			Name:      "query_frontend_retries",
			Help:      "Number of times a request is retried.",
			Buckets:   []float64{0, 1, 2, 3, 4, 5},
		}),
	}
}
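
// retry wraps a Handler and re-issues failed requests up to maxRetries times.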
type retry struct {
	log        log.Logger
	next       Handler
	maxRetries int
	metrics    *RetryMiddlewareMetrics
}

// NewRetryMiddleware returns a middleware that retries requests if they
// fail with a 5xx status or a non-HTTP error.
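//
// A minimal usage sketch (hypothetical wiring; downstream is assumed to be an
// existing Handler, and Middleware exposes Wrap as elsewhere in this package):
//
//	mw := NewRetryMiddleware(logger, 5, nil, "loki")
//	handler := mw.Wrap(downstream)
//	resp, err := handler.Do(ctx, req)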
func NewRetryMiddleware(log log.Logger, maxRetries int, metrics *RetryMiddlewareMetrics, metricsNamespace string) Middleware {
	if metrics == nil {
		metrics = NewRetryMiddlewareMetrics(nil, metricsNamespace)
	}

	return MiddlewareFunc(func(next Handler) Handler {
		return retry{
			log:        log,
			next:       next,
			maxRetries: maxRetries,
			metrics:    metrics,
		}
	})
}
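
// Do sends the request to the wrapped handler and, while the response is a
// retryable error, backs off and tries again until maxRetries attempts have
// been made or the context is cancelled.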
func (r retry) Do(ctx context.Context, req Request) (Response, error) {
	tries := 0
	defer func() { r.metrics.retriesCount.Observe(float64(tries)) }()
	var lastErr error

	// For the default of 5 tries
	// try 0: no delay
	// try 1: 250ms wait
	// try 2: 500ms wait
	// try 3: 1s wait
	// try 4: 2s wait
	cfg := backoff.Config{
		MinBackoff: 250 * time.Millisecond,
		MaxBackoff: 5 * time.Second,
		MaxRetries: 0, // no cap from the backoff itself; the loop below is bounded by r.maxRetries
	}
	bk := backoff.New(ctx, cfg)

	start := req.GetStart()
	end := req.GetEnd()
	query := req.GetQuery()
	for ; tries < r.maxRetries; tries++ {
		// Make sure the context isn't done before sending the request
		if ctx.Err() != nil {
			return nil, ctx.Err()
		}

		resp, err := r.next.Do(ctx, req)
		if err == nil {
			return resp, nil
		}

		// Make sure the context isn't done before retrying the request
		if ctx.Err() != nil {
			return nil, ctx.Err()
		}

		code := grpcutil.ErrorToStatusCode(err)
		// Error handling is tricky: there are many places where we wrap an error and set an HTTP-style
		// status code, but there are also places where we return an existing gRPC object, which uses
		// gRPC status codes. If the code is < 100 it's a gRPC status code; currently we retry all of
		// these, even codes.Canceled, because when our pools close connections they do so with a cancel,
		// and we want to retry those. If the code is >= 100 it's an HTTP code, and we only retry 5xx.
		if code < 100 || code/100 == 5 {
			lastErr = err
			level.Error(util_log.WithContext(ctx, r.log)).Log(
				"msg", "error processing request",
				"try", tries,
				"type", logImplementingType(req),
				"query", query,
				"query_hash", util.HashedQuery(query),
				"start", start.Format(time.RFC3339Nano),
				"end", end.Format(time.RFC3339Nano),
				"start_delta", time.Since(start),
				"end_delta", time.Since(end),
				"length", end.Sub(start),
				"retry_in", bk.NextDelay(),
				"code", code,
				"err", err,
			)
			bk.Wait()
			continue
		} else {
			level.Warn(util_log.WithContext(ctx, r.log)).Log("msg", "received an error but not a retryable code, this is possibly a bug.", "code", code, "err", err)
		}

		return nil, err
	}

	return nil, lastErr
}
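
// logImplementingType returns the concrete type name of the request, for logging.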
func logImplementingType(i Request) string {
	if i == nil {
		return "nil"
	}

	t := reflect.TypeOf(i)

	// Check if it's a pointer and get the underlying type if so
	if t.Kind() == reflect.Ptr {
		t = t.Elem()
	}

	return t.String()
}