Adapt queryTimeout to be a per-tenant configuration (#6835)

What this PR does / why we need it:

* Adds a new per-tenant query timeout.
* Adds new middleware to query calls; this middleware times out requests based on the new per-tenant query timeout.
* Adds span logs to query calls that were missing them.
* Deprecates the `engine.timeout` configuration.

The motivation for making this configuration per-tenant instead of global is that it lets operators relax the timeout for particular tenants or make it stricter for others. This is especially useful when a Loki client doesn't handle context cancellations correctly: since Loki would still process expensive queries that were cancelled, per-tenant timeouts allow keeping short timeouts for most tenants, which mitigates unnecessary work, without forcing a timeout that is too short on important tenants.
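As an illustration only (a hedged sketch, not part of this PR's diff), the new limit could be set globally under `limits_config` and then overridden for individual tenants through the usual runtime overrides file; the tenant names and values below are hypothetical:

```yaml
# Global default applied to every tenant (flag: -querier.query-timeout)
limits_config:
  query_timeout: 1m

# Runtime overrides file (referenced by per_tenant_override_config) — hypothetical tenants
overrides:
  "important-tenant":
    query_timeout: 10m   # long-running queries are allowed to finish
  "misbehaving-tenant":
    query_timeout: 30s   # cancelled-but-still-running queries are cut off quickly
```

With something like this in place, both the wrapped query handlers and the engine derive their deadline from the tenant's own timeout rather than a single global `engine.timeout`.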
Dylan Guedes (committed via GitHub)
parent e15da437f0
commit 15f8f42295
  1. CHANGELOG.md (5 lines changed)
  2. docs/sources/configuration/_index.md (3 lines changed)
  3. docs/sources/upgrading/_index.md (6 lines changed)
  4. pkg/logcli/client/file.go (4 lines changed)
  5. pkg/logql/downstream.go (1 line changed)
  6. pkg/logql/engine.go (21 lines changed)
  7. pkg/logql/limits.go (7 lines changed)
  8. pkg/loki/modules.go (37 lines changed)
  9. pkg/querier/http.go (55 lines changed)
  10. pkg/querier/querier.go (18 lines changed)
  11. pkg/querier/querier_test.go (10 lines changed)
  12. pkg/querier/queryrange/querysharding_test.go (4 lines changed)
  13. pkg/querier/queryrange/roundtrip_test.go (5 lines changed)
  14. pkg/querier/queryrange/split_by_range_test.go (3 lines changed)
  15. pkg/validation/limits.go (7 lines changed)
  16. pkg/validation/limits_test.go (22 lines changed)

@ -19,9 +19,10 @@
* [6358](https://github.com/grafana/loki/pull/6358) **taharah**: Fixes sigv4 authentication for the Ruler's remote write configuration by allowing both a global and per tenant configuration.
* [6375](https://github.com/grafana/loki/pull/6375) **dannykopping**: Fix bug that prevented users from using the `json` parser after a `line_format` pipeline stage.
##### Changes
* [6726](https://github.com/grafana/loki/pull/6726) **kavirajk** upgrades go from 1.17.9 -> 1.18.4
* [6415](https://github.com/grafana/loki/pull/6415) **salvacorts** Evenly spread queriers across kubernetes nodes.
* [6726](https://github.com/grafana/loki/pull/6726) **kavirajk**: upgrades go from 1.17.9 -> 1.18.4
* [6415](https://github.com/grafana/loki/pull/6415) **salvacorts**: Evenly spread queriers across kubernetes nodes.
* [6349](https://github.com/grafana/loki/pull/6349) **simonswine**: Update the default HTTP listen port from 80 to 3100. Make sure to configure the port explicitly if you are using port 80.
* [6835](https://github.com/grafana/loki/pull/6835) **DylanGuedes**: Add new per-tenant query timeout configuration and remove engine query timeout.
#### Promtail

@ -351,7 +351,8 @@ The `querier` block configures the Loki Querier.
# Configuration options for the LogQL engine.
engine:
# Timeout for query execution
# Timeout for query execution.
# Deprecated: use querier.query-timeout instead.
# CLI flag: -querier.engine.timeout
[timeout: <duration> | default = 3m]

@ -33,6 +33,12 @@ The output is incredibly verbose as it shows the entire internal config struct u
### Loki
#### Engine query timeout is deprecated
Previously, we had two configurations to define a query timeout: `engine.timeout` and `querier.query-timeout`.
Since they conflict and `engine.timeout` isn't as expressive as `querier.query-timeout`,
we're deprecating it and relying on `querier.query-timeout` only.
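For example (a hedged migration sketch, not text from the upgrade guide itself), a configuration that previously tuned the engine timeout would move the value to the per-tenant limit:

```yaml
# Before: deprecated engine-level timeout
querier:
  engine:
    timeout: 3m

# After: per-tenant default, still overridable per tenant at runtime
limits_config:
  query_timeout: 3m
```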
#### Fifocache is deprecated
We introduced a new cache called `embedded-cache`, an in-process cache system that makes it possible to run Loki without an external cache (such as Memcached, Redis, etc). It can be run in two modes: `distributed: false` (the default, equivalent to the old `fifocache`) and `distributed: true`, which runs the cache in a distributed fashion, sharding keys across peers when Loki runs in microservices or SSD mode.

@ -190,6 +190,10 @@ func (l *limiter) MaxQuerySeries(userID string) int {
return l.n
}
func (l *limiter) QueryTimeout(userID string) time.Duration {
return time.Minute * 5
}
type querier struct {
r io.Reader
labels labels.Labels

@ -65,7 +65,6 @@ func (ng *DownstreamEngine) Opts() EngineOpts { return ng.opts }
func (ng *DownstreamEngine) Query(ctx context.Context, p Params, mapped syntax.Expr) Query {
return &query{
logger: ng.logger,
timeout: ng.opts.Timeout,
params: p,
evaluator: NewDownstreamEvaluator(ng.downstreamable.Downstreamer(ctx)),
parse: func(_ context.Context, _ string) (syntax.Expr, error) {

@ -96,22 +96,22 @@ type Querier interface {
// EngineOpts is the list of options to use with the LogQL query engine.
type EngineOpts struct {
// TODO: remove this after next release.
// Timeout for queries execution
Timeout time.Duration `yaml:"timeout"`
// MaxLookBackPeriod is the maximum amount of time to look back for log lines.
// only used for instant log queries.
MaxLookBackPeriod time.Duration `yaml:"max_look_back_period"`
}
func (opts *EngineOpts) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
f.DurationVar(&opts.Timeout, prefix+".engine.timeout", 5*time.Minute, "Timeout for query execution.")
// TODO: remove this configuration after next release.
f.DurationVar(&opts.Timeout, prefix+".engine.timeout", 5*time.Minute, "Timeout for query execution. Instead, rely only on querier.query-timeout. (deprecated)")
f.DurationVar(&opts.MaxLookBackPeriod, prefix+".engine.max-lookback-period", 30*time.Second, "The maximum amount of time to look back for log lines. Used only for instant log queries.")
}
func (opts *EngineOpts) applyDefault() {
if opts.Timeout == 0 {
opts.Timeout = 5 * time.Minute
}
if opts.MaxLookBackPeriod == 0 {
opts.MaxLookBackPeriod = 30 * time.Second
}
@ -120,7 +120,6 @@ func (opts *EngineOpts) applyDefault() {
// Engine is the LogQL engine.
type Engine struct {
logger log.Logger
timeout time.Duration
evaluator Evaluator
limits Limits
}
@ -133,7 +132,6 @@ func NewEngine(opts EngineOpts, q Querier, l Limits, logger log.Logger) *Engine
}
return &Engine{
logger: logger,
timeout: opts.Timeout,
evaluator: NewDefaultEvaluator(q, opts.MaxLookBackPeriod),
limits: l,
}
@ -143,7 +141,6 @@ func NewEngine(opts EngineOpts, q Querier, l Limits, logger log.Logger) *Engine
func (ng *Engine) Query(params Params) Query {
return &query{
logger: ng.logger,
timeout: ng.timeout,
params: params,
evaluator: ng.evaluator,
parse: func(_ context.Context, query string) (syntax.Expr, error) {
@ -162,7 +159,6 @@ type Query interface {
type query struct {
logger log.Logger
timeout time.Duration
params Params
parse func(context.Context, string) (syntax.Expr, error)
limits Limits
@ -226,7 +222,14 @@ func (q *query) Exec(ctx context.Context) (logqlmodel.Result, error) {
}
func (q *query) Eval(ctx context.Context) (promql_parser.Value, error) {
ctx, cancel := context.WithTimeout(ctx, q.timeout)
queryTimeout := time.Minute * 5
userID, err := tenant.TenantID(ctx)
if err != nil {
level.Warn(q.logger).Log("msg", fmt.Sprintf("couldn't fetch tenantID to evaluate query timeout, using default value of %s", queryTimeout), "err", err)
} else {
queryTimeout = q.limits.QueryTimeout(userID)
}
ctx, cancel := context.WithTimeout(ctx, queryTimeout)
defer cancel()
expr, err := q.parse(ctx, q.params.Query())

@ -2,6 +2,7 @@ package logql
import (
"math"
"time"
)
var (
@ -11,12 +12,18 @@ var (
// Limits allow the engine to fetch limits for a given users.
type Limits interface {
MaxQuerySeries(userID string) int
QueryTimeout(userID string) time.Duration
}
type fakeLimits struct {
maxSeries int
timeout time.Duration
}
func (f fakeLimits) MaxQuerySeries(userID string) int {
return f.maxSeries
}
func (f fakeLimits) QueryTimeout(userID string) time.Duration {
return f.timeout
}

@ -331,18 +331,31 @@ func (t *Loki) initQuerier() (services.Service, error) {
logger := log.With(util_log.Logger, "component", "querier")
t.querierAPI = querier.NewQuerierAPI(t.Cfg.Querier, t.Querier, t.overrides, logger)
queryHandlers := map[string]http.Handler{
"/loki/api/v1/query_range": httpMiddleware.Wrap(http.HandlerFunc(t.querierAPI.RangeQueryHandler)),
"/loki/api/v1/query": httpMiddleware.Wrap(http.HandlerFunc(t.querierAPI.InstantQueryHandler)),
"/loki/api/v1/label": http.HandlerFunc(t.querierAPI.LabelHandler),
"/loki/api/v1/labels": http.HandlerFunc(t.querierAPI.LabelHandler),
"/loki/api/v1/label/{name}/values": http.HandlerFunc(t.querierAPI.LabelHandler),
"/loki/api/v1/series": http.HandlerFunc(t.querierAPI.SeriesHandler),
"/loki/api/v1/index/stats": http.HandlerFunc(t.querierAPI.IndexStatsHandler),
"/api/prom/query": httpMiddleware.Wrap(http.HandlerFunc(t.querierAPI.LogQueryHandler)),
"/api/prom/label": http.HandlerFunc(t.querierAPI.LabelHandler),
"/api/prom/label/{name}/values": http.HandlerFunc(t.querierAPI.LabelHandler),
"/api/prom/series": http.HandlerFunc(t.querierAPI.SeriesHandler),
"/loki/api/v1/query_range": middleware.Merge(
httpMiddleware,
querier.WrapQuerySpanAndTimeout("query.RangeQuery", t.querierAPI),
).Wrap(http.HandlerFunc(t.querierAPI.RangeQueryHandler)),
"/loki/api/v1/query": middleware.Merge(
httpMiddleware,
querier.WrapQuerySpanAndTimeout("query.InstantQuery", t.querierAPI),
).Wrap(http.HandlerFunc(t.querierAPI.InstantQueryHandler)),
"/loki/api/v1/label": querier.WrapQuerySpanAndTimeout("query.Label", t.querierAPI).Wrap(http.HandlerFunc(t.querierAPI.LabelHandler)),
"/loki/api/v1/labels": querier.WrapQuerySpanAndTimeout("query.Label", t.querierAPI).Wrap(http.HandlerFunc(t.querierAPI.LabelHandler)),
"/loki/api/v1/label/{name}/values": querier.WrapQuerySpanAndTimeout("query.Label", t.querierAPI).Wrap(http.HandlerFunc(t.querierAPI.LabelHandler)),
"/loki/api/v1/series": querier.WrapQuerySpanAndTimeout("query.Series", t.querierAPI).Wrap(http.HandlerFunc(t.querierAPI.SeriesHandler)),
"/loki/api/v1/index/stats": querier.WrapQuerySpanAndTimeout("query.IndexStats", t.querierAPI).Wrap(http.HandlerFunc(t.querierAPI.IndexStatsHandler)),
"/api/prom/query": middleware.Merge(
httpMiddleware,
querier.WrapQuerySpanAndTimeout("query.LogQuery", t.querierAPI),
).Wrap(http.HandlerFunc(t.querierAPI.LogQueryHandler)),
"/api/prom/label": querier.WrapQuerySpanAndTimeout("query.Label", t.querierAPI).Wrap(http.HandlerFunc(t.querierAPI.LabelHandler)),
"/api/prom/label/{name}/values": querier.WrapQuerySpanAndTimeout("query.Label", t.querierAPI).Wrap(http.HandlerFunc(t.querierAPI.LabelHandler)),
"/api/prom/series": querier.WrapQuerySpanAndTimeout("query.Series", t.querierAPI).Wrap(http.HandlerFunc(t.querierAPI.SeriesHandler)),
}
// We always want to register tail routes externally, tail requests are different from normal queries, they

@ -13,6 +13,7 @@ import (
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/promql/parser"
"github.com/weaveworks/common/httpgrpc"
"github.com/weaveworks/common/middleware"
"github.com/grafana/dskit/tenant"
@ -64,16 +65,13 @@ func NewQuerierAPI(cfg Config, querier Querier, limits *validation.Overrides, lo
// RangeQueryHandler is a http.HandlerFunc for range queries.
func (q *QuerierAPI) RangeQueryHandler(w http.ResponseWriter, r *http.Request) {
// Enforce the query timeout while querying backends
ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.QueryTimeout))
defer cancel()
request, err := loghttp.ParseRangeQuery(r)
if err != nil {
serverutil.WriteError(httpgrpc.Errorf(http.StatusBadRequest, err.Error()), w)
return
}
ctx := r.Context()
if err := q.validateEntriesLimits(ctx, request.Query, request.Limit); err != nil {
serverutil.WriteError(err, w)
return
@ -103,16 +101,13 @@ func (q *QuerierAPI) RangeQueryHandler(w http.ResponseWriter, r *http.Request) {
// InstantQueryHandler is a http.HandlerFunc for instant queries.
func (q *QuerierAPI) InstantQueryHandler(w http.ResponseWriter, r *http.Request) {
// Enforce the query timeout while querying backends
ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.QueryTimeout))
defer cancel()
request, err := loghttp.ParseInstantQuery(r)
if err != nil {
serverutil.WriteError(httpgrpc.Errorf(http.StatusBadRequest, err.Error()), w)
return
}
ctx := r.Context()
if err := q.validateEntriesLimits(ctx, request.Query, request.Limit); err != nil {
serverutil.WriteError(err, w)
return
@ -143,10 +138,6 @@ func (q *QuerierAPI) InstantQueryHandler(w http.ResponseWriter, r *http.Request)
// LogQueryHandler is a http.HandlerFunc for log only queries.
func (q *QuerierAPI) LogQueryHandler(w http.ResponseWriter, r *http.Request) {
// Enforce the query timeout while querying backends
ctx, cancel := context.WithDeadline(r.Context(), time.Now().Add(q.cfg.QueryTimeout))
defer cancel()
request, err := loghttp.ParseRangeQuery(r)
if err != nil {
serverutil.WriteError(httpgrpc.Errorf(http.StatusBadRequest, err.Error()), w)
@ -170,6 +161,7 @@ func (q *QuerierAPI) LogQueryHandler(w http.ResponseWriter, r *http.Request) {
return
}
ctx := r.Context()
if err := q.validateEntriesLimits(ctx, request.Query, request.Limit); err != nil {
serverutil.WriteError(err, w)
return
@ -207,13 +199,11 @@ func (q *QuerierAPI) LabelHandler(w http.ResponseWriter, r *http.Request) {
return
}
log, ctx := spanlogger.New(r.Context(), "query.Label")
timer := prometheus.NewTimer(logql.QueryTime.WithLabelValues("labels"))
defer timer.ObserveDuration()
start := time.Now()
statsCtx, ctx := stats.NewContext(ctx)
statsCtx, ctx := stats.NewContext(r.Context())
resp, err := q.querier.Label(r.Context(), req)
queueTime, _ := ctx.Value(httpreq.QueryQueueTimeHTTPHeader).(time.Duration)
@ -224,6 +214,7 @@ func (q *QuerierAPI) LabelHandler(w http.ResponseWriter, r *http.Request) {
}
// record stats about the label query
statResult := statsCtx.Result(time.Since(start), queueTime, resLength)
log := spanlogger.FromContext(ctx)
statResult.Log(level.Debug(log))
status := 200
@ -382,13 +373,11 @@ func (q *QuerierAPI) SeriesHandler(w http.ResponseWriter, r *http.Request) {
return
}
log, ctx := spanlogger.New(r.Context(), "query.Series")
timer := prometheus.NewTimer(logql.QueryTime.WithLabelValues("series"))
defer timer.ObserveDuration()
start := time.Now()
statsCtx, ctx := stats.NewContext(ctx)
statsCtx, ctx := stats.NewContext(r.Context())
resp, err := q.querier.Series(r.Context(), req)
queueTime, _ := ctx.Value(httpreq.QueryQueueTimeHTTPHeader).(time.Duration)
@ -400,6 +389,7 @@ func (q *QuerierAPI) SeriesHandler(w http.ResponseWriter, r *http.Request) {
// record stats about the label query
statResult := statsCtx.Result(time.Since(start), queueTime, resLength)
log := spanlogger.FromContext(ctx)
statResult.Log(level.Debug(log))
status := 200
@ -422,17 +412,14 @@ func (q *QuerierAPI) SeriesHandler(w http.ResponseWriter, r *http.Request) {
// IndexStatsHandler queries the index for the data statistics related to a query
func (q *QuerierAPI) IndexStatsHandler(w http.ResponseWriter, r *http.Request) {
req, err := loghttp.ParseIndexStatsQuery(r)
if err != nil {
serverutil.WriteError(httpgrpc.Errorf(http.StatusBadRequest, err.Error()), w)
return
}
_, ctx := spanlogger.New(r.Context(), "query.IndexStats")
// TODO(owen-d): log metadata, record stats?
resp, err := q.querier.IndexStats(ctx, req)
resp, err := q.querier.IndexStats(r.Context(), req)
if resp == nil {
// Some stores don't implement this
resp = &index_stats.Stats{}
@ -492,3 +479,27 @@ func (q *QuerierAPI) validateEntriesLimits(ctx context.Context, query string, li
}
return nil
}
// WrapQuerySpanAndTimeout applies a context deadline and a span logger to a query call.
//
// The timeout is based on the per-tenant query timeout configuration.
func WrapQuerySpanAndTimeout(call string, q *QuerierAPI) middleware.Interface {
return middleware.Func(func(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
log, ctx := spanlogger.New(req.Context(), call)
userID, err := tenant.TenantID(ctx)
if err != nil {
level.Error(log).Log("msg", "couldn't fetch tenantID", "err", err)
serverutil.WriteError(httpgrpc.Errorf(http.StatusBadRequest, err.Error()), w)
return
}
// Enforce the query timeout while querying backends
queryTimeout := q.limits.QueryTimeout(userID)
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(queryTimeout))
defer cancel()
// Propagate the deadline (and the span) to the wrapped handler.
next.ServeHTTP(w, req.WithContext(ctx))
})
})
}

@ -44,7 +44,6 @@ type interval struct {
// Config for a querier.
type Config struct {
QueryTimeout time.Duration `yaml:"query_timeout"`
TailMaxDuration time.Duration `yaml:"tail_max_duration"`
ExtraQueryDelay time.Duration `yaml:"extra_query_delay,omitempty"`
QueryIngestersWithin time.Duration `yaml:"query_ingesters_within,omitempty"`
@ -60,7 +59,6 @@ type Config struct {
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
cfg.Engine.RegisterFlagsWithPrefix("querier", f)
f.DurationVar(&cfg.TailMaxDuration, "querier.tail-max-duration", 1*time.Hour, "Limit the duration for which live tailing request would be served")
f.DurationVar(&cfg.QueryTimeout, "querier.query-timeout", 1*time.Minute, "Timeout when querying backends (ingesters or storage) during the execution of a query request")
f.DurationVar(&cfg.ExtraQueryDelay, "querier.extra-query-delay", 0, "Time to wait before sending more than the minimum successful query requests.")
f.DurationVar(&cfg.QueryIngestersWithin, "querier.query-ingesters-within", 3*time.Hour, "Maximum lookback beyond which queries are not sent to ingester. 0 means all queries are sent to ingester.")
f.IntVar(&cfg.MaxConcurrent, "querier.max-concurrent", 10, "The maximum number of concurrent queries.")
@ -355,7 +353,8 @@ func (q *SingleTenantQuerier) Label(ctx context.Context, req *logproto.LabelRequ
}
// Enforce the query timeout while querying backends
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(q.cfg.QueryTimeout))
queryTimeout := q.limits.QueryTimeout(userID)
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(queryTimeout))
defer cancel()
g, ctx := errgroup.WithContext(ctx)
@ -439,7 +438,12 @@ func (q *SingleTenantQuerier) Tail(ctx context.Context, req *logproto.TailReques
// Enforce the query timeout except when tailing, otherwise the tailing
// will be terminated once the query timeout is reached
tailCtx := ctx
queryCtx, cancelQuery := context.WithDeadline(ctx, time.Now().Add(q.cfg.QueryTimeout))
tenantID, err := tenant.TenantID(tailCtx)
if err != nil {
return nil, errors.Wrap(err, "failed to load tenant")
}
queryTimeout := q.limits.QueryTimeout(tenantID)
queryCtx, cancelQuery := context.WithDeadline(ctx, time.Now().Add(queryTimeout))
defer cancelQuery()
tailClients, err := q.ingesterQuerier.Tail(tailCtx, req)
@ -482,7 +486,8 @@ func (q *SingleTenantQuerier) Series(ctx context.Context, req *logproto.SeriesRe
}
// Enforce the query timeout while querying backends
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(q.cfg.QueryTimeout))
queryTimeout := q.limits.QueryTimeout(userID)
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(queryTimeout))
defer cancel()
return q.awaitSeries(ctx, req)
@ -704,7 +709,8 @@ func (q *SingleTenantQuerier) IndexStats(ctx context.Context, req *loghttp.Range
}
// Enforce the query timeout while querying backends
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(q.cfg.QueryTimeout))
queryTimeout := q.limits.QueryTimeout(userID)
ctx, cancel := context.WithDeadline(ctx, time.Now().Add(queryTimeout))
defer cancel()
return q.store.Stats(

@ -47,8 +47,9 @@ func TestQuerier_Label_QueryTimeoutConfigFlag(t *testing.T) {
store := newStoreMock()
store.On("LabelValuesForMetricName", mock.Anything, "test", model.TimeFromUnixNano(startTime.UnixNano()), model.TimeFromUnixNano(endTime.UnixNano()), "logs", "test").Return([]string{"foo", "bar"}, nil)
limits, err := validation.NewOverrides(defaultLimitsTestConfig(), nil)
limitsCfg := defaultLimitsTestConfig()
limitsCfg.QueryTimeout = model.Duration(queryTimeout)
limits, err := validation.NewOverrides(limitsCfg, nil)
require.NoError(t, err)
q, err := newQuerier(
@ -101,7 +102,9 @@ func TestQuerier_Tail_QueryTimeoutConfigFlag(t *testing.T) {
ingesterClient.On("Tail", mock.Anything, &request, mock.Anything).Return(tailClient, nil)
ingesterClient.On("TailersCount", mock.Anything, mock.Anything, mock.Anything).Return(&logproto.TailersCountResponse{}, nil)
limits, err := validation.NewOverrides(defaultLimitsTestConfig(), nil)
limitsCfg := defaultLimitsTestConfig()
limitsCfg.QueryTimeout = model.Duration(queryTimeout)
limits, err := validation.NewOverrides(limitsCfg, nil)
require.NoError(t, err)
q, err := newQuerier(
@ -140,7 +143,6 @@ func TestQuerier_Tail_QueryTimeoutConfigFlag(t *testing.T) {
func mockQuerierConfig() Config {
return Config{
TailMaxDuration: 1 * time.Minute,
QueryTimeout: queryTimeout,
}
}

@ -116,6 +116,7 @@ func Test_shardSplitter(t *testing.T) {
now: func() time.Time { return end },
limits: fakeLimits{
minShardingLookback: tc.lookback,
queryTimeout: time.Minute,
maxQueryParallelism: 1,
},
}
@ -156,7 +157,7 @@ func Test_astMapper(t *testing.T) {
handler,
log.NewNopLogger(),
nilShardingMetrics,
fakeLimits{maxSeries: math.MaxInt32, maxQueryParallelism: 1},
fakeLimits{maxSeries: math.MaxInt32, maxQueryParallelism: 1, queryTimeout: time.Second},
)
resp, err := mware.Do(user.InjectOrgID(context.Background(), "1"), defaultReq().WithQuery(`{food="bar"}`))
@ -257,6 +258,7 @@ func Test_InstantSharding(t *testing.T) {
fakeLimits{
maxSeries: math.MaxInt32,
maxQueryParallelism: 10,
queryTimeout: time.Second,
})
response, err := sharding.Wrap(queryrangebase.HandlerFunc(func(c context.Context, r queryrangebase.Request) (queryrangebase.Response, error) {
lock.Lock()

@ -603,6 +603,7 @@ type fakeLimits struct {
maxSeries int
splits map[string]time.Duration
minShardingLookback time.Duration
queryTimeout time.Duration
}
func (f fakeLimits) QuerySplitDuration(key string) time.Duration {
@ -643,6 +644,10 @@ func (f fakeLimits) MinShardingLookback(string) time.Duration {
return f.minShardingLookback
}
func (f fakeLimits) QueryTimeout(string) time.Duration {
return f.queryTimeout
}
func counter() (*int, http.Handler) {
count := 0
var lock sync.Mutex

@ -18,7 +18,8 @@ import (
func Test_RangeVectorSplit(t *testing.T) {
srm := NewSplitByRangeMiddleware(log.NewNopLogger(), fakeLimits{
maxSeries: 10000,
maxSeries: 10000,
queryTimeout: time.Second,
splits: map[string]time.Duration{
"tenant": time.Minute,
},

@ -85,6 +85,7 @@ type Limits struct {
MaxCacheFreshness model.Duration `yaml:"max_cache_freshness_per_query" json:"max_cache_freshness_per_query"`
MaxQueriersPerTenant int `yaml:"max_queriers_per_tenant" json:"max_queriers_per_tenant"`
QueryReadyIndexNumDays int `yaml:"query_ready_index_num_days" json:"query_ready_index_num_days"`
QueryTimeout model.Duration `yaml:"query_timeout" json:"query_timeout"`
// Query frontend enforced limits. The default is actually parameterized by the queryrange config.
QuerySplitDuration model.Duration `yaml:"split_queries_by_interval" json:"split_queries_by_interval"`
@ -171,6 +172,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
_ = l.MaxQueryLength.Set("721h")
f.Var(&l.MaxQueryLength, "store.max-query-length", "Limit to length of chunk store queries, 0 to disable.")
f.IntVar(&l.MaxQuerySeries, "querier.max-query-series", 500, "Limit the maximum of unique series returned by a metric query. When the limit is reached an error is returned.")
_ = l.QueryTimeout.Set("1m")
f.Var(&l.QueryTimeout, "querier.query-timeout", "Timeout when querying backends (ingesters or storage) during the execution of a query request. If a specific per-tenant timeout is used, this timeout is ignored.")
_ = l.MaxQueryLookback.Set("0s")
f.Var(&l.MaxQueryLookback, "querier.max-query-lookback", "Limit how long back data (series and metadata) can be queried, up until <lookback> duration ago. This limit is enforced in the query-frontend, querier and ruler. If the requested time range is outside the allowed range, the request will not fail but will be manipulated to only query data within the allowed time range. 0 to disable.")
@ -443,6 +446,10 @@ func (o *Overrides) MaxEntriesLimitPerQuery(userID string) int {
return o.getOverridesForUser(userID).MaxEntriesLimitPerQuery
}
func (o *Overrides) QueryTimeout(userID string) time.Duration {
return time.Duration(o.getOverridesForUser(userID).QueryTimeout)
}
func (o *Overrides) MaxCacheFreshness(userID string) time.Duration {
return time.Duration(o.getOverridesForUser(userID).MaxCacheFreshness)
}

@ -71,6 +71,7 @@ ruler_remote_write_sigv4_config:
region: us-east-1
per_tenant_override_config: ""
per_tenant_override_period: 230s
query_timeout: 5m
`
inputJSON := `
{
@ -106,7 +107,8 @@ per_tenant_override_period: 230s
"region": "us-east-1"
},
"per_tenant_override_config": "",
"per_tenant_override_period": "230s"
"per_tenant_override_period": "230s",
"query_timeout": "5m"
}
`
@ -239,6 +241,24 @@ reject_old_samples: true
},
},
},
{
desc: "per tenant query timeout",
yaml: `
query_timeout: 5m
`,
exp: Limits{
QueryTimeout: model.Duration(5 * time.Minute),
// Rest from new defaults.
RulerRemoteWriteHeaders: OverwriteMarshalingStringMap{map[string]string{"a": "b"}},
StreamRetention: []StreamRetention{
{
Period: model.Duration(24 * time.Hour),
Selector: `{a="b"}`,
},
},
},
},
} {
t.Run(tc.desc, func(t *testing.T) {
