Ruler: rule evaluation jitter (#8848)

**What this PR does / why we need it**:
This PR introduces a configurable random sleep before each rule
evaluation to prevent contention on the query engine when multiple
evaluations run concurrently. Rules that share the same interval are
quite likely to be evaluated at roughly the same time.
pull/8853/head
Danny Kopping 2 years ago committed by GitHub
parent f5f1753851
commit 5c3d204ebf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      CHANGELOG.md
  2. 5
      docs/sources/configuration/_index.md
  3. 3
      pkg/loki/modules.go
  4. 3
      pkg/ruler/base/ruler.go
  5. 37
      pkg/ruler/evaluator_jitter.go
  6. 46
      pkg/ruler/evaluator_jitter_test.go
  7. 2
      pkg/validation/limits.go

@ -8,6 +8,7 @@
##### Enhancements
* [8848](https://github.com/grafana/loki/pull/8848) **dannykopping**: Ruler: add configurable rule evaluation jitter.
* [8752](https://github.com/grafana/loki/pull/8752) **chaudum**: Add query fairness control across actors within a tenant to scheduler, which can be enabled by passing the `X-Loki-Actor-Path` header to the HTTP request of the query.
* [8786](https://github.com/grafana/loki/pull/8786) **DylanGuedes**: Ingester: add new /ingester/prepare_shutdown endpoint.
* [8744](https://github.com/grafana/loki/pull/8744) **dannykopping**: Ruler: remote rule evaluation.

@ -817,6 +817,11 @@ storage:
# CLI flag: -ruler.rule-path
[rule_path: <string> | default = "/rules"]
# Upper bound of random duration to wait before rule evaluation to avoid
# contention during concurrent execution of rules. Set 0 to disable (default).
# CLI flag: -ruler.evaluation-jitter
[evaluation_jitter: <duration> | default = 0s]
# Comma-separated list of Alertmanager URLs to send notifications to. Each
# Alertmanager URL is treated as a separate group in the configuration. Multiple
# Alertmanagers in HA per group can be supported by using DNS resolution via

@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"math/rand"
"net/http"
"net/http/httputil"
"net/url"
@ -1014,7 +1015,7 @@ func (t *Loki) initRuleEvaluator() (services.Service, error) {
return nil, fmt.Errorf("failed to create %s rule evaluator: %w", mode, err)
}
t.ruleEvaluator = evaluator
t.ruleEvaluator = ruler.NewEvaluatorWithJitter(evaluator, t.Cfg.Ruler.EvaluationJitter, rand.NewSource(time.Now().UnixNano()))
return nil, nil
}

@ -88,6 +88,8 @@ type Config struct {
StoreConfig RuleStoreConfig `yaml:"storage" doc:"deprecated|description=Use -ruler-storage. CLI flags and their respective YAML config options instead."`
// Path to store rule files for prom manager.
RulePath string `yaml:"rule_path"`
// Maximum time to sleep before each rule evaluation.
EvaluationJitter time.Duration `yaml:"evaluation_jitter"`
// Global alertmanager config.
config.AlertManagerConfig `yaml:",inline"`
@ -161,6 +163,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.Var(&cfg.ExternalURL, "ruler.external.url", "URL of alerts return path.")
f.DurationVar(&cfg.EvaluationInterval, "ruler.evaluation-interval", 1*time.Minute, "How frequently to evaluate rules.")
f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes.")
f.DurationVar(&cfg.EvaluationJitter, "ruler.evaluation-jitter", 0, "Upper bound of random duration to wait before rule evaluation to avoid contention during concurrent execution of rules. Set 0 to disable (default).")
f.StringVar(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "", "Comma-separated list of Alertmanager URLs to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. Multiple Alertmanagers in HA per group can be supported by using DNS resolution via '-ruler.alertmanager-discovery'.")
f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover Alertmanager hosts.")

@ -0,0 +1,37 @@
package ruler
import (
	"context"
	"math/rand"
	"sync"
	"time"

	"github.com/grafana/loki/pkg/logqlmodel"
)
// EvaluatorWithJitter wraps a given Evaluator. It applies a randomly-generated jitter (sleep) before each evaluation to
// protect against thundering-herd scenarios where multiple rules are evaluated at the same time.
//
// Because evaluations are expected to overlap (that is the contention this wrapper mitigates), access to the random
// number generator is serialised with a mutex: a *rand.Rand built from a caller-supplied rand.Source is not safe for
// concurrent use.
type EvaluatorWithJitter struct {
	inner     Evaluator
	maxJitter time.Duration

	rngMtx sync.Mutex // guards rng
	rng    *rand.Rand
}

// NewEvaluatorWithJitter wraps inner so that every evaluation is preceded by a random wait in the range [0, maxJitter).
// When maxJitter is zero or negative no wrapping takes place and inner is returned unchanged. rngSource seeds the
// generator from which the per-evaluation jitter is drawn.
func NewEvaluatorWithJitter(inner Evaluator, maxJitter time.Duration, rngSource rand.Source) Evaluator {
	if maxJitter <= 0 {
		// jitter is disabled or invalid
		return inner
	}

	return &EvaluatorWithJitter{
		inner:     inner,
		maxJitter: maxJitter,
		rng:       rand.New(rngSource),
	}
}

// Eval waits for a random duration below the configured maximum and then delegates to the wrapped Evaluator.
// If ctx is cancelled while waiting, the wait is abandoned and ctx.Err() is returned without calling the wrapped
// Evaluator.
func (e *EvaluatorWithJitter) Eval(ctx context.Context, qs string, now time.Time) (*logqlmodel.Result, error) {
	if jitter := e.nextJitter(); jitter > 0 {
		// Use a timer rather than time.Sleep so that a cancelled evaluation does not block for the full jitter.
		timer := time.NewTimer(jitter)
		defer timer.Stop()

		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-timer.C:
		}
	}

	return e.inner.Eval(ctx, qs, now)
}

// nextJitter draws the next wait duration in the range [0, maxJitter) while holding the generator lock.
func (e *EvaluatorWithJitter) nextJitter() time.Duration {
	if e.maxJitter <= 0 {
		// Int63n panics on a non-positive bound; treat it as "no jitter" instead.
		return 0
	}

	e.rngMtx.Lock()
	defer e.rngMtx.Unlock()

	return time.Duration(e.rng.Int63n(e.maxJitter.Nanoseconds()))
}

@ -0,0 +1,46 @@
package ruler
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/grafana/loki/pkg/logqlmodel"
)
// fixedRandNum is the value fakeSource returns from every Int63 call.
const fixedRandNum int64 = 987654321

// mockEval is a stub evaluator whose Eval performs no work.
type mockEval struct{}

// Eval ignores all of its arguments and yields a nil result together with a nil error.
func (mockEval) Eval(_ context.Context, _ string, _ time.Time) (*logqlmodel.Result, error) {
	var res *logqlmodel.Result
	return res, nil
}
// fakeSource is a deterministic stand-in for a random source: it exposes Int63 and Seed and is handed to
// NewEvaluatorWithJitter as its rngSource in the tests.
type fakeSource struct{}

// Int63 always returns fixedRandNum.
func (fakeSource) Int63() int64 {
	return fixedRandNum
}

// Seed is a no-op; the supplied seed is ignored.
func (fakeSource) Seed(_ int64) {}
// TestEvaluationWithJitter checks that an evaluation through the jittering wrapper takes at least fixedRandNum
// nanoseconds. fakeSource always produces fixedRandNum, which lies below the 2s upper bound used here.
func TestEvaluationWithJitter(t *testing.T) {
	const maxJitter = 2 * time.Second

	ctx := context.Background()
	evaluator := NewEvaluatorWithJitter(mockEval{}, maxJitter, fakeSource{})

	start := time.Now()
	_, _ = evaluator.Eval(ctx, "some logql query...", time.Now())
	elapsed := time.Since(start).Nanoseconds()

	require.GreaterOrEqual(t, elapsed, fixedRandNum)
}
// TestEvaluationWithNoJitter checks that a zero jitter bound yields the original evaluator rather than a wrapper.
func TestEvaluationWithNoJitter(t *testing.T) {
	wrapped := mockEval{}

	// with jitter disabled the constructor must hand back the inner evaluator untouched
	got := NewEvaluatorWithJitter(wrapped, 0, fakeSource{})

	require.Exactly(t, wrapped, got)
}

@ -101,6 +101,8 @@ type Limits struct {
MinShardingLookback model.Duration `yaml:"min_sharding_lookback" json:"min_sharding_lookback"`
// Ruler defaults and limits.
// TODO(dannyk): this setting is misnamed and probably deprecatable.
RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"`
RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"`
RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"`

Loading…
Cancel
Save