Ruler: rule evaluation jitter (#8848)

**What this PR does / why we need it**: This PR introduces a configurable random sleep before each rule evaluation, to prevent contention on the query engine when multiple evaluations execute concurrently. Rules that share the same evaluation interval are quite likely to be evaluated at nearly the same time.
2 years ago · 5c3d204ebf
parent f5f1753851
commit 5c3d204ebf
7 changed files with 96 additions and 1 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -8,6 +8,7 @@

 ##### Enhancements

+* [8848](https://github.com/grafana/loki/pull/8848) **dannykopping**: Ruler: add configurable rule evaluation jitter.
 * [8752](https://github.com/grafana/loki/pull/8752) **chaudum**: Add query fairness control across actors within a tenant to scheduler, which can be enabled by passing the `X-Loki-Actor-Path` header to the HTTP request of the query.
 * [8786](https://github.com/grafana/loki/pull/8786) **DylanGuedes**: Ingester: add new /ingester/prepare_shutdown endpoint.
 * [8744](https://github.com/grafana/loki/pull/8744) **dannykopping**: Ruler: remote rule evaluation.
--- a/docs/sources/configuration/_index.md
+++ b/docs/sources/configuration/_index.md
@ -817,6 +817,11 @@ storage:
 # CLI flag: -ruler.rule-path
 [rule_path: <string> | default = "/rules"]

+# Upper bound of random duration to wait before rule evaluation to avoid
+# contention during concurrent execution of rules. Set 0 to disable (default).
+# CLI flag: -ruler.evaluation-jitter
+[evaluation_jitter: <duration> | default = 0s]
+
 # Comma-separated list of Alertmanager URLs to send notifications to. Each
 # Alertmanager URL is treated as a separate group in the configuration. Multiple
 # Alertmanagers in HA per group can be supported by using DNS resolution via
--- a/pkg/loki/modules.go
+++ b/pkg/loki/modules.go
@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"math/rand"
 	"net/http"
 	"net/http/httputil"
 	"net/url"
@ -1014,7 +1015,7 @@ func (t *Loki) initRuleEvaluator() (services.Service, error) {
 		return nil, fmt.Errorf("failed to create %s rule evaluator: %w", mode, err)
 	}

-	t.ruleEvaluator = evaluator
+	t.ruleEvaluator = ruler.NewEvaluatorWithJitter(evaluator, t.Cfg.Ruler.EvaluationJitter, rand.NewSource(time.Now().UnixNano()))

 	return nil, nil
 }
--- a/pkg/ruler/base/ruler.go
+++ b/pkg/ruler/base/ruler.go
@ -88,6 +88,8 @@ type Config struct {
 	StoreConfig RuleStoreConfig `yaml:"storage" doc:"deprecated|description=Use -ruler-storage. CLI flags and their respective YAML config options instead."`
 	// Path to store rule files for prom manager.
 	RulePath string `yaml:"rule_path"`
+	// Maximum time to sleep before each rule evaluation.
+	EvaluationJitter time.Duration `yaml:"evaluation_jitter"`

 	// Global alertmanager config.
 	config.AlertManagerConfig `yaml:",inline"`
@ -161,6 +163,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 	f.Var(&cfg.ExternalURL, "ruler.external.url", "URL of alerts return path.")
 	f.DurationVar(&cfg.EvaluationInterval, "ruler.evaluation-interval", 1*time.Minute, "How frequently to evaluate rules.")
 	f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes.")
+	f.DurationVar(&cfg.EvaluationJitter, "ruler.evaluation-jitter", 0, "Upper bound of random duration to wait before rule evaluation to avoid contention during concurrent execution of rules. Set 0 to disable (default).")

 	f.StringVar(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "", "Comma-separated list of Alertmanager URLs to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. Multiple Alertmanagers in HA per group can be supported by using DNS resolution via '-ruler.alertmanager-discovery'.")
 	f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover Alertmanager hosts.")
--- a/pkg/ruler/evaluator_jitter.go
+++ b/pkg/ruler/evaluator_jitter.go
@ -0,0 +1,37 @@
+package ruler
+
+import (
+	"context"
+	"math/rand"
+	"sync"
+	"time"
+
+	"github.com/grafana/loki/pkg/logqlmodel"
+)
+
+// EvaluatorWithJitter wraps a given Evaluator. It applies a randomly-generated jitter (wait) before each evaluation to
+// protect against thundering-herd scenarios where multiple rules are evaluated at the same time.
+//
+// Rules may be evaluated concurrently, so access to the random number generator is serialised with rngMtx: a
+// *rand.Rand built from a caller-supplied rand.Source is not safe for concurrent use.
+type EvaluatorWithJitter struct {
+	inner     Evaluator
+	maxJitter time.Duration
+
+	rngMtx sync.Mutex
+	rng    *rand.Rand
+}
+
+// NewEvaluatorWithJitter wraps inner so that every evaluation is preceded by a random wait in the range
+// [0, maxJitter), drawn from rngSource. If maxJitter is zero or negative, jitter is disabled and inner is returned
+// unwrapped.
+func NewEvaluatorWithJitter(inner Evaluator, maxJitter time.Duration, rngSource rand.Source) Evaluator {
+	if maxJitter <= 0 {
+		// jitter is disabled or invalid
+		return inner
+	}
+
+	return &EvaluatorWithJitter{
+		inner:     inner,
+		maxJitter: maxJitter,
+		rng:       rand.New(rngSource),
+	}
+}
+
+// Eval waits for a random duration in [0, maxJitter) and then delegates to the wrapped Evaluator. If ctx is done
+// before the wait elapses, the wait is abandoned and ctx.Err() is returned without calling the wrapped Evaluator.
+func (e *EvaluatorWithJitter) Eval(ctx context.Context, qs string, now time.Time) (*logqlmodel.Result, error) {
+	// hold the lock only while drawing the random number, never while waiting
+	e.rngMtx.Lock()
+	jitter := time.Duration(e.rng.Int63n(e.maxJitter.Nanoseconds()))
+	e.rngMtx.Unlock()
+
+	// use a timer rather than time.Sleep so that a cancelled evaluation does not block for the full jitter
+	timer := time.NewTimer(jitter)
+	defer timer.Stop()
+
+	select {
+	case <-ctx.Done():
+		return nil, ctx.Err()
+	case <-timer.C:
+	}
+
+	return e.inner.Eval(ctx, qs, now)
+}
--- a/pkg/ruler/evaluator_jitter_test.go
+++ b/pkg/ruler/evaluator_jitter_test.go
@ -0,0 +1,46 @@
+package ruler
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/grafana/loki/pkg/logqlmodel"
+)
+
+// fixedRandNum is the value fakeSource returns from every Int63 call, making the generated jitter deterministic.
+const fixedRandNum int64 = 987654321
+
+// mockEval is a no-op Evaluator used as the wrapped evaluator in these tests.
+type mockEval struct{}
+
+// Eval ignores all of its arguments and returns a nil result together with a nil error.
+func (mockEval) Eval(_ context.Context, _ string, _ time.Time) (*logqlmodel.Result, error) {
+	return nil, nil
+}
+
+// fakeSource is a rand.Source stand-in whose output never varies.
+type fakeSource struct{}
+
+// Int63 always yields fixedRandNum.
+func (fakeSource) Int63() int64 {
+	return fixedRandNum
+}
+
+// Seed does nothing; the source has no state to seed.
+func (fakeSource) Seed(_ int64) {}
+
+// TestEvaluationWithJitter checks that an evaluation made through the jitter wrapper takes at least as long as the
+// jitter derived from the fake random source.
+func TestEvaluationWithJitter(t *testing.T) {
+	const maxJitter = 2 * time.Second
+
+	evaluator := NewEvaluatorWithJitter(mockEval{}, maxJitter, fakeSource{})
+
+	start := time.Now()
+	_, _ = evaluator.Eval(context.Background(), "some logql query...", time.Now())
+	elapsed := time.Since(start)
+
+	// fixedRandNum is below maxJitter in nanoseconds, so the evaluation must have waited at least that long
+	require.GreaterOrEqual(t, elapsed.Nanoseconds(), fixedRandNum)
+}
+
+func TestEvaluationWithNoJitter(t *testing.T) {
+	const jitter = 0
+
+	inner := mockEval{}
+	eval := NewEvaluatorWithJitter(inner, jitter, fakeSource{})
+
+	// return the inner evaluator if jitter is disabled
+	require.Exactly(t, inner, eval)
+}
--- a/pkg/validation/limits.go
+++ b/pkg/validation/limits.go
@ -101,6 +101,8 @@ type Limits struct {
 	MinShardingLookback model.Duration `yaml:"min_sharding_lookback" json:"min_sharding_lookback"`

 	// Ruler defaults and limits.
+
+	// TODO(dannyk): this setting is misnamed and probably deprecatable.
 	RulerEvaluationDelay        model.Duration                   `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"`
 	RulerMaxRulesPerRuleGroup   int                              `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"`
 	RulerMaxRuleGroupsPerTenant int                              `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"`