mirror of https://github.com/grafana/grafana
Alerting: Add setting to distribute rule group evaluations over time (#80766)
* Simple, per-base-interval jitter
* Add log just for test purposes
* Add strategy approach, allow choosing between group or rule
* Add flag to jitter rules
* Add second toggle for jittering within a group
* Wire up toggles to strategy
* Slightly improve comment ordering
* Add tests for offset generation
* Rename JitterStrategyFrom
* Improve debug log message
* Use grafana SDK labels rather than prometheus labels

pull/80841/head
parent
94c3be3b49
commit
00a260effa
|
@ -0,0 +1,66 @@ |
||||
package schedule |
||||
|
||||
import ( |
||||
"fmt" |
||||
"time" |
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data" |
||||
"github.com/grafana/grafana/pkg/services/featuremgmt" |
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" |
||||
) |
||||
|
||||
// JitterStrategy represents a modifier to alert rule timing that affects how evaluations are distributed.
type JitterStrategy int

const (
	// JitterNever disables jitter: the computed offset is always zero.
	JitterNever JitterStrategy = iota
	// JitterByGroup derives the offset from the rule's group, namespace and org,
	// so rules sharing those values hash to the same input.
	JitterByGroup
	// JitterByRule additionally mixes the rule UID into the hash,
	// so rules within one group can receive different offsets.
	JitterByRule
)
||||
|
||||
// JitterStrategyFrom returns the JitterStrategy indicated by the current Grafana feature toggles.
|
||||
func JitterStrategyFrom(toggles featuremgmt.FeatureToggles) JitterStrategy { |
||||
strategy := JitterNever |
||||
if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRules) { |
||||
strategy = JitterByGroup |
||||
} |
||||
if toggles.IsEnabledGlobally(featuremgmt.FlagJitterAlertRulesWithinGroups) { |
||||
strategy = JitterByRule |
||||
} |
||||
return strategy |
||||
} |
||||
|
||||
// jitterOffsetInTicks gives the jitter offset for a rule, in terms of a number of ticks relative to its interval and a base interval.
|
||||
// The resulting number of ticks is non-negative. We assume the rule is well-formed and has an IntervalSeconds greater to or equal than baseInterval.
|
||||
func jitterOffsetInTicks(r *ngmodels.AlertRule, baseInterval time.Duration, strategy JitterStrategy) int64 { |
||||
if strategy == JitterNever { |
||||
return 0 |
||||
} |
||||
|
||||
itemFrequency := r.IntervalSeconds / int64(baseInterval.Seconds()) |
||||
offset := jitterHash(r, strategy) % uint64(itemFrequency) |
||||
// Offset is always nonnegative and less than int64.max, because above we mod by itemFrequency which fits in the positive half of int64.
|
||||
// offset <= itemFrequency <= int64.max
|
||||
// So, this will not overflow and produce a negative offset.
|
||||
res := int64(offset) |
||||
|
||||
// Regardless, take an absolute value anyway for an extra layer of safety in case the above logic ever changes.
|
||||
// Our contract requires that the result is nonnegative and less than int64.max.
|
||||
if res < 0 { |
||||
return -res |
||||
} |
||||
return res |
||||
} |
||||
|
||||
func jitterHash(r *ngmodels.AlertRule, strategy JitterStrategy) uint64 { |
||||
ls := data.Labels{ |
||||
"name": r.RuleGroup, |
||||
"file": r.NamespaceUID, |
||||
"orgId": fmt.Sprint(r.OrgID), |
||||
} |
||||
|
||||
if strategy == JitterByRule { |
||||
ls["uid"] = r.UID |
||||
} |
||||
return uint64(ls.Fingerprint()) |
||||
} |
||||
@ -0,0 +1,100 @@ |
||||
package schedule |
||||
|
||||
import ( |
||||
"testing" |
||||
"time" |
||||
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" |
||||
"github.com/stretchr/testify/require" |
||||
) |
||||
|
||||
func TestJitter(t *testing.T) { |
||||
t.Run("when strategy is JitterNever", func(t *testing.T) { |
||||
t.Run("offset is always zero", func(t *testing.T) { |
||||
rules := createTestRules(100, ngmodels.WithIntervalBetween(10, 600)) |
||||
baseInterval := 10 * time.Second |
||||
|
||||
for _, r := range rules { |
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterNever) |
||||
require.Zero(t, offset, "unexpected offset, should be zero with jitter disabled; got %d", offset) |
||||
} |
||||
}) |
||||
}) |
||||
|
||||
t.Run("when strategy is JitterByGroup", func(t *testing.T) { |
||||
t.Run("offset is stable for the same rule", func(t *testing.T) { |
||||
rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))() |
||||
baseInterval := 10 * time.Second |
||||
original := jitterOffsetInTicks(rule, baseInterval, JitterByGroup) |
||||
|
||||
for i := 0; i < 100; i++ { |
||||
offset := jitterOffsetInTicks(rule, baseInterval, JitterByGroup) |
||||
require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule") |
||||
} |
||||
}) |
||||
|
||||
t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) { |
||||
baseInterval := 10 * time.Second |
||||
rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600)) |
||||
|
||||
for _, r := range rules { |
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup) |
||||
require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds) |
||||
upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds()) |
||||
require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit) |
||||
} |
||||
}) |
||||
|
||||
t.Run("offset for any rule in the same group is always the same", func(t *testing.T) { |
||||
baseInterval := 10 * time.Second |
||||
group1 := ngmodels.AlertRuleGroupKey{} |
||||
group2 := ngmodels.AlertRuleGroupKey{} |
||||
rules1 := createTestRules(1000, ngmodels.WithInterval(60*time.Second), ngmodels.WithGroupKey(group1)) |
||||
rules2 := createTestRules(1000, ngmodels.WithInterval(1*time.Hour), ngmodels.WithGroupKey(group2)) |
||||
|
||||
group1Offset := jitterOffsetInTicks(rules1[0], baseInterval, JitterByGroup) |
||||
for _, r := range rules1 { |
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup) |
||||
require.Equal(t, group1Offset, offset) |
||||
} |
||||
group2Offset := jitterOffsetInTicks(rules2[0], baseInterval, JitterByGroup) |
||||
for _, r := range rules2 { |
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterByGroup) |
||||
require.Equal(t, group2Offset, offset) |
||||
} |
||||
}) |
||||
}) |
||||
|
||||
t.Run("when strategy is JitterByRule", func(t *testing.T) { |
||||
t.Run("offset is stable for the same rule", func(t *testing.T) { |
||||
rule := ngmodels.AlertRuleGen(ngmodels.WithIntervalBetween(10, 600))() |
||||
baseInterval := 10 * time.Second |
||||
original := jitterOffsetInTicks(rule, baseInterval, JitterByRule) |
||||
|
||||
for i := 0; i < 100; i++ { |
||||
offset := jitterOffsetInTicks(rule, baseInterval, JitterByRule) |
||||
require.Equal(t, original, offset, "jitterOffsetInTicks should return the same value for the same rule") |
||||
} |
||||
}) |
||||
|
||||
t.Run("offset is on the interval [0, interval/baseInterval)", func(t *testing.T) { |
||||
baseInterval := 10 * time.Second |
||||
rules := createTestRules(1000, ngmodels.WithIntervalBetween(10, 600)) |
||||
|
||||
for _, r := range rules { |
||||
offset := jitterOffsetInTicks(r, baseInterval, JitterByRule) |
||||
require.GreaterOrEqual(t, offset, int64(0), "offset cannot be negative, got %d for rule with interval %d", offset, r.IntervalSeconds) |
||||
upperLimit := r.IntervalSeconds / int64(baseInterval.Seconds()) |
||||
require.Less(t, offset, upperLimit, "offset cannot be equal to or greater than interval/baseInterval of %d", upperLimit) |
||||
} |
||||
}) |
||||
}) |
||||
} |
||||
|
||||
func createTestRules(n int, mutators ...ngmodels.AlertRuleMutator) []*ngmodels.AlertRule { |
||||
result := make([]*ngmodels.AlertRule, 0, n) |
||||
for i := 0; i < n; i++ { |
||||
result = append(result, ngmodels.AlertRuleGen(mutators...)()) |
||||
} |
||||
return result |
||||
} |
||||
Loading…
Reference in new issue