|
|
|
|
@ -407,6 +407,7 @@ func TestForStateRestore(t *testing.T) { |
|
|
|
|
|
|
|
|
|
// Prometheus goes down here. We create new rules and groups.
|
|
|
|
|
type testInput struct { |
|
|
|
|
name string |
|
|
|
|
restoreDuration time.Duration |
|
|
|
|
alerts []*Alert |
|
|
|
|
|
|
|
|
|
@ -414,105 +415,110 @@ func TestForStateRestore(t *testing.T) { |
|
|
|
|
noRestore bool |
|
|
|
|
gracePeriod bool |
|
|
|
|
downDuration time.Duration |
|
|
|
|
before func() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
tests := []testInput{ |
|
|
|
|
{ |
|
|
|
|
// Normal restore (alerts were not firing).
|
|
|
|
|
name: "normal restore (alerts were not firing)", |
|
|
|
|
restoreDuration: 15 * time.Minute, |
|
|
|
|
alerts: rule.ActiveAlerts(), |
|
|
|
|
downDuration: 10 * time.Minute, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
// Testing Outage Tolerance.
|
|
|
|
|
name: "outage tolerance", |
|
|
|
|
restoreDuration: 40 * time.Minute, |
|
|
|
|
noRestore: true, |
|
|
|
|
num: 2, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
// No active alerts.
|
|
|
|
|
name: "no active alerts", |
|
|
|
|
restoreDuration: 50 * time.Minute, |
|
|
|
|
alerts: []*Alert{}, |
|
|
|
|
}, |
|
|
|
|
{ |
|
|
|
|
name: "test the grace period", |
|
|
|
|
restoreDuration: 25 * time.Minute, |
|
|
|
|
alerts: []*Alert{}, |
|
|
|
|
gracePeriod: true, |
|
|
|
|
before: func() { |
|
|
|
|
for _, duration := range []time.Duration{10 * time.Minute, 15 * time.Minute, 20 * time.Minute} { |
|
|
|
|
evalTime := baseTime.Add(duration) |
|
|
|
|
group.Eval(context.TODO(), evalTime) |
|
|
|
|
} |
|
|
|
|
}, |
|
|
|
|
num: 2, |
|
|
|
|
}, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
testFunc := func(tst testInput) { |
|
|
|
|
newRule := NewAlertingRule( |
|
|
|
|
"HTTPRequestRateLow", |
|
|
|
|
expr, |
|
|
|
|
alertForDuration, |
|
|
|
|
0, |
|
|
|
|
labels.FromStrings("severity", "critical"), |
|
|
|
|
labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil, |
|
|
|
|
) |
|
|
|
|
newGroup := NewGroup(GroupOptions{ |
|
|
|
|
Name: "default", |
|
|
|
|
Interval: time.Second, |
|
|
|
|
Rules: []Rule{newRule}, |
|
|
|
|
ShouldRestore: true, |
|
|
|
|
Opts: opts, |
|
|
|
|
}) |
|
|
|
|
for _, tt := range tests { |
|
|
|
|
t.Run(tt.name, func(t *testing.T) { |
|
|
|
|
if tt.before != nil { |
|
|
|
|
tt.before() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
newGroups := make(map[string]*Group) |
|
|
|
|
newGroups["default;"] = newGroup |
|
|
|
|
newRule := NewAlertingRule( |
|
|
|
|
"HTTPRequestRateLow", |
|
|
|
|
expr, |
|
|
|
|
alertForDuration, |
|
|
|
|
0, |
|
|
|
|
labels.FromStrings("severity", "critical"), |
|
|
|
|
labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil, |
|
|
|
|
) |
|
|
|
|
newGroup := NewGroup(GroupOptions{ |
|
|
|
|
Name: "default", |
|
|
|
|
Interval: time.Second, |
|
|
|
|
Rules: []Rule{newRule}, |
|
|
|
|
ShouldRestore: true, |
|
|
|
|
Opts: opts, |
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
restoreTime := baseTime.Add(tst.restoreDuration) |
|
|
|
|
// First eval before restoration.
|
|
|
|
|
newGroup.Eval(context.TODO(), restoreTime) |
|
|
|
|
// Restore happens here.
|
|
|
|
|
newGroup.RestoreForState(restoreTime) |
|
|
|
|
newGroups := make(map[string]*Group) |
|
|
|
|
newGroups["default;"] = newGroup |
|
|
|
|
|
|
|
|
|
got := newRule.ActiveAlerts() |
|
|
|
|
for _, aa := range got { |
|
|
|
|
require.Zero(t, aa.Labels.Get(model.MetricNameLabel), "%s label set on active alert: %s", model.MetricNameLabel, aa.Labels) |
|
|
|
|
} |
|
|
|
|
sort.Slice(got, func(i, j int) bool { |
|
|
|
|
return labels.Compare(got[i].Labels, got[j].Labels) < 0 |
|
|
|
|
}) |
|
|
|
|
restoreTime := baseTime.Add(tt.restoreDuration) |
|
|
|
|
// First eval before restoration.
|
|
|
|
|
newGroup.Eval(context.TODO(), restoreTime) |
|
|
|
|
// Restore happens here.
|
|
|
|
|
newGroup.RestoreForState(restoreTime) |
|
|
|
|
|
|
|
|
|
// Checking if we have restored it correctly.
|
|
|
|
|
switch { |
|
|
|
|
case tst.noRestore: |
|
|
|
|
require.Len(t, got, tst.num) |
|
|
|
|
for _, e := range got { |
|
|
|
|
require.Equal(t, e.ActiveAt, restoreTime) |
|
|
|
|
} |
|
|
|
|
case tst.gracePeriod: |
|
|
|
|
require.Len(t, got, tst.num) |
|
|
|
|
for _, e := range got { |
|
|
|
|
require.Equal(t, opts.ForGracePeriod, e.ActiveAt.Add(alertForDuration).Sub(restoreTime)) |
|
|
|
|
} |
|
|
|
|
default: |
|
|
|
|
exp := tst.alerts |
|
|
|
|
require.Equal(t, len(exp), len(got)) |
|
|
|
|
sortAlerts(exp) |
|
|
|
|
sortAlerts(got) |
|
|
|
|
for i, e := range exp { |
|
|
|
|
require.Equal(t, e.Labels, got[i].Labels) |
|
|
|
|
|
|
|
|
|
// Difference in time should be within 1e6 ns, i.e. 1ms
|
|
|
|
|
// (due to conversion between ns & ms, float64 & int64).
|
|
|
|
|
activeAtDiff := float64(e.ActiveAt.Unix() + int64(tst.downDuration/time.Second) - got[i].ActiveAt.Unix()) |
|
|
|
|
require.Equal(t, 0.0, math.Abs(activeAtDiff), "'for' state restored time is wrong") |
|
|
|
|
got := newRule.ActiveAlerts() |
|
|
|
|
for _, aa := range got { |
|
|
|
|
require.Zero(t, aa.Labels.Get(model.MetricNameLabel), "%s label set on active alert: %s", model.MetricNameLabel, aa.Labels) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
sort.Slice(got, func(i, j int) bool { |
|
|
|
|
return labels.Compare(got[i].Labels, got[j].Labels) < 0 |
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
for _, tst := range tests { |
|
|
|
|
testFunc(tst) |
|
|
|
|
} |
|
|
|
|
// Checking if we have restored it correctly.
|
|
|
|
|
switch { |
|
|
|
|
case tt.noRestore: |
|
|
|
|
require.Len(t, got, tt.num) |
|
|
|
|
for _, e := range got { |
|
|
|
|
require.Equal(t, e.ActiveAt, restoreTime) |
|
|
|
|
} |
|
|
|
|
case tt.gracePeriod: |
|
|
|
|
|
|
|
|
|
// Testing the grace period.
|
|
|
|
|
for _, duration := range []time.Duration{10 * time.Minute, 15 * time.Minute, 20 * time.Minute} { |
|
|
|
|
evalTime := baseTime.Add(duration) |
|
|
|
|
group.Eval(context.TODO(), evalTime) |
|
|
|
|
require.Len(t, got, tt.num) |
|
|
|
|
for _, e := range got { |
|
|
|
|
require.Equal(t, opts.ForGracePeriod, e.ActiveAt.Add(alertForDuration).Sub(restoreTime)) |
|
|
|
|
} |
|
|
|
|
default: |
|
|
|
|
exp := tt.alerts |
|
|
|
|
require.Equal(t, len(exp), len(got)) |
|
|
|
|
sortAlerts(exp) |
|
|
|
|
sortAlerts(got) |
|
|
|
|
for i, e := range exp { |
|
|
|
|
require.Equal(t, e.Labels, got[i].Labels) |
|
|
|
|
|
|
|
|
|
// Difference in time should be within 1e6 ns, i.e. 1ms
|
|
|
|
|
// (due to conversion between ns & ms, float64 & int64).
|
|
|
|
|
activeAtDiff := float64(e.ActiveAt.Unix() + int64(tt.downDuration/time.Second) - got[i].ActiveAt.Unix()) |
|
|
|
|
require.Equal(t, 0.0, math.Abs(activeAtDiff), "'for' state restored time is wrong") |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
}) |
|
|
|
|
} |
|
|
|
|
testFunc(testInput{ |
|
|
|
|
restoreDuration: 25 * time.Minute, |
|
|
|
|
alerts: []*Alert{}, |
|
|
|
|
gracePeriod: true, |
|
|
|
|
num: 2, |
|
|
|
|
}) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func TestStaleness(t *testing.T) { |
|
|
|
|
|