From f06cf664e1e213ef6a17e7bb49dfb2b8b764fb46 Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Tue, 30 Jun 2015 11:51:05 +0200 Subject: [PATCH] rules: cleanup alerting test --- rules/manager.go | 48 +++++--- rules/manager_test.go | 278 ++++++++++++++++++------------------------ 2 files changed, 149 insertions(+), 177 deletions(-) diff --git a/rules/manager.go b/rules/manager.go index 59f9573b9f..1e85cf5c58 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -21,6 +21,8 @@ import ( "sync" "time" + html_template "html/template" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/log" @@ -84,7 +86,7 @@ type Rule interface { String() string // HTMLSnippet returns a human-readable string representation of the rule, // decorated with HTML elements for use the web frontend. - HTMLSnippet(pathPrefix string) template.HTML + HTMLSnippet(pathPrefix string) html_template.HTML } // The Manager manages recording and alerting rules. @@ -285,22 +287,42 @@ func (m *Manager) runIteration() { wg.Wait() } +// transferAlertState makes a copy of the state of alerting rules and returns a function +// that restores them in the current state. +func (m *Manager) transferAlertState() func() { + + alertingRules := map[string]*AlertingRule{} + for _, r := range m.rules { + if ar, ok := r.(*AlertingRule); ok { + alertingRules[ar.name] = ar + } + } + + return func() { + // Restore alerting rule state. + for _, r := range m.rules { + ar, ok := r.(*AlertingRule) + if !ok { + continue + } + if old, ok := alertingRules[ar.name]; ok { + ar.activeAlerts = old.activeAlerts + } + } + } +} + // ApplyConfig updates the rule manager's state as the config requires. If // loading the new rules failed the old rule set is restored. Returns true on success. func (m *Manager) ApplyConfig(conf *config.Config) bool { m.Lock() defer m.Unlock() + defer m.transferAlertState()() + success := true m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval) - alertingRules := map[string]*AlertingRule{} - for _, r := range m.rules { - if ar, ok := r.(*AlertingRule); ok { - alertingRules[ar.name] = ar - } - } - rulesSnapshot := make([]Rule, len(m.rules)) copy(rulesSnapshot, m.rules) m.rules = m.rules[:0] @@ -321,16 +343,6 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool { log.Errorf("Error loading rules, previous rule set restored: %s", err) success = false } - // Restore alerting rule state. 
- for _, r := range m.rules { - ar, ok := r.(*AlertingRule) - if !ok { - continue - } - if old, ok := alertingRules[ar.name]; ok { - ar.activeAlerts = old.activeAlerts - } - } return success } diff --git a/rules/manager_test.go b/rules/manager_test.go index d29cc60883..f51899f71f 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -15,6 +15,7 @@ package rules import ( "fmt" + "reflect" "strings" "testing" "time" @@ -22,202 +23,161 @@ import ( clientmodel "github.com/prometheus/client_golang/model" "github.com/prometheus/prometheus/promql" - "github.com/prometheus/prometheus/storage/local" - "github.com/prometheus/prometheus/storage/metric" ) -var ( - testSampleInterval = time.Duration(5) * time.Minute - testStartTime = clientmodel.Timestamp(0) -) - -func getTestValueStream(startVal clientmodel.SampleValue, endVal clientmodel.SampleValue, stepVal clientmodel.SampleValue, startTime clientmodel.Timestamp) (resultValues metric.Values) { - currentTime := startTime - for currentVal := startVal; currentVal <= endVal; currentVal += stepVal { - sample := metric.SamplePair{ - Value: currentVal, - Timestamp: currentTime, - } - resultValues = append(resultValues, sample) - currentTime = currentTime.Add(testSampleInterval) - } - return resultValues -} - -func getTestVectorFromTestMatrix(matrix promql.Matrix) promql.Vector { - vector := promql.Vector{} - for _, sampleStream := range matrix { - lastSample := sampleStream.Values[len(sampleStream.Values)-1] - vector = append(vector, &promql.Sample{ - Metric: sampleStream.Metric, - Value: lastSample.Value, - Timestamp: lastSample.Timestamp, - }) +func TestAlertingRule(t *testing.T) { + suite, err := promql.NewTest(t, ` + load 5m + http_requests{job="api-server", instance="0", group="production"} 0+10x10 + http_requests{job="api-server", instance="1", group="production"} 0+20x10 + http_requests{job="api-server", instance="0", group="canary"} 0+30x10 + http_requests{job="api-server", instance="1", group="canary"} 0+40x10 + http_requests{job="app-server", instance="0", group="production"} 0+50x10 + http_requests{job="app-server", instance="1", group="production"} 0+60x10 + http_requests{job="app-server", instance="0", group="canary"} 0+70x10 + http_requests{job="app-server", instance="1", group="canary"} 0+80x10 + `) + if err != nil { + t.Fatal(err) } - return vector -} + defer suite.Close() -func storeMatrix(storage local.Storage, matrix promql.Matrix) { - pendingSamples := clientmodel.Samples{} - for _, sampleStream := range matrix { - for _, sample := range sampleStream.Values { - pendingSamples = append(pendingSamples, &clientmodel.Sample{ - Metric: sampleStream.Metric.Metric, - Value: sample.Value, - Timestamp: sample.Timestamp, - }) - } - } - for _, s := range pendingSamples { - storage.Append(s) + if err := suite.Run(); err != nil { + t.Fatal(err) } - storage.WaitForIndexing() -} - -func vectorComparisonString(expected []string, actual []string) string { - separator := "\n--------------\n" - return fmt.Sprintf("Expected:%v%v%v\nActual:%v%v%v ", - separator, - strings.Join(expected, "\n"), - separator, - separator, - strings.Join(actual, "\n"), - separator) -} -func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string { - annotatedLines := []string{} - for _, line := range lines { - annotatedLines = append(annotatedLines, fmt.Sprintf(line, timestamp)) + expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`) + if err != nil { + t.Fatalf("Unable to parse alert expression: %s", err) } - 
return annotatedLines -} -var testMatrix = promql.Matrix{ - { - Metric: clientmodel.COWMetric{ - Metric: clientmodel.Metric{ - clientmodel.MetricNameLabel: "http_requests", - clientmodel.JobLabel: "api-server", - "instance": "0", - "group": "canary", - }, - }, - Values: getTestValueStream(0, 300, 30, testStartTime), - }, - { - Metric: clientmodel.COWMetric{ - Metric: clientmodel.Metric{ - clientmodel.MetricNameLabel: "http_requests", - clientmodel.JobLabel: "api-server", - "instance": "1", - "group": "canary", + rule := NewAlertingRule( + "HTTPRequestRateLow", + expr, + time.Minute, + clientmodel.LabelSet{"severity": "critical"}, + "summary", "description", "runbook", + ) + + var tests = []struct { + time time.Duration + result []string + }{ + { + time: 0, + result: []string{ + `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`, }, - }, - Values: getTestValueStream(0, 400, 40, testStartTime), - }, - { - Metric: clientmodel.COWMetric{ - Metric: clientmodel.Metric{ - clientmodel.MetricNameLabel: "http_requests", - clientmodel.JobLabel: "app-server", - "instance": "0", - "group": "canary", + }, { + time: 5 * time.Minute, + result: []string{ + `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`, }, - }, - Values: getTestValueStream(0, 700, 70, testStartTime), - }, - { - Metric: clientmodel.COWMetric{ - Metric: clientmodel.Metric{ - clientmodel.MetricNameLabel: "http_requests", - clientmodel.JobLabel: "app-server", - "instance": "1", - "group": "canary", + }, { + time: 10 * time.Minute, + result: []string{ + `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`, + `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`, }, }, - Values: getTestValueStream(0, 800, 80, testStartTime), - }, -} - -func TestAlertingRule(t *testing.T) { - // Labels in expected output need to be alphabetically sorted. 
- var evalOutputs = [][]string{ { - `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`, + time: 15 * time.Minute, + result: nil, }, { - `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`, + time: 20 * time.Minute, + result: nil, }, - { - `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`, - `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`, - }, - { - /* empty */ - }, - { - /* empty */ - }, - } - - storage, closer := local.NewTestStorage(t, 1) - defer closer.Close() - - storeMatrix(storage, testMatrix) - - engine := promql.NewEngine(storage, nil) - defer engine.Stop() - - expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`) - if err != nil { - t.Fatalf("Unable to parse alert expression: %s", err) } - alertLabels := clientmodel.LabelSet{ - "severity": "critical", - } - rule := NewAlertingRule("HttpRequestRateLow", expr, time.Minute, alertLabels, "summary", "description", "runbook") - - for i, expectedLines := range evalOutputs { - evalTime := testStartTime.Add(testSampleInterval * time.Duration(i)) + for i, test := range tests { + evalTime := clientmodel.Timestamp(0).Add(test.time) - res, err := rule.eval(evalTime, engine) + res, err := rule.eval(evalTime, suite.QueryEngine()) if err != nil { t.Fatalf("Error during alerting rule evaluation: %s", err) } - actualLines := strings.Split(res.String(), "\n") - expectedLines := annotateWithTime(expectedLines, evalTime) - if actualLines[0] == "" { - actualLines = []string{} + actual := strings.Split(res.String(), "\n") + expected := annotateWithTime(test.result, evalTime) + if actual[0] == "" { + actual = []string{} } - failed := false - if len(actualLines) != len(expectedLines) { - t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expectedLines), len(actualLines)) - failed = true + if len(actual) != len(expected) { + t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expected), len(actual)) } - for j, expectedSample := range expectedLines { + for j, expectedSample := range expected { found := false - for _, actualSample := range actualLines { + for _, actualSample := range actual { if actualSample == expectedSample { found = true } } if !found { t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample) - failed = true } } - if failed { - t.Fatalf("%d. Expected and actual outputs don't match:\n%v", i, vectorComparisonString(expectedLines, actualLines)) + if t.Failed() { + t.Errorf("%d. 
Expected and actual outputs don't match:", i) + t.Fatalf("Expected:\n%v\n----\nActual:\n%v", strings.Join(expected, "\n"), strings.Join(actual, "\n")) } } } + +func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string { + annotatedLines := []string{} + for _, line := range lines { + annotatedLines = append(annotatedLines, fmt.Sprintf(line, timestamp)) + } + return annotatedLines +} + +func TestTransferAlertState(t *testing.T) { + m := NewManager(&ManagerOptions{}) + + alert := &Alert{ + Name: "testalert", + State: StateFiring, + } + + arule := AlertingRule{ + name: "test", + activeAlerts: map[clientmodel.Fingerprint]*Alert{}, + } + aruleCopy := arule + + m.rules = append(m.rules, &arule) + + // Set an alert. + arule.activeAlerts[0] = alert + + // Save state and get the restore function. + restore := m.transferAlertState() + + // Remove arule from the rule list and add an unrelated rule and the + // stateless copy of arule. + m.rules = []Rule{ + &AlertingRule{ + name: "test_other", + activeAlerts: map[clientmodel.Fingerprint]*Alert{}, + }, + &aruleCopy, + } + + // Apply the restore function. + restore() + + if ar := m.rules[0].(*AlertingRule); len(ar.activeAlerts) != 0 { + t.Fatalf("unexpected alert for unrelated alerting rule") + } + if ar := m.rules[1].(*AlertingRule); !reflect.DeepEqual(ar.activeAlerts[0], alert) { + t.Fatalf("alert state was not restored") + } +}
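
A note for readers of the manager.go hunk above: the change hinges on a closure that captures per-rule alert state and on the defer m.transferAlertState()() double call, which takes the snapshot immediately but runs the restore only when ApplyConfig returns. Below is a minimal, self-contained sketch of that pattern; it is not part of the patch, and miniManager, miniRule, and their fields are illustrative stand-ins rather than the Prometheus types.

package main

import "fmt"

// miniRule and miniManager are illustrative stand-ins, not the Prometheus types.
type miniRule struct {
	name   string
	active map[int]string // stands in for activeAlerts
}

type miniManager struct {
	rules []*miniRule
}

// snapshotState copies per-rule state and returns a closure that writes it
// back onto whatever rules the manager holds when the closure runs.
func (m *miniManager) snapshotState() func() {
	saved := map[string]map[int]string{}
	for _, r := range m.rules {
		saved[r.name] = r.active
	}
	return func() {
		for _, r := range m.rules {
			if old, ok := saved[r.name]; ok {
				r.active = old
			}
		}
	}
}

// applyConfig mimics ApplyConfig: the snapshot is taken immediately (the first
// call), and the restore runs when the function returns (the deferred second
// call, hence the ()() double call).
func (m *miniManager) applyConfig() {
	defer m.snapshotState()()

	// Reloading replaces the rule objects, losing their in-memory state.
	m.rules = []*miniRule{{name: "test", active: map[int]string{}}}
}

func main() {
	m := &miniManager{rules: []*miniRule{
		{name: "test", active: map[int]string{0: "firing"}},
	}}
	m.applyConfig()
	fmt.Println(m.rules[0].active) // map[0:firing], the firing alert survived the reload
}

Because the snapshot keys the saved state by rule name, state only carries over to reloaded rules that keep the same name, which is exactly what TestTransferAlertState above asserts for the "test" and "test_other" rules.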
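
The manager_test.go rewrite also moves TestAlertingRule to the common table-driven layout: a single tests slice pairing an evaluation offset with the expected ALERTS samples, walked by one loop. A stripped-down skeleton of that shape, with a made-up doubleIt function standing in for the rule evaluation:

package example

import "testing"

// doubleIt is a made-up stand-in for the unit under test.
func doubleIt(n int) int { return 2 * n }

func TestDoubleIt(t *testing.T) {
	var tests = []struct {
		in   int
		want int
	}{
		{in: 0, want: 0},
		{in: 2, want: 4},
		{in: 5, want: 10},
	}

	for i, test := range tests {
		if got := doubleIt(test.in); got != test.want {
			t.Errorf("%d. doubleIt(%d) = %d, want %d", i, test.in, got, test.want)
		}
	}
}

The real test additionally renders each case's expected strings with annotateWithTime before comparing, but the control flow is the same.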