mirror of https://github.com/grafana/grafana
Contact point testing (#37308)
This commit adds contact point testing to ngalert via a new API endpoint. This endpoint accepts JSON containing a list of receiver configurations, which are validated and then tested by sending a notification for a test alert. The endpoint returns JSON for each receiver with a status and an error message. It accepts a configurable timeout via the Request-Timeout header (in seconds), up to a maximum of 30 seconds.
pull/37475/head
parent
afabc617ed
commit
3ca00f90b5
@ -0,0 +1,140 @@ |
||||
package api |
||||
|
||||
import ( |
||||
"context" |
||||
"net/http" |
||||
"testing" |
||||
"time" |
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/notifier" |
||||
"github.com/stretchr/testify/require" |
||||
) |
||||
|
||||
func TestContextWithTimeoutFromRequest(t *testing.T) { |
||||
t.Run("assert context has default timeout when header is absent", func(t *testing.T) { |
||||
req, err := http.NewRequest(http.MethodGet, "https://grafana.net", nil) |
||||
require.NoError(t, err) |
||||
|
||||
now := time.Now() |
||||
ctx := context.Background() |
||||
ctx, cancelFunc, err := contextWithTimeoutFromRequest( |
||||
ctx, |
||||
req, |
||||
15*time.Second, |
||||
30*time.Second) |
||||
require.NoError(t, err) |
||||
require.NotNil(t, cancelFunc) |
||||
require.NotNil(t, ctx) |
||||
|
||||
deadline, ok := ctx.Deadline() |
||||
require.True(t, ok) |
||||
require.True(t, deadline.After(now)) |
||||
require.Less(t, deadline.Sub(now).Seconds(), 30.0) |
||||
require.GreaterOrEqual(t, deadline.Sub(now).Seconds(), 15.0) |
||||
}) |
||||
|
||||
t.Run("assert context has timeout in request header", func(t *testing.T) { |
||||
req, err := http.NewRequest(http.MethodGet, "https://grafana.net", nil) |
||||
require.NoError(t, err) |
||||
req.Header.Set("Request-Timeout", "5") |
||||
|
||||
now := time.Now() |
||||
ctx := context.Background() |
||||
ctx, cancelFunc, err := contextWithTimeoutFromRequest( |
||||
ctx, |
||||
req, |
||||
15*time.Second, |
||||
30*time.Second) |
||||
require.NoError(t, err) |
||||
require.NotNil(t, cancelFunc) |
||||
require.NotNil(t, ctx) |
||||
|
||||
deadline, ok := ctx.Deadline() |
||||
require.True(t, ok) |
||||
require.True(t, deadline.After(now)) |
||||
require.Less(t, deadline.Sub(now).Seconds(), 15.0) |
||||
require.GreaterOrEqual(t, deadline.Sub(now).Seconds(), 5.0) |
||||
}) |
||||
|
||||
t.Run("assert timeout in request header cannot exceed max timeout", func(t *testing.T) { |
||||
req, err := http.NewRequest(http.MethodGet, "https://grafana.net", nil) |
||||
require.NoError(t, err) |
||||
req.Header.Set("Request-Timeout", "60") |
||||
|
||||
ctx := context.Background() |
||||
ctx, cancelFunc, err := contextWithTimeoutFromRequest( |
||||
ctx, |
||||
req, |
||||
15*time.Second, |
||||
30*time.Second) |
||||
require.Error(t, err, "exceeded maximum timeout") |
||||
require.Nil(t, cancelFunc) |
||||
require.Nil(t, ctx) |
||||
}) |
||||
} |
||||
|
||||
func TestStatusForTestReceivers(t *testing.T) { |
||||
t.Run("assert HTTP 400 Status Bad Request for no receivers", func(t *testing.T) { |
||||
require.Equal(t, http.StatusBadRequest, statusForTestReceivers([]notifier.TestReceiverResult{})) |
||||
}) |
||||
|
||||
t.Run("assert HTTP 400 Bad Request when all invalid receivers", func(t *testing.T) { |
||||
require.Equal(t, http.StatusBadRequest, statusForTestReceivers([]notifier.TestReceiverResult{{ |
||||
Name: "test1", |
||||
Configs: []notifier.TestReceiverConfigResult{{ |
||||
Name: "test1", |
||||
UID: "uid1", |
||||
Status: "failed", |
||||
Error: notifier.InvalidReceiverError{}, |
||||
}}, |
||||
}, { |
||||
Name: "test2", |
||||
Configs: []notifier.TestReceiverConfigResult{{ |
||||
Name: "test2", |
||||
UID: "uid2", |
||||
Status: "failed", |
||||
Error: notifier.InvalidReceiverError{}, |
||||
}}, |
||||
}})) |
||||
}) |
||||
|
||||
t.Run("assert HTTP 408 Request Timeout when all receivers timed out", func(t *testing.T) { |
||||
require.Equal(t, http.StatusRequestTimeout, statusForTestReceivers([]notifier.TestReceiverResult{{ |
||||
Name: "test1", |
||||
Configs: []notifier.TestReceiverConfigResult{{ |
||||
Name: "test1", |
||||
UID: "uid1", |
||||
Status: "failed", |
||||
Error: notifier.ReceiverTimeoutError{}, |
||||
}}, |
||||
}, { |
||||
Name: "test2", |
||||
Configs: []notifier.TestReceiverConfigResult{{ |
||||
Name: "test2", |
||||
UID: "uid2", |
||||
Status: "failed", |
||||
Error: notifier.ReceiverTimeoutError{}, |
||||
}}, |
||||
}})) |
||||
}) |
||||
|
||||
t.Run("assert 207 Multi Status for different errors", func(t *testing.T) { |
||||
require.Equal(t, http.StatusMultiStatus, statusForTestReceivers([]notifier.TestReceiverResult{{ |
||||
Name: "test1", |
||||
Configs: []notifier.TestReceiverConfigResult{{ |
||||
Name: "test1", |
||||
UID: "uid1", |
||||
Status: "failed", |
||||
Error: notifier.InvalidReceiverError{}, |
||||
}}, |
||||
}, { |
||||
Name: "test2", |
||||
Configs: []notifier.TestReceiverConfigResult{{ |
||||
Name: "test2", |
||||
UID: "uid2", |
||||
Status: "failed", |
||||
Error: notifier.ReceiverTimeoutError{}, |
||||
}}, |
||||
}})) |
||||
}) |
||||
} |
||||
@ -0,0 +1,227 @@ |
||||
package notifier |
||||
|
||||
import ( |
||||
"context" |
||||
"errors" |
||||
"fmt" |
||||
"net/url" |
||||
"time" |
||||
|
||||
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" |
||||
"github.com/prometheus/alertmanager/notify" |
||||
"github.com/prometheus/alertmanager/types" |
||||
"github.com/prometheus/common/model" |
||||
"golang.org/x/sync/errgroup" |
||||
) |
||||
|
||||
// maxTestReceiversWorkers is the upper bound on the number of goroutines
// that send test notifications concurrently.
const maxTestReceiversWorkers = 10

// ErrNoReceivers is returned when a test request contains no receiver
// integrations at all.
var ErrNoReceivers = errors.New("no receivers")
||||
|
||||
// TestReceiversResult is the outcome of testing a set of receivers.
type TestReceiversResult struct {
	// Receivers holds one entry per tested receiver.
	Receivers []TestReceiverResult
	// NotifedAt is the time the test was started. The spelling is kept
	// as-is because the field is part of the exported API.
	NotifedAt time.Time
}

// TestReceiverResult groups the results of all integrations that belong to a
// single named receiver.
type TestReceiverResult struct {
	// Name is the name of the receiver.
	Name string
	// Configs holds the result of each integration of the receiver.
	Configs []TestReceiverConfigResult
}

// TestReceiverConfigResult is the result of testing a single integration.
type TestReceiverConfigResult struct {
	// Name is the name of the integration config.
	Name string
	// UID is the UID of the integration config.
	UID string
	// Status is "ok" when the test succeeded and "failed" otherwise.
	Status string
	// Error is nil when the test succeeded.
	Error error
}
||||
|
||||
type InvalidReceiverError struct { |
||||
Receiver *apimodels.PostableGrafanaReceiver |
||||
Err error |
||||
} |
||||
|
||||
func (e InvalidReceiverError) Error() string { |
||||
return fmt.Sprintf("the receiver is invalid: %s", e.Err) |
||||
} |
||||
|
||||
type ReceiverTimeoutError struct { |
||||
Receiver *apimodels.PostableGrafanaReceiver |
||||
Err error |
||||
} |
||||
|
||||
func (e ReceiverTimeoutError) Error() string { |
||||
return fmt.Sprintf("the receiver timed out: %s", e.Err) |
||||
} |
||||
|
||||
func (am *Alertmanager) TestReceivers(ctx context.Context, c apimodels.TestReceiversConfigParams) (*TestReceiversResult, error) { |
||||
// now represents the start time of the test
|
||||
now := time.Now() |
||||
testAlert := &types.Alert{ |
||||
Alert: model.Alert{ |
||||
Labels: model.LabelSet{ |
||||
model.LabelName("alertname"): "TestAlertAlwaysFiring", |
||||
model.LabelName("instance"): "Grafana", |
||||
}, |
||||
Annotations: model.LabelSet{ |
||||
model.LabelName("summary"): "TestAlertAlwaysFiring", |
||||
model.LabelName("description"): "This is a test alert from Grafana", |
||||
}, |
||||
StartsAt: now, |
||||
}, |
||||
UpdatedAt: now, |
||||
} |
||||
|
||||
// we must set a group key that is unique per test as some receivers use this key to deduplicate alerts
|
||||
ctx = notify.WithGroupKey(ctx, testAlert.Labels.String()+now.String()) |
||||
|
||||
tmpl, err := am.getTemplate() |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to get template: %w", err) |
||||
} |
||||
|
||||
// job contains all metadata required to test a receiver
|
||||
type job struct { |
||||
Config *apimodels.PostableGrafanaReceiver |
||||
ReceiverName string |
||||
Notifier notify.Notifier |
||||
} |
||||
|
||||
// result contains the receiver that was tested and an error that is non-nil if the test failed
|
||||
type result struct { |
||||
Config *apimodels.PostableGrafanaReceiver |
||||
ReceiverName string |
||||
Error error |
||||
} |
||||
|
||||
newTestReceiversResult := func(results []result, notifiedAt time.Time) *TestReceiversResult { |
||||
m := make(map[string]TestReceiverResult) |
||||
for _, receiver := range c.Receivers { |
||||
// set up the result for this receiver
|
||||
m[receiver.Name] = TestReceiverResult{ |
||||
Name: receiver.Name, |
||||
// A Grafana receiver can have multiple nested receivers
|
||||
Configs: make([]TestReceiverConfigResult, 0, len(receiver.GrafanaManagedReceivers)), |
||||
} |
||||
} |
||||
for _, next := range results { |
||||
tmp := m[next.ReceiverName] |
||||
status := "ok" |
||||
if next.Error != nil { |
||||
status = "failed" |
||||
} |
||||
tmp.Configs = append(tmp.Configs, TestReceiverConfigResult{ |
||||
Name: next.Config.Name, |
||||
UID: next.Config.UID, |
||||
Status: status, |
||||
Error: processNotifierError(next.Config, next.Error), |
||||
}) |
||||
m[next.ReceiverName] = tmp |
||||
} |
||||
v := new(TestReceiversResult) |
||||
v.Receivers = make([]TestReceiverResult, 0, len(c.Receivers)) |
||||
v.NotifedAt = notifiedAt |
||||
for _, next := range m { |
||||
v.Receivers = append(v.Receivers, next) |
||||
} |
||||
return v |
||||
} |
||||
|
||||
// invalid keeps track of all invalid receiver configurations
|
||||
invalid := make([]result, 0, len(c.Receivers)) |
||||
// jobs keeps track of all receivers that need to be sent test notifications
|
||||
jobs := make([]job, 0, len(c.Receivers)) |
||||
|
||||
for _, receiver := range c.Receivers { |
||||
for _, next := range receiver.GrafanaManagedReceivers { |
||||
n, err := am.buildReceiverIntegration(next, tmpl) |
||||
if err != nil { |
||||
invalid = append(invalid, result{ |
||||
Config: next, |
||||
ReceiverName: next.Name, |
||||
Error: err, |
||||
}) |
||||
} else { |
||||
jobs = append(jobs, job{ |
||||
Config: next, |
||||
ReceiverName: receiver.Name, |
||||
Notifier: n, |
||||
}) |
||||
} |
||||
} |
||||
} |
||||
|
||||
if len(invalid)+len(jobs) == 0 { |
||||
return nil, ErrNoReceivers |
||||
} |
||||
|
||||
if len(jobs) == 0 { |
||||
return newTestReceiversResult(invalid, now), nil |
||||
} |
||||
|
||||
numWorkers := maxTestReceiversWorkers |
||||
if numWorkers > len(jobs) { |
||||
numWorkers = len(jobs) |
||||
} |
||||
|
||||
resultCh := make(chan result, len(jobs)) |
||||
workCh := make(chan job, len(jobs)) |
||||
for _, job := range jobs { |
||||
workCh <- job |
||||
} |
||||
close(workCh) |
||||
|
||||
g, ctx := errgroup.WithContext(ctx) |
||||
for i := 0; i < numWorkers; i++ { |
||||
g.Go(func() error { |
||||
for next := range workCh { |
||||
v := result{ |
||||
Config: next.Config, |
||||
ReceiverName: next.ReceiverName, |
||||
} |
||||
if _, err := next.Notifier.Notify(ctx, testAlert); err != nil { |
||||
v.Error = err |
||||
} |
||||
resultCh <- v |
||||
} |
||||
return nil |
||||
}) |
||||
} |
||||
g.Wait() // nolint
|
||||
close(resultCh) |
||||
|
||||
results := make([]result, 0, len(jobs)) |
||||
for next := range resultCh { |
||||
results = append(results, next) |
||||
} |
||||
|
||||
return newTestReceiversResult(append(invalid, results...), now), nil |
||||
} |
||||
|
||||
func processNotifierError(config *apimodels.PostableGrafanaReceiver, err error) error { |
||||
if err == nil { |
||||
return nil |
||||
} |
||||
|
||||
var urlError *url.Error |
||||
if errors.As(err, &urlError) { |
||||
if urlError.Timeout() { |
||||
return ReceiverTimeoutError{ |
||||
Receiver: config, |
||||
Err: err, |
||||
} |
||||
} |
||||
} |
||||
|
||||
if errors.Is(err, context.DeadlineExceeded) { |
||||
return ReceiverTimeoutError{ |
||||
Receiver: config, |
||||
Err: err, |
||||
} |
||||
} |
||||
|
||||
return err |
||||
} |
||||
@ -0,0 +1,82 @@ |
||||
package notifier |
||||
|
||||
import ( |
||||
"context" |
||||
"errors" |
||||
"net/url" |
||||
"testing" |
||||
|
||||
"github.com/stretchr/testify/require" |
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" |
||||
) |
||||
|
||||
func TestInvalidReceiverError_Error(t *testing.T) { |
||||
e := InvalidReceiverError{ |
||||
Receiver: &definitions.PostableGrafanaReceiver{ |
||||
Name: "test", |
||||
UID: "uid", |
||||
}, |
||||
Err: errors.New("this is an error"), |
||||
} |
||||
require.Equal(t, "the receiver is invalid: this is an error", e.Error()) |
||||
} |
||||
|
||||
func TestReceiverTimeoutError_Error(t *testing.T) { |
||||
e := ReceiverTimeoutError{ |
||||
Receiver: &definitions.PostableGrafanaReceiver{ |
||||
Name: "test", |
||||
UID: "uid", |
||||
}, |
||||
Err: errors.New("context deadline exceeded"), |
||||
} |
||||
require.Equal(t, "the receiver timed out: context deadline exceeded", e.Error()) |
||||
} |
||||
|
||||
// timeoutError is a test double for an error that reports itself as a timeout.
type timeoutError struct{}

// Error implements the error interface with a fixed message.
func (timeoutError) Error() string { return "the request timed out" }

// Timeout always reports true.
func (timeoutError) Timeout() bool { return true }
||||
|
||||
func TestProcessNotifierError(t *testing.T) { |
||||
t.Run("assert ReceiverTimeoutError is returned for context deadline exceeded", func(t *testing.T) { |
||||
r := &definitions.PostableGrafanaReceiver{ |
||||
Name: "test", |
||||
UID: "uid", |
||||
} |
||||
require.Equal(t, ReceiverTimeoutError{ |
||||
Receiver: r, |
||||
Err: context.DeadlineExceeded, |
||||
}, processNotifierError(r, context.DeadlineExceeded)) |
||||
}) |
||||
|
||||
t.Run("assert ReceiverTimeoutError is returned for *url.Error timeout", func(t *testing.T) { |
||||
r := &definitions.PostableGrafanaReceiver{ |
||||
Name: "test", |
||||
UID: "uid", |
||||
} |
||||
urlError := &url.Error{ |
||||
Op: "Get", |
||||
URL: "https://grafana.net", |
||||
Err: timeoutError{}, |
||||
} |
||||
require.Equal(t, ReceiverTimeoutError{ |
||||
Receiver: r, |
||||
Err: urlError, |
||||
}, processNotifierError(r, urlError)) |
||||
}) |
||||
|
||||
t.Run("assert unknown error is returned unmodified", func(t *testing.T) { |
||||
r := &definitions.PostableGrafanaReceiver{ |
||||
Name: "test", |
||||
UID: "uid", |
||||
} |
||||
err := errors.New("this is an error") |
||||
require.Equal(t, err, processNotifierError(r, err)) |
||||
}) |
||||
} |
||||
Loading…
Reference in new issue