Alerting: Handle connection errors in remote writer as expected (i.e. user) errors (#105687)

pull/105709/head
William Wernert 1 week ago committed by GitHub
parent d0d941fe8d
commit 0208231ed6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 18
      pkg/services/ngalert/writer/prom.go
  2. 32
      pkg/services/ngalert/writer/prom_test.go

@ -25,6 +25,11 @@ import (
const backendType = "prometheus"
const (
// Network error strings
networkErrDialTCP = "dial tcp"
networkErrConnectionRefused = "connection refused"
networkErrNoSuchHost = "no such host"
// NOTE: Mimir errors were copied from globalerror package:
// https://github.com/grafana/mimir/blob/1ff367ef58987cd1941de03a8d6923fde82dfdd3/pkg/util/globalerror/user.go
// Variable names have been standardized as Mimir+{globalerror.ID}+Error for consistency
@ -115,6 +120,7 @@ var (
ErrBadFrame = errors.New("failed to read dataframe")
ErrDatasourceUnauthorized = errors.New("failed to authenticate in datasource")
ErrDatasourceForbidden = errors.New("failed to authorize in datasource")
ErrConnectionFailure = errors.New("failed to connect to remote write endpoint")
// IgnoredErrors don't cause the Write to fail, but are still logged.
IgnoredErrors = []string{
@ -404,11 +410,19 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
return nil, false
}
// Network errors can only be detected from the error string, since the write error can't be unwrapped
errString := writeErr.Error()
if strings.Contains(errString, networkErrDialTCP) ||
strings.Contains(errString, networkErrConnectionRefused) ||
strings.Contains(errString, networkErrNoSuchHost) {
return fmt.Errorf("%w: %v", ErrConnectionFailure, errString), false
}
// Most 500-range statuses are automatically unexpected and not the fault of the data.
if writeErr.StatusCode()/100 == 5 {
// Mimir returns some errors as 500s that arguably should not be considered as such,
// e.g. `multiple org IDs present`. Handle those separately to make sure they're treated as exceptions.
if strings.Contains(writeErr.Error(), MimirErrTooManyOrgIDs) {
if strings.Contains(errString, MimirErrTooManyOrgIDs) {
return errors.Join(ErrRejectedWrite, writeErr), true
}
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
@ -416,7 +430,7 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
// Special case for 400 status code. 400s may be ignorable in the event of HA writers, or the fault of the written data.
if writeErr.StatusCode() == 400 {
msg := writeErr.Error()
msg := errString
// HA may potentially write different values for the same timestamp, so we ignore this error
// TODO: this may not be needed; further testing is required
for _, e := range IgnoredErrors {

@ -4,6 +4,7 @@ import (
"context"
"math"
"math/rand/v2"
"net"
"net/http"
"reflect"
"slices"
@ -171,6 +172,27 @@ func TestPrometheusWriter_Write(t *testing.T) {
require.ErrorIs(t, err, ErrUnexpectedWriteFailure)
})
t.Run("handle connection failures", func(t *testing.T) {
dnsErr := &net.DNSError{
Err: "no such host",
Name: "host.example.com",
Server: "10.0.0.1:53",
IsTimeout: false,
IsNotFound: true,
}
client.writeSeriesFunc = func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
return promremote.WriteResult{}, testClientWriteError{
statusCode: 0,
err: dnsErr,
}
}
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{})
require.Error(t, err)
require.ErrorIs(t, err, ErrConnectionFailure)
require.Contains(t, err.Error(), dnsErr.Error())
})
t.Run("writes expected points", func(t *testing.T) {
client.writeSeriesFunc = func(ctx context.Context, tslist promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
require.Len(t, tslist, len(series))
@ -539,6 +561,7 @@ func (c *testClient) WriteTimeSeries(
// testClientWriteError is a write-error value used by the tests in this file.
// It carries a status code together with an optional message and an optional
// underlying error, either of which can supply the error text.
type testClientWriteError struct {
// statusCode is the status attached to the error; the connection-failure
// test sets it to 0. NOTE(review): presumably returned by StatusCode() — confirm.
statusCode int
// msg, when non-nil, is returned as the error text if err is not set.
msg *string
// err, when non-nil, supplies the error text via err.Error().
err error
}
func (e testClientWriteError) StatusCode() int {
@ -546,8 +569,11 @@ func (e testClientWriteError) StatusCode() int {
}
func (e testClientWriteError) Error() string {
if e.msg == nil {
return "test error"
if e.err != nil {
return e.err.Error()
}
if e.msg != nil {
return *e.msg
}
return *e.msg
return "test client error"
}

Loading…
Cancel
Save