Alerting: Separate write errors from Prometheus/Mimir into 400/500 categories (#94699)

* Separate errors from Prometheus/Mimir into categories

* Drop duplicate

* tests
pull/94595/head
Alexander Weaver 7 months ago committed by GitHub
parent a8c1c15235
commit 4d8d916434
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 36
      pkg/services/ngalert/writer/prom.go
  2. 17
      pkg/services/ngalert/writer/prom_test.go

@ -25,14 +25,18 @@ const backendType = "prometheus"
const (
// Fixed error messages
MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
MimirInvalidLabelError = "err-mimir-label-invalid"
// Best effort error messages
PrometheusDuplicateTimestampError = "duplicate sample for timestamp"
)
var (
ErrWriteFailure = errors.New("failed to write time series")
ErrBadFrame = errors.New("failed to read dataframe")
// Unexpected, 500-like write errors.
ErrUnexpectedWriteFailure = errors.New("failed to write time series")
// Expected, user-level write errors like trying to write an invalid series.
ErrRejectedWrite = errors.New("series was rejected")
ErrBadFrame = errors.New("failed to read dataframe")
)
var DuplicateTimestampErrors = [...]string{
@ -213,10 +217,12 @@ func (w PrometheusWriter) Write(ctx context.Context, name string, t time.Time, f
lvs = append(lvs, fmt.Sprint(res.StatusCode))
w.metrics.WritesTotal.WithLabelValues(lvs...).Inc()
if err, ignored := checkWriteError(writeErr); err != nil {
return errors.Join(ErrWriteFailure, err)
} else if ignored {
l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode)
if writeErr != nil {
if err, ignored := checkWriteError(writeErr); err != nil {
return err
} else if ignored {
l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode)
}
}
return nil
@ -242,7 +248,12 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
return nil, false
}
// special case for 400 status code
// All 500-range statuses are automatically unexpected and not the fault of the data.
if writeErr.StatusCode()/100 == 5 {
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
}
// Special case for 400 status code. 400s may be ignorable in the event of HA writers, or the fault of the written data.
if writeErr.StatusCode() == 400 {
msg := writeErr.Error()
// HA may potentially write different values for the same timestamp, so we ignore this error
@ -252,7 +263,16 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
return nil, true
}
}
if strings.Contains(msg, MimirInvalidLabelError) {
return errors.Join(ErrRejectedWrite, writeErr), false
}
// For now, all 400s that are not previously known are considered unexpected.
// TODO: Consider blanket-converting all 400s to be known errors. This should only be done once we are confident this is not a problem with this client.
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
}
return writeErr, false
// All other errors which do not fit into the above categories are also unexpected.
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
}

@ -166,6 +166,7 @@ func TestPrometheusWriter_Write(t *testing.T) {
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{})
require.Error(t, err)
require.ErrorIs(t, err, clientErr)
require.ErrorIs(t, err, ErrUnexpectedWriteFailure)
})
t.Run("writes expected points", func(t *testing.T) {
@ -204,6 +205,22 @@ func TestPrometheusWriter_Write(t *testing.T) {
})
}
})
t.Run("bad labels fit under the client error category", func(t *testing.T) {
msg := MimirInvalidLabelError
clientErr := testClientWriteError{
statusCode: http.StatusBadRequest,
msg: &msg,
}
client.writeSeriesFunc = func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
return promremote.WriteResult{}, clientErr
}
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{"extra": "label"})
require.Error(t, err)
require.ErrorIs(t, err, ErrRejectedWrite)
})
}
func extractValue(t *testing.T, frames data.Frames, labels map[string]string, frameType data.FrameType) float64 {

Loading…
Cancel
Save