Alerting: Separate write errors from Prometheus/Mimir into 400/500 categories (#94699)

* Separate errors from Prometheus/Mimir into categories

* Drop duplicate

* tests
pull/94595/head
Alexander Weaver 7 months ago committed by GitHub
parent a8c1c15235
commit 4d8d916434
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 36
      pkg/services/ngalert/writer/prom.go
  2. 17
      pkg/services/ngalert/writer/prom_test.go

@ -25,14 +25,18 @@ const backendType = "prometheus"
const ( const (
// Fixed error messages // Fixed error messages
MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp" MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
MimirInvalidLabelError = "err-mimir-label-invalid"
// Best effort error messages // Best effort error messages
PrometheusDuplicateTimestampError = "duplicate sample for timestamp" PrometheusDuplicateTimestampError = "duplicate sample for timestamp"
) )
var ( var (
ErrWriteFailure = errors.New("failed to write time series") // Unexpected, 500-like write errors.
ErrBadFrame = errors.New("failed to read dataframe") ErrUnexpectedWriteFailure = errors.New("failed to write time series")
// Expected, user-level write errors like trying to write an invalid series.
ErrRejectedWrite = errors.New("series was rejected")
ErrBadFrame = errors.New("failed to read dataframe")
) )
var DuplicateTimestampErrors = [...]string{ var DuplicateTimestampErrors = [...]string{
@ -213,10 +217,12 @@ func (w PrometheusWriter) Write(ctx context.Context, name string, t time.Time, f
lvs = append(lvs, fmt.Sprint(res.StatusCode)) lvs = append(lvs, fmt.Sprint(res.StatusCode))
w.metrics.WritesTotal.WithLabelValues(lvs...).Inc() w.metrics.WritesTotal.WithLabelValues(lvs...).Inc()
if err, ignored := checkWriteError(writeErr); err != nil { if writeErr != nil {
return errors.Join(ErrWriteFailure, err) if err, ignored := checkWriteError(writeErr); err != nil {
} else if ignored { return err
l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode) } else if ignored {
l.Debug("Ignored write error", "error", err, "status_code", res.StatusCode)
}
} }
return nil return nil
@ -242,7 +248,12 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
return nil, false return nil, false
} }
// special case for 400 status code // All 500-range statuses are automatically unexpected and not the fault of the data.
if writeErr.StatusCode()/100 == 5 {
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
}
// Special case for 400 status code. 400s may be ignorable in the event of HA writers, or the fault of the written data.
if writeErr.StatusCode() == 400 { if writeErr.StatusCode() == 400 {
msg := writeErr.Error() msg := writeErr.Error()
// HA may potentially write different values for the same timestamp, so we ignore this error // HA may potentially write different values for the same timestamp, so we ignore this error
@ -252,7 +263,16 @@ func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
return nil, true return nil, true
} }
} }
if strings.Contains(msg, MimirInvalidLabelError) {
return errors.Join(ErrRejectedWrite, writeErr), false
}
// For now, all 400s that are not previously known are considered unexpected.
// TODO: Consider blanket-converting all 400s to be known errors. This should only be done once we are confident this is not a problem with this client.
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
} }
return writeErr, false // All other errors which do not fit into the above categories are also unexpected.
return errors.Join(ErrUnexpectedWriteFailure, writeErr), false
} }

@ -166,6 +166,7 @@ func TestPrometheusWriter_Write(t *testing.T) {
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{}) err := writer.Write(ctx, "test", now, frames, 1, map[string]string{})
require.Error(t, err) require.Error(t, err)
require.ErrorIs(t, err, clientErr) require.ErrorIs(t, err, clientErr)
require.ErrorIs(t, err, ErrUnexpectedWriteFailure)
}) })
t.Run("writes expected points", func(t *testing.T) { t.Run("writes expected points", func(t *testing.T) {
@ -204,6 +205,22 @@ func TestPrometheusWriter_Write(t *testing.T) {
}) })
} }
}) })
t.Run("bad labels fit under the client error category", func(t *testing.T) {
msg := MimirInvalidLabelError
clientErr := testClientWriteError{
statusCode: http.StatusBadRequest,
msg: &msg,
}
client.writeSeriesFunc = func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
return promremote.WriteResult{}, clientErr
}
err := writer.Write(ctx, "test", now, frames, 1, map[string]string{"extra": "label"})
require.Error(t, err)
require.ErrorIs(t, err, ErrRejectedWrite)
})
} }
func extractValue(t *testing.T, frames data.Frames, labels map[string]string, frameType data.FrameType) float64 { func extractValue(t *testing.T, frames data.Frames, labels map[string]string, frameType data.FrameType) float64 {

Loading…
Cancel
Save