Promtail: retry on 429 rate-limit errors, clarify in the docs the risks of configuring multiple client sections in promtail, and increase the backoff and retry settings in promtail.

Signed-off-by: Edward Welch <edward.welch@grafana.com>
pull/1853/head
Edward Welch 5 years ago committed by Ed Welch
parent 30303c6a71
commit 6841c418ef
  1. 5
      docs/clients/promtail/configuration.md
  2. 4
      pkg/promtail/client/client.go
  3. 29
      pkg/promtail/client/client_test.go
  4. 8
      pkg/promtail/client/config.go

@ -68,6 +68,11 @@ Supported contents and default values of `config.yaml`:
# Describes how Promtail connects to multiple instances # Describes how Promtail connects to multiple instances
# of Loki, sending logs to each. # of Loki, sending logs to each.
# WARNING: If one of the remote Loki servers fails to respond or responds
# with any retryable error, this will impact sending logs to any
# other configured remote Loki servers. Sending is done on a single thread!
# It is generally recommended to run multiple Promtail clients in parallel
# if you want to send to multiple remote Loki instances.
clients: clients:
- [<client_config>] - [<client_config>]

@ -234,8 +234,8 @@ func (c *client) sendBatch(tenantID string, batch *batch) {
return return
} }
// Only retry 500s and connection-level errors. // Only retry 429s, 500s and connection-level errors.
if status > 0 && status/100 != 5 { if status > 0 && status != 429 && status/100 != 5 {
break break
} }

@ -152,6 +152,35 @@ func TestClient_Handle(t *testing.T) {
promtail_sent_entries_total{host="__HOST__"} 0 promtail_sent_entries_total{host="__HOST__"} 0
`, `,
}, },
"do retry sending a batch in case the server responds with a 429": {
clientBatchSize: 10,
clientBatchWait: 10 * time.Millisecond,
clientMaxRetries: 3,
serverResponseStatus: 429,
inputEntries: []entry{logEntries[0]},
expectedReqs: []receivedReq{
{
tenantID: "",
pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
},
{
tenantID: "",
pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
},
{
tenantID: "",
pushReq: logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
},
},
expectedMetrics: `
# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
# TYPE promtail_dropped_entries_total counter
promtail_dropped_entries_total{host="__HOST__"} 1.0
# HELP promtail_sent_entries_total Number of log entries sent to the ingester.
# TYPE promtail_sent_entries_total counter
promtail_sent_entries_total{host="__HOST__"} 0
`,
},
"batch log entries together honoring the client tenant ID": { "batch log entries together honoring the client tenant ID": {
clientBatchSize: 100, clientBatchSize: 100,
clientBatchWait: 100 * time.Millisecond, clientBatchWait: 100 * time.Millisecond,

@ -34,10 +34,10 @@ func (c *Config) RegisterFlags(flags *flag.FlagSet) {
flags.Var(&c.URL, "client.url", "URL of log server") flags.Var(&c.URL, "client.url", "URL of log server")
flags.DurationVar(&c.BatchWait, "client.batch-wait", 1*time.Second, "Maximum wait period before sending batch.") flags.DurationVar(&c.BatchWait, "client.batch-wait", 1*time.Second, "Maximum wait period before sending batch.")
flags.IntVar(&c.BatchSize, "client.batch-size-bytes", 100*1024, "Maximum batch size to accrue before sending. ") flags.IntVar(&c.BatchSize, "client.batch-size-bytes", 100*1024, "Maximum batch size to accrue before sending. ")
// Default backoff schedule: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s (4.267m), for a total time of 511.5s (8.5m) before logs are lost.
flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 5, "Maximum number of retires when sending batches.") flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 10, "Maximum number of retires when sending batches.")
flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 100*time.Millisecond, "Initial backoff time between retries.") flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 500*time.Millisecond, "Initial backoff time between retries.")
flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Second, "Maximum backoff time between retries.") flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Minute, "Maximum backoff time between retries.")
flag.DurationVar(&c.Timeout, "client.timeout", 10*time.Second, "Maximum time to wait for server to respond to a request") flag.DurationVar(&c.Timeout, "client.timeout", 10*time.Second, "Maximum time to wait for server to respond to a request")
flags.Var(&c.ExternalLabels, "client.external-labels", "list of external labels to add to each log (e.g: --client.external-labels=lb1=v1,lb2=v2)") flags.Var(&c.ExternalLabels, "client.external-labels", "list of external labels to add to each log (e.g: --client.external-labels=lb1=v1,lb2=v2)")

Loading…
Cancel
Save