Promtail: retry 429 rate-limit errors, clarify in the docs the risks of configuring multiple client sections, and increase the default backoff and retry settings.

Signed-off-by: Edward Welch <edward.welch@grafana.com>
5 years ago · 6841c418ef
parent 30303c6a71
commit 6841c418ef
4 changed files with 40 additions and 6 deletions
--- a/docs/clients/promtail/configuration.md
+++ b/docs/clients/promtail/configuration.md
@ -68,6 +68,11 @@ Supported contents and default values of `config.yaml`:
 # Describes how Promtail connects to multiple instances
 # of Loki, sending logs to each.
 # WARNING: If one of the remote Loki servers fails to respond or responds
 # with any retryable error, this also impacts sending logs to all other
 # configured remote Loki servers, because sending is done on a single thread!
 # It is generally recommended to run multiple Promtail instances in parallel
 # if you want to send to multiple remote Loki instances.
 clients:
  - [<client_config>]
--- a/pkg/promtail/client/client.go
+++ b/pkg/promtail/client/client.go
@ -234,8 +234,8 @@ func (c *client) sendBatch(tenantID string, batch *batch) {
 			return
 		}
-		// Only retry 500s and connection-level errors.
+		// Only retry 429s, 5xx responses and connection-level errors.
-		if status > 0 && status/100 != 5 {
+		if status > 0 && status != 429 && status/100 != 5 {
 			break
 		}
--- a/pkg/promtail/client/client_test.go
+++ b/pkg/promtail/client/client_test.go
@ -152,6 +152,35 @@ func TestClient_Handle(t *testing.T) {
 				promtail_sent_entries_total{host="__HOST__"} 0
 			`,
 		},
 		"do retry sending a batch in case the server responds with a 429": {
 			clientBatchSize:      10,
 			clientBatchWait:      10 * time.Millisecond,
 			clientMaxRetries:     3,
 			serverResponseStatus: 429,
 			inputEntries:         []entry{logEntries[0]},
 			expectedReqs: []receivedReq{
 				{
 					tenantID: "",
 					pushReq:  logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
 				},
 				{
 					tenantID: "",
 					pushReq:  logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
 				},
 				{
 					tenantID: "",
 					pushReq:  logproto.PushRequest{Streams: []*logproto.Stream{{Labels: "{}", Entries: []logproto.Entry{logEntries[0].Entry}}}},
 				},
 			},
 			expectedMetrics: `
 				# HELP promtail_dropped_entries_total Number of log entries dropped because failed to be sent to the ingester after all retries.
 				# TYPE promtail_dropped_entries_total counter
 				promtail_dropped_entries_total{host="__HOST__"} 1.0
 				# HELP promtail_sent_entries_total Number of log entries sent to the ingester.
 				# TYPE promtail_sent_entries_total counter
 				promtail_sent_entries_total{host="__HOST__"} 0
 			`,
 		},
 		"batch log entries together honoring the client tenant ID": {
 			clientBatchSize:      100,
 			clientBatchWait:      100 * time.Millisecond,
--- a/pkg/promtail/client/config.go
+++ b/pkg/promtail/client/config.go
@ -34,10 +34,10 @@ func (c *Config) RegisterFlags(flags *flag.FlagSet) {
 	flags.Var(&c.URL, "client.url", "URL of log server")
 	flags.DurationVar(&c.BatchWait, "client.batch-wait", 1*time.Second, "Maximum wait period before sending batch.")
 	flags.IntVar(&c.BatchSize, "client.batch-size-bytes", 100*1024, "Maximum batch size to accrue before sending. ")
-
+	// Default backoff schedule: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s (4.267m), for a total time of 511.5s (8.5m) before logs are lost.
-	flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 5, "Maximum number of retires when sending batches.")
+	flag.IntVar(&c.BackoffConfig.MaxRetries, "client.max-retries", 10, "Maximum number of retires when sending batches.")
-	flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 100*time.Millisecond, "Initial backoff time between retries.")
+	flag.DurationVar(&c.BackoffConfig.MinBackoff, "client.min-backoff", 500*time.Millisecond, "Initial backoff time between retries.")
-	flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Second, "Maximum backoff time between retries.")
+	flag.DurationVar(&c.BackoffConfig.MaxBackoff, "client.max-backoff", 5*time.Minute, "Maximum backoff time between retries.")
 	flag.DurationVar(&c.Timeout, "client.timeout", 10*time.Second, "Maximum time to wait for server to respond to a request")
 	flags.Var(&c.ExternalLabels, "client.external-labels", "list of external labels to add to each log (e.g: --client.external-labels=lb1=v1,lb2=v2)")