Promtail: (and also fluent-bit) change the max batch size to 1MB (#2710)

* change the max batch size to 1MB for all the defaults including helm and fluent-bit, attempt to centralize this config a little where possible.
* fix test
5 years ago · d3bf21e774
parent 9e6afea5f6
commit d3bf21e774
8 changed files with 44 additions and 34 deletions
--- a/cmd/docker-driver/config.go
+++ b/cmd/docker-driver/config.go
@@ -67,14 +67,14 @@ const (

 var (
 	defaultClientConfig = client.Config{
-		BatchWait: 1 * time.Second,
-		BatchSize: 100 * 1024,
+		BatchWait: client.BatchWait,
+		BatchSize: client.BatchSize,
 		BackoffConfig: cortex_util.BackoffConfig{
-			MinBackoff: 100 * time.Millisecond,
-			MaxBackoff: 10 * time.Second,
-			MaxRetries: 10,
+			MinBackoff: client.MinBackoff,
+			MaxBackoff: client.MaxBackoff,
+			MaxRetries: client.MaxRetries,
 		},
-		Timeout: 10 * time.Second,
+		Timeout: client.Timeout,
 	}
 )

@@ -242,8 +242,8 @@ func parseConfig(logCtx logger.Info) (*config, error) {

 	// other labels coming from docker labels or env selected by user labels, labels-regex, env, env-regex config.
 	attrs, err := logCtx.ExtraAttributes(func(label string) string {
-                return strings.ReplaceAll(strings.ReplaceAll(label, "-", "_"), ".", "_")
-        })
+		return strings.ReplaceAll(strings.ReplaceAll(label, "-", "_"), ".", "_")
+	})
 	if err != nil {
 		return nil, err
 	}
--- a/pkg/promtail/client/config.go
+++ b/pkg/promtail/client/config.go
@@ -11,6 +11,16 @@ import (
 	lokiflag "github.com/grafana/loki/pkg/util/flagext"
 )

+// NOTE the helm chart for promtail and fluent-bit also have defaults for these values, please update to match if you make changes here.
+const (
+	BatchWait      = 1 * time.Second
+	BatchSize  int = 1024 * 1024
+	MinBackoff     = 500 * time.Millisecond
+	MaxBackoff     = 5 * time.Minute
+	MaxRetries int = 10
+	Timeout        = 10 * time.Second
+)
+
 // Config describes configuration for a HTTP pusher client.
 type Config struct {
 	URL       flagext.URLValue
@@ -33,13 +43,13 @@ type Config struct {
 // prefix. If prefix is a non-empty string, prefix should end with a period.
 func (c *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
 	f.Var(&c.URL, prefix+"client.url", "URL of log server")
-	f.DurationVar(&c.BatchWait, prefix+"client.batch-wait", 1*time.Second, "Maximum wait period before sending batch.")
-	f.IntVar(&c.BatchSize, prefix+"client.batch-size-bytes", 1024*1024, "Maximum batch size to accrue before sending. ")
+	f.DurationVar(&c.BatchWait, prefix+"client.batch-wait", BatchWait, "Maximum wait period before sending batch.")
+	f.IntVar(&c.BatchSize, prefix+"client.batch-size-bytes", BatchSize, "Maximum batch size to accrue before sending. ")
 	// Default backoff schedule: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s(4.267m) For a total time of 511.5s(8.5m) before logs are lost
-	f.IntVar(&c.BackoffConfig.MaxRetries, prefix+"client.max-retries", 10, "Maximum number of retires when sending batches.")
-	f.DurationVar(&c.BackoffConfig.MinBackoff, prefix+"client.min-backoff", 500*time.Millisecond, "Initial backoff time between retries.")
-	f.DurationVar(&c.BackoffConfig.MaxBackoff, prefix+"client.max-backoff", 5*time.Minute, "Maximum backoff time between retries.")
-	f.DurationVar(&c.Timeout, prefix+"client.timeout", 10*time.Second, "Maximum time to wait for server to respond to a request")
+	f.IntVar(&c.BackoffConfig.MaxRetries, prefix+"client.max-retries", MaxRetries, "Maximum number of retires when sending batches.")
+	f.DurationVar(&c.BackoffConfig.MinBackoff, prefix+"client.min-backoff", MinBackoff, "Initial backoff time between retries.")
+	f.DurationVar(&c.BackoffConfig.MaxBackoff, prefix+"client.max-backoff", MaxBackoff, "Maximum backoff time between retries.")
+	f.DurationVar(&c.Timeout, prefix+"client.timeout", Timeout, "Maximum time to wait for server to respond to a request")
 	f.Var(&c.ExternalLabels, prefix+"client.external-labels", "list of external labels to add to each log (e.g: --client.external-labels=lb1=v1,lb2=v2)")

 	f.StringVar(&c.TenantID, prefix+"client.tenant-id", "", "Tenant ID to use when pushing logs to Loki.")
@@ -61,13 +71,13 @@ func (c *Config) UnmarshalYAML(unmarshal func(interface{}) error) error {
 		// force sane defaults.
 		cfg = raw{
 			BackoffConfig: util.BackoffConfig{
-				MaxBackoff: 5 * time.Minute,
-				MaxRetries: 10,
-				MinBackoff: 500 * time.Millisecond,
+				MaxBackoff: MaxBackoff,
+				MaxRetries: MaxRetries,
+				MinBackoff: MinBackoff,
 			},
-			BatchSize: 100 * 1024,
-			BatchWait: 1 * time.Second,
-			Timeout:   10 * time.Second,
+			BatchSize: BatchSize,
+			BatchWait: BatchWait,
+			Timeout:   Timeout,
 		}
 	}

--- a/pkg/promtail/client/config_test.go
+++ b/pkg/promtail/client/config_test.go
@@ -44,13 +44,13 @@ func Test_Config(t *testing.T) {
 					URL: u,
 				},
 				BackoffConfig: util.BackoffConfig{
-					MaxBackoff: 5 * time.Minute,
-					MaxRetries: 10,
-					MinBackoff: 500 * time.Millisecond,
+					MaxBackoff: MaxBackoff,
+					MaxRetries: MaxRetries,
+					MinBackoff: MinBackoff,
 				},
-				BatchSize: 100 * 1024,
-				BatchWait: 1 * time.Second,
-				Timeout:   10 * time.Second,
+				BatchSize: BatchSize,
+				BatchWait: BatchWait,
+				Timeout:   Timeout,
 			},
 		},
 		{
--- a/production/helm/fluent-bit/Chart.yaml
+++ b/production/helm/fluent-bit/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: "v1"
 name: fluent-bit
-version: 0.3.0
+version: 0.3.1
 appVersion: v1.6.0
 kubeVersion: "^1.10.0-0"
 description: "Uses fluent-bit Loki go plugin for gathering logs and sending them to Loki"
--- a/production/helm/fluent-bit/values.yaml
+++ b/production/helm/fluent-bit/values.yaml
@@ -10,7 +10,7 @@ config:
  port: 2020
  tenantID: '""'
  batchWait: 1
-  batchSize: 10240
+  batchSize: 1048576
  loglevel: warn
  lineFormat: json
  k8sLoggingParser: "Off"
--- a/production/helm/loki-stack/Chart.yaml
+++ b/production/helm/loki-stack/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: "v1"
 name: loki-stack
-version: 0.41.0
+version: 0.41.1
 appVersion: v1.6.0
 kubeVersion: "^1.10.0-0"
 description: "Loki: like Prometheus, but for logs."
--- a/production/helm/promtail/Chart.yaml
+++ b/production/helm/promtail/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: "v1"
 name: promtail
-version: 0.25.0
+version: 0.25.1
 appVersion: v1.6.0
 kubeVersion: "^1.10.0-0"
 description: "Responsible for gathering logs and sending them to Loki"
--- a/production/helm/promtail/values.yaml
+++ b/production/helm/promtail/values.yaml
@@ -159,18 +159,18 @@ config:
    # Maximum wait period before sending batch
    batchwait: 1s
    # Maximum batch size to accrue before sending, unit is byte
-    batchsize: 102400
+    batchsize: 1048576

    # Maximum time to wait for server to respond to a request
    timeout: 10s

    backoff_config:
      # Initial backoff time between retries
-      min_period: 100ms
+      min_period: 500ms
      # Maximum backoff time between retries
-      max_period: 5s
+      max_period: 5m
      # Maximum number of retries when sending batches, 0 means infinite retries
-      max_retries: 20
+      max_retries: 10

    # The labels to add to any time series or alerts when communicating with loki
    external_labels: {}