Loki: Change default limits to common values (#4415)

* Change some default limits to reasonable values.

- These new limits protect users from overwhelming their cluster with
  ingestion load
- The new limits are:
  * ingestion_rate_strategy: global <- current default is local
  * max_global_streams_per_user: 5000 <- current default is no limit
  * max_query_length: 721h <- current default is no limit
  * max_query_parallelism: 32 <- current default is 14
  * max_streams_per_user: 0 (no limit) <- current default is 10000
  * reject_old_samples: true <- current default is false
  * reject_old_samples_max_age: 168h <- current default is 336h

* Remove parameters from examples

- They are no longer necessary because the new default values cover them

* Add new entries to CHANGELOG and upgrading guide

* Apply suggestions from code review

Co-authored-by: Trevor Whitney <trevorjwhitney@gmail.com>

* Change changelog mark entry from '-' to '*'

Co-authored-by: Trevor Whitney <trevorjwhitney@gmail.com>
Authored by Dylan Guedes, committed by GitHub
parent d3d63e1778
commit 6498c8cb13
6 changed files:
- CHANGELOG.md (1)
- cmd/loki/loki-docker-config.yaml (4)
- cmd/loki/loki-local-config.yaml (2)
- docs/sources/configuration/_index.md (14)
- docs/sources/upgrading/_index.md (23)
- pkg/validation/limits.go (12)

@@ -3,6 +3,7 @@
 * [4435](https://github.com/grafana/loki/pull/4435) **trevorwhitney**: Change default values for two GRPC settings so querier can connect to frontend/scheduler
 * [4440](https://github.com/grafana/loki/pull/4440) **DylanGuedes**: Config: Override distributor's default ring KV store
 * [4443](https://github.com/grafana/loki/pull/4443) **DylanGuedes**: Loki: Change how push API checks for contentType
+* [4415](https://github.com/grafana/loki/pull/4415) **DylanGuedes**: Change default limits to common values
 # 2.3.0 (2021/08/06)

@@ -40,10 +40,6 @@ compactor:
   working_directory: /loki/boltdb-shipper-compactor
   shared_store: filesystem
 limits_config:
-  reject_old_samples: true
-  reject_old_samples_max_age: 168h
 chunk_store_config:
   max_look_back_period: 0s

@@ -45,8 +45,6 @@ compactor:
   shared_store: filesystem
 limits_config:
-  reject_old_samples: true
-  reject_old_samples_max_age: 168h
   unordered_writes: true
 chunk_store_config:

@@ -1785,7 +1785,7 @@ logs in Loki.
 # The global strategy requires the distributors to form their own ring, which
 # is used to keep track of the current number of healthy distributor replicas.
 # CLI flag: -distributor.ingestion-rate-limit-strategy
-[ingestion_rate_strategy: <string> | default = "local"]
+[ingestion_rate_strategy: <string> | default = "global"]
 # Per-user ingestion rate limit in sample size per second. Units in MB.
 # CLI flag: -distributor.ingestion-rate-limit-mb
@@ -1812,11 +1812,11 @@ logs in Loki.
 # Whether or not old samples will be rejected.
 # CLI flag: -validation.reject-old-samples
-[reject_old_samples: <bool> | default = false]
+[reject_old_samples: <bool> | default = true]
 # Maximum accepted sample age before rejecting.
 # CLI flag: -validation.reject-old-samples.max-age
-[reject_old_samples_max_age: <duration> | default = 336h]
+[reject_old_samples_max_age: <duration> | default = 168h]
 # Duration for a table to be created/deleted before/after it's
 # needed. Samples won't be accepted before this time.
@@ -1829,7 +1829,7 @@ logs in Loki.
 # Maximum number of active streams per user, per ingester. 0 to disable.
 # CLI flag: -ingester.max-streams-per-user
-[max_streams_per_user: <int> | default = 10000]
+[max_streams_per_user: <int> | default = 0]
 # Maximum line size on ingestion path. Example: 256kb.
 # There is no limit when unset.
@@ -1849,7 +1849,7 @@ logs in Loki.
 # local limit based on the replication factor and the current number of healthy
 # ingesters, and is kept updated whenever the number of ingesters change.
 # CLI flag: -ingester.max-global-streams-per-user
-[max_global_streams_per_user: <int> | default = 0]
+[max_global_streams_per_user: <int> | default = 5000]
 # When true, out-of-order writes are accepted.
 # CLI flag: -ingester.unordered-writes
@@ -1861,11 +1861,11 @@ logs in Loki.
 # The limit to length of chunk store queries. 0 to disable.
 # CLI flag: -store.max-query-length
-[max_query_length: <duration> | default = 0]
+[max_query_length: <duration> | default = 721h]
 # Maximum number of queries that will be scheduled in parallel by the frontend.
 # CLI flag: -querier.max-query-parallelism
-[max_query_parallelism: <int> | default = 14]
+[max_query_parallelism: <int> | default = 32]
 # Limit the maximum of unique series that is returned by a metric query.
 # When the limit is reached an error is returned.

@@ -99,6 +99,27 @@ Please manually provide the values of `5m` and `true` (respectively) in your con
-_add changes here which are unreleased_
+### Loki Config
+
+#### Change of some default limits to common values
+
+PR [4415](https://github.com/grafana/loki/pull/4415) **DylanGuedes**: the default values of some limits were changed to protect users from overwhelming their cluster with ingestion load caused by relying on the default configuration.
+We suggest you double check whether the following parameters are
+present in your Loki config: `ingestion_rate_strategy`, `max_global_streams_per_user`,
+`max_query_length`, `max_query_parallelism`, `max_streams_per_user`,
+`reject_old_samples`, `reject_old_samples_max_age`. If they are not present, double check that the new values will not negatively impact your system. The changes are:
+
+| config | new default | old default |
+| --- | --- | --- |
+| ingestion_rate_strategy | "global" | "local" |
+| max_global_streams_per_user | 5000 | 0 (no limit) |
+| max_query_length | "721h" | "0h" (no limit) |
+| max_query_parallelism | 32 | 14 |
+| max_streams_per_user | 0 (no limit) | 10000 |
+| reject_old_samples | true | false |
+| reject_old_samples_max_age | "168h" | "336h" |
 ## 2.3.0
 ### Loki
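For operators whose tenants relied on the previous defaults, the old values map directly onto a `limits_config` override. A minimal sketch restoring the pre-change behavior (illustrative only, not part of this PR's diff):

```yaml
# Illustrative override: pins each limit back to its old default.
limits_config:
  ingestion_rate_strategy: local
  max_global_streams_per_user: 0   # no limit
  max_query_length: 0h             # no limit
  max_query_parallelism: 14
  max_streams_per_user: 10000
  reject_old_samples: false
  reject_old_samples_max_age: 336h
```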
@@ -144,7 +165,7 @@ This makes it important to first upgrade to 2.0, 2.0.1, or 2.1 **before** upgrad
 **Read this if you use the query-frontend and have `sharded_queries_enabled: true`**
 We discovered query scheduling related to sharded queries over long time ranges could lead to unfair work scheduling by one single query in the per tenant work queue.
 The `max_query_parallelism` setting is designed to limit how many split and sharded units of 'work' for a single query are allowed to be put into the per tenant work queue at one time. The previous behavior would split the query by time using the `split_queries_by_interval` and compare this value to `max_query_parallelism` when filling the queue, however with sharding enabled, every split was then sharded into 16 additional units of work after the `max_query_parallelism` limit was applied.
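The scheduling math above can be made concrete with a small sketch. The query shape here (24h range, 30m split interval) is a hypothetical example, not Loki code; the 16x shard factor is the one named in the paragraph:

```go
// Sketch of how one query fans out into queued work units: time splits
// from split_queries_by_interval, multiplied by the shard factor.
package main

import "fmt"

// workUnits returns how many units of work one query contributes to the
// per-tenant queue.
func workUnits(queryHours, splitMinutes, shardFactor int) int {
	splits := queryHours * 60 / splitMinutes // e.g. 24h / 30m = 48 splits
	return splits * shardFactor
}

func main() {
	// 48 splits * 16 shards = 768 queued units from a single query,
	// far above a max_query_parallelism of 32 applied before sharding.
	fmt.Println(workUnits(24, 30, 16)) // 768
}
```

This illustrates why applying `max_query_parallelism` before sharding under-counted the real work a query placed in the queue.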

@@ -116,7 +116,7 @@ type StreamRetention struct {
 // RegisterFlags adds the flags required to config this to the given FlagSet
 func (l *Limits) RegisterFlags(f *flag.FlagSet) {
-	f.StringVar(&l.IngestionRateStrategy, "distributor.ingestion-rate-limit-strategy", "local", "Whether the ingestion rate limit should be applied individually to each distributor instance (local), or evenly shared across the cluster (global).")
+	f.StringVar(&l.IngestionRateStrategy, "distributor.ingestion-rate-limit-strategy", "global", "Whether the ingestion rate limit should be applied individually to each distributor instance (local), or evenly shared across the cluster (global).")
 	f.Float64Var(&l.IngestionRateMB, "distributor.ingestion-rate-limit-mb", 4, "Per-user ingestion rate limit in sample size per second. Units in MB.")
 	f.Float64Var(&l.IngestionBurstSizeMB, "distributor.ingestion-burst-size-mb", 6, "Per-user allowed ingestion burst size (in sample size). Units in MB.")
 	f.Var(&l.MaxLineSize, "distributor.max-line-size", "maximum line length allowed, i.e. 100mb. Default (0) means unlimited.")
@@ -124,7 +124,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&l.MaxLabelNameLength, "validation.max-length-label-name", 1024, "Maximum length accepted for label names")
 	f.IntVar(&l.MaxLabelValueLength, "validation.max-length-label-value", 2048, "Maximum length accepted for label value. This setting also applies to the metric name")
 	f.IntVar(&l.MaxLabelNamesPerSeries, "validation.max-label-names-per-series", 30, "Maximum number of label names per series.")
-	f.BoolVar(&l.RejectOldSamples, "validation.reject-old-samples", false, "Reject old samples.")
+	f.BoolVar(&l.RejectOldSamples, "validation.reject-old-samples", true, "Reject old samples.")
 	_ = l.RejectOldSamplesMaxAge.Set("14d")
 	f.Var(&l.RejectOldSamplesMaxAge, "validation.reject-old-samples.max-age", "Maximum accepted sample age before rejecting.")
@@ -133,8 +133,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	f.BoolVar(&l.EnforceMetricName, "validation.enforce-metric-name", true, "Enforce every sample has a metric name.")
 	f.IntVar(&l.MaxEntriesLimitPerQuery, "validation.max-entries-limit", 5000, "Per-user entries limit per query")
-	f.IntVar(&l.MaxLocalStreamsPerUser, "ingester.max-streams-per-user", 10e3, "Maximum number of active streams per user, per ingester. 0 to disable.")
-	f.IntVar(&l.MaxGlobalStreamsPerUser, "ingester.max-global-streams-per-user", 0, "Maximum number of active streams per user, across the cluster. 0 to disable.")
+	f.IntVar(&l.MaxLocalStreamsPerUser, "ingester.max-streams-per-user", 0, "Maximum number of active streams per user, per ingester. 0 to disable.")
+	f.IntVar(&l.MaxGlobalStreamsPerUser, "ingester.max-global-streams-per-user", 5000, "Maximum number of active streams per user, across the cluster. 0 to disable.")
 	f.BoolVar(&l.UnorderedWrites, "ingester.unordered-writes", false, "(Experimental) Allow out of order writes.")
 	_ = l.PerStreamRateLimit.Set(strconv.Itoa(defaultPerStreamRateLimit))
@@ -144,13 +144,13 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&l.MaxChunksPerQuery, "store.query-chunk-limit", 2e6, "Maximum number of chunks that can be fetched in a single query.")
-	_ = l.MaxQueryLength.Set("0s")
+	_ = l.MaxQueryLength.Set("721h")
 	f.Var(&l.MaxQueryLength, "store.max-query-length", "Limit to length of chunk store queries, 0 to disable.")
 	f.IntVar(&l.MaxQuerySeries, "querier.max-query-series", 500, "Limit the maximum of unique series returned by a metric query. When the limit is reached an error is returned.")
 	_ = l.MaxQueryLookback.Set("0s")
 	f.Var(&l.MaxQueryLookback, "querier.max-query-lookback", "Limit how long back data (series and metadata) can be queried, up until <lookback> duration ago. This limit is enforced in the query-frontend, querier and ruler. If the requested time range is outside the allowed range, the request will not fail but will be manipulated to only query data within the allowed time range. 0 to disable.")
-	f.IntVar(&l.MaxQueryParallelism, "querier.max-query-parallelism", 14, "Maximum number of queries will be scheduled in parallel by the frontend.")
+	f.IntVar(&l.MaxQueryParallelism, "querier.max-query-parallelism", 32, "Maximum number of queries will be scheduled in parallel by the frontend.")
 	f.IntVar(&l.CardinalityLimit, "store.cardinality-limit", 1e5, "Cardinality limit for index queries.")
 	f.IntVar(&l.MaxStreamsMatchersPerQuery, "querier.max-streams-matcher-per-query", 1000, "Limit the number of streams matchers per query")
 	f.IntVar(&l.MaxConcurrentTailRequests, "querier.max-concurrent-tail-requests", 10, "Limit the number of concurrent tail requests")
