diff --git a/CHANGELOG.md b/CHANGELOG.md
index 403e37d253..69b9352f5d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,7 @@
* [9184](https://github.com/grafana/loki/pull/9184) **periklis**: Bump dskit to introduce IPv6 support for memberlist
* [9357](https://github.com/grafana/loki/pull/9357) **Indransh**: Add HTTP API to change the log level at runtime
* [9431](https://github.com/grafana/loki/pull/9431) **dannykopping**: Add more buckets to `loki_memcache_request_duration_seconds` metric; latencies can increase if using memcached with NVMe
+* [8684](https://github.com/grafana/loki/pull/8684) **oleksii-boiko-ua**: Helm: Add hpa templates for read, write and backend components.
##### Fixes
diff --git a/docs/sources/installation/helm/reference.md b/docs/sources/installation/helm/reference.md
index b32ba88aff..7de274aa52 100644
--- a/docs/sources/installation/helm/reference.md
+++ b/docs/sources/installation/helm/reference.md
@@ -38,6 +38,60 @@ This is the generated reference for the Loki Helm Chart values.
Hard node and soft zone anti-affinity
+ |
+
+
+ | backend.autoscaling.behavior |
+ object |
+ Behavior policies while scaling. |
+
+{}
+
+ |
+
+
+ | backend.autoscaling.enabled |
+ bool |
+ Enable autoscaling for the backend. |
+
+false
+
+ |
+
+
+ | backend.autoscaling.maxReplicas |
+ int |
+ Maximum autoscaling replicas for the backend. |
+
+3
+
+ |
+
+
+ | backend.autoscaling.minReplicas |
+ int |
+ Minimum autoscaling replicas for the backend. |
+
+1
+
+ |
+
+
+ | backend.autoscaling.targetCPUUtilizationPercentage |
+ int |
+ Target CPU utilization percentage for the backend. |
+
+60
+
+ |
+
+
+ | backend.autoscaling.targetMemoryUtilizationPercentage |
+ string |
+ Target memory utilization percentage for the backend. |
+
+null
+
|
@@ -764,6 +818,15 @@ null
Hard node and soft zone anti-affinity
+ |
+
+
+ | gateway.autoscaling.behavior |
+ object |
+ Behavior policies while scaling. |
+
+{}
+
|
@@ -874,13 +937,11 @@ null
- | gateway.deploymentStrategy |
- object |
- ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy |
+ gateway.deploymentStrategy.type |
+ string |
+ |
-{
- "type": "RollingUpdate"
-}
+"RollingUpdate"
|
@@ -2780,6 +2841,15 @@ false
Hard node and soft zone anti-affinity
+ |
+
+
+ | read.autoscaling.behavior |
+ object |
+ Behavior policies while scaling. |
+
+{}
+
|
@@ -3699,6 +3769,97 @@ null
Hard node and soft zone anti-affinity
+ |
+
+
+ | write.autoscaling.behavior |
+ object |
+ Behavior policies while scaling. |
+
+{
+ "scaleDown": {
+ "policies": [
+ {
+ "periodSeconds": 1800,
+ "type": "Pods",
+ "value": 1
+ }
+ ]
+ },
+ "scaleUp": {
+ "policies": [
+ {
+ "periodSeconds": 900,
+ "type": "Pods",
+ "value": 1
+ }
+ ]
+ },
+ "stabilizationWindowSeconds": 3600
+}
+
+ |
+
+
+ | write.autoscaling.behavior.scaleUp |
+ object |
+ see https://github.com/grafana/loki/blob/main/docs/sources/operations/storage/wal.md#how-to-scale-updown for scaledown details |
+
+{
+ "policies": [
+ {
+ "periodSeconds": 900,
+ "type": "Pods",
+ "value": 1
+ }
+ ]
+}
+
+ |
+
+
+ | write.autoscaling.enabled |
+ bool |
+ Enable autoscaling for the write. |
+
+false
+
+ |
+
+
+ | write.autoscaling.maxReplicas |
+ int |
+ Maximum autoscaling replicas for the write. |
+
+3
+
+ |
+
+
+ | write.autoscaling.minReplicas |
+ int |
+ Minimum autoscaling replicas for the write. |
+
+1
+
+ |
+
+
+ | write.autoscaling.targetCPUUtilizationPercentage |
+ int |
+ Target CPU utilisation percentage for the write. |
+
+60
+
+ |
+
+
+ | write.autoscaling.targetMemoryUtilizationPercentage |
+ string |
+ Target memory utilization percentage for the write. |
+
+null
+
|
diff --git a/docs/sources/operations/storage/wal.md b/docs/sources/operations/storage/wal.md
index bc31bc4aa3..85eaa3d83d 100644
--- a/docs/sources/operations/storage/wal.md
+++ b/docs/sources/operations/storage/wal.md
@@ -95,6 +95,18 @@ Statefulsets are significantly more cumbersome to work with/upgrade/etc. Much of
In this case, try `kubectl -n delete sts ingester --cascade=false`. This will leave the pods alive but delete the statefulset. Then you may recreate the (updated) statefulset and one-by-one start deleting the `ingester-0` through `ingester-n` pods _in that order_, allowing the statefulset to spin up new pods to replace them.
+#### Scaling Down Using `/flush_shutdown` Endpoint and Lifecycle Hook
+
+1. **StatefulSets for Ordered Scaling Down**: Loki's ingesters should be scaled down one by one, which is efficiently handled by Kubernetes StatefulSets. This ensures an ordered and reliable scaling process, as described in the [Deployment and Scaling Guarantees](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#deployment-and-scaling-guarantees) documentation.
+
+2. **Using PreStop Lifecycle Hook**: During the pod scaling down process, the PreStop [lifecycle hook](https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/) triggers the `/flush_shutdown` endpoint on the ingester. This action flushes the chunks and removes the ingester from the ring, allowing it to register as unready and become eligible for deletion.
+
+3. **Using terminationGracePeriodSeconds**: This setting provides time for the ingester to flush its data before being deleted. If flushing data takes more than 30 minutes, you may need to increase it.
+
+4. **Cleaning Persistent Volumes**: Persistent volumes are automatically cleaned up by leveraging the [enableStatefulSetAutoDeletePVC](https://kubernetes.io/blog/2021/12/16/kubernetes-1-23-statefulset-pvc-auto-deletion/) feature in Kubernetes.
+
+By following the above steps, you can ensure a smooth scaling down process for Loki's ingesters while maintaining data integrity and minimizing potential disruptions.
+
### Non-Kubernetes or baremetal deployments
* When the ingester restarts for any reason (upgrade, crash, etc), it should be able to attach to the same volume in order to recover back the WAL and tokens.
diff --git a/production/helm/loki/CHANGELOG.md b/production/helm/loki/CHANGELOG.md
index b159fd163c..5dde3e2687 100644
--- a/production/helm/loki/CHANGELOG.md
+++ b/production/helm/loki/CHANGELOG.md
@@ -13,6 +13,10 @@ Entries should include a reference to the pull request that introduced the chang
[//]: # ( : do not remove this line. This locator is used by the CI pipeline to automatically create a changelog entry for each new Loki release. Add other chart versions and respective changelog entries bellow this line.)
+## 5.5.6
+
+- [FEATURE] Add hpa templates for read, write and backend.
+
## 5.5.5
- [BUGFIX] Quote tenantId value in logsInstance
diff --git a/production/helm/loki/Chart.yaml b/production/helm/loki/Chart.yaml
index 53eb136dea..9976bb270a 100644
--- a/production/helm/loki/Chart.yaml
+++ b/production/helm/loki/Chart.yaml
@@ -3,7 +3,7 @@ name: loki
description: Helm chart for Grafana Loki in simple, scalable mode
type: application
appVersion: 2.8.2
-version: 5.5.5
+version: 5.5.6
home: https://grafana.github.io/helm-charts
sources:
- https://github.com/grafana/loki
diff --git a/production/helm/loki/README.md b/production/helm/loki/README.md
index b2b0a61603..85239d198c 100644
--- a/production/helm/loki/README.md
+++ b/production/helm/loki/README.md
@@ -1,6 +1,6 @@
# loki
-  
+  
Helm chart for Grafana Loki in simple, scalable mode
diff --git a/production/helm/loki/templates/backend/hpa.yaml b/production/helm/loki/templates/backend/hpa.yaml
new file mode 100644
index 0000000000..83487e9aaa
--- /dev/null
+++ b/production/helm/loki/templates/backend/hpa.yaml
@@ -0,0 +1,52 @@
+{{- /* HorizontalPodAutoscaler for the backend StatefulSet; rendered only in simple scalable mode, without the legacy read target, and when backend.autoscaling.enabled is set. */ -}}
+{{- $isSimpleScalable := eq (include "loki.deployment.isScalable" .) "true" -}}
+{{- $autoscalingv2 := .Capabilities.APIVersions.Has "autoscaling/v2" -}}
+{{- if and $isSimpleScalable (not .Values.read.legacyReadTarget ) ( .Values.backend.autoscaling.enabled ) }}
+{{- if $autoscalingv2 }}
+apiVersion: autoscaling/v2
+{{- else }}
+apiVersion: autoscaling/v2beta1
+{{- end }}
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "loki.backendFullname" . }}
+  labels:
+    {{- include "loki.backendLabels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: StatefulSet
+    name: {{ include "loki.backendFullname" . }}
+  minReplicas: {{ .Values.backend.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.backend.autoscaling.maxReplicas }}
+  {{- with .Values.backend.autoscaling.behavior }}
+  behavior:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  {{- /* The Resource entries below are items of spec.metrics; without this key the list has no parent and the manifest does not parse. */}}
+  metrics:
+  {{- with .Values.backend.autoscaling.targetMemoryUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: memory
+        {{- if $autoscalingv2 }}
+        target:
+          type: Utilization
+          averageUtilization: {{ . }}
+        {{- else }}
+        targetAverageUtilization: {{ . }}
+        {{- end }}
+  {{- end }}
+  {{- with .Values.backend.autoscaling.targetCPUUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: cpu
+        {{- if $autoscalingv2 }}
+        target:
+          type: Utilization
+          averageUtilization: {{ . }}
+        {{- else }}
+        targetAverageUtilization: {{ . }}
+        {{- end }}
+  {{- end }}
+{{- end }}
diff --git a/production/helm/loki/templates/backend/statefulset-backend.yaml b/production/helm/loki/templates/backend/statefulset-backend.yaml
index d74bd9c957..59816a4749 100644
--- a/production/helm/loki/templates/backend/statefulset-backend.yaml
+++ b/production/helm/loki/templates/backend/statefulset-backend.yaml
@@ -10,7 +10,9 @@ metadata:
{{- include "loki.backendLabels" . | nindent 4 }}
app.kubernetes.io/part-of: memberlist
spec:
+{{- if not .Values.backend.autoscaling.enabled }}
replicas: {{ .Values.backend.replicas }}
+{{- end }}
podManagementPolicy: Parallel
updateStrategy:
rollingUpdate:
diff --git a/production/helm/loki/templates/gateway/hpa.yaml b/production/helm/loki/templates/gateway/hpa.yaml
index c71b4f2284..3541ec6965 100644
--- a/production/helm/loki/templates/gateway/hpa.yaml
+++ b/production/helm/loki/templates/gateway/hpa.yaml
@@ -18,6 +18,10 @@ spec:
name: {{ include "loki.gatewayFullname" . }}
minReplicas: {{ .Values.gateway.autoscaling.minReplicas }}
maxReplicas: {{ .Values.gateway.autoscaling.maxReplicas }}
+ {{- with .Values.gateway.autoscaling.behavior }}
+ behavior:
+ {{- toYaml . | nindent 4 }}
+ {{- end }}
metrics:
{{- with .Values.gateway.autoscaling.targetMemoryUtilizationPercentage }}
- type: Resource
diff --git a/production/helm/loki/templates/read/hpa.yaml b/production/helm/loki/templates/read/hpa.yaml
new file mode 100644
index 0000000000..5515ecb0b4
--- /dev/null
+++ b/production/helm/loki/templates/read/hpa.yaml
@@ -0,0 +1,49 @@
+{{- /* HorizontalPodAutoscaler for the read component; targets the read Deployment, or the read StatefulSet when read.legacyReadTarget is set. */ -}}
+{{- $isSimpleScalable := eq (include "loki.deployment.isScalable" .) "true" -}}
+{{- $autoscalingv2 := .Capabilities.APIVersions.Has "autoscaling/v2" -}}
+{{- if and $isSimpleScalable ( .Values.read.autoscaling.enabled ) }}
+apiVersion: autoscaling/{{ if $autoscalingv2 }}v2{{ else }}v2beta1{{ end }}
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "loki.readFullname" . }}
+  labels:
+    {{- include "loki.readLabels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: {{ if .Values.read.legacyReadTarget }}StatefulSet{{ else }}Deployment{{ end }}
+    name: {{ include "loki.readFullname" . }}
+  minReplicas: {{ .Values.read.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.read.autoscaling.maxReplicas }}
+  {{- with .Values.read.autoscaling.behavior }}
+  behavior:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  metrics:
+  {{- $memoryTarget := .Values.read.autoscaling.targetMemoryUtilizationPercentage }}
+  {{- if $memoryTarget }}
+    - type: Resource
+      resource:
+        name: memory
+        {{- if $autoscalingv2 }}
+        target:
+          type: Utilization
+          averageUtilization: {{ $memoryTarget }}
+        {{- else }}
+        targetAverageUtilization: {{ $memoryTarget }}
+        {{- end }}
+  {{- end }}
+  {{- $cpuTarget := .Values.read.autoscaling.targetCPUUtilizationPercentage }}
+  {{- if $cpuTarget }}
+    - type: Resource
+      resource:
+        name: cpu
+        {{- if $autoscalingv2 }}
+        target:
+          type: Utilization
+          averageUtilization: {{ $cpuTarget }}
+        {{- else }}
+        targetAverageUtilization: {{ $cpuTarget }}
+        {{- end }}
+  {{- end }}
+{{- end }}
diff --git a/production/helm/loki/templates/read/statefulset-read.yaml b/production/helm/loki/templates/read/statefulset-read.yaml
index 93e9c0e85e..b4eb9ba369 100644
--- a/production/helm/loki/templates/read/statefulset-read.yaml
+++ b/production/helm/loki/templates/read/statefulset-read.yaml
@@ -10,7 +10,9 @@ metadata:
app.kubernetes.io/part-of: memberlist
{{- include "loki.readLabels" . | nindent 4 }}
spec:
+{{- if not .Values.read.autoscaling.enabled }}
replicas: {{ .Values.read.replicas }}
+{{- end }}
podManagementPolicy: Parallel
updateStrategy:
rollingUpdate:
diff --git a/production/helm/loki/templates/write/hpa.yaml b/production/helm/loki/templates/write/hpa.yaml
new file mode 100644
index 0000000000..18d59dbd07
--- /dev/null
+++ b/production/helm/loki/templates/write/hpa.yaml
@@ -0,0 +1,49 @@
+{{- /* HorizontalPodAutoscaler for the write StatefulSet; rendered only in simple scalable mode when write.autoscaling.enabled is set. */ -}}
+{{- $isSimpleScalable := eq (include "loki.deployment.isScalable" .) "true" -}}
+{{- $autoscalingv2 := .Capabilities.APIVersions.Has "autoscaling/v2" -}}
+{{- if and $isSimpleScalable ( .Values.write.autoscaling.enabled ) }}
+apiVersion: autoscaling/{{ if $autoscalingv2 }}v2{{ else }}v2beta1{{ end }}
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "loki.writeFullname" . }}
+  labels:
+    {{- include "loki.writeLabels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: StatefulSet
+    name: {{ include "loki.writeFullname" . }}
+  minReplicas: {{ .Values.write.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.write.autoscaling.maxReplicas }}
+  {{- with .Values.write.autoscaling.behavior }}
+  behavior:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  metrics:
+  {{- $memoryTarget := .Values.write.autoscaling.targetMemoryUtilizationPercentage }}
+  {{- if $memoryTarget }}
+    - type: Resource
+      resource:
+        name: memory
+        {{- if $autoscalingv2 }}
+        target:
+          type: Utilization
+          averageUtilization: {{ $memoryTarget }}
+        {{- else }}
+        targetAverageUtilization: {{ $memoryTarget }}
+        {{- end }}
+  {{- end }}
+  {{- $cpuTarget := .Values.write.autoscaling.targetCPUUtilizationPercentage }}
+  {{- if $cpuTarget }}
+    - type: Resource
+      resource:
+        name: cpu
+        {{- if $autoscalingv2 }}
+        target:
+          type: Utilization
+          averageUtilization: {{ $cpuTarget }}
+        {{- else }}
+        targetAverageUtilization: {{ $cpuTarget }}
+        {{- end }}
+  {{- end }}
+{{- end }}
diff --git a/production/helm/loki/templates/write/statefulset-write.yaml b/production/helm/loki/templates/write/statefulset-write.yaml
index 3a55247251..b1756e4321 100644
--- a/production/helm/loki/templates/write/statefulset-write.yaml
+++ b/production/helm/loki/templates/write/statefulset-write.yaml
@@ -9,9 +9,13 @@ metadata:
     {{- include "loki.writeLabels" . | nindent 4 }}
     app.kubernetes.io/part-of: memberlist
 spec:
+{{- if .Values.write.autoscaling.enabled }}
+  {{- /* OrderedReady so the autoscaler adds/removes ingesters one at a time. Applied only with autoscaling: podManagementPolicy is immutable on an existing StatefulSet, so changing it unconditionally would break `helm upgrade` for installs that keep the Parallel default. */}}
+  podManagementPolicy: OrderedReady
+{{- else }}
   replicas: {{ .Values.write.replicas }}
-
   podManagementPolicy: Parallel
+{{- end }}
   updateStrategy:
     rollingUpdate:
       partition: 0
@@ -101,9 +105,15 @@ spec:
             {{- toYaml .Values.loki.containerSecurityContext | nindent 12 }}
           readinessProbe:
             {{- toYaml .Values.loki.readinessProbe | nindent 12 }}
-          {{- with .Values.write.lifecycle }}
+          {{- if .Values.write.lifecycle }}
           lifecycle:
-            {{- toYaml . | nindent 12 }}
+            {{- toYaml .Values.write.lifecycle | nindent 12 }}
+          {{- else if .Values.write.autoscaling.enabled }}
+          lifecycle:
+            preStop:
+              httpGet:
+                path: "/ingester/flush_shutdown"
+                port: http-metrics
           {{- end }}
           volumeMounts:
             - name: config
diff --git a/production/helm/loki/values.yaml b/production/helm/loki/values.yaml
index ad799bb770..3b1aafad51 100644
--- a/production/helm/loki/values.yaml
+++ b/production/helm/loki/values.yaml
@@ -645,6 +645,31 @@ monitoring:
write:
# -- Number of replicas for the write
replicas: 3
+ autoscaling:
+ # -- Enable autoscaling for the write.
+ enabled: false
+ # -- Minimum autoscaling replicas for the write.
+ minReplicas: 1
+ # -- Maximum autoscaling replicas for the write.
+ maxReplicas: 3
+ # -- Target CPU utilisation percentage for the write.
+ targetCPUUtilizationPercentage: 60
+ # -- Target memory utilization percentage for the write.
+ targetMemoryUtilizationPercentage:
+ # -- Behavior policies while scaling.
+ behavior:
+ # -- see https://github.com/grafana/loki/blob/main/docs/sources/operations/storage/wal.md#how-to-scale-updown for scaledown details
+ scaleUp:
+ policies:
+ - type: Pods
+ value: 1
+ periodSeconds: 900
+ scaleDown:
+ policies:
+ - type: Pods
+ value: 1
+ periodSeconds: 1800
+ stabilizationWindowSeconds: 3600
image:
# -- The Docker registry for the write image. Overrides `loki.image.registry`
registry: null
@@ -672,6 +697,11 @@ write:
extraEnvFrom: []
# -- Lifecycle for the write container
lifecycle: {}
+  # NOTE on `lifecycle` above: when it is left empty and `write.autoscaling.enabled` is true,
+  # the chart adds a default preStop hook that calls `/ingester/flush_shutdown`, as recommended
+  # for the ingester scale-down process. Without autoscaling no hook is added, which optimizes
+  # rolling restarts for instances that will never be scaled down or that use chunks storage with WAL disabled.
+  # https://github.com/grafana/loki/blob/main/docs/sources/operations/storage/wal.md#how-to-scale-updown
# -- Init containers to add to the write pods
initContainers: []
# -- Volume mounts to add to the write pods
@@ -784,6 +814,20 @@ read:
targetCPUUtilizationPercentage: 60
# -- Target memory utilisation percentage for the read
targetMemoryUtilizationPercentage:
+ # -- Behavior policies while scaling.
+ behavior: {}
+ # scaleUp:
+ # stabilizationWindowSeconds: 300
+ # policies:
+ # - type: Pods
+ # value: 1
+ # periodSeconds: 60
+ # scaleDown:
+ # stabilizationWindowSeconds: 300
+ # policies:
+ # - type: Pods
+ # value: 1
+ # periodSeconds: 180
image:
# -- The Docker registry for the read image. Overrides `loki.image.registry`
registry: null
@@ -853,6 +897,31 @@ read:
backend:
# -- Number of replicas for the backend
replicas: 3
+ autoscaling:
+ # -- Enable autoscaling for the backend.
+ enabled: false
+ # -- Minimum autoscaling replicas for the backend.
+ minReplicas: 1
+ # -- Maximum autoscaling replicas for the backend.
+ maxReplicas: 3
+ # -- Target CPU utilization percentage for the backend.
+ targetCPUUtilizationPercentage: 60
+ # -- Target memory utilization percentage for the backend.
+ targetMemoryUtilizationPercentage:
+ # -- Behavior policies while scaling.
+ behavior: {}
+ # scaleUp:
+ # stabilizationWindowSeconds: 300
+ # policies:
+ # - type: Pods
+ # value: 1
+ # periodSeconds: 60
+ # scaleDown:
+ # stabilizationWindowSeconds: 300
+ # policies:
+ # - type: Pods
+ # value: 1
+ # periodSeconds: 180
image:
# -- The Docker registry for the backend image. Overrides `loki.image.registry`
registry: null
@@ -1060,6 +1129,20 @@ gateway:
     targetMemoryUtilizationPercentage:
+    # -- Behavior policies while scaling.
+    behavior: {}
+    #   scaleUp:
+    #     stabilizationWindowSeconds: 300
+    #     policies:
+    #       - type: Pods
+    #         value: 1
+    #         periodSeconds: 60
+    #   scaleDown:
+    #     stabilizationWindowSeconds: 300
+    #     policies:
+    #       - type: Pods
+    #         value: 1
+    #         periodSeconds: 180
   # -- See `kubectl explain deployment.spec.strategy` for more
   # -- ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy
   deploymentStrategy:
     type: RollingUpdate
   image: