operator: Add lokistack operations alerts and sop (#6951)

pull/7305/head
Gerard Vanloo authored 3 years ago, committed by GitHub
parent d0a96b20b0
commit ee88599e5e
  1. operator/CHANGELOG.md (1 line changed)
  2. operator/Makefile (2 lines changed)
  3. operator/docs/lokistack/sop.md (170 lines changed)
  4. operator/internal/manifests/internal/alerts/build.go (30 lines changed)
  5. operator/internal/manifests/internal/alerts/prometheus-alerts.yaml (149 lines changed)
  6. operator/internal/manifests/internal/alerts/prometheus-rules.yaml (25 lines changed)
  7. operator/internal/manifests/internal/alerts/testdata/test.yaml (127 lines changed)

@@ -1,5 +1,6 @@
## Main
- [6951](https://github.com/grafana/loki/pull/6951) **Red-GV**: Adding operational Lokistack alerts
- [7254](https://github.com/grafana/loki/pull/7254) **periklis**: Expose Loki Ruler API via the lokistack-gateway
- [7214](https://github.com/grafana/loki/pull/7214) **periklis**: Fix ruler GRPC tls client configuration
- [7201](https://github.com/grafana/loki/pull/7201) **xperimental**: Write configuration for per-tenant retention

@@ -154,7 +154,7 @@ lint: $(GOLANGCI_LINT) | generate ## Run golangci-lint on source code.
.PHONY: lint-prometheus
lint-prometheus: $(PROMTOOL) ## Run promtool check against recording rules and alerts.
@$(PROMTOOL) check rules ./internal/manifests/internal/alerts/prometheus-alerts.yaml
@$(PROMTOOL) check rules ./internal/manifests/internal/alerts/prometheus-*.yaml
.PHONY: fmt
fmt: $(GOFUMPT) ## Run gofumpt on source code.

@@ -136,3 +136,173 @@ A service(s) has crashed.
- Check the logs of the service that is panicking
- Examine metrics for signs of failure
## Loki Request Latency
### Impact
A service(s) is affected by slow request responses.
### Summary
A service(s) is slower than expected at processing data.
### Severity
`Critical`
### Access Required
- Console access to the cluster
- Edit access to the deployed operator and Loki namespace:
- OpenShift
- `openshift-logging` (LokiStack)
- `openshift-operators-redhat` (Loki Operator)
### Steps
- Check the logs of all the services
- Check to ensure that the Loki components can reach the storage
- In particular for queriers, check that the query queue is small by examining `cortex_query_scheduler_inflight_requests` (see the example queries below)
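For the last check, a minimal sketch of example console queries follows, written as a YAML list purely for readability: the `description`/`expr` keys are not any real schema, the namespace value is the OpenShift example used above, and the `quantile` label on the scheduler summary metric is an assumption carried over from upstream Loki dashboards.
```yaml
# Illustrative console queries only, not a rule file. The quantile label on the
# scheduler summary metric is assumed from upstream Loki dashboards.
- description: p99 of in-flight requests waiting in the query scheduler
  expr: cortex_query_scheduler_inflight_requests{namespace="openshift-logging", quantile="0.99"}
- description: p99 request latency per route (the same expression family the alert uses)
  expr: |
    histogram_quantile(0.99,
      sum(
        irate(loki_request_duration_seconds_bucket{namespace="openshift-logging", route!~"(?i).*tail.*"}[1m])
      ) by (job, le, namespace, route)
    )
```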
## Loki Tenant Rate Limit
### Impact
A tenant is being rate limited, resulting in potential loss of data.
### Summary
A service(s) is rate limiting at least 10% of all incoming requests.
### Severity
`Warning`
### Access Required
- Console access to the cluster
- Edit access to the deployed operator and Loki namespace:
- OpenShift
- `openshift-logging` (LokiStack)
- `openshift-operators-redhat` (Loki Operator)
### Steps
- Examine the metrics to identify the tenant being limited and the reason: `loki_discarded_samples_total{namespace="<namespace>"}`
- Increase the limits allocated to the tenant in the LokiStack CRD (a configuration sketch follows the table below)
- For ingestion limits, consult the table below
- For query limits, `MaxEntriesLimitPerQuery`, `MaxChunksPerQuery`, or `MaxQuerySeries` can be raised
| Reason | Corresponding Ingestion Limit Keys |
| --- | --- |
| `rate_limited` | `ingestionRate`, `ingestionBurstSize` |
| `stream_limit` | `maxGlobalStreamsPerTenant` |
| `label_name_too_long` | `maxLabelNameLength` |
| `label_value_too_long` | `maxLabelValueLength` |
| `line_too_long` | `maxLineSize` |
| `max_label_names_per_series` | `maxLabelNamesPerSeries` |
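A minimal sketch of where these keys live in the LokiStack custom resource is shown below; the resource name, tenant name, and numeric values are placeholders, and the apiVersion may be `v1beta1` on older operator releases.
```yaml
# Minimal sketch, not a complete LokiStack: only the limits stanza is shown.
# Tenant name and values are placeholders; apiVersion may be v1beta1 on older releases.
apiVersion: loki.grafana.com/v1
kind: LokiStack
metadata:
  name: lokistack-sample
  namespace: openshift-logging
spec:
  limits:
    tenants:
      application:                         # placeholder tenant name
        ingestion:
          ingestionRate: 8                 # MB/s; raise for `rate_limited`
          ingestionBurstSize: 12           # MB; raise for `rate_limited`
          maxGlobalStreamsPerTenant: 25000 # raise for `stream_limit`
          maxLineSize: 256000              # bytes; raise for `line_too_long`
        queries:
          maxEntriesLimitPerQuery: 10000
          maxChunksPerQuery: 2000000
          maxQuerySeries: 1000
```
Global defaults can be raised the same way under `spec.limits.global` when the limit is not tenant-specific.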
## Loki Storage Slow Write
### Impact
The cluster is unable to push logs to backend storage in a timely manner.
### Summary
The cluster is unable to push logs to backend storage in a timely manner.
### Severity
`Warning`
### Access Required
- Console access to the cluster
- Edit access to the deployed operator and Loki namespace:
- OpenShift
- `openshift-logging` (LokiStack)
- `openshift-operators-redhat` (Loki Operator)
### Steps
- Ensure that the cluster can communicate with the backend storage
## Loki Storage Slow Read
### Impact
The cluster is unable to retrieve logs from backend storage in a timely manner.
### Summary
The cluster is unable to retrieve logs from backend storage in a timely manner.
### Severity
`Warning`
### Access Required
- Console access to the cluster
- Edit access to the deployed operator and Loki namespace:
- OpenShift
- `openshift-logging` (LokiStack)
- `openshift-operators-redhat` (Loki Operator)
### Steps
- Ensure that the cluster can communicate with the backend storage
## Loki Write Path High Load
### Impact
The write path is under high pressure and requires a storage flush.
### Summary
The write path is flushing data to storage in response to back-pressure.
### Severity
`Warning`
### Access Required
- Console access to the cluster
- Edit access to the deployed operator and Loki namespace:
- OpenShift
- `openshift-logging` (LokiStack)
- `openshift-operators-redhat` (Loki Operator)
### Steps
- Adjust the ingestion limits for the affected tenant or increase the number of ingesters (see the sketch below)
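A minimal sketch of the scaling knob follows, assuming the operator honors per-component replica overrides under `spec.template`; the resource name and replica count are placeholders.
```yaml
# Minimal sketch: only the write-path scaling field is shown. The replica count
# is a placeholder and depends on the resources available in the cluster.
apiVersion: loki.grafana.com/v1
kind: LokiStack
metadata:
  name: lokistack-sample
  namespace: openshift-logging
spec:
  template:
    ingester:
      replicas: 3
```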
## Loki Read Path High Load
### Impact
The read path is under high load.
### Summary
The query queue is currently under high load.
### Severity
`Warning`
### Access Required
- Console access to the cluster
- Edit access to the deployed operator and Loki namespace:
- OpenShift
- `openshift-logging` (LokiStack)
- `openshift-operators-redhat` (Loki Operator)
### Steps
- Increase the number of queriers (see the sketch below)
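The read path uses the same `spec.template` stanza as the sketch in the write path section; the relevant component here is the querier, optionally together with the query frontend. Replica counts are placeholders.
```yaml
# Fragment of spec.template only; replica counts are placeholders.
spec:
  template:
    querier:
      replicas: 3
    queryFrontend:
      replicas: 2   # optional: also scale the frontend if queueing persists
```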

@@ -20,21 +20,43 @@ var (
//go:embed prometheus-alerts.yaml
alertsYAMLTmplFile embed.FS
//go:embed prometheus-rules.yaml
rulesYAMLTmplFile embed.FS
alertsYAMLTmpl = template.Must(template.New("").Delims("[[", "]]").ParseFS(alertsYAMLTmplFile, "prometheus-alerts.yaml"))
rulesYAMLTmpl = template.Must(template.New("").Delims("[[", "]]").ParseFS(rulesYAMLTmplFile, "prometheus-rules.yaml"))
)
// Build creates the Prometheus alerting and recording rules for the Loki stack
func Build(opts Options) (*monitoringv1.PrometheusRuleSpec, error) {
alerts, err := ruleSpec("prometheus-alerts.yaml", alertsYAMLTmpl, opts)
if err != nil {
return nil, kverrors.Wrap(err, "failed to create prometheus alerts")
}
recordingRules, err := ruleSpec("prometheus-rules.yaml", rulesYAMLTmpl, opts)
if err != nil {
return nil, kverrors.Wrap(err, "failed to create prometheus rules")
}
spec := alerts.DeepCopy()
spec.Groups = append(alerts.Groups, recordingRules.Groups...)
return spec, nil
}
func ruleSpec(file string, tmpl *template.Template, opts Options) (*monitoringv1.PrometheusRuleSpec, error) {
spec := monitoringv1.PrometheusRuleSpec{}
// Build alerts yaml
w := bytes.NewBuffer(nil)
err := alertsYAMLTmpl.ExecuteTemplate(w, "prometheus-alerts.yaml", opts)
err := tmpl.ExecuteTemplate(w, file, opts)
if err != nil {
return nil, kverrors.Wrap(err, "failed to create prometheus alerts")
return nil, kverrors.Wrap(err, "failed to execute template",
"template", file,
)
}
// Decode the spec
r := io.Reader(w)
err = yaml.NewYAMLOrJSONDecoder(r, 1000).Decode(&spec)
if err != nil {

@@ -10,16 +10,12 @@ groups:
runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors"
expr: |
sum(
rate(
loki_request_duration_seconds_count{status_code=~"5.."}[1m]
)
) by (namespace, job, route)
job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."}
) by (job, namespace, route)
/
sum(
rate(
loki_request_duration_seconds_count[1m]
)
) by (namespace, job, route)
job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m
) by (job, namespace, route)
* 100
> 10
for: 15m
@@ -28,21 +24,17 @@ groups:
- alert: LokiStackWriteRequestErrors
annotations:
message: |-
{{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} are returned with server errors.
{{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors.
summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors."
runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors"
expr: |
sum(
rate(
http_requests_total{code=~"5..", group="logsv1", handler="push"}[1m]
)
) by (namespace, job, tenant)
code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"}
) by (job, namespace)
/
sum(
rate(
http_requests_total{group="logsv1", handler="push"}[1m]
)
) by (namespace, job, tenant)
code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"}
) by (job, namespace)
* 100
> 10
for: 15m
@@ -51,21 +43,17 @@ groups:
- alert: LokiStackReadRequestErrors
annotations:
message: |-
{{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} are returned with server errors.
{{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors.
summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors."
runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors"
expr: |
sum(
rate(
http_requests_total{code=~"5..", group="logsv1", handler=~"query|query_range|label|labels|label_values"}[1m]
)
) by (namespace, job, tenant)
code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"}
) by (job, namespace)
/
sum(
rate(
http_requests_total{group="logsv1", handler=~"query|query_range|label|labels|label_values"}[1m]
)
) by (namespace, job, tenant)
code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"}
) by (job, namespace)
* 100
> 10
for: 15m
@@ -82,7 +70,112 @@ groups:
increase(
loki_panic_total[10m]
)
) by (namespace, job)
) by (job, namespace)
> 0
labels:
severity: critical
severity: critical
- alert: LokiRequestLatency
annotations:
message: |-
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
summary: "The 99th percentile is experiencing high latency (higher than 1 second)."
runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency"
expr: |
histogram_quantile(0.99,
sum(
irate(
loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m]
)
) by (job, le, namespace, route)
)
* 100
> 1
for: 15m
labels:
severity: critical
- alert: LokiTenantRateLimit
annotations:
message: |-
{{ $labels.job }} {{ $labels.route }} is experiencing 429 errors.
summary: "At least 10% of requests are responded with the rate limit error code."
runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit"
expr: |
sum(
job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"}
) by (job, namespace, route)
/
sum(
job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m
) by (job, namespace, route)
* 100
> 10
for: 15m
labels:
severity: warning
- alert: LokiStorageSlowWrite
annotations:
message: |-
The storage path is experiencing slow write response rates.
summary: "The storage path is experiencing slow write response rates."
runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write"
expr: |
histogram_quantile(0.99,
sum(
job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"}
) by (job, le, namespace)
)
* 100
> 1
for: 15m
labels:
severity: warning
- alert: LokiStorageSlowRead
annotations:
message: |-
The storage path is experiencing slow read response rates.
summary: "The storage path is experiencing slow read response rates."
runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read"
expr: |
histogram_quantile(0.99,
sum(
job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"}
) by (job, le, namespace)
)
* 100
> 5
for: 15m
labels:
severity: warning
- alert: LokiWritePathHighLoad
annotations:
message: |-
The write path is experiencing high load.
summary: "The write path is experiencing high load, causing backpressure storage flushing."
runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load"
expr: |
sum(
loki_ingester_wal_replay_flushing
) by (job, namespace)
> 0
for: 15m
labels:
severity: warning
- alert: LokiReadPathHighLoad
annotations:
message: |-
The read path is experiencing high load.
summary: "The read path has high volume of queries, causing longer response times."
runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load"
expr: |
histogram_quantile(0.99,
sum(
rate(
loki_logql_querystats_latency_seconds_bucket[5m]
)
) by (job, le, namespace)
)
* 100
> 30
for: 15m
labels:
severity: warning

@@ -0,0 +1,25 @@
---
groups:
- name: logging_loki.rules
rules:
- record: code_handler_job_namespace:lokistack_gateway_http_requests:irate1m
expr: |
sum(
irate(
http_requests_total{container="gateway", group="logsv1"}[1m]
)
) by (code, handler, job, namespace)
- record: job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m
expr: |
sum(
irate(
loki_request_duration_seconds_count[1m]
)
) by (job, namespace, route, status_code)
- record: job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m
expr: |
sum(
rate(
loki_boltdb_shipper_request_duration_seconds_bucket[5m]
)
) by (job, le, namespace, operation)

@@ -7,23 +7,62 @@ tests:
- interval: 1m
input_series:
- series: 'loki_request_duration_seconds_count{status_code="500", namespace="my-ns", job="ingester", route="my-route"}'
- series: 'job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="500", namespace="my-ns", job="ingester", route="my-route"}'
values: '1+1x20'
- series: 'loki_request_duration_seconds_count{status_code="200", namespace="my-ns", job="ingester", route="my-route"}'
- series: 'job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429", namespace="my-ns", job="ingester", route="my-route"}'
values: '1+1x20'
- series: 'job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="200", namespace="my-ns", job="ingester", route="my-route"}'
values: '1+3x20'
- series: 'http_requests_total{code="500", namespace="my-ns", job="gateway", handler="push", group="logsv1"}'
- series: 'code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code="500", namespace="my-ns", job="gateway", handler="push"}'
values: '1+1x20'
- series: 'http_requests_total{code="200", namespace="my-ns", job="gateway", handler="push", group="logsv1"}'
- series: 'code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code="200", namespace="my-ns", job="gateway", handler="push"}'
values: '1+3x20'
- series: 'http_requests_total{code="500", namespace="my-ns", job="gateway", handler="query", group="logsv1"}'
- series: 'code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code="500", namespace="my-ns", job="gateway", handler="query"}'
values: '1+1x20'
- series: 'http_requests_total{code="200", namespace="my-ns", job="gateway", handler="query", group="logsv1"}'
- series: 'code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code="200", namespace="my-ns", job="gateway", handler="query"}'
values: '1+3x20'
- series: 'loki_panic_total{namespace="my-ns", job="ingester"}'
values: '0 1 1 2+0x10'
# Unit test for alerting rules.
- series: 'loki_ingester_wal_replay_flushing{namespace="my-ns", job="ingester"}'
values: '0 1+0x20'
- series: 'loki_request_duration_seconds_bucket{namespace="my-ns", job="ingester", route="my-route", le="1"}'
values: '0+10x20'
- series: 'loki_request_duration_seconds_bucket{namespace="my-ns", job="ingester", route="my-route", le="5"}'
values: '0+50x20'
- series: 'loki_request_duration_seconds_bucket{namespace="my-ns", job="ingester", route="my-route", le="10"}'
values: '0+100x20'
- series: 'loki_request_duration_seconds_bucket{namespace="my-ns", job="ingester", route="my-route", le="+Inf"}'
values: '0+100x20'
- series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="ingester", operation="WRITE", le="1"}'
values: '0+10x20'
- series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="ingester", operation="WRITE", le="5"}'
values: '0+50x20'
- series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="ingester", operation="WRITE", le="10"}'
values: '0+100x20'
- series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="ingester", operation="WRITE", le="+Inf"}'
values: '0+100x20'
- series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="querier", operation="Shipper.Query", le="1"}'
values: '0+10x20'
- series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="querier", operation="Shipper.Query", le="5"}'
values: '0+50x20'
- series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="querier", operation="Shipper.Query", le="10"}'
values: '0+100x20'
- series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="querier", operation="Shipper.Query", le="+Inf"}'
values: '0+100x20'
- series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="1"}'
values: '0+10x20'
- series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="5"}'
values: '0+50x20'
- series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="10"}'
values: '0+100x20'
- series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="+Inf"}'
values: '0+100x20'
alert_rule_test:
- eval_time: 16m
alertname: LokiRequestErrors
@@ -35,7 +74,7 @@ tests:
severity: critical
exp_annotations:
summary: "At least 10% of requests are responded by 5xx server errors."
message: "ingester my-route is experiencing 25.00% errors."
message: "ingester my-route is experiencing 20.48% errors."
runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors"
- eval_time: 16m
alertname: LokiStackWriteRequestErrors
@@ -46,7 +85,7 @@ tests:
severity: critical
exp_annotations:
summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors."
message: "25.00% of write requests from gateway are returned with server errors."
message: "25.76% of write requests from gateway in my-ns are returned with server errors."
runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors"
- eval_time: 16m
alertname: LokiStackReadRequestErrors
@@ -57,7 +96,7 @@ tests:
severity: critical
exp_annotations:
summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors."
message: "25.00% of query requests from gateway are returned with server errors."
message: "25.76% of query requests from gateway in my-ns are returned with server errors."
runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors"
- eval_time: 10m
alertname: LokiRequestPanics
@@ -70,3 +109,71 @@ tests:
summary: "A panic was triggered."
message: "ingester is experiencing an increase of 2 panics."
runbook_url: "[[ .RunbookURL ]]#Loki-Request-Panics"
- eval_time: 16m
alertname: LokiRequestLatency
exp_alerts:
- exp_labels:
namespace: my-ns
job: ingester
route: my-route
severity: critical
exp_annotations:
summary: "The 99th percentile is experiencing high latency (higher than 1 second)."
message: "ingester my-route is experiencing 990.00s 99th percentile latency."
runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency"
- eval_time: 16m
alertname: LokiTenantRateLimit
exp_alerts:
- exp_labels:
namespace: my-ns
job: ingester
route: my-route
severity: warning
exp_annotations:
summary: "At least 10% of requests are responded with the rate limit error code."
message: "ingester my-route is experiencing 429 errors."
runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit"
- eval_time: 16m
alertname: LokiStorageSlowWrite
exp_alerts:
- exp_labels:
namespace: my-ns
job: ingester
severity: warning
exp_annotations:
summary: "The storage path is experiencing slow write response rates."
message: "The storage path is experiencing slow write response rates."
runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write"
- eval_time: 16m
alertname: LokiStorageSlowRead
exp_alerts:
- exp_labels:
namespace: my-ns
job: querier
severity: warning
exp_annotations:
summary: "The storage path is experiencing slow read response rates."
message: "The storage path is experiencing slow read response rates."
runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read"
- eval_time: 16m
alertname: LokiWritePathHighLoad
exp_alerts:
- exp_labels:
namespace: my-ns
job: ingester
severity: warning
exp_annotations:
summary: "The write path is experiencing high load, causing backpressure storage flushing."
message: "The write path is experiencing high load."
runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load"
- eval_time: 16m
alertname: LokiReadPathHighLoad
exp_alerts:
- exp_labels:
namespace: my-ns
job: querier
severity: warning
exp_annotations:
summary: "The read path has high volume of queries, causing longer response times."
message: "The read path is experiencing high load."
runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load"
