diff --git a/operator/CHANGELOG.md b/operator/CHANGELOG.md
index 915286ea45..92ed101eef 100644
--- a/operator/CHANGELOG.md
+++ b/operator/CHANGELOG.md
@@ -1,5 +1,6 @@
 ## Main
+- [6951](https://github.com/grafana/loki/pull/6951) **Red-GV**: Adding operational LokiStack alerts
 - [7254](https://github.com/grafana/loki/pull/7254) **periklis**: Expose Loki Ruler API via the lokistack-gateway
 - [7214](https://github.com/grafana/loki/pull/7214) **periklis**: Fix ruler GRPC tls client configuration
 - [7201](https://github.com/grafana/loki/pull/7201) **xperimental**: Write configuration for per-tenant retention
diff --git a/operator/Makefile b/operator/Makefile
index 96fe91abf5..c8da06cb18 100644
--- a/operator/Makefile
+++ b/operator/Makefile
@@ -154,7 +154,7 @@ lint: $(GOLANGCI_LINT) | generate ## Run golangci-lint on source code.
 
 .PHONY: lint-prometheus
 lint-prometheus: $(PROMTOOL) ## Run promtool check against recording rules and alerts.
-	@$(PROMTOOL) check rules ./internal/manifests/internal/alerts/prometheus-alerts.yaml
+	@$(PROMTOOL) check rules ./internal/manifests/internal/alerts/prometheus-*.yaml
 
 .PHONY: fmt
 fmt: $(GOFUMPT) ## Run gofumpt on source code.
diff --git a/operator/docs/lokistack/sop.md b/operator/docs/lokistack/sop.md
index 9f2cdaa8b2..2e27b5a6e3 100644
--- a/operator/docs/lokistack/sop.md
+++ b/operator/docs/lokistack/sop.md
@@ -136,3 +136,173 @@ A service(s) has crashed.
 
 - Check the logs of the service that is panicking
 - Examine metrics for signs of failure
+
+## Loki Request Latency
+
+### Impact
+
+A service(s) is affected by slow request responses.
+
+### Summary
+
+A service(s) is slower than expected at processing data.
+
+### Severity
+
+`Critical`
+
+### Access Required
+
+- Console access to the cluster
+- Edit access to the deployed operator and Loki namespace:
+  - OpenShift
+    - `openshift-logging` (LokiStack)
+    - `openshift-operators-redhat` (Loki Operator)
+
+### Steps
+
+- Check the logs of all the services
+- Check to ensure that the Loki components can reach the storage
+  - Particularly for queriers, examine metrics for a small query queue: `cortex_query_scheduler_inflight_requests`
+
+## Loki Tenant Rate Limit
+
+### Impact
+
+A tenant is being rate limited, resulting in potential loss of data.
+
+### Summary
+
+A service(s) is rate limiting at least 10% of all incoming requests.
+
+### Severity
+
+`Warning`
+
+### Access Required
+
+- Console access to the cluster
+- Edit access to the deployed operator and Loki namespace:
+  - OpenShift
+    - `openshift-logging` (LokiStack)
+    - `openshift-operators-redhat` (Loki Operator)
+
+### Steps
+
+- Examine the metrics for the reason and tenant that is being limited: `loki_discarded_samples_total{namespace=""}`
+- Increase the limits allocated to the tenant in the LokiStack CRD
+  - For ingestion limits, please consult the table below
+  - For query limits, the `MaxEntriesLimitPerQuery`, `MaxChunksPerQuery`, or `MaxQuerySeries` can be changed to raise the limit
+
+| Reason | Corresponding Ingestion Limit Keys |
+| --- | --- |
+| `rate_limited` | `ingestionRate`, `ingestionBurstSize` |
+| `stream_limit` | `maxGlobalStreamsPerTenant` |
+| `label_name_too_long` | `maxLabelNameLength` |
+| `label_value_too_long` | `maxLabelValueLength` |
+| `line_too_long` | `maxLineSize` |
+| `max_label_names_per_series` | `maxLabelNamesPerSeries` |
+
+## Loki Storage Slow Write
+
+### Impact
+
+The cluster is unable to push logs to backend storage in a timely manner.
+
+### Summary
+
+The cluster is unable to push logs to backend storage in a timely manner.
+
+### Severity
+
+`Warning`
+
+### Access Required
+
+- Console access to the cluster
+- Edit access to the deployed operator and Loki namespace:
+  - OpenShift
+    - `openshift-logging` (LokiStack)
+    - `openshift-operators-redhat` (Loki Operator)
+
+### Steps
+
+- Ensure that the cluster can communicate with the backend storage
+
+## Loki Storage Slow Read
+
+### Impact
+
+The cluster is unable to retrieve logs from backend storage in a timely manner.
+
+### Summary
+
+The cluster is unable to retrieve logs from backend storage in a timely manner.
+
+### Severity
+
+`Warning`
+
+### Access Required
+
+- Console access to the cluster
+- Edit access to the deployed operator and Loki namespace:
+  - OpenShift
+    - `openshift-logging` (LokiStack)
+    - `openshift-operators-redhat` (Loki Operator)
+
+### Steps
+
+- Ensure that the cluster can communicate with the backend storage
+
+## Loki Write Path High Load
+
+### Impact
+
+The write path is under high pressure and requires a storage flush.
+
+### Summary
+
+The write path is flushing the storage in response to back-pressure.
+
+### Severity
+
+`Warning`
+
+### Access Required
+
+- Console access to the cluster
+- Edit access to the deployed operator and Loki namespace:
+  - OpenShift
+    - `openshift-logging` (LokiStack)
+    - `openshift-operators-redhat` (Loki Operator)
+
+### Steps
+
+- Adjust the ingestion limits for the affected tenant or increase the number of ingesters
+
+## Loki Read Path High Load
+
+### Impact
+
+The read path is under high load.
+
+### Summary
+
+The query queue is currently under high load.
+
+### Severity
+
+`Warning`
+
+### Access Required
+
+- Console access to the cluster
+- Edit access to the deployed operator and Loki namespace:
+  - OpenShift
+    - `openshift-logging` (LokiStack)
+    - `openshift-operators-redhat` (Loki Operator)
+
+### Steps
+
+- Increase the number of queriers
diff --git a/operator/internal/manifests/internal/alerts/build.go b/operator/internal/manifests/internal/alerts/build.go
index 4e860664f2..4ff9376baa 100644
--- a/operator/internal/manifests/internal/alerts/build.go
+++ b/operator/internal/manifests/internal/alerts/build.go
@@ -20,21 +20,43 @@ var (
 	//go:embed prometheus-alerts.yaml
 	alertsYAMLTmplFile embed.FS
 
+	//go:embed prometheus-rules.yaml
+	rulesYAMLTmplFile embed.FS
+
 	alertsYAMLTmpl = template.Must(template.New("").Delims("[[", "]]").ParseFS(alertsYAMLTmplFile, "prometheus-alerts.yaml"))
+
+	rulesYAMLTmpl = template.Must(template.New("").Delims("[[", "]]").ParseFS(rulesYAMLTmplFile, "prometheus-rules.yaml"))
 )
 
 // Build creates Prometheus alerts for the Loki stack
 func Build(opts Options) (*monitoringv1.PrometheusRuleSpec, error) {
+	alerts, err := ruleSpec("prometheus-alerts.yaml", alertsYAMLTmpl, opts)
+	if err != nil {
+		return nil, kverrors.Wrap(err, "failed to create prometheus alerts")
+	}
+
+	recordingRules, err := ruleSpec("prometheus-rules.yaml", rulesYAMLTmpl, opts)
+	if err != nil {
+		return nil, kverrors.Wrap(err, "failed to create prometheus rules")
+	}
+
+	spec := alerts.DeepCopy()
+	spec.Groups = append(alerts.Groups, recordingRules.Groups...)
+
+	return spec, nil
+}
+
+func ruleSpec(file string, tmpl *template.Template, opts Options) (*monitoringv1.PrometheusRuleSpec, error) {
 	spec := monitoringv1.PrometheusRuleSpec{}
 
-	// Build alerts yaml
 	w := bytes.NewBuffer(nil)
-	err := alertsYAMLTmpl.ExecuteTemplate(w, "prometheus-alerts.yaml", opts)
+	err := tmpl.ExecuteTemplate(w, file, opts)
 	if err != nil {
-		return nil, kverrors.Wrap(err, "failed to create prometheus alerts")
+		return nil, kverrors.Wrap(err, "failed to execute template",
+			"template", file,
+		)
 	}
 
-	// Decode the spec
 	r := io.Reader(w)
 	err = yaml.NewYAMLOrJSONDecoder(r, 1000).Decode(&spec)
 	if err != nil {
diff --git a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml
index 0e408200b1..21650a4e1e 100644
--- a/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml
+++ b/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml
@@ -10,16 +10,12 @@ groups:
       runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors"
     expr: |
       sum(
-        rate(
-          loki_request_duration_seconds_count{status_code=~"5.."}[1m]
-        )
-      ) by (namespace, job, route)
+        job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."}
+      ) by (job, namespace, route)
       /
       sum(
-        rate(
-          loki_request_duration_seconds_count[1m]
-        )
-      ) by (namespace, job, route)
+        job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m
+      ) by (job, namespace, route)
       * 100
       > 10
     for: 15m
@@ -28,21 +24,17 @@ groups:
   - alert: LokiStackWriteRequestErrors
     annotations:
       message: |-
-        {{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} are returned with server errors.
+        {{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors.
       summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors."
       runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors"
     expr: |
       sum(
-        rate(
-          http_requests_total{code=~"5..", group="logsv1", handler="push"}[1m]
-        )
-      ) by (namespace, job, tenant)
+        code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"}
+      ) by (job, namespace)
       /
       sum(
-        rate(
-          http_requests_total{group="logsv1", handler="push"}[1m]
-        )
-      ) by (namespace, job, tenant)
+        code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"}
+      ) by (job, namespace)
       * 100
       > 10
     for: 15m
@@ -51,21 +43,17 @@ groups:
   - alert: LokiStackReadRequestErrors
     annotations:
      message: |-
-        {{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} are returned with server errors.
+        {{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors.
       summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors."
runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors" expr: | sum( - rate( - http_requests_total{code=~"5..", group="logsv1", handler=~"query|query_range|label|labels|label_values"}[1m] - ) - ) by (namespace, job, tenant) + code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"} + ) by (job, namespace) / sum( - rate( - http_requests_total{group="logsv1", handler=~"query|query_range|label|labels|label_values"}[1m] - ) - ) by (namespace, job, tenant) + code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"} + ) by (job, namespace) * 100 > 10 for: 15m @@ -82,7 +70,112 @@ groups: increase( loki_panic_total[10m] ) - ) by (namespace, job) + ) by (job, namespace) > 0 labels: - severity: critical + severity: critical + - alert: LokiRequestLatency + annotations: + message: |- + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: "The 99th percentile is experiencing high latency (higher than 1 second)." + runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency" + expr: | + histogram_quantile(0.99, + sum( + irate( + loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m] + ) + ) by (job, le, namespace, route) + ) + * 100 + > 1 + for: 15m + labels: + severity: critical + - alert: LokiTenantRateLimit + annotations: + message: |- + {{ $labels.job }} {{ $labels.route }} is experiencing 429 errors. + summary: "At least 10% of requests are responded with the rate limit error code." + runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit" + expr: | + sum( + job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"} + ) by (job, namespace, route) + / + sum( + job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m + ) by (job, namespace, route) + * 100 + > 10 + for: 15m + labels: + severity: warning + - alert: LokiStorageSlowWrite + annotations: + message: |- + The storage path is experiencing slow write response rates. + summary: "The storage path is experiencing slow write response rates." + runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write" + expr: | + histogram_quantile(0.99, + sum( + job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"} + ) by (job, le, namespace) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + - alert: LokiStorageSlowRead + annotations: + message: |- + The storage path is experiencing slow read response rates. + summary: "The storage path is experiencing slow read response rates." + runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read" + expr: | + histogram_quantile(0.99, + sum( + job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"} + ) by (job, le, namespace) + ) + * 100 + > 5 + for: 15m + labels: + severity: warning + - alert: LokiWritePathHighLoad + annotations: + message: |- + The write path is experiencing high load. + summary: "The write path is experiencing high load, causing backpressure storage flushing." + runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load" + expr: | + sum( + loki_ingester_wal_replay_flushing + ) by (job, namespace) + > 0 + for: 15m + labels: + severity: warning + - alert: LokiReadPathHighLoad + annotations: + message: |- + The read path is experiencing high load. 
+ summary: "The read path has high volume of queries, causing longer response times." + runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load" + expr: | + histogram_quantile(0.99, + sum( + rate( + loki_logql_querystats_latency_seconds_bucket[5m] + ) + ) by (job, le, namespace) + ) + * 100 + > 30 + for: 15m + labels: + severity: warning diff --git a/operator/internal/manifests/internal/alerts/prometheus-rules.yaml b/operator/internal/manifests/internal/alerts/prometheus-rules.yaml new file mode 100644 index 0000000000..f298faf23d --- /dev/null +++ b/operator/internal/manifests/internal/alerts/prometheus-rules.yaml @@ -0,0 +1,25 @@ +--- +groups: +- name: logging_loki.rules + rules: + - record: code_handler_job_namespace:lokistack_gateway_http_requests:irate1m + expr: | + sum( + irate( + http_requests_total{container="gateway", group="logsv1"}[1m] + ) + ) by (code, handler, job, namespace) + - record: job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m + expr: | + sum( + irate( + loki_request_duration_seconds_count[1m] + ) + ) by (job, namespace, route, status_code) + - record: job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m + expr: | + sum( + rate( + loki_boltdb_shipper_request_duration_seconds_bucket[5m] + ) + ) by (job, le, namespace, operation) diff --git a/operator/internal/manifests/internal/alerts/testdata/test.yaml b/operator/internal/manifests/internal/alerts/testdata/test.yaml index 6de3c7266f..d5f800bac2 100644 --- a/operator/internal/manifests/internal/alerts/testdata/test.yaml +++ b/operator/internal/manifests/internal/alerts/testdata/test.yaml @@ -7,23 +7,62 @@ tests: - interval: 1m input_series: - - series: 'loki_request_duration_seconds_count{status_code="500", namespace="my-ns", job="ingester", route="my-route"}' + - series: 'job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="500", namespace="my-ns", job="ingester", route="my-route"}' values: '1+1x20' - - series: 'loki_request_duration_seconds_count{status_code="200", namespace="my-ns", job="ingester", route="my-route"}' + - series: 'job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429", namespace="my-ns", job="ingester", route="my-route"}' + values: '1+1x20' + - series: 'job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="200", namespace="my-ns", job="ingester", route="my-route"}' values: '1+3x20' - - series: 'http_requests_total{code="500", namespace="my-ns", job="gateway", handler="push", group="logsv1"}' + - series: 'code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code="500", namespace="my-ns", job="gateway", handler="push"}' values: '1+1x20' - - series: 'http_requests_total{code="200", namespace="my-ns", job="gateway", handler="push", group="logsv1"}' + - series: 'code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code="200", namespace="my-ns", job="gateway", handler="push"}' values: '1+3x20' - - series: 'http_requests_total{code="500", namespace="my-ns", job="gateway", handler="query", group="logsv1"}' + - series: 'code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code="500", namespace="my-ns", job="gateway", handler="query"}' values: '1+1x20' - - series: 'http_requests_total{code="200", namespace="my-ns", job="gateway", handler="query", group="logsv1"}' + - series: 'code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code="200", namespace="my-ns", job="gateway", handler="query"}' 
         values: '1+3x20'
 
       - series: 'loki_panic_total{namespace="my-ns", job="ingester"}'
         values: '0 1 1 2+0x10'
 
-    # Unit test for alerting rules.
+      - series: 'loki_ingester_wal_replay_flushing{namespace="my-ns", job="ingester"}'
+        values: '0 1+0x20'
+
+      - series: 'loki_request_duration_seconds_bucket{namespace="my-ns", job="ingester", route="my-route", le="1"}'
+        values: '0+10x20'
+      - series: 'loki_request_duration_seconds_bucket{namespace="my-ns", job="ingester", route="my-route", le="5"}'
+        values: '0+50x20'
+      - series: 'loki_request_duration_seconds_bucket{namespace="my-ns", job="ingester", route="my-route", le="10"}'
+        values: '0+100x20'
+      - series: 'loki_request_duration_seconds_bucket{namespace="my-ns", job="ingester", route="my-route", le="+Inf"}'
+        values: '0+100x20'
+
+      - series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="ingester", operation="WRITE", le="1"}'
+        values: '0+10x20'
+      - series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="ingester", operation="WRITE", le="5"}'
+        values: '0+50x20'
+      - series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="ingester", operation="WRITE", le="10"}'
+        values: '0+100x20'
+      - series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="ingester", operation="WRITE", le="+Inf"}'
+        values: '0+100x20'
+      - series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="querier", operation="Shipper.Query", le="1"}'
+        values: '0+10x20'
+      - series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="querier", operation="Shipper.Query", le="5"}'
+        values: '0+50x20'
+      - series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="querier", operation="Shipper.Query", le="10"}'
+        values: '0+100x20'
+      - series: 'job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{namespace="my-ns", job="querier", operation="Shipper.Query", le="+Inf"}'
+        values: '0+100x20'
+
+      - series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="1"}'
+        values: '0+10x20'
+      - series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="5"}'
+        values: '0+50x20'
+      - series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="10"}'
+        values: '0+100x20'
+      - series: 'loki_logql_querystats_latency_seconds_bucket{namespace="my-ns", job="querier", route="my-route", le="+Inf"}'
+        values: '0+100x20'
+
     alert_rule_test:
       - eval_time: 16m
        alertname: LokiRequestErrors
@@ -35,7 +74,7 @@ tests:
           severity: critical
         exp_annotations:
           summary: "At least 10% of requests are responded by 5xx server errors."
-          message: "ingester my-route is experiencing 25.00% errors."
+          message: "ingester my-route is experiencing 20.48% errors."
           runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors"
       - eval_time: 16m
         alertname: LokiStackWriteRequestErrors
@@ -46,7 +85,7 @@ tests:
           severity: critical
         exp_annotations:
           summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors."
-          message: "25.00% of write requests from gateway are returned with server errors."
+ message: "25.76% of write requests from gateway in my-ns are returned with server errors." runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors" - eval_time: 16m alertname: LokiStackReadRequestErrors @@ -57,7 +96,7 @@ tests: severity: critical exp_annotations: summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors." - message: "25.00% of query requests from gateway are returned with server errors." + message: "25.76% of query requests from gateway in my-ns are returned with server errors." runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors" - eval_time: 10m alertname: LokiRequestPanics @@ -70,3 +109,71 @@ tests: summary: "A panic was triggered." message: "ingester is experiencing an increase of 2 panics." runbook_url: "[[ .RunbookURL ]]#Loki-Request-Panics" + - eval_time: 16m + alertname: LokiRequestLatency + exp_alerts: + - exp_labels: + namespace: my-ns + job: ingester + route: my-route + severity: critical + exp_annotations: + summary: "The 99th percentile is experiencing high latency (higher than 1 second)." + message: "ingester my-route is experiencing 990.00s 99th percentile latency." + runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency" + - eval_time: 16m + alertname: LokiTenantRateLimit + exp_alerts: + - exp_labels: + namespace: my-ns + job: ingester + route: my-route + severity: warning + exp_annotations: + summary: "At least 10% of requests are responded with the rate limit error code." + message: "ingester my-route is experiencing 429 errors." + runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit" + - eval_time: 16m + alertname: LokiStorageSlowWrite + exp_alerts: + - exp_labels: + namespace: my-ns + job: ingester + severity: warning + exp_annotations: + summary: "The storage path is experiencing slow write response rates." + message: "The storage path is experiencing slow write response rates." + runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write" + - eval_time: 16m + alertname: LokiStorageSlowRead + exp_alerts: + - exp_labels: + namespace: my-ns + job: querier + severity: warning + exp_annotations: + summary: "The storage path is experiencing slow read response rates." + message: "The storage path is experiencing slow read response rates." + runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read" + - eval_time: 16m + alertname: LokiWritePathHighLoad + exp_alerts: + - exp_labels: + namespace: my-ns + job: ingester + severity: warning + exp_annotations: + summary: "The write path is experiencing high load, causing backpressure storage flushing." + message: "The write path is experiencing high load." + runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load" + - eval_time: 16m + alertname: LokiReadPathHighLoad + exp_alerts: + - exp_labels: + namespace: my-ns + job: querier + severity: warning + exp_annotations: + summary: "The read path has high volume of queries, causing longer response times." + message: "The read path is experiencing high load." + runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load"