Helm: add additionalRuleLabels to PrometheusRule alerts (#9020)

**What this PR does / why we need it**:

Adds the ability to specify additional labels on the Prometheus alerts generated by the chart.
Specifying:
```yaml
monitoring:
  rules:
    additionalRuleLabels:
      custom_label: custom_value
```

will add the label to every alert in the generated `PrometheusRule`:
```yaml
- alert: LokiRequestErrors
  annotations: ...
  expr: ...
  for: 15m
  labels:
    severity: critical
    custom_label: custom_value
```
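
A typical use for this is attaching labels that Alertmanager routing can match on. For example (the label names and values below are purely illustrative):

```yaml
monitoring:
  rules:
    additionalRuleLabels:
      team: platform
      environment: production
```

Since the value is a plain map, any number of labels can be set, and they are applied uniformly to every alert in the generated `PrometheusRule`.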

This approach is inspired by
[kube-prometheus-stack/rules](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/general.rules.yaml),
though in their case
[sync_prometheus_rules.py](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/hack/sync_prometheus_rules.py)
generates the `tpl` file automatically.

I can understand if this manual approach is not the one you want to take, since editing the
tpl by hand, adding the labels to every alert, and escaping the Go template `{{ }}` delimiters
is quite painful.
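
For reference, the escaping in question looks like this: every Prometheus `{{ ... }}` expression in the alert annotations has to be wrapped in backtick-quoted literals so that Helm's `tpl` emits it verbatim instead of trying to evaluate it. The message below is one of this chart's alerts, shown in both forms:

```yaml
# alerts.yaml (plain file, not run through tpl):
message: |
  {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.

# alerts.yaml.tpl (run through tpl) -- same rendered output, escaped source:
message: |
  {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics.
```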

**Which issue(s) this PR fixes**:
N/A

**Special notes for your reviewer**:

**Checklist**
- [x] Reviewed the
[`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md)
guide (**required**)
- [x] Documentation added
- [x] Tests updated
- [ ] `CHANGELOG.md` updated
- [x] Changes that require user attention or interaction to upgrade are
documented in `docs/sources/upgrading/_index.md`
Files changed:

1. `docs/sources/installation/helm/reference.md` (9 changed lines)
2. `production/helm/loki/src/alerts.yaml.tpl` (25 changed lines)
3. `production/helm/loki/templates/monitoring/loki-alerts.yaml` (2 changed lines)
4. `production/helm/loki/values.yaml` (2 changed lines)

`docs/sources/installation/helm/reference.md`

@@ -2478,6 +2478,15 @@ null
<td><pre lang="json">
[]
</pre>
</td>
</tr>
+<tr>
+<td>monitoring.rules.additionalRuleLabels</td>
+<td>object</td>
+<td>Additional labels for PrometheusRule alerts</td>
+<td><pre lang="json">
+{}
+</pre>
+</td>
+</tr>
<tr>

`production/helm/loki/src/alerts.yaml.tpl`

@@ -5,7 +5,7 @@ groups:
      - alert: "LokiRequestErrors"
        annotations:
          message: |
-            {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+            {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
        expr: |
          100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
            /
@@ -14,40 +14,55 @@ groups:
        for: "15m"
        labels:
          severity: "critical"
+{{- if .Values.monitoring.rules.additionalRuleLabels }}
+{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+{{- end }}
      - alert: "LokiRequestPanics"
        annotations:
          message: |
-            {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+            {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics.
        expr: |
          sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
        labels:
          severity: "critical"
+{{- if .Values.monitoring.rules.additionalRuleLabels }}
+{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+{{- end }}
      - alert: "LokiRequestLatency"
        annotations:
          message: |
-            {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+            {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
        expr: |
          namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
        for: "15m"
        labels:
          severity: "critical"
+{{- if .Values.monitoring.rules.additionalRuleLabels }}
+{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+{{- end }}
      - alert: "LokiTooManyCompactorsRunning"
        annotations:
          message: |
-            {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
+            {{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
        expr: |
          sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
        for: "5m"
        labels:
          severity: "warning"
+{{- if .Values.monitoring.rules.additionalRuleLabels }}
+{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+{{- end }}
  - name: "loki_canaries_alerts"
    rules:
      - alert: "LokiCanaryLatency"
        annotations:
          message: |
-            {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+            {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
        expr: |
          histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
        for: "15m"
        labels:
          severity: "warning"
+{{- if .Values.monitoring.rules.additionalRuleLabels }}
+{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+{{- end }}
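
A note on the `indent 10` used above: `toYaml` renders the map starting at column 0, and `indent 10` shifts every line to the same column as `severity`, so with the example value from the description the fragment rendered from `alerts.yaml.tpl` looks roughly like this (sketch, not captured from a real render):

```yaml
        labels:
          severity: "critical"
          custom_label: custom_value
```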

`production/helm/loki/templates/monitoring/loki-alerts.yaml`

@@ -17,6 +17,6 @@ metadata:
  namespace: {{ .namespace | default $.Release.Namespace }}
spec:
  groups:
-{{- include "loki.ruleGroupToYaml" ($.Files.Get "src/alerts.yaml" | fromYaml).groups | indent 4 }}
+{{- include "loki.ruleGroupToYaml" (tpl ($.Files.Get "src/alerts.yaml.tpl") $ | fromYaml).groups | indent 4 }}
{{- end }}
{{- end }}
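
The key change here is piping the file through `tpl` before `fromYaml`, which is what lets `alerts.yaml.tpl` reference `.Values` (and what makes the escaping above necessary). A minimal sketch of the same `Files.Get` / `tpl` / `fromYaml` pattern, using a hypothetical file and value rather than the chart's real ones:

```yaml
{{- /*
  src/example.yaml.tpl (hypothetical) contains:
    threshold: {{ .Values.exampleThreshold }}
*/}}
{{- $parsed := tpl ($.Files.Get "src/example.yaml.tpl") $ | fromYaml }}
threshold: {{ $parsed.threshold }}
```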

`production/helm/loki/values.yaml`

@@ -539,6 +539,8 @@ monitoring:
    annotations: {}
    # -- Additional labels for the rules PrometheusRule resource
    labels: {}
+    # -- Additional labels for PrometheusRule alerts
+    additionalRuleLabels: {}
    # -- Additional groups to add to the rules file
    additionalGroups: []
    # - name: additional-loki-rules
