diff --git a/.github/workflows/helm-ci.yml b/.github/workflows/helm-ci.yml
index dab5e26ecf..dbd4350252 100644
--- a/.github/workflows/helm-ci.yml
+++ b/.github/workflows/helm-ci.yml
@@ -24,6 +24,9 @@ jobs:
             exit 1
           fi
 
+      - name: Lint Yaml
+        run: make helm-lint
+
       - name: Lint Code Base
         uses: docker://github/super-linter:v3.12.0
         env:
diff --git a/Makefile b/Makefile
index 22783485ee..70dd245b1f 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,7 @@
 .PHONY: validate-example-configs generate-example-config-doc check-example-config-doc
 .PHONY: clean clean-protos
 .PHONY: k3d-loki k3d-enterprise-logs k3d-down
+.PHONY: helm-test helm-lint
 
 SHELL = /usr/bin/env bash -o pipefail
 
@@ -160,7 +161,7 @@ cmd/loki-canary/loki-canary:
 	CGO_ENABLED=0 go build $(GO_FLAGS) -o $@ ./$(@D)
 
 ###############
-# Helm-Test   #
+# Helm        #
 ###############
 .PHONY: production/helm/loki/src/helm-test/helm-test
 helm-test: production/helm/loki/src/helm-test/helm-test
@@ -169,6 +170,9 @@ helm-test: production/helm/loki/src/helm-test/helm-test
 production/helm/loki/src/helm-test/helm-test:
 	CGO_ENABLED=0 go test $(GO_FLAGS) --tags=helm_test -c -o $@ ./$(@D)
 
+helm-lint:
+	$(MAKE) -BC production/helm/loki lint
+
 #################
 # Loki-QueryTee #
 #################
diff --git a/production/helm/loki/Makefile b/production/helm/loki/Makefile
new file mode 100644
index 0000000000..4b56414df7
--- /dev/null
+++ b/production/helm/loki/Makefile
@@ -0,0 +1,13 @@
+# Lint entry points for the Loki Helm chart.
+# `all` is the default goal and runs every lint target.
+.DEFAULT_GOAL := all
+.PHONY: all lint lint-yaml
+
+all: lint
+
+# Aggregate target: every individual linter is a prerequisite of `lint`.
+lint: lint-yaml
+
+# Run yamllint (must be on PATH) over src/ using the rules in src/.yamllint.yaml.
+lint-yaml:
+	yamllint -c $(CURDIR)/src/.yamllint.yaml $(CURDIR)/src
diff --git a/production/helm/loki/src/.yamllint.yaml b/production/helm/loki/src/.yamllint.yaml
new file mode 100644
index 0000000000..19e5933aca
--- /dev/null
+++ b/production/helm/loki/src/.yamllint.yaml
@@ -0,0 +1,4 @@
+---
+rules:
+  quoted-strings:
+    required: true
diff --git a/production/helm/loki/src/alerts.yaml b/production/helm/loki/src/alerts.yaml
index b1e9687762..e2dbdcb573 100644
--- a/production/helm/loki/src/alerts.yaml
+++ b/production/helm/loki/src/alerts.yaml
@@ -1,52 +1,53 @@
+---
 groups:
-- name: loki_alerts
-  rules:
-  - alert: LokiRequestErrors
-    annotations:
-      message: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
-    expr: |
-      100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
-        /
-      sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
-        > 10
-    for: 15m
-    labels:
-      severity: critical
-  - alert: LokiRequestPanics
-    annotations:
-      message: |
-        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
-    expr: |
-      sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
-    labels:
-      severity: critical
-  - alert: LokiRequestLatency
-    annotations:
-      message: |
-        {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
-    expr: |
-      namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
-    for: 15m
-    labels:
-      severity: critical
-  - alert: LokiTooManyCompactorsRunning
-    annotations:
-      message: |
-        {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
-    expr: |
-      sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
-    for: 5m
-    labels:
-      severity: warning
-- name: 'loki_canaries_alerts'
-  rules:
-  - alert: 'LokiCanaryLatency'
-    annotations:
-      message: |
-        {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
-    expr: |
-      histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
-    for: '15m'
-    labels:
-      severity: 'warning'
+  - name: "loki_alerts"
+    rules:
+      - alert: "LokiRequestErrors"
+        annotations:
+          message: |
+            {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+        expr: |
+          100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+            /
+          sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
+            > 10
+        for: "15m"
+        labels:
+          severity: "critical"
+      - alert: "LokiRequestPanics"
+        annotations:
+          message: |
+            {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
+        expr: |
+          sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+        labels:
+          severity: "critical"
+      - alert: "LokiRequestLatency"
+        annotations:
+          message: |
+            {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+        expr: |
+          namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
+        for: "15m"
+        labels:
+          severity: "critical"
+      - alert: "LokiTooManyCompactorsRunning"
+        annotations:
+          message: |
+            {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
+        expr: |
+          sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
+        for: "5m"
+        labels:
+          severity: "warning"
+  - name: "loki_canaries_alerts"
+    rules:
+      - alert: "LokiCanaryLatency"
+        annotations:
+          message: |
+            {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+        expr: |
+          histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
+        for: "15m"
+        labels:
+          severity: "warning"