Split the Promtail monitoring definitions out of production/loki-mixin into a
standalone production/promtail-mixin (alerts, dashboard, recording rules, plus
a jsonnetfile.json and a mixin.libsonnet entry point that imports all three).

The removed (-) lines reproduce the existing loki-mixin content unchanged so
the hunks still match. The added (+) content is the same set of definitions
with two corrections:

  * promtail-mixin/dashboards.libsonnet: the 'Active Files' panel used the
    legend 'Active Targets' (copied from the panel above it); the legend now
    reads 'Active Files'.
  * promtail-mixin/jsonnetfile.json now ends with a trailing newline.

The blob ids on the "index" lines of those two new files predate these
corrections; regenerate them before relying on a 3-way merge.

NOTE(review): PromtailRequestLatency's message interpolates $labels.route, but
its expression reads the job_status_code:...:99quantile recording rule, which
is built from ['job', 'status_code']. The route label is presumably empty
there - confirm against mixin-utils histogramRules before changing the text.

diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet
index 1ea5028246..a37ae62746 100644
--- a/production/loki-mixin/alerts.libsonnet
+++ b/production/loki-mixin/alerts.libsonnet
@@ -39,74 +39,6 @@
           },
         ],
       },
-      {
-        name: 'promtail_alerts',
-        rules: [
-          {
-            alert: 'PromtailRequestsErrors',
-            expr: |||
-              100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
-                /
-              sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
-                > 10
-            |||,
-            'for': '15m',
-            labels: {
-              severity: 'critical',
-            },
-            annotations: {
-              message: |||
-                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
-              |||,
-            },
-          },
-          {
-            alert: 'PromtailRequestLatency',
-            expr: |||
-              job_status_code:promtail_request_duration_seconds:99quantile > 1
-            |||,
-            'for': '15m',
-            labels: {
-              severity: 'critical',
-            },
-            annotations: {
-              message: |||
-                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
-              |||,
-            },
-          },
-          {
-            alert: 'PromtailFileLagging',
-            expr: |||
-              abs(promtail_file_bytes_total - promtail_read_bytes_total) > 100000
-            |||,
-            'for': '15m',
-            labels: {
-              severity: 'critical',
-            },
-            annotations: {
-              message: |||
-                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 100kb for more than 15m.
-              |||,
-            },
-          },
-          {
-            alert: 'PromtailFileMissing',
-            expr: |||
-              count by (path,instance,job) (promtail_file_bytes_total) unless count by (path,instance,job) (promtail_read_bytes_total)
-            |||,
-            'for': '15m',
-            labels: {
-              severity: 'critical',
-            },
-            annotations: {
-              message: |||
-                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
-              |||,
-            },
-          },
-        ],
-      },
     ],
   },
 }
diff --git a/production/loki-mixin/dashboards.libsonnet b/production/loki-mixin/dashboards.libsonnet
index a9ac85c8e7..cf497a7ea1 100644
--- a/production/loki-mixin/dashboards.libsonnet
+++ b/production/loki-mixin/dashboards.libsonnet
@@ -141,56 +141,5 @@ local utils = import 'mixin-utils/utils.libsonnet';
           g.qpsPanel('loki_ingester_chunk_age_seconds_count{cluster="$cluster", job="$namespace/ingester"}'),
         ),
       ),
-
-    'promtail.json':
-      g.dashboard('Loki / Promtail')
-      .addTemplate('cluster', 'kube_pod_container_info{image=~".*promtail.*"}', 'cluster')
-      .addTemplate('namespace', 'kube_pod_container_info{image=~".*promtail.*"}', 'namespace')
-      .addRow(
-        g.row('Targets & Files')
-        .addPanel(
-          g.panel('Active Targets') +
-          g.queryPanel(
-            'sum(promtail_targets_active_total{cluster="$cluster", job="$namespace/promtail"})',
-            'Active Targets',
-          ),
-        )
-        .addPanel(
-          g.panel('Active Files') +
-          g.queryPanel(
-            'sum(promtail_files_active_total{cluster="$cluster", job="$namespace/promtail"})',
-            'Active Targets',
-          ),
-        )
-      )
-      .addRow(
-        g.row('IO')
-        .addPanel(
-          g.panel('Bps') +
-          g.queryPanel(
-            'sum(rate(promtail_read_bytes_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
-            'logs read',
-          ) +
-          { yaxes: g.yaxes('Bps') },
-        )
-        .addPanel(
-          g.panel('Lines') +
-          g.queryPanel(
-            'sum(rate(promtail_read_lines_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
-            'lines read',
-          ),
-        )
-      )
-      .addRow(
-        g.row('Requests')
-        .addPanel(
-          g.panel('QPS') +
-          g.qpsPanel('promtail_request_duration_seconds_count{cluster="$cluster", job="$namespace/promtail"}')
-        )
-        .addPanel(
-          g.panel('Latency') +
-          utils.latencyRecordingRulePanel('promtail_request_duration_seconds', [utils.selector.eq('job', '$namespace/promtail')], extra_selectors=[utils.selector.eq('cluster', '$cluster')])
-        )
-      ),
   },
 }
diff --git a/production/loki-mixin/recording_rules.libsonnet b/production/loki-mixin/recording_rules.libsonnet
index 2aecf437a0..9717a9a8aa 100644
--- a/production/loki-mixin/recording_rules.libsonnet
+++ b/production/loki-mixin/recording_rules.libsonnet
@@ -8,11 +8,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
         utils.histogramRules('loki_request_duration_seconds', ['job']) +
         utils.histogramRules('loki_request_duration_seconds', ['job', 'route']) +
         utils.histogramRules('loki_request_duration_seconds', ['namespace', 'job', 'route']),
-    }, {
-      name: 'promtail_rules',
-      rules:
-        utils.histogramRules('promtail_request_duration_seconds', ['job']) +
-        utils.histogramRules('promtail_request_duration_seconds', ['job', 'status_code']),
     }],
   },
 }
diff --git a/production/promtail-mixin/alerts.libsonnet b/production/promtail-mixin/alerts.libsonnet
new file mode 100644
index 0000000000..82052fa2b1
--- /dev/null
+++ b/production/promtail-mixin/alerts.libsonnet
@@ -0,0 +1,74 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'promtail_alerts',
+        rules: [
+          {
+            alert: 'PromtailRequestsErrors',
+            expr: |||
+              100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
+                /
+              sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
+                > 10
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||,
+            },
+          },
+          {
+            alert: 'PromtailRequestLatency',
+            expr: |||
+              job_status_code:promtail_request_duration_seconds:99quantile > 1
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||,
+            },
+          },
+          {
+            alert: 'PromtailFileLagging',
+            expr: |||
+              abs(promtail_file_bytes_total - promtail_read_bytes_total) > 100000
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 100kb for more than 15m.
+              |||,
+            },
+          },
+          {
+            alert: 'PromtailFileMissing',
+            expr: |||
+              count by (path,instance,job) (promtail_file_bytes_total) unless count by (path,instance,job) (promtail_read_bytes_total)
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
+              |||,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/production/promtail-mixin/dashboards.libsonnet b/production/promtail-mixin/dashboards.libsonnet
new file mode 100644
index 0000000000..3ea9a3aff3
--- /dev/null
+++ b/production/promtail-mixin/dashboards.libsonnet
@@ -0,0 +1,57 @@
+local g = import 'grafana-builder/grafana.libsonnet';
+local utils = import 'mixin-utils/utils.libsonnet';
+
+{
+  dashboards+: {
+    'promtail.json':
+      g.dashboard('Loki / Promtail')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*promtail.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*promtail.*"}', 'namespace')
+      .addRow(
+        g.row('Targets & Files')
+        .addPanel(
+          g.panel('Active Targets') +
+          g.queryPanel(
+            'sum(promtail_targets_active_total{cluster="$cluster", job="$namespace/promtail"})',
+            'Active Targets',
+          ),
+        )
+        .addPanel(
+          g.panel('Active Files') +
+          g.queryPanel(
+            'sum(promtail_files_active_total{cluster="$cluster", job="$namespace/promtail"})',
+            'Active Files',
+          ),
+        )
+      )
+      .addRow(
+        g.row('IO')
+        .addPanel(
+          g.panel('Bps') +
+          g.queryPanel(
+            'sum(rate(promtail_read_bytes_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
+            'logs read',
+          ) +
+          { yaxes: g.yaxes('Bps') },
+        )
+        .addPanel(
+          g.panel('Lines') +
+          g.queryPanel(
+            'sum(rate(promtail_read_lines_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
+            'lines read',
+          ),
+        )
+      )
+      .addRow(
+        g.row('Requests')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('promtail_request_duration_seconds_count{cluster="$cluster", job="$namespace/promtail"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          utils.latencyRecordingRulePanel('promtail_request_duration_seconds', [utils.selector.eq('job', '$namespace/promtail')], extra_selectors=[utils.selector.eq('cluster', '$cluster')])
+        )
+      ),
+  },
+}
diff --git a/production/promtail-mixin/jsonnetfile.json b/production/promtail-mixin/jsonnetfile.json
new file mode 100644
index 0000000000..d62efed215
--- /dev/null
+++ b/production/promtail-mixin/jsonnetfile.json
@@ -0,0 +1,24 @@
+{
+    "dependencies": [
+        {
+            "name": "grafana-builder",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/grafana/jsonnet-libs",
+                    "subdir": "grafana-builder"
+                }
+            },
+            "version": "master"
+        },
+        {
+            "name": "mixin-utils",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/grafana/jsonnet-libs",
+                    "subdir": "mixin-utils"
+                }
+            },
+            "version": "master"
+        }
+    ]
+}
diff --git a/production/promtail-mixin/mixin.libsonnet b/production/promtail-mixin/mixin.libsonnet
new file mode 100644
index 0000000000..b2b2f10dd2
--- /dev/null
+++ b/production/promtail-mixin/mixin.libsonnet
@@ -0,0 +1,3 @@
+(import 'dashboards.libsonnet') +
+(import 'alerts.libsonnet') +
+(import 'recording_rules.libsonnet')
diff --git a/production/promtail-mixin/recording_rules.libsonnet b/production/promtail-mixin/recording_rules.libsonnet
new file mode 100644
index 0000000000..a93c19a7ec
--- /dev/null
+++ b/production/promtail-mixin/recording_rules.libsonnet
@@ -0,0 +1,12 @@
+local utils = import 'mixin-utils/utils.libsonnet';
+
+{
+  prometheusRules+:: {
+    groups+: [{
+      name: 'promtail_rules',
+      rules:
+        utils.histogramRules('promtail_request_duration_seconds', ['job']) +
+        utils.histogramRules('promtail_request_duration_seconds', ['job', 'status_code']),
+    }],
+  },
+}