Break out the promtail mixin to be separate.

Signed-off-by: Edward Welch <edward.welch@grafana.com>
pull/1749/head
Authored by Edward Welch, committed by Ed Welch
parent 4fdfbe8e56
commit f9e37e2997
  1. production/loki-mixin/alerts.libsonnet (68)
  2. production/loki-mixin/dashboards.libsonnet (51)
  3. production/loki-mixin/recording_rules.libsonnet (5)
  4. production/promtail-mixin/alerts.libsonnet (74)
  5. production/promtail-mixin/dashboards.libsonnet (57)
  6. production/promtail-mixin/jsonnetfile.json (24)
  7. production/promtail-mixin/mixin.libsonnet (3)
  8. production/promtail-mixin/recording_rules.libsonnet (12)

production/loki-mixin/alerts.libsonnet
@@ -39,74 +39,6 @@
          },
        ],
      },
      {
        name: 'promtail_alerts',
        rules: [
          {
            alert: 'PromtailRequestsErrors',
            expr: |||
              100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
                /
              sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
                > 10
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
              |||,
            },
          },
          {
            alert: 'PromtailRequestLatency',
            expr: |||
              job_status_code:promtail_request_duration_seconds:99quantile > 1
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
              |||,
            },
          },
          {
            alert: 'PromtailFileLagging',
            expr: |||
              abs(promtail_file_bytes_total - promtail_read_bytes_total) > 100000
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 100kb for more than 15m.
              |||,
            },
          },
          {
            alert: 'PromtailFileMissing',
            expr: |||
              count by (path,instance,job) (promtail_file_bytes_total) unless count by (path,instance,job) (promtail_read_bytes_total)
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
              |||,
            },
          },
        ],
      },
    ],
  },
}

production/loki-mixin/dashboards.libsonnet
@@ -141,56 +141,5 @@ local utils = import 'mixin-utils/utils.libsonnet';
          g.qpsPanel('loki_ingester_chunk_age_seconds_count{cluster="$cluster", job="$namespace/ingester"}'),
        ),
      ),
    'promtail.json':
      g.dashboard('Loki / Promtail')
      .addTemplate('cluster', 'kube_pod_container_info{image=~".*promtail.*"}', 'cluster')
      .addTemplate('namespace', 'kube_pod_container_info{image=~".*promtail.*"}', 'namespace')
      .addRow(
        g.row('Targets & Files')
        .addPanel(
          g.panel('Active Targets') +
          g.queryPanel(
            'sum(promtail_targets_active_total{cluster="$cluster", job="$namespace/promtail"})',
            'Active Targets',
          ),
        )
        .addPanel(
          g.panel('Active Files') +
          g.queryPanel(
            'sum(promtail_files_active_total{cluster="$cluster", job="$namespace/promtail"})',
            'Active Targets',
          ),
        )
      )
      .addRow(
        g.row('IO')
        .addPanel(
          g.panel('Bps') +
          g.queryPanel(
            'sum(rate(promtail_read_bytes_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
            'logs read',
          ) +
          { yaxes: g.yaxes('Bps') },
        )
        .addPanel(
          g.panel('Lines') +
          g.queryPanel(
            'sum(rate(promtail_read_lines_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
            'lines read',
          ),
        )
      )
      .addRow(
        g.row('Requests')
        .addPanel(
          g.panel('QPS') +
          g.qpsPanel('promtail_request_duration_seconds_count{cluster="$cluster", job="$namespace/promtail"}')
        )
        .addPanel(
          g.panel('Latency') +
          utils.latencyRecordingRulePanel('promtail_request_duration_seconds', [utils.selector.eq('job', '$namespace/promtail')], extra_selectors=[utils.selector.eq('cluster', '$cluster')])
        )
      ),
  },
}

production/loki-mixin/recording_rules.libsonnet
@@ -8,11 +8,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
        utils.histogramRules('loki_request_duration_seconds', ['job']) +
        utils.histogramRules('loki_request_duration_seconds', ['job', 'route']) +
        utils.histogramRules('loki_request_duration_seconds', ['namespace', 'job', 'route']),
    }, {
      name: 'promtail_rules',
      rules:
        utils.histogramRules('promtail_request_duration_seconds', ['job']) +
        utils.histogramRules('promtail_request_duration_seconds', ['job', 'status_code']),
    }],
  },
}

production/promtail-mixin/alerts.libsonnet
@@ -0,0 +1,74 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'promtail_alerts',
        rules: [
          {
            alert: 'PromtailRequestsErrors',
            expr: |||
              100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
                /
              sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
                > 10
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
              |||,
            },
          },
          {
            alert: 'PromtailRequestLatency',
            expr: |||
              job_status_code:promtail_request_duration_seconds:99quantile > 1
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
              |||,
            },
          },
          {
            alert: 'PromtailFileLagging',
            expr: |||
              abs(promtail_file_bytes_total - promtail_read_bytes_total) > 100000
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 100kb for more than 15m.
              |||,
            },
          },
          {
            alert: 'PromtailFileMissing',
            expr: |||
              count by (path,instance,job) (promtail_file_bytes_total) unless count by (path,instance,job) (promtail_read_bytes_total)
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
              |||,
            },
          },
        ],
      },
    ],
  },
}
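
Because the alerts are exposed through a mergeable prometheusAlerts+:: field, a downstream configuration can tune them without editing this file. The following is only a sketch (the import path, and the choice to demote PromtailFileLagging to a warning, are assumptions for illustration, not part of this commit):

// Hypothetical downstream override: demote PromtailFileLagging to a warning.
// Assumes promtail-mixin/ is importable via the jsonnet search path (-J).
(import 'promtail-mixin/mixin.libsonnet') + {
  prometheusAlerts+:: {
    groups: [
      group {
        rules: [
          if rule.alert == 'PromtailFileLagging'
          then rule { labels+: { severity: 'warning' } }
          else rule
          for rule in group.rules
        ],
      }
      for group in super.groups
    ],
  },
}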

production/promtail-mixin/dashboards.libsonnet
@@ -0,0 +1,57 @@
local g = import 'grafana-builder/grafana.libsonnet';
local utils = import 'mixin-utils/utils.libsonnet';
{
  dashboards+: {
    'promtail.json':
      g.dashboard('Loki / Promtail')
      .addTemplate('cluster', 'kube_pod_container_info{image=~".*promtail.*"}', 'cluster')
      .addTemplate('namespace', 'kube_pod_container_info{image=~".*promtail.*"}', 'namespace')
      .addRow(
        g.row('Targets & Files')
        .addPanel(
          g.panel('Active Targets') +
          g.queryPanel(
            'sum(promtail_targets_active_total{cluster="$cluster", job="$namespace/promtail"})',
            'Active Targets',
          ),
        )
        .addPanel(
          g.panel('Active Files') +
          g.queryPanel(
            'sum(promtail_files_active_total{cluster="$cluster", job="$namespace/promtail"})',
            'Active Targets',
          ),
        )
      )
      .addRow(
        g.row('IO')
        .addPanel(
          g.panel('Bps') +
          g.queryPanel(
            'sum(rate(promtail_read_bytes_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
            'logs read',
          ) +
          { yaxes: g.yaxes('Bps') },
        )
        .addPanel(
          g.panel('Lines') +
          g.queryPanel(
            'sum(rate(promtail_read_lines_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
            'lines read',
          ),
        )
      )
      .addRow(
        g.row('Requests')
        .addPanel(
          g.panel('QPS') +
          g.qpsPanel('promtail_request_duration_seconds_count{cluster="$cluster", job="$namespace/promtail"}')
        )
        .addPanel(
          g.panel('Latency') +
          utils.latencyRecordingRulePanel('promtail_request_duration_seconds', [utils.selector.eq('job', '$namespace/promtail')], extra_selectors=[utils.selector.eq('cluster', '$cluster')])
        )
      ),
  },
}
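
The object returned by g.dashboard keeps its builder methods, so downstream configs can extend dashboards+: instead of forking this file. A rough sketch of that pattern (the extra row and its per-instance breakdown of promtail_read_lines_total are illustrative assumptions, not part of this commit):

// Hypothetical downstream extension: append one more row to the generated dashboard.
local g = import 'grafana-builder/grafana.libsonnet';

(import 'promtail-mixin/mixin.libsonnet') + {
  dashboards+: {
    'promtail.json': super['promtail.json']
                     .addRow(
                       g.row('Per instance')
                       .addPanel(
                         g.panel('Lines read by instance') +
                         g.queryPanel(
                           'sum by (instance) (rate(promtail_read_lines_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
                           '{{instance}}',
                         ),
                       )
                     ),
  },
}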

production/promtail-mixin/jsonnetfile.json
@@ -0,0 +1,24 @@
{
  "dependencies": [
    {
      "name": "grafana-builder",
      "source": {
        "git": {
          "remote": "https://github.com/grafana/jsonnet-libs",
          "subdir": "grafana-builder"
        }
      },
      "version": "master"
    },
    {
      "name": "mixin-utils",
      "source": {
        "git": {
          "remote": "https://github.com/grafana/jsonnet-libs",
          "subdir": "mixin-utils"
        }
      },
      "version": "master"
    }
  ]
}

production/promtail-mixin/mixin.libsonnet
@@ -0,0 +1,3 @@
(import 'dashboards.libsonnet') +
(import 'alerts.libsonnet') +
(import 'recording_rules.libsonnet')
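
mixin.libsonnet only composes the three pieces; rendering them to files is left to the consumer. A minimal consumer sketch follows (the file names, search-path flags, and render file are assumptions for illustration, not part of this commit), suitable for evaluation with jsonnet -m:

// Hypothetical render file: writes alerts, recording rules and the dashboard
// when evaluated with something like `jsonnet -J production -J vendor -m . render.jsonnet`.
local promtail = import 'promtail-mixin/mixin.libsonnet';

{
  'promtail_alerts.yaml': std.manifestYamlDoc(promtail.prometheusAlerts),
  'promtail_rules.yaml': std.manifestYamlDoc(promtail.prometheusRules),
  'promtail.json': std.manifestJsonEx(promtail.dashboards['promtail.json'], '  '),
}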

production/promtail-mixin/recording_rules.libsonnet
@@ -0,0 +1,12 @@
local utils = import 'mixin-utils/utils.libsonnet';
{
  prometheusRules+:: {
    groups+: [{
      name: 'promtail_rules',
      rules:
        utils.histogramRules('promtail_request_duration_seconds', ['job']) +
        utils.histogramRules('promtail_request_duration_seconds', ['job', 'status_code']),
    }],
  },
}
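
For context, utils.histogramRules expands each (metric, label set) pair into aggregated recording rules; the 99th-percentile rule is what the PromtailRequestLatency alert above queries. The shape of one generated rule looks roughly like the sketch below (an approximation of the mixin-utils output, not a copy of it):

// Approximate shape of one rule produced by
// utils.histogramRules('promtail_request_duration_seconds', ['job', 'status_code']).
{
  record: 'job_status_code:promtail_request_duration_seconds:99quantile',
  expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code))',
}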