mirror of https://github.com/grafana/loki
Signed-off-by: Edward Welch <edward.welch@grafana.com>pull/1749/head
parent
4fdfbe8e56
commit
f9e37e2997
@ -0,0 +1,74 @@ |
||||
// Prometheus alerting rules for promtail.
//
// The rules are appended to the hidden `prometheusAlerts` field as one group
// named 'promtail_alerts'. Every rule must hold for 15m before firing and is
// labelled severity 'critical'.
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'promtail_alerts',
        rules: [
          // Percentage of requests whose status_code matches 5xx or "failed",
          // per (namespace, job, route, instance); fires above 10%.
          {
            alert: 'PromtailRequestsErrors',
            expr: |||
              100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
              /
              sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
              > 10
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
              |||,
            },
          },
          // 99th percentile request latency above 1 (seconds, per the metric
          // name), read from a recording rule.
          {
            alert: 'PromtailRequestLatency',
            expr: |||
              job_status_code:promtail_request_duration_seconds:99quantile > 1
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              // The recording rule is generated in recording_rules.libsonnet with
              // the aggregation labels ['job', 'status_code'], so the message
              // reports status_code; a `route` label is not expected on this
              // series and would render as an empty string.
              // NOTE(review): confirm the label set against mixin-utils histogramRules.
              message: |||
                {{ $labels.job }} (status code {{ $labels.status_code }}) is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
              |||,
            },
          },
          // Absolute difference between a file's size and the bytes read from
          // it exceeds 100000 bytes.
          {
            alert: 'PromtailFileLagging',
            expr: |||
              abs(promtail_file_bytes_total - promtail_read_bytes_total) > 100000
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 100kb for more than 15m.
              |||,
            },
          },
          // (path, instance, job) combinations that report a file size series
          // but have no matching read-bytes series.
          {
            alert: 'PromtailFileMissing',
            expr: |||
              count by (path,instance,job) (promtail_file_bytes_total) unless count by (path,instance,job) (promtail_read_bytes_total)
            |||,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
              |||,
            },
          },
        ],
      },
    ],
  },
}
||||
@ -0,0 +1,57 @@ |
||||
local g = import 'grafana-builder/grafana.libsonnet'; |
||||
local utils = import 'mixin-utils/utils.libsonnet'; |
||||
|
||||
// Grafana dashboards for promtail, keyed by output file name.
//
// Built with the grafana-builder helpers bound to `g` and the mixin-utils
// helpers bound to `utils` at the top of this file. Every query is scoped by
// the `cluster` and `namespace` template variables.
{
  dashboards+: {
    'promtail.json':
      g.dashboard('Loki / Promtail')
      // Both template variables are populated from kube_pod_container_info
      // series whose image matches promtail.
      .addTemplate('cluster', 'kube_pod_container_info{image=~".*promtail.*"}', 'cluster')
      .addTemplate('namespace', 'kube_pod_container_info{image=~".*promtail.*"}', 'namespace')
      .addRow(
        g.row('Targets & Files')
        .addPanel(
          g.panel('Active Targets') +
          g.queryPanel(
            'sum(promtail_targets_active_total{cluster="$cluster", job="$namespace/promtail"})',
            'Active Targets',
          ),
        )
        .addPanel(
          g.panel('Active Files') +
          g.queryPanel(
            'sum(promtail_files_active_total{cluster="$cluster", job="$namespace/promtail"})',
            // Legend matches the files metric being plotted (it previously
            // repeated the 'Active Targets' legend from the panel above).
            'Active Files',
          ),
        )
      )
      .addRow(
        g.row('IO')
        .addPanel(
          g.panel('Bps') +
          g.queryPanel(
            'sum(rate(promtail_read_bytes_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
            'logs read',
          ) +
          // Override the y axes with the 'Bps' unit.
          { yaxes: g.yaxes('Bps') },
        )
        .addPanel(
          g.panel('Lines') +
          g.queryPanel(
            'sum(rate(promtail_read_lines_total{cluster="$cluster", job="$namespace/promtail"}[1m]))',
            'lines read',
          ),
        )
      )
      .addRow(
        g.row('Requests')
        .addPanel(
          g.panel('QPS') +
          g.qpsPanel('promtail_request_duration_seconds_count{cluster="$cluster", job="$namespace/promtail"}')
        )
        .addPanel(
          g.panel('Latency') +
          // Presumably reads the series produced by the recording rules in
          // recording_rules.libsonnet — confirm against mixin-utils.
          utils.latencyRecordingRulePanel('promtail_request_duration_seconds', [utils.selector.eq('job', '$namespace/promtail')], extra_selectors=[utils.selector.eq('cluster', '$cluster')])
        )
      ),
  },
}
||||
@ -0,0 +1,24 @@ |
||||
{
    "dependencies": [
        {
            "name": "grafana-builder",
            "source": {
                "git": {
                    "remote": "https://github.com/grafana/jsonnet-libs",
                    "subdir": "grafana-builder"
                }
            },
            "version": "master"
        },
        {
            "name": "mixin-utils",
            "source": {
                "git": {
                    "remote": "https://github.com/grafana/jsonnet-libs",
                    "subdir": "mixin-utils"
                }
            },
            "version": "master"
        }
    ]
}
||||
@ -0,0 +1,3 @@ |
||||
// Entry point of the promtail mixin: the sum of the dashboards, alerts and
// recording-rules objects, combined in that order.
local dashboards = import 'dashboards.libsonnet';
local alerts = import 'alerts.libsonnet';
local recordingRules = import 'recording_rules.libsonnet';

dashboards + alerts + recordingRules
||||
@ -0,0 +1,12 @@ |
||||
local utils = import 'mixin-utils/utils.libsonnet'; |
||||
|
||||
// Prometheus recording rules for promtail's request-duration histogram.
//
// Two rule sets are generated through utils.histogramRules for the same
// metric: one with the aggregation labels ['job'] and one with
// ['job', 'status_code']. They are concatenated in that order into a single
// group named 'promtail_rules' on the hidden `prometheusRules` field.
{
  local metric = 'promtail_request_duration_seconds',
  local perJob = utils.histogramRules(metric, ['job']),
  local perJobAndStatus = utils.histogramRules(metric, ['job', 'status_code']),

  prometheusRules+:: {
    groups+: [
      {
        name: 'promtail_rules',
        rules: perJob + perJobAndStatus,
      },
    ],
  },
}
||||
Loading…
Reference in new issue