Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>pull/941/head
parent
17fee8081f
commit
bafe1707f1
@ -0,0 +1,165 @@ |
||||
{
  // Node-level alerting rules. All selectors are injected from $._config so the
  // mixin can be retargeted at different scrape jobs / filesystems.
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'node',
        rules: [
          // --- Filesystem space ---
          // predict_linear over 6h of free-space history; 24h horizon fires a
          // warning, 4h horizon fires a critical. Read-only filesystems are
          // excluded since they cannot fill up.
          {
            alert: 'NodeFilesystemSpaceFillingUp',
            expr: |||
              predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
              AND
              node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
              AND
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.',
            },
          },
          {
            alert: 'NodeFilesystemSpaceFillingUp',
            expr: |||
              predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
              AND
              node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
              AND
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.',
            },
          },
          // Absolute space thresholds: <5% warning, <3% critical.
          {
            alert: 'NodeFilesystemOutOfSpace',
            expr: |||
              node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
              AND
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
            },
          },
          {
            alert: 'NodeFilesystemOutOfSpace',
            expr: |||
              node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
              AND
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
            },
          },
          // --- Filesystem inodes (files) --- same structure as the space
          // alerts, over node_filesystem_files_free / node_filesystem_files.
          {
            alert: 'NodeFilesystemFilesFillingUp',
            expr: |||
              predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
              AND
              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
              AND
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.',
            },
          },
          {
            alert: 'NodeFilesystemFilesFillingUp',
            expr: |||
              predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
              AND
              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
              AND
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              // Fixed: was 'warning', which broke the 24h=warning/4h=critical
              // pattern established by the NodeFilesystemSpaceFillingUp pair.
              severity: 'critical',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.',
            },
          },
          {
            alert: 'NodeFilesystemOutOfFiles',
            expr: |||
              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
              AND
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
            },
          },
          {
            // Fixed: this rule queries inode metrics but was named
            // 'NodeFilesystemOutOfSpace' with a "space left" message — a
            // copy/paste error. It is the critical (<3%) companion of the
            // NodeFilesystemOutOfFiles warning above.
            alert: 'NodeFilesystemOutOfFiles',
            expr: |||
              node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
              AND
              node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
            },
          },
          // --- Network errors ---
          // Fixed: both expressions applied `% $._config` but omitted the
          // node-exporter selector every other rule uses; without it the alert
          // matches network metrics from any job.
          {
            alert: 'NodeNetworkReceiveErrs',
            expr: |||
              increase(node_network_receive_errs{%(nodeExporterSelector)s}[2m]) > 10
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).',
            },
          },
          {
            alert: 'NodeNetworkTransmitErrs',
            expr: |||
              increase(node_network_transmit_errs{%(nodeExporterSelector)s}[2m]) > 10
            ||| % $._config,
            'for': '1h',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).',
            },
          },
        ],
      },
    ],
  },
}
||||
@ -0,0 +1,11 @@ |
||||
{
  // Mixin-wide configuration. Every query in the alerts/dashboards/rules
  // files is rendered with `% $._config`, so overriding these fields
  // retargets the whole mixin.
  _config+:: {
    // Label matcher identifying node-exporter series; inserted between {}
    // in Prometheus queries.
    nodeExporterSelector: 'job="node-exporter"',

    // Filesystem matchers, factored out because they repeat in every
    // filesystem alert; override to customize which mounts are watched.
    fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"',

    // Path prefix prepended to dashboard drill-down links.
    grafana_prefix: '',
  },
}
||||
@ -0,0 +1,2 @@ |
||||
// Aggregates the individual dashboard definitions into one object.
(import 'node.libsonnet') +
(import 'use.libsonnet')
||||
@ -0,0 +1,176 @@ |
||||
local grafana = import 'grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet';
local gauge = promgrafonnet.gauge;

{
  // Per-node overview dashboard: CPU, load, memory, disk and network panels,
  // parameterized by a $instance template variable.
  grafanaDashboards+:: {
    'nodes.json':
      // Per-core idle CPU as a percentage (100 = fully idle).
      local idleCPU =
        graphPanel.new(
          'Idle CPU',
          datasource='$datasource',
          span=6,
          format='percent',
          max=100,
          min=0,
        )
        .addTarget(prometheus.target(
          |||
            100 - (avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[5m])) * 100)
          ||| % $._config,
          legendFormat='{{cpu}}',
          intervalFactor=10,
        ));

      // 1/5/15-minute load averages, scaled to read on a percent axis.
      local systemLoad =
        graphPanel.new(
          'System load',
          datasource='$datasource',
          span=6,
          format='percent',
        )
        .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 1m'))
        .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 5m'))
        .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 15m'));

      // Memory broken down into used / buffers / cached / free, in bytes.
      local memoryGraph =
        graphPanel.new(
          'Memory Usage',
          datasource='$datasource',
          span=9,
          format='bytes',
        )
        .addTarget(prometheus.target(
          |||
            node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
          ||| % $._config, legendFormat='memory used'
        ))
        .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
        .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
        .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));

      // Single-stat gauge: percent of memory in use (lower is better).
      local memoryGauge = gauge.new(
        'Memory Usage',
        |||
          (
            node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
            - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
          ) * 100
          /
          node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
        ||| % $._config,
      ).withLowerBeingBetter();

      // Disk throughput (bytes, left axis) and io-time (ms, right axis).
      local diskIO =
        graphPanel.new(
          'Disk I/O',
          datasource='$datasource',
          span=9,
        )
        .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_read{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='read'))
        .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_written{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='written'))
        .addTarget(prometheus.target('sum by (instance) (rate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='io time')) +
        {
          seriesOverrides: [
            { alias: 'read', yaxis: 1 },
            { alias: 'io time', yaxis: 2 },
          ],
          yaxes: [
            self.yaxe(format='bytes'),
            self.yaxe(format='ms'),
          ],
        };

      // Percent of disk space used across all non-rootfs filesystems.
      local diskSpaceUsage = gauge.new(
        'Disk Space Usage',
        |||
          (
            sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
            - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
          ) * 100
          /
          sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
        ||| % $._config,
      ).withLowerBeingBetter();

      // Network throughput per interface, loopback excluded.
      local networkReceived =
        graphPanel.new(
          'Network Received',
          datasource='$datasource',
          span=6,
          format='bytes',
        )
        .addTarget(prometheus.target('rate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}'));

      local networkTransmitted =
        graphPanel.new(
          'Network Transmitted',
          datasource='$datasource',
          span=6,
          format='bytes',
        )
        .addTarget(prometheus.target('rate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}'));

      dashboard.new('Nodes', time_from='now-1h')
      .addTemplate(
        {
          current: {
            text: 'Prometheus',
            value: 'Prometheus',
          },
          hide: 0,
          label: null,
          name: 'datasource',
          options: [],
          query: 'prometheus',
          refresh: 1,
          regex: '',
          type: 'datasource',
        },
      )
      .addTemplate(
        template.new(
          'instance',
          '$datasource',
          'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config,
          refresh='time',
        )
      )
      .addRow(
        row.new()
        .addPanel(idleCPU)
        .addPanel(systemLoad)
      )
      .addRow(
        row.new()
        .addPanel(memoryGraph)
        .addPanel(memoryGauge)
      )
      .addRow(
        row.new()
        .addPanel(diskIO)
        .addPanel(diskSpaceUsage)
      )
      .addRow(
        row.new()
        .addPanel(networkReceived)
        .addPanel(networkTransmitted)
      ),
  },
}
||||
@ -0,0 +1,151 @@ |
||||
local g = import 'grafana-builder/grafana.libsonnet';

{
  // USE-method dashboards: a cluster-wide view (each node's contribution
  // normalized by node count so stacked series sum to cluster totals) and a
  // per-node drill-down selected via the $instance template.
  grafanaDashboards+:: {
    'node-cluster-rsrc-use.json':
      // Clicking a series jumps to the per-node dashboard.
      local legendLink = '%s/dashboard/file/k8s-node-rsrc-use.json' % $._config.grafana_prefix;

      g.dashboard('USE Method / Cluster')
      .addRow(
        g.row('CPU')
        .addPanel(
          g.panel('CPU Utilisation') +
          g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
        .addPanel(
          g.panel('CPU Saturation (Load1)') +
          g.queryPanel(|||
            instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
      )
      .addRow(
        g.row('Memory')
        .addPanel(
          g.panel('Memory Utilisation') +
          g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
        .addPanel(
          g.panel('Memory Saturation (Swap I/O)') +
          g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes('Bps') },
        )
      )
      .addRow(
        g.row('Disk')
        .addPanel(
          g.panel('Disk IO Utilisation') +
          // Full utilisation would be all disks on each node spending an
          // average of 1 sec per second doing I/O; normalize by node count
          // for stacked charts.
          g.queryPanel(|||
            instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
        .addPanel(
          g.panel('Disk IO Saturation') +
          g.queryPanel(|||
            instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        )
      )
      .addRow(
        g.row('Network')
        .addPanel(
          g.panel('Net Utilisation (Transmitted)') +
          g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes('Bps') },
        )
        .addPanel(
          g.panel('Net Saturation (Dropped)') +
          g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes('Bps') },
        )
      )
      .addRow(
        g.row('Storage')
        .addPanel(
          g.panel('Disk Capacity') +
          // NOTE(review): filesystem matchers are hard-coded to ext[24] here
          // rather than using $._config.fsSelectors — confirm this is intended.
          g.queryPanel('sum(max(node_filesystem_size{fstype=~"ext[24]"} - node_filesystem_free{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
        ),
      ),

    'k8s-node-rsrc-use.json':
      g.dashboard('K8s / USE Method / Node')
      .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
      .addRow(
        g.row('CPU')
        .addPanel(
          g.panel('CPU Utilisation') +
          g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') +
          { yaxes: g.yaxes('percentunit') },
        )
        .addPanel(
          g.panel('CPU Saturation (Load1)') +
          g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') +
          { yaxes: g.yaxes('percentunit') },
        )
      )
      .addRow(
        g.row('Memory')
        .addPanel(
          g.panel('Memory Utilisation') +
          g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') +
          { yaxes: g.yaxes('percentunit') },
        )
        .addPanel(
          g.panel('Memory Saturation (Swap I/O)') +
          g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') +
          { yaxes: g.yaxes('Bps') },
        )
      )
      .addRow(
        g.row('Disk')
        .addPanel(
          g.panel('Disk IO Utilisation') +
          g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') +
          { yaxes: g.yaxes('percentunit') },
        )
        .addPanel(
          g.panel('Disk IO Saturation') +
          g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') +
          { yaxes: g.yaxes('percentunit') },
        )
      )
      .addRow(
        g.row('Net')
        .addPanel(
          g.panel('Net Utilisation (Transmitted)') +
          g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
          { yaxes: g.yaxes('Bps') },
        )
        .addPanel(
          g.panel('Net Saturation (Dropped)') +
          g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') +
          { yaxes: g.yaxes('Bps') },
        )
      )
      .addRow(
        g.row('Disk')
        .addPanel(
          g.panel('Disk Utilisation') +
          g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size{fstype=~"ext[24]"}))', 'Disk') +
          { yaxes: g.yaxes('percentunit') },
        ),
      ),
  },
}
||||
@ -0,0 +1,24 @@ |
||||
{
  "dependencies": [
    {
      "name": "grafonnet",
      "source": {
        "git": {
          "remote": "https://github.com/grafana/grafonnet-lib",
          "subdir": "grafonnet"
        }
      },
      "version": "master"
    },
    {
      "name": "grafana-builder",
      "source": {
        "git": {
          "remote": "https://github.com/kausalco/public",
          "subdir": "grafana-builder"
        }
      },
      "version": "master"
    }
  ]
}
||||
@ -0,0 +1,60 @@ |
||||
local grafana = import 'grafonnet/grafana.libsonnet';
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;

{
  // Builds a 0-100% singlestat gauge for the given Prometheus query.
  // Defaults to green-below-thresholds colouring; chain the hidden
  // with*() modifiers to customize the panel.
  new(title, query)::
    singlestat.new(
      title,
      datasource='prometheus',
      span=3,
      format='percent',
      valueName='current',
      colors=[
        'rgba(245, 54, 54, 0.9)',
        'rgba(237, 129, 40, 0.89)',
        'rgba(50, 172, 45, 0.97)',
      ],
      thresholds='50, 80',
      valueMaps=[
        {
          op: '=',
          text: 'N/A',
          value: 'null',
        },
      ],
    )
    .addTarget(
      prometheus.target(
        query
      )
    ) + {
      // Render as a gauge widget bounded to 0-100.
      gauge: {
        maxValue: 100,
        minValue: 0,
        show: true,
        thresholdLabels: false,
        thresholdMarkers: true,
      },
      // Replace the text shown when the query returns no data.
      withTextNullValue(text):: self {
        valueMaps: [
          {
            op: '=',
            text: text,
            value: 'null',
          },
        ],
      },
      // Override the panel width.
      withSpanSize(size):: self {
        span: size,
      },
      // Flip the colour ramp for metrics where high values are bad
      // (e.g. utilisation), and tighten the thresholds.
      withLowerBeingBetter():: self {
        colors: [
          'rgba(50, 172, 45, 0.97)',
          'rgba(237, 129, 40, 0.89)',
          'rgba(245, 54, 54, 0.9)',
        ],
        thresholds: '80, 90',
      },
    },
}
||||
@ -0,0 +1,48 @@ |
||||
local grafana = import 'grafonnet/grafana.libsonnet';
local singlestat = grafana.singlestat;
local prometheus = grafana.prometheus;

{
  // Builds a plain numeric singlestat panel for the given Prometheus query.
  // Null results display as '0' by default; chain the hidden with*()
  // modifiers to customize the panel.
  new(title, query)::
    singlestat.new(
      title,
      datasource='prometheus',
      span=3,
      valueName='current',
      valueMaps=[
        {
          op: '=',
          text: '0',
          value: 'null',
        },
      ],
    )
    .addTarget(
      prometheus.target(
        query
      )
    ) + {
      // Replace the text shown when the query returns no data.
      withTextNullValue(text):: self {
        valueMaps: [
          {
            op: '=',
            text: text,
            value: 'null',
          },
        ],
      },
      // Override the panel width.
      withSpanSize(size):: self {
        span: size,
      },
      // Append a unit/suffix after the displayed value.
      withPostfix(postfix):: self {
        postfix: postfix,
      },
      // Show a small trend sparkline under the value.
      withSparkline():: self {
        sparkline: {
          show: true,
          lineColor: 'rgb(31, 120, 193)',
          fillColor: 'rgba(31, 118, 189, 0.18)',
        },
      },
    },
}
||||
@ -0,0 +1,5 @@ |
||||
// Entry point for the promgrafonnet helper library: exposes the panel
// builders as hidden fields so they serialize out of dashboard JSON.
{
  numbersinglestat:: import 'numbersinglestat.libsonnet',
  gauge:: import 'gauge.libsonnet',
  percentlinegraph:: import 'percentlinegraph.libsonnet',
}
||||
@ -0,0 +1,4 @@ |
||||
// Top-level mixin: merges configuration with the alert, dashboard and
// recording-rule definitions into a single consumable object.
(import 'config.libsonnet') +
(import 'alerts/alerts.libsonnet') +
(import 'dashboards/dashboards.libsonnet') +
(import 'rules/rules.libsonnet')
||||
@ -0,0 +1,121 @@ |
||||
{
  // USE-method recording rules consumed by the cluster/node dashboards.
  prometheusRules+:: {
    groups+: [
      {
        name: 'node.rules',
        rules: [
          {
            // Number of CPUs per node.
            record: 'instance:node_num_cpu:sum',
            expr: |||
              count by (instance) (
                sum by (instance, cpu) (
                  node_cpu{%(nodeExporterSelector)s}
                )
              )
            ||| % $._config,
          },
          {
            // CPU utilisation: fraction of CPU time that is not idle.
            record: 'instance:node_cpu_utilisation:avg1m',
            expr: |||
              1 - avg by (instance) (
                rate(node_cpu{%(nodeExporterSelector)s,mode="idle"}[1m])
              )
            ||| % $._config,
          },
          {
            // CPU saturation: 1m load average divided by CPU count.
            // Can exceed 100%; >100% is bad.
            record: 'instance:node_cpu_saturation_load1:',
            expr: |||
              sum by (instance) (
                node_load1{%(nodeExporterSelector)s}
              )
              /
              instance:node_num_cpu:sum
            ||| % $._config,
          },
          {
            // Available memory per node (free + cached + buffers).
            record: 'instance:node_memory_bytes_available:sum',
            expr: |||
              sum by (instance) (
                (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s})
              )
            ||| % $._config,
          },
          {
            // Total memory per node.
            record: 'instance:node_memory_bytes_total:sum',
            expr: |||
              sum by (instance) (
                node_memory_MemTotal{%(nodeExporterSelector)s}
              )
            ||| % $._config,
          },
          {
            // Per-node memory utilisation, normalized by total cluster memory
            // (used for stacked cluster charts). Built entirely from the
            // recorded series above, so no config selectors are interpolated.
            record: 'instance:node_memory_utilisation:ratio',
            expr: |||
              (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum)
              /
              scalar(sum(instance:node_memory_bytes_total:sum))
            |||,
          },
          {
            // Per-node memory utilisation as a plain fraction.
            record: 'instance:node_memory_utilisation:',
            expr: |||
              1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum)
            ||| % $._config,
          },
          {
            // Memory saturation proxy: swap page-in/out rate in bytes/sec
            // (vmstat counters are in KiB, hence the 1e3 factor).
            record: 'instance:node_memory_swap_io_bytes:sum_rate',
            expr: |||
              1e3 * sum by (instance) (
                (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
                + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]))
              )
            ||| % $._config,
          },
          {
            // Disk utilisation (ms spent on I/O; by rate() it's bound by 1s).
            record: 'instance:node_disk_utilisation:avg_irate',
            expr: |||
              avg by (instance) (
                irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3
              )
            ||| % $._config,
          },
          {
            // Disk saturation (weighted ms spent; bound by 1s via rate()).
            record: 'instance:node_disk_saturation:avg_irate',
            expr: |||
              avg by (instance) (
                irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3
              )
            ||| % $._config,
          },
          {
            // Network utilisation: rx+tx bytes/sec on eth0.
            record: 'instance:node_net_utilisation:sum_irate',
            expr: |||
              sum by (instance) (
                (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) +
                irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]))
              )
            ||| % $._config,
          },
          {
            // Network saturation proxy: rx+tx packet drops/sec on eth0.
            record: 'instance:node_net_saturation:sum_irate',
            expr: |||
              sum by (instance) (
                (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) +
                irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m]))
              )
            ||| % $._config,
          },
        ],
      },
    ],
  },
}
||||
Loading…
Reference in new issue