fix(mixin): Remove pod label from disk usage aggregation (#14180)

This PR fixes the disk read and writes dashboards to support node-exporter pod name changes when node changes often (tools like karpenter for instance, autoscaling, etc.) as this causes duplicate entries when checking metrics over 1h
pull/13362/merge
Quentin Bisson 8 months ago committed by GitHub
parent 170217f38f
commit 5d45c96ce1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 8
      production/loki-mixin-compiled-ssd/dashboards/loki-resources-overview.json
  2. 12
      production/loki-mixin-compiled/dashboards/loki-reads-resources.json
  3. 4
      production/loki-mixin-compiled/dashboards/loki-writes-resources.json
  4. 12
      production/loki-mixin/dashboards/loki-reads-resources.libsonnet
  5. 8
      production/loki-mixin/dashboards/loki-resources-overview.libsonnet
  6. 4
      production/loki-mixin/dashboards/loki-writes-resources.libsonnet

@ -642,7 +642,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -689,7 +689,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-write.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -1059,7 +1059,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -1106,7 +1106,7 @@
"span": 4,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"loki\", pod=~\"(loki.*|enterprise-logs)-backend.*\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null

@ -844,7 +844,7 @@
"span": 2,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -891,7 +891,7 @@
"span": 2,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance,device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"querier\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -1249,7 +1249,7 @@
"span": 2,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"index-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"index-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -1296,7 +1296,7 @@
"span": 2,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"index-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"index-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -1654,7 +1654,7 @@
"span": 2,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -1701,7 +1701,7 @@
"span": 2,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"bloom-gateway\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null

@ -630,7 +630,7 @@
"span": 1,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null
@ -677,7 +677,7 @@
"span": 1,
"targets": [
{
"expr": "sum by(instance, pod, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"expr": "sum by(instance, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + ignoring(pod) group_right() (label_replace(count by(instance, pod, device) (container_fs_writes_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", container=\"ingester\", device!~\".*sda.*\"}), \"device\", \"$1\", \"device\", \"/dev/(.*)\") * 0)\n",
"format": "time_series",
"legendFormat": "{{pod}} - {{device}}",
"legendLink": null

@ -70,7 +70,7 @@
.addPanel(
$.newQueryPanel('Disk Writes', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('querier')],
'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDiskContainer('querier')],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -78,7 +78,7 @@
.addPanel(
$.newQueryPanel('Disk Reads', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('querier')],
'sum by(%s,device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDiskContainer('querier')],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -102,7 +102,7 @@
.addPanel(
$.newQueryPanel('Disk Writes', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(index_gateway_pod_matcher)],
'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(index_gateway_pod_matcher)],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -110,7 +110,7 @@
.addPanel(
$.newQueryPanel('Disk Reads', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(index_gateway_pod_matcher)],
'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(index_gateway_pod_matcher)],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -133,7 +133,7 @@
.addPanel(
$.newQueryPanel('Disk Writes', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('bloom-gateway')],
'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDiskContainer('bloom-gateway')],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -141,7 +141,7 @@
.addPanel(
$.newQueryPanel('Disk Reads', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('bloom-gateway')],
'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDiskContainer('bloom-gateway')],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,

@ -58,7 +58,7 @@
.addPanel(
$.newQueryPanel('Disk Writes', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(write_pod_matcher)],
'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(write_pod_matcher)],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -66,7 +66,7 @@
.addPanel(
$.newQueryPanel('Disk Reads', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(write_pod_matcher)],
'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(write_pod_matcher)],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -92,7 +92,7 @@
.addPanel(
$.newQueryPanel('Disk Writes', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(backend_pod_matcher)],
'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(backend_pod_matcher)],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -100,7 +100,7 @@
.addPanel(
$.newQueryPanel('Disk Reads', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(backend_pod_matcher)],
'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(backend_pod_matcher)],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,

@ -61,7 +61,7 @@
.addPanel(
$.newQueryPanel('Disk Writes', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(ingester_pod_matcher)],
'sum by(%s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(ingester_pod_matcher)],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,
@ -69,7 +69,7 @@
.addPanel(
$.newQueryPanel('Disk Reads', 'Bps') +
$.queryPanel(
'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDisk(ingester_pod_matcher)],
'sum by(%s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $.filterNodeDisk(ingester_pod_matcher)],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
$.withStacking,

Loading…
Cancel
Save