diff --git a/production/loki-mixin/dashboards.libsonnet b/production/loki-mixin/dashboards.libsonnet
index 16b54e6fa5..9ea6c2ca82 100644
--- a/production/loki-mixin/dashboards.libsonnet
+++ b/production/loki-mixin/dashboards.libsonnet
@@ -8,4 +8,5 @@
 (import 'dashboards/loki-writes-resources.libsonnet') +
 (import 'dashboards/loki-reads-resources.libsonnet') +
 (import 'dashboards/loki-deletion.libsonnet') +
+(import 'dashboards/loki-canary-dashboard.libsonnet') +
 (import 'dashboards/recording-rules.libsonnet')
diff --git a/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet b/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet
new file mode 100644
index 0000000000..6539a34d77
--- /dev/null
+++ b/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet
@@ -0,0 +1,158 @@
+// Loki Canary dashboard, built on the vendored mimir-mixin dashboard utils.
+// Disabled by default; set `_config.canary.enabled: true` to render 'loki-canary.json'.
+local vendor_config = import 'github.com/grafana/mimir/operations/mimir-mixin/config.libsonnet';
+local vendor_utils = import 'github.com/grafana/mimir/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet';
+local g = import 'grafana-builder/grafana.libsonnet';
+local grafana = import 'grafonnet/grafana.libsonnet';
+
+{
+  _config+:: {
+    canary+: {
+      enabled: false,
+    },
+  },
+  grafanaDashboards+: if !$._config.canary.enabled then {} else {
+    // mimir-mixin dashboard utils layered with this mixin's config and Loki-specific overrides.
+    local dashboard = (
+      vendor_utils {
+        _config:: vendor_config._config + $._config {
+          product: 'Loki',
+          dashboard_prefix: 'Loki / ',
+          tags: ['loki'],
+        },
+      }
+    ),
+    'loki-canary.json':
+      // The dashboard() function automatically adds the "Loki / " prefix to the dashboard title.
+      // This logic is inherited from mimir-mixin.
+      dashboard.dashboard('Canary')
+      // We can't make use of simplified template selectors from the loki dashboard utils until we port the cortex dashboard utils panel/grid functionality.
+      .addTemplate('cluster', 'loki_build_info', 'cluster')
+      .addTemplate('namespace', 'loki_build_info{cluster=~"$cluster"}', 'namespace')
+      + {
+        // This dashboard uses the new grid system in order to place panels (using gridPos).
+        // Because of this we can't use the mixin's addRow() and addPanel().
+        schemaVersion: 27,
+        rows: null,
+        // Ugly hack: the tag/link code below is copy-pasted
+        // from the loki-mixin.
+        tags: ['loki'],
+        links: [
+          {
+            asDropdown: true,
+            icon: 'external link',
+            includeVars: true,
+            keepTime: true,
+            tags: $._config.tags,
+            targetBlank: false,
+            title: 'Loki Dashboards',
+            type: 'dashboards',
+          },
+        ],
+        panels: [
+          // grid row 1
+          // NOTE(review): count() yields the number of matching series, not a sum of entries; confirm the title matches the intent.
+          dashboard.panel('Canary Entries Total') +
+          dashboard.newStatPanel('sum(count(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}))', unit='short') +
+          { gridPos: { h: 4, w: 3, x: 0, y: 0 } },
+
+          dashboard.panel('Canary Logs Total') +
+          dashboard.newStatPanel('sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+          { gridPos: { h: 4, w: 3, x: 3, y: 0 } },
+
+          dashboard.panel('Missing') +
+          dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+          { gridPos: { h: 4, w: 3, x: 6, y: 0 } },
+
+          dashboard.panel('Spotcheck Missing') +
+          dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+          { gridPos: { h: 4, w: 3, x: 9, y: 0 } },
+
+          // grid row 2
+          dashboard.panel('Spotcheck Total') +
+          dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+          { gridPos: { h: 4, w: 3, x: 0, y: 4 } },
+
+          dashboard.panel('Metric Test Error %') +
+          dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))) * 100') +
+          { gridPos: { h: 4, w: 3, x: 3, y: 4 } },
+
+          dashboard.panel('Missing %') +
+          dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
+          { gridPos: { h: 4, w: 3, x: 6, y: 4 } },
+
+          dashboard.panel('Spotcheck Missing %') +
+          dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') +
+          { gridPos: { h: 4, w: 3, x: 9, y: 4 } },
+
+          // grid row 3
+          dashboard.panel('Metric Test Expected') +
+          dashboard.newStatPanel('sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') +
+          { gridPos: { h: 4, w: 3, x: 0, y: 8 } },
+
+          dashboard.panel('Metric Test Actual') +
+          dashboard.newStatPanel('sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') +
+          { gridPos: { h: 4, w: 3, x: 3, y: 8 } },
+
+          dashboard.panel('Websocket Missing') +
+          dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+          { gridPos: { h: 4, w: 3, x: 6, y: 8 } },
+
+          dashboard.panel('Websocket Missing %') +
+          dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
+          { gridPos: { h: 4, w: 3, x: 9, y: 8 } },
+          // end of grid
+
+          dashboard.panel('Log Write to read Latency Percentiles') +
+          dashboard.queryPanel([
+            'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
+            'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
+          ], ['p95', 'p50']) +
+          { gridPos: { h: 6, w: 12, x: 12, y: 0 } },
+
+          // Sits directly below the percentiles panel (y: 0, h: 6) so it cannot overlap 'Metric Test Query' at y: 14.
+          grafana.heatmapPanel.new(
+            'Log Write to Read Latency',
+            datasource='$datasource',
+            tooltip_showHistogram=true,
+            color_colorScheme='interpolateReds',
+            legend_show=false,
+          ).addTargets(
+            [
+              grafana.prometheus.target(
+                'sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)',
+                legendFormat='{{le}}',
+                format='heatmap',
+              ),
+            ],
+          ) +
+          { gridPos: { h: 6, w: 12, x: 12, y: 6 } },
+
+          // Legend labels are positional and must line up with the quantiles queried (0.99, then 0.50).
+          dashboard.panel('Spot Check Query') +
+          dashboard.queryPanel([
+            'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
+            'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
+          ], ['p99', 'p50']) +
+          { gridPos: { h: 6, w: 12, x: 0, y: 14 } },
+
+          // NOTE(review): fixed 15m rate window, unlike the $__rate_interval used by the other latency panels; confirm this is intended.
+          dashboard.panel('Metric Test Query') +
+          dashboard.queryPanel([
+            'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
+            'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
+          ], ['p99', 'p50']) +
+          { gridPos: { h: 6, w: 12, x: 12, y: 14 } },
+
+          dashboard.panel('Spot Check Missing %') +
+          dashboard.queryPanel('topk(20, (sum by (cluster, pod) (increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod) (increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') +
+          { gridPos: { h: 6, w: 12, x: 0, y: 20 } },
+
+          g.panel('Missing logs') +
+          g.queryPanel('topk(20,(sum by (cluster, pod)(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod)(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ cluster }} {{ pod }}') +
+          { gridPos: { h: 6, w: 12, x: 12, y: 20 } },
+
+        ],
+      },
+  },
+}
diff --git a/production/loki-mixin/jsonnetfile.json b/production/loki-mixin/jsonnetfile.json
index c8f1d4c4ac..f3ac40227c 100644
--- a/production/loki-mixin/jsonnetfile.json
+++ b/production/loki-mixin/jsonnetfile.json
@@ -28,6 +28,15 @@
       },
       "version": "master"
     },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/mimir.git",
+          "subdir": "operations/mimir-mixin"
+        }
+      },
+      "version": "main"
+    },
     {
       "source": {
         "git": {
diff --git a/production/loki-mixin/jsonnetfile.lock.json b/production/loki-mixin/jsonnetfile.lock.json
index c1a1482fc9..4dd7a006b6 100644
--- a/production/loki-mixin/jsonnetfile.lock.json
+++ b/production/loki-mixin/jsonnetfile.lock.json
@@ -31,6 +31,16 @@
       "version": "3f71e00a64810075b5d5f969cc6d0e419cbdebc4",
       "sum": "v6fuqqQp9rHZbsxN9o79QzOpUlwYZEJ84DxTCZMCYeU="
     },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/mimir.git",
+          "subdir": "operations/mimir-mixin"
+        }
+      },
+      "version": "91986521f324c84a9cf869529bd901f077ddf8bc",
+      "sum": "eBp1Oo3j0YiI5hv9YrZb0lJQxEOC17rP3pZiKM/R3Zo="
+    },
     {
       "source": {
         "git": {