From 6a62b8cf42f2813cb6d93ec76c39b6d8d75f822b Mon Sep 17 00:00:00 2001 From: Sandeep Sukhani Date: Mon, 20 Nov 2023 18:22:03 +0530 Subject: [PATCH] compaction: Separate metrics for tracking retention and compaction (#11263) **What this PR does / why we need it**: In PR #9884, we separated the retention loop from compaction to avoid blocking compaction for too long due to some intensive delete requests. Currently, we track retention and compaction using the same metrics. This PR adds separate metrics for monitoring the retention operation. I have also updated the Retention dashboard to use the new metrics. --- .../loki_micro_services_delete_test.go | 1 + pkg/compactor/compactor.go | 28 +- pkg/compactor/compactor_test.go | 40 +- pkg/compactor/metrics.go | 46 +- .../dashboards/loki-retention.json | 398 +++++++++++++++++- .../dashboards/loki-retention.json | 398 +++++++++++++++++- .../dashboards/loki-retention.libsonnet | 30 +- 7 files changed, 860 insertions(+), 81 deletions(-) diff --git a/integration/loki_micro_services_delete_test.go b/integration/loki_micro_services_delete_test.go index 5cce134d94..07195d919e 100644 --- a/integration/loki_micro_services_delete_test.go +++ b/integration/loki_micro_services_delete_test.go @@ -216,6 +216,7 @@ func TestMicroServicesDeleteRequest(t *testing.T) { validateQueryResponse := func(expectedStreams []client.StreamValues, resp *client.Response) { t.Helper() + assert.Equal(t, "success", resp.Status) assert.Equal(t, "streams", resp.Data.ResultType) require.Len(t, resp.Data.Stream, len(expectedStreams)) diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go index 64f89fc696..774536152c 100644 --- a/pkg/compactor/compactor.go +++ b/pkg/compactor/compactor.go @@ -128,6 +128,11 @@ func (cfg *Config) Validate() error { cfg.ApplyRetentionInterval = cfg.CompactionInterval } + if cfg.ApplyRetentionInterval == cfg.CompactionInterval { + // add a fixed offset to avoid running retention and compaction at the same time + 
cfg.ApplyRetentionInterval += minDuration(10*time.Minute, cfg.ApplyRetentionInterval/2) + } + if err := config.ValidatePathPrefix(cfg.DeleteRequestStoreKeyPrefix); err != nil { return fmt.Errorf("validate delete store path prefix: %w", err) } @@ -604,7 +609,7 @@ func (c *Compactor) CompactTable(ctx context.Context, tableName string, applyRet } if hasUncompactedIndex { - c.metrics.skippedCompactingLockedTables.Inc() + c.metrics.skippedCompactingLockedTables.WithLabelValues(tableName).Inc() level.Warn(util_log.Logger).Log("msg", "skipped compacting table which likely has uncompacted index since it is locked by retention", "table_name", tableName) } return nil @@ -657,14 +662,19 @@ func (c *Compactor) RunCompaction(ctx context.Context, applyRetention bool) (err if err != nil { status = statusFailure } - withRetentionLabelValue := fmt.Sprintf("%v", applyRetention) - c.metrics.compactTablesOperationTotal.WithLabelValues(status, withRetentionLabelValue).Inc() + if applyRetention { + c.metrics.applyRetentionOperationTotal.WithLabelValues(status).Inc() + } else { + c.metrics.compactTablesOperationTotal.WithLabelValues(status).Inc() + } runtime := time.Since(start) if status == statusSuccess { - c.metrics.compactTablesOperationDurationSeconds.WithLabelValues(withRetentionLabelValue).Set(runtime.Seconds()) - c.metrics.compactTablesOperationLastSuccess.WithLabelValues(withRetentionLabelValue).SetToCurrentTime() if applyRetention { + c.metrics.applyRetentionOperationDurationSeconds.Set(runtime.Seconds()) c.metrics.applyRetentionLastSuccess.SetToCurrentTime() + } else { + c.metrics.compactTablesOperationDurationSeconds.Set(runtime.Seconds()) + c.metrics.compactTablesOperationLastSuccess.SetToCurrentTime() } } @@ -874,3 +884,11 @@ func schemaPeriodForTable(cfg config.SchemaConfig, tableName string) (config.Per return schemaCfg, true } + +func minDuration(x time.Duration, y time.Duration) time.Duration { + if x < y { + return x + } + + return y +} diff --git 
a/pkg/compactor/compactor_test.go b/pkg/compactor/compactor_test.go index 6913956aaa..17df040290 100644 --- a/pkg/compactor/compactor_test.go +++ b/pkg/compactor/compactor_test.go @@ -348,7 +348,7 @@ func TestCompactor_TableLocking(t *testing.T) { lockTable string applyRetention bool - compactionShouldTimeout bool + retentionShouldTimeout bool }{ { name: "no table locked - not applying retention", @@ -362,10 +362,10 @@ func TestCompactor_TableLocking(t *testing.T) { lockTable: fmt.Sprintf("%s%d", indexTablePrefix, tableNumEnd), }, { - name: "first table locked - applying retention", - lockTable: fmt.Sprintf("%s%d", indexTablePrefix, tableNumEnd), - applyRetention: true, - compactionShouldTimeout: true, + name: "first table locked - applying retention", + lockTable: fmt.Sprintf("%s%d", indexTablePrefix, tableNumEnd), + applyRetention: true, + retentionShouldTimeout: true, }, } { t.Run(tc.name, func(t *testing.T) { @@ -389,30 +389,38 @@ func TestCompactor_TableLocking(t *testing.T) { defer cancel() err := compactor.RunCompaction(ctx, tc.applyRetention) - // compaction should not timeout after first run since we won't be locking the table - if n == 1 && tc.compactionShouldTimeout { + // retention should not timeout after first run since we won't be locking the table + if n == 1 && tc.retentionShouldTimeout { require.ErrorIs(t, err, context.DeadlineExceeded) - require.Equal(t, float64(1), testutil.ToFloat64(compactor.metrics.compactTablesOperationTotal.WithLabelValues(statusFailure, "true"))) - require.Equal(t, float64(0), testutil.ToFloat64(compactor.metrics.compactTablesOperationTotal.WithLabelValues(statusFailure, "false"))) + require.Equal(t, float64(1), testutil.ToFloat64(compactor.metrics.applyRetentionOperationTotal.WithLabelValues(statusFailure))) + require.Equal(t, float64(0), testutil.ToFloat64(compactor.metrics.compactTablesOperationTotal.WithLabelValues(statusFailure))) return } require.NoError(t, err) - if n > 1 && tc.compactionShouldTimeout { - // this 
should be the first successful run if compaction was expected to be timeout out during first run - require.Equal(t, float64(1), testutil.ToFloat64(compactor.metrics.compactTablesOperationTotal.WithLabelValues(statusSuccess, fmt.Sprintf("%v", tc.applyRetention)))) + if n > 1 && tc.applyRetention && tc.retentionShouldTimeout { + // this should be the first successful run if retention was expected to time out during first run + require.Equal(t, float64(1), testutil.ToFloat64(compactor.metrics.applyRetentionOperationTotal.WithLabelValues(statusSuccess))) } else { // else it should have succeeded during all the n runs - require.Equal(t, float64(n), testutil.ToFloat64(compactor.metrics.compactTablesOperationTotal.WithLabelValues(statusSuccess, fmt.Sprintf("%v", tc.applyRetention)))) + if tc.applyRetention { + require.Equal(t, float64(n), testutil.ToFloat64(compactor.metrics.applyRetentionOperationTotal.WithLabelValues(statusSuccess))) + } else { + require.Equal(t, float64(n), testutil.ToFloat64(compactor.metrics.compactTablesOperationTotal.WithLabelValues(statusSuccess))) + } + } + if tc.applyRetention { + require.Equal(t, float64(0), testutil.ToFloat64(compactor.metrics.compactTablesOperationTotal.WithLabelValues(statusSuccess))) + } else { + require.Equal(t, float64(0), testutil.ToFloat64(compactor.metrics.applyRetentionOperationTotal.WithLabelValues(statusSuccess))) } - require.Equal(t, float64(0), testutil.ToFloat64(compactor.metrics.compactTablesOperationTotal.WithLabelValues(statusSuccess, fmt.Sprintf("%v", !tc.applyRetention)))) // if the table was locked and compaction ran without retention then only locked table should have been skipped if tc.lockTable != "" { if tc.applyRetention { - require.Equal(t, float64(0), testutil.ToFloat64(compactor.metrics.skippedCompactingLockedTables)) + require.Equal(t, float64(0), testutil.ToFloat64(compactor.metrics.skippedCompactingLockedTables.WithLabelValues(tc.lockTable))) } else { - require.Equal(t, float64(1), 
testutil.ToFloat64(compactor.metrics.skippedCompactingLockedTables)) + require.Equal(t, float64(1), testutil.ToFloat64(compactor.metrics.skippedCompactingLockedTables.WithLabelValues(tc.lockTable))) } } diff --git a/pkg/compactor/metrics.go b/pkg/compactor/metrics.go index 7cbf404c81..28b2057896 100644 --- a/pkg/compactor/metrics.go +++ b/pkg/compactor/metrics.go @@ -8,17 +8,17 @@ import ( const ( statusFailure = "failure" statusSuccess = "success" - - lblWithRetention = "with_retention" ) type metrics struct { - compactTablesOperationTotal *prometheus.CounterVec - compactTablesOperationDurationSeconds *prometheus.GaugeVec - compactTablesOperationLastSuccess *prometheus.GaugeVec - applyRetentionLastSuccess prometheus.Gauge - compactorRunning prometheus.Gauge - skippedCompactingLockedTables prometheus.Counter + compactTablesOperationTotal *prometheus.CounterVec + compactTablesOperationDurationSeconds prometheus.Gauge + compactTablesOperationLastSuccess prometheus.Gauge + applyRetentionOperationTotal *prometheus.CounterVec + applyRetentionOperationDurationSeconds prometheus.Gauge + applyRetentionLastSuccess prometheus.Gauge + compactorRunning prometheus.Gauge + skippedCompactingLockedTables *prometheus.CounterVec } func newMetrics(r prometheus.Registerer) *metrics { @@ -26,18 +26,28 @@ func newMetrics(r prometheus.Registerer) *metrics { compactTablesOperationTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Namespace: "loki_boltdb_shipper", Name: "compact_tables_operation_total", - Help: "Total number of tables compaction done by status and with/without retention", - }, []string{"status", lblWithRetention}), - compactTablesOperationDurationSeconds: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{ + Help: "Total number of tables compaction done by status", + }, []string{"status"}), + compactTablesOperationDurationSeconds: promauto.With(r).NewGauge(prometheus.GaugeOpts{ Namespace: "loki_boltdb_shipper", Name: "compact_tables_operation_duration_seconds", - 
Help: "Time (in seconds) spent in compacting all the tables with/without retention", - }, []string{lblWithRetention}), - compactTablesOperationLastSuccess: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{ + Help: "Time (in seconds) spent in compacting all the tables", + }), + compactTablesOperationLastSuccess: promauto.With(r).NewGauge(prometheus.GaugeOpts{ Namespace: "loki_boltdb_shipper", Name: "compact_tables_operation_last_successful_run_timestamp_seconds", Help: "Unix timestamp of the last successful compaction run", - }, []string{lblWithRetention}), + }), + applyRetentionOperationTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Namespace: "loki_compactor", + Name: "apply_retention_operation_total", + Help: "Total number of attempts done to apply retention with status", + }, []string{"status"}), + applyRetentionOperationDurationSeconds: promauto.With(r).NewGauge(prometheus.GaugeOpts{ + Namespace: "loki_compactor", + Name: "apply_retention_operation_duration_seconds", + Help: "Time (in seconds) spent in applying retention", + }), applyRetentionLastSuccess: promauto.With(r).NewGauge(prometheus.GaugeOpts{ Namespace: "loki_boltdb_shipper", Name: "apply_retention_last_successful_run_timestamp_seconds", @@ -48,11 +58,11 @@ func newMetrics(r prometheus.Registerer) *metrics { Name: "compactor_running", Help: "Value will be 1 if compactor is currently running on this instance", }), - skippedCompactingLockedTables: promauto.With(r).NewCounter(prometheus.CounterOpts{ + skippedCompactingLockedTables: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Namespace: "loki_compactor", - Name: "skipped_compacting_locked_tables_total", + Name: "skipped_compacting_locked_table_total", Help: "Count of uncompacted tables being skipped due to them being locked by retention", - }), + }, []string{"table_name"}), } return &m diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-retention.json 
b/production/loki-mixin-compiled-ssd/dashboards/loki-retention.json index 73791bf2b1..95bc7b6e0f 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-retention.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-retention.json @@ -375,7 +375,7 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, "stack": false, "steppedLine": false, "targets": [ @@ -389,7 +389,7 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Last Compact and Mark Operation Success", + "title": "Last Compact Tables Operation Success", "tooltip": { "shared": true, "sort": 2, @@ -449,7 +449,7 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, "stack": false, "steppedLine": false, "targets": [ @@ -465,7 +465,7 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Compact and Mark Operations Duration", + "title": "Compact Tables Operations Duration", "tooltip": { "shared": true, "sort": 2, @@ -497,7 +497,19 @@ "show": false } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Compaction", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ { "aliasColors": { }, "bars": false, @@ -525,7 +537,83 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(loki_compactor_skipped_compacting_locked_table_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$__range]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{table_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Number of times Tables were skipped during Compaction", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, "stack": false, "steppedLine": false, "targets": [ @@ -541,7 +629,279 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Compact and Mark Operations Per Status", + "title": "Compact Tables Operations Per Status", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "custom": { }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] 
+ }, + "unit": "dateTimeFromNow" + } + }, + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": { }, + "textMode": "auto" + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "loki_compactor_apply_retention_last_successful_run_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"} * 1e3", + "format": "time_series", + "instant": true, + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Last Mark Operation Success", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "stat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + 
"stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "loki_compactor_apply_retention_operation_duration_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "duration", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Mark Operations Duration", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (status)(rate(loki_compactor_apply_retention_operation_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{success}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Mark Operations Per Status", "tooltip": { "shared": true, "sort": 2, @@ -579,7 +939,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Compact and Mark", + "title": "Retention", "titleSize": "h6" }, { @@ -593,7 
+953,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 11, "legend": { "avg": false, "current": false, @@ -669,7 +1029,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 12, "legend": { "avg": false, "current": false, @@ -745,7 +1105,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 9, + "id": 13, "legend": { "avg": false, "current": false, @@ -834,7 +1194,7 @@ "datasource": "$datasource", "fill": 1, "format": "short", - "id": 10, + "id": 14, "legend": { "avg": false, "current": false, @@ -909,7 +1269,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 11, + "id": 15, "legend": { "avg": false, "current": false, @@ -1014,7 +1374,7 @@ "datasource": "$datasource", "fill": 1, "format": "short", - "id": 12, + "id": 16, "legend": { "avg": false, "current": false, @@ -1089,7 +1449,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 13, + "id": 17, "legend": { "avg": false, "current": false, @@ -1193,7 +1553,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 14, + "id": 18, "legend": { "avg": false, "current": false, @@ -1269,7 +1629,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 15, + "id": 19, "legend": { "avg": false, "current": false, @@ -1345,7 +1705,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 16, + "id": 20, "legend": { "avg": false, "current": false, @@ -1428,7 +1788,7 @@ "panels": [ { "datasource": "$loki_datasource", - "id": 17, + "id": 21, "span": 12, "targets": [ { diff --git a/production/loki-mixin-compiled/dashboards/loki-retention.json b/production/loki-mixin-compiled/dashboards/loki-retention.json index fc8f9e5619..a266d15734 100644 --- a/production/loki-mixin-compiled/dashboards/loki-retention.json +++ b/production/loki-mixin-compiled/dashboards/loki-retention.json @@ -375,7 +375,7 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, 
"stack": false, "steppedLine": false, "targets": [ @@ -389,7 +389,7 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Last Compact and Mark Operation Success", + "title": "Last Compact Tables Operation Success", "tooltip": { "shared": true, "sort": 2, @@ -449,7 +449,7 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, "stack": false, "steppedLine": false, "targets": [ @@ -465,7 +465,7 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Compact and Mark Operations Duration", + "title": "Compact Tables Operations Duration", "tooltip": { "shared": true, "sort": 2, @@ -497,7 +497,19 @@ "show": false } ] - }, + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Compaction", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ { "aliasColors": { }, "bars": false, @@ -525,7 +537,83 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 4, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(loki_compactor_skipped_compacting_locked_table_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$__range]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{table_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Number of times Tables were skipped during Compaction", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, "stack": false, "steppedLine": false, "targets": [ @@ -541,7 +629,279 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Compact and Mark Operations Per Status", + "title": "Compact Tables Operations Per Status", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "custom": { }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "dateTimeFromNow" + } + }, + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "options": { + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": { }, + "textMode": "auto" + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "loki_compactor_apply_retention_last_successful_run_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"} * 1e3", + "format": "time_series", + "instant": true, + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Last Mark Operation Success", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "stat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "loki_compactor_apply_retention_operation_duration_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "duration", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, 
+ "timeShift": null, + "title": "Mark Operations Duration", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (status)(rate(loki_compactor_apply_retention_operation_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{success}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Mark Operations Per Status", "tooltip": { "shared": true, "sort": 2, @@ -579,7 +939,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Compact and Mark", + "title": "Retention", "titleSize": "h6" }, { @@ -593,7 +953,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 11, "legend": { "avg": false, "current": false, @@ -669,7 +1029,7 @@ "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 12, "legend": { "avg": false, "current": false, @@ -745,7 +1105,7 @@ "dashes": false, "datasource": 
"$datasource", "fill": 10, - "id": 9, + "id": 13, "legend": { "avg": false, "current": false, @@ -834,7 +1194,7 @@ "datasource": "$datasource", "fill": 1, "format": "short", - "id": 10, + "id": 14, "legend": { "avg": false, "current": false, @@ -909,7 +1269,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 11, + "id": 15, "legend": { "avg": false, "current": false, @@ -1014,7 +1374,7 @@ "datasource": "$datasource", "fill": 1, "format": "short", - "id": 12, + "id": 16, "legend": { "avg": false, "current": false, @@ -1089,7 +1449,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 13, + "id": 17, "legend": { "avg": false, "current": false, @@ -1193,7 +1553,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 14, + "id": 18, "legend": { "avg": false, "current": false, @@ -1269,7 +1629,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 15, + "id": 19, "legend": { "avg": false, "current": false, @@ -1345,7 +1705,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 16, + "id": 20, "legend": { "avg": false, "current": false, @@ -1428,7 +1788,7 @@ "panels": [ { "datasource": "$loki_datasource", - "id": 17, + "id": 21, "span": 12, "targets": [ { diff --git a/production/loki-mixin/dashboards/loki-retention.libsonnet b/production/loki-mixin/dashboards/loki-retention.libsonnet index 8e28ccdb0e..a5aa45a13d 100644 --- a/production/loki-mixin/dashboards/loki-retention.libsonnet +++ b/production/loki-mixin/dashboards/loki-retention.libsonnet @@ -25,20 +25,42 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRow( - $.row('Compact and Mark') + $.row('Compaction') .addPanel( - $.fromNowPanel('Last Compact and Mark Operation Success', 'loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds') + $.fromNowPanel('Last Compact Tables Operation Success', 'loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds') ) .addPanel( - 
$.panel('Compact and Mark Operations Duration') + + $.panel('Compact Tables Operations Duration') + $.queryPanel(['loki_boltdb_shipper_compact_tables_operation_duration_seconds{%s}' % $.namespaceMatcher()], ['duration']) + { yaxes: $.yaxes('s') }, ) + ) + .addRow( + $.row('') .addPanel( - $.panel('Compact and Mark Operations Per Status') + + $.panel('Number of times Tables were skipped during Compaction') + + $.queryPanel(['sum(increase(loki_compactor_skipped_compacting_locked_table_total{%s}[$__range]))' % $.namespaceMatcher()], ['{{table_name}}']), + ) + .addPanel( + $.panel('Compact Tables Operations Per Status') + $.queryPanel(['sum by (status)(rate(loki_boltdb_shipper_compact_tables_operation_total{%s}[$__rate_interval]))' % $.namespaceMatcher()], ['{{success}}']), ) ) + .addRow( + $.row('Retention') + .addPanel( + $.fromNowPanel('Last Mark Operation Success', 'loki_compactor_apply_retention_last_successful_run_timestamp_seconds') + ) + .addPanel( + $.panel('Mark Operations Duration') + + $.queryPanel(['loki_compactor_apply_retention_operation_duration_seconds{%s}' % $.namespaceMatcher()], ['duration']) + + { yaxes: $.yaxes('s') }, + ) + .addPanel( + $.panel('Mark Operations Per Status') + + $.queryPanel(['sum by (status)(rate(loki_compactor_apply_retention_operation_total{%s}[$__rate_interval]))' % $.namespaceMatcher()], ['{{success}}']), + ) + ) .addRow( $.row('Per Table Marker') .addPanel(