mirror of https://github.com/grafana/loki
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
297 lines
13 KiB
297 lines
13 KiB
package ingester
|
|
|
|
import (
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
|
|
"github.com/grafana/loki/v3/pkg/analytics"
|
|
"github.com/grafana/loki/v3/pkg/util/constants"
|
|
"github.com/grafana/loki/v3/pkg/validation"
|
|
)
|
|
|
|
type ingesterMetrics struct {
|
|
checkpointDeleteFail prometheus.Counter
|
|
checkpointDeleteTotal prometheus.Counter
|
|
checkpointCreationFail prometheus.Counter
|
|
checkpointCreationTotal prometheus.Counter
|
|
checkpointDuration prometheus.Summary
|
|
checkpointLoggedBytesTotal prometheus.Counter
|
|
|
|
walDiskFullFailures prometheus.Counter
|
|
walReplayActive prometheus.Gauge
|
|
walReplayDuration prometheus.Gauge
|
|
walReplaySamplesDropped *prometheus.CounterVec
|
|
walReplayBytesDropped *prometheus.CounterVec
|
|
walCorruptionsTotal *prometheus.CounterVec
|
|
walLoggedBytesTotal prometheus.Counter
|
|
walRecordsLogged prometheus.Counter
|
|
|
|
recoveredStreamsTotal prometheus.Counter
|
|
recoveredChunksTotal prometheus.Counter
|
|
recoveredEntriesTotal prometheus.Counter
|
|
duplicateEntriesTotal prometheus.Counter
|
|
recoveredBytesTotal prometheus.Counter
|
|
recoveryBytesInUse prometheus.Gauge
|
|
recoveryIsFlushing prometheus.Gauge
|
|
|
|
limiterEnabled prometheus.Gauge
|
|
|
|
autoForgetUnhealthyIngestersTotal prometheus.Counter
|
|
|
|
chunkUtilization prometheus.Histogram
|
|
memoryChunks prometheus.Gauge
|
|
chunkEntries prometheus.Histogram
|
|
chunkSize prometheus.Histogram
|
|
chunkCompressionRatio prometheus.Histogram
|
|
chunksPerTenant *prometheus.CounterVec
|
|
chunkSizePerTenant *prometheus.CounterVec
|
|
chunkAge prometheus.Histogram
|
|
chunkEncodeTime prometheus.Histogram
|
|
chunksFlushFailures prometheus.Counter
|
|
chunksFlushedPerReason *prometheus.CounterVec
|
|
chunkLifespan prometheus.Histogram
|
|
flushedChunksStats *analytics.Counter
|
|
flushedChunksBytesStats *analytics.Statistics
|
|
flushedChunksLinesStats *analytics.Statistics
|
|
flushedChunksAgeStats *analytics.Statistics
|
|
flushedChunksLifespanStats *analytics.Statistics
|
|
flushedChunksUtilizationStats *analytics.Statistics
|
|
|
|
chunksCreatedTotal prometheus.Counter
|
|
samplesPerChunk prometheus.Histogram
|
|
blocksPerChunk prometheus.Histogram
|
|
chunkCreatedStats *analytics.Counter
|
|
|
|
// Shutdown marker for ingester scale down
|
|
shutdownMarker prometheus.Gauge
|
|
|
|
flushQueueLength prometheus.Gauge
|
|
}
|
|
|
|
// setRecoveryBytesInUse bounds the bytes reports to >= 0.
|
|
// TODO(owen-d): we can gain some efficiency by having the flusher never update this after recovery ends.
|
|
func (m *ingesterMetrics) setRecoveryBytesInUse(v int64) {
|
|
if v < 0 {
|
|
v = 0
|
|
}
|
|
m.recoveryBytesInUse.Set(float64(v))
|
|
}
|
|
|
|
// WAL kinds.
// NOTE(review): presumably the values of the "type" label on
// loki_ingester_wal_corruptions_total — confirm against callers.
const (
	walTypeCheckpoint = "checkpoint"
	walTypeSegment    = "segment"
)

// duplicateReason is a reason string for duplicate entries.
// NOTE(review): presumably used as the reason label value on the WAL
// discarded samples/bytes counters — confirm against callers.
const duplicateReason = "duplicate"
|
|
|
|
func newIngesterMetrics(r prometheus.Registerer, metricsNamespace string) *ingesterMetrics {
|
|
return &ingesterMetrics{
|
|
walDiskFullFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_disk_full_failures_total",
|
|
Help: "Total number of wal write failures due to full disk.",
|
|
}),
|
|
walReplayActive: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Name: "loki_ingester_wal_replay_active",
|
|
Help: "Whether the WAL is replaying",
|
|
}),
|
|
walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Name: "loki_ingester_wal_replay_duration_seconds",
|
|
Help: "Time taken to replay the checkpoint and the WAL.",
|
|
}),
|
|
walReplaySamplesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_discarded_samples_total",
|
|
Help: "WAL segment entries discarded during replay",
|
|
}, []string{validation.ReasonLabel}),
|
|
walReplayBytesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_discarded_bytes_total",
|
|
Help: "WAL segment bytes discarded during replay",
|
|
}, []string{validation.ReasonLabel}),
|
|
walCorruptionsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_corruptions_total",
|
|
Help: "Total number of WAL corruptions encountered.",
|
|
}, []string{"type"}),
|
|
checkpointDeleteFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_checkpoint_deletions_failed_total",
|
|
Help: "Total number of checkpoint deletions that failed.",
|
|
}),
|
|
checkpointDeleteTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_checkpoint_deletions_total",
|
|
Help: "Total number of checkpoint deletions attempted.",
|
|
}),
|
|
checkpointCreationFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_checkpoint_creations_failed_total",
|
|
Help: "Total number of checkpoint creations that failed.",
|
|
}),
|
|
checkpointCreationTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_checkpoint_creations_total",
|
|
Help: "Total number of checkpoint creations attempted.",
|
|
}),
|
|
checkpointDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{
|
|
Name: "loki_ingester_checkpoint_duration_seconds",
|
|
Help: "Time taken to create a checkpoint.",
|
|
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
|
}),
|
|
walRecordsLogged: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_records_logged_total",
|
|
Help: "Total number of WAL records logged.",
|
|
}),
|
|
checkpointLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_checkpoint_logged_bytes_total",
|
|
Help: "Total number of bytes written to disk for checkpointing.",
|
|
}),
|
|
walLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_logged_bytes_total",
|
|
Help: "Total number of bytes written to disk for WAL records.",
|
|
}),
|
|
recoveredStreamsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_recovered_streams_total",
|
|
Help: "Total number of streams recovered from the WAL.",
|
|
}),
|
|
recoveredChunksTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_recovered_chunks_total",
|
|
Help: "Total number of chunks recovered from the WAL checkpoints.",
|
|
}),
|
|
recoveredEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_recovered_entries_total",
|
|
Help: "Total number of entries recovered from the WAL.",
|
|
}),
|
|
duplicateEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_duplicate_entries_total",
|
|
Help: "Entries discarded during WAL replay due to existing in checkpoints.",
|
|
}),
|
|
recoveredBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_wal_recovered_bytes_total",
|
|
Help: "Total number of bytes recovered from the WAL.",
|
|
}),
|
|
recoveryBytesInUse: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Name: "loki_ingester_wal_bytes_in_use",
|
|
Help: "Total number of bytes in use by the WAL recovery process.",
|
|
}),
|
|
recoveryIsFlushing: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Name: "loki_ingester_wal_replay_flushing",
|
|
Help: "Whether the wal replay is in a flushing phase due to backpressure",
|
|
}),
|
|
limiterEnabled: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Name: "loki_ingester_limiter_enabled",
|
|
Help: "Whether the ingester's limiter is enabled",
|
|
}),
|
|
autoForgetUnhealthyIngestersTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Name: "loki_ingester_autoforget_unhealthy_ingesters_total",
|
|
Help: "Total number of ingesters automatically forgotten",
|
|
}),
|
|
chunkUtilization: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunk_utilization",
|
|
Help: "Distribution of stored chunk utilization (when stored).",
|
|
Buckets: prometheus.LinearBuckets(0, 0.2, 6),
|
|
}),
|
|
memoryChunks: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_memory_chunks",
|
|
Help: "The total number of chunks in memory.",
|
|
}),
|
|
chunkEntries: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunk_entries",
|
|
Help: "Distribution of stored lines per chunk (when stored).",
|
|
Buckets: prometheus.ExponentialBuckets(200, 2, 9), // biggest bucket is 200*2^(9-1) = 51200
|
|
}),
|
|
chunkSize: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunk_size_bytes",
|
|
Help: "Distribution of stored chunk sizes (when stored).",
|
|
Buckets: prometheus.ExponentialBuckets(20000, 2, 10), // biggest bucket is 20000*2^(10-1) = 10,240,000 (~10.2MB)
|
|
}),
|
|
chunkCompressionRatio: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunk_compression_ratio",
|
|
Help: "Compression ratio of chunks (when stored).",
|
|
Buckets: prometheus.LinearBuckets(.75, 2, 10),
|
|
}),
|
|
chunksPerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunks_stored_total",
|
|
Help: "Total stored chunks per tenant.",
|
|
}, []string{"tenant"}),
|
|
chunkSizePerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunk_stored_bytes_total",
|
|
Help: "Total bytes stored in chunks per tenant.",
|
|
}, []string{"tenant"}),
|
|
chunkAge: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunk_age_seconds",
|
|
Help: "Distribution of chunk ages (when stored).",
|
|
// with default settings chunks should flush between 5 min and 12 hours
|
|
// so buckets at 1min, 5min, 10min, 30min, 1hr, 2hr, 4hr, 10hr, 12hr, 16hr
|
|
Buckets: []float64{60, 300, 600, 1800, 3600, 7200, 14400, 36000, 43200, 57600},
|
|
}),
|
|
chunkEncodeTime: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunk_encode_time_seconds",
|
|
Help: "Distribution of chunk encode times.",
|
|
// 10ms to 10s.
|
|
Buckets: prometheus.ExponentialBuckets(0.01, 4, 6),
|
|
}),
|
|
chunksFlushFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunks_flush_failures_total",
|
|
Help: "Total number of flush failures.",
|
|
}),
|
|
chunksFlushedPerReason: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunks_flushed_total",
|
|
Help: "Total flushed chunks per reason.",
|
|
}, []string{"reason"}),
|
|
chunkLifespan: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunk_bounds_hours",
|
|
Help: "Distribution of chunk end-start durations.",
|
|
// 1h -> 8hr
|
|
Buckets: prometheus.LinearBuckets(1, 1, 8),
|
|
}),
|
|
flushedChunksStats: analytics.NewCounter("ingester_flushed_chunks"),
|
|
flushedChunksBytesStats: analytics.NewStatistics("ingester_flushed_chunks_bytes"),
|
|
flushedChunksLinesStats: analytics.NewStatistics("ingester_flushed_chunks_lines"),
|
|
flushedChunksAgeStats: analytics.NewStatistics("ingester_flushed_chunks_age_seconds"),
|
|
flushedChunksLifespanStats: analytics.NewStatistics("ingester_flushed_chunks_lifespan_seconds"),
|
|
flushedChunksUtilizationStats: analytics.NewStatistics("ingester_flushed_chunks_utilization"),
|
|
chunksCreatedTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
|
|
Namespace: constants.Loki,
|
|
Name: "ingester_chunks_created_total",
|
|
Help: "The total number of chunks created in the ingester.",
|
|
}),
|
|
samplesPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Subsystem: "ingester",
|
|
Name: "samples_per_chunk",
|
|
Help: "The number of samples in a chunk.",
|
|
|
|
Buckets: prometheus.LinearBuckets(4096, 2048, 6),
|
|
}),
|
|
blocksPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: constants.Loki,
|
|
Subsystem: "ingester",
|
|
Name: "blocks_per_chunk",
|
|
Help: "The number of blocks in a chunk.",
|
|
|
|
Buckets: prometheus.ExponentialBuckets(5, 2, 6),
|
|
}),
|
|
|
|
chunkCreatedStats: analytics.NewCounter("ingester_chunk_created"),
|
|
|
|
shutdownMarker: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Namespace: constants.Loki,
|
|
Subsystem: "ingester",
|
|
Name: "shutdown_marker",
|
|
Help: "1 if prepare shutdown has been called, 0 otherwise",
|
|
}),
|
|
|
|
flushQueueLength: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Namespace: metricsNamespace,
|
|
Subsystem: "ingester",
|
|
Name: "flush_queue_length",
|
|
Help: "The total number of series pending in the flush queue.",
|
|
}),
|
|
}
|
|
}
|
|
|