loki/pkg/ingester/metrics.go
package ingester

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"

	"github.com/grafana/loki/v3/pkg/analytics"
	"github.com/grafana/loki/v3/pkg/util/constants"
	"github.com/grafana/loki/v3/pkg/validation"
)

type ingesterMetrics struct {
	checkpointDeleteFail              prometheus.Counter
	checkpointDeleteTotal             prometheus.Counter
	checkpointCreationFail            prometheus.Counter
	checkpointCreationTotal           prometheus.Counter
	checkpointDuration                prometheus.Summary
	checkpointLoggedBytesTotal        prometheus.Counter
	walDiskFullFailures               prometheus.Counter
	walReplayActive                   prometheus.Gauge
	walReplayDuration                 prometheus.Gauge
	walReplaySamplesDropped           *prometheus.CounterVec
	walReplayBytesDropped             *prometheus.CounterVec
	walCorruptionsTotal               *prometheus.CounterVec
	walLoggedBytesTotal               prometheus.Counter
	walRecordsLogged                  prometheus.Counter
	recoveredStreamsTotal             prometheus.Counter
	recoveredChunksTotal              prometheus.Counter
	recoveredEntriesTotal             prometheus.Counter
	duplicateEntriesTotal             prometheus.Counter
	recoveredBytesTotal               prometheus.Counter
	recoveryBytesInUse                prometheus.Gauge
	recoveryIsFlushing                prometheus.Gauge
	limiterEnabled                    prometheus.Gauge
	autoForgetUnhealthyIngestersTotal prometheus.Counter
	chunkUtilization                  prometheus.Histogram
	memoryChunks                      prometheus.Gauge
	chunkEntries                      prometheus.Histogram
	chunkSize                         prometheus.Histogram
	chunkCompressionRatio             prometheus.Histogram
	chunksPerTenant                   *prometheus.CounterVec
	chunkSizePerTenant                *prometheus.CounterVec
	chunkAge                          prometheus.Histogram
	chunkEncodeTime                   prometheus.Histogram
	chunksFlushFailures               prometheus.Counter
	chunksFlushedPerReason            *prometheus.CounterVec
	chunkLifespan                     prometheus.Histogram
	flushedChunksStats                *analytics.Counter
	flushedChunksBytesStats           *analytics.Statistics
	flushedChunksLinesStats           *analytics.Statistics
	flushedChunksAgeStats             *analytics.Statistics
	flushedChunksLifespanStats        *analytics.Statistics
	flushedChunksUtilizationStats     *analytics.Statistics
	chunksCreatedTotal                prometheus.Counter
	samplesPerChunk                   prometheus.Histogram
	blocksPerChunk                    prometheus.Histogram
	chunkCreatedStats                 *analytics.Counter

	// Shutdown marker for ingester scale down
	shutdownMarker prometheus.Gauge

	flushQueueLength prometheus.Gauge
}

// setRecoveryBytesInUse bounds the reported bytes to >= 0.
// TODO(owen-d): we can gain some efficiency by having the flusher never update this after recovery ends.
func (m *ingesterMetrics) setRecoveryBytesInUse(v int64) {
	if v < 0 {
		v = 0
	}
	m.recoveryBytesInUse.Set(float64(v))
}

const (
	walTypeCheckpoint = "checkpoint"
	walTypeSegment    = "segment"

	duplicateReason = "duplicate"
)

func newIngesterMetrics(r prometheus.Registerer, metricsNamespace string) *ingesterMetrics {
	return &ingesterMetrics{
		walDiskFullFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_disk_full_failures_total",
			Help: "Total number of wal write failures due to full disk.",
		}),
		walReplayActive: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_replay_active",
			Help: "Whether the WAL is replaying",
		}),
		walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_replay_duration_seconds",
			Help: "Time taken to replay the checkpoint and the WAL.",
		}),
		walReplaySamplesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Name: "loki_ingester_wal_discarded_samples_total",
			Help: "WAL segment entries discarded during replay",
		}, []string{validation.ReasonLabel}),
		walReplayBytesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Name: "loki_ingester_wal_discarded_bytes_total",
			Help: "WAL segment bytes discarded during replay",
		}, []string{validation.ReasonLabel}),
		walCorruptionsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Name: "loki_ingester_wal_corruptions_total",
			Help: "Total number of WAL corruptions encountered.",
		}, []string{"type"}),
		checkpointDeleteFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_deletions_failed_total",
			Help: "Total number of checkpoint deletions that failed.",
		}),
		checkpointDeleteTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_deletions_total",
			Help: "Total number of checkpoint deletions attempted.",
		}),
		checkpointCreationFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_creations_failed_total",
			Help: "Total number of checkpoint creations that failed.",
		}),
		checkpointCreationTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_creations_total",
			Help: "Total number of checkpoint creations attempted.",
		}),
		checkpointDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{
			Name:       "loki_ingester_checkpoint_duration_seconds",
			Help:       "Time taken to create a checkpoint.",
			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
		}),
		walRecordsLogged: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_records_logged_total",
			Help: "Total number of WAL records logged.",
		}),
		checkpointLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_logged_bytes_total",
			Help: "Total number of bytes written to disk for checkpointing.",
		}),
		walLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_logged_bytes_total",
			Help: "Total number of bytes written to disk for WAL records.",
		}),
		recoveredStreamsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_recovered_streams_total",
			Help: "Total number of streams recovered from the WAL.",
		}),
		recoveredChunksTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_recovered_chunks_total",
			Help: "Total number of chunks recovered from the WAL checkpoints.",
		}),
		recoveredEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_recovered_entries_total",
			Help: "Total number of entries recovered from the WAL.",
		}),
		duplicateEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_duplicate_entries_total",
			Help: "Entries discarded during WAL replay due to existing in checkpoints.",
		}),
		recoveredBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_recovered_bytes_total",
			Help: "Total number of bytes recovered from the WAL.",
		}),
		recoveryBytesInUse: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_bytes_in_use",
			Help: "Total number of bytes in use by the WAL recovery process.",
		}),
		recoveryIsFlushing: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_replay_flushing",
			Help: "Whether the wal replay is in a flushing phase due to backpressure",
		}),
		limiterEnabled: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_limiter_enabled",
			Help: "Whether the ingester's limiter is enabled",
		}),
		autoForgetUnhealthyIngestersTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_autoforget_unhealthy_ingesters_total",
			Help: "Total number of ingesters automatically forgotten",
		}),
		chunkUtilization: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_utilization",
			Help:      "Distribution of stored chunk utilization (when stored).",
			Buckets:   prometheus.LinearBuckets(0, 0.2, 6),
		}),
		memoryChunks: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Namespace: constants.Loki,
			Name:      "ingester_memory_chunks",
			Help:      "The total number of chunks in memory.",
		}),
		chunkEntries: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_entries",
			Help:      "Distribution of stored lines per chunk (when stored).",
			Buckets:   prometheus.ExponentialBuckets(200, 2, 9), // biggest bucket is 200*2^(9-1) = 51200
		}),
		chunkSize: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_size_bytes",
			Help:      "Distribution of stored chunk sizes (when stored).",
			Buckets:   prometheus.ExponentialBuckets(20000, 2, 10), // biggest bucket is 20000*2^(10-1) = 10,240,000 (~10.2MB)
		}),
		chunkCompressionRatio: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_compression_ratio",
			Help:      "Compression ratio of chunks (when stored).",
			Buckets:   prometheus.LinearBuckets(.75, 2, 10),
		}),
		chunksPerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_stored_total",
			Help:      "Total stored chunks per tenant.",
		}, []string{"tenant"}),
		chunkSizePerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_stored_bytes_total",
			Help:      "Total bytes stored in chunks per tenant.",
		}, []string{"tenant"}),
		chunkAge: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_age_seconds",
			Help:      "Distribution of chunk ages (when stored).",
			// with default settings chunks should flush between 5 min and 12 hours
			// so buckets at 1min, 5min, 10min, 30min, 1hr, 2hr, 4hr, 10hr, 12hr, 16hr
			Buckets: []float64{60, 300, 600, 1800, 3600, 7200, 14400, 36000, 43200, 57600},
		}),
		chunkEncodeTime: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_encode_time_seconds",
			Help:      "Distribution of chunk encode times.",
			// 10ms to 10s.
			Buckets: prometheus.ExponentialBuckets(0.01, 4, 6),
		}),
		chunksFlushFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_flush_failures_total",
			Help:      "Total number of flush failures.",
		}),
		chunksFlushedPerReason: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_flushed_total",
			Help:      "Total flushed chunks per reason.",
		}, []string{"reason"}),
		chunkLifespan: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_bounds_hours",
			Help:      "Distribution of chunk end-start durations.",
			// 1h -> 8hr
			Buckets: prometheus.LinearBuckets(1, 1, 8),
		}),
		flushedChunksStats:            analytics.NewCounter("ingester_flushed_chunks"),
		flushedChunksBytesStats:       analytics.NewStatistics("ingester_flushed_chunks_bytes"),
		flushedChunksLinesStats:       analytics.NewStatistics("ingester_flushed_chunks_lines"),
		flushedChunksAgeStats:         analytics.NewStatistics("ingester_flushed_chunks_age_seconds"),
		flushedChunksLifespanStats:    analytics.NewStatistics("ingester_flushed_chunks_lifespan_seconds"),
		flushedChunksUtilizationStats: analytics.NewStatistics("ingester_flushed_chunks_utilization"),
		chunksCreatedTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_created_total",
			Help:      "The total number of chunks created in the ingester.",
		}),
		samplesPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Subsystem: "ingester",
			Name:      "samples_per_chunk",
			Help:      "The number of samples in a chunk.",
			Buckets:   prometheus.LinearBuckets(4096, 2048, 6),
		}),
		blocksPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Subsystem: "ingester",
			Name:      "blocks_per_chunk",
			Help:      "The number of blocks in a chunk.",
			Buckets:   prometheus.ExponentialBuckets(5, 2, 6),
		}),
		chunkCreatedStats: analytics.NewCounter("ingester_chunk_created"),
		shutdownMarker: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Namespace: constants.Loki,
			Subsystem: "ingester",
			Name:      "shutdown_marker",
			Help:      "1 if prepare shutdown has been called, 0 otherwise",
		}),
		flushQueueLength: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Namespace: metricsNamespace,
			Subsystem: "ingester",
			Name:      "flush_queue_length",
			Help:      "The total number of series pending in the flush queue.",
		}),
	}
}
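
// exampleIngesterMetricsUsage is a hypothetical usage sketch, not part of the
// original file: it shows how the constructor above might be wired against an
// isolated registry (e.g. in a test). The namespace string and the sample
// metric updates below are assumptions made purely for illustration.
func exampleIngesterMetricsUsage() {
	// Register all ingester metrics against a fresh, isolated registry.
	reg := prometheus.NewRegistry()
	m := newIngesterMetrics(reg, "loki")

	// Negative recovery sizes are clamped to zero before being exported.
	m.setRecoveryBytesInUse(-128)

	// Plain counters/gauges are updated directly; vectors need label values.
	m.memoryChunks.Set(42)
	m.chunksFlushedPerReason.WithLabelValues("full").Inc()
	m.walCorruptionsTotal.WithLabelValues(walTypeSegment).Inc()
}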