adds loki_ingester_wal_replay_active metric and records this more accurately (#4193)

* adds loki_ingester_wal_replay_active metric and records this more accurately

* adds WAL to msg
Owen Diehl (committed via GitHub)
commit 9cdb1295b5 (parent 345e10128f)
pkg/ingester/ingester.go | 34
pkg/ingester/metrics.go  |  5

@@ -318,21 +318,37 @@ func (i *Ingester) setupAutoForget() {
 func (i *Ingester) starting(ctx context.Context) error {
 	if i.cfg.WAL.Enabled {
+		start := time.Now()
 		// Ignore retain period during wal replay.
-		old := i.cfg.RetainPeriod
+		oldRetain := i.cfg.RetainPeriod
 		i.cfg.RetainPeriod = 0
-		defer func() {
-			i.cfg.RetainPeriod = old
-		}()
 		// Disable the in process stream limit checks while replaying the WAL.
 		// It is re-enabled in the recover's Close() method.
 		i.limiter.DisableForWALReplay()
 		recoverer := newIngesterRecoverer(i)
-		defer recoverer.Close()
-		start := time.Now()
+		i.metrics.walReplayActive.Set(1)
+		endReplay := func() func() {
+			var once sync.Once
+			return func() {
+				once.Do(func() {
+					level.Info(util_log.Logger).Log("msg", "closing recoverer")
+					recoverer.Close()
+					elapsed := time.Since(start)
+					i.metrics.walReplayActive.Set(0)
+					i.metrics.walReplayDuration.Set(elapsed.Seconds())
+					i.cfg.RetainPeriod = oldRetain
+					level.Info(util_log.Logger).Log("msg", "WAL recovery finished", "time", elapsed.String())
+				})
+			}
+		}()
+		defer endReplay()
 		level.Info(util_log.Logger).Log("msg", "recovering from checkpoint")
 		checkpointReader, checkpointCloser, err := newCheckpointReader(i.cfg.WAL.Dir)
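
Note: the once-guarded endReplay closure above is the core of this change. For illustration only, here is a self-contained Go sketch of the same pattern; replayWAL, active, and duration are hypothetical stand-ins, not the actual ingester code.

    package main

    import (
    	"sync"
    	"time"

    	"github.com/prometheus/client_golang/prometheus"
    )

    // replayWAL is a hypothetical, simplified stand-in for Ingester.starting.
    // It shows the once-guarded teardown pattern from the hunk above: the
    // closure can fire via defer (early error return) or via the explicit
    // call at the end, but its body runs exactly once either way.
    func replayWAL(active, duration prometheus.Gauge, replay func() error) error {
    	start := time.Now()
    	active.Set(1) // replay in progress

    	endReplay := func() func() {
    		var once sync.Once
    		return func() {
    			once.Do(func() {
    				active.Set(0)
    				duration.Set(time.Since(start).Seconds())
    			})
    		}
    	}()
    	defer endReplay() // safety net if replay bails out early

    	if err := replay(); err != nil {
    		return err
    	}

    	endReplay() // normal path; teardown still runs only once in total
    	return nil
    }

Because sync.Once guards the body, the deferred call and the explicit call on a successful replay cannot double-count the duration or leave the gauge stuck at 1.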
@@ -378,11 +394,7 @@ func (i *Ingester) starting(ctx context.Context) error {
 			"errors", segmentRecoveryErr != nil,
 		)
-		level.Info(util_log.Logger).Log("msg", "closing recoverer")
-		recoverer.Close()
-		elapsed := time.Since(start)
-		i.metrics.walReplayDuration.Set(elapsed.Seconds())
-		level.Info(util_log.Logger).Log("msg", "recovery finished", "time", elapsed.String())
+		endReplay()
 		i.wal.Start()
 	}
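
The hunk above swaps the old inline cleanup for a call to the same endReplay closure, relying on sync.Once to make the deferred and explicit calls safe together. A small, hypothetical driver that completes the earlier sketch into a runnable program and checks the gauge lifecycle with the client_golang testutil helpers:

    package main

    import (
    	"fmt"

    	"github.com/prometheus/client_golang/prometheus"
    	"github.com/prometheus/client_golang/prometheus/testutil"
    )

    func main() {
    	// Throwaway gauges standing in for the ingester metrics.
    	active := prometheus.NewGauge(prometheus.GaugeOpts{Name: "wal_replay_active"})
    	duration := prometheus.NewGauge(prometheus.GaugeOpts{Name: "wal_replay_duration_seconds"})

    	// Drive the hypothetical replayWAL helper sketched earlier.
    	_ = replayWAL(active, duration, func() error { return nil })

    	// The gauge is back to 0 once replay has finished, and a duration is recorded.
    	fmt.Println(testutil.ToFloat64(active))   // 0
    	fmt.Println(testutil.ToFloat64(duration)) // tiny positive number
    }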

@@ -14,6 +14,7 @@ type ingesterMetrics struct {
 	checkpointLoggedBytesTotal prometheus.Counter
 	walDiskFullFailures        prometheus.Counter
+	walReplayActive            prometheus.Gauge
 	walReplayDuration          prometheus.Gauge
 	walCorruptionsTotal        *prometheus.CounterVec
 	walLoggedBytesTotal        prometheus.Counter
@@ -52,6 +53,10 @@ func newIngesterMetrics(r prometheus.Registerer) *ingesterMetrics {
 			Name: "loki_ingester_wal_disk_full_failures_total",
 			Help: "Total number of wal write failures due to full disk.",
 		}),
+		walReplayActive: promauto.With(r).NewGauge(prometheus.GaugeOpts{
+			Name: "loki_ingester_wal_replay_active",
+			Help: "Whether the WAL is replaying",
+		}),
 		walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{
 			Name: "loki_ingester_wal_replay_duration_seconds",
 			Help: "Time taken to replay the checkpoint and the WAL.",
