package ingester import ( "bytes" "context" "errors" "fmt" "os" "path/filepath" "regexp" "strconv" "time" "github.com/dustin/go-humanize" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/gogo/protobuf/proto" tsdb_errors "github.com/prometheus/prometheus/tsdb/errors" "github.com/prometheus/prometheus/tsdb/fileutil" "github.com/prometheus/prometheus/tsdb/wlog" prompool "github.com/prometheus/prometheus/util/pool" "github.com/grafana/loki/v3/pkg/chunkenc" "github.com/grafana/loki/v3/pkg/ingester/wal" "github.com/grafana/loki/v3/pkg/logproto" util_log "github.com/grafana/loki/v3/pkg/util/log" "github.com/grafana/loki/v3/pkg/util/pool" ) var ( // todo(ctovena) those pools should be in factor of the actual configuration (blocksize, targetsize). // Starting with something sane first then we can refine with more experience. // Buckets [1KB 2KB 4KB 16KB 32KB to 4MB] by 2 chunksBufferPool = pool.NewBuffer(1024, 4*1024*1024, 2) // Buckets [64B 128B 256B 512B... to 2MB] by 2 headBufferPool = pool.NewBuffer(64, 2*1024*1024, 2) ) type chunkWithBuffer struct { blocks, head *bytes.Buffer Chunk } // The passed wireChunks slice is for re-use. func toWireChunks(descs []chunkDesc, wireChunks []chunkWithBuffer) ([]chunkWithBuffer, error) { // release memory from previous list of chunks. for _, wc := range wireChunks { chunksBufferPool.Put(wc.blocks) headBufferPool.Put(wc.head) wc.Data = nil wc.Head = nil } if cap(wireChunks) < len(descs) { wireChunks = make([]chunkWithBuffer, len(descs)) } else { wireChunks = wireChunks[:len(descs)] } for i, d := range descs { from, to := d.chunk.Bounds() chunkSize, headSize := d.chunk.CheckpointSize() wireChunk := chunkWithBuffer{ Chunk: Chunk{ From: from, To: to, Closed: d.closed, FlushedAt: d.flushed, LastUpdated: d.lastUpdated, Synced: d.synced, }, blocks: chunksBufferPool.Get(chunkSize), head: headBufferPool.Get(headSize), } err := d.chunk.SerializeForCheckpointTo( wireChunk.blocks, wireChunk.head, ) if err != nil { return nil, err } wireChunk.Data = wireChunk.blocks.Bytes() wireChunk.Head = wireChunk.head.Bytes() wireChunks[i] = wireChunk } return wireChunks, nil } func fromWireChunks(conf *Config, headfmt chunkenc.HeadBlockFmt, wireChunks []Chunk) ([]chunkDesc, error) { descs := make([]chunkDesc, 0, len(wireChunks)) for _, c := range wireChunks { desc := chunkDesc{ closed: c.Closed, synced: c.Synced, flushed: c.FlushedAt, lastUpdated: c.LastUpdated, } mc, err := chunkenc.MemchunkFromCheckpoint(c.Data, c.Head, headfmt, conf.BlockSize, conf.TargetChunkSize) if err != nil { return nil, err } desc.chunk = mc descs = append(descs, desc) } return descs, nil } // nolint:interfacer func decodeCheckpointRecord(rec []byte, s *Series) error { // TODO(owen-d): reduce allocs // The proto unmarshaling code will retain references to the underlying []byte it's passed // in order to reduce allocs. This is harmful to us because when reading from a WAL, the []byte // is only guaranteed to be valid between calls to Next(). // Therefore, we copy it to avoid this problem. cpy := make([]byte, len(rec)) copy(cpy, rec) switch wal.RecordType(cpy[0]) { case wal.CheckpointRecord: return proto.Unmarshal(cpy[1:], s) default: return fmt.Errorf("unexpected record type: %d", rec[0]) } } func encodeWithTypeHeader(m *Series, typ wal.RecordType, buf []byte) ([]byte, error) { size := m.Size() if cap(buf) < size+1 { buf = make([]byte, size+1) } _, err := m.MarshalTo(buf[1 : size+1]) if err != nil { return nil, err } buf[0] = byte(typ) return buf[:size+1], nil } type SeriesWithErr struct { Err error Series *Series } type SeriesIter interface { Count() int Iter() *streamIterator Stop() } type ingesterSeriesIter struct { ing ingesterInstances done chan struct{} } type ingesterInstances interface { getInstances() []*instance } func newIngesterSeriesIter(ing ingesterInstances) *ingesterSeriesIter { return &ingesterSeriesIter{ ing: ing, done: make(chan struct{}), } } func (i *ingesterSeriesIter) Count() (ct int) { for _, inst := range i.ing.getInstances() { ct += inst.numStreams() } return ct } func (i *ingesterSeriesIter) Stop() { close(i.done) } func (i *ingesterSeriesIter) Iter() *streamIterator { return newStreamsIterator(i.ing) } type streamInstance struct { id string streams []*stream } type streamIterator struct { instances []streamInstance current Series buffer []chunkWithBuffer err error } // newStreamsIterator returns a new stream iterators that iterates over one instance at a time, then // each stream per instances. func newStreamsIterator(ing ingesterInstances) *streamIterator { instances := ing.getInstances() streamInstances := make([]streamInstance, len(instances)) for i, inst := range instances { streams := make([]*stream, 0, inst.streams.Len()) _ = inst.forAllStreams(context.Background(), func(s *stream) error { streams = append(streams, s) return nil }) streamInstances[i] = streamInstance{ streams: streams, id: inst.instanceID, } } return &streamIterator{ instances: streamInstances, } } // Next loads the next stream of the current instance. // If the instance is empty, it moves to the next instance until there is no more. // Return true if there's a next stream, each successful calls will replace the current stream. func (s *streamIterator) Next() bool { if len(s.instances) == 0 { s.instances = nil return false } currentInstance := s.instances[0] if len(currentInstance.streams) == 0 { s.instances = s.instances[1:] return s.Next() } // current stream stream := currentInstance.streams[0] // remove the first stream s.instances[0].streams = s.instances[0].streams[1:] stream.chunkMtx.RLock() defer stream.chunkMtx.RUnlock() if len(stream.chunks) < 1 { // it's possible the stream has been flushed to storage // in between starting the checkpointing process and // checkpointing this stream. return s.Next() } chunks, err := toWireChunks(stream.chunks, s.buffer) if err != nil { s.err = err return false } s.buffer = chunks s.current.Chunks = s.current.Chunks[:0] if cap(s.current.Chunks) == 0 { s.current.Chunks = make([]Chunk, 0, len(chunks)) } for _, c := range chunks { s.current.Chunks = append(s.current.Chunks, c.Chunk) } s.current.UserID = currentInstance.id s.current.Fingerprint = uint64(stream.fp) s.current.Labels = logproto.FromLabelsToLabelAdapters(stream.labels) s.current.To = stream.lastLine.ts s.current.LastLine = stream.lastLine.content s.current.EntryCt = stream.entryCt s.current.HighestTs = stream.highestTs return true } // Err returns an errors thrown while iterating over the streams. func (s *streamIterator) Error() error { return s.err } // Stream is serializable (for checkpointing) stream of chunks. // NOTE: the series is re-used between successful Next calls. // This means you should make a copy or use the data before calling Next. func (s *streamIterator) Stream() *Series { return &s.current } type CheckpointWriter interface { // Advances current checkpoint, can also signal a no-op. Advance() (noop bool, err error) Write(*Series) error // Closes current checkpoint. Close(abort bool) error } type walLogger interface { Log(recs ...[]byte) error Close() error Dir() string } type WALCheckpointWriter struct { metrics *ingesterMetrics segmentWAL *wlog.WL checkpointWAL walLogger lastSegment int // name of the last segment guaranteed to be covered by the checkpoint final string // filename to atomically rotate upon completion bufSize int recs [][]byte } func (w *WALCheckpointWriter) Advance() (bool, error) { _, lastSegment, err := wlog.Segments(w.segmentWAL.Dir()) if err != nil { return false, err } if lastSegment < 0 { // There are no WAL segments. No need of checkpoint yet. return true, nil } // First we advance the wal segment internally to ensure we don't overlap a previous checkpoint in // low throughput scenarios and to minimize segment replays on top of checkpoints. if _, err := w.segmentWAL.NextSegment(); err != nil { return false, err } // Checkpoint is named after the last WAL segment present so that when replaying the WAL // we can start from that particular WAL segment. checkpointDir := filepath.Join(w.segmentWAL.Dir(), fmt.Sprintf(checkpointPrefix+"%06d", lastSegment)) level.Info(util_log.Logger).Log("msg", "attempting checkpoint for", "dir", checkpointDir) checkpointDirTemp := checkpointDir + ".tmp" // cleanup any old partial checkpoints if _, err := os.Stat(checkpointDirTemp); err == nil { if err := os.RemoveAll(checkpointDirTemp); err != nil { level.Error(util_log.Logger).Log("msg", "unable to cleanup old tmp checkpoint", "dir", checkpointDirTemp) return false, err } } if err := os.MkdirAll(checkpointDirTemp, 0777); err != nil { return false, fmt.Errorf("create checkpoint dir: %w", err) } checkpoint, err := wlog.NewSize(log.With(util_log.Logger, "component", "checkpoint_wal"), nil, checkpointDirTemp, walSegmentSize, wlog.CompressionNone) if err != nil { return false, fmt.Errorf("open checkpoint: %w", err) } w.checkpointWAL = checkpoint w.lastSegment = lastSegment w.final = checkpointDir return false, nil } // Buckets [64KB to 256MB] by 2 var recordBufferPool = prompool.New(1<<16, 1<<28, 2, func(size int) interface{} { return make([]byte, 0, size) }) func (w *WALCheckpointWriter) Write(s *Series) error { size := s.Size() + 1 // +1 for header buf := recordBufferPool.Get(size).([]byte)[:size] b, err := encodeWithTypeHeader(s, wal.CheckpointRecord, buf) if err != nil { return err } w.recs = append(w.recs, b) w.bufSize += len(b) level.Debug(util_log.Logger).Log("msg", "writing series", "size", humanize.Bytes(uint64(len(b)))) // 1MB if w.bufSize > 1<<20 { if err := w.flush(); err != nil { return err } } return nil } func (w *WALCheckpointWriter) flush() error { level.Debug(util_log.Logger).Log("msg", "flushing series", "totalSize", humanize.Bytes(uint64(w.bufSize)), "series", len(w.recs)) if err := w.checkpointWAL.Log(w.recs...); err != nil { return err } w.metrics.checkpointLoggedBytesTotal.Add(float64(w.bufSize)) for _, b := range w.recs { recordBufferPool.Put(b) } w.recs = w.recs[:0] w.bufSize = 0 return nil } const checkpointPrefix = "checkpoint." var checkpointRe = regexp.MustCompile("^" + regexp.QuoteMeta(checkpointPrefix) + "(\\d+)(\\.tmp)?$") // checkpointIndex returns the index of a given checkpoint file. It handles // both regular and temporary checkpoints according to the includeTmp flag. If // the file is not a checkpoint it returns an error. func checkpointIndex(filename string, includeTmp bool) (int, error) { result := checkpointRe.FindStringSubmatch(filename) if len(result) < 2 { return 0, errors.New("file is not a checkpoint") } // Filter out temporary checkpoints if desired. if !includeTmp && len(result) == 3 && result[2] != "" { return 0, errors.New("temporary checkpoint") } return strconv.Atoi(result[1]) } // lastCheckpoint returns the directory name and index of the most recent checkpoint. // If dir does not contain any checkpoints, -1 is returned as index. func lastCheckpoint(dir string) (string, int, error) { dirs, err := os.ReadDir(dir) if err != nil { return "", -1, err } var ( maxIdx = -1 checkpointDir string ) // There may be multiple checkpoints left, so select the one with max index. for i := 0; i < len(dirs); i++ { di := dirs[i] idx, err := checkpointIndex(di.Name(), false) if err != nil { continue } if !di.IsDir() { return "", -1, fmt.Errorf("checkpoint %s is not a directory", di.Name()) } if idx > maxIdx { checkpointDir = di.Name() maxIdx = idx } } if maxIdx >= 0 { return filepath.Join(dir, checkpointDir), maxIdx, nil } return "", -1, nil } // deleteCheckpoints deletes all checkpoints in a directory which is < maxIndex. func (w *WALCheckpointWriter) deleteCheckpoints(maxIndex int) (err error) { w.metrics.checkpointDeleteTotal.Inc() defer func() { if err != nil { w.metrics.checkpointDeleteFail.Inc() } }() errs := tsdb_errors.NewMulti() files, err := os.ReadDir(w.segmentWAL.Dir()) if err != nil { return err } for _, fi := range files { index, err := checkpointIndex(fi.Name(), true) if err != nil || index >= maxIndex { continue } if err := os.RemoveAll(filepath.Join(w.segmentWAL.Dir(), fi.Name())); err != nil { errs.Add(err) } } return errs.Err() } func (w *WALCheckpointWriter) Close(abort bool) error { if len(w.recs) > 0 { if err := w.flush(); err != nil { return err } } if err := w.checkpointWAL.Close(); err != nil { return err } if abort { return os.RemoveAll(w.checkpointWAL.Dir()) } if err := fileutil.Replace(w.checkpointWAL.Dir(), w.final); err != nil { return fmt.Errorf("rename checkpoint directory: %w", err) } level.Info(util_log.Logger).Log("msg", "atomic checkpoint finished", "old", w.checkpointWAL.Dir(), "new", w.final) // We delete the WAL segments which are before the previous checkpoint and not before the // current checkpoint created. This is because if the latest checkpoint is corrupted for any reason, we // should be able to recover from the older checkpoint which would need the older WAL segments. if err := w.segmentWAL.Truncate(w.lastSegment + 1); err != nil { // It is fine to have old WAL segments hanging around if deletion failed. // We can try again next time. level.Error(util_log.Logger).Log("msg", "error deleting old WAL segments", "err", err, "lastSegment", w.lastSegment) } if w.lastSegment >= 0 { if err := w.deleteCheckpoints(w.lastSegment); err != nil { // It is fine to have old checkpoints hanging around if deletion failed. // We can try again next time. level.Error(util_log.Logger).Log("msg", "error deleting old checkpoint", "err", err) } } return nil } type Checkpointer struct { dur time.Duration iter SeriesIter writer CheckpointWriter metrics *ingesterMetrics quit <-chan struct{} } func NewCheckpointer(dur time.Duration, iter SeriesIter, writer CheckpointWriter, metrics *ingesterMetrics, quit <-chan struct{}) *Checkpointer { return &Checkpointer{ dur: dur, iter: iter, writer: writer, metrics: metrics, quit: quit, } } func (c *Checkpointer) PerformCheckpoint() (err error) { noop, err := c.writer.Advance() if err != nil { return err } if noop { return nil } c.metrics.checkpointCreationTotal.Inc() defer func() { if err != nil { c.metrics.checkpointCreationFail.Inc() } }() // signal whether checkpoint writes should be amortized or burst var immediate bool n := c.iter.Count() if n < 1 { return c.writer.Close(false) } // Give a 10% buffer to the checkpoint duration in order to account for // new series, slow writes, etc. perSeriesDuration := (90 * c.dur) / (100 * time.Duration(n)) ticker := time.NewTicker(perSeriesDuration) defer ticker.Stop() start := time.Now() defer func() { elapsed := time.Since(start) level.Info(util_log.Logger).Log("msg", "checkpoint done", "time", elapsed.String()) c.metrics.checkpointDuration.Observe(elapsed.Seconds()) }() iter := c.iter.Iter() for iter.Next() { if err := c.writer.Write(iter.Stream()); err != nil { return err } if !immediate { if time.Since(start) > c.dur { // This indicates the checkpoint is taking too long; stop waiting // and flush the remaining series as fast as possible. immediate = true continue } } select { case <-c.quit: return c.writer.Close(true) case <-ticker.C: } } if iter.Error() != nil { return iter.Error() } return c.writer.Close(false) } func (c *Checkpointer) Run() { ticker := time.NewTicker(c.dur) defer ticker.Stop() defer c.iter.Stop() for { select { case <-ticker.C: level.Info(util_log.Logger).Log("msg", "starting checkpoint") if err := c.PerformCheckpoint(); err != nil { level.Error(util_log.Logger).Log("msg", "error checkpointing series", "err", err) continue } case <-c.quit: return } } } func unflushedChunks(descs []chunkDesc) []chunkDesc { filtered := make([]chunkDesc, 0, len(descs)) for _, d := range descs { if d.flushed.IsZero() { filtered = append(filtered, d) } } return filtered }