loki/pkg/chunkenc/unordered.go

package chunkenc

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"time"

	"github.com/Workiva/go-datastructures/rangetree"
	"github.com/cespare/xxhash/v2"
	"github.com/pkg/errors"
	"github.com/prometheus/prometheus/model/labels"

	"github.com/grafana/loki/pkg/iter"
	"github.com/grafana/loki/pkg/logproto"
	"github.com/grafana/loki/pkg/logql/log"
	"github.com/grafana/loki/pkg/logqlmodel/stats"
)

var noopStreamPipeline = log.NewNoopPipeline().ForStream(labels.Labels{})

type HeadBlock interface {
	IsEmpty() bool
	CheckpointTo(w io.Writer) error
	CheckpointBytes(b []byte) ([]byte, error)
	CheckpointSize() int
	LoadBytes(b []byte) error
	Serialise(pool WriterPool) ([]byte, error)
	Reset()
	Bounds() (mint, maxt int64)
	Entries() int
	UncompressedSize() int
	Convert(HeadBlockFmt) (HeadBlock, error)
	Append(int64, string) error
	Iterator(
		ctx context.Context,
		direction logproto.Direction,
		mint,
		maxt int64,
		pipeline log.StreamPipeline,
	) iter.EntryIterator
	SampleIterator(
		ctx context.Context,
		mint,
		maxt int64,
		extractor log.StreamSampleExtractor,
	) iter.SampleIterator
	Format() HeadBlockFmt
}

type unorderedHeadBlock struct {
	// Opted for range tree over skiplist for space reduction.
	// Inserts: O(log(n))
	// Scans: O(k+log(n)) where k=num_scanned_entries & n=total_entries
	rt rangetree.RangeTree

	lines      int   // number of entries
	size       int   // size of uncompressed bytes.
	mint, maxt int64 // lower and upper bounds
}

func newUnorderedHeadBlock() *unorderedHeadBlock {
	return &unorderedHeadBlock{
		rt: rangetree.New(1),
	}
}

func (hb *unorderedHeadBlock) Format() HeadBlockFmt { return UnorderedHeadBlockFmt }

func (hb *unorderedHeadBlock) IsEmpty() bool {
	return hb.size == 0
}

func (hb *unorderedHeadBlock) Bounds() (int64, int64) {
	return hb.mint, hb.maxt
}

func (hb *unorderedHeadBlock) Entries() int {
	return hb.lines
}

func (hb *unorderedHeadBlock) UncompressedSize() int {
	return hb.size
}

func (hb *unorderedHeadBlock) Reset() {
	x := newUnorderedHeadBlock()
	*hb = *x
}

// nsEntries is a collection of entries belonging to the same nanosecond.
type nsEntries struct {
	ts      int64
	entries []string
}

func (e *nsEntries) ValueAtDimension(_ uint64) int64 {
	return e.ts
}

func (hb *unorderedHeadBlock) Append(ts int64, line string) error {
	// This is an allocation hack. The rangetree lib does not
	// support the ability to pass a "mutate" function during an insert
	// and instead will displace any existing entry at the specified timestamp.
	// Since Loki supports multiple lines per timestamp,
	// we insert an entry without any log lines,
	// which is ordered by timestamp alone.
	// Then, we detect if we've displaced any existing entries, and
	// append the new one to the existing, preallocated slice.
	// If not, we create a slice with one entry.
	e := &nsEntries{
		ts: ts,
	}
	displaced := hb.rt.Add(e)
	if displaced[0] != nil {
		e.entries = append(displaced[0].(*nsEntries).entries, line)
	} else {
		e.entries = []string{line}
	}

	// Update hb metadata
	if hb.size == 0 || hb.mint > ts {
		hb.mint = ts
	}
	if hb.maxt < ts {
		hb.maxt = ts
	}

	hb.size += len(line)
	hb.lines++

	return nil
}

// interval implements rangetree.Interval.
type interval struct {
	mint, maxt int64
}

func (i interval) LowAtDimension(_ uint64) int64 { return i.mint }

// The rangetree library treats the upper bound as inclusive, but we want
// exclusivity, i.e. [from, through) in nanoseconds.
func (i interval) HighAtDimension(_ uint64) int64 { return i.maxt - 1 }
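
// For example, a query for the half-open range [5, 10) is handed to the
// rangetree as the inclusive range [5, 9] at this dimension.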

// forEntries is a helper for the base logic shared across {Entry,Sample}Iterator.
func (hb *unorderedHeadBlock) forEntries(
	ctx context.Context,
	direction logproto.Direction,
	mint,
	maxt int64,
	entryFn func(int64, string) error, // returning an error exits early
) (err error) {
	if hb.IsEmpty() || (maxt < hb.mint || hb.maxt < mint) {
		return
	}

	entries := hb.rt.Query(interval{
		mint: mint,
		maxt: maxt,
	})

	chunkStats := stats.FromContext(ctx)
	process := func(es *nsEntries) {
		chunkStats.AddHeadChunkLines(int64(len(es.entries)))

		// preserve write ordering of entries with the same ts
		var i int
		if direction == logproto.BACKWARD {
			i = len(es.entries) - 1
		}
		next := func() {
			if direction == logproto.FORWARD {
				i++
			} else {
				i--
			}
		}

		for ; i < len(es.entries) && i >= 0; next() {
			line := es.entries[i]
			chunkStats.AddHeadChunkBytes(int64(len(line)))
			if err = entryFn(es.ts, line); err != nil {
				// stop on the first error to honor the early-exit contract above
				return
			}
		}
	}

	if direction == logproto.FORWARD {
		for _, e := range entries {
			process(e.(*nsEntries))
			if err != nil {
				return err
			}
		}
	} else {
		for i := len(entries) - 1; i >= 0; i-- {
			process(entries[i].(*nsEntries))
			if err != nil {
				return err
			}
		}
	}
	return nil
}

func (hb *unorderedHeadBlock) Iterator(
	ctx context.Context,
	direction logproto.Direction,
	mint,
	maxt int64,
	pipeline log.StreamPipeline,
) iter.EntryIterator {
	// We copy the entries every time because b.entries could change completely.
	// The alternative would be to allocate a new b.entries every time we cut a
	// block, but the tradeoff is that queries against near-realtime data are far
	// more frequent than block cuts.
	streams := map[uint64]*logproto.Stream{}
	_ = hb.forEntries(
		ctx,
		direction,
		mint,
		maxt,
		func(ts int64, line string) error {
			newLine, parsedLbs, ok := pipeline.ProcessString(line)
			if !ok {
				return nil
			}
			var stream *logproto.Stream
			lhash := parsedLbs.Hash()
			if stream, ok = streams[lhash]; !ok {
				stream = &logproto.Stream{
					Labels: parsedLbs.String(),
				}
				streams[lhash] = stream
			}
			stream.Entries = append(stream.Entries, logproto.Entry{
				Timestamp: time.Unix(0, ts),
				Line:      newLine,
			})
			return nil
		},
	)

	if len(streams) == 0 {
		return iter.NoopIterator
	}
	streamsResult := make([]logproto.Stream, 0, len(streams))
	for _, stream := range streams {
		streamsResult = append(streamsResult, *stream)
	}
	return iter.NewStreamsIterator(ctx, streamsResult, direction)
}

// nolint:unused
func (hb *unorderedHeadBlock) SampleIterator(
	ctx context.Context,
	mint,
	maxt int64,
	extractor log.StreamSampleExtractor,
) iter.SampleIterator {
	series := map[uint64]*logproto.Series{}
	_ = hb.forEntries(
		ctx,
		logproto.FORWARD,
		mint,
		maxt,
		func(ts int64, line string) error {
			value, parsedLabels, ok := extractor.ProcessString(line)
			if !ok {
				return nil
			}
			var found bool
			var s *logproto.Series
			lhash := parsedLabels.Hash()
			if s, found = series[lhash]; !found {
				s = &logproto.Series{
					Labels:  parsedLabels.String(),
					Samples: SamplesPool.Get(hb.lines).([]logproto.Sample)[:0],
				}
				series[lhash] = s
			}
			h := xxhash.Sum64(unsafeGetBytes(line))
			s.Samples = append(s.Samples, logproto.Sample{
				Timestamp: ts,
				Value:     value,
				Hash:      h,
			})
			return nil
		},
	)

	if len(series) == 0 {
		return iter.NoopIterator
	}
	seriesRes := make([]logproto.Series, 0, len(series))
	for _, s := range series {
		seriesRes = append(seriesRes, *s)
	}
	return iter.SampleIteratorWithClose(iter.NewMultiSeriesIterator(ctx, seriesRes), func() error {
		for _, s := range series {
			SamplesPool.Put(s.Samples)
		}
		return nil
	})
}

// nolint:unused
// Serialise is used in creating an ordered, compressed block from an unorderedHeadBlock.
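// Each entry is written in timestamp order as (varint timestamp, uvarint line
// length, raw line bytes), and the whole buffer is then compressed.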
func (hb *unorderedHeadBlock) Serialise(pool WriterPool) ([]byte, error) {
	inBuf := serializeBytesBufferPool.Get().(*bytes.Buffer)
	defer func() {
		inBuf.Reset()
		serializeBytesBufferPool.Put(inBuf)
	}()
	outBuf := &bytes.Buffer{}

	encBuf := make([]byte, binary.MaxVarintLen64)
	compressedWriter := pool.GetWriter(outBuf)
	defer pool.PutWriter(compressedWriter)

	_ = hb.forEntries(
		context.Background(),
		logproto.FORWARD,
		0,
		math.MaxInt64,
		func(ts int64, line string) error {
			n := binary.PutVarint(encBuf, ts)
			inBuf.Write(encBuf[:n])

			n = binary.PutUvarint(encBuf, uint64(len(line)))
			inBuf.Write(encBuf[:n])

			inBuf.WriteString(line)
			return nil
		},
	)

	if _, err := compressedWriter.Write(inBuf.Bytes()); err != nil {
		return nil, errors.Wrap(err, "appending entry")
	}
	if err := compressedWriter.Close(); err != nil {
		return nil, errors.Wrap(err, "flushing pending compress buffer")
	}

	return outBuf.Bytes(), nil
}

func (hb *unorderedHeadBlock) Convert(version HeadBlockFmt) (HeadBlock, error) {
	if version > OrderedHeadBlockFmt {
		return hb, nil
	}
	out := version.NewBlock()

	err := hb.forEntries(
		context.Background(),
		logproto.FORWARD,
		0,
		math.MaxInt64,
		func(ts int64, line string) error {
			return out.Append(ts, line)
		},
	)

	return out, err
}

// CheckpointSize returns the estimated size of the headblock checkpoint.
func (hb *unorderedHeadBlock) CheckpointSize() int {
	size := 1                                                          // version
	size += binary.MaxVarintLen32 * 2                                  // total entries + total size
	size += binary.MaxVarintLen64 * 2                                  // mint, maxt
	size += (binary.MaxVarintLen64 + binary.MaxVarintLen32) * hb.lines // ts + len of log line.
	size += hb.size                                                    // uncompressed bytes of lines
	return size
}

// CheckpointBytes serializes a headblock to []byte. This is used by the WAL checkpointing,
// which does not want to mutate a chunk by cutting it (otherwise risking content address changes), but
// needs to serialize/deserialize the data to disk to ensure data durability.
func (hb *unorderedHeadBlock) CheckpointBytes(b []byte) ([]byte, error) {
	buf := bytes.NewBuffer(b[:0])
	err := hb.CheckpointTo(buf)
	return buf.Bytes(), err
}

// CheckpointTo serializes a headblock to an `io.Writer`. See `CheckpointBytes`.
func (hb *unorderedHeadBlock) CheckpointTo(w io.Writer) error {
	eb := EncodeBufferPool.Get().(*encbuf)
	defer EncodeBufferPool.Put(eb)

	eb.reset()

	eb.putByte(byte(hb.Format()))
	_, err := w.Write(eb.get())
	if err != nil {
		return errors.Wrap(err, "write headBlock version")
	}
	eb.reset()

	eb.putUvarint(hb.lines)
	_, err = w.Write(eb.get())
	if err != nil {
		return errors.Wrap(err, "write headBlock metas")
	}
	eb.reset()

	err = hb.forEntries(
		context.Background(),
		logproto.FORWARD,
		0,
		math.MaxInt64,
		func(ts int64, line string) error {
			eb.putVarint64(ts)
			eb.putUvarint(len(line))
			if _, err := w.Write(eb.get()); err != nil {
				return errors.Wrap(err, "write headBlock entry ts")
			}
			eb.reset()

			if _, err := io.WriteString(w, line); err != nil {
				return errors.Wrap(err, "write headblock entry line")
			}
			return nil
		},
	)
	// propagate any error from writing the entries rather than dropping it
	return err
}

func (hb *unorderedHeadBlock) LoadBytes(b []byte) error {
	// ensure it's empty
	*hb = *newUnorderedHeadBlock()

	if len(b) < 1 {
		return nil
	}

	db := decbuf{b: b}

	version := db.byte()
	if db.err() != nil {
		return errors.Wrap(db.err(), "verifying headblock header")
	}
	if version != UnorderedHeadBlockFmt.Byte() {
		return errors.Errorf("incompatible headBlock version (%v), only V4 is currently supported", version)
	}

	n := db.uvarint()
	if err := db.err(); err != nil {
		return errors.Wrap(err, "verifying headblock metadata")
	}

	for i := 0; i < n && db.err() == nil; i++ {
		ts := db.varint64()
		lineLn := db.uvarint()
		line := string(db.bytes(lineLn))
		if err := hb.Append(ts, line); err != nil {
			return err
		}
	}

	if err := db.err(); err != nil {
		return errors.Wrap(err, "decoding entries")
	}

	return nil
}

// HeadFromCheckpoint handles reading any head block format and returning the desired form.
// This is particularly helpful when replaying WALs from different configurations,
// such as after enabling unordered writes.
func HeadFromCheckpoint(b []byte, desired HeadBlockFmt) (HeadBlock, error) {
	if len(b) == 0 {
		return desired.NewBlock(), nil
	}

	db := decbuf{b: b}

	version := db.byte()
	if db.err() != nil {
		return nil, errors.Wrap(db.err(), "verifying headblock header")
	}
	format := HeadBlockFmt(version)
	if format > UnorderedHeadBlockFmt {
		return nil, fmt.Errorf("unexpected head block version: %v", format)
	}

	decodedBlock := format.NewBlock()
	if err := decodedBlock.LoadBytes(b); err != nil {
		return nil, err
	}

	if decodedBlock.Format() != desired {
		return decodedBlock.Convert(desired)
	}
	return decodedBlock, nil
}
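
// nolint:unused
// exampleCheckpointRoundTrip is a hypothetical sketch (not part of the
// upstream file) of the WAL checkpoint round trip: serialize a head block
// with CheckpointBytes, then rebuild it via HeadFromCheckpoint, which
// converts to the desired format if the checkpoint was written in another one.
func exampleCheckpointRoundTrip(hb *unorderedHeadBlock) (HeadBlock, error) {
	b, err := hb.CheckpointBytes(nil)
	if err != nil {
		return nil, err
	}
	return HeadFromCheckpoint(b, UnorderedHeadBlockFmt)
}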