feat: Do not add empty blooms to offsets (#14577)

Salva Corts committed 51c42e8645 (parent 5824e3d35c)
13 changed files (lines changed in parentheses):
  1. integration/bloom_building_test.go (47)
  2. pkg/bloombuild/builder/spec.go (7)
  3. pkg/storage/bloom/v1/bloom_builder.go (12)
  4. pkg/storage/bloom/v1/builder.go (15)
  5. pkg/storage/bloom/v1/builder_test.go (7)
  6. pkg/storage/bloom/v1/fuse.go (53)
  7. pkg/storage/bloom/v1/index.go (3)
  8. pkg/storage/bloom/v1/index_builder.go (4)
  9. pkg/storage/bloom/v1/metrics.go (1)
  10. pkg/storage/bloom/v1/test_util.go (4)
  11. pkg/storage/bloom/v1/versioned_builder.go (27)
  12. pkg/storage/bloom/v1/versioned_builder_test.go (103)
  13. pkg/storage/stores/shipper/bloomshipper/client.go (9)

@@ -61,15 +61,7 @@ func TestBloomBuilding(t *testing.T) {
cliIngester.Now = now
// We now ingest some logs across many series.
series := make([]labels.Labels, 0, nSeries)
for i := 0; i < nSeries; i++ {
lbs := labels.FromStrings("job", fmt.Sprintf("job-%d", i))
series = append(series, lbs)
for j := 0; j < nLogsPerSeries; j++ {
require.NoError(t, cliDistributor.PushLogLine(fmt.Sprintf("log line %d", j), now, nil, lbs.Map()))
}
}
series := writeSeries(t, nSeries, nLogsPerSeries, cliDistributor, now, "job")
// restart ingester which should flush the chunks and index
require.NoError(t, tIngester.Restart())
@@ -124,14 +116,8 @@ func TestBloomBuilding(t *testing.T) {
checkSeriesInBlooms(t, now, tenantID, bloomStore, series)
// Push some more logs so TSDBs need to be updated.
for i := 0; i < nSeries; i++ {
lbs := labels.FromStrings("job", fmt.Sprintf("job-new-%d", i))
series = append(series, lbs)
for j := 0; j < nLogsPerSeries; j++ {
require.NoError(t, cliDistributor.PushLogLine(fmt.Sprintf("log line %d", j), now, nil, lbs.Map()))
}
}
newSeries := writeSeries(t, nSeries, nLogsPerSeries, cliDistributor, now, "job-new")
series = append(series, newSeries...)
// restart ingester which should flush the chunks and index
require.NoError(t, tIngester.Restart())
@@ -147,6 +133,33 @@ func TestBloomBuilding(t *testing.T) {
checkSeriesInBlooms(t, now, tenantID, bloomStore, series)
}
func writeSeries(t *testing.T, nSeries int, nLogsPerSeries int, cliDistributor *client.Client, now time.Time, seriesPrefix string) []labels.Labels {
series := make([]labels.Labels, 0, nSeries)
for i := 0; i < nSeries; i++ {
lbs := labels.FromStrings("job", fmt.Sprintf("%s-%d", seriesPrefix, i))
series = append(series, lbs)
for j := 0; j < nLogsPerSeries; j++ {
// Only write structured metadata for half of the series
var metadata map[string]string
if i%2 == 0 {
metadata = map[string]string{
"traceID": fmt.Sprintf("%d%d", i, j),
"user": fmt.Sprintf("%d%d", i, j%10),
}
}
require.NoError(t, cliDistributor.PushLogLine(
fmt.Sprintf("log line %d", j),
now,
metadata,
lbs.Map(),
))
}
}
return series
}
func checkCompactionFinished(t *testing.T, cliCompactor *client.Client) {
checkForTimestampMetric(t, cliCompactor, "loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds")
}

@@ -137,7 +137,7 @@ func (s *SimpleBloomGenerator) Generate(ctx context.Context) *LazyBlockBuilderIt
)
}
return NewLazyBlockBuilderIterator(ctx, s.opts, s.metrics, s.populator(ctx), s.writerReaderFunc, series, s.blocksIter)
return NewLazyBlockBuilderIterator(ctx, s.opts, s.metrics, s.logger, s.populator(ctx), s.writerReaderFunc, series, s.blocksIter)
}
// LazyBlockBuilderIterator is a lazy iterator over blocks that builds
@@ -146,6 +146,7 @@ type LazyBlockBuilderIterator struct {
ctx context.Context
opts v1.BlockOptions
metrics *v1.Metrics
logger log.Logger
populate v1.BloomPopulatorFunc
writerReaderFunc func() (v1.BlockWriter, v1.BlockReader)
series iter.PeekIterator[*v1.Series]
@@ -160,6 +161,7 @@ func NewLazyBlockBuilderIterator(
ctx context.Context,
opts v1.BlockOptions,
metrics *v1.Metrics,
logger log.Logger,
populate v1.BloomPopulatorFunc,
writerReaderFunc func() (v1.BlockWriter, v1.BlockReader),
series iter.PeekIterator[*v1.Series],
@@ -169,6 +171,7 @@ func NewLazyBlockBuilderIterator(
ctx: ctx,
opts: opts,
metrics: metrics,
logger: logger,
populate: populate,
writerReaderFunc: writerReaderFunc,
series: series,
@@ -196,7 +199,7 @@ func (b *LazyBlockBuilderIterator) Next() bool {
return false
}
mergeBuilder := v1.NewMergeBuilder(b.blocks, b.series, b.populate, b.metrics)
mergeBuilder := v1.NewMergeBuilder(b.blocks, b.series, b.populate, b.metrics, b.logger)
writer, reader := b.writerReaderFunc()
blockBuilder, err := v1.NewBlockBuilder(b.opts, writer)
if err != nil {

@@ -28,6 +28,10 @@ func NewBloomBlockBuilder(opts BlockOptions, writer io.WriteCloser) *BloomBlockB
}
}
func (b *BloomBlockBuilder) UnflushedSize() int {
return b.scratch.Len() + b.page.UnflushedSize()
}
func (b *BloomBlockBuilder) Append(bloom *Bloom) (BloomOffset, error) {
if !b.writtenSchema {
if err := b.writeSchema(); err != nil {
@@ -68,6 +72,14 @@ func (b *BloomBlockBuilder) writeSchema() error {
}
func (b *BloomBlockBuilder) Close() (uint32, error) {
if !b.writtenSchema {
// We will get here only if we haven't appended any bloom filters to the block
// This would happen only if all series yielded empty blooms
if err := b.writeSchema(); err != nil {
return 0, errors.Wrap(err, "writing schema")
}
}
if b.page.Count() > 0 {
if err := b.flushPage(); err != nil {
return 0, errors.Wrap(err, "flushing final bloom page")
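
The new branch in Close() covers blocks where every series yielded an empty bloom: Append was never called, so the schema was never lazily written, and without the guard the bloom file would end up without a decodable header. A toy sketch of that path (the builder type here is a simplified stand-in, not the real BloomBlockBuilder):

package main

import (
	"bytes"
	"fmt"
)

// builder is a toy stand-in for BloomBlockBuilder: the schema header is
// normally written lazily by the first Append.
type builder struct {
	buf           bytes.Buffer
	writtenSchema bool
}

func (b *builder) writeSchema() {
	b.buf.WriteString("schema-v3\n")
	b.writtenSchema = true
}

func (b *builder) Append(bloomBytes []byte) {
	if !b.writtenSchema {
		b.writeSchema()
	}
	b.buf.Write(bloomBytes)
}

// Close mirrors the new guard: if Append never ran (every bloom was empty),
// the schema must still be written so the block stays decodable.
func (b *builder) Close() []byte {
	if !b.writtenSchema {
		b.writeSchema()
	}
	return b.buf.Bytes()
}

func main() {
	var b builder
	out := b.Close()        // no Append ever happened
	fmt.Printf("%q\n", out) // "schema-v3\n": the header is present even with zero blooms
}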

@@ -5,6 +5,8 @@ import (
"hash"
"io"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/pkg/errors"
"github.com/grafana/loki/v3/pkg/compression"
@@ -112,6 +114,10 @@ func (w *PageWriter) Reset() {
w.n = 0
}
func (w *PageWriter) UnflushedSize() int {
return w.enc.Len()
}
func (w *PageWriter) SpaceFor(numBytes int) bool {
// if a single bloom exceeds the target size, still accept it
// otherwise only accept it if adding it would not exceed the target size
@@ -189,6 +195,7 @@ type MergeBuilder struct {
// Add chunks of a single series to a bloom
populate BloomPopulatorFunc
metrics *Metrics
logger log.Logger
}
type BloomPopulatorFunc func(series *Series, preExistingBlooms iter.SizedIterator[*Bloom], chunksToAdd ChunkRefs, ch chan *BloomCreation)
@@ -202,6 +209,7 @@ func NewMergeBuilder(
store iter.Iterator[*Series],
populate BloomPopulatorFunc,
metrics *Metrics,
logger log.Logger,
) *MergeBuilder {
// combinedSeriesIter handles series with fingerprint collisions:
// because blooms don't contain the label-set (only the fingerprint),
@@ -229,6 +237,7 @@ func NewMergeBuilder(
store: combinedSeriesIter,
populate: populate,
metrics: metrics,
logger: logger,
}
}
@@ -306,6 +315,12 @@ func (mb *MergeBuilder) processNextSeries(
if creation.Err != nil {
return nil, info.sourceBytes, 0, false, false, errors.Wrap(creation.Err, "populating bloom")
}
if creation.Bloom.IsEmpty() {
level.Debug(mb.logger).Log("msg", "received empty bloom. Adding to index but skipping offsets", "fingerprint", nextInStore.Fingerprint)
continue
}
offset, err := builder.AddBloom(creation.Bloom)
if err != nil {
return nil, info.sourceBytes, 0, false, false, errors.Wrapf(
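
This IsEmpty() guard is the heart of the change: an empty bloom is never passed to AddBloom, so the series still lands in the block index but with no bloom offsets. A standalone sketch of the pattern, using simplified stand-in types (bloom, indexEntry, and buildIndex are illustrative, not Loki's actual API):

package main

import "fmt"

// bloom is a stand-in for v1.Bloom; IsEmpty mirrors the check used above.
type bloom struct{ tokens map[string]struct{} }

func (b *bloom) IsEmpty() bool { return len(b.tokens) == 0 }

// indexEntry stands in for a series index entry: the series is always indexed,
// but its offsets stay empty when the bloom carried no tokens.
type indexEntry struct {
	fingerprint uint64
	offsets     []int
}

func buildIndex(series []uint64, blooms map[uint64]*bloom) ([]indexEntry, []*bloom) {
	var entries []indexEntry
	var stored []*bloom
	for _, fp := range series {
		entry := indexEntry{fingerprint: fp}
		if b := blooms[fp]; b != nil && !b.IsEmpty() {
			// Non-empty bloom: persist it and record its offset.
			stored = append(stored, b)
			entry.offsets = append(entry.offsets, len(stored)-1)
		}
		// Empty bloom: the series is still added to the index (so queriers
		// don't report it as missing) but gets no offsets.
		entries = append(entries, entry)
	}
	return entries, stored
}

func main() {
	blooms := map[uint64]*bloom{
		1: {tokens: map[string]struct{}{"traceID=abc": {}}},
		2: {tokens: map[string]struct{}{}}, // no structured metadata -> empty bloom
	}
	entries, stored := buildIndex([]uint64{1, 2}, blooms)
	fmt.Println(len(entries), len(stored)) // 2 1: both series indexed, one bloom stored
}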

@@ -6,6 +6,7 @@ import (
"sort"
"testing"
"github.com/go-kit/log"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
@@ -263,7 +264,7 @@ func TestMergeBuilder(t *testing.T) {
)
// Ensure that the merge builder combines all the blocks correctly
mergeBuilder := NewMergeBuilder(dedupedBlocks(blocks), storeItr, populate, NewMetrics(nil))
mergeBuilder := NewMergeBuilder(dedupedBlocks(blocks), storeItr, populate, NewMetrics(nil), log.NewNopLogger())
indexBuf := bytes.NewBuffer(nil)
bloomsBuf := bytes.NewBuffer(nil)
writer := NewMemoryBlockWriter(indexBuf, bloomsBuf)
@@ -350,6 +351,8 @@ func TestMergeBuilderFingerprintCollision(t *testing.T) {
// We're not testing the ability to extend a bloom in this test
pop := func(_ *Series, _ iter.SizedIterator[*Bloom], _ ChunkRefs, ch chan *BloomCreation) {
bloom := NewBloom()
// Add something to the bloom so it's not empty
bloom.Add([]byte("hello"))
stats := indexingInfo{
sourceBytes: int(bloom.Capacity()) / 8,
indexedFields: NewSetFromLiteral[Field]("__all__"),
@@ -367,6 +370,7 @@ func TestMergeBuilderFingerprintCollision(t *testing.T) {
iter.NewSliceIter(data),
pop,
NewMetrics(nil),
log.NewNopLogger(),
)
_, _, err = mergeBuilder.Build(builder)
@@ -539,6 +543,7 @@ func TestMergeBuilder_Roundtrip(t *testing.T) {
dedupedStore,
pop,
NewMetrics(nil),
log.NewNopLogger(),
)
builder, err := NewBlockBuilder(blockOpts, writer)
require.Nil(t, err)

@@ -32,6 +32,8 @@ func NewBloomRecorder(ctx context.Context, id string) *BloomRecorder {
chunksSkipped: atomic.NewInt64(0),
seriesMissed: atomic.NewInt64(0),
chunksMissed: atomic.NewInt64(0),
seriesEmpty: atomic.NewInt64(0),
chunksEmpty: atomic.NewInt64(0),
chunksFiltered: atomic.NewInt64(0),
}
}
@@ -45,6 +47,8 @@ type BloomRecorder struct {
seriesSkipped, chunksSkipped *atomic.Int64
// not found in bloom
seriesMissed, chunksMissed *atomic.Int64
// exists in block index but empty offsets
seriesEmpty, chunksEmpty *atomic.Int64
// filtered out
chunksFiltered *atomic.Int64
}
@@ -56,6 +60,8 @@ func (r *BloomRecorder) Merge(other *BloomRecorder) {
r.chunksSkipped.Add(other.chunksSkipped.Load())
r.seriesMissed.Add(other.seriesMissed.Load())
r.chunksMissed.Add(other.chunksMissed.Load())
r.seriesEmpty.Add(other.seriesEmpty.Load())
r.chunksEmpty.Add(other.chunksEmpty.Load())
r.chunksFiltered.Add(other.chunksFiltered.Load())
}
@@ -66,13 +72,15 @@ func (r *BloomRecorder) Report(logger log.Logger, metrics *Metrics) {
seriesFound = r.seriesFound.Load()
seriesSkipped = r.seriesSkipped.Load()
seriesMissed = r.seriesMissed.Load()
seriesRequested = seriesFound + seriesSkipped + seriesMissed
seriesEmpty = r.seriesEmpty.Load()
seriesRequested = seriesFound + seriesSkipped + seriesMissed + seriesEmpty
chunksFound = r.chunksFound.Load()
chunksSkipped = r.chunksSkipped.Load()
chunksMissed = r.chunksMissed.Load()
chunksFiltered = r.chunksFiltered.Load()
chunksRequested = chunksFound + chunksSkipped + chunksMissed
chunksEmpty = r.chunksEmpty.Load()
chunksRequested = chunksFound + chunksSkipped + chunksMissed + chunksEmpty
)
level.Debug(logger).Log(
"recorder_msg", "bloom search results",
@@ -82,11 +90,13 @@ func (r *BloomRecorder) Report(logger log.Logger, metrics *Metrics) {
"recorder_series_found", seriesFound,
"recorder_series_skipped", seriesSkipped,
"recorder_series_missed", seriesMissed,
"recorder_series_empty", seriesEmpty,
"recorder_chunks_requested", chunksRequested,
"recorder_chunks_found", chunksFound,
"recorder_chunks_skipped", chunksSkipped,
"recorder_chunks_missed", chunksMissed,
"recorder_chunks_empty", chunksEmpty,
"recorder_chunks_filtered", chunksFiltered,
)
@@ -94,25 +104,27 @@ func (r *BloomRecorder) Report(logger log.Logger, metrics *Metrics) {
metrics.recorderSeries.WithLabelValues(recorderRequested).Add(float64(seriesRequested))
metrics.recorderSeries.WithLabelValues(recorderFound).Add(float64(seriesFound))
metrics.recorderSeries.WithLabelValues(recorderSkipped).Add(float64(seriesSkipped))
metrics.recorderSeries.WithLabelValues(recorderEmpty).Add(float64(seriesEmpty))
metrics.recorderSeries.WithLabelValues(recorderMissed).Add(float64(seriesMissed))
metrics.recorderChunks.WithLabelValues(recorderRequested).Add(float64(chunksRequested))
metrics.recorderChunks.WithLabelValues(recorderFound).Add(float64(chunksFound))
metrics.recorderChunks.WithLabelValues(recorderSkipped).Add(float64(chunksSkipped))
metrics.recorderChunks.WithLabelValues(recorderMissed).Add(float64(chunksMissed))
metrics.recorderChunks.WithLabelValues(recorderEmpty).Add(float64(chunksEmpty))
metrics.recorderChunks.WithLabelValues(recorderFiltered).Add(float64(chunksFiltered))
}
}
func (r *BloomRecorder) record(
seriesFound, chunksFound, seriesSkipped, chunksSkipped, seriesMissed, chunksMissed, chunksFiltered int,
) {
func (r *BloomRecorder) record(seriesFound, chunksFound, seriesSkipped, chunksSkipped, seriesMissed, chunksMissed, seriesEmpty, chunksEmpty, chunksFiltered int) {
r.seriesFound.Add(int64(seriesFound))
r.chunksFound.Add(int64(chunksFound))
r.seriesSkipped.Add(int64(seriesSkipped))
r.chunksSkipped.Add(int64(chunksSkipped))
r.seriesMissed.Add(int64(seriesMissed))
r.chunksMissed.Add(int64(chunksMissed))
r.seriesEmpty.Add(int64(seriesEmpty))
r.chunksEmpty.Add(int64(chunksEmpty))
r.chunksFiltered.Add(int64(chunksFiltered))
}
@@ -170,6 +182,7 @@ func (fq *FusedQuerier) recordMissingFp(
0, 0, // found
0, 0, // skipped
1, len(input.Chks), // missed
0, 0, // empty
0, // chunks filtered
)
})
@@ -184,6 +197,22 @@ func (fq *FusedQuerier) recordSkippedFp(
0, 0, // found
1, len(input.Chks), // skipped
0, 0, // missed
0, 0, // empty
0, // chunks filtered
)
})
}
func (fq *FusedQuerier) recordEmptyFp(
batch []Request,
fp model.Fingerprint,
) {
fq.noRemovals(batch, fp, func(input Request) {
input.Recorder.record(
0, 0, // found
0, 0, // skipped
0, 0, // missed
1, len(input.Chks), // empty
0, // chunks filtered
)
})
@@ -280,6 +309,19 @@ func (fq *FusedQuerier) runSeries(_ Schema, series *SeriesWithMeta, reqs []Reque
})
}
if len(series.Offsets) == 0 {
// We end up here for series with no structured metadata fields.
// While building blooms, these series would yield empty blooms.
// We add these series to the index of the block so we don't report them as missing,
// but we don't filter any chunks for them.
level.Debug(fq.logger).Log(
"msg", "series with empty offsets",
"fp", series.Fingerprint,
)
fq.recordEmptyFp(reqs, series.Fingerprint)
return
}
for i, offset := range series.Offsets {
skip := fq.bq.blooms.LoadOffset(offset)
if skip {
@@ -361,6 +403,7 @@ func (fq *FusedQuerier) runSeries(_ Schema, series *SeriesWithMeta, reqs []Reque
1, len(inputs[i].InBlooms), // found
0, 0, // skipped
0, len(inputs[i].Missing), // missed
0, 0, // empty
len(removals), // filtered
)
req.Response <- Output{
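
On the read path, runSeries now treats a series whose index entry has zero offsets as "indexed but empty": it is counted under the new empty outcome and none of its chunks are filtered out. A hedged standalone sketch of that early return (the request and outcome types are simplified, not the actual fuse.go structures):

package main

import "fmt"

// seriesWithOffsets stands in for v1.SeriesWithMeta: a series from the block
// index together with the offsets of its blooms and the chunks under test.
type seriesWithOffsets struct {
	offsets []int
	chunks  []string
}

// runSeries mirrors the early return above: with no offsets there is nothing
// to test against, so the series is recorded as "empty" and every chunk survives.
func runSeries(s seriesWithOffsets) (outcome string, kept []string) {
	if len(s.offsets) == 0 {
		return "empty", s.chunks
	}
	// ... otherwise load the blooms at s.offsets and drop chunks that cannot match ...
	return "tested", s.chunks
}

func main() {
	o, kept := runSeries(seriesWithOffsets{chunks: []string{"chunk-1", "chunk-2"}})
	fmt.Println(o, kept) // empty [chunk-1 chunk-2]
}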

@@ -153,7 +153,8 @@ func aggregateHeaders(xs []SeriesHeader) SeriesHeader {
fromFp, _ := xs[0].Bounds.Bounds()
_, throughFP := xs[len(xs)-1].Bounds.Bounds()
res := SeriesHeader{
Bounds: NewBounds(fromFp, throughFP),
NumSeries: len(xs),
Bounds: NewBounds(fromFp, throughFP),
}
for i, x := range xs {

@@ -35,6 +35,10 @@ func NewIndexBuilder(opts BlockOptions, writer io.WriteCloser) *IndexBuilder {
}
}
func (b *IndexBuilder) UnflushedSize() int {
return b.scratch.Len() + b.page.UnflushedSize()
}
func (b *IndexBuilder) WriteOpts() error {
b.scratch.Reset()
b.opts.Encode(b.scratch)

@@ -56,6 +56,7 @@ const (
recorderFound = "found"
recorderSkipped = "skipped"
recorderMissed = "missed"
recorderEmpty = "empty"
recorderFiltered = "filtered"
)

@@ -132,9 +132,11 @@ func CompareIterators[A, B any](
a iter.Iterator[A],
b iter.Iterator[B],
) {
var i int
for a.Next() {
require.True(t, b.Next())
require.Truef(t, b.Next(), "'a' has %dth element but 'b' does not", i)
f(t, a.At(), b.At())
i++
}
require.False(t, b.Next())
require.NoError(t, a.Err())

@@ -125,10 +125,35 @@ func (b *V3Builder) AddSeries(series Series, offsets []BloomOffset, fields Set[F
return false, errors.Wrapf(err, "writing index for series %v", series.Fingerprint)
}
full, _, err := b.writer.Full(b.opts.BlockSize)
full, err := b.full()
if err != nil {
return false, errors.Wrap(err, "checking if block is full")
}
return full, nil
}
func (b *V3Builder) full() (bool, error) {
if b.opts.BlockSize == 0 {
// Unlimited block size
return false, nil
}
full, writtenSize, err := b.writer.Full(b.opts.BlockSize)
if err != nil {
return false, errors.Wrap(err, "checking if block writer is full")
}
if full {
return true, nil
}
// Even if the block writer is not full, we may have unflushed data in the bloom builders.
// Check if by flushing these, we would exceed the block size.
unflushedIndexSize := b.index.UnflushedSize()
unflushedBloomSize := b.blooms.UnflushedSize()
if uint64(writtenSize+unflushedIndexSize+unflushedBloomSize) > b.opts.BlockSize {
return true, nil
}
return false, nil
}
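
The rewritten check treats the block as full once flushed bytes plus still-buffered index and bloom bytes would exceed the target, rather than waiting for the writer alone to fill up. A minimal sketch of the same arithmetic (simplified signature; the real method reads these sizes from the writer and the builders):

package main

import "fmt"

// full reports whether flushing the buffered index and bloom pages would push
// the block past blockSize. blockSize == 0 means unlimited, as in the code above.
func full(blockSize uint64, writtenSize, unflushedIndex, unflushedBlooms int) bool {
	if blockSize == 0 {
		return false
	}
	return uint64(writtenSize+unflushedIndex+unflushedBlooms) > blockSize
}

func main() {
	// 4 KiB block, 3 KiB already flushed, 2 KiB still buffered: checking only
	// flushed bytes would say "not full"; counting unflushed data says "full".
	fmt.Println(full(4<<10, 3<<10, 1<<10, 1<<10)) // true
}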

@@ -4,6 +4,7 @@ import (
"bytes"
"testing"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"github.com/grafana/loki/v3/pkg/compression"
@@ -17,7 +18,7 @@ import (
func smallBlockOpts(v Version, enc compression.Codec) BlockOptions {
return BlockOptions{
Schema: NewSchema(v, enc),
SeriesPageSize: 100,
SeriesPageSize: 4 << 10,
BloomPageSize: 2 << 10,
BlockSize: 0, // unlimited
}
@@ -78,3 +79,103 @@ func TestV3Roundtrip(t *testing.T) {
querier,
)
}
func seriesWithBlooms(nSeries int, fromFp, throughFp model.Fingerprint) []SeriesWithBlooms {
series, _ := MkBasicSeriesWithBlooms(nSeries, fromFp, throughFp, 0, 10000)
return series
}
func seriesWithoutBlooms(nSeries int, fromFp, throughFp model.Fingerprint) []SeriesWithBlooms {
series := seriesWithBlooms(nSeries, fromFp, throughFp)
// remove blooms from series
for i := range series {
series[i].Blooms = v2.NewEmptyIter[*Bloom]()
}
return series
}
func TestFullBlock(t *testing.T) {
opts := smallBlockOpts(V3, compression.None)
minBlockSize := opts.SeriesPageSize // 1 index page, 4KB
const maxEmptySeriesPerBlock = 47
for _, tc := range []struct {
name string
maxBlockSize uint64
series []SeriesWithBlooms
expected []SeriesWithBlooms
}{
{
name: "only series without blooms",
maxBlockSize: minBlockSize,
// +1 so we test adding the last series that fills the block
series: seriesWithoutBlooms(maxEmptySeriesPerBlock+1, 0, 0xffff),
expected: seriesWithoutBlooms(maxEmptySeriesPerBlock+1, 0, 0xffff),
},
{
name: "series without blooms and one with blooms",
maxBlockSize: minBlockSize,
series: append(
seriesWithoutBlooms(maxEmptySeriesPerBlock, 0, 0x7fff),
seriesWithBlooms(50, 0x8000, 0xffff)...,
),
expected: append(
seriesWithoutBlooms(maxEmptySeriesPerBlock, 0, 0x7fff),
seriesWithBlooms(1, 0x8000, 0x8001)...,
),
},
{
name: "only one series with bloom",
maxBlockSize: minBlockSize,
series: seriesWithBlooms(10, 0, 0xffff),
expected: seriesWithBlooms(1, 0, 1),
},
{
name: "one huge series with bloom and then series without",
maxBlockSize: minBlockSize,
series: append(
seriesWithBlooms(1, 0, 1),
seriesWithoutBlooms(100, 1, 0xffff)...,
),
expected: seriesWithBlooms(1, 0, 1),
},
{
name: "big block",
maxBlockSize: 1 << 20, // 1MB
series: seriesWithBlooms(100, 0, 0xffff),
expected: seriesWithBlooms(100, 0, 0xffff),
},
} {
t.Run(tc.name, func(t *testing.T) {
indexBuf := bytes.NewBuffer(nil)
bloomsBuf := bytes.NewBuffer(nil)
writer := NewMemoryBlockWriter(indexBuf, bloomsBuf)
reader := NewByteReader(indexBuf, bloomsBuf)
opts.BlockSize = tc.maxBlockSize
b, err := NewBlockBuilderV3(opts, writer)
require.NoError(t, err)
_, err = b.BuildFrom(v2.NewSliceIter(tc.series))
require.NoError(t, err)
block := NewBlock(reader, NewMetrics(nil))
querier := NewBlockQuerier(block, &mempool.SimpleHeapAllocator{}, DefaultMaxPageSize).Iter()
CompareIterators(
t,
func(t *testing.T, a SeriesWithBlooms, b *SeriesWithBlooms) {
require.Equal(t, a.Series.Fingerprint, b.Series.Fingerprint)
require.ElementsMatch(t, a.Series.Chunks, b.Series.Chunks)
bloomsA, err := v2.Collect(a.Blooms)
require.NoError(t, err)
bloomsB, err := v2.Collect(b.Blooms)
require.NoError(t, err)
require.Equal(t, len(bloomsB), len(bloomsA))
},
v2.NewSliceIter(tc.expected),
querier,
)
})
}
}

@@ -225,13 +225,16 @@ func newBlockRefWithEncoding(ref Ref, enc compression.Codec) BlockRef {
}
func BlockFrom(enc compression.Codec, tenant, table string, blk *v1.Block) (Block, error) {
md, _ := blk.Metadata()
md, err := blk.Metadata()
if err != nil {
return Block{}, errors.Wrap(err, "decoding index")
}
ref := newBlockRefWithEncoding(newRefFrom(tenant, table, md), enc)
// TODO(owen-d): pool
buf := bytes.NewBuffer(nil)
err := v1.TarCompress(ref.Codec, buf, blk.Reader())
err = v1.TarCompress(ref.Codec, buf, blk.Reader())
if err != nil {
return Block{}, err
}
