package bloomcompactor import ( "context" "fmt" "io" "time" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/pkg/errors" "github.com/prometheus/common/model" "github.com/grafana/loki/v3/pkg/logproto" v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1" "github.com/grafana/loki/v3/pkg/storage/chunk" "github.com/grafana/loki/v3/pkg/storage/chunk/fetcher" "github.com/grafana/loki/v3/pkg/storage/stores" "github.com/grafana/loki/v3/pkg/storage/stores/shipper/bloomshipper" "github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/tsdb" ) // inclusive range type Keyspace struct { min, max model.Fingerprint } func (k Keyspace) Cmp(other Keyspace) v1.BoundsCheck { if other.max < k.min { return v1.Before } else if other.min > k.max { return v1.After } return v1.Overlap } // Store is likely bound within. This allows specifying impls like ShardedStore // to only request the shard-range needed from the existing store. type BloomGenerator interface { Generate(ctx context.Context) (skippedBlocks []v1.BlockMetadata, toClose []io.Closer, results v1.Iterator[*v1.Block], err error) } // Simple implementation of a BloomGenerator. type SimpleBloomGenerator struct { userID string store v1.Iterator[*v1.Series] chunkLoader ChunkLoader blocksIter v1.ResettableIterator[*v1.SeriesWithBloom] // options to build blocks with opts v1.BlockOptions metrics *Metrics logger log.Logger readWriterFn func() (v1.BlockWriter, v1.BlockReader) reporter func(model.Fingerprint) tokenizer *v1.BloomTokenizer } // SimpleBloomGenerator is a foundational implementation of BloomGenerator. // It mainly wires up a few different components to generate bloom filters for a set of blocks // and handles schema compatibility: // Blocks which are incompatible with the schema are skipped and will have their chunks reindexed func NewSimpleBloomGenerator( userID string, opts v1.BlockOptions, store v1.Iterator[*v1.Series], chunkLoader ChunkLoader, blocksIter v1.ResettableIterator[*v1.SeriesWithBloom], readWriterFn func() (v1.BlockWriter, v1.BlockReader), reporter func(model.Fingerprint), metrics *Metrics, logger log.Logger, ) *SimpleBloomGenerator { return &SimpleBloomGenerator{ userID: userID, opts: opts, store: store, chunkLoader: chunkLoader, blocksIter: blocksIter, logger: log.With( logger, "component", "bloom_generator", "org_id", userID, ), readWriterFn: readWriterFn, metrics: metrics, reporter: reporter, tokenizer: v1.NewBloomTokenizer( opts.Schema.NGramLen(), opts.Schema.NGramSkip(), int(opts.UnencodedBlockOptions.MaxBloomSizeBytes), metrics.bloomMetrics, ), } } func (s *SimpleBloomGenerator) populator(ctx context.Context) func(series *v1.Series, bloom *v1.Bloom) (int, bool, error) { return func(series *v1.Series, bloom *v1.Bloom) (int, bool, error) { start := time.Now() level.Debug(s.logger).Log( "msg", "populating bloom filter", "stage", "before", "fp", series.Fingerprint, "chunks", len(series.Chunks), ) chunkItersWithFP, err := s.chunkLoader.Load(ctx, s.userID, series) if err != nil { return 0, false, errors.Wrapf(err, "failed to load chunks for series: %+v", series) } bytesAdded, skip, err := s.tokenizer.Populate( &v1.SeriesWithBloom{ Series: series, Bloom: bloom, }, chunkItersWithFP.itr, ) level.Debug(s.logger).Log( "msg", "populating bloom filter", "stage", "after", "fp", series.Fingerprint, "chunks", len(series.Chunks), "series_bytes", bytesAdded, "duration", time.Since(start), "err", err, ) if s.reporter != nil { s.reporter(series.Fingerprint) } return bytesAdded, skip, err } } func (s *SimpleBloomGenerator) Generate(ctx context.Context) *LazyBlockBuilderIterator { level.Debug(s.logger).Log("msg", "generating bloom filters for blocks", "schema", fmt.Sprintf("%+v", s.opts.Schema)) series := v1.NewPeekingIter(s.store) // TODO: Use interface impl, ok := s.blocksIter.(*blockLoadingIter) if ok { impl.Filter( func(bq *bloomshipper.CloseableBlockQuerier) bool { logger := log.With(s.logger, "block", bq.BlockRef) md, err := bq.Metadata() schema := md.Options.Schema if err != nil { level.Warn(logger).Log("msg", "failed to get schema for block", "err", err) bq.Close() // close unused querier return false } if !s.opts.Schema.Compatible(schema) { level.Warn(logger).Log("msg", "block schema incompatible with options", "generator_schema", fmt.Sprintf("%+v", s.opts.Schema), "block_schema", fmt.Sprintf("%+v", schema)) bq.Close() // close unused querier return false } level.Debug(logger).Log("msg", "adding compatible block to bloom generation inputs") return true }, ) } return NewLazyBlockBuilderIterator(ctx, s.opts, s.metrics, s.populator(ctx), s.readWriterFn, series, s.blocksIter) } // LazyBlockBuilderIterator is a lazy iterator over blocks that builds // each block by adding series to them until they are full. type LazyBlockBuilderIterator struct { ctx context.Context opts v1.BlockOptions metrics *Metrics populate func(*v1.Series, *v1.Bloom) (int, bool, error) readWriterFn func() (v1.BlockWriter, v1.BlockReader) series v1.PeekingIterator[*v1.Series] blocks v1.ResettableIterator[*v1.SeriesWithBloom] bytesAdded int curr *v1.Block err error } func NewLazyBlockBuilderIterator( ctx context.Context, opts v1.BlockOptions, metrics *Metrics, populate func(*v1.Series, *v1.Bloom) (int, bool, error), readWriterFn func() (v1.BlockWriter, v1.BlockReader), series v1.PeekingIterator[*v1.Series], blocks v1.ResettableIterator[*v1.SeriesWithBloom], ) *LazyBlockBuilderIterator { return &LazyBlockBuilderIterator{ ctx: ctx, opts: opts, metrics: metrics, populate: populate, readWriterFn: readWriterFn, series: series, blocks: blocks, } } func (b *LazyBlockBuilderIterator) Bytes() (bytes int) { return b.bytesAdded } func (b *LazyBlockBuilderIterator) Next() bool { // No more series to process if _, hasNext := b.series.Peek(); !hasNext { return false } if err := b.ctx.Err(); err != nil { b.err = errors.Wrap(err, "context canceled") return false } if err := b.blocks.Reset(); err != nil { b.err = errors.Wrap(err, "reset blocks iterator") return false } mergeBuilder := v1.NewMergeBuilder(b.blocks, b.series, b.populate, b.metrics.bloomMetrics) writer, reader := b.readWriterFn() blockBuilder, err := v1.NewBlockBuilder(b.opts, writer) if err != nil { b.err = errors.Wrap(err, "failed to create bloom block builder") return false } _, sourceBytes, err := mergeBuilder.Build(blockBuilder) b.bytesAdded += sourceBytes if err != nil { b.err = errors.Wrap(err, "failed to build bloom block") return false } b.curr = v1.NewBlock(reader, b.metrics.bloomMetrics) return true } func (b *LazyBlockBuilderIterator) At() *v1.Block { return b.curr } func (b *LazyBlockBuilderIterator) Err() error { return b.err } // IndexLoader loads an index. This helps us do things like // load TSDBs for a specific period excluding multitenant (pre-compacted) indices type indexLoader interface { Index() (tsdb.Index, error) } // ChunkItersByFingerprint models the chunks belonging to a fingerprint type ChunkItersByFingerprint struct { fp model.Fingerprint itr v1.Iterator[v1.ChunkRefWithIter] } // ChunkLoader loads chunks from a store type ChunkLoader interface { Load(ctx context.Context, userID string, series *v1.Series) (*ChunkItersByFingerprint, error) } // StoreChunkLoader loads chunks from a store type StoreChunkLoader struct { fetcherProvider stores.ChunkFetcherProvider metrics *Metrics } func NewStoreChunkLoader(fetcherProvider stores.ChunkFetcherProvider, metrics *Metrics) *StoreChunkLoader { return &StoreChunkLoader{ fetcherProvider: fetcherProvider, metrics: metrics, } } func (s *StoreChunkLoader) Load(ctx context.Context, userID string, series *v1.Series) (*ChunkItersByFingerprint, error) { // NB(owen-d): This is probably unnecessary as we should only have one fetcher // because we'll only be working on a single index period at a time, but this should protect // us in the case of refactoring/changing this and likely isn't a perf bottleneck. chksByFetcher := make(map[*fetcher.Fetcher][]chunk.Chunk) for _, chk := range series.Chunks { fetcher := s.fetcherProvider.GetChunkFetcher(chk.From) chksByFetcher[fetcher] = append(chksByFetcher[fetcher], chunk.Chunk{ ChunkRef: logproto.ChunkRef{ Fingerprint: uint64(series.Fingerprint), UserID: userID, From: chk.From, Through: chk.Through, Checksum: chk.Checksum, }, }) } var ( fetchers = make([]Fetcher[chunk.Chunk, chunk.Chunk], 0, len(chksByFetcher)) inputs = make([][]chunk.Chunk, 0, len(chksByFetcher)) ) for fetcher, chks := range chksByFetcher { fn := FetchFunc[chunk.Chunk, chunk.Chunk](fetcher.FetchChunks) fetchers = append(fetchers, fn) inputs = append(inputs, chks) } return &ChunkItersByFingerprint{ fp: series.Fingerprint, itr: newBatchedChunkLoader(ctx, fetchers, inputs, s.metrics, batchedLoaderDefaultBatchSize), }, nil }