Like Prometheus, but for logs.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 
loki/pkg/engine/compactor/sections.go

182 lines
5.9 KiB

package compactor
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"time"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/thanos-io/objstore"
"github.com/grafana/loki/v3/pkg/dataobj"
compactionv2pb "github.com/grafana/loki/v3/pkg/dataobj/compaction/v2/proto"
"github.com/grafana/loki/v3/pkg/dataobj/metastore"
"github.com/grafana/loki/v3/pkg/dataobj/sections/indexpointers"
)
// indexEntry describes a single index object that a ToC lists for one
// tenant: where the object lives and the time range it covers.
type indexEntry struct {
	// Path is the index object's path as recorded in the ToC.
	Path string

	// Start and End are the minimum and maximum timestamps recorded for
	// the index object.
	Start, End time.Time
}
// tenantIndexes maps a tenant ID to the list of indexes the ToC references
// for that tenant.
//
// Each slice is filled in the order the ToC is enumerated, but that order is
// an implementation detail and not part of the contract — callers must not
// rely on it for correctness.
type tenantIndexes map[string][]indexEntry
// loadTenantIndexes fetches the table of contents (ToC) object for the
// metastore window containing the given time and returns, grouped by tenant,
// every index entry (path plus time range) found in its indexpointers
// sections.
//
// The window is converted to UTC and truncated to
// metastore.MetastoreWindowSize before the ToC path is derived, so any
// instant inside a window resolves to the same ToC.
//
// Errors:
//   - A failure from bucket.Get is returned unwrapped, so callers can apply
//     the bucket's "object not found" check to it directly. Per the design
//     notes for this helper, the coordinator treats a missing ToC as
//     "nothing to do this cycle" and retries on the next polling tick.
//   - Read, decode and per-section failures are wrapped with the ToC path
//     or the tenant ID.
//
// No tenant filtering is applied: every indexpointers section in the ToC is
// read and its rows are appended under the section's tenant. This differs
// from pkg/dataobj/metastore.forEachIndexPointer, which (per the original
// notes) restricts itself to the tenant carried in the context.
func loadTenantIndexes(
	ctx context.Context,
	bucket objstore.Bucket,
	window time.Time,
) (tenantIndexes, error) {
	alignedWindow := window.UTC().Truncate(metastore.MetastoreWindowSize)
	tocPath := metastore.TableOfContentsPath(alignedWindow)

	body, err := bucket.Get(ctx, tocPath)
	if err != nil {
		// Deliberately not wrapped: the caller inspects this error for the
		// not-found case.
		return nil, err
	}
	defer body.Close()

	raw, err := io.ReadAll(body)
	if err != nil {
		return nil, fmt.Errorf("read ToC %s: %w", tocPath, err)
	}

	toc, err := dataobj.FromReaderAt(bytes.NewReader(raw), int64(len(raw)))
	if err != nil {
		return nil, fmt.Errorf("decode ToC %s: %w", tocPath, err)
	}

	// One Reader and one scratch buffer serve every section below. A ToC
	// carries an indexpointers section per tenant, so sharing them keeps the
	// loop from allocating a fresh reader and buffer for each one;
	// readAllIndexPointers calls Reset on the reader before every use.
	var reader indexpointers.Reader
	defer reader.Close()

	const batchSize = 1024
	scratch := make([]indexEntry, batchSize)

	byTenant := make(tenantIndexes, len(toc.Tenants()))
	pointerSections := toc.Sections().Filter(indexpointers.CheckSection)
	for _, section := range pointerSections {
		entries, err := readAllIndexPointers(ctx, &reader, scratch, section)
		if err != nil {
			return nil, fmt.Errorf("read indexpointers for tenant %s: %w", section.Tenant, err)
		}
		byTenant[section.Tenant] = append(byTenant[section.Tenant], entries...)
	}
	return byTenant, nil
}
// readAllIndexPointers decodes every row of one indexpointers section into
// indexEntry values and returns them in read order.
//
// Parameters:
//   - reader: caller-owned Reader. It is Reset and re-Opened here for the
//     given section, so it can be shared across sections; the caller remains
//     responsible for closing it.
//   - scratch: caller-owned decode buffer. Its length is used as the batch
//     size passed to reader.Read and must be non-zero. Its contents are
//     overwritten; the returned slice does not alias it.
//   - section: the indexpointers section to decode.
//
// Rows are read without any tenant or time-range filtering. Null cells leave
// the corresponding indexEntry field at its zero value, and columns of any
// type other than path / min timestamp / max timestamp are ignored.
//
// An error is returned when the section or reader cannot be opened, when a
// read fails with anything other than io.EOF, or when a batch does not have
// the expected shape (missing record, more rows than the scratch buffer,
// more columns than the section, or an unexpected Arrow array type). The
// shape checks turn what would otherwise be a panic into an error.
func readAllIndexPointers(ctx context.Context, reader *indexpointers.Reader, scratch []indexEntry, section *dataobj.Section) ([]indexEntry, error) {
	batchSize := len(scratch)
	if batchSize == 0 {
		// A zero batch size could never make progress in the read loop.
		return nil, errors.New("reading batch: scratch slice is empty")
	}

	sec, err := indexpointers.Open(ctx, section)
	if err != nil {
		return nil, fmt.Errorf("opening indexpointers section: %w", err)
	}

	// Fetch the column list once; it is used both to configure the reader
	// and to interpret each record column by position.
	columns := sec.Columns()

	reader.Reset(indexpointers.ReaderOptions{Columns: columns})
	if err := reader.Open(ctx); err != nil {
		return nil, fmt.Errorf("opening reader: %w", err)
	}

	var out []indexEntry
	for {
		rec, readErr := reader.Read(ctx, batchSize)
		atEOF := errors.Is(readErr, io.EOF)
		if readErr != nil && !atEOF {
			return nil, fmt.Errorf("reading batch: %w", readErr)
		}
		if rec == nil {
			if atEOF {
				break
			}
			return nil, errors.New("reading batch: reader returned no record batch")
		}
		// NOTE(review): rec is not released here, matching the original
		// code — confirm whether the Reader or the caller owns the batch.

		numRows, numCols := int(rec.NumRows()), int(rec.NumCols())
		if numRows == 0 && atEOF {
			break
		}
		if numRows > batchSize {
			return nil, fmt.Errorf("reading batch: got %d rows, want at most %d", numRows, batchSize)
		}
		if numCols > len(columns) {
			return nil, fmt.Errorf("reading batch: got %d columns, section has %d", numCols, len(columns))
		}

		// Zero the rows about to be populated so values from a previous
		// batch (or a previous section) cannot leak through null cells.
		batch := scratch[:numRows]
		clear(batch)

		for colIdx := range numCols {
			col := rec.Column(colIdx)
			colType := columns[colIdx].Type

			switch colType {
			case indexpointers.ColumnTypePath:
				values, ok := col.(*array.String)
				if !ok {
					return nil, fmt.Errorf("decoding path column %d: unexpected arrow type %T", colIdx, col)
				}
				for rIdx := range batch {
					if values.IsNull(rIdx) {
						continue
					}
					batch[rIdx].Path = values.Value(rIdx)
				}

			case indexpointers.ColumnTypeMinTimestamp, indexpointers.ColumnTypeMaxTimestamp:
				values, ok := col.(*array.Timestamp)
				if !ok {
					return nil, fmt.Errorf("decoding timestamp column %d: unexpected arrow type %T", colIdx, col)
				}
				isMin := colType == indexpointers.ColumnTypeMinTimestamp
				for rIdx := range batch {
					if values.IsNull(rIdx) {
						continue
					}
					// Values are interpreted as nanoseconds since the Unix
					// epoch, as the original code did — TODO confirm the
					// column's unit.
					ts := time.Unix(0, int64(values.Value(rIdx)))
					if isMin {
						batch[rIdx].Start = ts
					} else {
						batch[rIdx].End = ts
					}
				}
			}
		}

		// Copy out of scratch: the buffer is reused on the next iteration.
		out = append(out, batch...)

		if atEOF {
			break
		}
	}
	return out, nil
}
// sectionRefsFor builds one SectionRef per index entry, in input order, for
// use as input to compactionv2.Plan.
//
// Each ref carries the entry's path, section index 0, and the entry's
// Start/End as Unix-nanosecond bounds. MinKey/MaxKey are left unset; per the
// design notes for this helper, the planner's composite
// (MinKey, MinTimestamp) sort key then reduces to timestamp-only ordering,
// which is considered sufficient for index-only compaction.
//
// The result is always non-nil, including for an empty input.
func sectionRefsFor(indexes []indexEntry) []*compactionv2pb.SectionRef {
	refs := make([]*compactionv2pb.SectionRef, 0, len(indexes))
	for i := range indexes {
		entry := &indexes[i]
		refs = append(refs, &compactionv2pb.SectionRef{
			ObjectPath:   entry.Path,
			SectionIndex: 0,
			MinTimestamp: entry.Start.UnixNano(),
			MaxTimestamp: entry.End.UnixNano(),
		})
	}
	return refs
}