mirror of https://github.com/grafana/loki
chore(engine): provide basic implementation of DataObjScan (#17568)
Co-authored-by: Ashwanth Goli <iamashwanth@gmail.com>
parent c129024d63
commit 4841b2abe2
@@ -0,0 +1,554 @@
package executor

import (
	"cmp"
	"context"
	"errors"
	"fmt"
	"io"
	"slices"
	"sync"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/prometheus/prometheus/model/labels"
	"go.uber.org/atomic"
	"golang.org/x/sync/errgroup"

	"github.com/grafana/loki/v3/pkg/dataobj"
	"github.com/grafana/loki/v3/pkg/engine/internal/types"
	"github.com/grafana/loki/v3/pkg/engine/planner/physical"
	"github.com/grafana/loki/v3/pkg/util/topk"
)

type dataobjScan struct {
	ctx  context.Context
	opts dataobjScanOptions

	initialized bool
	readers     []*dataobj.LogsReader
	streams     map[int64]labels.Labels

	state state
}

type dataobjScanOptions struct {
	// TODO(rfratto): Limiting each DataObjScan to a single section is going to
	// be critical for limiting memory overhead here; the section is intended to
	// be the smallest unit of parallelization.

	Object      *dataobj.Object             // Object to read from.
	StreamIDs   []int64                     // Stream IDs to match from logs sections.
	Predicates  []dataobj.LogsPredicate     // Predicate to apply to the logs.
	Projections []physical.ColumnExpression // Columns to include. An empty slice means all columns.

	Direction physical.Direction // Direction of timestamps to return.
	Limit     uint32             // A limit on the number of rows to return (0=unlimited).
}

var _ Pipeline = (*dataobjScan)(nil)

// newDataobjScanPipeline creates a new Pipeline which emits a single
// [arrow.Record] composed of all log sections in a data object. Rows in the
// returned record are ordered by timestamp in the direction specified by
// opts.Direction.
func newDataobjScanPipeline(ctx context.Context, opts dataobjScanOptions) *dataobjScan {
	return &dataobjScan{ctx: ctx, opts: opts}
}
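
// A minimal usage sketch (illustrative only; it relies on the Pipeline
// methods defined in this file, and the ctx/obj values and option literal
// below are hypothetical, not taken from a real call site): Read prepares the
// next record, Value returns it, and Close releases the section readers. In
// this package, end of data surfaces as the EOF sentinel returned from read.
//
//	scan := newDataobjScanPipeline(ctx, dataobjScanOptions{
//		Object:    obj, // assumed *dataobj.Object
//		StreamIDs: []int64{1, 2},
//		Direction: physical.Forward,
//		Limit:     1000,
//	})
//	defer scan.Close()
//
//	for {
//		if err := scan.Read(); err != nil {
//			break // EOF (end of data) or a real error; callers should distinguish.
//		}
//		rec, _ := scan.Value()
//		_ = rec // consume the arrow.Record
//	}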

// Read retrieves the next [arrow.Record] from the dataobj.
func (s *dataobjScan) Read() error {
	if err := s.init(); err != nil {
		return err
	}

	rec, err := s.read()
	s.state = newState(rec, err)

	if err != nil {
		return fmt.Errorf("reading data object: %w", err)
	}
	return nil
}

func (s *dataobjScan) init() error {
	if s.initialized {
		return nil
	}

	md, err := s.opts.Object.Metadata(s.ctx)
	if err != nil {
		return fmt.Errorf("reading metadata: %w", err)
	}

	if err := s.initStreams(md); err != nil {
		return fmt.Errorf("initializing streams: %w", err)
	}

	s.readers = make([]*dataobj.LogsReader, 0, md.LogsSections)

	for section := range md.LogsSections {
		// TODO(rfratto): There's a few problems with using LogsReader as it is:
		//
		// 1. LogsReader doesn't support providing a subset of columns to read
		// from, so we're applying projections after reading.
		//
		// 2. LogsReader is intended to be pooled to reduce memory, but we're
		// creating a new one every time here.
		//
		// For the sake of the initial implementation I'm ignoring these issues,
		// but we'll absolutely need to solve this prior to production use.
		lr := dataobj.NewLogsReader(s.opts.Object, section)

		// The calls below can't fail because we're always using a brand new logs
		// reader.
		_ = lr.MatchStreams(slices.Values(s.opts.StreamIDs))
		_ = lr.SetPredicates(s.opts.Predicates)

		s.readers = append(s.readers, lr)
	}

	s.initialized = true
	return nil
}

// initStreams retrieves all requested stream records from streams sections so
// that emitted [arrow.Record]s can include stream labels in results.
func (s *dataobjScan) initStreams(md dataobj.Metadata) error {
	var sr dataobj.StreamsReader

	streams := make([]dataobj.Stream, 512)

	// Initialize entries in the map so we can do a presence test in the loop
	// below.
	s.streams = make(map[int64]labels.Labels, len(s.opts.StreamIDs))
	for _, id := range s.opts.StreamIDs {
		s.streams[id] = nil
	}

	for section := range md.StreamsSections {
		// TODO(rfratto): dataobj.StreamsPredicate is missing support for filtering
		// on stream IDs when we already know them in advance; this can cause the
		// Read here to take longer than it needs to since we're reading the
		// entirety of every row.
		sr.Reset(s.opts.Object, section)

		for {
			n, err := sr.Read(s.ctx, streams)
			if n == 0 && errors.Is(err, io.EOF) {
				break // Done with this section; continue with the next one.
			} else if err != nil && !errors.Is(err, io.EOF) {
				return err
			}

			for i, stream := range streams[:n] {
				if _, found := s.streams[stream.ID]; !found {
					continue
				}

				s.streams[stream.ID] = stream.Labels

				// Zero out the stream entry from the slice so the next call to sr.Read
				// doesn't overwrite any memory we just moved to s.streams.
				streams[i] = dataobj.Stream{}
			}
		}
	}

	// Check that all streams were populated.
	var errs []error
	for id, labels := range s.streams {
		if labels == nil {
			errs = append(errs, fmt.Errorf("requested stream ID %d not found in any stream section", id))
		}
	}
	return errors.Join(errs...)
}

// read reads the entire data object into memory and generates an arrow.Record
// from the data. It returns an error upon encountering an error while reading
// one of the sections.
func (s *dataobjScan) read() (arrow.Record, error) {
	// Since [physical.DataObjScan] requires that:
	//
	// * Records are ordered by timestamp, and
	// * Records from the same dataobjScan do not overlap in time
	//
	// we *must* read the entire data object before creating a record, as the
	// sections in the dataobj itself are not already sorted by timestamp (though
	// we only need to keep up to Limit rows in memory).

	var (
		heapMut sync.Mutex
		heap    = topk.Heap[dataobj.Record]{
			Limit: int(s.opts.Limit),
			Less:  s.getLessFunc(s.opts.Direction),
		}
	)

	g, ctx := errgroup.WithContext(s.ctx)

	var gotData atomic.Bool

	for _, reader := range s.readers {
		g.Go(func() error {
			buf := make([]dataobj.Record, 512)

			for {
				n, err := reader.Read(ctx, buf)
				if n == 0 && errors.Is(err, io.EOF) {
					return nil
				} else if err != nil && !errors.Is(err, io.EOF) {
					return err
				}

				gotData.Store(true)

				heapMut.Lock()
				for _, rec := range buf[:n] {
					heap.Push(rec)
				}
				heapMut.Unlock()
			}
		})
	}

	if err := g.Wait(); err != nil {
		return nil, err
	} else if !gotData.Load() {
		return nil, EOF
	}

	projections, err := s.effectiveProjections(&heap)
	if err != nil {
		return nil, fmt.Errorf("getting effective projections: %w", err)
	}

	schema, err := schemaFromColumns(projections)
	if err != nil {
		return nil, fmt.Errorf("creating schema: %w", err)
	}

	// TODO(rfratto): pass allocator to builder
	rb := array.NewRecordBuilder(memory.NewGoAllocator(), schema)
	defer rb.Release()

	records := heap.PopAll()
	slices.Reverse(records)

	for _, record := range records {
		for i := 0; i < schema.NumFields(); i++ {
			field, builder := rb.Schema().Field(i), rb.Field(i)
			s.appendToBuilder(builder, &field, &record)
		}
	}

	return rb.NewRecord(), nil
}

// getLessFunc returns the comparison function used for the topk heap. The
// comparison is inverted relative to the emitted order: for Forward, later
// timestamps compare as "less" so the limited heap retains the earliest rows,
// and reversing PopAll yields ascending timestamps (and vice versa for
// Backwards). Ties on timestamp are broken by comparing stream labels.
func (s *dataobjScan) getLessFunc(dir physical.Direction) func(a, b dataobj.Record) bool {
	// compareStreams is used when two records have the same timestamp.
	compareStreams := func(a, b dataobj.Record) bool {
		aStream, ok := s.streams[a.StreamID]
		if !ok {
			return false
		}

		bStream, ok := s.streams[b.StreamID]
		if !ok {
			return true
		}

		return labels.Compare(aStream, bStream) < 0
	}

	switch dir {
	case physical.Forward:
		return func(a, b dataobj.Record) bool {
			if a.Timestamp.Equal(b.Timestamp) {
				return compareStreams(a, b)
			}
			return a.Timestamp.After(b.Timestamp)
		}
	case physical.Backwards:
		return func(a, b dataobj.Record) bool {
			if a.Timestamp.Equal(b.Timestamp) {
				return compareStreams(a, b)
			}
			return a.Timestamp.Before(b.Timestamp)
		}
	default:
		panic("invalid direction")
	}
}

// effectiveProjections returns the effective projections to return for a
// record. If s.opts.Projections is non-empty, then its column expressions are
// used for the projections.
//
// Otherwise, the set of all columns found in the heap are used, in order of:
//
// * All stream labels (sorted by name)
// * All metadata columns (sorted by name)
// * Log timestamp
// * Log message
//
// effectiveProjections does not mutate h.
func (s *dataobjScan) effectiveProjections(h *topk.Heap[dataobj.Record]) ([]physical.ColumnExpression, error) {
	if len(s.opts.Projections) > 0 {
		return s.opts.Projections, nil
	}

	var (
		columns      []physical.ColumnExpression
		foundStreams = map[int64]struct{}{}
		found        = map[physical.ColumnExpr]struct{}{}
	)

	addColumn := func(name string, ty types.ColumnType) {
		expr := physical.ColumnExpr{
			Ref: types.ColumnRef{Column: name, Type: ty},
		}

		if _, ok := found[expr]; !ok {
			found[expr] = struct{}{}
			columns = append(columns, &expr)
		}
	}

	for rec := range h.Range() {
		stream, ok := s.streams[rec.StreamID]
		if !ok {
			// If we hit this, there's a problem with either initStreams (we missed a
			// requested stream) or the predicate application, where it returned a
			// stream we didn't want.
			return nil, fmt.Errorf("stream ID %d not found in stream cache", rec.StreamID)
		}

		if _, addedStream := foundStreams[rec.StreamID]; !addedStream {
			for _, label := range stream {
				addColumn(label.Name, types.ColumnTypeLabel)
			}
			foundStreams[rec.StreamID] = struct{}{}
		}

		for _, md := range rec.Metadata {
			addColumn(md.Name, types.ColumnTypeMetadata)
		}
	}

	// Sort existing columns by type (preferring labels) then name.
	slices.SortFunc(columns, func(a, b physical.ColumnExpression) int {
		aRef, bRef := a.(*physical.ColumnExpr).Ref, b.(*physical.ColumnExpr).Ref

		if aRef.Type != bRef.Type {
			if aRef.Type == types.ColumnTypeLabel {
				return -1 // Labels first.
			}
			return 1
		}

		return cmp.Compare(aRef.Column, bRef.Column)
	})

	// Add fixed columns at the end.
	addColumn("timestamp", types.ColumnTypeBuiltin)
	addColumn("message", types.ColumnTypeBuiltin)

	return columns, nil
}
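
// As a concrete illustration (hedged; the label and metadata keys here are
// hypothetical, chosen to mirror the test fixtures later in this change):
// given rows from streams labelled {env, service} with structured metadata
// keys {guid, pod} and no explicit Projections, the effective projections
// come out as:
//
//	env, service        (stream labels, sorted by name)
//	guid, pod           (metadata columns, sorted by name)
//	timestamp, message  (builtin columns, always appended last)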

func schemaFromColumns(columns []physical.ColumnExpression) (*arrow.Schema, error) {
	var (
		fields       = make([]arrow.Field, 0, len(columns))
		fingerprints = make(map[string]struct{}, len(columns))
	)

	addField := func(field arrow.Field) {
		fp := field.Fingerprint()
		if field.HasMetadata() {
			// We differentiate column type using metadata, but the metadata isn't
			// included in the fingerprint, so we need to manually include it here.
			fp += field.Metadata.String()
		}

		if _, exist := fingerprints[fp]; exist {
			return
		}

		fields = append(fields, field)
		fingerprints[fp] = struct{}{}
	}

	for _, column := range columns {
		columnExpr, ok := column.(*physical.ColumnExpr)
		if !ok {
			return nil, fmt.Errorf("invalid column expression type %T", column)
		}

		md := arrow.MetadataFrom(map[string]string{
			types.ColumnTypeMetadataKey: columnExpr.Ref.Type.String(),
		})

		switch columnExpr.Ref.Type {
		case types.ColumnTypeLabel:
			// TODO(rfratto): Switch to dictionary encoding for labels.
			//
			// Since labels are more likely to repeat than metadata, we could cut
			// down on the memory overhead of a record by dictionary encoding the
			// labels.
			//
			// However, the csv package we use for testing DataObjScan currently
			// (2025-05-02) doesn't support dictionary encoding, and we would need
			// to find a solution there.
			//
			// We skipped dictionary encoding for now to get the initial prototype
			// working.
			addField(arrow.Field{
				Name:     columnExpr.Ref.Column,
				Type:     arrow.BinaryTypes.String,
				Nullable: true,
				Metadata: md,
			})

		case types.ColumnTypeMetadata:
			// Metadata is *not* encoded using dictionary encoding since metadata
			// has unconstrained cardinality. Using dictionary encoding would
			// require tracking every encoded value in the record, which is likely
			// to be too expensive.
			addField(arrow.Field{
				Name:     columnExpr.Ref.Column,
				Type:     arrow.BinaryTypes.String,
				Nullable: true,
				Metadata: md,
			})

		case types.ColumnTypeBuiltin:
			addField(arrow.Field{
				Name:     columnExpr.Ref.Column,
				Type:     builtinColumnType(columnExpr.Ref),
				Nullable: true,
				Metadata: md,
			})

		case types.ColumnTypeAmbiguous:
			// The best handling for ambiguous columns (in terms of the schema) is to
			// explode it out into multiple columns, one for each type. (Except for
			// parsed, which can't be emitted from DataObjScan right now).
			//
			// TODO(rfratto): should ambiguity be passed down like this? It's odd for
			// the returned schema to be different than the set of columns you asked
			// for.
			//
			// As an alternative, ambiguity could be handled by the planner, where it
			// performs the explosion and propagates the ambiguity down into the
			// predicates.
			//
			// If we're ok with the schema changing from what was requested, then we
			// could update this to resolve the ambiguity at [dataobjScan.effectiveProjections]
			// so we don't always explode out to the full set of columns.
			addField(arrow.Field{
				Name:     columnExpr.Ref.Column,
				Type:     arrow.BinaryTypes.String,
				Nullable: true,
				Metadata: arrow.MetadataFrom(map[string]string{types.ColumnTypeMetadataKey: types.ColumnTypeLabel.String()}),
			})
			addField(arrow.Field{
				Name:     columnExpr.Ref.Column,
				Type:     arrow.BinaryTypes.String,
				Nullable: true,
				Metadata: arrow.MetadataFrom(map[string]string{types.ColumnTypeMetadataKey: types.ColumnTypeMetadata.String()}),
			})

		case types.ColumnTypeParsed:
			return nil, fmt.Errorf("parsed column type not supported: %s", columnExpr.Ref.Type)
		}
	}

	return arrow.NewSchema(fields, nil), nil
}

func builtinColumnType(ref types.ColumnRef) arrow.DataType {
	if ref.Type != types.ColumnTypeBuiltin {
		panic("builtinColumnType called with a non-builtin column")
	}

	switch ref.Column {
	case "timestamp":
		return arrow.FixedWidthTypes.Timestamp_ns
	case "message":
		return arrow.BinaryTypes.String
	default:
		panic(fmt.Sprintf("unsupported builtin column type %s", ref))
	}
}

// appendToBuilder appends the provided field from record into the given
// builder. The metadata of field is used to determine the category of column.
// appendToBuilder panics if the type of field does not match the datatype of
// builder.
func (s *dataobjScan) appendToBuilder(builder array.Builder, field *arrow.Field, record *dataobj.Record) {
	columnType, ok := field.Metadata.GetValue(types.ColumnTypeMetadataKey)
	if !ok {
		// This shouldn't happen; we control the metadata here on the fields.
		panic(fmt.Sprintf("missing column type in field %s", field.Name))
	}

	switch columnType {
	case types.ColumnTypeLabel.String():
		stream, ok := s.streams[record.StreamID]
		if !ok {
			panic(fmt.Sprintf("stream ID %d not found in stream cache", record.StreamID))
		}

		val := stream.Get(field.Name)
		if val == "" {
			builder.(*array.StringBuilder).AppendNull()
		} else {
			builder.(*array.StringBuilder).Append(val)
		}

	case types.ColumnTypeMetadata.String():
		val := record.Metadata.Get(field.Name)
		if val == "" {
			builder.(*array.StringBuilder).AppendNull()
		} else {
			builder.(*array.StringBuilder).Append(val)
		}

	case types.ColumnTypeBuiltin.String():
		if field.Name == "timestamp" {
			ts, _ := arrow.TimestampFromTime(record.Timestamp, arrow.Nanosecond)
			builder.(*array.TimestampBuilder).Append(ts)
		} else if field.Name == "message" {
			// Use the inner BinaryBuilder to avoid converting record.Line to a
			// string and back.
			builder.(*array.StringBuilder).BinaryBuilder.Append(record.Line)
		} else {
			panic(fmt.Sprintf("unsupported builtin column %s", field.Name))
		}

	default:
		// This shouldn't happen; we control the metadata here on the fields.
		panic(fmt.Sprintf("unsupported column type %s", columnType))
	}
}

// Value returns the current [arrow.Record] retrieved by the previous call to
// [dataobjScan.Read], or an error if the record cannot be read.
func (s *dataobjScan) Value() (arrow.Record, error) { return s.state.batch, s.state.err }

// Close closes s and releases all resources.
func (s *dataobjScan) Close() {
	for _, reader := range s.readers {
		_ = reader.Close()
	}
}

// Inputs implements Pipeline and returns nil, since DataObjScan accepts no
// pipelines as input.
func (s *dataobjScan) Inputs() []Pipeline { return nil }

// Transport implements Pipeline and returns [Local].
func (s *dataobjScan) Transport() Transport { return Local }
@@ -0,0 +1,47 @@
package executor

import (
	"fmt"

	"github.com/grafana/loki/v3/pkg/dataobj"
	"github.com/grafana/loki/v3/pkg/engine/planner/physical"
)

// buildLogsPredicate builds a logs predicate from an expression.
func buildLogsPredicate(_ physical.Expression) (dataobj.LogsPredicate, error) {
	// TODO(rfratto): implement converting expressions into logs predicates.
	//
	// There's a few challenges here:
	//
	// - Expressions do not cleanly map to [dataobj.LogsPredicate]s. For example,
	//   an expression may be simply a column reference, but a logs predicate is
	//   always some expression that can evaluate to true.
	//
	// - Mapping expressions into [dataobj.TimeRangePredicate] is a massive pain;
	//   since TimeRangePredicate specifies both bounds for the time range, we
	//   would need to find and collapse multiple physical.Expressions into a
	//   single TimeRangePredicate.
	//
	// - While [dataobj.MetadataMatcherPredicate] and
	//   [dataobj.LogMessageFilterPredicate] are catch-alls for function-based
	//   predicates, they are row-based and not column-based, so our
	//   expressionEvaluator cannot be used here.
	//
	// Long term, we likely want two things:
	//
	// 1. Use dataset.Reader and dataset.Predicate directly instead of
	//    dataobj.LogsReader.
	//
	// 2. Update dataset.Reader to be vector based instead of row-based.
	//
	// It's not clear if we should resolve the issues with LogsPredicate (or find
	// hacks to make them work in the short term), or skip straight to using
	// dataset.Reader instead.
	//
	// Implementing DataObjScan in the dataobj package would be a clean way to
	// handle all of this, but that would cause cyclic dependencies. I also don't
	// think we should start removing things from internal for this; we can
	// probably find a way to remove the physical planner's explicit dependency
	// on the dataobj package instead.
	return nil, fmt.Errorf("logs predicate conversion is not supported")
}
@@ -0,0 +1,301 @@
package executor

import (
	"bytes"
	"math"
	"testing"
	"time"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/stretchr/testify/require"

	"github.com/grafana/loki/v3/pkg/dataobj"
	"github.com/grafana/loki/v3/pkg/engine/internal/types"
	"github.com/grafana/loki/v3/pkg/engine/planner/physical"
	"github.com/grafana/loki/v3/pkg/logproto"

	"github.com/grafana/loki/pkg/push"
)

var (
	labelMD    = buildMetadata(types.ColumnTypeLabel)
	metadataMD = buildMetadata(types.ColumnTypeMetadata)
	builtinMD  = buildMetadata(types.ColumnTypeBuiltin)
)

func buildMetadata(ty types.ColumnType) arrow.Metadata {
	return arrow.MetadataFrom(map[string]string{
		types.ColumnTypeMetadataKey: ty.String(),
	})
}

func Test_dataobjScan(t *testing.T) {
	obj := buildDataobj(t, []logproto.Stream{
		{
			Labels: `{service="loki", env="prod"}`,
			Entries: []logproto.Entry{
				{
					Timestamp:          time.Unix(5, 0),
					Line:               "hello world",
					StructuredMetadata: []push.LabelAdapter{{Name: "guid", Value: "aaaa-bbbb-cccc-dddd"}},
				},
				{
					Timestamp:          time.Unix(10, 0),
					Line:               "goodbye world",
					StructuredMetadata: []push.LabelAdapter{{Name: "guid", Value: "eeee-ffff-aaaa-bbbb"}},
				},
			},
		},
		{
			Labels: `{service="notloki", env="prod"}`,
			Entries: []logproto.Entry{
				{
					Timestamp:          time.Unix(2, 0),
					Line:               "hello world",
					StructuredMetadata: []push.LabelAdapter{{Name: "pod", Value: "notloki-pod-1"}},
				},
				{
					Timestamp:          time.Unix(3, 0),
					Line:               "goodbye world",
					StructuredMetadata: []push.LabelAdapter{{Name: "pod", Value: "notloki-pod-1"}},
				},
			},
		},
	})

	t.Run("All columns", func(t *testing.T) {
		pipeline := newDataobjScanPipeline(t.Context(), dataobjScanOptions{
			Object:      obj,
			StreamIDs:   []int64{1, 2}, // All streams
			Projections: nil,           // All columns
			Direction:   physical.Forward,
			Limit:       0, // No limit
		})

		expectFields := []arrow.Field{
			{Name: "env", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
			{Name: "service", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
			{Name: "guid", Type: arrow.BinaryTypes.String, Metadata: metadataMD, Nullable: true},
			{Name: "pod", Type: arrow.BinaryTypes.String, Metadata: metadataMD, Nullable: true},
			{Name: "timestamp", Type: arrow.FixedWidthTypes.Timestamp_ns, Metadata: builtinMD, Nullable: true},
			{Name: "message", Type: arrow.BinaryTypes.String, Metadata: builtinMD, Nullable: true},
		}

		expectCSV := `prod,notloki,NULL,notloki-pod-1,1970-01-01 00:00:02,hello world
prod,notloki,NULL,notloki-pod-1,1970-01-01 00:00:03,goodbye world
prod,loki,aaaa-bbbb-cccc-dddd,NULL,1970-01-01 00:00:05,hello world
prod,loki,eeee-ffff-aaaa-bbbb,NULL,1970-01-01 00:00:10,goodbye world`

		expectRecord, err := CSVToArrow(expectFields, expectCSV)
		require.NoError(t, err)
		defer expectRecord.Release()

		AssertPipelinesEqual(t, pipeline, NewBufferedPipeline(expectRecord))
	})

	t.Run("Column subset", func(t *testing.T) {
		pipeline := newDataobjScanPipeline(t.Context(), dataobjScanOptions{
			Object:    obj,
			StreamIDs: []int64{1, 2}, // All streams
			Projections: []physical.ColumnExpression{
				&physical.ColumnExpr{Ref: types.ColumnRef{Column: "timestamp", Type: types.ColumnTypeBuiltin}},
				&physical.ColumnExpr{Ref: types.ColumnRef{Column: "env", Type: types.ColumnTypeLabel}},
			},
			Direction: physical.Forward,
			Limit:     0, // No limit
		})

		expectFields := []arrow.Field{
			{Name: "timestamp", Type: arrow.FixedWidthTypes.Timestamp_ns, Metadata: builtinMD, Nullable: true},
			{Name: "env", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
		}

		expectCSV := `1970-01-01 00:00:02,prod
1970-01-01 00:00:03,prod
1970-01-01 00:00:05,prod
1970-01-01 00:00:10,prod`

		expectRecord, err := CSVToArrow(expectFields, expectCSV)
		require.NoError(t, err)
		defer expectRecord.Release()

		AssertPipelinesEqual(t, pipeline, NewBufferedPipeline(expectRecord))
	})

	t.Run("Unknown column", func(t *testing.T) {
		// Here, we'll check for a column which only exists once in the dataobj but is
		// ambiguous from the perspective of the caller.
		pipeline := newDataobjScanPipeline(t.Context(), dataobjScanOptions{
			Object:    obj,
			StreamIDs: []int64{1, 2}, // All streams
			Projections: []physical.ColumnExpression{
				&physical.ColumnExpr{Ref: types.ColumnRef{Column: "env", Type: types.ColumnTypeAmbiguous}},
			},
			Direction: physical.Forward,
			Limit:     0, // No limit
		})

		expectFields := []arrow.Field{
			{Name: "env", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
			{Name: "env", Type: arrow.BinaryTypes.String, Metadata: metadataMD, Nullable: true},
		}

		expectCSV := `prod,NULL
prod,NULL
prod,NULL
prod,NULL`

		expectRecord, err := CSVToArrow(expectFields, expectCSV)
		require.NoError(t, err)
		defer expectRecord.Release()

		AssertPipelinesEqual(t, pipeline, NewBufferedPipeline(expectRecord))
	})
}

func Test_dataobjScan_DuplicateColumns(t *testing.T) {
	obj := buildDataobj(t, []logproto.Stream{
		// Case 1: A single row has a value for a label and metadata column with
		// the same name.
		{
			Labels: `{service="loki", env="prod", pod="pod-1"}`,
			Entries: []logproto.Entry{
				{
					Timestamp:          time.Unix(1, 0),
					Line:               "message 1",
					StructuredMetadata: []push.LabelAdapter{{Name: "pod", Value: "override"}},
				},
			},
		},

		// Case 2: A label and metadata column share a name but have values in
		// different rows.
		{
			Labels: `{service="loki", env="prod"}`,
			Entries: []logproto.Entry{{
				Timestamp:          time.Unix(2, 0),
				Line:               "message 2",
				StructuredMetadata: []push.LabelAdapter{{Name: "namespace", Value: "namespace-1"}},
			}},
		},
		{
			Labels: `{service="loki", env="prod", namespace="namespace-2"}`,
			Entries: []logproto.Entry{{
				Timestamp:          time.Unix(3, 0),
				Line:               "message 3",
				StructuredMetadata: nil,
			}},
		},
	})

	t.Run("All columns", func(t *testing.T) {
		pipeline := newDataobjScanPipeline(t.Context(), dataobjScanOptions{
			Object:      obj,
			StreamIDs:   []int64{1, 2, 3}, // All streams
			Projections: nil,              // All columns
			Direction:   physical.Forward,
			Limit:       0, // No limit
		})

		expectFields := []arrow.Field{
			{Name: "env", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
			{Name: "namespace", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
			{Name: "pod", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
			{Name: "service", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},

			{Name: "namespace", Type: arrow.BinaryTypes.String, Metadata: metadataMD, Nullable: true},
			{Name: "pod", Type: arrow.BinaryTypes.String, Metadata: metadataMD, Nullable: true},

			{Name: "timestamp", Type: arrow.FixedWidthTypes.Timestamp_ns, Metadata: builtinMD, Nullable: true},
			{Name: "message", Type: arrow.BinaryTypes.String, Metadata: builtinMD, Nullable: true},
		}

		expectCSV := `prod,NULL,pod-1,loki,NULL,override,1970-01-01 00:00:01,message 1
prod,NULL,NULL,loki,namespace-1,NULL,1970-01-01 00:00:02,message 2
prod,namespace-2,NULL,loki,NULL,NULL,1970-01-01 00:00:03,message 3`

		expectRecord, err := CSVToArrow(expectFields, expectCSV)
		require.NoError(t, err)
		defer expectRecord.Release()

		AssertPipelinesEqual(t, pipeline, NewBufferedPipeline(expectRecord))
	})

	t.Run("Ambiguous pod", func(t *testing.T) {
		pipeline := newDataobjScanPipeline(t.Context(), dataobjScanOptions{
			Object:    obj,
			StreamIDs: []int64{1, 2, 3}, // All streams
			Projections: []physical.ColumnExpression{
				&physical.ColumnExpr{Ref: types.ColumnRef{Column: "pod", Type: types.ColumnTypeAmbiguous}},
			},
			Direction: physical.Forward,
			Limit:     0, // No limit
		})

		expectFields := []arrow.Field{
			{Name: "pod", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
			{Name: "pod", Type: arrow.BinaryTypes.String, Metadata: metadataMD, Nullable: true},
		}

		expectCSV := `pod-1,override
NULL,NULL
NULL,NULL`

		expectRecord, err := CSVToArrow(expectFields, expectCSV)
		require.NoError(t, err)
		defer expectRecord.Release()

		AssertPipelinesEqual(t, pipeline, NewBufferedPipeline(expectRecord))
	})

	t.Run("Ambiguous namespace", func(t *testing.T) {
		pipeline := newDataobjScanPipeline(t.Context(), dataobjScanOptions{
			Object:    obj,
			StreamIDs: []int64{1, 2, 3}, // All streams
			Projections: []physical.ColumnExpression{
				&physical.ColumnExpr{Ref: types.ColumnRef{Column: "namespace", Type: types.ColumnTypeAmbiguous}},
			},
			Direction: physical.Forward,
			Limit:     0, // No limit
		})

		expectFields := []arrow.Field{
			{Name: "namespace", Type: arrow.BinaryTypes.String, Metadata: labelMD, Nullable: true},
			{Name: "namespace", Type: arrow.BinaryTypes.String, Metadata: metadataMD, Nullable: true},
		}

		expectCSV := `NULL,NULL
NULL,namespace-1
namespace-2,NULL`

		expectRecord, err := CSVToArrow(expectFields, expectCSV)
		require.NoError(t, err)
		defer expectRecord.Release()

		AssertPipelinesEqual(t, pipeline, NewBufferedPipeline(expectRecord))
	})
}

func buildDataobj(t testing.TB, streams []logproto.Stream) *dataobj.Object {
	t.Helper()

	builder, err := dataobj.NewBuilder(dataobj.BuilderConfig{
		TargetPageSize:          8_000,
		TargetObjectSize:        math.MaxInt,
		TargetSectionSize:       32_000,
		BufferSize:              8_000,
		SectionStripeMergeLimit: 2,
	})
	require.NoError(t, err)

	for _, stream := range streams {
		require.NoError(t, builder.Append(stream))
	}

	var buf bytes.Buffer
	_, err = builder.Flush(&buf)
	require.NoError(t, err)

	r := bytes.NewReader(buf.Bytes())
	return dataobj.FromReaderAt(r, r.Size())
}
@@ -0,0 +1,132 @@
package topk

import (
	"container/heap"
	"iter"
	"math/rand/v2"
	"slices"
)

// Heap implements a heap of T. If Limit is specified, only the greatest
// elements (according to Less) up to Limit are kept.
//
// When removing elements, the smallest element (according to Less) is returned
// first. If using Heap as a max-heap, these elements need to be stored in
// reverse order.
type Heap[T any] struct {
	Limit int               // Maximum number of entries to keep (0 = unlimited). Optional.
	Less  func(a, b T) bool // Less returns true if a < b. Required.

	values []T // Current values in the heap.
}

// Push adds v into the heap. If the heap is full, v is added only if it is
// larger than the smallest value in the heap.
func (h *Heap[T]) Push(v T) {
	if h.Limit == 0 || len(h.values) < h.Limit {
		heap.Push(h.impl(), v)
		return
	}

	// h.values[0] is always the smallest value in the heap.
	if h.Less(h.values[0], v) {
		_ = heap.Pop(h.impl())
		heap.Push(h.impl(), v)
	}
}
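
// A small worked example (illustrative only, not part of the API): with
// Limit = 3 and Less = func(a, b int) bool { return a < b }, pushing
// 1, 2, 3, 4, 5 in order first fills the heap with {1, 2, 3}; pushing 4 evicts
// 1 (the current minimum at values[0]), and pushing 5 evicts 2, leaving
// {3, 4, 5} with 3 as the new minimum.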

// Pop removes and returns the minimum element from the heap. Pop returns the
// zero value for T and false if the heap is empty.
func (h *Heap[T]) Pop() (T, bool) {
	if len(h.values) == 0 {
		var zero T
		return zero, false
	}

	return heap.Pop(heapImpl[T]{h}).(T), true
}

// Len returns the current number of elements in the heap.
func (h *Heap[T]) Len() int { return len(h.values) }

// PopAll removes and returns all elements from the heap in sorted order.
func (h *Heap[T]) PopAll() []T {
	res := h.values

	slices.SortFunc(res, func(a, b T) int {
		if h.Less(a, b) {
			return -1
		}
		return 1
	})

	// Reset h.values to nil to avoid changes to the heap modifying the returned
	// slice.
	h.values = nil
	return res
}

// Range returns an iterator over elements in the heap in random order without
// modifying the heap. The iteration order is not consistent between calls to
// Range.
//
// To retrieve items in sorted order, use [Heap.Pop] or [Heap.PopAll].
func (h *Heap[T]) Range() iter.Seq[T] {
	if len(h.values) == 0 {
		return func(func(T) bool) {}
	}

	// Create a random start point in the heap to avoid relying on the return
	// order.
	//
	// This is similar to how Go range over maps work, but that creates a seed at
	// the time the heap is created rather than when ranging begins.
	start := rand.IntN(len(h.values))

	return func(yield func(T) bool) {
		curr := start

		for {
			if !yield(h.values[curr]) {
				return
			}

			// Increment curr and stop once we've fully looped back to where we
			// started.
			curr = (curr + 1) % len(h.values)
			if curr == start {
				return
			}
		}
	}
}

type heapImpl[T any] struct {
	*Heap[T]
}

func (h *Heap[T]) impl() heap.Interface { return heapImpl[T]{h} }

var _ heap.Interface = (*heapImpl[int])(nil)

func (impl heapImpl[T]) Len() int { return impl.Heap.Len() }

func (impl heapImpl[T]) Less(i, j int) bool {
	return impl.Heap.Less(impl.values[i], impl.values[j])
}

func (impl heapImpl[T]) Swap(i, j int) {
	impl.values[i], impl.values[j] = impl.values[j], impl.values[i]
}

func (impl heapImpl[T]) Push(x any) {
	impl.values = append(impl.values, x.(T))
}

func (impl heapImpl[T]) Pop() any {
	old := impl.values
	n := len(old)
	x := old[n-1]
	impl.values = old[:n-1]
	return x
}
@@ -0,0 +1,84 @@
package topk_test

import (
	"fmt"
	"slices"
	"sort"
	"testing"

	"github.com/stretchr/testify/require"

	"github.com/grafana/loki/v3/pkg/util/topk"
)

// ExampleHeap_greatest shows how to use a [topk.Heap] to get the top-k greatest
// elements in descending order.
func ExampleHeap_greatest() {
	heap := &topk.Heap[int]{
		Limit: 3,
		Less:  func(a, b int) bool { return a < b },
	}

	for i := range 10 {
		heap.Push(i)
	}

	actual := heap.PopAll()
	slices.Reverse(actual) // Reverse to get in greatest-descending order.

	fmt.Println(actual)
	// Output: [9 8 7]
}

// ExampleHeap_least shows how to use a [topk.Heap] to get the top-k least
// elements in ascending order.
func ExampleHeap_least() {
	heap := &topk.Heap[int]{
		Limit: 3,
		Less:  func(a, b int) bool { return a > b },
	}

	for i := range 10 {
		heap.Push(i)
	}

	actual := heap.PopAll()
	slices.Reverse(actual) // Reverse to get in least-ascending order.

	fmt.Println(actual)
	// Output: [0 1 2]
}

func TestHeap_Range(t *testing.T) {
	heap := &topk.Heap[int]{
		Limit: 3,
		Less:  func(a, b int) bool { return a < b },
	}

	for i := range 10 {
		heap.Push(i)
	}

	var actual []int
	for v := range heap.Range() {
		actual = append(actual, v)
	}
	sort.Ints(actual)

	expected := []int{7, 8, 9}
	require.Equal(t, expected, actual)
}

func TestHeap_Range_Empty(t *testing.T) {
	heap := &topk.Heap[int]{
		Limit: 3,
		Less:  func(a, b int) bool { return a < b },
	}

	require.NotPanics(t, func() {
		// Iterating over an empty heap should be a no-op.
		for range heap.Range() {
			t.Fatal("there should not be any values in the empty heap")
		}
	})
}