chore(engine): introduce ScanSet node (#19524)

Signed-off-by: Robert Fratto <robertfratto@gmail.com>
pull/19530/head
Robert Fratto 6 months ago committed by GitHub
parent b95bfabd42
commit bd3f3dabe1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 83
      pkg/engine/internal/executor/executor.go
  2. 10
      pkg/engine/internal/executor/executor_test.go
  3. 248
      pkg/engine/internal/executor/sortmerge.go
  4. 146
      pkg/engine/internal/executor/sortmerge_test.go
  5. 31
      pkg/engine/internal/planner/physical/merge.go
  6. 60
      pkg/engine/internal/planner/physical/optimizer.go
  7. 145
      pkg/engine/internal/planner/physical/optimizer_test.go
  8. 9
      pkg/engine/internal/planner/physical/plan.go
  9. 142
      pkg/engine/internal/planner/physical/planner.go
  10. 140
      pkg/engine/internal/planner/physical/planner_test.go
  11. 35
      pkg/engine/internal/planner/physical/printer.go
  12. 16
      pkg/engine/internal/planner/physical/printer_test.go
  13. 70
      pkg/engine/internal/planner/physical/scanset.go
  14. 23
      pkg/engine/internal/planner/physical/sort_order.go
  15. 63
      pkg/engine/internal/planner/physical/sortmerge.go
  16. 3
      pkg/engine/internal/planner/physical/visitor.go
  17. 28
      pkg/engine/internal/planner/physical/visitor_test.go
  18. 7
      pkg/logql/bench/store_dataobj_v2_engine.go

@ -77,16 +77,12 @@ func (c *Context) execute(ctx context.Context, node physical.Node) Pipeline {
return tracePipeline("physical.DataObjScan", c.executeDataObjScan(ctx, n))
}, inputs)
case *physical.SortMerge:
return tracePipeline("physical.SortMerge", c.executeSortMerge(ctx, n, inputs))
case *physical.TopK:
return tracePipeline("physical.TopK", c.executeTopK(ctx, n, inputs))
case *physical.Limit:
return tracePipeline("physical.Limit", c.executeLimit(ctx, n, inputs))
case *physical.Filter:
return tracePipeline("physical.Filter", c.executeFilter(ctx, n, inputs))
case *physical.Merge:
return tracePipeline("physical.Merge", c.executeMerge(ctx, n, inputs))
case *physical.Projection:
return tracePipeline("physical.Projection", c.executeProjection(ctx, n, inputs))
case *physical.RangeAggregation:
@ -99,6 +95,8 @@ func (c *Context) execute(ctx context.Context, node physical.Node) Pipeline {
return tracePipeline("physical.ColumnCompat", c.executeColumnCompat(ctx, n, inputs))
case *physical.Parallelize:
return tracePipeline("physical.Parallelize", c.executeParallelize(ctx, n, inputs))
case *physical.ScanSet:
return tracePipeline("physical.ScanSet", c.executeScanSet(ctx, n))
default:
return errorPipeline(ctx, fmt.Errorf("invalid node type: %T", node))
}
@ -248,27 +246,6 @@ func (c *Context) executeTopK(ctx context.Context, topK *physical.TopK, inputs [
return pipeline
}
// executeSortMerge builds a pipeline that merges the already-sorted inputs
// into a single sorted output, using the order and sort column configured on
// the SortMerge node. With zero inputs it short-circuits to an empty pipeline.
func (c *Context) executeSortMerge(ctx context.Context, sortmerge *physical.SortMerge, inputs []Pipeline) Pipeline {
	ctx, span := tracer.Start(ctx, "Context.executeSortMerge", trace.WithAttributes(
		attribute.Stringer("order", sortmerge.Order),
		attribute.Int("num_inputs", len(inputs)),
	))
	// The sort column may be nil; only record the attribute when present.
	if sortmerge.Column != nil {
		span.SetAttributes(attribute.Stringer("column", sortmerge.Column))
	}
	defer span.End()

	if len(inputs) == 0 {
		return emptyPipeline()
	}

	pipeline, err := NewSortMergePipeline(inputs, sortmerge.Order, sortmerge.Column, c.evaluator)
	if err != nil {
		return errorPipeline(ctx, err)
	}
	return pipeline
}
func (c *Context) executeLimit(ctx context.Context, limit *physical.Limit, inputs []Pipeline) Pipeline {
ctx, span := tracer.Start(ctx, "Context.executeLimit", trace.WithAttributes(
attribute.Int("skip", int(limit.Skip)),
@ -308,24 +285,6 @@ func (c *Context) executeFilter(ctx context.Context, filter *physical.Filter, in
return NewFilterPipeline(filter, inputs[0], c.evaluator, allocator)
}
// executeMerge builds an unordered merge that combines N input pipelines
// into one output. The Merge node itself carries no configuration, so it is
// ignored. With zero inputs it short-circuits to an empty pipeline.
func (c *Context) executeMerge(ctx context.Context, _ *physical.Merge, inputs []Pipeline) Pipeline {
	ctx, span := tracer.Start(ctx, "Context.executeMerge", trace.WithAttributes(
		attribute.Int("num_inputs", len(inputs)),
	))
	defer span.End()

	if len(inputs) == 0 {
		return emptyPipeline()
	}

	pipeline, err := newMergePipeline(inputs, c.mergePrefetchCount)
	if err != nil {
		return errorPipeline(ctx, err)
	}
	return pipeline
}
func (c *Context) executeProjection(ctx context.Context, proj *physical.Projection, inputs []Pipeline) Pipeline {
ctx, span := tracer.Start(ctx, "Context.executeProjection", trace.WithAttributes(
attribute.Int("num_columns", len(proj.Columns)),
@ -441,3 +400,41 @@ func (c *Context) executeParallelize(ctx context.Context, _ *physical.Paralleliz
// propagate up the input.
return inputs[0]
}
// executeScanSet executes a ScanSet node by expanding it into its individual
// scan targets and merging their output.
func (c *Context) executeScanSet(ctx context.Context, set *physical.ScanSet) Pipeline {
	// A ScanSet typically gets partitioned by the scheduler into multiple scan
	// nodes.
	//
	// However, for locally testing unpartitioned pipelines, we still support
	// running a ScanSet. In this case, we execute it internally as a Merge on
	// top of multiple sequential scans.
	var targets []Pipeline

	for _, target := range set.Targets {
		switch target.Type {
		case physical.ScanTypeDataObject:
			// Make sure projections and predicates get passed down to the
			// individual scan.
			partition := target.DataObject
			partition.Predicates = set.Predicates
			partition.Projections = set.Projections

			targets = append(targets, newLazyPipeline(func(ctx context.Context, _ []Pipeline) Pipeline {
				return tracePipeline("physical.DataObjScan", c.executeDataObjScan(ctx, partition))
			}, nil))

		default:
			return errorPipeline(ctx, fmt.Errorf("unrecognized ScanSet target %s", target.Type))
		}
	}

	if len(targets) == 0 {
		return emptyPipeline()
	}

	pipeline, err := newMergePipeline(targets, c.mergePrefetchCount)
	if err != nil {
		return errorPipeline(ctx, err)
	}
	return pipeline
}

@ -25,16 +25,6 @@ func TestExecutor(t *testing.T) {
})
}
// TestExecutor_SortMerge verifies the degenerate no-input case of
// executeSortMerge: the resulting pipeline must immediately report EOF.
func TestExecutor_SortMerge(t *testing.T) {
	t.Run("no inputs result in empty pipeline", func(t *testing.T) {
		ctx := t.Context()

		var c Context
		pipeline := c.executeSortMerge(ctx, &physical.SortMerge{}, nil)

		_, err := pipeline.Read(ctx)
		require.ErrorContains(t, err, EOF.Error())
	})
}
func TestExecutor_Limit(t *testing.T) {
t.Run("no inputs result in empty pipeline", func(t *testing.T) {
ctx := t.Context()

@ -1,248 +0,0 @@
package executor
import (
"context"
"errors"
"fmt"
"slices"
"sort"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
)
type compareFunc[T comparable] func(a, b T) bool
// NewSortMergePipeline returns a new pipeline that merges already sorted inputs into a single output.
func NewSortMergePipeline(inputs []Pipeline, order physical.SortOrder, column physical.ColumnExpression, evaluator expressionEvaluator) (*KWayMerge, error) {
	var cmp compareFunc[int64]
	switch order {
	case physical.ASC:
		cmp = func(x, y int64) bool { return x <= y }
	case physical.DESC:
		cmp = func(x, y int64) bool { return x >= y }
	default:
		return nil, fmt.Errorf("invalid sort order %v", order)
	}

	// Wrap every input so batches are prefetched ahead of consumption.
	for i, in := range inputs {
		inputs[i] = newPrefetchingPipeline(in)
	}

	return &KWayMerge{
		inputs:     inputs,
		columnEval: evaluator.newFunc(column),
		compare:    cmp,
	}, nil
}
// KWayMerge is a k-way merge of multiple sorted inputs.
// It requires the input batches to be sorted in the same order (ASC/DESC) as the SortMerge operator itself.
// The sort order is defined by the direction of the query, which is either FORWARD or BACKWARDS,
// which is applied to the SortMerge as well as to the DataObjScan during query planning.
type KWayMerge struct {
	inputs      []Pipeline         // input pipelines, each yielding sorted batches
	initialized bool               // guards the one-time setup performed by init
	batches     []arrow.Record     // current batch per input; nil until first load
	exhausted   []bool             // marks inputs that have returned EOF
	offsets     []int64            // next unconsumed row offset within each batch
	columnEval  evalFunc           // evaluates the sort-key column on a batch
	compare     compareFunc[int64] // ordering predicate; defaults to ascending in init
}
var _ Pipeline = (*KWayMerge)(nil)
// Close implements Pipeline. It releases any batches still held by the merge
// and then closes every input pipeline.
func (p *KWayMerge) Close() {
	for _, rec := range p.batches {
		if rec == nil {
			continue
		}
		rec.Release()
	}
	for _, in := range p.inputs {
		in.Close()
	}
}
// Read implements Pipeline. It lazily initializes internal state on the
// first call, then returns the next merged batch (or EOF when all inputs
// are exhausted).
func (p *KWayMerge) Read(ctx context.Context) (arrow.Record, error) {
	p.init(ctx)
	return p.read(ctx)
}
// init performs one-time setup: allocates per-input bookkeeping, starts
// prefetching on wrapped inputs, and installs a default ascending comparator
// when none was configured.
func (p *KWayMerge) init(ctx context.Context) {
	if p.initialized {
		return
	}
	p.initialized = true

	n := len(p.inputs)
	p.batches = make([]arrow.Record, n)
	p.exhausted = make([]bool, n)
	p.offsets = make([]int64, n)

	// Initialize pre-fetching on inputs.
	for _, in := range p.inputs {
		if pf, ok := in.(*prefetchWrapper); ok {
			pf.init(ctx)
		}
	}

	// Fall back to ascending order when no comparator was provided.
	if p.compare == nil {
		p.compare = func(a, b int64) bool { return a <= b }
	}
}
// Iterate through each record, looking at the value from their starting slice offset.
// Track the top two winners (e.g., the record whose next value is the smallest and the record whose next value is the next smallest).
// Find the largest offset in the starting record whose value is still less than the value of the runner-up record from the previous step.
// Return the slice of that record using the two offsets, and update the stored offset of the returned record for the next call to Read.
func (p *KWayMerge) read(ctx context.Context) (arrow.Record, error) {
start:
	// Per-pass scratch: current timestamp and owning input index for every
	// non-exhausted input.
	timestamps := make([]int64, 0, len(p.inputs))
	inputIndexes := make([]int, 0, len(p.inputs))

loop:
	for i := range len(p.inputs) {
		// Skip exhausted inputs
		if p.exhausted[i] {
			continue loop
		}

		// Load next batch if it hasn't been loaded yet, or if current one is already fully consumed
		// Read another batch as long as the input yields zero-length batches.
		for p.batches[i] == nil || p.offsets[i] == p.batches[i].NumRows() {
			// Reset offset for input at index i
			p.offsets[i] = 0
			// Release previously fully consumed batch
			if p.batches[i] != nil {
				p.batches[i].Release()
				p.batches[i] = nil // remove reference to arrow.Record from slice
			}

			// Read next batch from input at index i
			// If it reaches EOF, mark the input as exhausted and continue with the next input.
			rec, err := p.inputs[i].Read(ctx)
			if err != nil {
				if errors.Is(err, EOF) {
					p.exhausted[i] = true
					continue loop
				}
				return nil, err
			}
			p.batches[i] = rec
		}

		// Fetch timestamp value at current offset
		col, err := p.columnEval(p.batches[i])
		if err != nil {
			return nil, err
		}
		// NOTE(review): defer inside a loop — these Releases only run when
		// read returns, so up to one evaluated column per input stays alive
		// until then. Consider an explicit Release at the end of the
		// iteration instead.
		defer col.Release()
		tsCol, ok := col.ToArray().(*array.Timestamp)
		if !ok {
			return nil, errors.New("column is not a timestamp column")
		}
		ts := tsCol.Value(int(p.offsets[i]))
		tsCol.Release()

		// Populate slices for sorting
		inputIndexes = append(inputIndexes, i)
		timestamps = append(timestamps, int64(ts))
	}

	// Pipeline is exhausted if no more input batches are available
	if !slices.Contains(p.exhausted, false) {
		return nil, EOF
	}

	// Every remaining input produced only zero-length batches this pass;
	// retry from the top.
	if len(inputIndexes) == 0 {
		goto start
	}

	// If there is only a single remaining batch, return the remaining record
	if len(inputIndexes) == 1 {
		j := inputIndexes[0]
		start := p.offsets[j]
		end := p.batches[j].NumRows()
		// check against empty last batch
		if start >= end || end == 0 {
			return nil, EOF
		}
		p.offsets[j] = end
		return p.batches[j].NewSlice(start, end), nil
	}

	// After sorting, inputIndexes[0] owns the winning timestamp and
	// timestamps[1] holds the runner-up bound used to size the slice below.
	sortIndexesByTimestamps(inputIndexes, timestamps, p.compare)

	// Return the slice of the current record
	j := inputIndexes[0]

	// Fetch timestamp value at current offset
	col, err := p.columnEval(p.batches[j])
	if err != nil {
		return nil, err
	}
	defer col.Release()
	// We assume the column is a Uint64 array
	tsCol, ok := col.ToArray().(*array.Timestamp)
	if !ok {
		return nil, errors.New("column is not a timestamp column")
	}
	defer tsCol.Release()

	// Calculate start/end of the sub-slice of the record
	start := p.offsets[j]
	end := start + 1
	// Extend end while values still sort before the runner-up's timestamp.
	for ; end < p.batches[j].NumRows(); end++ {
		ts := tsCol.Value(int(end))
		if !p.compare(int64(ts), timestamps[1]) {
			break
		}
	}

	// check against empty batch
	// NOTE(review): end starts at start+1, so start > end can never hold at
	// this point; also this path returns the record without NewSlice, so the
	// caller and p.batches[j] share a single reference — presumably Close
	// releasing it again is a double-release risk. TODO confirm intent.
	if start > end || end == 0 {
		p.offsets[j] = end
		return p.batches[j], nil
	}
	p.offsets[j] = end
	return p.batches[j].NewSlice(start, end), nil
}
// sortIndexesByTimestamps stably reorders indexes and timestamps together so
// that timestamps obey lessFn, keeping each index paired with its timestamp.
// It panics when the two slices differ in length.
func sortIndexesByTimestamps(indexes []int, timestamps []int64, lessFn compareFunc[int64]) {
	if len(indexes) != len(timestamps) {
		panic("lengths of indexes and timestamps must match")
	}

	// Zip the parallel slices into pairs so one stable sort orders both.
	pairs := make([]inputTimestampPair, 0, len(indexes))
	for i, idx := range indexes {
		pairs = append(pairs, inputTimestampPair{index: idx, timestamp: timestamps[i]})
	}

	sort.SliceStable(pairs, func(a, b int) bool {
		return lessFn(pairs[a].timestamp, pairs[b].timestamp)
	})

	// Unzip the sorted pairs back into the caller's slices.
	for i, pair := range pairs {
		indexes[i] = pair.index
		timestamps[i] = pair.timestamp
	}
}

// inputTimestampPair couples an input index with its current timestamp so
// both can be reordered as a unit.
type inputTimestampPair struct {
	index     int
	timestamp int64
}

@ -1,146 +0,0 @@
package executor
import (
"slices"
"testing"
"time"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/stretchr/testify/require"
"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
"github.com/grafana/loki/v3/pkg/engine/internal/types"
)
// TestSortMerge exercises NewSortMergePipeline: an invalid sort column must
// surface an error, and ascending/descending timestamp inputs must produce a
// fully sorted merged output.
func TestSortMerge(t *testing.T) {
	now := time.Unix(1000000, 0)
	var batchSize = int64(3)
	c := &Context{
		batchSize: batchSize,
	}

	t.Run("invalid column name", func(t *testing.T) {
		merge := &physical.SortMerge{
			Column: &physical.ColumnExpr{
				Ref: types.ColumnRef{
					Column: "not_a_timestamp_column",
					Type:   types.ColumnTypeBuiltin,
				},
			},
			Order: physical.ASC,
		}
		inputs := []Pipeline{
			ascendingTimestampPipeline(now.Add(1 * time.Nanosecond)).Pipeline(batchSize, 10),
			ascendingTimestampPipeline(now.Add(2 * time.Nanosecond)).Pipeline(batchSize, 10),
			ascendingTimestampPipeline(now.Add(3 * time.Nanosecond)).Pipeline(batchSize, 10),
		}

		pipeline, err := NewSortMergePipeline(inputs, merge.Order, merge.Column, expressionEvaluator{})
		require.NoError(t, err)

		ctx := t.Context()
		_, err = pipeline.Read(ctx)
		require.ErrorContains(t, err, "column is not a timestamp column")
	})

	t.Run("ascending timestamp", func(t *testing.T) {
		merge := &physical.SortMerge{
			Column: &physical.ColumnExpr{
				Ref: types.ColumnRef{
					Column: types.ColumnNameBuiltinTimestamp,
					Type:   types.ColumnTypeBuiltin,
				},
			},
			Order: physical.ASC,
		}
		inputs := []Pipeline{
			ascendingTimestampPipeline(now.Add(1 * time.Nanosecond)).Pipeline(batchSize, 10),
			ascendingTimestampPipeline(now.Add(2 * time.Millisecond)).Pipeline(batchSize, 10),
			ascendingTimestampPipeline(now.Add(3 * time.Second)).Pipeline(batchSize, 10),
		}

		pipeline, err := NewSortMergePipeline(inputs, merge.Order, merge.Column, expressionEvaluator{})
		require.NoError(t, err)

		ctx := t.Context()
		timestamps := make([]arrow.Timestamp, 0, 30)
		var batches, rows int64
		for {
			batch, err := pipeline.Read(ctx)
			if err == EOF {
				break
			}
			if err != nil {
				t.Fatalf("did not expect error, got %s", err.Error())
			}
			tsCol, err := c.evaluator.eval(merge.Column, batch)
			require.NoError(t, err)
			defer tsCol.Release()
			arr := tsCol.ToArray().(*array.Timestamp)
			defer arr.Release()
			timestamps = append(timestamps, arr.Values()...)
			batches++
			rows += batch.NumRows()
		}

		// Check if ts column is sorted
		require.Truef(t,
			slices.IsSortedFunc(timestamps, func(a, b arrow.Timestamp) int { return int(a - b) }),
			"timestamps are not sorted in ASC order: %v", timestamps)
	})

	t.Run("descending timestamp", func(t *testing.T) {
		merge := &physical.SortMerge{
			Column: &physical.ColumnExpr{
				Ref: types.ColumnRef{
					Column: types.ColumnNameBuiltinTimestamp,
					Type:   types.ColumnTypeBuiltin,
				},
			},
			Order: physical.DESC,
		}
		inputs := []Pipeline{
			descendingTimestampPipeline(now.Add(1 * time.Nanosecond)).Pipeline(batchSize, 10),
			descendingTimestampPipeline(now.Add(2 * time.Millisecond)).Pipeline(batchSize, 10),
			descendingTimestampPipeline(now.Add(3 * time.Second)).Pipeline(batchSize, 10),
		}

		pipeline, err := NewSortMergePipeline(inputs, merge.Order, merge.Column, expressionEvaluator{})
		require.NoError(t, err)

		ctx := t.Context()
		timestamps := make([]arrow.Timestamp, 0, 30)
		var batches, rows int64
		for {
			batch, err := pipeline.Read(ctx)
			if err == EOF {
				break
			}
			if err != nil {
				t.Fatalf("did not expect error, got %s", err.Error())
			}
			tsCol, err := c.evaluator.eval(merge.Column, batch)
			// Check the error before deferring Release: eval may return a nil
			// column alongside an error, and deferring Release on it first
			// would panic with a nil dereference. (Matches the ascending
			// subtest, which already checks in this order.)
			require.NoError(t, err)
			defer tsCol.Release()
			arr := tsCol.ToArray().(*array.Timestamp)
			defer arr.Release()
			timestamps = append(timestamps, arr.Values()...)
			batches++
			rows += batch.NumRows()
		}

		// Check if ts column is sorted
		require.Truef(t,
			slices.IsSortedFunc(timestamps, func(a, b arrow.Timestamp) int { return int(b - a) }),
			"timestamps are not sorted in DESC order: %v", timestamps)
	})
}

@ -1,31 +0,0 @@
package physical
import "fmt"
// Merge represents a merge operation in the physical plan that merges
// N inputs to 1 output.
type Merge struct {
	id string
}

// ID implements the [Node] interface.
// Returns a string that uniquely identifies the node in the plan.
func (m *Merge) ID() string {
	if m.id != "" {
		return m.id
	}
	// No explicit ID assigned: fall back to the node's address, which is
	// unique for the node's lifetime.
	return fmt.Sprintf("%p", m)
}

// Type implements the [Node] interface.
// Returns the type of the node.
func (m *Merge) Type() NodeType {
	return NodeTypeMerge
}

// Accept implements the [Node] interface.
// Dispatches itself to the provided [Visitor] v.
func (m *Merge) Accept(v Visitor) error {
	return v.VisitMerge(m)
}

@ -35,26 +35,6 @@ func (r *removeNoopFilter) apply(node Node) bool {
var _ rule = (*removeNoopFilter)(nil)
// removeNoopMerge is a rule that removes merge/sortmerge nodes with only a single input
type removeNoopMerge struct {
	plan *Plan
}

// apply implements rule.
func (r *removeNoopMerge) apply(node Node) bool {
	switch node.(type) {
	case *Merge, *SortMerge:
		// Merging zero or one input is a no-op; splice the node out of the
		// plan graph.
		if len(r.plan.Children(node)) > 1 {
			return false
		}
		r.plan.graph.Eliminate(node)
		return true
	}
	return false
}

var _ rule = (*removeNoopMerge)(nil)
// predicatePushdown is a rule that moves down filter predicates to the scan nodes.
type predicatePushdown struct {
plan *Plan
@ -79,6 +59,12 @@ func (r *predicatePushdown) apply(node Node) bool {
func (r *predicatePushdown) applyPredicatePushdown(node Node, predicate Expression) bool {
switch node := node.(type) {
case *ScanSet:
if canApplyPredicate(predicate) {
node.Predicates = append(node.Predicates, predicate)
return true
}
return false
case *DataObjScan:
if canApplyPredicate(predicate) {
node.Predicates = append(node.Predicates, predicate)
@ -224,13 +210,15 @@ func (r *projectionPushdown) applyProjectionPushdown(
applyIfNotEmpty bool,
) bool {
switch node := node.(type) {
case *ScanSet:
return r.handleScanSet(node, projections, applyIfNotEmpty)
case *DataObjScan:
return r.handleDataObjScan(node, projections, applyIfNotEmpty)
case *ParseNode:
return r.handleParseNode(node, projections, applyIfNotEmpty)
case *RangeAggregation:
return r.handleRangeAggregation(node, projections)
case *Parallelize, *Filter, *Merge, *SortMerge, *ColumnCompat:
case *Parallelize, *Filter, *ColumnCompat:
// Push to next direct child that cares about projections
return r.pushToChildren(node, projections, applyIfNotEmpty)
}
@ -238,6 +226,36 @@ func (r *projectionPushdown) applyProjectionPushdown(
return false
}
// handleScanSet handles projection pushdown for ScanSet nodes
func (r *projectionPushdown) handleScanSet(node *ScanSet, projections []ColumnExpression, applyIfNotEmpty bool) bool {
	// Pushdown only applies to metric queries.
	if !r.isMetricQuery() {
		return false
	}
	// Nothing to push when no projections were requested but the caller only
	// wants to apply non-empty sets.
	if len(projections) == 0 && applyIfNotEmpty {
		return false
	}

	// Add each requested column to the scan projections unless already present.
	changed := false
	for _, expr := range projections {
		col, ok := expr.(*ColumnExpr)
		if !ok {
			continue
		}
		var added bool
		if node.Projections, added = addUniqueProjection(node.Projections, col); added {
			changed = true
		}
	}

	if changed {
		// Sort projections by column name for deterministic order.
		slices.SortFunc(node.Projections, sortProjections)
	}
	return changed
}
// handleDataObjScan handles projection pushdown for DataObjScan nodes
func (r *projectionPushdown) handleDataObjScan(node *DataObjScan, projections []ColumnExpression, applyIfNotEmpty bool) bool {
shouldNotApply := len(projections) == 0 && applyIfNotEmpty

@ -1,7 +1,6 @@
package physical
import (
"fmt"
"sort"
"testing"
"time"
@ -76,9 +75,15 @@ var time1000 = types.Timestamp(1000000000)
func dummyPlan() *Plan {
plan := &Plan{}
scan1 := plan.graph.Add(&DataObjScan{id: "scan1"})
scan2 := plan.graph.Add(&DataObjScan{id: "scan2"})
merge := plan.graph.Add(&SortMerge{id: "merge"})
scanSet := plan.graph.Add(&ScanSet{
id: "set",
Targets: []*ScanTarget{
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
},
})
filter1 := plan.graph.Add(&Filter{id: "filter1", Predicates: []Expression{
&BinaryExpr{
Left: newColumnExpr("timestamp", types.ColumnTypeBuiltin),
@ -97,9 +102,7 @@ func dummyPlan() *Plan {
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: filter3, Child: filter2})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: filter2, Child: filter1})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: filter1, Child: merge})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan1})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan2})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: filter1, Child: scanSet})
return plan
}
@ -133,21 +136,22 @@ func TestOptimizer(t *testing.T) {
actual := PrintAsTree(plan)
optimized := &Plan{}
scan1 := optimized.graph.Add(&DataObjScan{id: "scan1", Predicates: []Expression{
&BinaryExpr{
Left: newColumnExpr("timestamp", types.ColumnTypeBuiltin),
Right: NewLiteral(time1000),
Op: types.BinaryOpGt,
scanSet := optimized.graph.Add(&ScanSet{
id: "set",
Targets: []*ScanTarget{
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
},
}})
scan2 := optimized.graph.Add(&DataObjScan{id: "scan2", Predicates: []Expression{
&BinaryExpr{
Left: newColumnExpr("timestamp", types.ColumnTypeBuiltin),
Right: NewLiteral(time1000),
Op: types.BinaryOpGt,
Predicates: []Expression{
&BinaryExpr{
Left: newColumnExpr("timestamp", types.ColumnTypeBuiltin),
Right: NewLiteral(time1000),
Op: types.BinaryOpGt,
},
},
}})
merge := optimized.graph.Add(&SortMerge{id: "merge"})
})
filter1 := optimized.graph.Add(&Filter{id: "filter1", Predicates: []Expression{}})
filter2 := optimized.graph.Add(&Filter{id: "filter2", Predicates: []Expression{
&BinaryExpr{
@ -160,9 +164,7 @@ func TestOptimizer(t *testing.T) {
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: filter3, Child: filter2})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: filter2, Child: filter1})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: filter1, Child: merge})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan1})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan2})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: filter1, Child: scanSet})
expected := PrintAsTree(optimized)
require.Equal(t, expected, actual)
@ -181,9 +183,16 @@ func TestOptimizer(t *testing.T) {
actual := PrintAsTree(plan)
optimized := &Plan{}
scan1 := optimized.graph.Add(&DataObjScan{id: "scan1", Predicates: []Expression{}})
scan2 := optimized.graph.Add(&DataObjScan{id: "scan2", Predicates: []Expression{}})
merge := optimized.graph.Add(&SortMerge{id: "merge"})
scanSet := optimized.graph.Add(&ScanSet{
id: "set",
Targets: []*ScanTarget{
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
},
Predicates: []Expression{},
})
filter1 := optimized.graph.Add(&Filter{id: "filter1", Predicates: []Expression{
&BinaryExpr{
Left: newColumnExpr("timestamp", types.ColumnTypeBuiltin),
@ -200,9 +209,7 @@ func TestOptimizer(t *testing.T) {
}})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: filter2, Child: filter1})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: filter1, Child: merge})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan1})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan2})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: filter1, Child: scanSet})
expected := PrintAsTree(optimized)
require.Equal(t, expected, actual)
@ -626,49 +633,18 @@ func TestOptimizer(t *testing.T) {
require.Equal(t, expected, actual)
})
t.Run("cleanup no-op merge nodes", func(t *testing.T) {
plan := func() *Plan {
plan := &Plan{}
limit := plan.graph.Add(&Limit{id: "limit"})
merge := plan.graph.Add(&Merge{id: "merge"})
sortmerge := plan.graph.Add(&Merge{id: "sortmerge"})
scan := plan.graph.Add(&DataObjScan{id: "scan"})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: limit, Child: merge})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: sortmerge})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: sortmerge, Child: scan})
return plan
}()
optimizations := []*optimization{
newOptimization("cleanup", plan).withRules(
&removeNoopMerge{plan},
),
}
o := newOptimizer(plan, optimizations)
o.optimize(plan.Roots()[0])
actual := PrintAsTree(plan)
optimized := func() *Plan {
plan := &Plan{}
limit := plan.graph.Add(&Limit{id: "limit"})
scan := plan.graph.Add(&DataObjScan{id: "scan"})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: limit, Child: scan})
return plan
}()
expected := PrintAsTree(optimized)
require.Equal(t, expected, actual, fmt.Sprintf("Expected:\n%s\nActual:\n%s\n", expected, actual))
})
// both predicate pushdown and limits pushdown should work together
t.Run("predicate and limits pushdown", func(t *testing.T) {
plan := &Plan{}
scan1 := plan.graph.Add(&DataObjScan{id: "scan1"})
scan2 := plan.graph.Add(&DataObjScan{id: "scan2"})
sortMerge := plan.graph.Add(&SortMerge{id: "sortMerge"})
scanSet := plan.graph.Add(&ScanSet{
id: "set",
Targets: []*ScanTarget{
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
},
})
filter := plan.graph.Add(&Filter{id: "filter", Predicates: []Expression{
&BinaryExpr{
Left: newColumnExpr("timestamp", types.ColumnTypeBuiltin),
@ -679,9 +655,7 @@ func TestOptimizer(t *testing.T) {
limit := plan.graph.Add(&Limit{id: "limit", Fetch: 100})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: limit, Child: filter})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: filter, Child: sortMerge})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: sortMerge, Child: scan1})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: sortMerge, Child: scan2})
_ = plan.graph.AddEdge(dag.Edge[Node]{Parent: filter, Child: scanSet})
planner := NewPlanner(NewContext(time.Unix(0, 0), time.Unix(3600, 0)), &catalog{})
actual, err := planner.Optimize(plan)
@ -689,28 +663,25 @@ func TestOptimizer(t *testing.T) {
optimized := &Plan{}
{
scan1 := optimized.graph.Add(&DataObjScan{id: "scan1",
Predicates: []Expression{
&BinaryExpr{
Left: newColumnExpr("timestamp", types.ColumnTypeBuiltin),
Right: NewLiteral(time1000),
Op: types.BinaryOpGt,
},
}})
scan2 := optimized.graph.Add(&DataObjScan{id: "scan2",
scanSet := optimized.graph.Add(&ScanSet{
id: "set",
Targets: []*ScanTarget{
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
},
Predicates: []Expression{
&BinaryExpr{
Left: newColumnExpr("timestamp", types.ColumnTypeBuiltin),
Right: NewLiteral(time1000),
Op: types.BinaryOpGt,
},
}})
merge := optimized.graph.Add(&SortMerge{id: "merge"})
},
})
limit := optimized.graph.Add(&Limit{id: "limit1", Fetch: 100})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: limit, Child: merge})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan1})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan2})
_ = optimized.graph.AddEdge(dag.Edge[Node]{Parent: limit, Child: scanSet})
}
expected := PrintAsTree(optimized)
@ -1053,7 +1024,7 @@ func TestProjectionPushdown_PushesRequestedKeysToParseNodes(t *testing.T) {
parseNode = pn
continue
}
if pn, ok := node.(*DataObjScan); ok {
if pn, ok := node.(*ScanSet); ok {
for _, colExpr := range pn.Projections {
expr := colExpr.(*ColumnExpr)
projections[expr.Ref.Column] = struct{}{}

@ -19,6 +19,7 @@ const (
NodeTypeCompat
NodeTypeTopK
NodeTypeParallelize
NodeTypeScanSet
)
func (t NodeType) String() string {
@ -47,6 +48,8 @@ func (t NodeType) String() string {
return "TopK"
case NodeTypeParallelize:
return "Parallelize"
case NodeTypeScanSet:
return "ScanSet"
default:
return "Undefined"
}
@ -73,8 +76,6 @@ type Node interface {
}
var _ Node = (*DataObjScan)(nil)
var _ Node = (*Merge)(nil)
var _ Node = (*SortMerge)(nil)
var _ Node = (*Projection)(nil)
var _ Node = (*Limit)(nil)
var _ Node = (*Filter)(nil)
@ -84,10 +85,9 @@ var _ Node = (*ParseNode)(nil)
var _ Node = (*ColumnCompat)(nil)
var _ Node = (*TopK)(nil)
var _ Node = (*Parallelize)(nil)
var _ Node = (*ScanSet)(nil)
func (*DataObjScan) isNode() {}
func (*Merge) isNode() {}
func (*SortMerge) isNode() {}
func (*Projection) isNode() {}
func (*Limit) isNode() {}
func (*Filter) isNode() {}
@ -97,6 +97,7 @@ func (*ParseNode) isNode() {}
func (*ColumnCompat) isNode() {}
func (*TopK) isNode() {}
func (*Parallelize) isNode() {}
func (*ScanSet) isNode() {}
// WalkOrder defines the order for how a node and its children are visited.
type WalkOrder uint8

@ -160,84 +160,6 @@ func (p *Planner) process(inst logical.Value, ctx *Context) ([]Node, error) {
return nil, nil
}
// buildNodeGroup emits one DataObjScan per section of every descriptor in
// currentGroup and wires the scans into the plan graph beneath baseNode.
// When the query has a sort direction, scans are capped with TopK nodes: one
// shared TopK for multiple (overlapping) scans, otherwise one TopK per scan.
func (p *Planner) buildNodeGroup(currentGroup []FilteredShardDescriptor, baseNode Node, ctx *Context) error {
	scans := []Node{}
	for _, descriptor := range currentGroup {
		// output current group to nodes
		for _, section := range descriptor.Sections {
			scan := &DataObjScan{
				Location:  descriptor.Location,
				StreamIDs: descriptor.Streams,
				Section:   section,
			}
			p.plan.graph.Add(scan)
			scans = append(scans, scan)
		}
	}

	if len(scans) > 1 && ctx.direction != UNSORTED {
		// a single topK for overlapping scan nodes.
		topK := &TopK{
			SortBy:     newColumnExpr(types.ColumnNameBuiltinTimestamp, types.ColumnTypeBuiltin),
			Ascending:  ctx.direction == ASC, // apply direction from previously visited Sort node
			NullsFirst: false,                // temporarily hardcoded.
		}
		p.plan.graph.Add(topK)
		for _, scan := range scans {
			if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: topK, Child: scan}); err != nil {
				return err
			}
		}
		if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: baseNode, Child: topK}); err != nil {
			return err
		}
	} else {
		// Zero or one scan, or no sort direction: attach each scan directly
		// to baseNode, optionally behind its own TopK when sorted.
		for _, scan := range scans {
			child := scan
			if ctx.direction != UNSORTED {
				topK := &TopK{
					SortBy:     newColumnExpr(types.ColumnNameBuiltinTimestamp, types.ColumnTypeBuiltin),
					Ascending:  ctx.direction == ASC, // apply direction from previously visited Sort node
					NullsFirst: false,                // temporarily hardcoded.
				}
				p.plan.graph.Add(topK)
				if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: topK, Child: scan}); err != nil {
					return err
				}
				child = topK
			}
			if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: baseNode, Child: child}); err != nil {
				return err
			}
		}
	}

	return nil
}
// overlappingShardDescriptors partitions shard descriptors into groups whose
// time ranges overlap. Descriptors are first sorted by end time (latest
// first); each descriptor either extends the current group (when its range
// overlaps the group's combined range) or opens a new one.
func overlappingShardDescriptors(filteredShardDescriptors []FilteredShardDescriptor) [][]FilteredShardDescriptor {
	// Ensure that shard descriptors are sorted by end time.
	sort.Slice(filteredShardDescriptors, func(i, j int) bool {
		return filteredShardDescriptors[i].TimeRange.End.After(filteredShardDescriptors[j].TimeRange.End)
	})

	groups := make([][]FilteredShardDescriptor, 0, len(filteredShardDescriptors))

	var combined TimeRange
	for i, desc := range filteredShardDescriptors {
		if i > 0 && combined.Overlaps(desc.TimeRange) {
			// Overlaps the running range: extend the current group.
			last := len(groups) - 1
			groups[last] = append(groups[last], desc)
			combined = combined.Merge(desc.TimeRange)
			continue
		}
		// First descriptor, or disjoint from the running range: new group.
		groups = append(groups, []FilteredShardDescriptor{desc})
		combined = desc.TimeRange
	}
	return groups
}
// Convert [logical.MakeTable] into one or more [DataObjScan] nodes.
func (p *Planner) processMakeTable(lp *logical.MakeTable, ctx *Context) ([]Node, error) {
shard, ok := lp.Shard.(*logical.ShardInfo)
@ -256,10 +178,11 @@ func (p *Planner) processMakeTable(lp *logical.MakeTable, ctx *Context) ([]Node,
if err != nil {
return nil, err
}
groups := overlappingShardDescriptors(filteredShardDescriptors)
sort.Slice(filteredShardDescriptors, func(i, j int) bool {
return filteredShardDescriptors[i].TimeRange.End.After(filteredShardDescriptors[j].TimeRange.End)
})
if ctx.direction == ASC {
slices.Reverse(groups)
slices.Reverse(filteredShardDescriptors)
}
// Scan work can be parallelized across multiple workers, so we wrap
@ -267,29 +190,40 @@ func (p *Planner) processMakeTable(lp *logical.MakeTable, ctx *Context) ([]Node,
var parallelize Node = &Parallelize{}
p.plan.graph.Add(parallelize)
var merge Node = &Merge{}
p.plan.graph.Add(merge)
for _, gr := range groups {
if err := p.buildNodeGroup(gr, merge, ctx); err != nil {
return nil, err
scanSet := &ScanSet{}
p.plan.graph.Add(scanSet)
for _, desc := range filteredShardDescriptors {
for _, section := range desc.Sections {
scanSet.Targets = append(scanSet.Targets, &ScanTarget{
Type: ScanTypeDataObject,
DataObject: &DataObjScan{
Location: desc.Location,
StreamIDs: desc.Streams,
Section: section,
},
})
}
}
var base Node = scanSet
if p.context.v1Compatible {
compat := &ColumnCompat{
Source: types.ColumnTypeMetadata,
Destination: types.ColumnTypeMetadata,
Collision: types.ColumnTypeLabel,
}
merge, err = p.wrapNodeWith(merge, compat)
base, err = p.wrapNodeWith(base, compat)
if err != nil {
return nil, err
}
}
// Add an edge between the parallelize and the final merge node (which may
// Add an edge between the parallelize and the final base node (which may
// have been changed after processing compatibility).
if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: parallelize, Child: merge}); err != nil {
if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: parallelize, Child: base}); err != nil {
return nil, err
}
return []Node{parallelize}, nil
@ -313,14 +247,37 @@ func (p *Planner) processSelect(lp *logical.Select, ctx *Context) ([]Node, error
return []Node{node}, nil
}
// Pass sort direction from [logical.Sort] to the children.
// processSort processes a [logical.Sort] node.
func (p *Planner) processSort(lp *logical.Sort, ctx *Context) ([]Node, error) {
order := DESC
if lp.Ascending {
order = ASC
}
return p.process(lp.Table, ctx.WithDirection(order))
node := &TopK{
SortBy: &ColumnExpr{Ref: lp.Column.Ref},
Ascending: order == ASC,
NullsFirst: false,
// K initially starts at 0, indicating to sort everything. The
// [limitPushdown] optimization pass can update this value based on how
// many rows are needed.
K: 0,
}
p.plan.graph.Add(node)
children, err := p.process(lp.Table, ctx.WithDirection(order))
if err != nil {
return nil, err
}
for i := range children {
if err := p.plan.graph.AddEdge(dag.Edge[Node]{Parent: node, Child: children[i]}); err != nil {
return nil, err
}
}
return []Node{node}, nil
}
// Convert [logical.Limit] into one [Limit] node.
@ -454,9 +411,6 @@ func (p *Planner) Optimize(plan *Plan) (*Plan, error) {
newOptimization("ProjectionPushdown", plan).withRules(
&projectionPushdown{plan: plan},
),
newOptimization("CleanupMerge", plan).withRules(
&removeNoopMerge{plan: plan},
),
}
optimizer := newOptimizer(plan, optimizations)
optimizer.optimize(root)

@ -90,8 +90,13 @@ func locations(t *testing.T, plan *Plan, nodes []Node) []string {
res := make([]string, 0, len(nodes))
visitor := &nodeCollectVisitor{
onVisitDataObjScan: func(scan *DataObjScan) error {
res = append(res, string(scan.Location))
onVisitScanSet: func(set *ScanSet) error {
for _, target := range set.Targets {
switch target.Type {
case ScanTypeDataObject:
res = append(res, string(target.DataObject.Location))
}
}
return nil
},
}
@ -106,8 +111,13 @@ func sections(t *testing.T, plan *Plan, nodes []Node) [][]int {
res := make([][]int, 0, len(nodes))
visitor := &nodeCollectVisitor{
onVisitDataObjScan: func(scan *DataObjScan) error {
res = append(res, []int{scan.Section})
onVisitScanSet: func(set *ScanSet) error {
for _, target := range set.Targets {
switch target.Type {
case ScanTypeDataObject:
res = append(res, []int{target.DataObject.Section})
}
}
return nil
},
}
@ -483,25 +493,21 @@ func TestPlanner_MakeTable_Ordering(t *testing.T) {
expectedPlan := &Plan{}
parallelize := expectedPlan.graph.Add(&Parallelize{id: "parallelize"})
compat := expectedPlan.graph.Add(&ColumnCompat{id: "compat", Source: types.ColumnTypeMetadata, Destination: types.ColumnTypeMetadata, Collision: types.ColumnTypeLabel})
merge := expectedPlan.graph.Add(&Merge{id: "merge"})
topK1 := expectedPlan.graph.Add(&TopK{id: "topk1", SortBy: &ColumnExpr{Ref: types.ColumnRef{Column: "timestamp", Type: types.ColumnTypeBuiltin}}, Ascending: true})
topK2 := expectedPlan.graph.Add(&TopK{id: "topk2", SortBy: &ColumnExpr{Ref: types.ColumnRef{Column: "timestamp", Type: types.ColumnTypeBuiltin}}, Ascending: true})
scan1 := expectedPlan.graph.Add(&DataObjScan{id: "scan1", Location: "obj1", Section: 3, StreamIDs: []int64{1, 2}})
scan2 := expectedPlan.graph.Add(&DataObjScan{id: "scan2", Location: "obj2", Section: 1, StreamIDs: []int64{3, 4}})
scan3 := expectedPlan.graph.Add(&DataObjScan{id: "scan3", Location: "obj3", Section: 2, StreamIDs: []int64{5, 1}})
scan4 := expectedPlan.graph.Add(&DataObjScan{id: "scan4", Location: "obj3", Section: 3, StreamIDs: []int64{5, 1}})
scanSet := expectedPlan.graph.Add(&ScanSet{
id: "scanset",
// Targets should be added in the order of the scan timestamps
// ASC => oldest to newest
Targets: []*ScanTarget{
{Type: ScanTypeDataObject, DataObject: &DataObjScan{id: "scan4", Location: "obj3", Section: 3, StreamIDs: []int64{5, 1}}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{id: "scan3", Location: "obj3", Section: 2, StreamIDs: []int64{5, 1}}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{id: "scan2", Location: "obj2", Section: 1, StreamIDs: []int64{3, 4}}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{id: "scan1", Location: "obj1", Section: 3, StreamIDs: []int64{1, 2}}},
},
})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: parallelize, Child: compat})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: compat, Child: merge})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: topK1})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: topK2})
// Sort merges should be added in the order of the scan timestamps
// ASC => oldest to newest
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: topK1, Child: scan3})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: topK1, Child: scan4})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: topK2, Child: scan1})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: topK2, Child: scan2})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: compat, Child: scanSet})
actual := PrintAsTree(plan)
expected := PrintAsTree(expectedPlan)
@ -521,24 +527,20 @@ func TestPlanner_MakeTable_Ordering(t *testing.T) {
expectedPlan := &Plan{}
parallelize := expectedPlan.graph.Add(&Parallelize{id: "parallelize"})
compat := expectedPlan.graph.Add(&ColumnCompat{id: "compat", Source: types.ColumnTypeMetadata, Destination: types.ColumnTypeMetadata, Collision: types.ColumnTypeLabel})
merge := expectedPlan.graph.Add(&Merge{id: "merge"})
topK1 := expectedPlan.graph.Add(&TopK{id: "topk1", SortBy: &ColumnExpr{Ref: types.ColumnRef{Column: "timestamp", Type: types.ColumnTypeBuiltin}}, Ascending: false})
topK2 := expectedPlan.graph.Add(&TopK{id: "topk2", SortBy: &ColumnExpr{Ref: types.ColumnRef{Column: "timestamp", Type: types.ColumnTypeBuiltin}}, Ascending: false})
scan1 := expectedPlan.graph.Add(&DataObjScan{id: "scan1", Location: "obj1", Section: 3, StreamIDs: []int64{1, 2}})
scan2 := expectedPlan.graph.Add(&DataObjScan{id: "scan2", Location: "obj2", Section: 1, StreamIDs: []int64{3, 4}})
scan3 := expectedPlan.graph.Add(&DataObjScan{id: "scan3", Location: "obj3", Section: 2, StreamIDs: []int64{5, 1}})
scan4 := expectedPlan.graph.Add(&DataObjScan{id: "scan4", Location: "obj3", Section: 3, StreamIDs: []int64{5, 1}})
scanSet := expectedPlan.graph.Add(&ScanSet{
id: "scanset",
// Targets should be added in the order of the scan timestamps
Targets: []*ScanTarget{
{Type: ScanTypeDataObject, DataObject: &DataObjScan{id: "scan1", Location: "obj1", Section: 3, StreamIDs: []int64{1, 2}}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{id: "scan2", Location: "obj2", Section: 1, StreamIDs: []int64{3, 4}}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{id: "scan3", Location: "obj3", Section: 2, StreamIDs: []int64{5, 1}}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{id: "scan4", Location: "obj3", Section: 3, StreamIDs: []int64{5, 1}}},
},
})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: parallelize, Child: compat})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: compat, Child: merge})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: topK1})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: topK2})
// Sort merges should be added in the order of the scan timestamps
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: topK1, Child: scan1})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: topK1, Child: scan2})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: topK2, Child: scan3})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: topK2, Child: scan4})
_ = expectedPlan.graph.AddEdge(dag.Edge[Node]{Parent: compat, Child: scanSet})
actual := PrintAsTree(plan)
expected := PrintAsTree(expectedPlan)
@ -550,67 +552,3 @@ func TestPlanner_MakeTable_Ordering(t *testing.T) {
require.Equal(t, expected, actual)
})
}
// TestPlanner_OverlappingShardDescriptors checks how many overlap groups are
// produced for various time-range layouts.
func TestPlanner_OverlappingShardDescriptors(t *testing.T) {
	for _, tc := range []struct {
		name   string
		ranges []TimeRange
		groups int
	}{
		{
			name: "Isolated groups",
			ranges: []TimeRange{
				{Start: time.UnixMilli(1), End: time.UnixMilli(2)},
				{Start: time.UnixMilli(3), End: time.UnixMilli(4)},
				{Start: time.UnixMilli(5), End: time.UnixMilli(6)},
			},
			groups: 3,
		},
		{
			name: "Equal start and end are one group",
			ranges: []TimeRange{
				{Start: time.UnixMilli(1), End: time.UnixMilli(2)},
				{Start: time.UnixMilli(2), End: time.UnixMilli(4)},
			},
			groups: 1,
		},
		{
			name: "One range contains two isolated groups",
			ranges: []TimeRange{
				{Start: time.UnixMilli(1), End: time.UnixMilli(2)},
				{Start: time.UnixMilli(3), End: time.UnixMilli(4)},
				{Start: time.UnixMilli(0), End: time.UnixMilli(5)},
			},
			groups: 1,
		},
		{
			name: "One range spans two isolated groups",
			ranges: []TimeRange{
				{Start: time.UnixMilli(0), End: time.UnixMilli(2)},
				{Start: time.UnixMilli(4), End: time.UnixMilli(5)},
				{Start: time.UnixMilli(2), End: time.UnixMilli(4)},
			},
			groups: 1,
		},
		{
			name: "Real world example",
			ranges: []TimeRange{
				{Start: time.Date(2025, time.September, 16, 15, 0, 31, 361695211, time.UTC), End: time.Date(2025, time.September, 16, 15, 0, 46, 800186241, time.UTC)},
				{Start: time.Date(2025, time.September, 16, 15, 0, 31, 350398040, time.UTC), End: time.Date(2025, time.September, 16, 15, 0, 31, 350398040, time.UTC)},
				{Start: time.Date(2025, time.September, 16, 15, 0, 31, 330227014, time.UTC), End: time.Date(2025, time.September, 16, 15, 1, 3, 337407239, time.UTC)},
			},
			groups: 1,
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			// Wrap each time range in a descriptor before grouping.
			descriptors := make([]FilteredShardDescriptor, 0, len(tc.ranges))
			for _, tr := range tc.ranges {
				descriptors = append(descriptors, FilteredShardDescriptor{TimeRange: tr})
			}
			require.Len(t, overlappingShardDescriptors(descriptors), tc.groups)
		})
	}
}

@ -38,11 +38,6 @@ func toTreeNode(n Node) *tree.Node {
for i := range node.Predicates {
treeNode.Properties = append(treeNode.Properties, tree.NewProperty(fmt.Sprintf("predicate[%d]", i), false, node.Predicates[i].String()))
}
case *SortMerge:
treeNode.Properties = []tree.Property{
tree.NewProperty("column", false, node.Column),
tree.NewProperty("order", false, node.Order),
}
case *Projection:
treeNode.Properties = []tree.Property{
tree.NewProperty("columns", true, toAnySlice(node.Columns)...),
@ -51,8 +46,6 @@ func toTreeNode(n Node) *tree.Node {
for i := range node.Predicates {
treeNode.Properties = append(treeNode.Properties, tree.NewProperty(fmt.Sprintf("predicate[%d]", i), false, node.Predicates[i].String()))
}
case *Merge:
// nothing to add
case *Limit:
treeNode.Properties = []tree.Property{
tree.NewProperty("offset", false, node.Skip),
@ -92,6 +85,34 @@ func toTreeNode(n Node) *tree.Node {
tree.NewProperty("nulls_first", false, node.NullsFirst),
tree.NewProperty("k", false, node.K),
}
case *Parallelize:
// Nothing to add
case *ScanSet:
treeNode.Properties = []tree.Property{
tree.NewProperty("num_targets", false, len(node.Targets)),
}
if len(node.Projections) > 0 {
treeNode.Properties = append(treeNode.Properties, tree.NewProperty("projections", true, toAnySlice(node.Projections)...))
}
for i := range node.Predicates {
treeNode.Properties = append(treeNode.Properties, tree.NewProperty(fmt.Sprintf("predicate[%d]", i), false, node.Predicates[i].String()))
}
for _, target := range node.Targets {
properties := []tree.Property{
tree.NewProperty("type", false, target.Type.String()),
}
switch target.Type {
case ScanTypeDataObject:
// Create a child node to extract the properties of the target.
childNode := toTreeNode(target.DataObject)
properties = append(properties, childNode.Properties...)
}
treeNode.AddComment("@target", "", properties)
}
}
return treeNode
}

@ -12,13 +12,17 @@ func TestPrinter(t *testing.T) {
limit := p.graph.Add(&Limit{id: "limit"})
filter := p.graph.Add(&Filter{id: "filter"})
merge := p.graph.Add(&SortMerge{id: "merge"})
scan1 := p.graph.Add(&DataObjScan{id: "scan1"})
scan2 := p.graph.Add(&DataObjScan{id: "scan2"})
scanSet := p.graph.Add(&ScanSet{
id: "set",
Targets: []*ScanTarget{
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
{Type: ScanTypeDataObject, DataObject: &DataObjScan{}},
},
})
_ = p.graph.AddEdge(dag.Edge[Node]{Parent: limit, Child: filter})
_ = p.graph.AddEdge(dag.Edge[Node]{Parent: filter, Child: merge})
_ = p.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan1})
_ = p.graph.AddEdge(dag.Edge[Node]{Parent: merge, Child: scan2})
_ = p.graph.AddEdge(dag.Edge[Node]{Parent: filter, Child: scanSet})
repr := PrintAsTree(p)
t.Log("\n" + repr)

@ -0,0 +1,70 @@
package physical
import (
"fmt"
)
// ScanTarget represents a target of a [ScanSet].
type ScanTarget struct {
	// Type identifies which kind of data this target scans.
	Type ScanType

	// DataObject is non-nil if Type is [ScanTypeDataObject]. Despite
	// DataObjScan implementing [Node], the value is not inserted into the
	// graph as a node.
	DataObject *DataObjScan
}
// ScanType represents the data being scanned in a target of a [ScanSet].
type ScanType int

const (
	ScanTypeInvalid ScanType = iota
	ScanTypeDataObject
)

// String returns a string representation of the scan type.
func (ty ScanType) String() string {
	switch ty {
	case ScanTypeDataObject:
		return "ScanTypeDataObject"
	case ScanTypeInvalid:
		return "ScanTypeInvalid"
	}
	// Unknown values are rendered with their numeric representation.
	return fmt.Sprintf("ScanType(%d)", ty)
}
// ScanSet represents a physical plan operation for reading data from targets.
type ScanSet struct {
	// id is an optional explicit identifier; see [ScanSet.ID].
	id string

	// Targets to scan.
	Targets []*ScanTarget

	// Projections are used to limit the columns that are read to the ones
	// provided in the column expressions to reduce the amount of data that
	// needs to be processed.
	Projections []ColumnExpression

	// Predicates are used to filter rows to reduce the amount of rows that are
	// returned. Predicates would almost always contain a time range filter to
	// only read the logs for the requested time range.
	Predicates []Expression
}
// ID returns a string that uniquely identifies the node in the plan. When no
// explicit id was assigned, the node's memory address is used instead.
func (s *ScanSet) ID() string {
	if s.id != "" {
		return s.id
	}
	return fmt.Sprintf("%p", s)
}
// Type returns [NodeTypeScanSet].
func (s *ScanSet) Type() NodeType {
	return NodeTypeScanSet
}

// Accept dispatches s to the provided [Visitor] v by invoking
// [Visitor.VisitScanSet].
func (s *ScanSet) Accept(v Visitor) error {
	return v.VisitScanSet(s)
}

@ -0,0 +1,23 @@
package physical
// SortOrder describes the ordering of rows: unsorted, ascending, or
// descending.
type SortOrder uint8

const (
	UNSORTED SortOrder = iota
	ASC
	DESC
)

// String returns the string representation of the [SortOrder].
func (o SortOrder) String() string {
	if o == ASC {
		return "ASC"
	}
	if o == DESC {
		return "DESC"
	}
	if o == UNSORTED {
		return "UNSORTED"
	}
	// Any value outside the declared constants is undefined.
	return "UNDEFINED"
}

@ -1,63 +0,0 @@
package physical
import "fmt"
// SortOrder describes the ordering of rows: unsorted, ascending, or
// descending.
type SortOrder uint8

const (
	UNSORTED SortOrder = iota
	ASC
	DESC
)

// String returns the string representation of the [SortOrder].
func (o SortOrder) String() string {
	// Lookup table keyed by the iota value; anything outside of it is
	// undefined.
	names := [...]string{UNSORTED: "UNSORTED", ASC: "ASC", DESC: "DESC"}
	if int(o) < len(names) {
		return names[o]
	}
	return "UNDEFINED"
}
// SortMerge represents a sort+merge operation in the physical plan. It
// performs sorting of data based on the specified Column and Order direction.
type SortMerge struct {
	// id is an optional explicit identifier; see [SortMerge.ID].
	id string

	// Column defines the column expression by which the rows should be sorted.
	// This is almost always the timestamp column, because it is the column
	// by which the results of the DataObjScan node are sorted. This allows
	// for sorting and merging multiple already sorted inputs from the DataObjScan
	// without being a pipeline breaker.
	Column ColumnExpression

	// Order defines whether the column should be sorted in ascending or
	// descending order. Must match the read direction of the DataObjScan that
	// feeds into the SortMerge.
	Order SortOrder
}
// ID implements the [Node] interface.
// Returns a string that uniquely identifies the node in the plan; the node's
// memory address is used as fallback when no explicit id was assigned.
func (m *SortMerge) ID() string {
	if m.id != "" {
		return m.id
	}
	return fmt.Sprintf("%p", m)
}
// Type implements the [Node] interface.
// Returns the type of the node, [NodeTypeSortMerge].
func (*SortMerge) Type() NodeType {
	return NodeTypeSortMerge
}

// Accept implements the [Node] interface.
// Dispatches itself to the provided [Visitor] v by invoking
// [Visitor.VisitSortMerge].
func (m *SortMerge) Accept(v Visitor) error {
	return v.VisitSortMerge(m)
}

@ -6,15 +6,14 @@ package physical
// plan.
type Visitor interface {
	// Each method is invoked by the corresponding node type's Accept method
	// when the visitor is dispatched during plan traversal.
	VisitDataObjScan(*DataObjScan) error
	VisitSortMerge(*SortMerge) error
	VisitProjection(*Projection) error
	VisitRangeAggregation(*RangeAggregation) error
	VisitFilter(*Filter) error
	VisitMerge(*Merge) error
	VisitLimit(*Limit) error
	VisitVectorAggregation(*VectorAggregation) error
	VisitParse(*ParseNode) error
	VisitCompat(*ColumnCompat) error
	VisitTopK(*TopK) error
	VisitParallelize(*Parallelize) error
	VisitScanSet(*ScanSet) error
}

@ -14,13 +14,12 @@ type nodeCollectVisitor struct {
onVisitDataObjScan func(*DataObjScan) error
onVisitFilter func(*Filter) error
onVisitLimit func(*Limit) error
onVisitSortMerge func(*SortMerge) error
onVisitMerge func(*Merge) error
onVisitProjection func(*Projection) error
onVisitRangeAggregation func(*RangeAggregation) error
onVisitVectorAggregation func(*VectorAggregation) error
onVisitParse func(*ParseNode) error
onVisitParallelize func(*Parallelize) error
onVisitScanSet func(*ScanSet) error
}
func (v *nodeCollectVisitor) VisitDataObjScan(n *DataObjScan) error {
@ -55,23 +54,6 @@ func (v *nodeCollectVisitor) VisitProjection(n *Projection) error {
return nil
}
// VisitSortMerge delegates to the onVisitSortMerge hook when set; otherwise it
// records the visited node as "Type.ID".
func (v *nodeCollectVisitor) VisitSortMerge(n *SortMerge) error {
	if hook := v.onVisitSortMerge; hook != nil {
		return hook(n)
	}
	v.visited = append(v.visited, fmt.Sprintf("%s.%s", n.Type().String(), n.ID()))
	return nil
}
// VisitMerge delegates to the onVisitMerge hook when set; otherwise it records
// the visited node as "Type.ID".
func (v *nodeCollectVisitor) VisitMerge(n *Merge) error {
	if hook := v.onVisitMerge; hook != nil {
		return hook(n)
	}
	v.visited = append(v.visited, fmt.Sprintf("%s.%s", n.Type().String(), n.ID()))
	return nil
}
func (v *nodeCollectVisitor) VisitRangeAggregation(n *RangeAggregation) error {
if v.onVisitRangeAggregation != nil {
return v.onVisitRangeAggregation(n)
@ -114,3 +96,11 @@ func (v *nodeCollectVisitor) VisitParallelize(n *Parallelize) error {
v.visited = append(v.visited, fmt.Sprintf("%s.%s", n.Type().String(), n.ID()))
return nil
}
// VisitScanSet delegates to the onVisitScanSet hook when set; otherwise it
// records the visited node as "Type.ID".
func (v *nodeCollectVisitor) VisitScanSet(n *ScanSet) error {
	if hook := v.onVisitScanSet; hook != nil {
		return hook(n)
	}
	v.visited = append(v.visited, fmt.Sprintf("%s.%s", n.Type().String(), n.ID()))
	return nil
}

@ -29,9 +29,10 @@ type DataObjV2EngineStore struct {
func NewDataObjV2EngineStore(dir string, tenantID string) (*DataObjV2EngineStore, error) {
storageDir := filepath.Join(dir, storageDir)
return dataobjV2StoreWithOpts(storageDir, tenantID, engine.Config{
Enable: true,
BatchSize: 512,
RangeConfig: rangeio.DefaultConfig,
Enable: true,
BatchSize: 512,
RangeConfig: rangeio.DefaultConfig,
MergePrefetchCount: 8,
}, metastore.Config{
IndexStoragePrefix: "index/v0",
})

Loading…
Cancel
Save