loki/pkg/engine/internal/executor/range_aggregation_bench_tes...

package executor

import (
	"context"
	"errors"
	"testing"
	"time"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/memory"

	"github.com/grafana/loki/v3/pkg/engine/internal/assertions"
	"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
	"github.com/grafana/loki/v3/pkg/engine/internal/semconv"
	"github.com/grafana/loki/v3/pkg/engine/internal/types"
	"github.com/grafana/loki/v3/pkg/util/arrowtest"
)

// BenchmarkRangeAggregationPipeline benchmarks draining a range aggregation
// pipeline through Read, with one subbenchmark per window configuration
// (instant, aligned, gapped and overlapping).
//
// The input record batches are created once and shared by all subbenchmarks.
// Each subbenchmark builds and opens its pipeline before the timer is reset; a
// timed iteration reads until EOF and then resets the pipeline so the next
// iteration can read the same input again. assertions.Enabled is set to false
// for the duration of the benchmark and restored in a cleanup.
func BenchmarkRangeAggregationPipeline(b *testing.B) {
	prevAssertions := assertions.Enabled
	assertions.Enabled = false
	b.Cleanup(func() { assertions.Enabled = prevAssertions })

	grouping := buildRangeAggregationGrouping()
	schema, rows := buildRangeAggregationInput()
	records := buildInputRecords(b, schema, rows)
	b.Cleanup(func() {
		for i := range records {
			records[i].Release()
		}
	})

	// countOpts fills in the fields every case shares (grouping and the count
	// operation); the cases differ only in their time window parameters.
	countOpts := func(startSec, endSec int64, rangeInterval, step time.Duration) rangeAggregationOptions {
		return rangeAggregationOptions{
			grouping:      grouping,
			startTs:       time.Unix(startSec, 0),
			endTs:         time.Unix(endSec, 0),
			rangeInterval: rangeInterval,
			step:          step,
			operation:     types.RangeAggregationTypeCount,
		}
	}

	benchmarks := []struct {
		name string
		opts rangeAggregationOptions
	}{
		// Single evaluation point: start equals end and step is zero.
		{name: "case=instant", opts: countOpts(1000, 1000, 1000*time.Second, 0)},
		// Range interval equal to the step.
		{name: "case=aligned", opts: countOpts(10, 40, 10*time.Second, 10*time.Second)},
		// Range interval shorter than the step.
		{name: "case=gapped", opts: countOpts(10, 40, 5*time.Second, 10*time.Second)},
		// Range interval longer than the step.
		{name: "case=overlapping", opts: countOpts(10, 40, 5*time.Minute, 10*time.Second)},
	}

	ctx := context.Background()
	evaluator := newExpressionEvaluator()

	for _, bm := range benchmarks {
		b.Run(bm.name, func(b *testing.B) {
			source := NewBufferedPipeline(records...)
			pipeline, err := newRangeAggregationPipeline([]Pipeline{source}, evaluator, bm.opts)
			if err != nil {
				b.Fatal(err)
			}
			if err = pipeline.Open(ctx); err != nil {
				b.Fatal(err)
			}

			b.ReportAllocs()
			b.ResetTimer()
			for range b.N {
				// Drain the pipeline completely on every iteration.
				for {
					batch, readErr := pipeline.Read(ctx)
					if readErr == nil {
						if batch != nil {
							batch.Release()
						}
						continue
					}
					if !errors.Is(readErr, EOF) {
						b.Fatal(readErr)
					}
					// EOF: reset so the next iteration can read again.
					resetRangeAggregationPipeline(pipeline, source)
					break
				}
			}
		})
	}
}

// resetRangeAggregationPipeline prepares p for another full pass of Read calls:
// it resets input and p's aggregator and clears p.inputsExhausted. A
// rangeAggregationPipeline is otherwise single-shot, so this helper is meant for
// benchmarks and tests only.
//
// input is expected to be the pipeline that p reads from.
func resetRangeAggregationPipeline(p *rangeAggregationPipeline, input *BufferedPipeline) {
	input.Reset()
	p.aggregator.Reset()
	p.inputsExhausted = false
}

// buildInputRecords converts every element of rows into a record batch with the
// given schema, using memory.DefaultAllocator. The result holds one record per
// element of rows, in the same order.
func buildInputRecords(b *testing.B, schema *arrow.Schema, rows []arrowtest.Rows) []arrow.RecordBatch {
	b.Helper()

	out := make([]arrow.RecordBatch, 0, len(rows))
	for _, batchRows := range rows {
		out = append(out, batchRows.Record(memory.DefaultAllocator, schema))
	}
	return out
}

// buildRangeAggregationGrouping returns the grouping used by the benchmark: the
// "env" and "service" columns, in that order, each referenced with
// types.ColumnTypeAmbiguous, and with Without set to false.
func buildRangeAggregationGrouping() physical.Grouping {
	names := []string{"env", "service"}

	columns := make([]physical.ColumnExpression, 0, len(names))
	for _, name := range names {
		columns = append(columns, &physical.ColumnExpr{
			Ref: types.ColumnRef{Column: name, Type: types.ColumnTypeAmbiguous},
		})
	}

	return physical.Grouping{Columns: columns, Without: false}
}

// buildRangeAggregationInput returns the benchmark's input schema and rows.
//
// The schema has three fields built from colTs, colEnv and colSvc. The rows are
// split into 8 batches of 1024 rows each. Timestamps start at Unix second 12
// (UTC) and advance by one second per row across all batches; the env value
// cycles through three labels and the service value through four, both keyed on
// the global row index.
func buildRangeAggregationInput() (*arrow.Schema, []arrowtest.Rows) {
	const (
		numBatches   = 8
		rowsPerBatch = 1024
	)

	var (
		envs     = []string{"prod", "dev", "staging"}
		services = []string{"app1", "app2", "app3", "app4"}
		firstTs  = time.Unix(12, 0).UTC()
	)

	schema := arrow.NewSchema([]arrow.Field{
		semconv.FieldFromFQN(colTs, false),
		semconv.FieldFromFQN(colEnv, false),
		semconv.FieldFromFQN(colSvc, false),
	}, nil)

	out := make([]arrowtest.Rows, 0, numBatches)
	rowIdx := 0 // global row index, continuous across batches
	for len(out) < numBatches {
		batch := make(arrowtest.Rows, 0, rowsPerBatch)
		for len(batch) < rowsPerBatch {
			batch = append(batch, arrowtest.Row{
				colTs:  firstTs.Add(time.Duration(rowIdx) * time.Second),
				colEnv: envs[rowIdx%len(envs)],
				colSvc: services[rowIdx%len(services)],
			})
			rowIdx++
		}
		out = append(out, batch)
	}

	return schema, out
}