Like Prometheus, but for logs.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 
loki/pkg/logql/rangemapper.go

578 lines
22 KiB

package logql
import (
"fmt"
"math"
"time"
"github.com/go-kit/log/level"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/grafana/loki/v3/pkg/logql/syntax"
util_log "github.com/grafana/loki/v3/pkg/util/log"
)
// splittableVectorOp is the set of vector aggregation operations that the
// RangeMapper is able to split into downstream subqueries.
var splittableVectorOp = map[string]struct{}{
	syntax.OpTypeSum:      {},
	syntax.OpTypeCount:    {},
	syntax.OpTypeMax:      {},
	syntax.OpTypeMin:      {},
	syntax.OpTypeAvg:      {},
	syntax.OpTypeTopK:     {},
	syntax.OpTypeSort:     {},
	syntax.OpTypeSortDesc: {},
}
// splittableRangeVectorOp is the set of range aggregation operations that the
// RangeMapper is able to split into downstream subqueries.
var splittableRangeVectorOp = map[string]struct{}{
	syntax.OpRangeTypeRate:      {},
	syntax.OpRangeTypeBytesRate: {},
	syntax.OpRangeTypeBytes:     {},
	syntax.OpRangeTypeCount:     {},
	syntax.OpRangeTypeSum:       {},
	syntax.OpRangeTypeMax:       {},
	syntax.OpRangeTypeMin:       {},
}
// RangeMapper is used to rewrite LogQL sample expressions into multiple
// downstream sample expressions with a smaller time range that can be executed
// using the downstream engine.
//
// A rewrite is performed using the following rules:
//  1. Check if query is splittable based on the range.
//  2. Check if the query is splittable based on the query AST
//  3. Range aggregations are split into multiple downstream range aggregation expressions
//     that are concatenated with an appropriate vector aggregator with a grouping operator.
//     If the range aggregation has a grouping, the grouping is also applied to
//     the resultant vector aggregator expression.
//     If the range aggregation has no grouping, a grouping operator using "without" is applied
//     to the resultant vector aggregator expression to preserve the stream labels.
//  4. Vector aggregations are split into multiple downstream vector aggregations
//     that are merged with vector aggregation using "without" and then aggregated
//     using the vector aggregation with the same operator,
//     either with or without grouping.
//  5. Left and right-hand side of binary operations are split individually
//     using the same rules as above.
type RangeMapper struct {
	// splitByInterval is the target range of each downstream subquery; must be > 0.
	splitByInterval time.Duration
	metrics         *MapperMetrics
	stats           *MapperStats
	// splitAlignTs, when non-zero, is the timestamp the generated subqueries are
	// aligned against (see rangeSplitAlign). A zero value means plain splitting.
	splitAlignTs time.Time
}
// NewRangeMapperWithSplitAlign builds a RangeMapper exactly like NewRangeMapper
// and additionally records splitAlign as the timestamp the generated subqueries
// are aligned to. See the rangeSplitAlign method for details on alignment.
func NewRangeMapperWithSplitAlign(interval time.Duration, splitAlign time.Time, metrics *MapperMetrics, stats *MapperStats) (RangeMapper, error) {
	mapper, err := NewRangeMapper(interval, metrics, stats)
	if err != nil {
		return RangeMapper{}, err
	}
	mapper.splitAlignTs = splitAlign
	return mapper, nil
}
// NewRangeMapper creates a new RangeMapper instance with the given duration as
// split interval. The interval must be greater than 0.
func NewRangeMapper(interval time.Duration, metrics *MapperMetrics, stats *MapperStats) (RangeMapper, error) {
	if interval <= 0 {
		return RangeMapper{}, fmt.Errorf("cannot create RangeMapper with splitByInterval <= 0; got %s", interval)
	}
	mapper := RangeMapper{
		splitByInterval: interval,
		metrics:         metrics,
		stats:           stats,
	}
	return mapper, nil
}
// NewRangeMapperMetrics returns the mapper metrics for the "range" mapper,
// registered against the given registerer.
func NewRangeMapperMetrics(registerer prometheus.Registerer) *MapperMetrics {
	return newMapperMetrics(registerer, "range")
}
// Parse parses the given LogQL query string into a sample expression and
// applies the rewrite rules for splitting it into a sample expression that can
// be executed by the downstream engine.
// It returns a boolean that is true when no rewrite was performed (a "noop",
// including the error paths), the rewritten sample expression, and an error in
// case the rewrite failed.
func (m RangeMapper) Parse(expr syntax.Expr) (bool, syntax.Expr, error) {
	origExpr, ok := expr.(syntax.SampleExpr)
	if !ok {
		return true, nil, errors.New("only sample expression supported")
	}
	recorder := m.metrics.downstreamRecorder()
	// Non-splittable queries are passed through unchanged and counted as noop.
	if !isSplittableByRange(origExpr) {
		m.metrics.ParsedQueries.WithLabelValues(NoopKey).Inc()
		return true, origExpr, nil
	}
	modExpr, err := m.Map(origExpr, nil, recorder)
	if err != nil {
		m.metrics.ParsedQueries.WithLabelValues(FailureKey).Inc()
		return true, nil, err
	}
	// A mapping that produces a textually identical expression is a noop.
	noop := origExpr.String() == modExpr.String()
	if noop {
		// reset split queries counter if the query is a noop
		m.stats.resetSplitQueries()
		m.metrics.ParsedQueries.WithLabelValues(NoopKey).Inc()
	} else {
		m.metrics.ParsedQueries.WithLabelValues(SuccessKey).Inc()
	}
	recorder.Finish() // only record metrics for successful mappings
	return noop, modExpr, err
}
// Map rewrites sample expression expr and returns the resultant sample expression to be executed by the downstream engine
// It is called recursively on the expression tree.
// The function takes an optional vector aggregation as second argument, that
// is pushed down to the downstream expression.
func (m RangeMapper) Map(expr syntax.SampleExpr, vectorAggrPushdown *syntax.VectorAggregationExpr, recorder *downstreamRecorder) (syntax.SampleExpr, error) {
	// immediately clone the passed expr to avoid mutating the original
	expr = syntax.MustClone(expr)
	switch e := expr.(type) {
	case *syntax.VectorAggregationExpr:
		return m.mapVectorAggregationExpr(e, recorder)
	case *syntax.RangeAggregationExpr:
		return m.mapRangeAggregationExpr(e, vectorAggrPushdown, recorder), nil
	case *syntax.BinOpExpr:
		lhsMapped, err := m.Map(e.SampleExpr, vectorAggrPushdown, recorder)
		if err != nil {
			return nil, err
		}
		// if left-hand side is a noop, we need to return the original expression
		// so the whole expression is a noop and thus not executed using the
		// downstream engine.
		// Note: literal expressions are identical to their mapped expression,
		// map binary expression if left-hand size is a literal
		if _, ok := e.SampleExpr.(*syntax.LiteralExpr); e.SampleExpr.String() == lhsMapped.String() && !ok {
			return e, nil
		}
		rhsMapped, err := m.Map(e.RHS, vectorAggrPushdown, recorder)
		if err != nil {
			return nil, err
		}
		// if right-hand side is a noop, we need to return the original expression
		// so the whole expression is a noop and thus not executed using the
		// downstream engine
		// Note: literal expressions are identical to their mapped expression,
		// map binary expression if right-hand size is a literal
		if _, ok := e.RHS.(*syntax.LiteralExpr); e.RHS.String() == rhsMapped.String() && !ok {
			return e, nil
		}
		// Both sides were mapped: splice the mapped subtrees into the (cloned) binop.
		e.SampleExpr = lhsMapped
		e.RHS = rhsMapped
		return e, nil
	case *syntax.LabelReplaceExpr:
		// Only the inner expression is mapped; the label_replace itself stays on top.
		lhsMapped, err := m.Map(e.Left, vectorAggrPushdown, recorder)
		if err != nil {
			return nil, err
		}
		e.Left = lhsMapped
		return e, nil
	case *syntax.LiteralExpr:
		return e, nil
	case *syntax.VectorExpr:
		return e, nil
	default:
		// ConcatSampleExpr and DownstreamSampleExpr are not supported input expression types
		return nil, errors.Errorf("unexpected expr type (%T) for ASTMapper type (%T) ", expr, m)
	}
}
// getRangeInterval returns the interval in the range vector
// Note that this function must not be called with a BinOpExpr as argument
// as it returns only the range of the RHS.
// Example: expression `count_over_time({app="foo"}[10m])` returns 10m
func getRangeInterval(expr syntax.SampleExpr) time.Duration {
	interval := time.Duration(0)
	expr.Walk(func(e syntax.Expr) {
		if rangeExpr, ok := e.(*syntax.RangeAggregationExpr); ok {
			interval = rangeExpr.Left.Interval
		}
	})
	return interval
}
// hasLabelExtractionStage returns true if an expression contains a stage for label extraction,
// such as `| json` or `| logfmt`, that would result in an exploding amount of series in downstream queries.
func hasLabelExtractionStage(expr syntax.SampleExpr) bool {
	found := false
	expr.Walk(func(e syntax.Expr) {
		if _, isLogfmt := e.(*syntax.LogfmtParserExpr); isLogfmt {
			found = true
			return
		}
		// It will **not** return true for `regexp`, `unpack` and `pattern`, since these label extraction
		// stages can control how many labels, and therefore the resulting amount of series, are extracted.
		if parser, isParser := e.(*syntax.LabelParserExpr); isParser && parser.Op == syntax.OpParserTypeJSON {
			found = true
		}
	})
	return found
}
// sumOverFullRange returns an expression that sums up individual downstream queries (with preserving labels)
// and dividing it by the full range in seconds to calculate a rate value.
// The operation defines the range aggregation operation of the downstream queries.
// Examples:
// rate({app="foo"}[2m])
// => (sum without (count_over_time({app="foo"}[1m]) ++ count_over_time({app="foo"}[1m]) offset 1m) / 120)
// rate({app="foo"} | unwrap bar [2m])
// => (sum without (sum_over_time({app="foo"}[1m]) ++ sum_over_time({app="foo"}[1m]) offset 1m) / 120)
func (m RangeMapper) sumOverFullRange(expr *syntax.RangeAggregationExpr, overrideDownstream *syntax.VectorAggregationExpr, operation string, rangeInterval time.Duration, recorder *downstreamRecorder) syntax.SampleExpr {
	var downstreamExpr syntax.SampleExpr = &syntax.RangeAggregationExpr{
		Left:      expr.Left,
		Operation: operation,
	}
	// Optimization: in case overrideDownstream exists, the downstream expression can be optimized with the grouping
	// and operation of the overrideDownstream expression in order to reduce the returned streams' label set.
	if overrideDownstream != nil {
		downstreamExpr = &syntax.VectorAggregationExpr{
			Left:      downstreamExpr,
			Grouping:  overrideDownstream.Grouping,
			Operation: overrideDownstream.Operation,
		}
		// Ensure our modified expression is still valid.
		// If the rewritten range aggregation does not validate, fall back to the
		// original expression (i.e. no split is performed).
		if downstreamExpr.(*syntax.VectorAggregationExpr).Left.(*syntax.RangeAggregationExpr).Validate() != nil {
			return expr
		}
	}
	// sum without() of the concatenated downstream queries, divided by the full
	// range in seconds, yields the rate over the original range.
	return &syntax.BinOpExpr{
		SampleExpr: &syntax.VectorAggregationExpr{
			Left: m.mapConcatSampleExpr(downstreamExpr, rangeInterval, recorder),
			Grouping: &syntax.Grouping{
				Without: true,
				Groups:  []string{},
			},
			Operation: syntax.OpTypeSum,
		},
		RHS:  &syntax.LiteralExpr{Val: rangeInterval.Seconds()},
		Op:   syntax.OpTypeDiv,
		Opts: &syntax.BinOpOptions{},
	}
}
// vectorAggrWithRangeDownstreams returns an expression that aggregates a concat sample expression of multiple range
// aggregations. If a vector aggregation is pushed down, the downstream queries of the concat sample expression are
// wrapped in the vector aggregation of the parent node.
// Example:
// min(bytes_over_time({job="bar"} [2m])
// => min without (bytes_over_time({job="bar"} [1m]) ++ bytes_over_time({job="bar"} [1m] offset 1m))
// min by (app) (bytes_over_time({job="bar"} [2m])
// => min without (min by (app) (bytes_over_time({job="bar"} [1m])) ++ min by (app) (bytes_over_time({job="bar"} [1m] offset 1m)))
func (m RangeMapper) vectorAggrWithRangeDownstreams(expr *syntax.RangeAggregationExpr, vectorAggrPushdown *syntax.VectorAggregationExpr, op string, rangeInterval time.Duration, recorder *downstreamRecorder) syntax.SampleExpr {
	// Without an explicit grouping, aggregate "without ()" to preserve the stream labels.
	grouping := expr.Grouping
	if grouping == nil {
		grouping = &syntax.Grouping{Without: true, Groups: []string{}}
	}
	// The pushed-down vector aggregation, when present, becomes the downstream query.
	var downstream syntax.SampleExpr = expr
	if vectorAggrPushdown != nil {
		downstream = vectorAggrPushdown
	}
	return &syntax.VectorAggregationExpr{
		Left:      m.mapConcatSampleExpr(downstream, rangeInterval, recorder),
		Grouping:  grouping,
		Operation: op,
	}
}
// appendDownstream adds expression expr with a range interval 'interval' and offset 'offset' to the downstreams list.
// Returns the updated downstream ConcatSampleExpr.
func appendDownstream(downstreams *ConcatSampleExpr, expr syntax.SampleExpr, interval time.Duration, offset time.Duration) *ConcatSampleExpr {
	// Clone so the caller's expression is never mutated.
	clone := syntax.MustClone(expr)
	clone.Walk(func(e syntax.Expr) {
		rangeExpr, ok := e.(*syntax.RangeAggregationExpr)
		if !ok {
			return
		}
		rangeExpr.Left.Interval = interval
		if offset != 0 {
			rangeExpr.Left.Offset = offset
		}
	})
	// Prepend the new downstream; the list is built in reverse order.
	return &ConcatSampleExpr{
		DownstreamSampleExpr: DownstreamSampleExpr{
			SampleExpr: clone,
		},
		next: downstreams,
	}
}
// getOffsets collects the offsets of all range aggregations in expr.
func getOffsets(expr syntax.SampleExpr) []time.Duration {
	// Expect to always find at most 1 offset, so preallocate it accordingly
	offsets := make([]time.Duration, 0, 1)
	expr.Walk(func(e syntax.Expr) {
		if rangeExpr, ok := e.(*syntax.RangeAggregationExpr); ok {
			offsets = append(offsets, rangeExpr.Left.Offset)
		}
	})
	return offsets
}
// getOriginalOffset returns the offset specified in the input expr.
// Note that the returned offset can be zero or negative.
// An error is returned when the expression contains more than one offset.
func (m RangeMapper) getOriginalOffset(expr syntax.SampleExpr) (time.Duration, error) {
	// Idiom: the named result parameters were unused; a switch on the number of
	// found offsets expresses the three cases directly.
	offsets := getOffsets(expr)
	switch len(offsets) {
	case 0:
		return 0, nil
	case 1:
		return offsets[0], nil
	default:
		return 0, fmt.Errorf("found %d offsets while expecting at most 1", len(offsets))
	}
}
// mapConcatSampleExpr transform expr in multiple downstream subexpressions split by offset range interval
// rangeInterval should be greater than m.splitByInterval, otherwise the resultant expression
// will have an unnecessary aggregation operation
func (m RangeMapper) mapConcatSampleExpr(expr syntax.SampleExpr, rangeInterval time.Duration, recorder *downstreamRecorder) syntax.SampleExpr {
	// An alignment timestamp selects the aligned splitting strategy.
	if !m.splitAlignTs.IsZero() {
		return m.rangeSplitAlign(expr, rangeInterval, recorder)
	}
	return m.rangeSplit(expr, rangeInterval, recorder)
}
// rangeSplitAlign try to split given `rangeInterval` into units of `m.splitByInterval` by making sure `rangeInterval` is aligned with `m.splitByInterval` for as much as the units as possible.
// Consider following example with real use case.
// Instant Query: `sum(rate({foo="bar"}[3h])`
// execTs: 12:34:00
// splitBy: 1h
// Given above parameters, queries will be split into following
// 1. sum(rate({foo="bar"}[34m]))
// 2. sum(rate({foo="bar"}[1h] offset 34m))
// 3. sum(rate({foo="bar"}[1h] offset 1h34m))
// 4. sum(rate({foo="bar"}[26m] offset 2h34m))
func (m RangeMapper) rangeSplitAlign(
	expr syntax.SampleExpr, rangeInterval time.Duration, recorder *downstreamRecorder,
) syntax.SampleExpr {
	// Ranges that fit into a single split interval are not split at all.
	if rangeInterval <= m.splitByInterval {
		return expr
	}
	originalOffset, err := m.getOriginalOffset(expr)
	if err != nil {
		// On error (multiple offsets found), fall back to the unsplit expression.
		return expr
	}
	align := m.splitAlignTs.Sub(m.splitAlignTs.Truncate(m.splitByInterval)) // say, 12:34:00 - 12:00:00(truncated) = 34m
	if align == 0 {
		return m.rangeSplit(expr, rangeInterval, recorder) // Don't have to align
	}
	var (
		newRng = align
		// TODO(kavi): If the originalOffset is non-zero, there may be a edge case, where subqueries generated won't be aligned correctly. Handle this edge case in separate PR.
		newOffset            = originalOffset
		downstreams          *ConcatSampleExpr
		pendingRangeInterval = rangeInterval
		splits               = 0
	)
	// first subquery
	// The first subquery covers only the unaligned remainder (`align`), so every
	// following subquery starts on a split-interval boundary.
	downstreams = appendDownstream(downstreams, expr, newRng, newOffset)
	splits++
	newOffset += align // e.g: offset 34m
	pendingRangeInterval -= newRng
	newRng = m.splitByInterval // [1h]
	// Rest of the subqueries.
	for pendingRangeInterval > 0 {
		if pendingRangeInterval < m.splitByInterval {
			newRng = pendingRangeInterval // last subquery
		}
		downstreams = appendDownstream(downstreams, expr, newRng, newOffset)
		newOffset += m.splitByInterval
		pendingRangeInterval -= newRng
		splits++
	}
	// update stats and metrics
	m.stats.AddSplitQueries(splits)
	recorder.Add(splits, MetricsKey)
	return downstreams
}
// rangeSplit splits expr into ceil(rangeInterval / splitByInterval) downstream
// subqueries, each shifted by a multiple of the split interval. Expressions
// that fit into a single split interval are returned unchanged.
func (m RangeMapper) rangeSplit(expr syntax.SampleExpr, rangeInterval time.Duration, recorder *downstreamRecorder) syntax.SampleExpr {
	splitCount := int(math.Ceil(float64(rangeInterval) / float64(m.splitByInterval)))
	if splitCount <= 1 {
		return expr
	}
	originalOffset, err := m.getOriginalOffset(expr)
	if err != nil {
		// Multiple offsets found: fall back to the unsplit expression.
		return expr
	}
	var downstreams *ConcatSampleExpr
	for i := 0; i < splitCount; i++ {
		splitOffset := time.Duration(i) * m.splitByInterval
		// The last subquery may cover less than a full split interval.
		splitRange := m.splitByInterval
		if remaining := rangeInterval - splitOffset; remaining < splitRange {
			splitRange = remaining
		}
		// Each downstream offset is the original offset plus a multiple of the split interval.
		downstreams = appendDownstream(downstreams, expr, splitRange, splitOffset+originalOffset)
	}
	// Update stats and metrics
	m.stats.AddSplitQueries(splitCount)
	recorder.Add(splitCount, MetricsKey)
	return downstreams
}
// mapVectorAggregationExpr splits the inner expression of a vector aggregation
// and re-wraps the mapped result in an equivalent vector aggregation.
func (m RangeMapper) mapVectorAggregationExpr(expr *syntax.VectorAggregationExpr, recorder *downstreamRecorder) (syntax.SampleExpr, error) {
	rangeInterval := getRangeInterval(expr)
	// in case the interval is smaller than the configured split interval,
	// don't split it.
	if rangeInterval <= m.splitByInterval {
		return expr, nil
	}
	// In order to minimize the amount of streams on the downstream query,
	// we can push down the outer vector aggregation to the downstream query.
	// This does not work for `count()` and `topk()`, though.
	// We also do not want to push down, if the inner expression is a binary operation.
	var vectorAggrPushdown *syntax.VectorAggregationExpr
	_, innerIsBinOp := expr.Left.(*syntax.BinOpExpr)
	switch expr.Operation {
	case syntax.OpTypeCount, syntax.OpTypeTopK, syntax.OpTypeSort, syntax.OpTypeSortDesc:
		// these operations are never pushed down
	default:
		if !innerIsBinOp {
			vectorAggrPushdown = expr
		}
	}
	// Split the vector aggregation's inner expression
	lhsMapped, err := m.Map(expr.Left, vectorAggrPushdown, recorder)
	if err != nil {
		return nil, err
	}
	return &syntax.VectorAggregationExpr{
		Left:      lhsMapped,
		Grouping:  expr.Grouping,
		Params:    expr.Params,
		Operation: expr.Operation,
	}, nil
}
// mapRangeAggregationExpr maps expr into a new SampleExpr with multiple downstream subqueries split by range interval
// Optimization: in order to reduce the returned stream from the inner downstream functions, in case a range aggregation
// expression is aggregated by a vector aggregation expression with a label grouping, the downstream expression can be
// exactly the same as the initial query concatenated by a `sum` operation. If this is the case, overrideDownstream
// contains the initial query which will be the downstream expression with a split range interval.
// Example: `sum by (a) (bytes_over_time)`
// Is mapped to `sum by (a) (sum without downstream<sum by (a) (bytes_over_time)>++downstream<sum by (a) (bytes_over_time)>++...)`
func (m RangeMapper) mapRangeAggregationExpr(expr *syntax.RangeAggregationExpr, vectorAggrPushdown *syntax.VectorAggregationExpr, recorder *downstreamRecorder) syntax.SampleExpr {
	rangeInterval := getRangeInterval(expr)
	// in case the interval is smaller than the configured split interval,
	// don't split it.
	if rangeInterval <= m.splitByInterval {
		return expr
	}
	labelExtractor := hasLabelExtractionStage(expr)
	// Downstream queries with label extractors can potentially produce a huge amount of series
	// which can impact the queries and consequently fail.
	// Note: vector aggregation expressions aggregate the result in a single empty label set,
	// so these expressions can be pushed downstream
	if expr.Grouping == nil && vectorAggrPushdown == nil && labelExtractor {
		return expr
	}
	switch expr.Operation {
	case syntax.OpRangeTypeSum:
		return m.vectorAggrWithRangeDownstreams(expr, vectorAggrPushdown, syntax.OpTypeSum, rangeInterval, recorder)
	case syntax.OpRangeTypeBytes, syntax.OpRangeTypeCount:
		// Downstream queries with label extractors use concat as aggregation operator instead of sum
		// in order to merge the resultant label sets
		if labelExtractor {
			var downstream syntax.SampleExpr = expr
			if vectorAggrPushdown != nil {
				downstream = vectorAggrPushdown
			}
			return m.mapConcatSampleExpr(downstream, rangeInterval, recorder)
		}
		return m.vectorAggrWithRangeDownstreams(expr, vectorAggrPushdown, syntax.OpTypeSum, rangeInterval, recorder)
	case syntax.OpRangeTypeMax:
		return m.vectorAggrWithRangeDownstreams(expr, vectorAggrPushdown, syntax.OpTypeMax, rangeInterval, recorder)
	case syntax.OpRangeTypeMin:
		return m.vectorAggrWithRangeDownstreams(expr, vectorAggrPushdown, syntax.OpTypeMin, rangeInterval, recorder)
	case syntax.OpRangeTypeRate:
		// NOTE(review): this dereferences vectorAggrPushdown when labelExtractor is
		// true; that presumes the guard above already returned for the
		// (Grouping == nil && vectorAggrPushdown == nil && labelExtractor) case —
		// confirm rate expressions can never carry a grouping here.
		if labelExtractor && vectorAggrPushdown.Operation != syntax.OpTypeSum {
			return expr
		}
		// rate({app="foo"}[2m]) =>
		// => (sum without (count_over_time({app="foo"}[1m]) ++ count_over_time({app="foo"}[1m]) offset 1m) / 120)
		op := syntax.OpRangeTypeCount
		if expr.Left.Unwrap != nil {
			// rate({app="foo"} | unwrap bar [2m])
			// => (sum without (sum_over_time({app="foo"}[1m]) ++ sum_over_time({app="foo"}[1m]) offset 1m) / 120)
			op = syntax.OpRangeTypeSum
		}
		return m.sumOverFullRange(expr, vectorAggrPushdown, op, rangeInterval, recorder)
	case syntax.OpRangeTypeBytesRate:
		if labelExtractor && vectorAggrPushdown.Operation != syntax.OpTypeSum {
			return expr
		}
		return m.sumOverFullRange(expr, vectorAggrPushdown, syntax.OpRangeTypeBytes, rangeInterval, recorder)
	default:
		// this should not be reachable.
		// If an operation is splittable it should have an optimization listed.
		level.Warn(util_log.Logger).Log(
			"msg", "unexpected range aggregation expression",
			"operation", expr.Operation,
		)
		return expr
	}
}
// isSplittableByRange returns whether it is possible to optimize the given
// sample expression.
// A vector aggregation is splittable, if the aggregation operation is
// supported and the inner expression is also splittable.
// A range aggregation is splittable, if the aggregation operation is
// supported.
// A binary expression is splittable, if both the left and the right-hand side
// are splittable.
func isSplittableByRange(expr syntax.SampleExpr) bool {
	switch e := expr.(type) {
	case *syntax.VectorAggregationExpr:
		_, ok := splittableVectorOp[e.Operation]
		return ok && isSplittableByRange(e.Left)
	case *syntax.RangeAggregationExpr:
		_, ok := splittableRangeVectorOp[e.Operation]
		return ok
	case *syntax.BinOpExpr:
		_, literalLHS := e.SampleExpr.(*syntax.LiteralExpr)
		_, literalRHS := e.RHS.(*syntax.LiteralExpr)
		// Note: if both left-hand side and right-hand side are literal expressions,
		// the syntax.ParseSampleExpr returns a literal expression
		// Bug fix: the previous, unparenthesized form parsed as
		// `lhs || (literalLHS && rhs) || literalRHS` because && binds tighter
		// than || in Go, incorrectly reporting a binop as splittable when only
		// its left-hand side was. Both sides must be splittable (or a literal).
		return (isSplittableByRange(e.SampleExpr) || literalLHS) && (isSplittableByRange(e.RHS) || literalRHS)
	case *syntax.LabelReplaceExpr:
		// label_replace is splittable iff its inner expression is.
		return isSplittableByRange(e.Left)
	case *syntax.VectorExpr:
		return false
	default:
		return false
	}
}