loki/pkg/engine/internal/executor/util.go

package executor

import (
	"fmt"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/cespare/xxhash/v2"

	"github.com/grafana/loki/v3/pkg/engine/internal/semconv"
)

var (
	separator = []byte{0}
)

// columnForIdent returns the column ([arrow.Array]) and its column index in the schema of the given input batch ([arrow.RecordBatch]).
// It returns an optional error in case the column with the fully qualified name of the identifier could not be found,
// or there are where multiple columns with the same name in the schema.
// In case of an error, the returned column index is -1.
func columnForIdent(ident *semconv.Identifier, batch arrow.RecordBatch) (arrow.Array, int, error) {
	return columnForFQN(ident.FQN(), batch)
}

func columnForFQN(fqn string, batch arrow.RecordBatch) (arrow.Array, int, error) {
	indices := batch.Schema().FieldIndices(fqn)
	if len(indices) == 0 {
		return nil, -1, fmt.Errorf("column not found for %s", fqn)
	}
	if len(indices) > 1 {
		return nil, -1, fmt.Errorf("multiple columns found for %s", fqn)
	}

	return batch.Column(indices[0]), indices[0], nil
}

// labelValuesCache returns label values for a given row in range and vector aggregators, but cache them in order
// to reduce object allocations for repeated label sets. It first scans the row for non-empty labels and computes xxhash.
// In case of a cache miss it scans the row again and allocates arrays for label values.
type labelValuesCache struct {
	digest *xxhash.Digest
	cache  map[uint64][]string
}

func newLabelValuesCache() *labelValuesCache {
	return &labelValuesCache{
		digest: xxhash.New(),
		cache:  make(map[uint64][]string),
	}
}

func (c *labelValuesCache) getLabelValues(arrays []*array.String, row int) []string {
	c.digest.Reset()
	for _, arr := range arrays {
		val := arr.Value(row)
		if val != "" {
			_, _ = c.digest.Write(separator)
			_, _ = c.digest.WriteString(val)
		}
	}
	key := c.digest.Sum64()

	labelValues, ok := c.cache[key]
	if !ok {
		labelValues = make([]string, 0, len(arrays))

		for _, arr := range arrays {
			val := arr.Value(row)
			if val != "" {
				labelValues = append(labelValues, val)
			}
		}
		c.cache[key] = labelValues
	}

	return labelValues
}

// fieldsCache returns labels for a given row in range and vector aggregators, but cache them in order
// to reduce object allocations for repeated label sets. It first scans the row for non-empty labels and computes xxhash.
// In case of a cache miss it scans the row again and allocates arrays for label names.
type fieldsCache struct {
	digest *xxhash.Digest
	cache  map[uint64][]arrow.Field
}

func newFieldsCache() *fieldsCache {
	return &fieldsCache{
		digest: xxhash.New(),
		cache:  make(map[uint64][]arrow.Field),
	}
}

func (c *fieldsCache) getFields(arrays []*array.String, fields []arrow.Field, row int) []arrow.Field {
	c.digest.Reset()
	for i, arr := range arrays {
		val := arr.Value(row)
		if val != "" {
			_, _ = c.digest.Write(separator)
			_, _ = c.digest.WriteString(fields[i].Name)
		}
	}
	key := c.digest.Sum64()

	labels, ok := c.cache[key]
	if !ok {
		labels = make([]arrow.Field, 0, len(arrays))
		for i, arr := range arrays {
			val := arr.Value(row)
			if val != "" {
				labels = append(labels, fields[i])
			}
		}
		c.cache[key] = labels
	}

	return labels
}