package executor

import (
	"fmt"
	"slices"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"

	"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
	"github.com/grafana/loki/v3/pkg/engine/internal/semconv"
	"github.com/grafana/loki/v3/pkg/engine/internal/types"
)

// collectGroupingColumns returns the string arrays and matching schema fields
// that act as grouping keys for the given record.
//
// It dispatches on grouping.Without: when set, the keys are derived by
// collectWithoutGroupingColumns (which uses identCache); otherwise they are
// derived by collectByGroupingColumns (which uses evaluator).
func collectGroupingColumns(record arrow.RecordBatch, grouping physical.Grouping, evaluator *expressionEvaluator, identCache *semconv.IdentifierCache) ([]*array.String, []arrow.Field, error) {
	if !grouping.Without {
		return collectByGroupingColumns(record, grouping, evaluator)
	}
	return collectWithoutGroupingColumns(record, grouping, identCache)
}

// collectByGroupingColumns evaluates every expression in grouping.Columns
// against record and returns one string array and one field per expression,
// in the order the expressions are listed.
//
// An error is returned when evaluation fails, when an evaluated array does not
// have the Arrow string type, or when an expression is not a
// [physical.ColumnExpr].
func collectByGroupingColumns(record arrow.RecordBatch, grouping physical.Grouping, evaluator *expressionEvaluator) ([]*array.String, []arrow.Field, error) {
	count := len(grouping.Columns)
	keyArrays := make([]*array.String, 0, count)
	keyFields := make([]arrow.Field, 0, count)

	for _, expr := range grouping.Columns {
		evaluated, err := evaluator.evalForGrouping(expr, record)
		if err != nil {
			return nil, nil, err
		}

		// Only string-typed arrays can be used as grouping keys.
		if dt := evaluated.DataType(); dt.ID() != types.Arrow.String.ID() {
			return nil, nil, fmt.Errorf("unsupported datatype for grouping %s", dt)
		}
		keyArrays = append(keyArrays, evaluated.(*array.String))

		// The output field is rebuilt from the column reference, so the
		// expression has to be a plain column expression.
		column, isColumn := expr.(*physical.ColumnExpr)
		if !isColumn {
			return nil, nil, fmt.Errorf("invalid column expression type %T", expr)
		}

		keyIdent := semconv.NewIdentifier(column.Ref.Column, column.Ref.Type, types.Loki.String)
		keyFields = append(keyFields, semconv.FieldFromIdent(keyIdent, true))
	}

	return keyArrays, keyFields, nil
}

// collectWithoutGroupingColumns collects columns from the input record excluding
// those that match the grouping expressions.
//
// The returned fields & arrays are sorted in the order of their column names.
// Sorting is necessary to ensure that the grouping keys are in the same order
// irrespective of the order of columns in the input record.
//
// And columns with the same short name are coalesced into a single array.
// Without this, columns with same short name but different [types.ColumnType] // would be treated as separate grouping keys, which is not the intended behavior. func collectWithoutGroupingColumns(record arrow.RecordBatch, grouping physical.Grouping, identCache *semconv.IdentifierCache) ([]*array.String, []arrow.Field, error) { shortNames := make([]string, 0) columns := make(map[string][]*columnWithType) for i, field := range record.Schema().Fields() { ident, err := identCache.ParseFQN(field.Name) if err != nil { return nil, nil, err } if !isGroupingCandidate(ident.ColumnType()) { continue } match, err := identMatchesGrouping(grouping.Columns, ident) if err != nil { return nil, nil, err } // exclude columns that match `without` grouping keys if match { continue } arr, ok := record.Column(i).(*array.String) if !ok { return nil, nil, fmt.Errorf("unsupported datatype for grouping %s", record.Column(i).DataType()) } shortName := ident.ShortName() if _, exists := columns[shortName]; !exists { shortNames = append(shortNames, shortName) } columns[shortName] = append(columns[shortName], &columnWithType{ col: arr, ct: ident.ColumnType(), }) } // sort names to ensure deterministic order of grouping keys. // input records may have columns in any order. slices.Sort(shortNames) arrays := make([]*array.String, 0, len(shortNames)) fields := make([]arrow.Field, 0, len(shortNames)) for _, shortName := range shortNames { cols := columns[shortName] var arr arrow.Array if len(cols) == 1 { arr = cols[0].col } else { arr = NewCoalesce(cols) } arrays = append(arrays, arr.(*array.String)) // always set to ambiguous type. // Imagine two records with the same short name but different types. // Record 1 has `utf8.label.env` // Record 2 has `utf8.metadata.env` // // If the original type is preserved, aggregation will treat them // as different groups, which is not the intended behavior. 
// // TODO: aggregator.go should be updated to use short names // instead of full identifiers for grouping keys. ident := semconv.NewIdentifier(shortName, types.ColumnTypeAmbiguous, types.Loki.String) fields = append(fields, semconv.FieldFromIdent(ident, true)) } return arrays, fields, nil } func isGroupingCandidate(columnType types.ColumnType) bool { return columnType == types.ColumnTypeLabel || columnType == types.ColumnTypeMetadata || columnType == types.ColumnTypeParsed || // aggregation node downstream of another aggregation node may // receive grouping keys with ambiguous type. columnType == types.ColumnTypeAmbiguous } func identMatchesGrouping(grouping []physical.ColumnExpression, ident *semconv.Identifier) (bool, error) { for _, g := range grouping { colExpr, ok := g.(*physical.ColumnExpr) if !ok { return false, fmt.Errorf("unknown column expression %v", g) } // Match ambiguous columns only by name. if colExpr.Ref.Type == types.ColumnTypeAmbiguous && colExpr.Ref.Column == ident.ShortName() { return true, nil } // Match all other columns by name and type. if colExpr.Ref.Column == ident.ShortName() && colExpr.Ref.Type == ident.ColumnType() { return true, nil } } return false, nil }