Like Prometheus, but for logs.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 
loki/pkg/engine/internal/executor/compat.go

206 lines
6.7 KiB

package executor
import (
"cmp"
"context"
"slices"
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
"github.com/grafana/loki/v3/pkg/engine/internal/semconv"
)
// newColumnCompatibilityPipeline returns a Pipeline that reads records from
// input and resolves name collisions between columns of type compat.Source
// and columns of type compat.Collision.
//
// Two columns collide when their short names are equal. For every colliding
// pair a new column named "<short name>_extracted" with column type
// compat.Destination is appended to the schema. Per row:
//
//   - source is NULL: both the source and the _extracted column get NULL;
//   - source is set, collision is NULL: the source value stays where it is
//     and the _extracted column gets NULL;
//   - both are set: the source value moves to the _extracted column and the
//     source column gets NULL.
//
// Records with zero rows, and records without colliding names, are passed
// through unchanged. Only string source columns are supported; a colliding
// source column of any other type causes a panic.
func newColumnCompatibilityPipeline(compat *physical.ColumnCompat, input Pipeline) Pipeline {
	const extracted = "_extracted"

	read := func(ctx context.Context, inputs []Pipeline) (arrow.Record, error) {
		rec, err := inputs[0].Read(ctx)
		if err != nil {
			return nil, err
		}
		defer rec.Release()

		// Nothing to rewrite in an empty record, even if names would collide.
		// The extra Retain balances the deferred Release above.
		if rec.NumRows() == 0 {
			rec.Retain()
			return rec, nil
		}
		numRows := int(rec.NumRows())

		// Step 1: collect index and short name of every collision-type and
		// source-type field in the record's schema.
		var (
			collisionIdxs, sourceIdxs   []int
			collisionNames, sourceNames []string
		)
		schema := rec.Schema()
		for fieldIdx := range schema.NumFields() {
			ident, err := semconv.ParseFQN(schema.Field(fieldIdx).Name)
			if err != nil {
				return nil, err
			}
			switch ident.ColumnType() {
			case compat.Collision:
				collisionIdxs = append(collisionIdxs, fieldIdx)
				collisionNames = append(collisionNames, ident.ShortName())
			case compat.Source:
				sourceIdxs = append(sourceIdxs, fieldIdx)
				sourceNames = append(sourceNames, ident.ShortName())
			}
		}

		// No shared short names means there is nothing to rewrite.
		// The extra Retain balances the deferred Release above.
		collisions := findDuplicates(collisionNames, sourceNames)
		if len(collisions) == 0 {
			rec.Retain()
			return rec, nil
		}

		// Step 2: extend the schema with one _extracted field per collision.
		// New fields are appended, so the n-th collision ends up at index
		// firstNewIdx+n.
		extended := schema
		pairs := make([]duplicateColumn, 0, len(collisions))
		firstNewIdx := int(rec.NumCols())
		for n, c := range collisions {
			srcIdx := sourceIdxs[c.s2Idx]
			srcIdent, err := semconv.ParseFQN(extended.Field(srcIdx).Name)
			if err != nil {
				return nil, err
			}

			destIdent := semconv.NewIdentifier(srcIdent.ShortName()+extracted, compat.Destination, srcIdent.DataType())
			extended, err = extended.AddField(extended.NumFields(), semconv.FieldFromIdent(destIdent, true))
			if err != nil {
				return nil, err
			}

			pairs = append(pairs, duplicateColumn{
				name:           c.value,
				collisionIdx:   collisionIdxs[c.s1Idx],
				sourceIdx:      srcIdx,
				destinationIdx: firstNewIdx + n,
			})
		}

		// Step 3: assemble the columns of the output record. The field
		// builders are only used for colliding source columns and their
		// _extracted counterparts; every other column is reused as is.
		builder := array.NewRecordBuilder(memory.DefaultAllocator, extended)
		builder.Reserve(numRows)
		defer builder.Release()

		columns := make([]arrow.Array, extended.NumFields())
		for colIdx := range schema.NumFields() {
			col := rec.Column(colIdx)

			pairIdx := slices.IndexFunc(pairs, func(p duplicateColumn) bool { return p.sourceIdx == colIdx })
			if pairIdx < 0 {
				// Not a colliding source column: carry it over untouched.
				columns[colIdx] = col
				continue
			}
			pair := pairs[pairIdx]
			collisionCol := rec.Column(pair.collisionIdx)

			srcBuilder, ok := builder.Field(colIdx).(*array.StringBuilder)
			if !ok {
				panic("invalid source column type: only string columns can be checked for collisions")
			}
			dstBuilder := builder.Field(pair.destinationIdx).(*array.StringBuilder)

			for row := range numRows {
				srcMissing := col.IsNull(row) || !col.IsValid(row)
				collisionMissing := collisionCol.IsNull(row) || !collisionCol.IsValid(row)

				switch {
				case srcMissing:
					// No source value: NULL in both columns.
					srcBuilder.AppendNull()
					dstBuilder.AppendNull()
				case collisionMissing:
					// No conflicting value in this row: the source value stays put.
					srcBuilder.Append(col.(*array.String).Value(row))
					dstBuilder.AppendNull()
				default:
					// Both set: move the source value to the _extracted column.
					srcBuilder.AppendNull()
					dstBuilder.Append(col.(*array.String).Value(row))
				}
			}

			// The arrays built here are released when this function returns,
			// i.e. after array.NewRecord below has received them.
			rewrittenSrc := srcBuilder.NewArray()
			defer rewrittenSrc.Release()
			columns[pair.sourceIdx] = rewrittenSrc

			extractedCol := dstBuilder.NewArray()
			defer extractedCol.Release()
			columns[pair.destinationIdx] = extractedCol
		}

		return array.NewRecord(extended, columns, rec.NumRows()), nil
	}

	return newGenericPipeline(read, input)
}
// duplicate describes a string value that is present in two slices, together
// with the position of that value in each of them.
type duplicate struct {
	// value is the string found in both slices.
	value string
	// s1Idx is the index of value in the first slice.
	s1Idx int
	// s2Idx is the index of value in the second slice.
	s2Idx int
}
// findDuplicates reports every string that is present in both s1 and s2,
// along with the index of that string in each slice. The result is ordered by
// string value. It returns nil when either slice is empty or when the slices
// share no value.
//
// Elements within each slice are expected to be unique. If a value repeats
// within a slice, the index of its last occurrence is the one reported.
func findDuplicates(s1, s2 []string) []duplicate {
	if len(s1) == 0 || len(s2) == 0 {
		return nil
	}

	// positions maps every string in items to the index it occupies.
	positions := func(items []string) map[string]int {
		byValue := make(map[string]int)
		for pos, item := range items {
			byValue[item] = pos
		}
		return byValue
	}
	first, second := positions(s1), positions(s2)

	var shared []duplicate
	for name, idx1 := range first {
		idx2, ok := second[name]
		if !ok {
			continue
		}
		shared = append(shared, duplicate{value: name, s1Idx: idx1, s2Idx: idx2})
	}

	// Map iteration order is not deterministic, so order the result by value.
	slices.SortStableFunc(shared, func(x, y duplicate) int {
		return cmp.Compare(x.value, y.value)
	})
	return shared
}
// duplicateColumn ties together the field/column indexes in an
// [*arrow.Schema] that belong to a single name collision: the collision
// column, the source column sharing its name, and the destination column.
type duplicateColumn struct {
	name           string // name is the duplicate column name
	collisionIdx   int    // index of the collision column
	sourceIdx      int    // index of the source column
	destinationIdx int    // index of the destination column
}