loki/pkg/engine/internal/executor/parse.go

package executor

import (
	"context"
	"fmt"
	"sort"
	"unsafe"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"

	"github.com/grafana/loki/v3/pkg/engine/internal/datatype"
	"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
	"github.com/grafana/loki/v3/pkg/engine/internal/types"
)
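
// NewParsePipeline returns a pipeline stage that parses the builtin message
// column of each input batch with the configured parser (logfmt or JSON) and
// appends the extracted key-value pairs as nullable string columns tagged as
// parsed columns.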
func NewParsePipeline(parse *physical.ParseNode, input Pipeline, allocator memory.Allocator) *GenericPipeline {
	return newGenericPipeline(Local, func(ctx context.Context, inputs []Pipeline) state {
		// Pull the next item from the input pipeline
		input := inputs[0]
		batch, err := input.Read(ctx)
		if err != nil {
			return failureState(err)
		}
		// Batch needs to be released here since it won't be passed to the caller
		// and won't be reused after this call to newGenericPipeline.
		defer batch.Release()

		// Find the message column
		messageColIdx := -1
		schema := batch.Schema()
		for i := 0; i < schema.NumFields(); i++ {
			if schema.Field(i).Name == types.ColumnNameBuiltinMessage {
				messageColIdx = i
				break
			}
		}
		if messageColIdx == -1 {
			return failureState(fmt.Errorf("message column not found"))
		}

		messageCol := batch.Column(messageColIdx)
		stringCol, ok := messageCol.(*array.String)
		if !ok {
			return failureState(fmt.Errorf("message column is not a string column"))
		}

		var headers []string
		var parsedColumns []arrow.Array
		switch parse.Kind {
		case physical.ParserLogfmt:
			headers, parsedColumns = buildLogfmtColumns(stringCol, parse.RequestedKeys, allocator)
		case physical.ParserJSON:
			headers, parsedColumns = buildJSONColumns(stringCol, parse.RequestedKeys, allocator)
		default:
			return failureState(fmt.Errorf("unsupported parser kind: %v", parse.Kind))
		}

		// Build new schema with original fields plus parsed fields
		newFields := make([]arrow.Field, 0, schema.NumFields()+len(headers))
		for i := 0; i < schema.NumFields(); i++ {
			newFields = append(newFields, schema.Field(i))
		}
		for _, header := range headers {
			// Add metadata to mark these as parsed columns
			metadata := datatype.ColumnMetadata(
				types.ColumnTypeParsed,
				datatype.Loki.String,
			)
			newFields = append(newFields, arrow.Field{
				Name:     header,
				Type:     arrow.BinaryTypes.String,
				Metadata: metadata,
				Nullable: true,
			})
		}
		newSchema := arrow.NewSchema(newFields, nil)

		// Build new record with all columns
		numOriginalCols := int(batch.NumCols())
		numParsedCols := len(parsedColumns)
		allColumns := make([]arrow.Array, numOriginalCols+numParsedCols)

		// Copy original columns
		for i := range numOriginalCols {
			col := batch.Column(i)
			col.Retain() // Retain since we're releasing the batch
			allColumns[i] = col
		}

		// Add parsed columns
		for i, col := range parsedColumns {
			// Defensive check added for clarity and safety; buildLogfmtColumns and
			// buildJSONColumns should already guarantee matching lengths.
			if col.Len() != stringCol.Len() {
				// Release everything retained or created so far before failing,
				// otherwise the retained originals and the parsed arrays would leak.
				for _, c := range allColumns[:numOriginalCols] {
					c.Release()
				}
				for _, c := range parsedColumns {
					c.Release()
				}
				return failureState(fmt.Errorf("parsed column %d (%s) has %d rows but expected %d",
					i, headers[i], col.Len(), stringCol.Len()))
			}
			allColumns[numOriginalCols+i] = col
		}

		// Create the new record
		newRecord := array.NewRecord(newSchema, allColumns, int64(stringCol.Len()))

		// Release the columns we retained/created
		for _, col := range allColumns {
			col.Release()
		}
		return successState(newRecord)
	}, input)
}
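
// A minimal usage sketch for the stage above, assuming an upstream pipeline
// `input` and a logfmt ParseNode; the variable names are illustrative, not
// part of this package:
//
//	node := &physical.ParseNode{Kind: physical.ParserLogfmt}
//	pipeline := NewParsePipeline(node, input, memory.DefaultAllocator)
//	batch, err := pipeline.Read(ctx)
//	// ... use batch, then release it when done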
// parseFunc represents a function that parses a single line and returns key-value pairs
type parseFunc func(line string, requestedKeys []string) (map[string]string, error)

// buildColumns builds Arrow columns from input lines using the provided parser.
// It returns the column headers and the corresponding Arrow columns; parse
// failures are surfaced through generated error columns rather than an error
// return value.
func buildColumns(input *array.String, requestedKeys []string, allocator memory.Allocator, parseFunc parseFunc, errorType string) ([]string, []arrow.Array) {
	columnBuilders := make(map[string]*array.StringBuilder)
	columnOrder := parseLines(input, requestedKeys, columnBuilders, allocator, parseFunc, errorType)

	// Build final arrays
	columns := make([]arrow.Array, 0, len(columnOrder))
	headers := make([]string, 0, len(columnOrder))
	for _, key := range columnOrder {
		builder := columnBuilders[key]
		columns = append(columns, builder.NewArray())
		headers = append(headers, key)
		builder.Release()
	}
	return headers, columns
}
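
// A sketch of how a concrete entry point such as buildLogfmtColumns (defined
// elsewhere in the package) is presumably wired through buildColumns; the
// parseLogfmtLine function and the "logfmt" error-type string are assumptions
// for illustration, not confirmed names:
//
//	func buildLogfmtColumns(input *array.String, requestedKeys []string, allocator memory.Allocator) ([]string, []arrow.Array) {
//		return buildColumns(input, requestedKeys, allocator, parseLogfmtLine, "logfmt")
//	}
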
// parseLines discovers columns dynamically as lines are parsed
func parseLines(input *array.String, requestedKeys []string, columnBuilders map[string]*array.StringBuilder, allocator memory.Allocator, parseFunc parseFunc, errorType string) []string {
	columnOrder := []string{}
	var errorBuilder, errorDetailsBuilder *array.StringBuilder
	hasErrorColumns := false

	for i := 0; i < input.Len(); i++ {
		line := input.Value(i)
		parsed, err := parseFunc(line, requestedKeys)

		// Handle error columns
		if err != nil {
			// Create error columns on first error
			if !hasErrorColumns {
				errorBuilder = array.NewStringBuilder(allocator)
				errorDetailsBuilder = array.NewStringBuilder(allocator)
				columnBuilders[types.ColumnNameParsedError] = errorBuilder
				columnBuilders[types.ColumnNameParsedErrorDetails] = errorDetailsBuilder
				columnOrder = append(columnOrder, types.ColumnNameParsedError, types.ColumnNameParsedErrorDetails)
				hasErrorColumns = true
				// Backfill NULLs for previous rows
				for j := 0; j < i; j++ {
					errorBuilder.AppendNull()
					errorDetailsBuilder.AppendNull()
				}
			}
			// Append error values
			errorBuilder.Append(errorType)
			errorDetailsBuilder.Append(err.Error())
			// When there's an error, don't create new columns for the failed parse.
			// Only add NULLs for columns that already exist.
		} else if hasErrorColumns {
			// No error on this row, but we have error columns
			errorBuilder.AppendNull()
			errorDetailsBuilder.AppendNull()
		}

		// Track which keys we've seen this row
		seenKeys := make(map[string]bool)
		if hasErrorColumns {
			// Mark error columns as seen so we don't append nulls for them
			seenKeys[types.ColumnNameParsedError] = true
			seenKeys[types.ColumnNameParsedErrorDetails] = true
		}

		// Add values for parsed keys (only if no error)
		if err == nil {
			for key, value := range parsed {
				seenKeys[key] = true
				builder, exists := columnBuilders[key]
				if !exists {
					// New column discovered - create and backfill
					builder = array.NewStringBuilder(allocator)
					columnBuilders[key] = builder
					columnOrder = append(columnOrder, key)
					// Backfill NULLs for previous rows
					builder.AppendNulls(i)
				}
				builder.Append(value)
			}
		}

		// For error cases, don't mark the failed keys as seen - let them get NULLs below.
		// Append NULLs for columns not in this row.
		for _, key := range columnOrder {
			if !seenKeys[key] {
				columnBuilders[key].AppendNull()
			}
		}
	}

	// Sort column order for consistency (excluding error columns)
	if hasErrorColumns {
		// Keep error columns at the end, sort the rest
		nonErrorColumns := make([]string, 0, len(columnOrder)-2)
		for _, key := range columnOrder {
			if key != types.ColumnNameParsedError && key != types.ColumnNameParsedErrorDetails {
				nonErrorColumns = append(nonErrorColumns, key)
			}
		}
		sort.Strings(nonErrorColumns)
		columnOrder = append(nonErrorColumns, types.ColumnNameParsedError, types.ColumnNameParsedErrorDetails)
	} else {
		sort.Strings(columnOrder)
	}
	return columnOrder
}
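
// To illustrate the discovery-and-backfill behavior above with a hypothetical
// pair of logfmt input lines:
//
//	level=info caller=main.go
//	level=warn status=500
//
// parseLines yields the sorted column order [caller, level, status], with NULL
// backfilled wherever a row lacks a key:
//
//	row  caller   level  status
//	0    main.go  info   NULL
//	1    NULL     warn   500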

// unsafeBytes converts a string to []byte without allocation. The returned
// slice aliases the string's memory and must not be mutated.
func unsafeBytes(s string) []byte {
	return unsafe.Slice(unsafe.StringData(s), len(s))
}

// unsafeString converts a []byte to string without allocation. The input slice
// must not be modified while the returned string is in use.
func unsafeString(b []byte) string {
	return unsafe.String(unsafe.SliceData(b), len(b))
}