Like Prometheus, but for logs.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 
loki/pkg/engine/internal/workflow/task_summary.go

222 lines
7.6 KiB

package workflow
import (
"fmt"
"time"
"github.com/go-kit/log/level"
"github.com/grafana/loki/v3/pkg/dataobj"
"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
"github.com/grafana/loki/v3/pkg/engine/internal/scheduler/schedulerstat"
"github.com/grafana/loki/v3/pkg/engine/internal/worker/workerstat"
"github.com/grafana/loki/v3/pkg/xcap"
)
// printTaskSummary writes one info-level "task-summary" log line describing
// task, built from the statistics in newStatus.Capture.
//
// oldState is the state the task was in before the transition to
// newStatus.State; it is only used to derive the cancellation phase. Nothing
// is logged when newStatus carries no capture.
func (wf *Workflow) printTaskSummary(task *Task, oldState TaskState, newStatus TaskStatus) {
	stats := newStatus.Capture
	if stats == nil {
		// Terminal notifications are expected to carry the per-task capture.
		// Without one every statistic below would be empty, so skip the line
		// entirely.
		return
	}

	// Scheduler-side phases of the task's lifetime.
	total := time.Duration(xcap.Value[int64](stats, schedulerstat.TaskTotalDuration))
	staging := time.Duration(xcap.Value[int64](stats, schedulerstat.TaskStagingDuration))
	queued := time.Duration(xcap.Value[int64](stats, schedulerstat.TaskQueueDuration))
	execution := time.Duration(xcap.Value[int64](stats, schedulerstat.TaskExecutionDuration))
	// Remainder of the total that is not attributed to a named phase.
	unaccounted := total - staging - queued - execution

	// Worker-side breakdown of the execution phase.
	execOpen := time.Duration(xcap.Value[int64](stats, workerstat.TaskExecutionOpenDuration))
	execRead := time.Duration(xcap.Value[int64](stats, workerstat.TaskExecutionReadDuration))
	execReadRecv := time.Duration(xcap.Value[int64](stats, workerstat.TaskExecutionReadRecvDuration))
	execSend := time.Duration(xcap.Value[int64](stats, workerstat.TaskExecutionSendDuration))
	// NOTE(review): the read-recv duration is not subtracted here; presumably
	// it is already contained in the read duration - confirm.
	execOther := execution - execOpen - execRead - execSend

	// Dataset counters: each is the sum of the primary and secondary
	// statistic of the same kind.
	pages := xcap.Value[int64](stats, xcap.StatDatasetPrimaryPagesDownloaded) +
		xcap.Value[int64](stats, xcap.StatDatasetSecondaryPagesDownloaded)
	pageBytes := xcap.Value[int64](stats, xcap.StatDatasetPrimaryColumnBytes) +
		xcap.Value[int64](stats, xcap.StatDatasetSecondaryColumnBytes)
	rowBytes := xcap.Value[int64](stats, xcap.StatDatasetPrimaryRowBytes) +
		xcap.Value[int64](stats, xcap.StatDatasetSecondaryRowBytes)
	rows := xcap.Value[int64](stats, xcap.StatDatasetPrimaryRowsRead) +
		xcap.Value[int64](stats, xcap.StatDatasetSecondaryRowsRead)

	level.Info(wf.logger).Log(
		"msg", "task-summary",

		// Who the task is.
		"task_id", task.ULID,
		"query_id", wf.opts.ID,
		"parent_task_id", wf.parentTaskID(task),
		"task_type", taskTypeName(task),
		"operator_type", taskOperatorType(task),

		// How it ended.
		"status", taskStatusName(newStatus.State),
		"cancellation_phase", cancellationPhaseName(oldState, newStatus.State),
		"error", newStatus.Error,

		// Lifetime phases, in milliseconds.
		"duration_ms", total.Milliseconds(),
		"duration_staging_ms", staging.Milliseconds(),
		"duration_queue_ms", queued.Milliseconds(),
		"duration_execution_ms", execution.Milliseconds(),
		"duration_other_ms", unaccounted.Milliseconds(),

		// Execution phase breakdown, in milliseconds.
		"duration_execution_open_ms", execOpen.Milliseconds(),
		"duration_execution_read_ms", execRead.Milliseconds(),
		"duration_execution_read_recv_ms", execReadRecv.Milliseconds(),
		"duration_execution_send_ms", execSend.Milliseconds(),
		"duration_execution_other_ms", execOther.Milliseconds(),

		// Stage 9 (leaf) counters.
		"pages_total", xcap.Value[int64](stats, dataobj.StatDatasetPagesTotal),
		"pages_pruned", xcap.Value[int64](stats, dataobj.StatDatasetPagesPruned),
		"pages_downloaded", pages, // Pages downloaded by readerDownloader
		"pages_bytes_downloaded", pageBytes, // Bytes downloaded for pages by readerDownloader
		"total_bytes_downloaded", xcap.Value[int64](stats, dataobj.StatObjectBytesDownloaded),

		// Stage 10 counters.
		"batches_emitted", xcap.Value[int64](stats, xcap.TaskRecordsSent),
		"batches_consumed", xcap.Value[int64](stats, xcap.TaskDrainRecordsReceived),
		"bytes_processed", rowBytes, // TODO(rfratto): missing semantics for non-leaf nodes
		"lines_processed", rows, // TODO(rfratto): missing semantics for non-leaf nodes
		"lines_emitted", xcap.Value[int64](stats, xcap.TaskRowsSent),

		// Task result cache.
		"cache_check", taskResultCacheOutcome(stats),
	)
}
// parentTaskID looks up task's parents in the workflow graph (under the
// tasks read lock) and returns the ULID of the first one. It returns nil
// when the graph reports no parents for task.
//
// Only the first parent is considered: the workflow planner is assumed to
// give every task at most one parent. Should multi-parent task graphs ever be
// introduced, both this helper and the per-task log schema need revisiting.
func (wf *Workflow) parentTaskID(task *Task) any {
	wf.tasksMut.RLock()
	ancestors := wf.graph.Parents(task)
	wf.tasksMut.RUnlock()

	if len(ancestors) > 0 {
		return ancestors[0].ULID
	}
	return nil
}
// taskTypeName classifies a task by whether it has any sources: a task with
// at least one entry in task.Sources is "non-leaf"; a task with none is
// "leaf".
func taskTypeName(task *Task) string {
	if len(task.Sources) > 0 {
		return "non-leaf"
	}
	return "leaf"
}
// taskOperatorType returns the type name of the task's root operator, after
// helper nodes have been stripped by unwrapPhysicalNode.
//
// Two fallback values exist:
//   - "" when the task has no fragment at all.
//   - "none" when the fragment's root cannot be determined, or when
//     unwrapping the root fails.
func taskOperatorType(task *Task) string {
	const unresolved = "none"

	fragment := task.Fragment
	if fragment == nil {
		return ""
	}

	node, err := fragment.Root()
	if err == nil {
		// Only attempt to unwrap once a root has actually been found.
		node, err = unwrapPhysicalNode(fragment, node)
	}
	if err != nil {
		return unresolved
	}
	return node.Type().String()
}
// unwrapPhysicalNode descends from root through any chain of "helper" nodes
// (*physical.Cache and *physical.Batching) and returns the first node that is
// not one of them. A root that is not a helper is returned unchanged.
//
// Each helper on the way must have exactly one child in plan; otherwise the
// error from getRootChild is returned together with a nil node.
func unwrapPhysicalNode(plan *physical.Plan, root physical.Node) (physical.Node, error) {
	// TODO(rfratto): this should really be behaviour defined in the physical
	// package.
	current := root
	for {
		// Label used in error messages for the helper being unwrapped.
		var helper string
		switch current.(type) {
		case *physical.Cache:
			helper = "caching"
		case *physical.Batching:
			helper = "batching"
		default:
			// Not a helper: this is the node we were looking for.
			return current, nil
		}

		child, err := getRootChild(helper, plan.Children(current))
		if err != nil {
			return nil, err
		}
		current = child
	}
}
// getRootChild returns the only element of children. It returns an error
// when children is empty or holds more than one node; parent is the label
// used to name the offending node in that error.
func getRootChild(parent string, children []physical.Node) (physical.Node, error) {
	count := len(children)
	if count == 0 {
		return nil, fmt.Errorf("%s node has no children", parent)
	}
	if count > 1 {
		return nil, fmt.Errorf("%s node has multiple children", parent)
	}
	return children[0], nil
}
// taskStatusName converts a [TaskState] into the status string used in the
// task summary: "success" for completed, "fail" for failed and "cancel" for
// cancelled tasks. Any other state is reported using its own String form.
func taskStatusName(state TaskState) string {
	var name string
	switch state {
	case TaskStateCompleted:
		name = "success"
	case TaskStateFailed:
		name = "fail"
	case TaskStateCancelled:
		name = "cancel"
	default:
		name = state.String()
	}
	return name
}
// cancellationPhaseName describes at which point a task was cancelled, based
// on the state it was in (oldState) before it moved to newState.
//
// It returns nil unless newState is [TaskStateCancelled]. For a cancelled
// task the result is:
//
//   - "pre_assignment" when oldState was Created or Pending.
//   - "during_execution" when oldState was Running.
//   - nil for any other oldState.
func cancellationPhaseName(oldState, newState TaskState) any {
	switch {
	case newState != TaskStateCancelled:
		return nil
	case oldState == TaskStateCreated || oldState == TaskStatePending:
		return "pre_assignment"
	case oldState == TaskStateRunning:
		return "during_execution"
	}
	return nil
}
// taskResultCacheOutcome summarises the task result cache statistics found
// in capture as a single label: "hit" when the hit counter is positive,
// otherwise "miss" when the miss counter is positive, otherwise "n/a".
func taskResultCacheOutcome(capture *xcap.Capture) string {
	// The second result of TryValue is deliberately dropped: only a positive
	// counter matters here, so anything else falls through to "n/a".
	hitCount, _ := xcap.TryValue[int64](capture, xcap.TaskCacheHits)
	missCount, _ := xcap.TryValue[int64](capture, xcap.TaskCacheMisses)

	if hitCount > 0 {
		return "hit"
	}
	if missCount > 0 {
		return "miss"
	}
	return "n/a"
}