Like Prometheus, but for logs.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
loki/pkg/engine/internal/workflow/workflow.go

615 lines
19 KiB

// Package workflow defines how to represent physical plans as distributed
// workflows.
package workflow
import (
"context"
"fmt"
gotrace "runtime/trace"
"strconv"
"sync"
"time"
"github.com/apache/arrow-go/v18/arrow"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/oklog/ulid/v2"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"go.opentelemetry.io/otel"
"github.com/grafana/loki/v3/pkg/engine/internal/executor"
"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
"github.com/grafana/loki/v3/pkg/engine/internal/util/dag"
"github.com/grafana/loki/v3/pkg/logqlmodel/stats"
"github.com/grafana/loki/v3/pkg/xcap"
)
const (
eliminationReasonEmpty = "empty"
eliminationReasonNonEmpty = "non_empty"
)
var (
tracer = otel.Tracer("pkg/engine/internal/workflow")
shortCircuitsTotal = promauto.NewCounter(prometheus.CounterOpts{
Name: "loki_engine_v2_task_short_circuits_total",
Help: "Total number of tasks preemptively canceled by short circuiting.",
})
eliminatedCachedTasksTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "loki_engine_v2_task_cached_eliminated_total",
Help: "Total number of tasks eliminated before execution due to a cache hit.",
}, []string{"reason"})
)
// Options configures a [Workflow].
type Options struct {
ID ulid.ULID // Optional ID for the workflow. One will be generated if not provided.
Tenant string // Tenant ID associated with the workflow.
Actor []string // Optional path to the actor that is generating the workflow.
// MaxRunningScanTasks specifies the maximum number of scan tasks that may
// run concurrently within a single workflow. 0 means no limit.
MaxRunningScanTasks int
// MaxRunningOtherTasks specifies the maximum number of non-scan tasks that
// may run concurrently within a single workflow. 0 means no limit.
MaxRunningOtherTasks int
// MaxRunningCompactionTasks specifies the maximum number of compaction
// tasks that may run concurrently within a single workflow. 0 means no
// limit.
//
// The lane is dormant in query workflows (typeFor never classifies a
// query task as compaction); compactor workflows opt in by populating
// this field once the dataobj-compaction physical-plan node types and
// the corresponding typeFor classification land.
MaxRunningCompactionTasks int
// DebugTasks toggles debug messages for a task. This is very verbose and
// should only be enabled for debugging purposes.
//
// Regardless of the value of DebugTasks, workers still log when
// they start and finish assigned tasks.
DebugTasks bool
// DebugStreams toggles debug messages for data streams. This is very
// verbose and should only be enabled for debugging purposes.
DebugStreams bool
// CacheEnabled controls whether task fragments and DataObjScan nodes are
// wrapped with a Cache node during workflow planning.
CacheEnabled bool
// MaxTaskCacheSize is the maximum size in bytes of a task result that can be
// stored in the cache. 0 means only empty responses are cached.
MaxTaskCacheSize uint64
// MaxDataObjScanCacheSize is the maximum encoded size in bytes of a DataObjScan
// result that may be stored. 0 means only empty scan responses are cached.
MaxDataObjScanCacheSize uint64
// CacheCompression is the compression codec to use when encoding cache entries
// (e.g. "snappy"). An empty string means no compression.
CacheCompression string
// PruneEmptyCachedTasks controls whether tasks with a known-empty cached
// result are eliminated at plan time before any work is dispatched.
PruneEmptyCachedTasks bool
// NonEmptyCachedTasksMaxSize is the maximum total encoded size in bytes of
// non-empty cached task buffers that may be embedded in task assignments.
// Results that exceed the remaining budget are skipped; smaller results that
// still fit continue to be included. 0 disables non-empty task pruning entirely.
NonEmptyCachedTasksMaxSize uint64
// TaskCacheRegistry is the registry of cache backends used at plan time to
// prune tasks whose cached result is known to be empty.
TaskCacheRegistry executor.TaskCacheRegistry
// PruneCachedTasksFetchTimeout is the timeout applied to each cache Fetch
// call during task pruning at plan time. 0 means no timeout.
PruneCachedTasksFetchTimeout time.Duration
}
var _ fmt.Stringer = (*Workflow)(nil)
// Workflow represents a physical plan that has been partitioned into
// parallelizable tasks.
type Workflow struct {
opts Options
logger log.Logger
runner Runner
graph dag.Graph[*Task]
resultsStream *Stream
resultsPipeline *streamPipe
manifest *Manifest
statsMut sync.Mutex
stats stats.Result
captureMut sync.Mutex
// used to merge and link task regions
capture *xcap.Capture
parentRegion *xcap.Region
span *xcap.Span
tasksMut sync.RWMutex
taskStates map[*Task]TaskState
streamsMut sync.RWMutex
streamStates map[*Stream]StreamState
admissionControl *admissionControl
}
// New creates a new Workflow from a physical plan. New returns an error if the
// physical plan does not have exactly one root node, or if the physical plan
// cannot be partitioned into a Workflow.
//
// The provided Runner will be used for Workflow execution.
func New(ctx context.Context, opts Options, logger log.Logger, runner Runner, plan *physical.Plan) (*Workflow, error) {
graph, err := planWorkflow(ctx, opts.Tenant, plan, cacheParams{
enabled: opts.CacheEnabled,
taskCacheMaxSizeBytes: opts.MaxTaskCacheSize,
dataObjScanMaxSizeBytes: opts.MaxDataObjScanCacheSize,
compression: opts.CacheCompression,
registry: opts.TaskCacheRegistry,
pruneEmptyCachedTasks: opts.PruneEmptyCachedTasks,
nonEmptyCachedTasksMaxBytes: opts.NonEmptyCachedTasksMaxSize,
pruneFetchTimeout: opts.PruneCachedTasksFetchTimeout,
}, logger)
if err != nil {
return nil, err
}
// All tasks were eliminated at plan time — return an empty workflow whose
// Run() immediately yields EOF without dispatching any work.
if graph.Len() == 0 {
level.Debug(logger).Log("msg", "workflow plan is empty")
return &Workflow{
opts: opts,
logger: logger,
runner: runner,
taskStates: make(map[*Task]TaskState),
streamStates: make(map[*Stream]StreamState),
}, nil
}
// Inject a stream for final task results.
results, err := injectResultsStream(opts.Tenant, &graph)
if err != nil {
return nil, err
}
wf := &Workflow{
opts: opts,
logger: logger,
runner: runner,
graph: graph,
resultsStream: results,
resultsPipeline: newStreamPipe(),
taskStates: make(map[*Task]TaskState),
streamStates: make(map[*Stream]StreamState),
}
// Detach cancellation from the caller's ctx so a cancellation of the
// planning context does not abort the manifest registration, but keep
// the xcap region attached for observation recording.
if err := wf.init(context.WithoutCancel(ctx)); err != nil {
wf.Close()
return nil, err
}
return wf, nil
}
// Empty reports whether the workflow has no tasks to execute. This happens when
// all tasks were eliminated at plan time because their cached results were empty.
func (wf *Workflow) Empty() bool {
return wf.graph.Len() == 0
}
// injectResultsStream injects a new stream into the sinks of the root task for
// the workflow to receive final results.
func injectResultsStream(tenantID string, graph *dag.Graph[*Task]) (*Stream, error) {
results := &Stream{ULID: ulid.Make(), TenantID: tenantID}
// Inject a stream for final task results.
rootTask, err := graph.Root()
if err != nil {
return nil, err
}
rootNode, err := rootTask.Fragment.Root()
if err != nil {
return nil, err
}
rootTask.Sinks[rootNode] = append(rootTask.Sinks[rootNode], results)
return results, nil
}
// init initializes the workflow.
func (wf *Workflow) init(ctx context.Context) error {
id := wf.opts.ID
if id.IsZero() {
id = ulid.Make()
}
wf.manifest = &Manifest{
ID: id,
Tenant: wf.opts.Tenant,
Actor: wf.opts.Actor,
Streams: wf.allStreams(),
Tasks: wf.allTasks(),
StreamEventHandler: wf.onStreamChange,
TaskEventHandler: wf.onTaskChange,
}
if err := wf.runner.RegisterManifest(ctx, wf.manifest); err != nil {
return err
}
return wf.runner.Listen(ctx, wf.resultsPipeline, wf.resultsStream)
}
// String returns a string representation of the workflow. It is a convenience
// method for calling [Sprint].
func (wf *Workflow) String() string {
return Sprint(wf)
}
// Opts returns options of the workflow (mostly for testing purposes).
func (wf *Workflow) Opts() Options { return wf.opts }
// Len returns the total number of tasks in the workflow.
func (wf *Workflow) Len() int {
if wf.Empty() {
return 0
}
return len(wf.manifest.Tasks)
}
// Close releases resources associated with the workflow.
func (wf *Workflow) Close() {
if wf.span != nil {
wf.span.End()
}
if wf.Empty() {
return
}
if err := wf.runner.UnregisterManifest(context.Background(), wf.manifest); err != nil {
level.Warn(wf.logger).Log("msg", "failed to unregister workflow manifest", "err", err)
}
}
// Run executes the workflow, returning a pipeline to read results from. The
// provided context is used for the lifetime of the workflow execution.
//
// The returned pipeline must be closed when the workflow is complete to release
// resources.
func (wf *Workflow) Run(ctx context.Context) (pipeline executor.Pipeline, err error) {
if wf.Empty() {
level.Debug(wf.logger).Log("msg", "workflow is empty. will return an empty pipeline.")
return newEOFPipeline(), nil
}
wf.capture = xcap.CaptureFromContext(ctx)
wf.parentRegion = xcap.RegionFromContext(ctx)
// wf.Run tracks the lifetime of the workflow execution.
ctx, wf.span = xcap.StartSpan(ctx, tracer, "wf.Run")
wrapped := &wrappedPipeline{
inner: wf.resultsPipeline,
onClose: func() {
// Merge final stats results back into the caller.
wf.statsMut.Lock()
defer wf.statsMut.Unlock()
stats.JoinResults(ctx, wf.stats)
},
}
// Start dispatching in background goroutine
gotrace.Log(ctx, "dispatch_tasks", "starting dispatch of "+strconv.Itoa(len(wf.manifest.Tasks))+" tasks")
go func() {
err := wf.dispatchTasks(ctx, wf.manifest.Tasks)
if err != nil {
wf.resultsPipeline.SetError(err)
wrapped.Close()
} else {
gotrace.Log(ctx, "dispatch_tasks", "all tasks dispatched")
}
}()
return wrapped, nil
}
// dispatchTasks groups the slice of tasks by their associated "admission lane" (token bucket)
// and dispatches them to the runner.
// Tasks from different admission lanes are dispatched concurrently.
// The caller needs to wait on the returned error group.
func (wf *Workflow) dispatchTasks(ctx context.Context, tasks []*Task) error {
wf.admissionControl = newAdmissionControl(
int64(wf.opts.MaxRunningScanTasks),
int64(wf.opts.MaxRunningOtherTasks),
int64(wf.opts.MaxRunningCompactionTasks),
)
// this span captures the time spent waiting for all tasks to be admitted
// but not the time spent to assign them all to workers.
//
// context is not updated here to avoid making this a parent of
// task spans since admission span ends once all tasks are admitted,
// but tasks can still be running after it ends.
_, span := tracer.Start(ctx, "wf.taskAdmission")
defer span.End()
region := xcap.RegionFromContext(ctx)
region.Record(xcap.StatTaskCount.Observe(int64(len(tasks))))
groups := wf.admissionControl.groupByType(tasks)
// taskTypeCompaction is appended last because the loop is sequential
// (each lane is fully drained before the next): a populated Compaction
// lane should never delay Scan dispatch. The lane is currently dormant
// — typeFor does not classify any task as compaction — so this slot is
// always empty for query workflows. It will be populated once the
// dataobj-compaction node types and typeFor classification land.
for _, taskType := range []taskType{
taskTypeOther,
taskTypeScan,
taskTypeCompaction,
} {
lane := wf.admissionControl.get(taskType)
tasks := groups[taskType]
var offset, batchSize int64
total := int64(len(tasks))
for ; offset < total; offset += batchSize {
batchSize = int64(1)
start := time.Now()
if err := lane.Acquire(ctx, batchSize); err != nil {
return fmt.Errorf("failed to acquire tokens from admission lane %s: %w", taskType, err)
}
region.Record(xcap.StatTaskAdmissionWaitDuration.Observe(time.Since(start).Seconds()))
batch := tasks[offset : offset+batchSize]
if err := wf.runner.Start(ctx, batch...); err != nil {
return fmt.Errorf("failed to start tasks: %w", err)
}
}
}
return nil
}
func (wf *Workflow) allStreams() []*Stream {
var (
result []*Stream
seenStreams = map[*Stream]struct{}{}
)
// We only iterate over sources below (for convenience), and since
// wf.results is only used as a sink, we need to manually add it here.
result = append(result, wf.resultsStream)
seenStreams[wf.resultsStream] = struct{}{}
for _, root := range wf.graph.Roots() {
_ = wf.graph.Walk(root, func(t *Task) error {
// Task construction guarantees that there is a sink for each source
// (minus the results stream we generate), so there's no point in
// iterating over Sources and Sinks.
for _, streams := range t.Sources {
for _, stream := range streams {
if _, seen := seenStreams[stream]; seen {
continue
}
seenStreams[stream] = struct{}{}
result = append(result, stream)
}
}
return nil
}, dag.PreOrderWalk)
}
return result
}
func (wf *Workflow) allTasks() []*Task {
var tasks []*Task
for _, root := range wf.graph.Roots() {
// [dag.Graph.Walk] guarantees that each node is only visited once, so
// we can safely append the task to the list without checking if it's
// already been seen.
_ = wf.graph.Walk(root, func(t *Task) error {
tasks = append(tasks, t)
return nil
}, dag.PreOrderWalk)
}
return tasks
}
func (wf *Workflow) onStreamChange(_ context.Context, stream *Stream, newState StreamState) {
if wf.opts.DebugStreams {
level.Debug(wf.logger).Log("msg", "stream state change", "stream_id", stream.ULID, "new_state", newState)
}
wf.streamsMut.Lock()
defer wf.streamsMut.Unlock()
wf.streamStates[stream] = newState
if newState == StreamStateClosed && stream.ULID == wf.resultsStream.ULID {
// Close the results pipeline once the results stream has closed.
wf.resultsPipeline.Close()
}
}
func (wf *Workflow) onTaskChange(ctx context.Context, task *Task, newStatus TaskStatus) {
if wf.opts.DebugTasks {
level.Debug(wf.logger).Log("msg", "task state change", "task_id", task.ULID, "new_state", newStatus.State)
}
wf.tasksMut.Lock()
oldState := wf.taskStates[task]
wf.taskStates[task] = newStatus.State
wf.tasksMut.Unlock()
if newStatus.State.Terminal() {
wf.handleTerminalStateChange(ctx, task, oldState, newStatus)
} else {
wf.handleNonTerminalStateChange(ctx, task, newStatus)
}
}
func (wf *Workflow) handleTerminalStateChange(ctx context.Context, task *Task, oldState TaskState, newStatus TaskStatus) {
// State has not changed
if oldState == newStatus.State {
return
}
if newStatus.State == TaskStateFailed {
// Use the first failure from a task as the failure for the entire
// workflow.
wf.resultsPipeline.SetError(newStatus.Error)
}
if wf.admissionControl == nil {
level.Warn(wf.logger).Log("msg", "admission control was not initialized")
} else if oldState == TaskStatePending || oldState == TaskStateRunning {
// Release tokens only if the task was already enqueued and therefore either pending or running.
defer wf.admissionControl.laneFor(task).Release(1)
}
if newStatus.Capture != nil {
wf.mergeCapture(newStatus.Capture)
}
if newStatus.Statistics != nil {
wf.mergeResults(*newStatus.Statistics)
}
// task reached a terminal state. We need to detect if task's immediate
// children should be canceled. We only look at immediate unterminated
// children, since canceling them will trigger onTaskChange to process
// indirect children.
var tasksToCancel []*Task
wf.tasksMut.RLock()
{
NextChild:
for _, child := range wf.graph.Children(task) {
// Ignore children in terminal states.
if childState := wf.taskStates[child]; childState.Terminal() {
continue
}
// Cancel the child if and only if all of the child's parents (which
// includes the task that just updated) are in a terminal state.
for _, parent := range wf.graph.Parents(child) {
parentState := wf.taskStates[parent]
if !parentState.Terminal() {
continue NextChild
}
}
tasksToCancel = append(tasksToCancel, child)
}
}
wf.tasksMut.RUnlock()
wf.cancelTasks(ctx, tasksToCancel)
// Print the summary at the very end to track full end-to-end task time.
wf.printTaskSummary(task, oldState, newStatus)
}
func (wf *Workflow) handleNonTerminalStateChange(ctx context.Context, task *Task, newStatus TaskStatus) {
// If the task is running, but its contributing time range has been changed
if newStatus.State == TaskStateRunning && !newStatus.ContributingTimeRange.Timestamp.IsZero() {
// We need to detect if task's immediate children should be canceled because they can no longer contribute
// to the state of the running task. We only look at immediate unterminated
// children, since canceling them will trigger onTaskChange to process indirect children.
var tasksToCancel []*Task
ts := newStatus.ContributingTimeRange.Timestamp
lessThan := newStatus.ContributingTimeRange.LessThan
wf.tasksMut.RLock()
{
for _, child := range wf.graph.Children(task) {
// Ignore children in terminal states.
if childState := wf.taskStates[child]; childState.Terminal() {
continue
}
// Ignore if time ranges intersect, so they can contribute
if lessThan && child.MaxTimeRange.Start.Before(ts) ||
!lessThan && child.MaxTimeRange.End.After(ts) {
continue
}
// TODO(spiridonov): We do not check parents here right now, there is only 1 parent now,
// but in general a task can be canceled only if all its parents are in terminal states OR
// have non-inersecting contributing time range.
tasksToCancel = append(tasksToCancel, child)
shortCircuitsTotal.Inc()
}
}
wf.tasksMut.RUnlock()
wf.cancelTasks(ctx, tasksToCancel)
}
}
func (wf *Workflow) cancelTasks(ctx context.Context, tasks []*Task) {
// Runners may re-invoke onTaskChange, so we don't want to hold the mutex
// when calling this.
if err := wf.runner.Cancel(ctx, tasks...); err != nil {
level.Warn(wf.logger).Log("msg", "failed to cancel tasks", "err", err)
}
}
func (wf *Workflow) mergeCapture(capture *xcap.Capture) {
wf.captureMut.Lock()
defer wf.captureMut.Unlock()
wf.capture.Merge(wf.parentRegion, capture)
}
func (wf *Workflow) mergeResults(results stats.Result) {
wf.statsMut.Lock()
defer wf.statsMut.Unlock()
wf.stats.Merge(results)
}
type wrappedPipeline struct {
inner executor.Pipeline
onClose func()
}
func (p *wrappedPipeline) Open(ctx context.Context) error {
return p.inner.Open(ctx)
}
func (p *wrappedPipeline) Read(ctx context.Context) (arrow.RecordBatch, error) {
return p.inner.Read(ctx)
}
// Close closes the resources of the pipeline.
func (p *wrappedPipeline) Close() {
p.inner.Close()
p.onClose()
}