mirror of https://github.com/grafana/loki
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
263 lines
7.6 KiB
263 lines
7.6 KiB
package engine
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/go-kit/log"
|
|
"github.com/go-kit/log/level"
|
|
"github.com/grafana/dskit/httpgrpc"
|
|
"github.com/grafana/dskit/tenant"
|
|
"github.com/oklog/ulid/v2"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"go.opentelemetry.io/otel/codes"
|
|
"go.opentelemetry.io/otel/trace"
|
|
|
|
"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
|
|
"github.com/grafana/loki/v3/pkg/engine/internal/workflow"
|
|
"github.com/grafana/loki/v3/pkg/util/httpreq"
|
|
"github.com/grafana/loki/v3/pkg/xcap"
|
|
)
|
|
|
|
type query struct {
|
|
id ulid.ULID
|
|
engine *Engine
|
|
logger log.Logger
|
|
queryType string
|
|
useCache bool
|
|
|
|
tenantID string
|
|
userAgent string
|
|
actorPath string
|
|
|
|
startTime time.Time
|
|
finishTime time.Time
|
|
|
|
capture *xcap.Capture
|
|
rootRegion *xcap.Region
|
|
|
|
failed bool
|
|
failureType string
|
|
}
|
|
|
|
func (e *Engine) newQuery(ctx context.Context, logger log.Logger, queryType string, useCache bool) (*query, context.Context, error) {
|
|
startTime := time.Now()
|
|
|
|
ctx, capture := xcap.NewCapture(ctx, nil)
|
|
ctx, region := xcap.StartRegion(ctx, "Query")
|
|
|
|
id := ulid.Make()
|
|
q := &query{
|
|
id: id,
|
|
engine: e,
|
|
logger: log.With(logger, "query_id", id),
|
|
queryType: queryType,
|
|
useCache: useCache,
|
|
|
|
userAgent: httpreq.ExtractHeader(ctx, "User-Agent"),
|
|
actorPath: httpreq.ExtractHeader(ctx, httpreq.LokiActorPathHeader),
|
|
|
|
startTime: startTime,
|
|
|
|
capture: capture,
|
|
rootRegion: region,
|
|
}
|
|
if err := q.init(ctx); err != nil {
|
|
// The query failed to initialize, close it and return the error.
|
|
q.Close()
|
|
return nil, ctx, err
|
|
}
|
|
return q, ctx, nil
|
|
}
|
|
|
|
func (q *query) init(ctx context.Context) error {
|
|
tenantID, err := tenant.TenantID(ctx)
|
|
if err != nil {
|
|
q.RecordError(ctx, err)
|
|
return httpgrpc.Error(http.StatusBadRequest, err.Error())
|
|
}
|
|
q.tenantID = tenantID
|
|
return nil
|
|
}
|
|
|
|
// Logger returns the logger associated with the query. It has fields attached
|
|
// for query identification.
|
|
func (q *query) Logger() log.Logger { return q.logger }
|
|
|
|
// TenantID returns the tenant ID associated with the query.
|
|
func (q *query) TenantID() string { return q.tenantID }
|
|
|
|
// ID returns the unique identifier of the query.
|
|
func (q *query) ID() ulid.ULID { return q.id }
|
|
|
|
// Duration returns the current duration of the query.
|
|
func (q *query) Duration() time.Duration {
|
|
if q.finishTime.IsZero() {
|
|
return time.Since(q.startTime)
|
|
}
|
|
return q.finishTime.Sub(q.startTime)
|
|
}
|
|
|
|
// Prepare constucts a workflow from the given physical plan. The returned
|
|
// workflow must be closed to release resources.
|
|
func (q *query) Prepare(ctx context.Context, plan *physical.Plan, useAdmissionLanes bool) (*workflow.Workflow, error) {
|
|
ctx, span := xcap.StartSpan(ctx, tracer, "query.Prepare")
|
|
defer span.End()
|
|
|
|
timer := prometheus.NewTimer(q.engine.metrics.workflowPlanning)
|
|
|
|
var maxRunningScanTasks int
|
|
if useAdmissionLanes {
|
|
maxRunningScanTasks = q.engine.limits.MaxScanTaskParallelism(q.tenantID)
|
|
}
|
|
|
|
opts := workflow.Options{
|
|
ID: q.id,
|
|
Tenant: q.tenantID,
|
|
Actor: httpreq.ExtractActorPath(ctx),
|
|
|
|
MaxRunningScanTasks: maxRunningScanTasks,
|
|
MaxRunningOtherTasks: 0,
|
|
|
|
CacheEnabled: q.useCache,
|
|
MaxTaskCacheSize: uint64(q.engine.cfg.Executor.TaskResultsCache.TaskResultMaxCacheableSize),
|
|
MaxDataObjScanCacheSize: uint64(q.engine.cfg.Executor.TaskResultsCache.DataObjScanResultMaxCacheableSize),
|
|
CacheCompression: q.engine.cfg.Executor.TaskResultsCache.Compression,
|
|
PruneEmptyCachedTasks: q.engine.cfg.Executor.TaskResultsCache.PruneEmptyCachedTasks,
|
|
NonEmptyCachedTasksMaxSize: uint64(q.engine.cfg.Executor.TaskResultsCache.PruneCachedTasksMaxSize),
|
|
PruneCachedTasksFetchTimeout: q.engine.cfg.Executor.TaskResultsCache.PruneCachedTasksFetchTimeout,
|
|
TaskCacheRegistry: q.engine.taskCaches,
|
|
|
|
DebugTasks: q.engine.limits.DebugEngineTasks(q.tenantID),
|
|
DebugStreams: q.engine.limits.DebugEngineStreams(q.tenantID),
|
|
}
|
|
wf, err := workflow.New(ctx, opts, q.logger, q.engine.scheduler.inner, plan)
|
|
if err != nil {
|
|
level.Warn(q.logger).Log("msg", "failed to create workflow", "err", err)
|
|
q.RecordError(ctx, err)
|
|
return nil, ErrNotSupported
|
|
}
|
|
|
|
duration := timer.ObserveDuration()
|
|
span.Record(statPrepareDuration.Observe(int64(duration)))
|
|
|
|
// The execution plan can be way more verbose than the physical plan, so we
|
|
// only log it at debug level.
|
|
level.Debug(q.logger).Log(
|
|
"msg", "execution-plan-detail",
|
|
"plan", workflow.Sprint(wf),
|
|
)
|
|
level.Info(q.logger).Log(
|
|
"msg", "execution-plan-summary",
|
|
"duration_ms", duration.Milliseconds(),
|
|
"tasks", wf.Len(),
|
|
)
|
|
|
|
span.SetStatus(codes.Ok, "")
|
|
return wf, nil
|
|
}
|
|
|
|
// RecordError records the query as having failed with the given error.
|
|
func (q *query) RecordError(ctx context.Context, err error) {
|
|
span := trace.SpanFromContext(ctx)
|
|
span.RecordError(err)
|
|
span.SetStatus(codes.Error, err.Error())
|
|
|
|
switch {
|
|
case errors.Is(err, context.Canceled):
|
|
q.failureType = "canceled"
|
|
case errors.Is(err, context.DeadlineExceeded):
|
|
q.failureType = "timeout"
|
|
default:
|
|
q.failureType = "fail"
|
|
}
|
|
q.failed = true
|
|
}
|
|
|
|
// Close finalizes the query and logs a summary line about the query's
|
|
// execution.
|
|
func (q *query) Close() {
|
|
if !q.finishTime.IsZero() {
|
|
// Already closed.
|
|
return
|
|
}
|
|
|
|
q.finishTime = time.Now()
|
|
q.rootRegion.End()
|
|
q.capture.End()
|
|
|
|
status := "success"
|
|
if q.failed {
|
|
status = q.failureType
|
|
}
|
|
|
|
var (
|
|
logicalPlanDuration, _ = q.capture.Value(statLogicalPlanDuration).Int64()
|
|
physicalPlanDuration, _ = q.capture.Value(statPhysicalPlanDuration).Int64()
|
|
prepareDuration, _ = q.capture.Value(statPrepareDuration).Int64()
|
|
executeDuration, _ = q.capture.Value(statExecutionDuration).Int64()
|
|
|
|
otherDuration = calculateResidual(q.finishTime.Sub(q.startTime), logicalPlanDuration, physicalPlanDuration, prepareDuration, executeDuration)
|
|
)
|
|
|
|
q.logger.Log(
|
|
"msg", "query-summary",
|
|
|
|
"user_agent", q.userAgent,
|
|
"actor_path", q.actorPath,
|
|
"query_type", q.queryType,
|
|
|
|
"submitted_at", q.startTime.Format(time.RFC3339Nano),
|
|
"completed_at", q.finishTime.Format(time.RFC3339Nano),
|
|
"duration_ms", q.finishTime.Sub(q.startTime).Milliseconds(),
|
|
|
|
"status", status,
|
|
|
|
// "duration_cache_check_ms", "", // See comment on query_result_cache_hit for why this is disabled.
|
|
"duration_logical_plan_ms", time.Duration(logicalPlanDuration).Milliseconds(),
|
|
"duration_physical_plan_ms", time.Duration(physicalPlanDuration).Milliseconds(),
|
|
"duration_prepare_ms", time.Duration(prepareDuration).Milliseconds(),
|
|
"duration_execute_ms", time.Duration(executeDuration).Milliseconds(),
|
|
"duration_other_ms", otherDuration.Milliseconds(),
|
|
|
|
"dominant_phase", calculateDominantPhase(map[string]time.Duration{
|
|
"logical": time.Duration(logicalPlanDuration),
|
|
"physical": time.Duration(physicalPlanDuration),
|
|
"prepare": time.Duration(prepareDuration),
|
|
"execute": time.Duration(executeDuration),
|
|
"other": otherDuration,
|
|
}),
|
|
|
|
// TODO(rfratto): Move caching logic into the engine so we can track
|
|
// cache hits/misses/latency.
|
|
"query_result_cache_hit", "false",
|
|
)
|
|
}
|
|
|
|
// calculateResidual finds the amount of time in dur that is not accounted for
|
|
// in the given stats.
|
|
//
|
|
// Each stat must be an int64 value.
|
|
func calculateResidual(dur time.Duration, stats ...int64) time.Duration {
|
|
var total time.Duration
|
|
for _, s := range stats {
|
|
total += time.Duration(s)
|
|
}
|
|
return dur - total
|
|
}
|
|
|
|
func calculateDominantPhase(phases map[string]time.Duration) string {
|
|
var (
|
|
maxPhase string
|
|
maxDuration time.Duration
|
|
)
|
|
for phase, duration := range phases {
|
|
if duration > maxDuration {
|
|
maxPhase = phase
|
|
maxDuration = duration
|
|
}
|
|
}
|
|
return maxPhase
|
|
}
|
|
|